xref: /petsc/src/vec/is/sf/impls/basic/sfpack.c (revision 9371c9d470a9602b6d10a8bf50c9b2280a79e45a)
120c24465SJunchao Zhang #include "petsc/private/sfimpl.h"
240e23c03SJunchao Zhang #include <../src/vec/is/sf/impls/basic/sfpack.h>
340e23c03SJunchao Zhang #include <../src/vec/is/sf/impls/basic/sfbasic.h>
440e23c03SJunchao Zhang 
57fd2d3dbSJunchao Zhang /* This is a C file that contains packing facilities, with dispatches to device if enabled. */
67fd2d3dbSJunchao Zhang 
740e23c03SJunchao Zhang /*
840e23c03SJunchao Zhang  * MPI_Reduce_local is not really useful because it can't handle sparse data and it vectorizes "in the wrong direction",
940e23c03SJunchao Zhang  * therefore we pack data types manually. This file defines packing routines for the standard data types.
1040e23c03SJunchao Zhang  */
1140e23c03SJunchao Zhang 
12cd620004SJunchao Zhang #define CPPJoin4(a, b, c, d) a##_##b##_##c##_##d
1340e23c03SJunchao Zhang 
14cd620004SJunchao Zhang /* Operations working like s += t */
15*9371c9d4SSatish Balay #define OP_BINARY(op, s, t) \
16*9371c9d4SSatish Balay   do { (s) = (s)op(t); } while (0) /* binary ops in the middle such as +, *, && etc. */
17*9371c9d4SSatish Balay #define OP_FUNCTION(op, s, t) \
18*9371c9d4SSatish Balay   do { (s) = op((s), (t)); } while (0) /* ops like a function, such as PetscMax, PetscMin */
19*9371c9d4SSatish Balay #define OP_LXOR(op, s, t) \
20*9371c9d4SSatish Balay   do { (s) = (!(s)) != (!(t)); } while (0) /* logical exclusive OR */
21*9371c9d4SSatish Balay #define OP_ASSIGN(op, s, t) \
22*9371c9d4SSatish Balay   do { (s) = (t); } while (0)
23cd620004SJunchao Zhang /* Ref MPI MAXLOC */
24cd620004SJunchao Zhang #define OP_XLOC(op, s, t) \
25cd620004SJunchao Zhang   do { \
26cd620004SJunchao Zhang     if ((s).u == (t).u) (s).i = PetscMin((s).i, (t).i); \
27cd620004SJunchao Zhang     else if (!((s).u op(t).u)) s = t; \
28cd620004SJunchao Zhang   } while (0)
2940e23c03SJunchao Zhang 
3040e23c03SJunchao Zhang /* DEF_PackFunc - macro defining a Pack routine
3140e23c03SJunchao Zhang 
3240e23c03SJunchao Zhang    Arguments of the macro:
33b23bfdefSJunchao Zhang    +Type      Type of the basic data in an entry, i.e., int, PetscInt, PetscReal etc. It is not the type of an entry.
34fcc7397dSJunchao Zhang    .BS        Block size for vectorization. It is a factor of bsz.
35b23bfdefSJunchao Zhang    -EQ        (bs == BS) ? 1 : 0. EQ is a compile-time const to help compiler optimizations. See below.
3640e23c03SJunchao Zhang 
3740e23c03SJunchao Zhang    Arguments of the Pack routine:
38cd620004SJunchao Zhang    +count     Number of indices in idx[].
39fcc7397dSJunchao Zhang    .start     When opt and idx are NULL, it means indices are contiguous & start is the first index; otherwise, not used.
40fcc7397dSJunchao Zhang    .opt       Per-pack optimization plan. NULL means no such plan.
41fcc7397dSJunchao Zhang    .idx       Indices of entries to packed.
42eb02082bSJunchao Zhang    .link      Provide a context for the current call, such as link->bs, number of basic types in an entry. Ex. if unit is MPI_2INT, then bs=2 and the basic type is int.
43cd620004SJunchao Zhang    .unpacked  Address of the unpacked data. The entries will be packed are unpacked[idx[i]],for i in [0,count).
44cd620004SJunchao Zhang    -packed    Address of the packed data.
4540e23c03SJunchao Zhang  */
46b23bfdefSJunchao Zhang #define DEF_PackFunc(Type, BS, EQ) \
47*9371c9d4SSatish Balay   static PetscErrorCode CPPJoin4(Pack, Type, BS, EQ)(PetscSFLink link, PetscInt count, PetscInt start, PetscSFPackOpt opt, const PetscInt *idx, const void *unpacked, void *packed) { \
48b23bfdefSJunchao Zhang     const Type    *u = (const Type *)unpacked, *u2; \
49b23bfdefSJunchao Zhang     Type          *p = (Type *)packed, *p2; \
50fcc7397dSJunchao Zhang     PetscInt       i, j, k, X, Y, r, bs = link->bs; \
51fcc7397dSJunchao Zhang     const PetscInt M   = (EQ) ? 1 : bs / BS; /* If EQ, then M=1 enables compiler's const-propagation */ \
52b23bfdefSJunchao Zhang     const PetscInt MBS = M * BS;             /* MBS=bs. We turn MBS into a compile time const when EQ=1. */ \
5340e23c03SJunchao Zhang     PetscFunctionBegin; \
549566063dSJacob Faibussowitsch     if (!idx) PetscCall(PetscArraycpy(p, u + start * MBS, MBS * count)); /* idx[] are contiguous */ \
55*9371c9d4SSatish Balay     else if (opt) { /* has optimizations available */ p2 = p; \
56fcc7397dSJunchao Zhang       for (r = 0; r < opt->n; r++) { \
57fcc7397dSJunchao Zhang         u2 = u + opt->start[r] * MBS; \
58fcc7397dSJunchao Zhang         X  = opt->X[r]; \
59fcc7397dSJunchao Zhang         Y  = opt->Y[r]; \
60fcc7397dSJunchao Zhang         for (k = 0; k < opt->dz[r]; k++) \
61fcc7397dSJunchao Zhang           for (j = 0; j < opt->dy[r]; j++) { \
629566063dSJacob Faibussowitsch             PetscCall(PetscArraycpy(p2, u2 + (X * Y * k + X * j) * MBS, opt->dx[r] * MBS)); \
63fcc7397dSJunchao Zhang             p2 += opt->dx[r] * MBS; \
64fcc7397dSJunchao Zhang           } \
65fcc7397dSJunchao Zhang       } \
66fcc7397dSJunchao Zhang     } else { \
67b23bfdefSJunchao Zhang       for (i = 0; i < count; i++) \
68eb02082bSJunchao Zhang         for (j = 0; j < M; j++)    /* Decent compilers should eliminate this loop when M = const 1 */ \
69eb02082bSJunchao Zhang           for (k = 0; k < BS; k++) /* Compiler either unrolls (BS=1) or vectorizes (BS=2,4,8,etc) this loop */ \
70b23bfdefSJunchao Zhang             p[i * MBS + j * BS + k] = u[idx[i] * MBS + j * BS + k]; \
7140e23c03SJunchao Zhang     } \
7240e23c03SJunchao Zhang     PetscFunctionReturn(0); \
7340e23c03SJunchao Zhang   }
7440e23c03SJunchao Zhang 
75cd620004SJunchao Zhang /* DEF_Action - macro defining a UnpackAndInsert routine that unpacks data from a contiguous buffer
76cd620004SJunchao Zhang                 and inserts into a sparse array.
7740e23c03SJunchao Zhang 
7840e23c03SJunchao Zhang    Arguments:
79b23bfdefSJunchao Zhang   .Type       Type of the data
8040e23c03SJunchao Zhang   .BS         Block size for vectorization
81b23bfdefSJunchao Zhang   .EQ        (bs == BS) ? 1 : 0. EQ is a compile-time const.
8240e23c03SJunchao Zhang 
8340e23c03SJunchao Zhang   Notes:
8440e23c03SJunchao Zhang    This macro is not combined with DEF_ActionAndOp because we want to use memcpy in this macro.
8540e23c03SJunchao Zhang  */
86cd620004SJunchao Zhang #define DEF_UnpackFunc(Type, BS, EQ) \
87*9371c9d4SSatish Balay   static PetscErrorCode CPPJoin4(UnpackAndInsert, Type, BS, EQ)(PetscSFLink link, PetscInt count, PetscInt start, PetscSFPackOpt opt, const PetscInt *idx, void *unpacked, const void *packed) { \
88b23bfdefSJunchao Zhang     Type          *u = (Type *)unpacked, *u2; \
89fcc7397dSJunchao Zhang     const Type    *p = (const Type *)packed; \
90fcc7397dSJunchao Zhang     PetscInt       i, j, k, X, Y, r, bs = link->bs; \
91fcc7397dSJunchao Zhang     const PetscInt M   = (EQ) ? 1 : bs / BS; /* If EQ, then M=1 enables compiler's const-propagation */ \
92b23bfdefSJunchao Zhang     const PetscInt MBS = M * BS;             /* MBS=bs. We turn MBS into a compile time const when EQ=1. */ \
9340e23c03SJunchao Zhang     PetscFunctionBegin; \
94b23bfdefSJunchao Zhang     if (!idx) { \
95fcc7397dSJunchao Zhang       u += start * MBS; \
969566063dSJacob Faibussowitsch       if (u != p) PetscCall(PetscArraycpy(u, p, count *MBS)); \
97fcc7397dSJunchao Zhang     } else if (opt) { /* has optimizations available */ \
98fcc7397dSJunchao Zhang       for (r = 0; r < opt->n; r++) { \
99fcc7397dSJunchao Zhang         u2 = u + opt->start[r] * MBS; \
100fcc7397dSJunchao Zhang         X  = opt->X[r]; \
101fcc7397dSJunchao Zhang         Y  = opt->Y[r]; \
102fcc7397dSJunchao Zhang         for (k = 0; k < opt->dz[r]; k++) \
103fcc7397dSJunchao Zhang           for (j = 0; j < opt->dy[r]; j++) { \
1049566063dSJacob Faibussowitsch             PetscCall(PetscArraycpy(u2 + (X * Y * k + X * j) * MBS, p, opt->dx[r] * MBS)); \
105fcc7397dSJunchao Zhang             p += opt->dx[r] * MBS; \
106fcc7397dSJunchao Zhang           } \
107fcc7397dSJunchao Zhang       } \
108fcc7397dSJunchao Zhang     } else { \
109b23bfdefSJunchao Zhang       for (i = 0; i < count; i++) \
110b23bfdefSJunchao Zhang         for (j = 0; j < M; j++) \
111cd620004SJunchao Zhang           for (k = 0; k < BS; k++) u[idx[i] * MBS + j * BS + k] = p[i * MBS + j * BS + k]; \
11240e23c03SJunchao Zhang     } \
11340e23c03SJunchao Zhang     PetscFunctionReturn(0); \
11440e23c03SJunchao Zhang   }
11540e23c03SJunchao Zhang 
116cd620004SJunchao Zhang /* DEF_UnpackAndOp - macro defining a UnpackAndOp routine where Op should not be Insert
11740e23c03SJunchao Zhang 
11840e23c03SJunchao Zhang    Arguments:
119cd620004SJunchao Zhang   +Opname     Name of the Op, such as Add, Mult, LAND, etc.
120b23bfdefSJunchao Zhang   .Type       Type of the data
12140e23c03SJunchao Zhang   .BS         Block size for vectorization
122b23bfdefSJunchao Zhang   .EQ         (bs == BS) ? 1 : 0. EQ is a compile-time const.
123cd620004SJunchao Zhang   .Op         Operator for the op, such as +, *, &&, ||, PetscMax, PetscMin, etc.
124cd620004SJunchao Zhang   .OpApply    Macro defining application of the op. Could be OP_BINARY, OP_FUNCTION, OP_LXOR
12540e23c03SJunchao Zhang  */
126cd620004SJunchao Zhang #define DEF_UnpackAndOp(Type, BS, EQ, Opname, Op, OpApply) \
127*9371c9d4SSatish Balay   static PetscErrorCode CPPJoin4(UnpackAnd##Opname, Type, BS, EQ)(PetscSFLink link, PetscInt count, PetscInt start, PetscSFPackOpt opt, const PetscInt *idx, void *unpacked, const void *packed) { \
128cd620004SJunchao Zhang     Type          *u = (Type *)unpacked, *u2; \
129fcc7397dSJunchao Zhang     const Type    *p = (const Type *)packed; \
130fcc7397dSJunchao Zhang     PetscInt       i, j, k, X, Y, r, bs = link->bs; \
131fcc7397dSJunchao Zhang     const PetscInt M   = (EQ) ? 1 : bs / BS; /* If EQ, then M=1 enables compiler's const-propagation */ \
132b23bfdefSJunchao Zhang     const PetscInt MBS = M * BS;             /* MBS=bs. We turn MBS into a compile time const when EQ=1. */ \
13340e23c03SJunchao Zhang     PetscFunctionBegin; \
134b23bfdefSJunchao Zhang     if (!idx) { \
135fcc7397dSJunchao Zhang       u += start * MBS; \
136cd620004SJunchao Zhang       for (i = 0; i < count; i++) \
137cd620004SJunchao Zhang         for (j = 0; j < M; j++) \
138*9371c9d4SSatish Balay           for (k = 0; k < BS; k++) OpApply(Op, u[i * MBS + j * BS + k], p[i * MBS + j * BS + k]); \
139fcc7397dSJunchao Zhang     } else if (opt) { /* idx[] has patterns */ \
140fcc7397dSJunchao Zhang       for (r = 0; r < opt->n; r++) { \
141fcc7397dSJunchao Zhang         u2 = u + opt->start[r] * MBS; \
142fcc7397dSJunchao Zhang         X  = opt->X[r]; \
143fcc7397dSJunchao Zhang         Y  = opt->Y[r]; \
144fcc7397dSJunchao Zhang         for (k = 0; k < opt->dz[r]; k++) \
145fcc7397dSJunchao Zhang           for (j = 0; j < opt->dy[r]; j++) { \
146fcc7397dSJunchao Zhang             for (i = 0; i < opt->dx[r] * MBS; i++) OpApply(Op, u2[(X * Y * k + X * j) * MBS + i], p[i]); \
147fcc7397dSJunchao Zhang             p += opt->dx[r] * MBS; \
148fcc7397dSJunchao Zhang           } \
149fcc7397dSJunchao Zhang       } \
150fcc7397dSJunchao Zhang     } else { \
151cd620004SJunchao Zhang       for (i = 0; i < count; i++) \
152cd620004SJunchao Zhang         for (j = 0; j < M; j++) \
153*9371c9d4SSatish Balay           for (k = 0; k < BS; k++) OpApply(Op, u[idx[i] * MBS + j * BS + k], p[i * MBS + j * BS + k]); \
154cd620004SJunchao Zhang     } \
155cd620004SJunchao Zhang     PetscFunctionReturn(0); \
156cd620004SJunchao Zhang   }
157cd620004SJunchao Zhang 
158cd620004SJunchao Zhang #define DEF_FetchAndOp(Type, BS, EQ, Opname, Op, OpApply) \
159*9371c9d4SSatish Balay   static PetscErrorCode CPPJoin4(FetchAnd##Opname, Type, BS, EQ)(PetscSFLink link, PetscInt count, PetscInt start, PetscSFPackOpt opt, const PetscInt *idx, void *unpacked, void *packed) { \
160fcc7397dSJunchao Zhang     Type          *u = (Type *)unpacked, *p = (Type *)packed, tmp; \
161fcc7397dSJunchao Zhang     PetscInt       i, j, k, r, l, bs = link->bs; \
162fcc7397dSJunchao Zhang     const PetscInt M   = (EQ) ? 1 : bs / BS; \
163fcc7397dSJunchao Zhang     const PetscInt MBS = M * BS; \
164cd620004SJunchao Zhang     PetscFunctionBegin; \
165fcc7397dSJunchao Zhang     for (i = 0; i < count; i++) { \
166fcc7397dSJunchao Zhang       r = (!idx ? start + i : idx[i]) * MBS; \
167fcc7397dSJunchao Zhang       l = i * MBS; \
168b23bfdefSJunchao Zhang       for (j = 0; j < M; j++) \
169b23bfdefSJunchao Zhang         for (k = 0; k < BS; k++) { \
170fcc7397dSJunchao Zhang           tmp = u[r + j * BS + k]; \
171fcc7397dSJunchao Zhang           OpApply(Op, u[r + j * BS + k], p[l + j * BS + k]); \
172fcc7397dSJunchao Zhang           p[l + j * BS + k] = tmp; \
173cd620004SJunchao Zhang         } \
174cd620004SJunchao Zhang     } \
175cd620004SJunchao Zhang     PetscFunctionReturn(0); \
176cd620004SJunchao Zhang   }
177cd620004SJunchao Zhang 
178cd620004SJunchao Zhang #define DEF_ScatterAndOp(Type, BS, EQ, Opname, Op, OpApply) \
179*9371c9d4SSatish Balay   static PetscErrorCode CPPJoin4(ScatterAnd##Opname, Type, BS, EQ)(PetscSFLink link, PetscInt count, PetscInt srcStart, PetscSFPackOpt srcOpt, const PetscInt *srcIdx, const void *src, PetscInt dstStart, PetscSFPackOpt dstOpt, const PetscInt *dstIdx, void *dst) { \
180fcc7397dSJunchao Zhang     const Type    *u = (const Type *)src; \
181fcc7397dSJunchao Zhang     Type          *v = (Type *)dst; \
182fcc7397dSJunchao Zhang     PetscInt       i, j, k, s, t, X, Y, bs = link->bs; \
183cd620004SJunchao Zhang     const PetscInt M   = (EQ) ? 1 : bs / BS; \
184cd620004SJunchao Zhang     const PetscInt MBS = M * BS; \
185cd620004SJunchao Zhang     PetscFunctionBegin; \
186fcc7397dSJunchao Zhang     if (!srcIdx) { /* src is contiguous */ \
187fcc7397dSJunchao Zhang       u += srcStart * MBS; \
1889566063dSJacob Faibussowitsch       PetscCall(CPPJoin4(UnpackAnd##Opname, Type, BS, EQ)(link, count, dstStart, dstOpt, dstIdx, dst, u)); \
189fcc7397dSJunchao Zhang     } else if (srcOpt && !dstIdx) { /* src is 3D, dst is contiguous */ \
190fcc7397dSJunchao Zhang       u += srcOpt->start[0] * MBS; \
191fcc7397dSJunchao Zhang       v += dstStart * MBS; \
192*9371c9d4SSatish Balay       X = srcOpt->X[0]; \
193*9371c9d4SSatish Balay       Y = srcOpt->Y[0]; \
194fcc7397dSJunchao Zhang       for (k = 0; k < srcOpt->dz[0]; k++) \
195fcc7397dSJunchao Zhang         for (j = 0; j < srcOpt->dy[0]; j++) { \
196fcc7397dSJunchao Zhang           for (i = 0; i < srcOpt->dx[0] * MBS; i++) OpApply(Op, v[i], u[(X * Y * k + X * j) * MBS + i]); \
197fcc7397dSJunchao Zhang           v += srcOpt->dx[0] * MBS; \
198fcc7397dSJunchao Zhang         } \
199fcc7397dSJunchao Zhang     } else { /* all other cases */ \
200fcc7397dSJunchao Zhang       for (i = 0; i < count; i++) { \
201fcc7397dSJunchao Zhang         s = (!srcIdx ? srcStart + i : srcIdx[i]) * MBS; \
202fcc7397dSJunchao Zhang         t = (!dstIdx ? dstStart + i : dstIdx[i]) * MBS; \
203cd620004SJunchao Zhang         for (j = 0; j < M; j++) \
204fcc7397dSJunchao Zhang           for (k = 0; k < BS; k++) OpApply(Op, v[t + j * BS + k], u[s + j * BS + k]); \
205fcc7397dSJunchao Zhang       } \
206cd620004SJunchao Zhang     } \
207cd620004SJunchao Zhang     PetscFunctionReturn(0); \
208cd620004SJunchao Zhang   }
209cd620004SJunchao Zhang 
210cd620004SJunchao Zhang #define DEF_FetchAndOpLocal(Type, BS, EQ, Opname, Op, OpApply) \
211*9371c9d4SSatish Balay   static PetscErrorCode CPPJoin4(FetchAnd##Opname##Local, Type, BS, EQ)(PetscSFLink link, PetscInt count, PetscInt rootstart, PetscSFPackOpt rootopt, const PetscInt *rootidx, void *rootdata, PetscInt leafstart, PetscSFPackOpt leafopt, const PetscInt *leafidx, const void *leafdata, void *leafupdate) { \
212fcc7397dSJunchao Zhang     Type          *rdata = (Type *)rootdata, *lupdate = (Type *)leafupdate; \
213fcc7397dSJunchao Zhang     const Type    *ldata = (const Type *)leafdata; \
214fcc7397dSJunchao Zhang     PetscInt       i, j, k, r, l, bs = link->bs; \
215cd620004SJunchao Zhang     const PetscInt M   = (EQ) ? 1 : bs / BS; \
216cd620004SJunchao Zhang     const PetscInt MBS = M * BS; \
217cd620004SJunchao Zhang     PetscFunctionBegin; \
218fcc7397dSJunchao Zhang     for (i = 0; i < count; i++) { \
219fcc7397dSJunchao Zhang       r = (rootidx ? rootidx[i] : rootstart + i) * MBS; \
220fcc7397dSJunchao Zhang       l = (leafidx ? leafidx[i] : leafstart + i) * MBS; \
221cd620004SJunchao Zhang       for (j = 0; j < M; j++) \
222cd620004SJunchao Zhang         for (k = 0; k < BS; k++) { \
223fcc7397dSJunchao Zhang           lupdate[l + j * BS + k] = rdata[r + j * BS + k]; \
224fcc7397dSJunchao Zhang           OpApply(Op, rdata[r + j * BS + k], ldata[l + j * BS + k]); \
22540e23c03SJunchao Zhang         } \
22640e23c03SJunchao Zhang     } \
22740e23c03SJunchao Zhang     PetscFunctionReturn(0); \
22840e23c03SJunchao Zhang   }
22940e23c03SJunchao Zhang 
230b23bfdefSJunchao Zhang /* Pack, Unpack/Fetch ops */
231b23bfdefSJunchao Zhang #define DEF_Pack(Type, BS, EQ) \
232*9371c9d4SSatish Balay   DEF_PackFunc(Type, BS, EQ) DEF_UnpackFunc(Type, BS, EQ) DEF_ScatterAndOp(Type, BS, EQ, Insert, =, OP_ASSIGN) static void CPPJoin4(PackInit_Pack, Type, BS, EQ)(PetscSFLink link) { \
233eb02082bSJunchao Zhang     link->h_Pack             = CPPJoin4(Pack, Type, BS, EQ); \
234eb02082bSJunchao Zhang     link->h_UnpackAndInsert  = CPPJoin4(UnpackAndInsert, Type, BS, EQ); \
235cd620004SJunchao Zhang     link->h_ScatterAndInsert = CPPJoin4(ScatterAndInsert, Type, BS, EQ); \
23640e23c03SJunchao Zhang   }
23740e23c03SJunchao Zhang 
238b23bfdefSJunchao Zhang /* Add, Mult ops */
239b23bfdefSJunchao Zhang #define DEF_Add(Type, BS, EQ) \
240*9371c9d4SSatish Balay   DEF_UnpackAndOp(Type, BS, EQ, Add, +, OP_BINARY) DEF_UnpackAndOp(Type, BS, EQ, Mult, *, OP_BINARY) DEF_FetchAndOp(Type, BS, EQ, Add, +, OP_BINARY) DEF_ScatterAndOp(Type, BS, EQ, Add, +, OP_BINARY) DEF_ScatterAndOp(Type, BS, EQ, Mult, *, OP_BINARY) DEF_FetchAndOpLocal(Type, BS, EQ, Add, +, OP_BINARY) static void CPPJoin4(PackInit_Add, Type, BS, EQ)(PetscSFLink link) { \
241eb02082bSJunchao Zhang     link->h_UnpackAndAdd     = CPPJoin4(UnpackAndAdd, Type, BS, EQ); \
242eb02082bSJunchao Zhang     link->h_UnpackAndMult    = CPPJoin4(UnpackAndMult, Type, BS, EQ); \
243eb02082bSJunchao Zhang     link->h_FetchAndAdd      = CPPJoin4(FetchAndAdd, Type, BS, EQ); \
244cd620004SJunchao Zhang     link->h_ScatterAndAdd    = CPPJoin4(ScatterAndAdd, Type, BS, EQ); \
245cd620004SJunchao Zhang     link->h_ScatterAndMult   = CPPJoin4(ScatterAndMult, Type, BS, EQ); \
246cd620004SJunchao Zhang     link->h_FetchAndAddLocal = CPPJoin4(FetchAndAddLocal, Type, BS, EQ); \
24740e23c03SJunchao Zhang   }
24840e23c03SJunchao Zhang 
249b23bfdefSJunchao Zhang /* Max, Min ops */
250b23bfdefSJunchao Zhang #define DEF_Cmp(Type, BS, EQ) \
251*9371c9d4SSatish Balay   DEF_UnpackAndOp(Type, BS, EQ, Max, PetscMax, OP_FUNCTION) DEF_UnpackAndOp(Type, BS, EQ, Min, PetscMin, OP_FUNCTION) DEF_ScatterAndOp(Type, BS, EQ, Max, PetscMax, OP_FUNCTION) DEF_ScatterAndOp(Type, BS, EQ, Min, PetscMin, OP_FUNCTION) static void CPPJoin4(PackInit_Compare, Type, BS, EQ)(PetscSFLink link) { \
252eb02082bSJunchao Zhang     link->h_UnpackAndMax  = CPPJoin4(UnpackAndMax, Type, BS, EQ); \
253eb02082bSJunchao Zhang     link->h_UnpackAndMin  = CPPJoin4(UnpackAndMin, Type, BS, EQ); \
254cd620004SJunchao Zhang     link->h_ScatterAndMax = CPPJoin4(ScatterAndMax, Type, BS, EQ); \
255cd620004SJunchao Zhang     link->h_ScatterAndMin = CPPJoin4(ScatterAndMin, Type, BS, EQ); \
256b23bfdefSJunchao Zhang   }
257b23bfdefSJunchao Zhang 
258b23bfdefSJunchao Zhang /* Logical ops.
259cd620004SJunchao Zhang   The operator in OP_LXOR should be empty but is ||. It is not used. Put here to avoid
26040e23c03SJunchao Zhang   the compilation warning "empty macro arguments are undefined in ISO C90"
26140e23c03SJunchao Zhang  */
262b23bfdefSJunchao Zhang #define DEF_Log(Type, BS, EQ) \
263*9371c9d4SSatish Balay   DEF_UnpackAndOp(Type, BS, EQ, LAND, &&, OP_BINARY) DEF_UnpackAndOp(Type, BS, EQ, LOR, ||, OP_BINARY) DEF_UnpackAndOp(Type, BS, EQ, LXOR, ||, OP_LXOR) DEF_ScatterAndOp(Type, BS, EQ, LAND, &&, OP_BINARY) DEF_ScatterAndOp(Type, BS, EQ, LOR, ||, OP_BINARY) DEF_ScatterAndOp(Type, BS, EQ, LXOR, ||, OP_LXOR) static void CPPJoin4(PackInit_Logical, Type, BS, EQ)(PetscSFLink link) { \
264eb02082bSJunchao Zhang     link->h_UnpackAndLAND  = CPPJoin4(UnpackAndLAND, Type, BS, EQ); \
265eb02082bSJunchao Zhang     link->h_UnpackAndLOR   = CPPJoin4(UnpackAndLOR, Type, BS, EQ); \
266eb02082bSJunchao Zhang     link->h_UnpackAndLXOR  = CPPJoin4(UnpackAndLXOR, Type, BS, EQ); \
267cd620004SJunchao Zhang     link->h_ScatterAndLAND = CPPJoin4(ScatterAndLAND, Type, BS, EQ); \
268cd620004SJunchao Zhang     link->h_ScatterAndLOR  = CPPJoin4(ScatterAndLOR, Type, BS, EQ); \
269cd620004SJunchao Zhang     link->h_ScatterAndLXOR = CPPJoin4(ScatterAndLXOR, Type, BS, EQ); \
27040e23c03SJunchao Zhang   }
27140e23c03SJunchao Zhang 
272b23bfdefSJunchao Zhang /* Bitwise ops */
273b23bfdefSJunchao Zhang #define DEF_Bit(Type, BS, EQ) \
274*9371c9d4SSatish Balay   DEF_UnpackAndOp(Type, BS, EQ, BAND, &, OP_BINARY) DEF_UnpackAndOp(Type, BS, EQ, BOR, |, OP_BINARY) DEF_UnpackAndOp(Type, BS, EQ, BXOR, ^, OP_BINARY) DEF_ScatterAndOp(Type, BS, EQ, BAND, &, OP_BINARY) DEF_ScatterAndOp(Type, BS, EQ, BOR, |, OP_BINARY) DEF_ScatterAndOp(Type, BS, EQ, BXOR, ^, OP_BINARY) static void CPPJoin4(PackInit_Bitwise, Type, BS, EQ)(PetscSFLink link) { \
275eb02082bSJunchao Zhang     link->h_UnpackAndBAND  = CPPJoin4(UnpackAndBAND, Type, BS, EQ); \
276eb02082bSJunchao Zhang     link->h_UnpackAndBOR   = CPPJoin4(UnpackAndBOR, Type, BS, EQ); \
277eb02082bSJunchao Zhang     link->h_UnpackAndBXOR  = CPPJoin4(UnpackAndBXOR, Type, BS, EQ); \
278cd620004SJunchao Zhang     link->h_ScatterAndBAND = CPPJoin4(ScatterAndBAND, Type, BS, EQ); \
279cd620004SJunchao Zhang     link->h_ScatterAndBOR  = CPPJoin4(ScatterAndBOR, Type, BS, EQ); \
280cd620004SJunchao Zhang     link->h_ScatterAndBXOR = CPPJoin4(ScatterAndBXOR, Type, BS, EQ); \
28140e23c03SJunchao Zhang   }
28240e23c03SJunchao Zhang 
283cd620004SJunchao Zhang /* Maxloc, Minloc ops */
284cd620004SJunchao Zhang #define DEF_Xloc(Type, BS, EQ) \
285*9371c9d4SSatish Balay   DEF_UnpackAndOp(Type, BS, EQ, Max, >, OP_XLOC) DEF_UnpackAndOp(Type, BS, EQ, Min, <, OP_XLOC) DEF_ScatterAndOp(Type, BS, EQ, Max, >, OP_XLOC) DEF_ScatterAndOp(Type, BS, EQ, Min, <, OP_XLOC) static void CPPJoin4(PackInit_Xloc, Type, BS, EQ)(PetscSFLink link) { \
286cd620004SJunchao Zhang     link->h_UnpackAndMaxloc  = CPPJoin4(UnpackAndMax, Type, BS, EQ); \
287cd620004SJunchao Zhang     link->h_UnpackAndMinloc  = CPPJoin4(UnpackAndMin, Type, BS, EQ); \
288cd620004SJunchao Zhang     link->h_ScatterAndMaxloc = CPPJoin4(ScatterAndMax, Type, BS, EQ); \
289cd620004SJunchao Zhang     link->h_ScatterAndMinloc = CPPJoin4(ScatterAndMin, Type, BS, EQ); \
29040e23c03SJunchao Zhang   }
29140e23c03SJunchao Zhang 
292b23bfdefSJunchao Zhang #define DEF_IntegerType(Type, BS, EQ) \
293*9371c9d4SSatish Balay   DEF_Pack(Type, BS, EQ) DEF_Add(Type, BS, EQ) DEF_Cmp(Type, BS, EQ) DEF_Log(Type, BS, EQ) DEF_Bit(Type, BS, EQ) static void CPPJoin4(PackInit_IntegerType, Type, BS, EQ)(PetscSFLink link) { \
294b23bfdefSJunchao Zhang     CPPJoin4(PackInit_Pack, Type, BS, EQ)(link); \
295b23bfdefSJunchao Zhang     CPPJoin4(PackInit_Add, Type, BS, EQ)(link); \
296b23bfdefSJunchao Zhang     CPPJoin4(PackInit_Compare, Type, BS, EQ)(link); \
297b23bfdefSJunchao Zhang     CPPJoin4(PackInit_Logical, Type, BS, EQ)(link); \
298b23bfdefSJunchao Zhang     CPPJoin4(PackInit_Bitwise, Type, BS, EQ)(link); \
29940e23c03SJunchao Zhang   }
30040e23c03SJunchao Zhang 
301b23bfdefSJunchao Zhang #define DEF_RealType(Type, BS, EQ) \
302*9371c9d4SSatish Balay   DEF_Pack(Type, BS, EQ) DEF_Add(Type, BS, EQ) DEF_Cmp(Type, BS, EQ) static void CPPJoin4(PackInit_RealType, Type, BS, EQ)(PetscSFLink link) { \
303b23bfdefSJunchao Zhang     CPPJoin4(PackInit_Pack, Type, BS, EQ)(link); \
304b23bfdefSJunchao Zhang     CPPJoin4(PackInit_Add, Type, BS, EQ)(link); \
305b23bfdefSJunchao Zhang     CPPJoin4(PackInit_Compare, Type, BS, EQ)(link); \
306b23bfdefSJunchao Zhang   }
30740e23c03SJunchao Zhang 
30840e23c03SJunchao Zhang #if defined(PETSC_HAVE_COMPLEX)
309b23bfdefSJunchao Zhang #define DEF_ComplexType(Type, BS, EQ) \
310*9371c9d4SSatish Balay   DEF_Pack(Type, BS, EQ) DEF_Add(Type, BS, EQ) static void CPPJoin4(PackInit_ComplexType, Type, BS, EQ)(PetscSFLink link) { \
311b23bfdefSJunchao Zhang     CPPJoin4(PackInit_Pack, Type, BS, EQ)(link); \
312b23bfdefSJunchao Zhang     CPPJoin4(PackInit_Add, Type, BS, EQ)(link); \
313b23bfdefSJunchao Zhang   }
31440e23c03SJunchao Zhang #endif
315b23bfdefSJunchao Zhang 
316b23bfdefSJunchao Zhang #define DEF_DumbType(Type, BS, EQ) \
317*9371c9d4SSatish Balay   DEF_Pack(Type, BS, EQ) static void CPPJoin4(PackInit_DumbType, Type, BS, EQ)(PetscSFLink link) { \
318b23bfdefSJunchao Zhang     CPPJoin4(PackInit_Pack, Type, BS, EQ)(link); \
319b23bfdefSJunchao Zhang   }
320b23bfdefSJunchao Zhang 
321b23bfdefSJunchao Zhang /* Maxloc, Minloc */
322cd620004SJunchao Zhang #define DEF_PairType(Type, BS, EQ) \
323*9371c9d4SSatish Balay   DEF_Pack(Type, BS, EQ) DEF_Xloc(Type, BS, EQ) static void CPPJoin4(PackInit_PairType, Type, BS, EQ)(PetscSFLink link) { \
324cd620004SJunchao Zhang     CPPJoin4(PackInit_Pack, Type, BS, EQ)(link); \
325cd620004SJunchao Zhang     CPPJoin4(PackInit_Xloc, Type, BS, EQ)(link); \
326b23bfdefSJunchao Zhang   }
327b23bfdefSJunchao Zhang 
328b23bfdefSJunchao Zhang DEF_IntegerType(PetscInt, 1, 1)   /* unit = 1 MPIU_INT  */
329b23bfdefSJunchao Zhang   DEF_IntegerType(PetscInt, 2, 1) /* unit = 2 MPIU_INTs */
330b23bfdefSJunchao Zhang   DEF_IntegerType(PetscInt, 4, 1) /* unit = 4 MPIU_INTs */
331b23bfdefSJunchao Zhang   DEF_IntegerType(PetscInt, 8, 1) /* unit = 8 MPIU_INTs */
332b23bfdefSJunchao Zhang   DEF_IntegerType(PetscInt, 1, 0) /* unit = 1*n MPIU_INTs, n>1 */
333b23bfdefSJunchao Zhang   DEF_IntegerType(PetscInt, 2, 0) /* unit = 2*n MPIU_INTs, n>1 */
334b23bfdefSJunchao Zhang   DEF_IntegerType(PetscInt, 4, 0) /* unit = 4*n MPIU_INTs, n>1 */
335b23bfdefSJunchao Zhang   DEF_IntegerType(PetscInt, 8, 0) /* unit = 8*n MPIU_INTs, n>1. Routines with bigger BS are tried first. */
336b23bfdefSJunchao Zhang 
337b23bfdefSJunchao Zhang #if defined(PETSC_USE_64BIT_INDICES) /* Do not need (though it is OK) to generate redundant functions if PetscInt is int */
338*9371c9d4SSatish Balay   DEF_IntegerType(int, 1, 1) DEF_IntegerType(int, 2, 1) DEF_IntegerType(int, 4, 1) DEF_IntegerType(int, 8, 1) DEF_IntegerType(int, 1, 0) DEF_IntegerType(int, 2, 0) DEF_IntegerType(int, 4, 0) DEF_IntegerType(int, 8, 0)
339b23bfdefSJunchao Zhang #endif
340b23bfdefSJunchao Zhang 
341b23bfdefSJunchao Zhang   /* The typedefs are used to get a typename without space that CPPJoin can handle */
342b23bfdefSJunchao Zhang   typedef signed char SignedChar;
343*9371c9d4SSatish Balay DEF_IntegerType(SignedChar, 1, 1) DEF_IntegerType(SignedChar, 2, 1) DEF_IntegerType(SignedChar, 4, 1) DEF_IntegerType(SignedChar, 8, 1) DEF_IntegerType(SignedChar, 1, 0) DEF_IntegerType(SignedChar, 2, 0) DEF_IntegerType(SignedChar, 4, 0) DEF_IntegerType(SignedChar, 8, 0)
344b23bfdefSJunchao Zhang 
345b23bfdefSJunchao Zhang   typedef unsigned char UnsignedChar;
346*9371c9d4SSatish Balay DEF_IntegerType(UnsignedChar, 1, 1) DEF_IntegerType(UnsignedChar, 2, 1) DEF_IntegerType(UnsignedChar, 4, 1) DEF_IntegerType(UnsignedChar, 8, 1) DEF_IntegerType(UnsignedChar, 1, 0) DEF_IntegerType(UnsignedChar, 2, 0) DEF_IntegerType(UnsignedChar, 4, 0) DEF_IntegerType(UnsignedChar, 8, 0)
347b23bfdefSJunchao Zhang 
348*9371c9d4SSatish Balay   DEF_RealType(PetscReal, 1, 1) DEF_RealType(PetscReal, 2, 1) DEF_RealType(PetscReal, 4, 1) DEF_RealType(PetscReal, 8, 1) DEF_RealType(PetscReal, 1, 0) DEF_RealType(PetscReal, 2, 0) DEF_RealType(PetscReal, 4, 0) DEF_RealType(PetscReal, 8, 0)
349b23bfdefSJunchao Zhang #if defined(PETSC_HAVE_COMPLEX)
350*9371c9d4SSatish Balay     DEF_ComplexType(PetscComplex, 1, 1) DEF_ComplexType(PetscComplex, 2, 1) DEF_ComplexType(PetscComplex, 4, 1) DEF_ComplexType(PetscComplex, 8, 1) DEF_ComplexType(PetscComplex, 1, 0) DEF_ComplexType(PetscComplex, 2, 0) DEF_ComplexType(PetscComplex, 4, 0) DEF_ComplexType(PetscComplex, 8, 0)
351b23bfdefSJunchao Zhang #endif
352b23bfdefSJunchao Zhang 
353cd620004SJunchao Zhang #define PairType(Type1, Type2) Type1##_##Type2
354*9371c9d4SSatish Balay       typedef struct {
355*9371c9d4SSatish Balay   int u;
356*9371c9d4SSatish Balay   int i;
357*9371c9d4SSatish Balay } PairType(int, int);
358*9371c9d4SSatish Balay typedef struct {
359*9371c9d4SSatish Balay   PetscInt u;
360*9371c9d4SSatish Balay   PetscInt i;
361*9371c9d4SSatish Balay } PairType(PetscInt, PetscInt);
362*9371c9d4SSatish Balay DEF_PairType(PairType(int, int), 1, 1) DEF_PairType(PairType(PetscInt, PetscInt), 1, 1)
363b23bfdefSJunchao Zhang 
364b23bfdefSJunchao Zhang   /* If we don't know the basic type, we treat it as a stream of chars or ints */
365*9371c9d4SSatish Balay   DEF_DumbType(char, 1, 1) DEF_DumbType(char, 2, 1) DEF_DumbType(char, 4, 1) DEF_DumbType(char, 1, 0) DEF_DumbType(char, 2, 0) DEF_DumbType(char, 4, 0)
366b23bfdefSJunchao Zhang 
367eb02082bSJunchao Zhang     typedef int DumbInt; /* To have a different name than 'int' used above. The name is used to make routine names. */
368*9371c9d4SSatish Balay DEF_DumbType(DumbInt, 1, 1) DEF_DumbType(DumbInt, 2, 1) DEF_DumbType(DumbInt, 4, 1) DEF_DumbType(DumbInt, 8, 1) DEF_DumbType(DumbInt, 1, 0) DEF_DumbType(DumbInt, 2, 0) DEF_DumbType(DumbInt, 4, 0) DEF_DumbType(DumbInt, 8, 0)
36940e23c03SJunchao Zhang 
370*9371c9d4SSatish Balay   PetscErrorCode PetscSFLinkDestroy(PetscSF sf, PetscSFLink link) {
371cd620004SJunchao Zhang   PetscSF_Basic *bas = (PetscSF_Basic *)sf->data;
37271438e86SJunchao Zhang   PetscInt       i, nreqs = (bas->nrootreqs + sf->nleafreqs) * 8;
37371438e86SJunchao Zhang 
37471438e86SJunchao Zhang   PetscFunctionBegin;
37571438e86SJunchao Zhang   /* Destroy device-specific fields */
3769566063dSJacob Faibussowitsch   if (link->deviceinited) PetscCall((*link->Destroy)(sf, link));
37771438e86SJunchao Zhang 
37871438e86SJunchao Zhang   /* Destroy host related fields */
3799566063dSJacob Faibussowitsch   if (!link->isbuiltin) PetscCallMPI(MPI_Type_free(&link->unit));
38071438e86SJunchao Zhang   if (!link->use_nvshmem) {
38171438e86SJunchao Zhang     for (i = 0; i < nreqs; i++) { /* Persistent reqs must be freed. */
3829566063dSJacob Faibussowitsch       if (link->reqs[i] != MPI_REQUEST_NULL) PetscCallMPI(MPI_Request_free(&link->reqs[i]));
38371438e86SJunchao Zhang     }
3849566063dSJacob Faibussowitsch     PetscCall(PetscFree(link->reqs));
38571438e86SJunchao Zhang     for (i = PETSCSF_LOCAL; i <= PETSCSF_REMOTE; i++) {
3869566063dSJacob Faibussowitsch       PetscCall(PetscFree(link->rootbuf_alloc[i][PETSC_MEMTYPE_HOST]));
3879566063dSJacob Faibussowitsch       PetscCall(PetscFree(link->leafbuf_alloc[i][PETSC_MEMTYPE_HOST]));
38871438e86SJunchao Zhang     }
38971438e86SJunchao Zhang   }
3909566063dSJacob Faibussowitsch   PetscCall(PetscFree(link));
39171438e86SJunchao Zhang   PetscFunctionReturn(0);
39271438e86SJunchao Zhang }
39371438e86SJunchao Zhang 
394*9371c9d4SSatish Balay PetscErrorCode PetscSFLinkCreate(PetscSF sf, MPI_Datatype unit, PetscMemType rootmtype, const void *rootdata, PetscMemType leafmtype, const void *leafdata, MPI_Op op, PetscSFOperation sfop, PetscSFLink *mylink) {
395cd620004SJunchao Zhang   PetscFunctionBegin;
3969566063dSJacob Faibussowitsch   PetscCall(PetscSFSetErrorOnUnsupportedOverlap(sf, unit, rootdata, leafdata));
39771438e86SJunchao Zhang #if defined(PETSC_HAVE_NVSHMEM)
39871438e86SJunchao Zhang   {
39971438e86SJunchao Zhang     PetscBool use_nvshmem;
4009566063dSJacob Faibussowitsch     PetscCall(PetscSFLinkNvshmemCheck(sf, rootmtype, rootdata, leafmtype, leafdata, &use_nvshmem));
40171438e86SJunchao Zhang     if (use_nvshmem) {
4029566063dSJacob Faibussowitsch       PetscCall(PetscSFLinkCreate_NVSHMEM(sf, unit, rootmtype, rootdata, leafmtype, leafdata, op, sfop, mylink));
40371438e86SJunchao Zhang       PetscFunctionReturn(0);
404cd620004SJunchao Zhang     }
405cd620004SJunchao Zhang   }
4067fd2d3dbSJunchao Zhang #endif
4079566063dSJacob Faibussowitsch   PetscCall(PetscSFLinkCreate_MPI(sf, unit, rootmtype, rootdata, leafmtype, leafdata, op, sfop, mylink));
408cd620004SJunchao Zhang   PetscFunctionReturn(0);
409cd620004SJunchao Zhang }
410cd620004SJunchao Zhang 
411cd620004SJunchao Zhang /* Return root/leaf buffers and MPI requests attached to the link for MPI communication in the given direction.
412cd620004SJunchao Zhang    If the sf uses persistent requests and the requests have not been initialized, then initialize them.
413cd620004SJunchao Zhang */
414*9371c9d4SSatish Balay PetscErrorCode PetscSFLinkGetMPIBuffersAndRequests(PetscSF sf, PetscSFLink link, PetscSFDirection direction, void **rootbuf, void **leafbuf, MPI_Request **rootreqs, MPI_Request **leafreqs) {
415cd620004SJunchao Zhang   PetscSF_Basic     *bas = (PetscSF_Basic *)sf->data;
416c87b50c4SJunchao Zhang   PetscInt           i, j, cnt, nrootranks, ndrootranks, nleafranks, ndleafranks;
417cd620004SJunchao Zhang   const PetscInt    *rootoffset, *leafoffset;
418cd620004SJunchao Zhang   MPI_Aint           disp;
419cd620004SJunchao Zhang   MPI_Comm           comm          = PetscObjectComm((PetscObject)sf);
420cd620004SJunchao Zhang   MPI_Datatype       unit          = link->unit;
421cd620004SJunchao Zhang   const PetscMemType rootmtype_mpi = link->rootmtype_mpi, leafmtype_mpi = link->leafmtype_mpi; /* Used to select buffers passed to MPI */
422cd620004SJunchao Zhang   const PetscInt     rootdirect_mpi = link->rootdirect_mpi, leafdirect_mpi = link->leafdirect_mpi;
423cd620004SJunchao Zhang 
424cd620004SJunchao Zhang   PetscFunctionBegin;
425cd620004SJunchao Zhang   /* Init persistent MPI requests if not yet. Currently only SFBasic uses persistent MPI */
426cd620004SJunchao Zhang   if (sf->persistent) {
427cd620004SJunchao Zhang     if (rootreqs && bas->rootbuflen[PETSCSF_REMOTE] && !link->rootreqsinited[direction][rootmtype_mpi][rootdirect_mpi]) {
4289566063dSJacob Faibussowitsch       PetscCall(PetscSFGetRootInfo_Basic(sf, &nrootranks, &ndrootranks, NULL, &rootoffset, NULL));
429cd620004SJunchao Zhang       if (direction == PETSCSF_LEAF2ROOT) {
430cd620004SJunchao Zhang         for (i = ndrootranks, j = 0; i < nrootranks; i++, j++) {
431cd620004SJunchao Zhang           disp = (rootoffset[i] - rootoffset[ndrootranks]) * link->unitbytes;
432c87b50c4SJunchao Zhang           cnt  = rootoffset[i + 1] - rootoffset[i];
4339566063dSJacob Faibussowitsch           PetscCallMPI(MPIU_Recv_init(link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi] + disp, cnt, unit, bas->iranks[i], link->tag, comm, link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi] + j));
434cd620004SJunchao Zhang         }
435cd620004SJunchao Zhang       } else { /* PETSCSF_ROOT2LEAF */
436cd620004SJunchao Zhang         for (i = ndrootranks, j = 0; i < nrootranks; i++, j++) {
437cd620004SJunchao Zhang           disp = (rootoffset[i] - rootoffset[ndrootranks]) * link->unitbytes;
438c87b50c4SJunchao Zhang           cnt  = rootoffset[i + 1] - rootoffset[i];
4399566063dSJacob Faibussowitsch           PetscCallMPI(MPIU_Send_init(link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi] + disp, cnt, unit, bas->iranks[i], link->tag, comm, link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi] + j));
440cd620004SJunchao Zhang         }
441cd620004SJunchao Zhang       }
442cd620004SJunchao Zhang       link->rootreqsinited[direction][rootmtype_mpi][rootdirect_mpi] = PETSC_TRUE;
443cd620004SJunchao Zhang     }
444cd620004SJunchao Zhang 
445cd620004SJunchao Zhang     if (leafreqs && sf->leafbuflen[PETSCSF_REMOTE] && !link->leafreqsinited[direction][leafmtype_mpi][leafdirect_mpi]) {
4469566063dSJacob Faibussowitsch       PetscCall(PetscSFGetLeafInfo_Basic(sf, &nleafranks, &ndleafranks, NULL, &leafoffset, NULL, NULL));
447cd620004SJunchao Zhang       if (direction == PETSCSF_LEAF2ROOT) {
448cd620004SJunchao Zhang         for (i = ndleafranks, j = 0; i < nleafranks; i++, j++) {
449cd620004SJunchao Zhang           disp = (leafoffset[i] - leafoffset[ndleafranks]) * link->unitbytes;
450c87b50c4SJunchao Zhang           cnt  = leafoffset[i + 1] - leafoffset[i];
4519566063dSJacob Faibussowitsch           PetscCallMPI(MPIU_Send_init(link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi] + disp, cnt, unit, sf->ranks[i], link->tag, comm, link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi] + j));
452cd620004SJunchao Zhang         }
453cd620004SJunchao Zhang       } else { /* PETSCSF_ROOT2LEAF */
454cd620004SJunchao Zhang         for (i = ndleafranks, j = 0; i < nleafranks; i++, j++) {
455cd620004SJunchao Zhang           disp = (leafoffset[i] - leafoffset[ndleafranks]) * link->unitbytes;
456c87b50c4SJunchao Zhang           cnt  = leafoffset[i + 1] - leafoffset[i];
4579566063dSJacob Faibussowitsch           PetscCallMPI(MPIU_Recv_init(link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi] + disp, cnt, unit, sf->ranks[i], link->tag, comm, link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi] + j));
458cd620004SJunchao Zhang         }
459cd620004SJunchao Zhang       }
460cd620004SJunchao Zhang       link->leafreqsinited[direction][leafmtype_mpi][leafdirect_mpi] = PETSC_TRUE;
461cd620004SJunchao Zhang     }
462cd620004SJunchao Zhang   }
463cd620004SJunchao Zhang   if (rootbuf) *rootbuf = link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi];
464cd620004SJunchao Zhang   if (leafbuf) *leafbuf = link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi];
465cd620004SJunchao Zhang   if (rootreqs) *rootreqs = link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi];
466cd620004SJunchao Zhang   if (leafreqs) *leafreqs = link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi];
467cd620004SJunchao Zhang   PetscFunctionReturn(0);
468cd620004SJunchao Zhang }
469cd620004SJunchao Zhang 
470*9371c9d4SSatish Balay PetscErrorCode PetscSFLinkGetInUse(PetscSF sf, MPI_Datatype unit, const void *rootdata, const void *leafdata, PetscCopyMode cmode, PetscSFLink *mylink) {
471cd620004SJunchao Zhang   PetscSFLink    link, *p;
47240e23c03SJunchao Zhang   PetscSF_Basic *bas = (PetscSF_Basic *)sf->data;
47340e23c03SJunchao Zhang 
47440e23c03SJunchao Zhang   PetscFunctionBegin;
47540e23c03SJunchao Zhang   /* Look for types in cache */
47640e23c03SJunchao Zhang   for (p = &bas->inuse; (link = *p); p = &link->next) {
47740e23c03SJunchao Zhang     PetscBool match;
4789566063dSJacob Faibussowitsch     PetscCall(MPIPetsc_Type_compare(unit, link->unit, &match));
479637e6665SJunchao Zhang     if (match && (rootdata == link->rootdata) && (leafdata == link->leafdata)) {
48040e23c03SJunchao Zhang       switch (cmode) {
48140e23c03SJunchao Zhang       case PETSC_OWN_POINTER: *p = link->next; break; /* Remove from inuse list */
48240e23c03SJunchao Zhang       case PETSC_USE_POINTER: break;
48340e23c03SJunchao Zhang       default: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "invalid cmode");
48440e23c03SJunchao Zhang       }
48540e23c03SJunchao Zhang       *mylink = link;
48640e23c03SJunchao Zhang       PetscFunctionReturn(0);
48740e23c03SJunchao Zhang     }
48840e23c03SJunchao Zhang   }
48940e23c03SJunchao Zhang   SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Could not find pack");
49040e23c03SJunchao Zhang }
49140e23c03SJunchao Zhang 
492*9371c9d4SSatish Balay PetscErrorCode PetscSFLinkReclaim(PetscSF sf, PetscSFLink *mylink) {
49340e23c03SJunchao Zhang   PetscSF_Basic *bas  = (PetscSF_Basic *)sf->data;
49471438e86SJunchao Zhang   PetscSFLink    link = *mylink;
49540e23c03SJunchao Zhang 
49640e23c03SJunchao Zhang   PetscFunctionBegin;
49771438e86SJunchao Zhang   link->rootdata = NULL;
49871438e86SJunchao Zhang   link->leafdata = NULL;
49971438e86SJunchao Zhang   link->next     = bas->avail;
50071438e86SJunchao Zhang   bas->avail     = link;
50171438e86SJunchao Zhang   *mylink        = NULL;
502eb02082bSJunchao Zhang   PetscFunctionReturn(0);
503eb02082bSJunchao Zhang }
504eb02082bSJunchao Zhang 
5059d1c8addSJunchao Zhang /* Error out on unsupported overlapped communications */
506*9371c9d4SSatish Balay PetscErrorCode PetscSFSetErrorOnUnsupportedOverlap(PetscSF sf, MPI_Datatype unit, const void *rootdata, const void *leafdata) {
507cd620004SJunchao Zhang   PetscSFLink    link, *p;
5089d1c8addSJunchao Zhang   PetscSF_Basic *bas = (PetscSF_Basic *)sf->data;
5099d1c8addSJunchao Zhang   PetscBool      match;
5109d1c8addSJunchao Zhang 
5119d1c8addSJunchao Zhang   PetscFunctionBegin;
512b458e8f1SJose E. Roman   if (PetscDefined(USE_DEBUG)) {
51318fb5014SJunchao Zhang     /* Look up links in use and error out if there is a match. When both rootdata and leafdata are NULL, ignore
51418fb5014SJunchao Zhang        the potential overlapping since this process does not participate in communication. Overlapping is harmless.
51518fb5014SJunchao Zhang     */
516637e6665SJunchao Zhang     if (rootdata || leafdata) {
5179d1c8addSJunchao Zhang       for (p = &bas->inuse; (link = *p); p = &link->next) {
5189566063dSJacob Faibussowitsch         PetscCall(MPIPetsc_Type_compare(unit, link->unit, &match));
519c9cc58a2SBarry Smith         PetscCheck(!match || rootdata != link->rootdata || leafdata != link->leafdata, PETSC_COMM_SELF, PETSC_ERR_SUP, "Overlapped PetscSF with the same rootdata(%p), leafdata(%p) and data type. Undo the overlapping to avoid the error.", rootdata, leafdata);
5209d1c8addSJunchao Zhang       }
52118fb5014SJunchao Zhang     }
522b458e8f1SJose E. Roman   }
5239d1c8addSJunchao Zhang   PetscFunctionReturn(0);
5249d1c8addSJunchao Zhang }
5259d1c8addSJunchao Zhang 
526*9371c9d4SSatish Balay static PetscErrorCode PetscSFLinkMemcpy_Host(PetscSFLink link, PetscMemType dstmtype, void *dst, PetscMemType srcmtype, const void *src, size_t n) {
52720c24465SJunchao Zhang   PetscFunctionBegin;
5289566063dSJacob Faibussowitsch   if (n) PetscCall(PetscMemcpy(dst, src, n));
52920c24465SJunchao Zhang   PetscFunctionReturn(0);
53020c24465SJunchao Zhang }
53120c24465SJunchao Zhang 
532*9371c9d4SSatish Balay PetscErrorCode PetscSFLinkSetUp_Host(PetscSF sf, PetscSFLink link, MPI_Datatype unit) {
533b23bfdefSJunchao Zhang   PetscInt    nSignedChar = 0, nUnsignedChar = 0, nInt = 0, nPetscInt = 0, nPetscReal = 0;
534b23bfdefSJunchao Zhang   PetscBool   is2Int, is2PetscInt;
53540e23c03SJunchao Zhang   PetscMPIInt ni, na, nd, combiner;
53640e23c03SJunchao Zhang #if defined(PETSC_HAVE_COMPLEX)
537b23bfdefSJunchao Zhang   PetscInt nPetscComplex = 0;
53840e23c03SJunchao Zhang #endif
53940e23c03SJunchao Zhang 
54040e23c03SJunchao Zhang   PetscFunctionBegin;
5419566063dSJacob Faibussowitsch   PetscCall(MPIPetsc_Type_compare_contig(unit, MPI_SIGNED_CHAR, &nSignedChar));
5429566063dSJacob Faibussowitsch   PetscCall(MPIPetsc_Type_compare_contig(unit, MPI_UNSIGNED_CHAR, &nUnsignedChar));
543b23bfdefSJunchao Zhang   /* MPI_CHAR is treated below as a dumb type that does not support reduction according to MPI standard */
5449566063dSJacob Faibussowitsch   PetscCall(MPIPetsc_Type_compare_contig(unit, MPI_INT, &nInt));
5459566063dSJacob Faibussowitsch   PetscCall(MPIPetsc_Type_compare_contig(unit, MPIU_INT, &nPetscInt));
5469566063dSJacob Faibussowitsch   PetscCall(MPIPetsc_Type_compare_contig(unit, MPIU_REAL, &nPetscReal));
54740e23c03SJunchao Zhang #if defined(PETSC_HAVE_COMPLEX)
5489566063dSJacob Faibussowitsch   PetscCall(MPIPetsc_Type_compare_contig(unit, MPIU_COMPLEX, &nPetscComplex));
54940e23c03SJunchao Zhang #endif
5509566063dSJacob Faibussowitsch   PetscCall(MPIPetsc_Type_compare(unit, MPI_2INT, &is2Int));
5519566063dSJacob Faibussowitsch   PetscCall(MPIPetsc_Type_compare(unit, MPIU_2INT, &is2PetscInt));
552b23bfdefSJunchao Zhang   /* TODO: shaell we also handle Fortran MPI_2REAL? */
5539566063dSJacob Faibussowitsch   PetscCallMPI(MPI_Type_get_envelope(unit, &ni, &na, &nd, &combiner));
5545ad15460SJunchao Zhang   link->isbuiltin = (combiner == MPI_COMBINER_NAMED) ? PETSC_TRUE : PETSC_FALSE; /* unit is MPI builtin */
555b23bfdefSJunchao Zhang   link->bs        = 1;                                                           /* default */
55640e23c03SJunchao Zhang 
557eb02082bSJunchao Zhang   if (is2Int) {
558cd620004SJunchao Zhang     PackInit_PairType_int_int_1_1(link);
559eb02082bSJunchao Zhang     link->bs        = 1;
560eb02082bSJunchao Zhang     link->unitbytes = 2 * sizeof(int);
5615ad15460SJunchao Zhang     link->isbuiltin = PETSC_TRUE; /* unit is PETSc builtin */
562eb02082bSJunchao Zhang     link->basicunit = MPI_2INT;
5635ad15460SJunchao Zhang     link->unit      = MPI_2INT;
564eb02082bSJunchao Zhang   } else if (is2PetscInt) { /* TODO: when is2PetscInt and nPetscInt=2, we don't know which path to take. The two paths support different ops. */
565cd620004SJunchao Zhang     PackInit_PairType_PetscInt_PetscInt_1_1(link);
566eb02082bSJunchao Zhang     link->bs        = 1;
567eb02082bSJunchao Zhang     link->unitbytes = 2 * sizeof(PetscInt);
568eb02082bSJunchao Zhang     link->basicunit = MPIU_2INT;
5695ad15460SJunchao Zhang     link->isbuiltin = PETSC_TRUE; /* unit is PETSc builtin */
5705ad15460SJunchao Zhang     link->unit      = MPIU_2INT;
571eb02082bSJunchao Zhang   } else if (nPetscReal) {
572*9371c9d4SSatish Balay     if (nPetscReal == 8) PackInit_RealType_PetscReal_8_1(link);
573*9371c9d4SSatish Balay     else if (nPetscReal % 8 == 0) PackInit_RealType_PetscReal_8_0(link);
574*9371c9d4SSatish Balay     else if (nPetscReal == 4) PackInit_RealType_PetscReal_4_1(link);
575*9371c9d4SSatish Balay     else if (nPetscReal % 4 == 0) PackInit_RealType_PetscReal_4_0(link);
576*9371c9d4SSatish Balay     else if (nPetscReal == 2) PackInit_RealType_PetscReal_2_1(link);
577*9371c9d4SSatish Balay     else if (nPetscReal % 2 == 0) PackInit_RealType_PetscReal_2_0(link);
578*9371c9d4SSatish Balay     else if (nPetscReal == 1) PackInit_RealType_PetscReal_1_1(link);
579*9371c9d4SSatish Balay     else if (nPetscReal % 1 == 0) PackInit_RealType_PetscReal_1_0(link);
580b23bfdefSJunchao Zhang     link->bs        = nPetscReal;
581eb02082bSJunchao Zhang     link->unitbytes = nPetscReal * sizeof(PetscReal);
58240e23c03SJunchao Zhang     link->basicunit = MPIU_REAL;
583*9371c9d4SSatish Balay     if (link->bs == 1) {
584*9371c9d4SSatish Balay       link->isbuiltin = PETSC_TRUE;
585*9371c9d4SSatish Balay       link->unit      = MPIU_REAL;
586*9371c9d4SSatish Balay     }
587b23bfdefSJunchao Zhang   } else if (nPetscInt) {
588*9371c9d4SSatish Balay     if (nPetscInt == 8) PackInit_IntegerType_PetscInt_8_1(link);
589*9371c9d4SSatish Balay     else if (nPetscInt % 8 == 0) PackInit_IntegerType_PetscInt_8_0(link);
590*9371c9d4SSatish Balay     else if (nPetscInt == 4) PackInit_IntegerType_PetscInt_4_1(link);
591*9371c9d4SSatish Balay     else if (nPetscInt % 4 == 0) PackInit_IntegerType_PetscInt_4_0(link);
592*9371c9d4SSatish Balay     else if (nPetscInt == 2) PackInit_IntegerType_PetscInt_2_1(link);
593*9371c9d4SSatish Balay     else if (nPetscInt % 2 == 0) PackInit_IntegerType_PetscInt_2_0(link);
594*9371c9d4SSatish Balay     else if (nPetscInt == 1) PackInit_IntegerType_PetscInt_1_1(link);
595*9371c9d4SSatish Balay     else if (nPetscInt % 1 == 0) PackInit_IntegerType_PetscInt_1_0(link);
596b23bfdefSJunchao Zhang     link->bs        = nPetscInt;
597eb02082bSJunchao Zhang     link->unitbytes = nPetscInt * sizeof(PetscInt);
598b23bfdefSJunchao Zhang     link->basicunit = MPIU_INT;
599*9371c9d4SSatish Balay     if (link->bs == 1) {
600*9371c9d4SSatish Balay       link->isbuiltin = PETSC_TRUE;
601*9371c9d4SSatish Balay       link->unit      = MPIU_INT;
602*9371c9d4SSatish Balay     }
603b23bfdefSJunchao Zhang #if defined(PETSC_USE_64BIT_INDICES)
604b23bfdefSJunchao Zhang   } else if (nInt) {
605*9371c9d4SSatish Balay     if (nInt == 8) PackInit_IntegerType_int_8_1(link);
606*9371c9d4SSatish Balay     else if (nInt % 8 == 0) PackInit_IntegerType_int_8_0(link);
607*9371c9d4SSatish Balay     else if (nInt == 4) PackInit_IntegerType_int_4_1(link);
608*9371c9d4SSatish Balay     else if (nInt % 4 == 0) PackInit_IntegerType_int_4_0(link);
609*9371c9d4SSatish Balay     else if (nInt == 2) PackInit_IntegerType_int_2_1(link);
610*9371c9d4SSatish Balay     else if (nInt % 2 == 0) PackInit_IntegerType_int_2_0(link);
611*9371c9d4SSatish Balay     else if (nInt == 1) PackInit_IntegerType_int_1_1(link);
612*9371c9d4SSatish Balay     else if (nInt % 1 == 0) PackInit_IntegerType_int_1_0(link);
613b23bfdefSJunchao Zhang     link->bs        = nInt;
614eb02082bSJunchao Zhang     link->unitbytes = nInt * sizeof(int);
615b23bfdefSJunchao Zhang     link->basicunit = MPI_INT;
616*9371c9d4SSatish Balay     if (link->bs == 1) {
617*9371c9d4SSatish Balay       link->isbuiltin = PETSC_TRUE;
618*9371c9d4SSatish Balay       link->unit      = MPI_INT;
619*9371c9d4SSatish Balay     }
620b23bfdefSJunchao Zhang #endif
621b23bfdefSJunchao Zhang   } else if (nSignedChar) {
622*9371c9d4SSatish Balay     if (nSignedChar == 8) PackInit_IntegerType_SignedChar_8_1(link);
623*9371c9d4SSatish Balay     else if (nSignedChar % 8 == 0) PackInit_IntegerType_SignedChar_8_0(link);
624*9371c9d4SSatish Balay     else if (nSignedChar == 4) PackInit_IntegerType_SignedChar_4_1(link);
625*9371c9d4SSatish Balay     else if (nSignedChar % 4 == 0) PackInit_IntegerType_SignedChar_4_0(link);
626*9371c9d4SSatish Balay     else if (nSignedChar == 2) PackInit_IntegerType_SignedChar_2_1(link);
627*9371c9d4SSatish Balay     else if (nSignedChar % 2 == 0) PackInit_IntegerType_SignedChar_2_0(link);
628*9371c9d4SSatish Balay     else if (nSignedChar == 1) PackInit_IntegerType_SignedChar_1_1(link);
629*9371c9d4SSatish Balay     else if (nSignedChar % 1 == 0) PackInit_IntegerType_SignedChar_1_0(link);
630b23bfdefSJunchao Zhang     link->bs        = nSignedChar;
631eb02082bSJunchao Zhang     link->unitbytes = nSignedChar * sizeof(SignedChar);
632b23bfdefSJunchao Zhang     link->basicunit = MPI_SIGNED_CHAR;
633*9371c9d4SSatish Balay     if (link->bs == 1) {
634*9371c9d4SSatish Balay       link->isbuiltin = PETSC_TRUE;
635*9371c9d4SSatish Balay       link->unit      = MPI_SIGNED_CHAR;
636*9371c9d4SSatish Balay     }
637b23bfdefSJunchao Zhang   } else if (nUnsignedChar) {
638*9371c9d4SSatish Balay     if (nUnsignedChar == 8) PackInit_IntegerType_UnsignedChar_8_1(link);
639*9371c9d4SSatish Balay     else if (nUnsignedChar % 8 == 0) PackInit_IntegerType_UnsignedChar_8_0(link);
640*9371c9d4SSatish Balay     else if (nUnsignedChar == 4) PackInit_IntegerType_UnsignedChar_4_1(link);
641*9371c9d4SSatish Balay     else if (nUnsignedChar % 4 == 0) PackInit_IntegerType_UnsignedChar_4_0(link);
642*9371c9d4SSatish Balay     else if (nUnsignedChar == 2) PackInit_IntegerType_UnsignedChar_2_1(link);
643*9371c9d4SSatish Balay     else if (nUnsignedChar % 2 == 0) PackInit_IntegerType_UnsignedChar_2_0(link);
644*9371c9d4SSatish Balay     else if (nUnsignedChar == 1) PackInit_IntegerType_UnsignedChar_1_1(link);
645*9371c9d4SSatish Balay     else if (nUnsignedChar % 1 == 0) PackInit_IntegerType_UnsignedChar_1_0(link);
646b23bfdefSJunchao Zhang     link->bs        = nUnsignedChar;
647eb02082bSJunchao Zhang     link->unitbytes = nUnsignedChar * sizeof(UnsignedChar);
648b23bfdefSJunchao Zhang     link->basicunit = MPI_UNSIGNED_CHAR;
649*9371c9d4SSatish Balay     if (link->bs == 1) {
650*9371c9d4SSatish Balay       link->isbuiltin = PETSC_TRUE;
651*9371c9d4SSatish Balay       link->unit      = MPI_UNSIGNED_CHAR;
652*9371c9d4SSatish Balay     }
65340e23c03SJunchao Zhang #if defined(PETSC_HAVE_COMPLEX)
654b23bfdefSJunchao Zhang   } else if (nPetscComplex) {
655*9371c9d4SSatish Balay     if (nPetscComplex == 8) PackInit_ComplexType_PetscComplex_8_1(link);
656*9371c9d4SSatish Balay     else if (nPetscComplex % 8 == 0) PackInit_ComplexType_PetscComplex_8_0(link);
657*9371c9d4SSatish Balay     else if (nPetscComplex == 4) PackInit_ComplexType_PetscComplex_4_1(link);
658*9371c9d4SSatish Balay     else if (nPetscComplex % 4 == 0) PackInit_ComplexType_PetscComplex_4_0(link);
659*9371c9d4SSatish Balay     else if (nPetscComplex == 2) PackInit_ComplexType_PetscComplex_2_1(link);
660*9371c9d4SSatish Balay     else if (nPetscComplex % 2 == 0) PackInit_ComplexType_PetscComplex_2_0(link);
661*9371c9d4SSatish Balay     else if (nPetscComplex == 1) PackInit_ComplexType_PetscComplex_1_1(link);
662*9371c9d4SSatish Balay     else if (nPetscComplex % 1 == 0) PackInit_ComplexType_PetscComplex_1_0(link);
663b23bfdefSJunchao Zhang     link->bs        = nPetscComplex;
664eb02082bSJunchao Zhang     link->unitbytes = nPetscComplex * sizeof(PetscComplex);
66540e23c03SJunchao Zhang     link->basicunit = MPIU_COMPLEX;
666*9371c9d4SSatish Balay     if (link->bs == 1) {
667*9371c9d4SSatish Balay       link->isbuiltin = PETSC_TRUE;
668*9371c9d4SSatish Balay       link->unit      = MPIU_COMPLEX;
669*9371c9d4SSatish Balay     }
67040e23c03SJunchao Zhang #endif
67140e23c03SJunchao Zhang   } else {
672b23bfdefSJunchao Zhang     MPI_Aint lb, nbyte;
6739566063dSJacob Faibussowitsch     PetscCallMPI(MPI_Type_get_extent(unit, &lb, &nbyte));
67408401ef6SPierre Jolivet     PetscCheck(lb == 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "Datatype with nonzero lower bound %ld", (long)lb);
675eb02082bSJunchao Zhang     if (nbyte % sizeof(int)) { /* If the type size is not multiple of int */
676*9371c9d4SSatish Balay       if (nbyte == 4) PackInit_DumbType_char_4_1(link);
677*9371c9d4SSatish Balay       else if (nbyte % 4 == 0) PackInit_DumbType_char_4_0(link);
678*9371c9d4SSatish Balay       else if (nbyte == 2) PackInit_DumbType_char_2_1(link);
679*9371c9d4SSatish Balay       else if (nbyte % 2 == 0) PackInit_DumbType_char_2_0(link);
680*9371c9d4SSatish Balay       else if (nbyte == 1) PackInit_DumbType_char_1_1(link);
681*9371c9d4SSatish Balay       else if (nbyte % 1 == 0) PackInit_DumbType_char_1_0(link);
682eb02082bSJunchao Zhang       link->bs        = nbyte;
683b23bfdefSJunchao Zhang       link->unitbytes = nbyte;
684b23bfdefSJunchao Zhang       link->basicunit = MPI_BYTE;
68540e23c03SJunchao Zhang     } else {
686eb02082bSJunchao Zhang       nInt = nbyte / sizeof(int);
687*9371c9d4SSatish Balay       if (nInt == 8) PackInit_DumbType_DumbInt_8_1(link);
688*9371c9d4SSatish Balay       else if (nInt % 8 == 0) PackInit_DumbType_DumbInt_8_0(link);
689*9371c9d4SSatish Balay       else if (nInt == 4) PackInit_DumbType_DumbInt_4_1(link);
690*9371c9d4SSatish Balay       else if (nInt % 4 == 0) PackInit_DumbType_DumbInt_4_0(link);
691*9371c9d4SSatish Balay       else if (nInt == 2) PackInit_DumbType_DumbInt_2_1(link);
692*9371c9d4SSatish Balay       else if (nInt % 2 == 0) PackInit_DumbType_DumbInt_2_0(link);
693*9371c9d4SSatish Balay       else if (nInt == 1) PackInit_DumbType_DumbInt_1_1(link);
694*9371c9d4SSatish Balay       else if (nInt % 1 == 0) PackInit_DumbType_DumbInt_1_0(link);
695eb02082bSJunchao Zhang       link->bs        = nInt;
696b23bfdefSJunchao Zhang       link->unitbytes = nbyte;
69740e23c03SJunchao Zhang       link->basicunit = MPI_INT;
69840e23c03SJunchao Zhang     }
6995ad15460SJunchao Zhang     if (link->isbuiltin) link->unit = unit;
70040e23c03SJunchao Zhang   }
701b23bfdefSJunchao Zhang 
7029566063dSJacob Faibussowitsch   if (!link->isbuiltin) PetscCallMPI(MPI_Type_dup(unit, &link->unit));
70320c24465SJunchao Zhang 
70420c24465SJunchao Zhang   link->Memcpy = PetscSFLinkMemcpy_Host;
70540e23c03SJunchao Zhang   PetscFunctionReturn(0);
70640e23c03SJunchao Zhang }
70740e23c03SJunchao Zhang 
708*9371c9d4SSatish Balay PetscErrorCode PetscSFLinkGetUnpackAndOp(PetscSFLink link, PetscMemType mtype, MPI_Op op, PetscBool atomic, PetscErrorCode (**UnpackAndOp)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *)) {
70940e23c03SJunchao Zhang   PetscFunctionBegin;
71040e23c03SJunchao Zhang   *UnpackAndOp = NULL;
71171438e86SJunchao Zhang   if (PetscMemTypeHost(mtype)) {
71283df288dSJunchao Zhang     if (op == MPI_REPLACE) *UnpackAndOp = link->h_UnpackAndInsert;
713eb02082bSJunchao Zhang     else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->h_UnpackAndAdd;
714eb02082bSJunchao Zhang     else if (op == MPI_PROD) *UnpackAndOp = link->h_UnpackAndMult;
715eb02082bSJunchao Zhang     else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->h_UnpackAndMax;
716eb02082bSJunchao Zhang     else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->h_UnpackAndMin;
717eb02082bSJunchao Zhang     else if (op == MPI_LAND) *UnpackAndOp = link->h_UnpackAndLAND;
718eb02082bSJunchao Zhang     else if (op == MPI_BAND) *UnpackAndOp = link->h_UnpackAndBAND;
719eb02082bSJunchao Zhang     else if (op == MPI_LOR) *UnpackAndOp = link->h_UnpackAndLOR;
720eb02082bSJunchao Zhang     else if (op == MPI_BOR) *UnpackAndOp = link->h_UnpackAndBOR;
721eb02082bSJunchao Zhang     else if (op == MPI_LXOR) *UnpackAndOp = link->h_UnpackAndLXOR;
722eb02082bSJunchao Zhang     else if (op == MPI_BXOR) *UnpackAndOp = link->h_UnpackAndBXOR;
723eb02082bSJunchao Zhang     else if (op == MPI_MAXLOC) *UnpackAndOp = link->h_UnpackAndMaxloc;
724eb02082bSJunchao Zhang     else if (op == MPI_MINLOC) *UnpackAndOp = link->h_UnpackAndMinloc;
725eb02082bSJunchao Zhang   }
7267fd2d3dbSJunchao Zhang #if defined(PETSC_HAVE_DEVICE)
72771438e86SJunchao Zhang   else if (PetscMemTypeDevice(mtype) && !atomic) {
72883df288dSJunchao Zhang     if (op == MPI_REPLACE) *UnpackAndOp = link->d_UnpackAndInsert;
729eb02082bSJunchao Zhang     else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->d_UnpackAndAdd;
730eb02082bSJunchao Zhang     else if (op == MPI_PROD) *UnpackAndOp = link->d_UnpackAndMult;
731eb02082bSJunchao Zhang     else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->d_UnpackAndMax;
732eb02082bSJunchao Zhang     else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->d_UnpackAndMin;
733eb02082bSJunchao Zhang     else if (op == MPI_LAND) *UnpackAndOp = link->d_UnpackAndLAND;
734eb02082bSJunchao Zhang     else if (op == MPI_BAND) *UnpackAndOp = link->d_UnpackAndBAND;
735eb02082bSJunchao Zhang     else if (op == MPI_LOR) *UnpackAndOp = link->d_UnpackAndLOR;
736eb02082bSJunchao Zhang     else if (op == MPI_BOR) *UnpackAndOp = link->d_UnpackAndBOR;
737eb02082bSJunchao Zhang     else if (op == MPI_LXOR) *UnpackAndOp = link->d_UnpackAndLXOR;
738eb02082bSJunchao Zhang     else if (op == MPI_BXOR) *UnpackAndOp = link->d_UnpackAndBXOR;
739eb02082bSJunchao Zhang     else if (op == MPI_MAXLOC) *UnpackAndOp = link->d_UnpackAndMaxloc;
740eb02082bSJunchao Zhang     else if (op == MPI_MINLOC) *UnpackAndOp = link->d_UnpackAndMinloc;
74171438e86SJunchao Zhang   } else if (PetscMemTypeDevice(mtype) && atomic) {
74283df288dSJunchao Zhang     if (op == MPI_REPLACE) *UnpackAndOp = link->da_UnpackAndInsert;
743eb02082bSJunchao Zhang     else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->da_UnpackAndAdd;
744eb02082bSJunchao Zhang     else if (op == MPI_PROD) *UnpackAndOp = link->da_UnpackAndMult;
745eb02082bSJunchao Zhang     else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->da_UnpackAndMax;
746eb02082bSJunchao Zhang     else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->da_UnpackAndMin;
747eb02082bSJunchao Zhang     else if (op == MPI_LAND) *UnpackAndOp = link->da_UnpackAndLAND;
748eb02082bSJunchao Zhang     else if (op == MPI_BAND) *UnpackAndOp = link->da_UnpackAndBAND;
749eb02082bSJunchao Zhang     else if (op == MPI_LOR) *UnpackAndOp = link->da_UnpackAndLOR;
750eb02082bSJunchao Zhang     else if (op == MPI_BOR) *UnpackAndOp = link->da_UnpackAndBOR;
751eb02082bSJunchao Zhang     else if (op == MPI_LXOR) *UnpackAndOp = link->da_UnpackAndLXOR;
752eb02082bSJunchao Zhang     else if (op == MPI_BXOR) *UnpackAndOp = link->da_UnpackAndBXOR;
753eb02082bSJunchao Zhang     else if (op == MPI_MAXLOC) *UnpackAndOp = link->da_UnpackAndMaxloc;
754eb02082bSJunchao Zhang     else if (op == MPI_MINLOC) *UnpackAndOp = link->da_UnpackAndMinloc;
755eb02082bSJunchao Zhang   }
756eb02082bSJunchao Zhang #endif
75740e23c03SJunchao Zhang   PetscFunctionReturn(0);
75840e23c03SJunchao Zhang }
75940e23c03SJunchao Zhang 
760*9371c9d4SSatish Balay PetscErrorCode PetscSFLinkGetScatterAndOp(PetscSFLink link, PetscMemType mtype, MPI_Op op, PetscBool atomic, PetscErrorCode (**ScatterAndOp)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *)) {
76140e23c03SJunchao Zhang   PetscFunctionBegin;
762cd620004SJunchao Zhang   *ScatterAndOp = NULL;
76371438e86SJunchao Zhang   if (PetscMemTypeHost(mtype)) {
76483df288dSJunchao Zhang     if (op == MPI_REPLACE) *ScatterAndOp = link->h_ScatterAndInsert;
765cd620004SJunchao Zhang     else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->h_ScatterAndAdd;
766cd620004SJunchao Zhang     else if (op == MPI_PROD) *ScatterAndOp = link->h_ScatterAndMult;
767cd620004SJunchao Zhang     else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->h_ScatterAndMax;
768cd620004SJunchao Zhang     else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->h_ScatterAndMin;
769cd620004SJunchao Zhang     else if (op == MPI_LAND) *ScatterAndOp = link->h_ScatterAndLAND;
770cd620004SJunchao Zhang     else if (op == MPI_BAND) *ScatterAndOp = link->h_ScatterAndBAND;
771cd620004SJunchao Zhang     else if (op == MPI_LOR) *ScatterAndOp = link->h_ScatterAndLOR;
772cd620004SJunchao Zhang     else if (op == MPI_BOR) *ScatterAndOp = link->h_ScatterAndBOR;
773cd620004SJunchao Zhang     else if (op == MPI_LXOR) *ScatterAndOp = link->h_ScatterAndLXOR;
774cd620004SJunchao Zhang     else if (op == MPI_BXOR) *ScatterAndOp = link->h_ScatterAndBXOR;
775cd620004SJunchao Zhang     else if (op == MPI_MAXLOC) *ScatterAndOp = link->h_ScatterAndMaxloc;
776cd620004SJunchao Zhang     else if (op == MPI_MINLOC) *ScatterAndOp = link->h_ScatterAndMinloc;
777eb02082bSJunchao Zhang   }
7787fd2d3dbSJunchao Zhang #if defined(PETSC_HAVE_DEVICE)
77971438e86SJunchao Zhang   else if (PetscMemTypeDevice(mtype) && !atomic) {
78083df288dSJunchao Zhang     if (op == MPI_REPLACE) *ScatterAndOp = link->d_ScatterAndInsert;
781cd620004SJunchao Zhang     else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->d_ScatterAndAdd;
782cd620004SJunchao Zhang     else if (op == MPI_PROD) *ScatterAndOp = link->d_ScatterAndMult;
783cd620004SJunchao Zhang     else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->d_ScatterAndMax;
784cd620004SJunchao Zhang     else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->d_ScatterAndMin;
785cd620004SJunchao Zhang     else if (op == MPI_LAND) *ScatterAndOp = link->d_ScatterAndLAND;
786cd620004SJunchao Zhang     else if (op == MPI_BAND) *ScatterAndOp = link->d_ScatterAndBAND;
787cd620004SJunchao Zhang     else if (op == MPI_LOR) *ScatterAndOp = link->d_ScatterAndLOR;
788cd620004SJunchao Zhang     else if (op == MPI_BOR) *ScatterAndOp = link->d_ScatterAndBOR;
789cd620004SJunchao Zhang     else if (op == MPI_LXOR) *ScatterAndOp = link->d_ScatterAndLXOR;
790cd620004SJunchao Zhang     else if (op == MPI_BXOR) *ScatterAndOp = link->d_ScatterAndBXOR;
791cd620004SJunchao Zhang     else if (op == MPI_MAXLOC) *ScatterAndOp = link->d_ScatterAndMaxloc;
792cd620004SJunchao Zhang     else if (op == MPI_MINLOC) *ScatterAndOp = link->d_ScatterAndMinloc;
79371438e86SJunchao Zhang   } else if (PetscMemTypeDevice(mtype) && atomic) {
79483df288dSJunchao Zhang     if (op == MPI_REPLACE) *ScatterAndOp = link->da_ScatterAndInsert;
795cd620004SJunchao Zhang     else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->da_ScatterAndAdd;
796cd620004SJunchao Zhang     else if (op == MPI_PROD) *ScatterAndOp = link->da_ScatterAndMult;
797cd620004SJunchao Zhang     else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->da_ScatterAndMax;
798cd620004SJunchao Zhang     else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->da_ScatterAndMin;
799cd620004SJunchao Zhang     else if (op == MPI_LAND) *ScatterAndOp = link->da_ScatterAndLAND;
800cd620004SJunchao Zhang     else if (op == MPI_BAND) *ScatterAndOp = link->da_ScatterAndBAND;
801cd620004SJunchao Zhang     else if (op == MPI_LOR) *ScatterAndOp = link->da_ScatterAndLOR;
802cd620004SJunchao Zhang     else if (op == MPI_BOR) *ScatterAndOp = link->da_ScatterAndBOR;
803cd620004SJunchao Zhang     else if (op == MPI_LXOR) *ScatterAndOp = link->da_ScatterAndLXOR;
804cd620004SJunchao Zhang     else if (op == MPI_BXOR) *ScatterAndOp = link->da_ScatterAndBXOR;
805cd620004SJunchao Zhang     else if (op == MPI_MAXLOC) *ScatterAndOp = link->da_ScatterAndMaxloc;
806cd620004SJunchao Zhang     else if (op == MPI_MINLOC) *ScatterAndOp = link->da_ScatterAndMinloc;
807eb02082bSJunchao Zhang   }
808eb02082bSJunchao Zhang #endif
809cd620004SJunchao Zhang   PetscFunctionReturn(0);
810cd620004SJunchao Zhang }
811cd620004SJunchao Zhang 
812*9371c9d4SSatish Balay PetscErrorCode PetscSFLinkGetFetchAndOp(PetscSFLink link, PetscMemType mtype, MPI_Op op, PetscBool atomic, PetscErrorCode (**FetchAndOp)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, void *)) {
813cd620004SJunchao Zhang   PetscFunctionBegin;
814cd620004SJunchao Zhang   *FetchAndOp = NULL;
815c9cc58a2SBarry Smith   PetscCheck(op == MPI_SUM || op == MPIU_SUM, PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for MPI_Op in FetchAndOp");
81671438e86SJunchao Zhang   if (PetscMemTypeHost(mtype)) *FetchAndOp = link->h_FetchAndAdd;
8177fd2d3dbSJunchao Zhang #if defined(PETSC_HAVE_DEVICE)
81871438e86SJunchao Zhang   else if (PetscMemTypeDevice(mtype) && !atomic) *FetchAndOp = link->d_FetchAndAdd;
81971438e86SJunchao Zhang   else if (PetscMemTypeDevice(mtype) && atomic) *FetchAndOp = link->da_FetchAndAdd;
820cd620004SJunchao Zhang #endif
821cd620004SJunchao Zhang   PetscFunctionReturn(0);
822cd620004SJunchao Zhang }
823cd620004SJunchao Zhang 
824*9371c9d4SSatish Balay PetscErrorCode PetscSFLinkGetFetchAndOpLocal(PetscSFLink link, PetscMemType mtype, MPI_Op op, PetscBool atomic, PetscErrorCode (**FetchAndOpLocal)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, void *)) {
825cd620004SJunchao Zhang   PetscFunctionBegin;
826cd620004SJunchao Zhang   *FetchAndOpLocal = NULL;
827c9cc58a2SBarry Smith   PetscCheck(op == MPI_SUM || op == MPIU_SUM, PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for MPI_Op in FetchAndOp");
82871438e86SJunchao Zhang   if (PetscMemTypeHost(mtype)) *FetchAndOpLocal = link->h_FetchAndAddLocal;
8297fd2d3dbSJunchao Zhang #if defined(PETSC_HAVE_DEVICE)
83071438e86SJunchao Zhang   else if (PetscMemTypeDevice(mtype) && !atomic) *FetchAndOpLocal = link->d_FetchAndAddLocal;
83171438e86SJunchao Zhang   else if (PetscMemTypeDevice(mtype) && atomic) *FetchAndOpLocal = link->da_FetchAndAddLocal;
832cd620004SJunchao Zhang #endif
833cd620004SJunchao Zhang   PetscFunctionReturn(0);
834cd620004SJunchao Zhang }
835cd620004SJunchao Zhang 
836*9371c9d4SSatish Balay static inline PetscErrorCode PetscSFLinkLogFlopsAfterUnpackRootData(PetscSF sf, PetscSFLink link, PetscSFScope scope, MPI_Op op) {
837cd620004SJunchao Zhang   PetscLogDouble flops;
838cd620004SJunchao Zhang   PetscSF_Basic *bas = (PetscSF_Basic *)sf->data;
839cd620004SJunchao Zhang 
840cd620004SJunchao Zhang   PetscFunctionBegin;
84183df288dSJunchao Zhang   if (op != MPI_REPLACE && link->basicunit == MPIU_SCALAR) { /* op is a reduction on PetscScalars */
842cd620004SJunchao Zhang     flops = bas->rootbuflen[scope] * link->bs;               /* # of roots in buffer x # of scalars in unit */
8437fd2d3dbSJunchao Zhang #if defined(PETSC_HAVE_DEVICE)
844*9371c9d4SSatish Balay     if (PetscMemTypeDevice(link->rootmtype)) PetscCall(PetscLogGpuFlops(flops));
845*9371c9d4SSatish Balay     else
846cd620004SJunchao Zhang #endif
8479566063dSJacob Faibussowitsch       PetscCall(PetscLogFlops(flops));
848cd620004SJunchao Zhang   }
849cd620004SJunchao Zhang   PetscFunctionReturn(0);
850cd620004SJunchao Zhang }
851cd620004SJunchao Zhang 
852*9371c9d4SSatish Balay static inline PetscErrorCode PetscSFLinkLogFlopsAfterUnpackLeafData(PetscSF sf, PetscSFLink link, PetscSFScope scope, MPI_Op op) {
853cd620004SJunchao Zhang   PetscLogDouble flops;
854cd620004SJunchao Zhang 
855cd620004SJunchao Zhang   PetscFunctionBegin;
85683df288dSJunchao Zhang   if (op != MPI_REPLACE && link->basicunit == MPIU_SCALAR) { /* op is a reduction on PetscScalars */
857cd620004SJunchao Zhang     flops = sf->leafbuflen[scope] * link->bs;                /* # of roots in buffer x # of scalars in unit */
8587fd2d3dbSJunchao Zhang #if defined(PETSC_HAVE_DEVICE)
859*9371c9d4SSatish Balay     if (PetscMemTypeDevice(link->leafmtype)) PetscCall(PetscLogGpuFlops(flops));
860*9371c9d4SSatish Balay     else
861cd620004SJunchao Zhang #endif
8629566063dSJacob Faibussowitsch       PetscCall(PetscLogFlops(flops));
863cd620004SJunchao Zhang   }
864cd620004SJunchao Zhang   PetscFunctionReturn(0);
865cd620004SJunchao Zhang }
866cd620004SJunchao Zhang 
867cd620004SJunchao Zhang /* When SF could not find a proper UnpackAndOp() from link, it falls back to MPI_Reduce_local.
8684165533cSJose E. Roman   Input Parameters:
869cd620004SJunchao Zhang   +sf      - The StarForest
870cd620004SJunchao Zhang   .link    - The link
871cd620004SJunchao Zhang   .count   - Number of entries to unpack
872cd620004SJunchao Zhang   .start   - The first index, significent when indices=NULL
873cd620004SJunchao Zhang   .indices - Indices of entries in <data>. If NULL, it means indices are contiguous and the first is given in <start>
874cd620004SJunchao Zhang   .buf     - A contiguous buffer to unpack from
875cd620004SJunchao Zhang   -op      - Operation after unpack
876cd620004SJunchao Zhang 
8774165533cSJose E. Roman   Output Parameters:
878cd620004SJunchao Zhang   .data    - The data to unpack to
879cd620004SJunchao Zhang */
880*9371c9d4SSatish Balay static inline PetscErrorCode PetscSFLinkUnpackDataWithMPIReduceLocal(PetscSF sf, PetscSFLink link, PetscInt count, PetscInt start, const PetscInt *indices, void *data, const void *buf, MPI_Op op) {
881cd620004SJunchao Zhang   PetscFunctionBegin;
882cd620004SJunchao Zhang #if defined(PETSC_HAVE_MPI_REDUCE_LOCAL)
883cd620004SJunchao Zhang   {
884cd620004SJunchao Zhang     PetscInt i;
885cd620004SJunchao Zhang     if (indices) {
886cd620004SJunchao Zhang       /* Note we use link->unit instead of link->basicunit. When op can be mapped to MPI_SUM etc, it operates on
887cd620004SJunchao Zhang          basic units of a root/leaf element-wisely. Otherwise, it is meant to operate on a whole root/leaf.
888cd620004SJunchao Zhang       */
8899566063dSJacob Faibussowitsch       for (i = 0; i < count; i++) PetscCallMPI(MPI_Reduce_local((const char *)buf + i * link->unitbytes, (char *)data + indices[i] * link->unitbytes, 1, link->unit, op));
890cd620004SJunchao Zhang     } else {
8919566063dSJacob Faibussowitsch       PetscCallMPI(MPIU_Reduce_local(buf, (char *)data + start * link->unitbytes, count, link->unit, op));
892cd620004SJunchao Zhang     }
893cd620004SJunchao Zhang   }
894b458e8f1SJose E. Roman   PetscFunctionReturn(0);
895cd620004SJunchao Zhang #else
896cd620004SJunchao Zhang   SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No unpacking reduction operation for this MPI_Op");
897cd620004SJunchao Zhang #endif
898cd620004SJunchao Zhang }
899cd620004SJunchao Zhang 
900*9371c9d4SSatish Balay static inline PetscErrorCode PetscSFLinkScatterDataWithMPIReduceLocal(PetscSF sf, PetscSFLink link, PetscInt count, PetscInt srcStart, const PetscInt *srcIdx, const void *src, PetscInt dstStart, const PetscInt *dstIdx, void *dst, MPI_Op op) {
901cd620004SJunchao Zhang   PetscFunctionBegin;
902cd620004SJunchao Zhang #if defined(PETSC_HAVE_MPI_REDUCE_LOCAL)
903cd620004SJunchao Zhang   {
904cd620004SJunchao Zhang     PetscInt i, disp;
905fcc7397dSJunchao Zhang     if (!srcIdx) {
9069566063dSJacob Faibussowitsch       PetscCall(PetscSFLinkUnpackDataWithMPIReduceLocal(sf, link, count, dstStart, dstIdx, dst, (const char *)src + srcStart * link->unitbytes, op));
907fcc7397dSJunchao Zhang     } else {
908cd620004SJunchao Zhang       for (i = 0; i < count; i++) {
909fcc7397dSJunchao Zhang         disp = dstIdx ? dstIdx[i] : dstStart + i;
9109566063dSJacob Faibussowitsch         PetscCallMPI(MPIU_Reduce_local((const char *)src + srcIdx[i] * link->unitbytes, (char *)dst + disp * link->unitbytes, 1, link->unit, op));
911fcc7397dSJunchao Zhang       }
912cd620004SJunchao Zhang     }
913cd620004SJunchao Zhang   }
914b458e8f1SJose E. Roman   PetscFunctionReturn(0);
915cd620004SJunchao Zhang #else
916cd620004SJunchao Zhang   SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No unpacking reduction operation for this MPI_Op");
917cd620004SJunchao Zhang #endif
918cd620004SJunchao Zhang }
919cd620004SJunchao Zhang 
920cd620004SJunchao Zhang /*=============================================================================
921cd620004SJunchao Zhang               Pack/Unpack/Fetch/Scatter routines
922cd620004SJunchao Zhang  ============================================================================*/
923cd620004SJunchao Zhang 
924cd620004SJunchao Zhang /* Pack rootdata to rootbuf
9254165533cSJose E. Roman   Input Parameters:
926cd620004SJunchao Zhang   + sf       - The SF this packing works on.
927cd620004SJunchao Zhang   . link     - It gives the memtype of the roots and also provides root buffer.
928cd620004SJunchao Zhang   . scope    - PETSCSF_LOCAL or PETSCSF_REMOTE. Note SF has the ability to do local and remote communications separately.
929cd620004SJunchao Zhang   - rootdata - Where to read the roots.
930cd620004SJunchao Zhang 
931cd620004SJunchao Zhang   Notes:
932cd620004SJunchao Zhang   When rootdata can be directly used as root buffer, the routine is almost a no-op. After the call, root data is
93371438e86SJunchao Zhang   in a place where the underlying MPI is ready to access (use_gpu_aware_mpi or not)
934cd620004SJunchao Zhang  */
935*9371c9d4SSatish Balay PetscErrorCode PetscSFLinkPackRootData_Private(PetscSF sf, PetscSFLink link, PetscSFScope scope, const void *rootdata) {
936cd620004SJunchao Zhang   const PetscInt *rootindices = NULL;
937cd620004SJunchao Zhang   PetscInt        count, start;
938fcc7397dSJunchao Zhang   PetscErrorCode (*Pack)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, void *) = NULL;
939cd620004SJunchao Zhang   PetscMemType   rootmtype                                                                                        = link->rootmtype;
940fcc7397dSJunchao Zhang   PetscSFPackOpt opt                                                                                              = NULL;
941fcc7397dSJunchao Zhang 
942cd620004SJunchao Zhang   PetscFunctionBegin;
9439566063dSJacob Faibussowitsch   PetscCall(PetscLogEventBegin(PETSCSF_Pack, sf, 0, 0, 0));
94471438e86SJunchao Zhang   if (!link->rootdirect[scope]) { /* If rootdata works directly as rootbuf, skip packing */
9459566063dSJacob Faibussowitsch     PetscCall(PetscSFLinkGetRootPackOptAndIndices(sf, link, rootmtype, scope, &count, &start, &opt, &rootindices));
9469566063dSJacob Faibussowitsch     PetscCall(PetscSFLinkGetPack(link, rootmtype, &Pack));
9479566063dSJacob Faibussowitsch     PetscCall((*Pack)(link, count, start, opt, rootindices, rootdata, link->rootbuf[scope][rootmtype]));
948cd620004SJunchao Zhang   }
9499566063dSJacob Faibussowitsch   PetscCall(PetscLogEventEnd(PETSCSF_Pack, sf, 0, 0, 0));
950cd620004SJunchao Zhang   PetscFunctionReturn(0);
951cd620004SJunchao Zhang }
952cd620004SJunchao Zhang 
953cd620004SJunchao Zhang /* Pack leafdata to leafbuf */
954*9371c9d4SSatish Balay PetscErrorCode PetscSFLinkPackLeafData_Private(PetscSF sf, PetscSFLink link, PetscSFScope scope, const void *leafdata) {
955cd620004SJunchao Zhang   const PetscInt *leafindices = NULL;
956cd620004SJunchao Zhang   PetscInt        count, start;
957fcc7397dSJunchao Zhang   PetscErrorCode (*Pack)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, void *) = NULL;
958cd620004SJunchao Zhang   PetscMemType   leafmtype                                                                                        = link->leafmtype;
959fcc7397dSJunchao Zhang   PetscSFPackOpt opt                                                                                              = NULL;
960cd620004SJunchao Zhang 
961cd620004SJunchao Zhang   PetscFunctionBegin;
9629566063dSJacob Faibussowitsch   PetscCall(PetscLogEventBegin(PETSCSF_Pack, sf, 0, 0, 0));
96371438e86SJunchao Zhang   if (!link->leafdirect[scope]) { /* If leafdata works directly as rootbuf, skip packing */
9649566063dSJacob Faibussowitsch     PetscCall(PetscSFLinkGetLeafPackOptAndIndices(sf, link, leafmtype, scope, &count, &start, &opt, &leafindices));
9659566063dSJacob Faibussowitsch     PetscCall(PetscSFLinkGetPack(link, leafmtype, &Pack));
9669566063dSJacob Faibussowitsch     PetscCall((*Pack)(link, count, start, opt, leafindices, leafdata, link->leafbuf[scope][leafmtype]));
967cd620004SJunchao Zhang   }
9689566063dSJacob Faibussowitsch   PetscCall(PetscLogEventEnd(PETSCSF_Pack, sf, 0, 0, 0));
969cd620004SJunchao Zhang   PetscFunctionReturn(0);
970cd620004SJunchao Zhang }
971cd620004SJunchao Zhang 
97271438e86SJunchao Zhang /* Pack rootdata to rootbuf, which are in the same memory space */
973*9371c9d4SSatish Balay PetscErrorCode PetscSFLinkPackRootData(PetscSF sf, PetscSFLink link, PetscSFScope scope, const void *rootdata) {
97471438e86SJunchao Zhang   PetscSF_Basic *bas = (PetscSF_Basic *)sf->data;
97571438e86SJunchao Zhang 
97671438e86SJunchao Zhang   PetscFunctionBegin;
97771438e86SJunchao Zhang   if (scope == PETSCSF_REMOTE) { /* Sync the device if rootdata is not on petsc default stream */
9789566063dSJacob Faibussowitsch     if (PetscMemTypeDevice(link->rootmtype) && link->SyncDevice && sf->unknown_input_stream) PetscCall((*link->SyncDevice)(link));
9799566063dSJacob Faibussowitsch     if (link->PrePack) PetscCall((*link->PrePack)(sf, link, PETSCSF_ROOT2LEAF)); /* Used by SF nvshmem */
98071438e86SJunchao Zhang   }
9819566063dSJacob Faibussowitsch   PetscCall(PetscLogEventBegin(PETSCSF_Pack, sf, 0, 0, 0));
9829566063dSJacob Faibussowitsch   if (bas->rootbuflen[scope]) PetscCall(PetscSFLinkPackRootData_Private(sf, link, scope, rootdata));
9839566063dSJacob Faibussowitsch   PetscCall(PetscLogEventEnd(PETSCSF_Pack, sf, 0, 0, 0));
98471438e86SJunchao Zhang   PetscFunctionReturn(0);
98571438e86SJunchao Zhang }
98671438e86SJunchao Zhang /* Pack leafdata to leafbuf, which are in the same memory space */
987*9371c9d4SSatish Balay PetscErrorCode PetscSFLinkPackLeafData(PetscSF sf, PetscSFLink link, PetscSFScope scope, const void *leafdata) {
98871438e86SJunchao Zhang   PetscFunctionBegin;
98971438e86SJunchao Zhang   if (scope == PETSCSF_REMOTE) {
9909566063dSJacob Faibussowitsch     if (PetscMemTypeDevice(link->leafmtype) && link->SyncDevice && sf->unknown_input_stream) PetscCall((*link->SyncDevice)(link));
9919566063dSJacob Faibussowitsch     if (link->PrePack) PetscCall((*link->PrePack)(sf, link, PETSCSF_LEAF2ROOT)); /* Used by SF nvshmem */
99271438e86SJunchao Zhang   }
9939566063dSJacob Faibussowitsch   PetscCall(PetscLogEventBegin(PETSCSF_Pack, sf, 0, 0, 0));
9949566063dSJacob Faibussowitsch   if (sf->leafbuflen[scope]) PetscCall(PetscSFLinkPackLeafData_Private(sf, link, scope, leafdata));
9959566063dSJacob Faibussowitsch   PetscCall(PetscLogEventEnd(PETSCSF_Pack, sf, 0, 0, 0));
99671438e86SJunchao Zhang   PetscFunctionReturn(0);
99771438e86SJunchao Zhang }
99871438e86SJunchao Zhang 
999*9371c9d4SSatish Balay PetscErrorCode PetscSFLinkUnpackRootData_Private(PetscSF sf, PetscSFLink link, PetscSFScope scope, void *rootdata, MPI_Op op) {
1000cd620004SJunchao Zhang   const PetscInt *rootindices = NULL;
1001cd620004SJunchao Zhang   PetscInt        count, start;
1002cd620004SJunchao Zhang   PetscSF_Basic  *bas                                                                                                    = (PetscSF_Basic *)sf->data;
1003fcc7397dSJunchao Zhang   PetscErrorCode (*UnpackAndOp)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *) = NULL;
1004cd620004SJunchao Zhang   PetscMemType   rootmtype                                                                                               = link->rootmtype;
1005fcc7397dSJunchao Zhang   PetscSFPackOpt opt                                                                                                     = NULL;
1006cd620004SJunchao Zhang 
1007cd620004SJunchao Zhang   PetscFunctionBegin;
100871438e86SJunchao Zhang   if (!link->rootdirect[scope]) { /* If rootdata works directly as rootbuf, skip unpacking */
10099566063dSJacob Faibussowitsch     PetscCall(PetscSFLinkGetUnpackAndOp(link, rootmtype, op, bas->rootdups[scope], &UnpackAndOp));
1010cd620004SJunchao Zhang     if (UnpackAndOp) {
10119566063dSJacob Faibussowitsch       PetscCall(PetscSFLinkGetRootPackOptAndIndices(sf, link, rootmtype, scope, &count, &start, &opt, &rootindices));
10129566063dSJacob Faibussowitsch       PetscCall((*UnpackAndOp)(link, count, start, opt, rootindices, rootdata, link->rootbuf[scope][rootmtype]));
1013cd620004SJunchao Zhang     } else {
10149566063dSJacob Faibussowitsch       PetscCall(PetscSFLinkGetRootPackOptAndIndices(sf, link, PETSC_MEMTYPE_HOST, scope, &count, &start, &opt, &rootindices));
10159566063dSJacob Faibussowitsch       PetscCall(PetscSFLinkUnpackDataWithMPIReduceLocal(sf, link, count, start, rootindices, rootdata, link->rootbuf[scope][rootmtype], op));
1016cd620004SJunchao Zhang     }
1017cd620004SJunchao Zhang   }
10189566063dSJacob Faibussowitsch   PetscCall(PetscSFLinkLogFlopsAfterUnpackRootData(sf, link, scope, op));
1019cd620004SJunchao Zhang   PetscFunctionReturn(0);
1020cd620004SJunchao Zhang }
1021cd620004SJunchao Zhang 
1022*9371c9d4SSatish Balay PetscErrorCode PetscSFLinkUnpackLeafData_Private(PetscSF sf, PetscSFLink link, PetscSFScope scope, void *leafdata, MPI_Op op) {
1023cd620004SJunchao Zhang   const PetscInt *leafindices = NULL;
1024cd620004SJunchao Zhang   PetscInt        count, start;
1025fcc7397dSJunchao Zhang   PetscErrorCode (*UnpackAndOp)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *) = NULL;
1026cd620004SJunchao Zhang   PetscMemType   leafmtype                                                                                               = link->leafmtype;
1027fcc7397dSJunchao Zhang   PetscSFPackOpt opt                                                                                                     = NULL;
1028cd620004SJunchao Zhang 
1029cd620004SJunchao Zhang   PetscFunctionBegin;
103071438e86SJunchao Zhang   if (!link->leafdirect[scope]) { /* If leafdata works directly as rootbuf, skip unpacking */
10319566063dSJacob Faibussowitsch     PetscCall(PetscSFLinkGetUnpackAndOp(link, leafmtype, op, sf->leafdups[scope], &UnpackAndOp));
1032cd620004SJunchao Zhang     if (UnpackAndOp) {
10339566063dSJacob Faibussowitsch       PetscCall(PetscSFLinkGetLeafPackOptAndIndices(sf, link, leafmtype, scope, &count, &start, &opt, &leafindices));
10349566063dSJacob Faibussowitsch       PetscCall((*UnpackAndOp)(link, count, start, opt, leafindices, leafdata, link->leafbuf[scope][leafmtype]));
1035cd620004SJunchao Zhang     } else {
10369566063dSJacob Faibussowitsch       PetscCall(PetscSFLinkGetLeafPackOptAndIndices(sf, link, PETSC_MEMTYPE_HOST, scope, &count, &start, &opt, &leafindices));
10379566063dSJacob Faibussowitsch       PetscCall(PetscSFLinkUnpackDataWithMPIReduceLocal(sf, link, count, start, leafindices, leafdata, link->leafbuf[scope][leafmtype], op));
1038cd620004SJunchao Zhang     }
1039cd620004SJunchao Zhang   }
10409566063dSJacob Faibussowitsch   PetscCall(PetscSFLinkLogFlopsAfterUnpackLeafData(sf, link, scope, op));
104171438e86SJunchao Zhang   PetscFunctionReturn(0);
104271438e86SJunchao Zhang }
104371438e86SJunchao Zhang /* Unpack rootbuf to rootdata, which are in the same memory space */
1044*9371c9d4SSatish Balay PetscErrorCode PetscSFLinkUnpackRootData(PetscSF sf, PetscSFLink link, PetscSFScope scope, void *rootdata, MPI_Op op) {
104571438e86SJunchao Zhang   PetscSF_Basic *bas = (PetscSF_Basic *)sf->data;
104671438e86SJunchao Zhang 
104771438e86SJunchao Zhang   PetscFunctionBegin;
10489566063dSJacob Faibussowitsch   PetscCall(PetscLogEventBegin(PETSCSF_Unpack, sf, 0, 0, 0));
10499566063dSJacob Faibussowitsch   if (bas->rootbuflen[scope]) PetscCall(PetscSFLinkUnpackRootData_Private(sf, link, scope, rootdata, op));
10509566063dSJacob Faibussowitsch   PetscCall(PetscLogEventEnd(PETSCSF_Unpack, sf, 0, 0, 0));
105171438e86SJunchao Zhang   if (scope == PETSCSF_REMOTE) {
10529566063dSJacob Faibussowitsch     if (link->PostUnpack) PetscCall((*link->PostUnpack)(sf, link, PETSCSF_LEAF2ROOT)); /* Used by SF nvshmem */
10539566063dSJacob Faibussowitsch     if (PetscMemTypeDevice(link->rootmtype) && link->SyncDevice && sf->unknown_input_stream) PetscCall((*link->SyncDevice)(link));
105471438e86SJunchao Zhang   }
1055cd620004SJunchao Zhang   PetscFunctionReturn(0);
1056cd620004SJunchao Zhang }
1057cd620004SJunchao Zhang 
105871438e86SJunchao Zhang /* Unpack leafbuf to leafdata for remote (common case) or local (rare case when rootmtype != leafmtype) */
1059*9371c9d4SSatish Balay PetscErrorCode PetscSFLinkUnpackLeafData(PetscSF sf, PetscSFLink link, PetscSFScope scope, void *leafdata, MPI_Op op) {
106071438e86SJunchao Zhang   PetscFunctionBegin;
10619566063dSJacob Faibussowitsch   PetscCall(PetscLogEventBegin(PETSCSF_Unpack, sf, 0, 0, 0));
10629566063dSJacob Faibussowitsch   if (sf->leafbuflen[scope]) PetscCall(PetscSFLinkUnpackLeafData_Private(sf, link, scope, leafdata, op));
10639566063dSJacob Faibussowitsch   PetscCall(PetscLogEventEnd(PETSCSF_Unpack, sf, 0, 0, 0));
106471438e86SJunchao Zhang   if (scope == PETSCSF_REMOTE) {
10659566063dSJacob Faibussowitsch     if (link->PostUnpack) PetscCall((*link->PostUnpack)(sf, link, PETSCSF_ROOT2LEAF)); /* Used by SF nvshmem */
10669566063dSJacob Faibussowitsch     if (PetscMemTypeDevice(link->leafmtype) && link->SyncDevice && sf->unknown_input_stream) PetscCall((*link->SyncDevice)(link));
106771438e86SJunchao Zhang   }
106871438e86SJunchao Zhang   PetscFunctionReturn(0);
106971438e86SJunchao Zhang }
107071438e86SJunchao Zhang 
107171438e86SJunchao Zhang /* FetchAndOp rootdata with rootbuf, it is a kind of Unpack on rootdata, except it also updates rootbuf */
1072*9371c9d4SSatish Balay PetscErrorCode PetscSFLinkFetchAndOpRemote(PetscSF sf, PetscSFLink link, void *rootdata, MPI_Op op) {
1073cd620004SJunchao Zhang   const PetscInt *rootindices = NULL;
1074cd620004SJunchao Zhang   PetscInt        count, start;
1075cd620004SJunchao Zhang   PetscSF_Basic  *bas                                                                                             = (PetscSF_Basic *)sf->data;
1076fcc7397dSJunchao Zhang   PetscErrorCode (*FetchAndOp)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, void *) = NULL;
1077cd620004SJunchao Zhang   PetscMemType   rootmtype                                                                                        = link->rootmtype;
1078fcc7397dSJunchao Zhang   PetscSFPackOpt opt                                                                                              = NULL;
1079cd620004SJunchao Zhang 
1080cd620004SJunchao Zhang   PetscFunctionBegin;
10819566063dSJacob Faibussowitsch   PetscCall(PetscLogEventBegin(PETSCSF_Unpack, sf, 0, 0, 0));
108271438e86SJunchao Zhang   if (bas->rootbuflen[PETSCSF_REMOTE]) {
1083cd620004SJunchao Zhang     /* Do FetchAndOp on rootdata with rootbuf */
10849566063dSJacob Faibussowitsch     PetscCall(PetscSFLinkGetFetchAndOp(link, rootmtype, op, bas->rootdups[PETSCSF_REMOTE], &FetchAndOp));
10859566063dSJacob Faibussowitsch     PetscCall(PetscSFLinkGetRootPackOptAndIndices(sf, link, rootmtype, PETSCSF_REMOTE, &count, &start, &opt, &rootindices));
10869566063dSJacob Faibussowitsch     PetscCall((*FetchAndOp)(link, count, start, opt, rootindices, rootdata, link->rootbuf[PETSCSF_REMOTE][rootmtype]));
1087cd620004SJunchao Zhang   }
10889566063dSJacob Faibussowitsch   PetscCall(PetscSFLinkLogFlopsAfterUnpackRootData(sf, link, PETSCSF_REMOTE, op));
10899566063dSJacob Faibussowitsch   PetscCall(PetscLogEventEnd(PETSCSF_Unpack, sf, 0, 0, 0));
1090cd620004SJunchao Zhang   PetscFunctionReturn(0);
1091cd620004SJunchao Zhang }
1092cd620004SJunchao Zhang 
1093*9371c9d4SSatish Balay PetscErrorCode PetscSFLinkScatterLocal(PetscSF sf, PetscSFLink link, PetscSFDirection direction, void *rootdata, void *leafdata, MPI_Op op) {
1094cd620004SJunchao Zhang   const PetscInt *rootindices = NULL, *leafindices = NULL;
1095cd620004SJunchao Zhang   PetscInt        count, rootstart, leafstart;
1096cd620004SJunchao Zhang   PetscSF_Basic  *bas                                                                                                                                                 = (PetscSF_Basic *)sf->data;
1097fcc7397dSJunchao Zhang   PetscErrorCode (*ScatterAndOp)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *) = NULL;
109871438e86SJunchao Zhang   PetscMemType   rootmtype = link->rootmtype, leafmtype = link->leafmtype, srcmtype, dstmtype;
1099fcc7397dSJunchao Zhang   PetscSFPackOpt leafopt = NULL, rootopt = NULL;
110071438e86SJunchao Zhang   PetscInt       buflen = sf->leafbuflen[PETSCSF_LOCAL];
110171438e86SJunchao Zhang   char          *srcbuf = NULL, *dstbuf = NULL;
110271438e86SJunchao Zhang   PetscBool      dstdups;
1103cd620004SJunchao Zhang 
1104362febeeSStefano Zampini   PetscFunctionBegin;
110571438e86SJunchao Zhang   if (!buflen) PetscFunctionReturn(0);
110671438e86SJunchao Zhang   if (rootmtype != leafmtype) { /* The cross memory space local scatter is done by pack, copy and unpack */
110771438e86SJunchao Zhang     if (direction == PETSCSF_ROOT2LEAF) {
11089566063dSJacob Faibussowitsch       PetscCall(PetscSFLinkPackRootData(sf, link, PETSCSF_LOCAL, rootdata));
110971438e86SJunchao Zhang       srcmtype = rootmtype;
111071438e86SJunchao Zhang       srcbuf   = link->rootbuf[PETSCSF_LOCAL][rootmtype];
111171438e86SJunchao Zhang       dstmtype = leafmtype;
111271438e86SJunchao Zhang       dstbuf   = link->leafbuf[PETSCSF_LOCAL][leafmtype];
111371438e86SJunchao Zhang     } else {
11149566063dSJacob Faibussowitsch       PetscCall(PetscSFLinkPackLeafData(sf, link, PETSCSF_LOCAL, leafdata));
111571438e86SJunchao Zhang       srcmtype = leafmtype;
111671438e86SJunchao Zhang       srcbuf   = link->leafbuf[PETSCSF_LOCAL][leafmtype];
111771438e86SJunchao Zhang       dstmtype = rootmtype;
111871438e86SJunchao Zhang       dstbuf   = link->rootbuf[PETSCSF_LOCAL][rootmtype];
111971438e86SJunchao Zhang     }
11209566063dSJacob Faibussowitsch     PetscCall((*link->Memcpy)(link, dstmtype, dstbuf, srcmtype, srcbuf, buflen * link->unitbytes));
112171438e86SJunchao Zhang     /* If above is a device to host copy, we have to sync the stream before accessing the buffer on host */
11229566063dSJacob Faibussowitsch     if (PetscMemTypeHost(dstmtype)) PetscCall((*link->SyncStream)(link));
112371438e86SJunchao Zhang     if (direction == PETSCSF_ROOT2LEAF) {
11249566063dSJacob Faibussowitsch       PetscCall(PetscSFLinkUnpackLeafData(sf, link, PETSCSF_LOCAL, leafdata, op));
1125cd620004SJunchao Zhang     } else {
11269566063dSJacob Faibussowitsch       PetscCall(PetscSFLinkUnpackRootData(sf, link, PETSCSF_LOCAL, rootdata, op));
112771438e86SJunchao Zhang     }
112871438e86SJunchao Zhang   } else {
112971438e86SJunchao Zhang     dstdups  = (direction == PETSCSF_ROOT2LEAF) ? sf->leafdups[PETSCSF_LOCAL] : bas->rootdups[PETSCSF_LOCAL];
113071438e86SJunchao Zhang     dstmtype = (direction == PETSCSF_ROOT2LEAF) ? link->leafmtype : link->rootmtype;
11319566063dSJacob Faibussowitsch     PetscCall(PetscSFLinkGetScatterAndOp(link, dstmtype, op, dstdups, &ScatterAndOp));
1132cd620004SJunchao Zhang     if (ScatterAndOp) {
11339566063dSJacob Faibussowitsch       PetscCall(PetscSFLinkGetRootPackOptAndIndices(sf, link, rootmtype, PETSCSF_LOCAL, &count, &rootstart, &rootopt, &rootindices));
11349566063dSJacob Faibussowitsch       PetscCall(PetscSFLinkGetLeafPackOptAndIndices(sf, link, leafmtype, PETSCSF_LOCAL, &count, &leafstart, &leafopt, &leafindices));
113571438e86SJunchao Zhang       if (direction == PETSCSF_ROOT2LEAF) {
11369566063dSJacob Faibussowitsch         PetscCall((*ScatterAndOp)(link, count, rootstart, rootopt, rootindices, rootdata, leafstart, leafopt, leafindices, leafdata));
1137cd620004SJunchao Zhang       } else {
11389566063dSJacob Faibussowitsch         PetscCall((*ScatterAndOp)(link, count, leafstart, leafopt, leafindices, leafdata, rootstart, rootopt, rootindices, rootdata));
113971438e86SJunchao Zhang       }
1140cd620004SJunchao Zhang     } else {
11419566063dSJacob Faibussowitsch       PetscCall(PetscSFLinkGetRootPackOptAndIndices(sf, link, PETSC_MEMTYPE_HOST, PETSCSF_LOCAL, &count, &rootstart, &rootopt, &rootindices));
11429566063dSJacob Faibussowitsch       PetscCall(PetscSFLinkGetLeafPackOptAndIndices(sf, link, PETSC_MEMTYPE_HOST, PETSCSF_LOCAL, &count, &leafstart, &leafopt, &leafindices));
114371438e86SJunchao Zhang       if (direction == PETSCSF_ROOT2LEAF) {
11449566063dSJacob Faibussowitsch         PetscCall(PetscSFLinkScatterDataWithMPIReduceLocal(sf, link, count, rootstart, rootindices, rootdata, leafstart, leafindices, leafdata, op));
114571438e86SJunchao Zhang       } else {
11469566063dSJacob Faibussowitsch         PetscCall(PetscSFLinkScatterDataWithMPIReduceLocal(sf, link, count, leafstart, leafindices, leafdata, rootstart, rootindices, rootdata, op));
1147cd620004SJunchao Zhang       }
1148cd620004SJunchao Zhang     }
114971438e86SJunchao Zhang   }
1150cd620004SJunchao Zhang   PetscFunctionReturn(0);
1151cd620004SJunchao Zhang }
1152cd620004SJunchao Zhang 
1153cd620004SJunchao Zhang /* Fetch rootdata to leafdata and leafupdate locally */
1154*9371c9d4SSatish Balay PetscErrorCode PetscSFLinkFetchAndOpLocal(PetscSF sf, PetscSFLink link, void *rootdata, const void *leafdata, void *leafupdate, MPI_Op op) {
1155cd620004SJunchao Zhang   const PetscInt *rootindices = NULL, *leafindices = NULL;
1156cd620004SJunchao Zhang   PetscInt        count, rootstart, leafstart;
1157cd620004SJunchao Zhang   PetscSF_Basic  *bas                                                                                                                                                            = (PetscSF_Basic *)sf->data;
1158fcc7397dSJunchao Zhang   PetscErrorCode (*FetchAndOpLocal)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, void *) = NULL;
1159cd620004SJunchao Zhang   const PetscMemType rootmtype = link->rootmtype, leafmtype = link->leafmtype;
1160fcc7397dSJunchao Zhang   PetscSFPackOpt     leafopt = NULL, rootopt = NULL;
1161cd620004SJunchao Zhang 
1162cd620004SJunchao Zhang   PetscFunctionBegin;
1163cd620004SJunchao Zhang   if (!bas->rootbuflen[PETSCSF_LOCAL]) PetscFunctionReturn(0);
1164cd620004SJunchao Zhang   if (rootmtype != leafmtype) {
1165cd620004SJunchao Zhang     /* The local communication has to go through pack and unpack */
1166cd620004SJunchao Zhang     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Doing PetscSFFetchAndOp with rootdata and leafdata on opposite side of CPU and GPU");
1167cd620004SJunchao Zhang   } else {
11689566063dSJacob Faibussowitsch     PetscCall(PetscSFLinkGetRootPackOptAndIndices(sf, link, rootmtype, PETSCSF_LOCAL, &count, &rootstart, &rootopt, &rootindices));
11699566063dSJacob Faibussowitsch     PetscCall(PetscSFLinkGetLeafPackOptAndIndices(sf, link, leafmtype, PETSCSF_LOCAL, &count, &leafstart, &leafopt, &leafindices));
11709566063dSJacob Faibussowitsch     PetscCall(PetscSFLinkGetFetchAndOpLocal(link, rootmtype, op, bas->rootdups[PETSCSF_LOCAL], &FetchAndOpLocal));
11719566063dSJacob Faibussowitsch     PetscCall((*FetchAndOpLocal)(link, count, rootstart, rootopt, rootindices, rootdata, leafstart, leafopt, leafindices, leafdata, leafupdate));
1172cd620004SJunchao Zhang   }
117340e23c03SJunchao Zhang   PetscFunctionReturn(0);
117440e23c03SJunchao Zhang }
117540e23c03SJunchao Zhang 
117640e23c03SJunchao Zhang /*
1177cd620004SJunchao Zhang   Create per-rank pack/unpack optimizations based on indice patterns
117840e23c03SJunchao Zhang 
117940e23c03SJunchao Zhang    Input Parameters:
1180fcc7397dSJunchao Zhang   +  n       - Number of destination ranks
1181eb02082bSJunchao Zhang   .  offset  - [n+1] For the i-th rank, its associated indices are idx[offset[i], offset[i+1]). offset[0] needs not to be 0.
1182b23bfdefSJunchao Zhang   -  idx     - [*]   Array storing indices
118340e23c03SJunchao Zhang 
118440e23c03SJunchao Zhang    Output Parameters:
1185cd620004SJunchao Zhang   +  opt     - Pack optimizations. NULL if no optimizations.
118640e23c03SJunchao Zhang */
1187*9371c9d4SSatish Balay PetscErrorCode PetscSFCreatePackOpt(PetscInt n, const PetscInt *offset, const PetscInt *idx, PetscSFPackOpt *out) {
1188fcc7397dSJunchao Zhang   PetscInt       r, p, start, i, j, k, dx, dy, dz, dydz, m, X, Y;
1189fcc7397dSJunchao Zhang   PetscBool      optimizable = PETSC_TRUE;
119040e23c03SJunchao Zhang   PetscSFPackOpt opt;
119140e23c03SJunchao Zhang 
119240e23c03SJunchao Zhang   PetscFunctionBegin;
11939566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(1, &opt));
11949566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(7 * n + 2, &opt->array));
1195fcc7397dSJunchao Zhang   opt->n = opt->array[0] = n;
1196fcc7397dSJunchao Zhang   opt->offset            = opt->array + 1;
1197fcc7397dSJunchao Zhang   opt->start             = opt->array + n + 2;
1198fcc7397dSJunchao Zhang   opt->dx                = opt->array + 2 * n + 2;
1199fcc7397dSJunchao Zhang   opt->dy                = opt->array + 3 * n + 2;
1200fcc7397dSJunchao Zhang   opt->dz                = opt->array + 4 * n + 2;
1201fcc7397dSJunchao Zhang   opt->X                 = opt->array + 5 * n + 2;
1202fcc7397dSJunchao Zhang   opt->Y                 = opt->array + 6 * n + 2;
1203fcc7397dSJunchao Zhang 
1204fcc7397dSJunchao Zhang   for (r = 0; r < n; r++) {            /* For each destination rank */
1205fcc7397dSJunchao Zhang     m     = offset[r + 1] - offset[r]; /* Total number of indices for this rank. We want to see if m can be factored into dx*dy*dz */
1206fcc7397dSJunchao Zhang     p     = offset[r];
1207fcc7397dSJunchao Zhang     start = idx[p]; /* First index for this rank */
1208fcc7397dSJunchao Zhang     p++;
1209fcc7397dSJunchao Zhang 
1210fcc7397dSJunchao Zhang     /* Search in X dimension */
1211fcc7397dSJunchao Zhang     for (dx = 1; dx < m; dx++, p++) {
1212fcc7397dSJunchao Zhang       if (start + dx != idx[p]) break;
1213b23bfdefSJunchao Zhang     }
1214b23bfdefSJunchao Zhang 
1215fcc7397dSJunchao Zhang     dydz = m / dx;
1216fcc7397dSJunchao Zhang     X    = dydz > 1 ? (idx[p] - start) : dx;
1217fcc7397dSJunchao Zhang     /* Not optimizable if m is not a multiple of dx, or some unrecognized pattern is found */
1218*9371c9d4SSatish Balay     if (m % dx || X <= 0) {
1219*9371c9d4SSatish Balay       optimizable = PETSC_FALSE;
1220*9371c9d4SSatish Balay       goto finish;
1221*9371c9d4SSatish Balay     }
1222fcc7397dSJunchao Zhang     for (dy = 1; dy < dydz; dy++) { /* Search in Y dimension */
1223fcc7397dSJunchao Zhang       for (i = 0; i < dx; i++, p++) {
1224fcc7397dSJunchao Zhang         if (start + X * dy + i != idx[p]) {
1225*9371c9d4SSatish Balay           if (i) {
1226*9371c9d4SSatish Balay             optimizable = PETSC_FALSE;
1227*9371c9d4SSatish Balay             goto finish;
1228*9371c9d4SSatish Balay           } /* The pattern is violated in the middle of an x-walk */
1229*9371c9d4SSatish Balay           else
1230*9371c9d4SSatish Balay             goto Z_dimension;
123140e23c03SJunchao Zhang         }
123240e23c03SJunchao Zhang       }
123340e23c03SJunchao Zhang     }
123440e23c03SJunchao Zhang 
1235fcc7397dSJunchao Zhang   Z_dimension:
1236fcc7397dSJunchao Zhang     dz = m / (dx * dy);
1237fcc7397dSJunchao Zhang     Y  = dz > 1 ? (idx[p] - start) / X : dy;
1238fcc7397dSJunchao Zhang     /* Not optimizable if m is not a multiple of dx*dy, or some unrecognized pattern is found */
1239*9371c9d4SSatish Balay     if (m % (dx * dy) || Y <= 0) {
1240*9371c9d4SSatish Balay       optimizable = PETSC_FALSE;
1241*9371c9d4SSatish Balay       goto finish;
1242*9371c9d4SSatish Balay     }
1243fcc7397dSJunchao Zhang     for (k = 1; k < dz; k++) { /* Go through Z dimension to see if remaining indices follow the pattern */
1244fcc7397dSJunchao Zhang       for (j = 0; j < dy; j++) {
1245fcc7397dSJunchao Zhang         for (i = 0; i < dx; i++, p++) {
1246*9371c9d4SSatish Balay           if (start + X * Y * k + X * j + i != idx[p]) {
1247*9371c9d4SSatish Balay             optimizable = PETSC_FALSE;
1248*9371c9d4SSatish Balay             goto finish;
1249*9371c9d4SSatish Balay           }
125040e23c03SJunchao Zhang         }
125140e23c03SJunchao Zhang       }
125240e23c03SJunchao Zhang     }
1253fcc7397dSJunchao Zhang     opt->start[r] = start;
1254fcc7397dSJunchao Zhang     opt->dx[r]    = dx;
1255fcc7397dSJunchao Zhang     opt->dy[r]    = dy;
1256fcc7397dSJunchao Zhang     opt->dz[r]    = dz;
1257fcc7397dSJunchao Zhang     opt->X[r]     = X;
1258fcc7397dSJunchao Zhang     opt->Y[r]     = Y;
125940e23c03SJunchao Zhang   }
126040e23c03SJunchao Zhang 
1261fcc7397dSJunchao Zhang finish:
1262fcc7397dSJunchao Zhang   /* If not optimizable, free arrays to save memory */
1263fcc7397dSJunchao Zhang   if (!n || !optimizable) {
12649566063dSJacob Faibussowitsch     PetscCall(PetscFree(opt->array));
12659566063dSJacob Faibussowitsch     PetscCall(PetscFree(opt));
126640e23c03SJunchao Zhang     *out = NULL;
1267fcc7397dSJunchao Zhang   } else {
1268fcc7397dSJunchao Zhang     opt->offset[0] = 0;
1269fcc7397dSJunchao Zhang     for (r = 0; r < n; r++) opt->offset[r + 1] = opt->offset[r] + opt->dx[r] * opt->dy[r] * opt->dz[r];
1270fcc7397dSJunchao Zhang     *out = opt;
1271fcc7397dSJunchao Zhang   }
127240e23c03SJunchao Zhang   PetscFunctionReturn(0);
127340e23c03SJunchao Zhang }
127440e23c03SJunchao Zhang 
1275*9371c9d4SSatish Balay static inline PetscErrorCode PetscSFDestroyPackOpt(PetscSF sf, PetscMemType mtype, PetscSFPackOpt *out) {
127640e23c03SJunchao Zhang   PetscSFPackOpt opt = *out;
127740e23c03SJunchao Zhang 
127840e23c03SJunchao Zhang   PetscFunctionBegin;
127940e23c03SJunchao Zhang   if (opt) {
12809566063dSJacob Faibussowitsch     PetscCall(PetscSFFree(sf, mtype, opt->array));
12819566063dSJacob Faibussowitsch     PetscCall(PetscFree(opt));
128240e23c03SJunchao Zhang     *out = NULL;
128340e23c03SJunchao Zhang   }
128440e23c03SJunchao Zhang   PetscFunctionReturn(0);
128540e23c03SJunchao Zhang }
1286cd620004SJunchao Zhang 
1287*9371c9d4SSatish Balay PetscErrorCode PetscSFSetUpPackFields(PetscSF sf) {
1288cd620004SJunchao Zhang   PetscSF_Basic *bas = (PetscSF_Basic *)sf->data;
1289cd620004SJunchao Zhang   PetscInt       i, j;
1290cd620004SJunchao Zhang 
1291cd620004SJunchao Zhang   PetscFunctionBegin;
1292cd620004SJunchao Zhang   /* [0] for PETSCSF_LOCAL and [1] for PETSCSF_REMOTE in the following */
1293cd620004SJunchao Zhang   for (i = 0; i < 2; i++) { /* Set defaults */
1294cd620004SJunchao Zhang     sf->leafstart[i]   = 0;
1295cd620004SJunchao Zhang     sf->leafcontig[i]  = PETSC_TRUE;
1296cd620004SJunchao Zhang     sf->leafdups[i]    = PETSC_FALSE;
1297cd620004SJunchao Zhang     bas->rootstart[i]  = 0;
1298cd620004SJunchao Zhang     bas->rootcontig[i] = PETSC_TRUE;
1299cd620004SJunchao Zhang     bas->rootdups[i]   = PETSC_FALSE;
1300cd620004SJunchao Zhang   }
1301cd620004SJunchao Zhang 
1302cd620004SJunchao Zhang   sf->leafbuflen[0] = sf->roffset[sf->ndranks];
1303cd620004SJunchao Zhang   sf->leafbuflen[1] = sf->roffset[sf->nranks] - sf->roffset[sf->ndranks];
1304cd620004SJunchao Zhang 
1305cd620004SJunchao Zhang   if (sf->leafbuflen[0]) sf->leafstart[0] = sf->rmine[0];
1306cd620004SJunchao Zhang   if (sf->leafbuflen[1]) sf->leafstart[1] = sf->rmine[sf->roffset[sf->ndranks]];
1307cd620004SJunchao Zhang 
1308cd620004SJunchao Zhang   /* Are leaf indices for self and remote contiguous? If yes, it is best for pack/unpack */
1309cd620004SJunchao Zhang   for (i = 0; i < sf->roffset[sf->ndranks]; i++) { /* self */
1310*9371c9d4SSatish Balay     if (sf->rmine[i] != sf->leafstart[0] + i) {
1311*9371c9d4SSatish Balay       sf->leafcontig[0] = PETSC_FALSE;
1312*9371c9d4SSatish Balay       break;
1313*9371c9d4SSatish Balay     }
1314cd620004SJunchao Zhang   }
1315cd620004SJunchao Zhang   for (i = sf->roffset[sf->ndranks], j = 0; i < sf->roffset[sf->nranks]; i++, j++) { /* remote */
1316*9371c9d4SSatish Balay     if (sf->rmine[i] != sf->leafstart[1] + j) {
1317*9371c9d4SSatish Balay       sf->leafcontig[1] = PETSC_FALSE;
1318*9371c9d4SSatish Balay       break;
1319*9371c9d4SSatish Balay     }
1320cd620004SJunchao Zhang   }
1321cd620004SJunchao Zhang 
1322cd620004SJunchao Zhang   /* If not, see if we can have per-rank optimizations by doing index analysis */
13239566063dSJacob Faibussowitsch   if (!sf->leafcontig[0]) PetscCall(PetscSFCreatePackOpt(sf->ndranks, sf->roffset, sf->rmine, &sf->leafpackopt[0]));
13249566063dSJacob Faibussowitsch   if (!sf->leafcontig[1]) PetscCall(PetscSFCreatePackOpt(sf->nranks - sf->ndranks, sf->roffset + sf->ndranks, sf->rmine, &sf->leafpackopt[1]));
1325cd620004SJunchao Zhang 
1326cd620004SJunchao Zhang   /* Are root indices for self and remote contiguous? */
1327cd620004SJunchao Zhang   bas->rootbuflen[0] = bas->ioffset[bas->ndiranks];
1328cd620004SJunchao Zhang   bas->rootbuflen[1] = bas->ioffset[bas->niranks] - bas->ioffset[bas->ndiranks];
1329cd620004SJunchao Zhang 
1330cd620004SJunchao Zhang   if (bas->rootbuflen[0]) bas->rootstart[0] = bas->irootloc[0];
1331cd620004SJunchao Zhang   if (bas->rootbuflen[1]) bas->rootstart[1] = bas->irootloc[bas->ioffset[bas->ndiranks]];
1332cd620004SJunchao Zhang 
1333cd620004SJunchao Zhang   for (i = 0; i < bas->ioffset[bas->ndiranks]; i++) {
1334*9371c9d4SSatish Balay     if (bas->irootloc[i] != bas->rootstart[0] + i) {
1335*9371c9d4SSatish Balay       bas->rootcontig[0] = PETSC_FALSE;
1336*9371c9d4SSatish Balay       break;
1337*9371c9d4SSatish Balay     }
1338cd620004SJunchao Zhang   }
1339cd620004SJunchao Zhang   for (i = bas->ioffset[bas->ndiranks], j = 0; i < bas->ioffset[bas->niranks]; i++, j++) {
1340*9371c9d4SSatish Balay     if (bas->irootloc[i] != bas->rootstart[1] + j) {
1341*9371c9d4SSatish Balay       bas->rootcontig[1] = PETSC_FALSE;
1342*9371c9d4SSatish Balay       break;
1343*9371c9d4SSatish Balay     }
1344cd620004SJunchao Zhang   }
1345cd620004SJunchao Zhang 
13469566063dSJacob Faibussowitsch   if (!bas->rootcontig[0]) PetscCall(PetscSFCreatePackOpt(bas->ndiranks, bas->ioffset, bas->irootloc, &bas->rootpackopt[0]));
13479566063dSJacob Faibussowitsch   if (!bas->rootcontig[1]) PetscCall(PetscSFCreatePackOpt(bas->niranks - bas->ndiranks, bas->ioffset + bas->ndiranks, bas->irootloc, &bas->rootpackopt[1]));
1348cd620004SJunchao Zhang 
1349cd620004SJunchao Zhang   /* Check dups in indices so that CUDA unpacking kernels can use cheaper regular instructions instead of atomics when they know there are no data race chances */
1350013b3241SStefano Zampini   if (PetscDefined(HAVE_DEVICE)) {
1351013b3241SStefano Zampini     PetscBool ismulti = (sf->multi == sf) ? PETSC_TRUE : PETSC_FALSE;
13529566063dSJacob Faibussowitsch     if (!sf->leafcontig[0] && !ismulti) PetscCall(PetscCheckDupsInt(sf->leafbuflen[0], sf->rmine, &sf->leafdups[0]));
13539566063dSJacob Faibussowitsch     if (!sf->leafcontig[1] && !ismulti) PetscCall(PetscCheckDupsInt(sf->leafbuflen[1], sf->rmine + sf->roffset[sf->ndranks], &sf->leafdups[1]));
13549566063dSJacob Faibussowitsch     if (!bas->rootcontig[0] && !ismulti) PetscCall(PetscCheckDupsInt(bas->rootbuflen[0], bas->irootloc, &bas->rootdups[0]));
13559566063dSJacob Faibussowitsch     if (!bas->rootcontig[1] && !ismulti) PetscCall(PetscCheckDupsInt(bas->rootbuflen[1], bas->irootloc + bas->ioffset[bas->ndiranks], &bas->rootdups[1]));
1356013b3241SStefano Zampini   }
1357cd620004SJunchao Zhang   PetscFunctionReturn(0);
1358cd620004SJunchao Zhang }
1359cd620004SJunchao Zhang 
1360*9371c9d4SSatish Balay PetscErrorCode PetscSFResetPackFields(PetscSF sf) {
1361cd620004SJunchao Zhang   PetscSF_Basic *bas = (PetscSF_Basic *)sf->data;
1362cd620004SJunchao Zhang   PetscInt       i;
1363cd620004SJunchao Zhang 
1364cd620004SJunchao Zhang   PetscFunctionBegin;
1365cd620004SJunchao Zhang   for (i = PETSCSF_LOCAL; i <= PETSCSF_REMOTE; i++) {
13669566063dSJacob Faibussowitsch     PetscCall(PetscSFDestroyPackOpt(sf, PETSC_MEMTYPE_HOST, &sf->leafpackopt[i]));
13679566063dSJacob Faibussowitsch     PetscCall(PetscSFDestroyPackOpt(sf, PETSC_MEMTYPE_HOST, &bas->rootpackopt[i]));
13687fd2d3dbSJunchao Zhang #if defined(PETSC_HAVE_DEVICE)
13699566063dSJacob Faibussowitsch     PetscCall(PetscSFDestroyPackOpt(sf, PETSC_MEMTYPE_DEVICE, &sf->leafpackopt_d[i]));
13709566063dSJacob Faibussowitsch     PetscCall(PetscSFDestroyPackOpt(sf, PETSC_MEMTYPE_DEVICE, &bas->rootpackopt_d[i]));
13717fd2d3dbSJunchao Zhang #endif
1372cd620004SJunchao Zhang   }
1373cd620004SJunchao Zhang   PetscFunctionReturn(0);
1374cd620004SJunchao Zhang }
1375