xref: /petsc/src/vec/is/sf/impls/basic/sfpack.c (revision 46233b442a14b6faff55b7d8bbd3923d44c577ac)
1*46233b44SBarry Smith #include <petsc/private/sfimpl.h>
240e23c03SJunchao Zhang #include <../src/vec/is/sf/impls/basic/sfpack.h>
340e23c03SJunchao Zhang #include <../src/vec/is/sf/impls/basic/sfbasic.h>
440e23c03SJunchao Zhang 
57fd2d3dbSJunchao Zhang /* This is a C file that contains packing facilities, with dispatches to device if enabled. */
67fd2d3dbSJunchao Zhang 
740e23c03SJunchao Zhang /*
840e23c03SJunchao Zhang  * MPI_Reduce_local is not really useful because it can't handle sparse data and it vectorizes "in the wrong direction",
940e23c03SJunchao Zhang  * therefore we pack data types manually. This file defines packing routines for the standard data types.
1040e23c03SJunchao Zhang  */
1140e23c03SJunchao Zhang 
12cd620004SJunchao Zhang #define CPPJoin4(a, b, c, d) a##_##b##_##c##_##d
1340e23c03SJunchao Zhang 
14cd620004SJunchao Zhang /* Operations working like s += t */
159371c9d4SSatish Balay #define OP_BINARY(op, s, t) \
16d71ae5a4SJacob Faibussowitsch   do { \
17d71ae5a4SJacob Faibussowitsch     (s) = (s)op(t); \
18d71ae5a4SJacob Faibussowitsch   } while (0) /* binary ops in the middle such as +, *, && etc. */
199371c9d4SSatish Balay #define OP_FUNCTION(op, s, t) \
20d71ae5a4SJacob Faibussowitsch   do { \
21d71ae5a4SJacob Faibussowitsch     (s) = op((s), (t)); \
22d71ae5a4SJacob Faibussowitsch   } while (0) /* ops like a function, such as PetscMax, PetscMin */
239371c9d4SSatish Balay #define OP_LXOR(op, s, t) \
24d71ae5a4SJacob Faibussowitsch   do { \
25d71ae5a4SJacob Faibussowitsch     (s) = (!(s)) != (!(t)); \
26d71ae5a4SJacob Faibussowitsch   } while (0) /* logical exclusive OR */
279371c9d4SSatish Balay #define OP_ASSIGN(op, s, t) \
28d71ae5a4SJacob Faibussowitsch   do { \
29d71ae5a4SJacob Faibussowitsch     (s) = (t); \
30d71ae5a4SJacob Faibussowitsch   } while (0)
31cd620004SJunchao Zhang /* Ref MPI MAXLOC */
32cd620004SJunchao Zhang #define OP_XLOC(op, s, t) \
33cd620004SJunchao Zhang   do { \
34cd620004SJunchao Zhang     if ((s).u == (t).u) (s).i = PetscMin((s).i, (t).i); \
35cd620004SJunchao Zhang     else if (!((s).u op(t).u)) s = t; \
36cd620004SJunchao Zhang   } while (0)
3740e23c03SJunchao Zhang 
3840e23c03SJunchao Zhang /* DEF_PackFunc - macro defining a Pack routine
3940e23c03SJunchao Zhang 
4040e23c03SJunchao Zhang    Arguments of the macro:
41b23bfdefSJunchao Zhang    +Type      Type of the basic data in an entry, i.e., int, PetscInt, PetscReal etc. It is not the type of an entry.
42fcc7397dSJunchao Zhang    .BS        Block size for vectorization. It is a factor of bsz.
43b23bfdefSJunchao Zhang    -EQ        (bs == BS) ? 1 : 0. EQ is a compile-time const to help compiler optimizations. See below.
4440e23c03SJunchao Zhang 
4540e23c03SJunchao Zhang    Arguments of the Pack routine:
46cd620004SJunchao Zhang    +count     Number of indices in idx[].
47fcc7397dSJunchao Zhang    .start     When opt and idx are NULL, it means indices are contiguous & start is the first index; otherwise, not used.
48fcc7397dSJunchao Zhang    .opt       Per-pack optimization plan. NULL means no such plan.
49fcc7397dSJunchao Zhang    .idx       Indices of entries to packed.
50eb02082bSJunchao Zhang    .link      Provide a context for the current call, such as link->bs, number of basic types in an entry. Ex. if unit is MPI_2INT, then bs=2 and the basic type is int.
51cd620004SJunchao Zhang    .unpacked  Address of the unpacked data. The entries will be packed are unpacked[idx[i]],for i in [0,count).
52cd620004SJunchao Zhang    -packed    Address of the packed data.
5340e23c03SJunchao Zhang  */
54b23bfdefSJunchao Zhang #define DEF_PackFunc(Type, BS, EQ) \
55d71ae5a4SJacob Faibussowitsch   static PetscErrorCode CPPJoin4(Pack, Type, BS, EQ)(PetscSFLink link, PetscInt count, PetscInt start, PetscSFPackOpt opt, const PetscInt *idx, const void *unpacked, void *packed) \
56d71ae5a4SJacob Faibussowitsch   { \
57b23bfdefSJunchao Zhang     const Type    *u = (const Type *)unpacked, *u2; \
58b23bfdefSJunchao Zhang     Type          *p = (Type *)packed, *p2; \
59fcc7397dSJunchao Zhang     PetscInt       i, j, k, X, Y, r, bs = link->bs; \
60fcc7397dSJunchao Zhang     const PetscInt M   = (EQ) ? 1 : bs / BS; /* If EQ, then M=1 enables compiler's const-propagation */ \
61b23bfdefSJunchao Zhang     const PetscInt MBS = M * BS;             /* MBS=bs. We turn MBS into a compile time const when EQ=1. */ \
6240e23c03SJunchao Zhang     PetscFunctionBegin; \
639566063dSJacob Faibussowitsch     if (!idx) PetscCall(PetscArraycpy(p, u + start * MBS, MBS * count)); /* idx[] are contiguous */ \
649371c9d4SSatish Balay     else if (opt) { /* has optimizations available */ p2 = p; \
65fcc7397dSJunchao Zhang       for (r = 0; r < opt->n; r++) { \
66fcc7397dSJunchao Zhang         u2 = u + opt->start[r] * MBS; \
67fcc7397dSJunchao Zhang         X  = opt->X[r]; \
68fcc7397dSJunchao Zhang         Y  = opt->Y[r]; \
69fcc7397dSJunchao Zhang         for (k = 0; k < opt->dz[r]; k++) \
70fcc7397dSJunchao Zhang           for (j = 0; j < opt->dy[r]; j++) { \
719566063dSJacob Faibussowitsch             PetscCall(PetscArraycpy(p2, u2 + (X * Y * k + X * j) * MBS, opt->dx[r] * MBS)); \
72fcc7397dSJunchao Zhang             p2 += opt->dx[r] * MBS; \
73fcc7397dSJunchao Zhang           } \
74fcc7397dSJunchao Zhang       } \
75fcc7397dSJunchao Zhang     } else { \
76b23bfdefSJunchao Zhang       for (i = 0; i < count; i++) \
77eb02082bSJunchao Zhang         for (j = 0; j < M; j++)    /* Decent compilers should eliminate this loop when M = const 1 */ \
78eb02082bSJunchao Zhang           for (k = 0; k < BS; k++) /* Compiler either unrolls (BS=1) or vectorizes (BS=2,4,8,etc) this loop */ \
79b23bfdefSJunchao Zhang             p[i * MBS + j * BS + k] = u[idx[i] * MBS + j * BS + k]; \
8040e23c03SJunchao Zhang     } \
813ba16761SJacob Faibussowitsch     PetscFunctionReturn(PETSC_SUCCESS); \
8240e23c03SJunchao Zhang   }
8340e23c03SJunchao Zhang 
84cd620004SJunchao Zhang /* DEF_Action - macro defining a UnpackAndInsert routine that unpacks data from a contiguous buffer
85cd620004SJunchao Zhang                 and inserts into a sparse array.
8640e23c03SJunchao Zhang 
8740e23c03SJunchao Zhang    Arguments:
88b23bfdefSJunchao Zhang   .Type       Type of the data
8940e23c03SJunchao Zhang   .BS         Block size for vectorization
90b23bfdefSJunchao Zhang   .EQ        (bs == BS) ? 1 : 0. EQ is a compile-time const.
9140e23c03SJunchao Zhang 
9240e23c03SJunchao Zhang   Notes:
9340e23c03SJunchao Zhang    This macro is not combined with DEF_ActionAndOp because we want to use memcpy in this macro.
9440e23c03SJunchao Zhang  */
95cd620004SJunchao Zhang #define DEF_UnpackFunc(Type, BS, EQ) \
96d71ae5a4SJacob Faibussowitsch   static PetscErrorCode CPPJoin4(UnpackAndInsert, Type, BS, EQ)(PetscSFLink link, PetscInt count, PetscInt start, PetscSFPackOpt opt, const PetscInt *idx, void *unpacked, const void *packed) \
97d71ae5a4SJacob Faibussowitsch   { \
98b23bfdefSJunchao Zhang     Type          *u = (Type *)unpacked, *u2; \
99fcc7397dSJunchao Zhang     const Type    *p = (const Type *)packed; \
100fcc7397dSJunchao Zhang     PetscInt       i, j, k, X, Y, r, bs = link->bs; \
101fcc7397dSJunchao Zhang     const PetscInt M   = (EQ) ? 1 : bs / BS; /* If EQ, then M=1 enables compiler's const-propagation */ \
102b23bfdefSJunchao Zhang     const PetscInt MBS = M * BS;             /* MBS=bs. We turn MBS into a compile time const when EQ=1. */ \
10340e23c03SJunchao Zhang     PetscFunctionBegin; \
104b23bfdefSJunchao Zhang     if (!idx) { \
105fcc7397dSJunchao Zhang       u += start * MBS; \
1069566063dSJacob Faibussowitsch       if (u != p) PetscCall(PetscArraycpy(u, p, count *MBS)); \
107fcc7397dSJunchao Zhang     } else if (opt) { /* has optimizations available */ \
108fcc7397dSJunchao Zhang       for (r = 0; r < opt->n; r++) { \
109fcc7397dSJunchao Zhang         u2 = u + opt->start[r] * MBS; \
110fcc7397dSJunchao Zhang         X  = opt->X[r]; \
111fcc7397dSJunchao Zhang         Y  = opt->Y[r]; \
112fcc7397dSJunchao Zhang         for (k = 0; k < opt->dz[r]; k++) \
113fcc7397dSJunchao Zhang           for (j = 0; j < opt->dy[r]; j++) { \
1149566063dSJacob Faibussowitsch             PetscCall(PetscArraycpy(u2 + (X * Y * k + X * j) * MBS, p, opt->dx[r] * MBS)); \
115fcc7397dSJunchao Zhang             p += opt->dx[r] * MBS; \
116fcc7397dSJunchao Zhang           } \
117fcc7397dSJunchao Zhang       } \
118fcc7397dSJunchao Zhang     } else { \
119b23bfdefSJunchao Zhang       for (i = 0; i < count; i++) \
120b23bfdefSJunchao Zhang         for (j = 0; j < M; j++) \
121cd620004SJunchao Zhang           for (k = 0; k < BS; k++) u[idx[i] * MBS + j * BS + k] = p[i * MBS + j * BS + k]; \
12240e23c03SJunchao Zhang     } \
1233ba16761SJacob Faibussowitsch     PetscFunctionReturn(PETSC_SUCCESS); \
12440e23c03SJunchao Zhang   }
12540e23c03SJunchao Zhang 
126cd620004SJunchao Zhang /* DEF_UnpackAndOp - macro defining a UnpackAndOp routine where Op should not be Insert
12740e23c03SJunchao Zhang 
12840e23c03SJunchao Zhang    Arguments:
129cd620004SJunchao Zhang   +Opname     Name of the Op, such as Add, Mult, LAND, etc.
130b23bfdefSJunchao Zhang   .Type       Type of the data
13140e23c03SJunchao Zhang   .BS         Block size for vectorization
132b23bfdefSJunchao Zhang   .EQ         (bs == BS) ? 1 : 0. EQ is a compile-time const.
133cd620004SJunchao Zhang   .Op         Operator for the op, such as +, *, &&, ||, PetscMax, PetscMin, etc.
134cd620004SJunchao Zhang   .OpApply    Macro defining application of the op. Could be OP_BINARY, OP_FUNCTION, OP_LXOR
13540e23c03SJunchao Zhang  */
136cd620004SJunchao Zhang #define DEF_UnpackAndOp(Type, BS, EQ, Opname, Op, OpApply) \
137d71ae5a4SJacob Faibussowitsch   static PetscErrorCode CPPJoin4(UnpackAnd##Opname, Type, BS, EQ)(PetscSFLink link, PetscInt count, PetscInt start, PetscSFPackOpt opt, const PetscInt *idx, void *unpacked, const void *packed) \
138d71ae5a4SJacob Faibussowitsch   { \
139cd620004SJunchao Zhang     Type          *u = (Type *)unpacked, *u2; \
140fcc7397dSJunchao Zhang     const Type    *p = (const Type *)packed; \
141fcc7397dSJunchao Zhang     PetscInt       i, j, k, X, Y, r, bs = link->bs; \
142fcc7397dSJunchao Zhang     const PetscInt M   = (EQ) ? 1 : bs / BS; /* If EQ, then M=1 enables compiler's const-propagation */ \
143b23bfdefSJunchao Zhang     const PetscInt MBS = M * BS;             /* MBS=bs. We turn MBS into a compile time const when EQ=1. */ \
14440e23c03SJunchao Zhang     PetscFunctionBegin; \
145b23bfdefSJunchao Zhang     if (!idx) { \
146fcc7397dSJunchao Zhang       u += start * MBS; \
147cd620004SJunchao Zhang       for (i = 0; i < count; i++) \
148cd620004SJunchao Zhang         for (j = 0; j < M; j++) \
1499371c9d4SSatish Balay           for (k = 0; k < BS; k++) OpApply(Op, u[i * MBS + j * BS + k], p[i * MBS + j * BS + k]); \
150fcc7397dSJunchao Zhang     } else if (opt) { /* idx[] has patterns */ \
151fcc7397dSJunchao Zhang       for (r = 0; r < opt->n; r++) { \
152fcc7397dSJunchao Zhang         u2 = u + opt->start[r] * MBS; \
153fcc7397dSJunchao Zhang         X  = opt->X[r]; \
154fcc7397dSJunchao Zhang         Y  = opt->Y[r]; \
155fcc7397dSJunchao Zhang         for (k = 0; k < opt->dz[r]; k++) \
156fcc7397dSJunchao Zhang           for (j = 0; j < opt->dy[r]; j++) { \
157fcc7397dSJunchao Zhang             for (i = 0; i < opt->dx[r] * MBS; i++) OpApply(Op, u2[(X * Y * k + X * j) * MBS + i], p[i]); \
158fcc7397dSJunchao Zhang             p += opt->dx[r] * MBS; \
159fcc7397dSJunchao Zhang           } \
160fcc7397dSJunchao Zhang       } \
161fcc7397dSJunchao Zhang     } else { \
162cd620004SJunchao Zhang       for (i = 0; i < count; i++) \
163cd620004SJunchao Zhang         for (j = 0; j < M; j++) \
1649371c9d4SSatish Balay           for (k = 0; k < BS; k++) OpApply(Op, u[idx[i] * MBS + j * BS + k], p[i * MBS + j * BS + k]); \
165cd620004SJunchao Zhang     } \
1663ba16761SJacob Faibussowitsch     PetscFunctionReturn(PETSC_SUCCESS); \
167cd620004SJunchao Zhang   }
168cd620004SJunchao Zhang 
169cd620004SJunchao Zhang #define DEF_FetchAndOp(Type, BS, EQ, Opname, Op, OpApply) \
170d71ae5a4SJacob Faibussowitsch   static PetscErrorCode CPPJoin4(FetchAnd##Opname, Type, BS, EQ)(PetscSFLink link, PetscInt count, PetscInt start, PetscSFPackOpt opt, const PetscInt *idx, void *unpacked, void *packed) \
171d71ae5a4SJacob Faibussowitsch   { \
172fcc7397dSJunchao Zhang     Type          *u = (Type *)unpacked, *p = (Type *)packed, tmp; \
173fcc7397dSJunchao Zhang     PetscInt       i, j, k, r, l, bs = link->bs; \
174fcc7397dSJunchao Zhang     const PetscInt M   = (EQ) ? 1 : bs / BS; \
175fcc7397dSJunchao Zhang     const PetscInt MBS = M * BS; \
176cd620004SJunchao Zhang     PetscFunctionBegin; \
177fcc7397dSJunchao Zhang     for (i = 0; i < count; i++) { \
178fcc7397dSJunchao Zhang       r = (!idx ? start + i : idx[i]) * MBS; \
179fcc7397dSJunchao Zhang       l = i * MBS; \
180b23bfdefSJunchao Zhang       for (j = 0; j < M; j++) \
181b23bfdefSJunchao Zhang         for (k = 0; k < BS; k++) { \
182fcc7397dSJunchao Zhang           tmp = u[r + j * BS + k]; \
183fcc7397dSJunchao Zhang           OpApply(Op, u[r + j * BS + k], p[l + j * BS + k]); \
184fcc7397dSJunchao Zhang           p[l + j * BS + k] = tmp; \
185cd620004SJunchao Zhang         } \
186cd620004SJunchao Zhang     } \
1873ba16761SJacob Faibussowitsch     PetscFunctionReturn(PETSC_SUCCESS); \
188cd620004SJunchao Zhang   }
189cd620004SJunchao Zhang 
190cd620004SJunchao Zhang #define DEF_ScatterAndOp(Type, BS, EQ, Opname, Op, OpApply) \
191d71ae5a4SJacob Faibussowitsch   static PetscErrorCode CPPJoin4(ScatterAnd##Opname, Type, BS, EQ)(PetscSFLink link, PetscInt count, PetscInt srcStart, PetscSFPackOpt srcOpt, const PetscInt *srcIdx, const void *src, PetscInt dstStart, PetscSFPackOpt dstOpt, const PetscInt *dstIdx, void *dst) \
192d71ae5a4SJacob Faibussowitsch   { \
193fcc7397dSJunchao Zhang     const Type    *u = (const Type *)src; \
194fcc7397dSJunchao Zhang     Type          *v = (Type *)dst; \
195fcc7397dSJunchao Zhang     PetscInt       i, j, k, s, t, X, Y, bs = link->bs; \
196cd620004SJunchao Zhang     const PetscInt M   = (EQ) ? 1 : bs / BS; \
197cd620004SJunchao Zhang     const PetscInt MBS = M * BS; \
198cd620004SJunchao Zhang     PetscFunctionBegin; \
199fcc7397dSJunchao Zhang     if (!srcIdx) { /* src is contiguous */ \
200fcc7397dSJunchao Zhang       u += srcStart * MBS; \
2019566063dSJacob Faibussowitsch       PetscCall(CPPJoin4(UnpackAnd##Opname, Type, BS, EQ)(link, count, dstStart, dstOpt, dstIdx, dst, u)); \
202fcc7397dSJunchao Zhang     } else if (srcOpt && !dstIdx) { /* src is 3D, dst is contiguous */ \
203fcc7397dSJunchao Zhang       u += srcOpt->start[0] * MBS; \
204fcc7397dSJunchao Zhang       v += dstStart * MBS; \
2059371c9d4SSatish Balay       X = srcOpt->X[0]; \
2069371c9d4SSatish Balay       Y = srcOpt->Y[0]; \
207fcc7397dSJunchao Zhang       for (k = 0; k < srcOpt->dz[0]; k++) \
208fcc7397dSJunchao Zhang         for (j = 0; j < srcOpt->dy[0]; j++) { \
209fcc7397dSJunchao Zhang           for (i = 0; i < srcOpt->dx[0] * MBS; i++) OpApply(Op, v[i], u[(X * Y * k + X * j) * MBS + i]); \
210fcc7397dSJunchao Zhang           v += srcOpt->dx[0] * MBS; \
211fcc7397dSJunchao Zhang         } \
212fcc7397dSJunchao Zhang     } else { /* all other cases */ \
213fcc7397dSJunchao Zhang       for (i = 0; i < count; i++) { \
214fcc7397dSJunchao Zhang         s = (!srcIdx ? srcStart + i : srcIdx[i]) * MBS; \
215fcc7397dSJunchao Zhang         t = (!dstIdx ? dstStart + i : dstIdx[i]) * MBS; \
216cd620004SJunchao Zhang         for (j = 0; j < M; j++) \
217fcc7397dSJunchao Zhang           for (k = 0; k < BS; k++) OpApply(Op, v[t + j * BS + k], u[s + j * BS + k]); \
218fcc7397dSJunchao Zhang       } \
219cd620004SJunchao Zhang     } \
2203ba16761SJacob Faibussowitsch     PetscFunctionReturn(PETSC_SUCCESS); \
221cd620004SJunchao Zhang   }
222cd620004SJunchao Zhang 
223cd620004SJunchao Zhang #define DEF_FetchAndOpLocal(Type, BS, EQ, Opname, Op, OpApply) \
224d71ae5a4SJacob Faibussowitsch   static PetscErrorCode CPPJoin4(FetchAnd##Opname##Local, Type, BS, EQ)(PetscSFLink link, PetscInt count, PetscInt rootstart, PetscSFPackOpt rootopt, const PetscInt *rootidx, void *rootdata, PetscInt leafstart, PetscSFPackOpt leafopt, const PetscInt *leafidx, const void *leafdata, void *leafupdate) \
225d71ae5a4SJacob Faibussowitsch   { \
226fcc7397dSJunchao Zhang     Type          *rdata = (Type *)rootdata, *lupdate = (Type *)leafupdate; \
227fcc7397dSJunchao Zhang     const Type    *ldata = (const Type *)leafdata; \
228fcc7397dSJunchao Zhang     PetscInt       i, j, k, r, l, bs = link->bs; \
229cd620004SJunchao Zhang     const PetscInt M   = (EQ) ? 1 : bs / BS; \
230cd620004SJunchao Zhang     const PetscInt MBS = M * BS; \
231cd620004SJunchao Zhang     PetscFunctionBegin; \
232fcc7397dSJunchao Zhang     for (i = 0; i < count; i++) { \
233fcc7397dSJunchao Zhang       r = (rootidx ? rootidx[i] : rootstart + i) * MBS; \
234fcc7397dSJunchao Zhang       l = (leafidx ? leafidx[i] : leafstart + i) * MBS; \
235cd620004SJunchao Zhang       for (j = 0; j < M; j++) \
236cd620004SJunchao Zhang         for (k = 0; k < BS; k++) { \
237fcc7397dSJunchao Zhang           lupdate[l + j * BS + k] = rdata[r + j * BS + k]; \
238fcc7397dSJunchao Zhang           OpApply(Op, rdata[r + j * BS + k], ldata[l + j * BS + k]); \
23940e23c03SJunchao Zhang         } \
24040e23c03SJunchao Zhang     } \
2413ba16761SJacob Faibussowitsch     PetscFunctionReturn(PETSC_SUCCESS); \
24240e23c03SJunchao Zhang   }
24340e23c03SJunchao Zhang 
244b23bfdefSJunchao Zhang /* Pack, Unpack/Fetch ops */
245b23bfdefSJunchao Zhang #define DEF_Pack(Type, BS, EQ) \
246d71ae5a4SJacob Faibussowitsch   DEF_PackFunc(Type, BS, EQ) DEF_UnpackFunc(Type, BS, EQ) DEF_ScatterAndOp(Type, BS, EQ, Insert, =, OP_ASSIGN) static void CPPJoin4(PackInit_Pack, Type, BS, EQ)(PetscSFLink link) \
247d71ae5a4SJacob Faibussowitsch   { \
248eb02082bSJunchao Zhang     link->h_Pack             = CPPJoin4(Pack, Type, BS, EQ); \
249eb02082bSJunchao Zhang     link->h_UnpackAndInsert  = CPPJoin4(UnpackAndInsert, Type, BS, EQ); \
250cd620004SJunchao Zhang     link->h_ScatterAndInsert = CPPJoin4(ScatterAndInsert, Type, BS, EQ); \
25140e23c03SJunchao Zhang   }
25240e23c03SJunchao Zhang 
253b23bfdefSJunchao Zhang /* Add, Mult ops */
254b23bfdefSJunchao Zhang #define DEF_Add(Type, BS, EQ) \
255d71ae5a4SJacob Faibussowitsch   DEF_UnpackAndOp(Type, BS, EQ, Add, +, OP_BINARY) DEF_UnpackAndOp(Type, BS, EQ, Mult, *, OP_BINARY) DEF_FetchAndOp(Type, BS, EQ, Add, +, OP_BINARY) DEF_ScatterAndOp(Type, BS, EQ, Add, +, OP_BINARY) DEF_ScatterAndOp(Type, BS, EQ, Mult, *, OP_BINARY) DEF_FetchAndOpLocal(Type, BS, EQ, Add, +, OP_BINARY) static void CPPJoin4(PackInit_Add, Type, BS, EQ)(PetscSFLink link) \
256d71ae5a4SJacob Faibussowitsch   { \
257eb02082bSJunchao Zhang     link->h_UnpackAndAdd     = CPPJoin4(UnpackAndAdd, Type, BS, EQ); \
258eb02082bSJunchao Zhang     link->h_UnpackAndMult    = CPPJoin4(UnpackAndMult, Type, BS, EQ); \
259eb02082bSJunchao Zhang     link->h_FetchAndAdd      = CPPJoin4(FetchAndAdd, Type, BS, EQ); \
260cd620004SJunchao Zhang     link->h_ScatterAndAdd    = CPPJoin4(ScatterAndAdd, Type, BS, EQ); \
261cd620004SJunchao Zhang     link->h_ScatterAndMult   = CPPJoin4(ScatterAndMult, Type, BS, EQ); \
262cd620004SJunchao Zhang     link->h_FetchAndAddLocal = CPPJoin4(FetchAndAddLocal, Type, BS, EQ); \
26340e23c03SJunchao Zhang   }
26440e23c03SJunchao Zhang 
265b23bfdefSJunchao Zhang /* Max, Min ops */
266b23bfdefSJunchao Zhang #define DEF_Cmp(Type, BS, EQ) \
267d71ae5a4SJacob Faibussowitsch   DEF_UnpackAndOp(Type, BS, EQ, Max, PetscMax, OP_FUNCTION) DEF_UnpackAndOp(Type, BS, EQ, Min, PetscMin, OP_FUNCTION) DEF_ScatterAndOp(Type, BS, EQ, Max, PetscMax, OP_FUNCTION) DEF_ScatterAndOp(Type, BS, EQ, Min, PetscMin, OP_FUNCTION) static void CPPJoin4(PackInit_Compare, Type, BS, EQ)(PetscSFLink link) \
268d71ae5a4SJacob Faibussowitsch   { \
269eb02082bSJunchao Zhang     link->h_UnpackAndMax  = CPPJoin4(UnpackAndMax, Type, BS, EQ); \
270eb02082bSJunchao Zhang     link->h_UnpackAndMin  = CPPJoin4(UnpackAndMin, Type, BS, EQ); \
271cd620004SJunchao Zhang     link->h_ScatterAndMax = CPPJoin4(ScatterAndMax, Type, BS, EQ); \
272cd620004SJunchao Zhang     link->h_ScatterAndMin = CPPJoin4(ScatterAndMin, Type, BS, EQ); \
273b23bfdefSJunchao Zhang   }
274b23bfdefSJunchao Zhang 
275b23bfdefSJunchao Zhang /* Logical ops.
276cd620004SJunchao Zhang   The operator in OP_LXOR should be empty but is ||. It is not used. Put here to avoid
27740e23c03SJunchao Zhang   the compilation warning "empty macro arguments are undefined in ISO C90"
27840e23c03SJunchao Zhang  */
279b23bfdefSJunchao Zhang #define DEF_Log(Type, BS, EQ) \
280d71ae5a4SJacob Faibussowitsch   DEF_UnpackAndOp(Type, BS, EQ, LAND, &&, OP_BINARY) DEF_UnpackAndOp(Type, BS, EQ, LOR, ||, OP_BINARY) DEF_UnpackAndOp(Type, BS, EQ, LXOR, ||, OP_LXOR) DEF_ScatterAndOp(Type, BS, EQ, LAND, &&, OP_BINARY) DEF_ScatterAndOp(Type, BS, EQ, LOR, ||, OP_BINARY) DEF_ScatterAndOp(Type, BS, EQ, LXOR, ||, OP_LXOR) static void CPPJoin4(PackInit_Logical, Type, BS, EQ)(PetscSFLink link) \
281d71ae5a4SJacob Faibussowitsch   { \
282eb02082bSJunchao Zhang     link->h_UnpackAndLAND  = CPPJoin4(UnpackAndLAND, Type, BS, EQ); \
283eb02082bSJunchao Zhang     link->h_UnpackAndLOR   = CPPJoin4(UnpackAndLOR, Type, BS, EQ); \
284eb02082bSJunchao Zhang     link->h_UnpackAndLXOR  = CPPJoin4(UnpackAndLXOR, Type, BS, EQ); \
285cd620004SJunchao Zhang     link->h_ScatterAndLAND = CPPJoin4(ScatterAndLAND, Type, BS, EQ); \
286cd620004SJunchao Zhang     link->h_ScatterAndLOR  = CPPJoin4(ScatterAndLOR, Type, BS, EQ); \
287cd620004SJunchao Zhang     link->h_ScatterAndLXOR = CPPJoin4(ScatterAndLXOR, Type, BS, EQ); \
28840e23c03SJunchao Zhang   }
28940e23c03SJunchao Zhang 
290b23bfdefSJunchao Zhang /* Bitwise ops */
291b23bfdefSJunchao Zhang #define DEF_Bit(Type, BS, EQ) \
292d71ae5a4SJacob Faibussowitsch   DEF_UnpackAndOp(Type, BS, EQ, BAND, &, OP_BINARY) DEF_UnpackAndOp(Type, BS, EQ, BOR, |, OP_BINARY) DEF_UnpackAndOp(Type, BS, EQ, BXOR, ^, OP_BINARY) DEF_ScatterAndOp(Type, BS, EQ, BAND, &, OP_BINARY) DEF_ScatterAndOp(Type, BS, EQ, BOR, |, OP_BINARY) DEF_ScatterAndOp(Type, BS, EQ, BXOR, ^, OP_BINARY) static void CPPJoin4(PackInit_Bitwise, Type, BS, EQ)(PetscSFLink link) \
293d71ae5a4SJacob Faibussowitsch   { \
294eb02082bSJunchao Zhang     link->h_UnpackAndBAND  = CPPJoin4(UnpackAndBAND, Type, BS, EQ); \
295eb02082bSJunchao Zhang     link->h_UnpackAndBOR   = CPPJoin4(UnpackAndBOR, Type, BS, EQ); \
296eb02082bSJunchao Zhang     link->h_UnpackAndBXOR  = CPPJoin4(UnpackAndBXOR, Type, BS, EQ); \
297cd620004SJunchao Zhang     link->h_ScatterAndBAND = CPPJoin4(ScatterAndBAND, Type, BS, EQ); \
298cd620004SJunchao Zhang     link->h_ScatterAndBOR  = CPPJoin4(ScatterAndBOR, Type, BS, EQ); \
299cd620004SJunchao Zhang     link->h_ScatterAndBXOR = CPPJoin4(ScatterAndBXOR, Type, BS, EQ); \
30040e23c03SJunchao Zhang   }
30140e23c03SJunchao Zhang 
302cd620004SJunchao Zhang /* Maxloc, Minloc ops */
303cd620004SJunchao Zhang #define DEF_Xloc(Type, BS, EQ) \
304d71ae5a4SJacob Faibussowitsch   DEF_UnpackAndOp(Type, BS, EQ, Max, >, OP_XLOC) DEF_UnpackAndOp(Type, BS, EQ, Min, <, OP_XLOC) DEF_ScatterAndOp(Type, BS, EQ, Max, >, OP_XLOC) DEF_ScatterAndOp(Type, BS, EQ, Min, <, OP_XLOC) static void CPPJoin4(PackInit_Xloc, Type, BS, EQ)(PetscSFLink link) \
305d71ae5a4SJacob Faibussowitsch   { \
306cd620004SJunchao Zhang     link->h_UnpackAndMaxloc  = CPPJoin4(UnpackAndMax, Type, BS, EQ); \
307cd620004SJunchao Zhang     link->h_UnpackAndMinloc  = CPPJoin4(UnpackAndMin, Type, BS, EQ); \
308cd620004SJunchao Zhang     link->h_ScatterAndMaxloc = CPPJoin4(ScatterAndMax, Type, BS, EQ); \
309cd620004SJunchao Zhang     link->h_ScatterAndMinloc = CPPJoin4(ScatterAndMin, Type, BS, EQ); \
31040e23c03SJunchao Zhang   }
31140e23c03SJunchao Zhang 
312b23bfdefSJunchao Zhang #define DEF_IntegerType(Type, BS, EQ) \
313d71ae5a4SJacob Faibussowitsch   DEF_Pack(Type, BS, EQ) DEF_Add(Type, BS, EQ) DEF_Cmp(Type, BS, EQ) DEF_Log(Type, BS, EQ) DEF_Bit(Type, BS, EQ) static void CPPJoin4(PackInit_IntegerType, Type, BS, EQ)(PetscSFLink link) \
314d71ae5a4SJacob Faibussowitsch   { \
315b23bfdefSJunchao Zhang     CPPJoin4(PackInit_Pack, Type, BS, EQ)(link); \
316b23bfdefSJunchao Zhang     CPPJoin4(PackInit_Add, Type, BS, EQ)(link); \
317b23bfdefSJunchao Zhang     CPPJoin4(PackInit_Compare, Type, BS, EQ)(link); \
318b23bfdefSJunchao Zhang     CPPJoin4(PackInit_Logical, Type, BS, EQ)(link); \
319b23bfdefSJunchao Zhang     CPPJoin4(PackInit_Bitwise, Type, BS, EQ)(link); \
32040e23c03SJunchao Zhang   }
32140e23c03SJunchao Zhang 
322b23bfdefSJunchao Zhang #define DEF_RealType(Type, BS, EQ) \
323d71ae5a4SJacob Faibussowitsch   DEF_Pack(Type, BS, EQ) DEF_Add(Type, BS, EQ) DEF_Cmp(Type, BS, EQ) static void CPPJoin4(PackInit_RealType, Type, BS, EQ)(PetscSFLink link) \
324d71ae5a4SJacob Faibussowitsch   { \
325b23bfdefSJunchao Zhang     CPPJoin4(PackInit_Pack, Type, BS, EQ)(link); \
326b23bfdefSJunchao Zhang     CPPJoin4(PackInit_Add, Type, BS, EQ)(link); \
327b23bfdefSJunchao Zhang     CPPJoin4(PackInit_Compare, Type, BS, EQ)(link); \
328b23bfdefSJunchao Zhang   }
32940e23c03SJunchao Zhang 
33040e23c03SJunchao Zhang #if defined(PETSC_HAVE_COMPLEX)
331b23bfdefSJunchao Zhang   #define DEF_ComplexType(Type, BS, EQ) \
332d71ae5a4SJacob Faibussowitsch     DEF_Pack(Type, BS, EQ) DEF_Add(Type, BS, EQ) static void CPPJoin4(PackInit_ComplexType, Type, BS, EQ)(PetscSFLink link) \
333d71ae5a4SJacob Faibussowitsch     { \
334b23bfdefSJunchao Zhang       CPPJoin4(PackInit_Pack, Type, BS, EQ)(link); \
335b23bfdefSJunchao Zhang       CPPJoin4(PackInit_Add, Type, BS, EQ)(link); \
336b23bfdefSJunchao Zhang     }
33740e23c03SJunchao Zhang #endif
338b23bfdefSJunchao Zhang 
339b23bfdefSJunchao Zhang #define DEF_DumbType(Type, BS, EQ) \
340d71ae5a4SJacob Faibussowitsch   DEF_Pack(Type, BS, EQ) static void CPPJoin4(PackInit_DumbType, Type, BS, EQ)(PetscSFLink link) \
341d71ae5a4SJacob Faibussowitsch   { \
342b23bfdefSJunchao Zhang     CPPJoin4(PackInit_Pack, Type, BS, EQ)(link); \
343b23bfdefSJunchao Zhang   }
344b23bfdefSJunchao Zhang 
345b23bfdefSJunchao Zhang /* Maxloc, Minloc */
346cd620004SJunchao Zhang #define DEF_PairType(Type, BS, EQ) \
347d71ae5a4SJacob Faibussowitsch   DEF_Pack(Type, BS, EQ) DEF_Xloc(Type, BS, EQ) static void CPPJoin4(PackInit_PairType, Type, BS, EQ)(PetscSFLink link) \
348d71ae5a4SJacob Faibussowitsch   { \
349cd620004SJunchao Zhang     CPPJoin4(PackInit_Pack, Type, BS, EQ)(link); \
350cd620004SJunchao Zhang     CPPJoin4(PackInit_Xloc, Type, BS, EQ)(link); \
351b23bfdefSJunchao Zhang   }
352b23bfdefSJunchao Zhang 
353b23bfdefSJunchao Zhang DEF_IntegerType(PetscInt, 1, 1)   /* unit = 1 MPIU_INT  */
354b23bfdefSJunchao Zhang   DEF_IntegerType(PetscInt, 2, 1) /* unit = 2 MPIU_INTs */
355b23bfdefSJunchao Zhang   DEF_IntegerType(PetscInt, 4, 1) /* unit = 4 MPIU_INTs */
356b23bfdefSJunchao Zhang   DEF_IntegerType(PetscInt, 8, 1) /* unit = 8 MPIU_INTs */
357b23bfdefSJunchao Zhang   DEF_IntegerType(PetscInt, 1, 0) /* unit = 1*n MPIU_INTs, n>1 */
358b23bfdefSJunchao Zhang   DEF_IntegerType(PetscInt, 2, 0) /* unit = 2*n MPIU_INTs, n>1 */
359b23bfdefSJunchao Zhang   DEF_IntegerType(PetscInt, 4, 0) /* unit = 4*n MPIU_INTs, n>1 */
360b23bfdefSJunchao Zhang   DEF_IntegerType(PetscInt, 8, 0) /* unit = 8*n MPIU_INTs, n>1. Routines with bigger BS are tried first. */
361b23bfdefSJunchao Zhang 
362b23bfdefSJunchao Zhang #if defined(PETSC_USE_64BIT_INDICES) /* Do not need (though it is OK) to generate redundant functions if PetscInt is int */
3639371c9d4SSatish Balay   DEF_IntegerType(int, 1, 1) DEF_IntegerType(int, 2, 1) DEF_IntegerType(int, 4, 1) DEF_IntegerType(int, 8, 1) DEF_IntegerType(int, 1, 0) DEF_IntegerType(int, 2, 0) DEF_IntegerType(int, 4, 0) DEF_IntegerType(int, 8, 0)
364b23bfdefSJunchao Zhang #endif
365b23bfdefSJunchao Zhang 
366b23bfdefSJunchao Zhang   /* The typedefs are used to get a typename without space that CPPJoin can handle */
367b23bfdefSJunchao Zhang   typedef signed char SignedChar;
3689371c9d4SSatish Balay DEF_IntegerType(SignedChar, 1, 1) DEF_IntegerType(SignedChar, 2, 1) DEF_IntegerType(SignedChar, 4, 1) DEF_IntegerType(SignedChar, 8, 1) DEF_IntegerType(SignedChar, 1, 0) DEF_IntegerType(SignedChar, 2, 0) DEF_IntegerType(SignedChar, 4, 0) DEF_IntegerType(SignedChar, 8, 0)
369b23bfdefSJunchao Zhang 
370b23bfdefSJunchao Zhang   typedef unsigned char UnsignedChar;
3719371c9d4SSatish Balay DEF_IntegerType(UnsignedChar, 1, 1) DEF_IntegerType(UnsignedChar, 2, 1) DEF_IntegerType(UnsignedChar, 4, 1) DEF_IntegerType(UnsignedChar, 8, 1) DEF_IntegerType(UnsignedChar, 1, 0) DEF_IntegerType(UnsignedChar, 2, 0) DEF_IntegerType(UnsignedChar, 4, 0) DEF_IntegerType(UnsignedChar, 8, 0)
372b23bfdefSJunchao Zhang 
3739371c9d4SSatish Balay   DEF_RealType(PetscReal, 1, 1) DEF_RealType(PetscReal, 2, 1) DEF_RealType(PetscReal, 4, 1) DEF_RealType(PetscReal, 8, 1) DEF_RealType(PetscReal, 1, 0) DEF_RealType(PetscReal, 2, 0) DEF_RealType(PetscReal, 4, 0) DEF_RealType(PetscReal, 8, 0)
374b23bfdefSJunchao Zhang #if defined(PETSC_HAVE_COMPLEX)
3759371c9d4SSatish Balay     DEF_ComplexType(PetscComplex, 1, 1) DEF_ComplexType(PetscComplex, 2, 1) DEF_ComplexType(PetscComplex, 4, 1) DEF_ComplexType(PetscComplex, 8, 1) DEF_ComplexType(PetscComplex, 1, 0) DEF_ComplexType(PetscComplex, 2, 0) DEF_ComplexType(PetscComplex, 4, 0) DEF_ComplexType(PetscComplex, 8, 0)
376b23bfdefSJunchao Zhang #endif
377b23bfdefSJunchao Zhang 
378cd620004SJunchao Zhang #define PairType(Type1, Type2) Type1##_##Type2
3799371c9d4SSatish Balay       typedef struct {
3809371c9d4SSatish Balay   int u;
3819371c9d4SSatish Balay   int i;
3829371c9d4SSatish Balay } PairType(int, int);
3839371c9d4SSatish Balay typedef struct {
3849371c9d4SSatish Balay   PetscInt u;
3859371c9d4SSatish Balay   PetscInt i;
3869371c9d4SSatish Balay } PairType(PetscInt, PetscInt);
3879371c9d4SSatish Balay DEF_PairType(PairType(int, int), 1, 1) DEF_PairType(PairType(PetscInt, PetscInt), 1, 1)
388b23bfdefSJunchao Zhang 
389b23bfdefSJunchao Zhang   /* If we don't know the basic type, we treat it as a stream of chars or ints */
3909371c9d4SSatish Balay   DEF_DumbType(char, 1, 1) DEF_DumbType(char, 2, 1) DEF_DumbType(char, 4, 1) DEF_DumbType(char, 1, 0) DEF_DumbType(char, 2, 0) DEF_DumbType(char, 4, 0)
391b23bfdefSJunchao Zhang 
392eb02082bSJunchao Zhang     typedef int DumbInt; /* To have a different name than 'int' used above. The name is used to make routine names. */
3939371c9d4SSatish Balay DEF_DumbType(DumbInt, 1, 1) DEF_DumbType(DumbInt, 2, 1) DEF_DumbType(DumbInt, 4, 1) DEF_DumbType(DumbInt, 8, 1) DEF_DumbType(DumbInt, 1, 0) DEF_DumbType(DumbInt, 2, 0) DEF_DumbType(DumbInt, 4, 0) DEF_DumbType(DumbInt, 8, 0)
39440e23c03SJunchao Zhang 
395d71ae5a4SJacob Faibussowitsch   PetscErrorCode PetscSFLinkDestroy(PetscSF sf, PetscSFLink link)
396d71ae5a4SJacob Faibussowitsch {
397cd620004SJunchao Zhang   PetscSF_Basic *bas = (PetscSF_Basic *)sf->data;
39871438e86SJunchao Zhang   PetscInt       i, nreqs = (bas->nrootreqs + sf->nleafreqs) * 8;
39971438e86SJunchao Zhang 
40071438e86SJunchao Zhang   PetscFunctionBegin;
40171438e86SJunchao Zhang   /* Destroy device-specific fields */
4029566063dSJacob Faibussowitsch   if (link->deviceinited) PetscCall((*link->Destroy)(sf, link));
40371438e86SJunchao Zhang 
40471438e86SJunchao Zhang   /* Destroy host related fields */
4059566063dSJacob Faibussowitsch   if (!link->isbuiltin) PetscCallMPI(MPI_Type_free(&link->unit));
40671438e86SJunchao Zhang   if (!link->use_nvshmem) {
40771438e86SJunchao Zhang     for (i = 0; i < nreqs; i++) { /* Persistent reqs must be freed. */
4089566063dSJacob Faibussowitsch       if (link->reqs[i] != MPI_REQUEST_NULL) PetscCallMPI(MPI_Request_free(&link->reqs[i]));
40971438e86SJunchao Zhang     }
4109566063dSJacob Faibussowitsch     PetscCall(PetscFree(link->reqs));
41171438e86SJunchao Zhang     for (i = PETSCSF_LOCAL; i <= PETSCSF_REMOTE; i++) {
4129566063dSJacob Faibussowitsch       PetscCall(PetscFree(link->rootbuf_alloc[i][PETSC_MEMTYPE_HOST]));
4139566063dSJacob Faibussowitsch       PetscCall(PetscFree(link->leafbuf_alloc[i][PETSC_MEMTYPE_HOST]));
41471438e86SJunchao Zhang     }
41571438e86SJunchao Zhang   }
4169566063dSJacob Faibussowitsch   PetscCall(PetscFree(link));
4173ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
41871438e86SJunchao Zhang }
41971438e86SJunchao Zhang 
420d71ae5a4SJacob Faibussowitsch PetscErrorCode PetscSFLinkCreate(PetscSF sf, MPI_Datatype unit, PetscMemType rootmtype, const void *rootdata, PetscMemType leafmtype, const void *leafdata, MPI_Op op, PetscSFOperation sfop, PetscSFLink *mylink)
421d71ae5a4SJacob Faibussowitsch {
422cd620004SJunchao Zhang   PetscFunctionBegin;
4239566063dSJacob Faibussowitsch   PetscCall(PetscSFSetErrorOnUnsupportedOverlap(sf, unit, rootdata, leafdata));
42471438e86SJunchao Zhang #if defined(PETSC_HAVE_NVSHMEM)
42571438e86SJunchao Zhang   {
42671438e86SJunchao Zhang     PetscBool use_nvshmem;
4279566063dSJacob Faibussowitsch     PetscCall(PetscSFLinkNvshmemCheck(sf, rootmtype, rootdata, leafmtype, leafdata, &use_nvshmem));
42871438e86SJunchao Zhang     if (use_nvshmem) {
4299566063dSJacob Faibussowitsch       PetscCall(PetscSFLinkCreate_NVSHMEM(sf, unit, rootmtype, rootdata, leafmtype, leafdata, op, sfop, mylink));
4303ba16761SJacob Faibussowitsch       PetscFunctionReturn(PETSC_SUCCESS);
431cd620004SJunchao Zhang     }
432cd620004SJunchao Zhang   }
4337fd2d3dbSJunchao Zhang #endif
4349566063dSJacob Faibussowitsch   PetscCall(PetscSFLinkCreate_MPI(sf, unit, rootmtype, rootdata, leafmtype, leafdata, op, sfop, mylink));
4353ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
436cd620004SJunchao Zhang }
437cd620004SJunchao Zhang 
438d71ae5a4SJacob Faibussowitsch PetscErrorCode PetscSFLinkGetInUse(PetscSF sf, MPI_Datatype unit, const void *rootdata, const void *leafdata, PetscCopyMode cmode, PetscSFLink *mylink)
439d71ae5a4SJacob Faibussowitsch {
440cd620004SJunchao Zhang   PetscSFLink    link, *p;
44140e23c03SJunchao Zhang   PetscSF_Basic *bas = (PetscSF_Basic *)sf->data;
44240e23c03SJunchao Zhang 
44340e23c03SJunchao Zhang   PetscFunctionBegin;
44440e23c03SJunchao Zhang   /* Look for types in cache */
44540e23c03SJunchao Zhang   for (p = &bas->inuse; (link = *p); p = &link->next) {
44640e23c03SJunchao Zhang     PetscBool match;
4479566063dSJacob Faibussowitsch     PetscCall(MPIPetsc_Type_compare(unit, link->unit, &match));
448637e6665SJunchao Zhang     if (match && (rootdata == link->rootdata) && (leafdata == link->leafdata)) {
44940e23c03SJunchao Zhang       switch (cmode) {
450d71ae5a4SJacob Faibussowitsch       case PETSC_OWN_POINTER:
451d71ae5a4SJacob Faibussowitsch         *p = link->next;
452d71ae5a4SJacob Faibussowitsch         break; /* Remove from inuse list */
453d71ae5a4SJacob Faibussowitsch       case PETSC_USE_POINTER:
454d71ae5a4SJacob Faibussowitsch         break;
455d71ae5a4SJacob Faibussowitsch       default:
456d71ae5a4SJacob Faibussowitsch         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "invalid cmode");
45740e23c03SJunchao Zhang       }
45840e23c03SJunchao Zhang       *mylink = link;
4593ba16761SJacob Faibussowitsch       PetscFunctionReturn(PETSC_SUCCESS);
46040e23c03SJunchao Zhang     }
46140e23c03SJunchao Zhang   }
46240e23c03SJunchao Zhang   SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Could not find pack");
46340e23c03SJunchao Zhang }
46440e23c03SJunchao Zhang 
465d71ae5a4SJacob Faibussowitsch PetscErrorCode PetscSFLinkReclaim(PetscSF sf, PetscSFLink *mylink)
466d71ae5a4SJacob Faibussowitsch {
46740e23c03SJunchao Zhang   PetscSF_Basic *bas  = (PetscSF_Basic *)sf->data;
46871438e86SJunchao Zhang   PetscSFLink    link = *mylink;
46940e23c03SJunchao Zhang 
47040e23c03SJunchao Zhang   PetscFunctionBegin;
47171438e86SJunchao Zhang   link->rootdata = NULL;
47271438e86SJunchao Zhang   link->leafdata = NULL;
47371438e86SJunchao Zhang   link->next     = bas->avail;
47471438e86SJunchao Zhang   bas->avail     = link;
47571438e86SJunchao Zhang   *mylink        = NULL;
4763ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
477eb02082bSJunchao Zhang }
478eb02082bSJunchao Zhang 
4799d1c8addSJunchao Zhang /* Error out on unsupported overlapped communications */
480d71ae5a4SJacob Faibussowitsch PetscErrorCode PetscSFSetErrorOnUnsupportedOverlap(PetscSF sf, MPI_Datatype unit, const void *rootdata, const void *leafdata)
481d71ae5a4SJacob Faibussowitsch {
482cd620004SJunchao Zhang   PetscSFLink    link, *p;
4839d1c8addSJunchao Zhang   PetscSF_Basic *bas = (PetscSF_Basic *)sf->data;
4849d1c8addSJunchao Zhang   PetscBool      match;
4859d1c8addSJunchao Zhang 
4869d1c8addSJunchao Zhang   PetscFunctionBegin;
487b458e8f1SJose E. Roman   if (PetscDefined(USE_DEBUG)) {
48818fb5014SJunchao Zhang     /* Look up links in use and error out if there is a match. When both rootdata and leafdata are NULL, ignore
48918fb5014SJunchao Zhang        the potential overlapping since this process does not participate in communication. Overlapping is harmless.
49018fb5014SJunchao Zhang     */
491637e6665SJunchao Zhang     if (rootdata || leafdata) {
4929d1c8addSJunchao Zhang       for (p = &bas->inuse; (link = *p); p = &link->next) {
4939566063dSJacob Faibussowitsch         PetscCall(MPIPetsc_Type_compare(unit, link->unit, &match));
494c9cc58a2SBarry Smith         PetscCheck(!match || rootdata != link->rootdata || leafdata != link->leafdata, PETSC_COMM_SELF, PETSC_ERR_SUP, "Overlapped PetscSF with the same rootdata(%p), leafdata(%p) and data type. Undo the overlapping to avoid the error.", rootdata, leafdata);
4959d1c8addSJunchao Zhang       }
49618fb5014SJunchao Zhang     }
497b458e8f1SJose E. Roman   }
4983ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
4999d1c8addSJunchao Zhang }
5009d1c8addSJunchao Zhang 
501d71ae5a4SJacob Faibussowitsch static PetscErrorCode PetscSFLinkMemcpy_Host(PetscSFLink link, PetscMemType dstmtype, void *dst, PetscMemType srcmtype, const void *src, size_t n)
502d71ae5a4SJacob Faibussowitsch {
50320c24465SJunchao Zhang   PetscFunctionBegin;
5049566063dSJacob Faibussowitsch   if (n) PetscCall(PetscMemcpy(dst, src, n));
5053ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
50620c24465SJunchao Zhang }
50720c24465SJunchao Zhang 
508d71ae5a4SJacob Faibussowitsch PetscErrorCode PetscSFLinkSetUp_Host(PetscSF sf, PetscSFLink link, MPI_Datatype unit)
509d71ae5a4SJacob Faibussowitsch {
510b23bfdefSJunchao Zhang   PetscInt    nSignedChar = 0, nUnsignedChar = 0, nInt = 0, nPetscInt = 0, nPetscReal = 0;
511b23bfdefSJunchao Zhang   PetscBool   is2Int, is2PetscInt;
51240e23c03SJunchao Zhang   PetscMPIInt ni, na, nd, combiner;
51340e23c03SJunchao Zhang #if defined(PETSC_HAVE_COMPLEX)
514b23bfdefSJunchao Zhang   PetscInt nPetscComplex = 0;
51540e23c03SJunchao Zhang #endif
51640e23c03SJunchao Zhang 
51740e23c03SJunchao Zhang   PetscFunctionBegin;
5189566063dSJacob Faibussowitsch   PetscCall(MPIPetsc_Type_compare_contig(unit, MPI_SIGNED_CHAR, &nSignedChar));
5199566063dSJacob Faibussowitsch   PetscCall(MPIPetsc_Type_compare_contig(unit, MPI_UNSIGNED_CHAR, &nUnsignedChar));
520b23bfdefSJunchao Zhang   /* MPI_CHAR is treated below as a dumb type that does not support reduction according to MPI standard */
5219566063dSJacob Faibussowitsch   PetscCall(MPIPetsc_Type_compare_contig(unit, MPI_INT, &nInt));
5229566063dSJacob Faibussowitsch   PetscCall(MPIPetsc_Type_compare_contig(unit, MPIU_INT, &nPetscInt));
5239566063dSJacob Faibussowitsch   PetscCall(MPIPetsc_Type_compare_contig(unit, MPIU_REAL, &nPetscReal));
52440e23c03SJunchao Zhang #if defined(PETSC_HAVE_COMPLEX)
5259566063dSJacob Faibussowitsch   PetscCall(MPIPetsc_Type_compare_contig(unit, MPIU_COMPLEX, &nPetscComplex));
52640e23c03SJunchao Zhang #endif
5279566063dSJacob Faibussowitsch   PetscCall(MPIPetsc_Type_compare(unit, MPI_2INT, &is2Int));
5289566063dSJacob Faibussowitsch   PetscCall(MPIPetsc_Type_compare(unit, MPIU_2INT, &is2PetscInt));
529b23bfdefSJunchao Zhang   /* TODO: shaell we also handle Fortran MPI_2REAL? */
5309566063dSJacob Faibussowitsch   PetscCallMPI(MPI_Type_get_envelope(unit, &ni, &na, &nd, &combiner));
5315ad15460SJunchao Zhang   link->isbuiltin = (combiner == MPI_COMBINER_NAMED) ? PETSC_TRUE : PETSC_FALSE; /* unit is MPI builtin */
532b23bfdefSJunchao Zhang   link->bs        = 1;                                                           /* default */
53340e23c03SJunchao Zhang 
534eb02082bSJunchao Zhang   if (is2Int) {
535cd620004SJunchao Zhang     PackInit_PairType_int_int_1_1(link);
536eb02082bSJunchao Zhang     link->bs        = 1;
537eb02082bSJunchao Zhang     link->unitbytes = 2 * sizeof(int);
5385ad15460SJunchao Zhang     link->isbuiltin = PETSC_TRUE; /* unit is PETSc builtin */
539eb02082bSJunchao Zhang     link->basicunit = MPI_2INT;
5405ad15460SJunchao Zhang     link->unit      = MPI_2INT;
541eb02082bSJunchao Zhang   } else if (is2PetscInt) { /* TODO: when is2PetscInt and nPetscInt=2, we don't know which path to take. The two paths support different ops. */
542cd620004SJunchao Zhang     PackInit_PairType_PetscInt_PetscInt_1_1(link);
543eb02082bSJunchao Zhang     link->bs        = 1;
544eb02082bSJunchao Zhang     link->unitbytes = 2 * sizeof(PetscInt);
545eb02082bSJunchao Zhang     link->basicunit = MPIU_2INT;
5465ad15460SJunchao Zhang     link->isbuiltin = PETSC_TRUE; /* unit is PETSc builtin */
5475ad15460SJunchao Zhang     link->unit      = MPIU_2INT;
548eb02082bSJunchao Zhang   } else if (nPetscReal) {
5499371c9d4SSatish Balay     if (nPetscReal == 8) PackInit_RealType_PetscReal_8_1(link);
5509371c9d4SSatish Balay     else if (nPetscReal % 8 == 0) PackInit_RealType_PetscReal_8_0(link);
5519371c9d4SSatish Balay     else if (nPetscReal == 4) PackInit_RealType_PetscReal_4_1(link);
5529371c9d4SSatish Balay     else if (nPetscReal % 4 == 0) PackInit_RealType_PetscReal_4_0(link);
5539371c9d4SSatish Balay     else if (nPetscReal == 2) PackInit_RealType_PetscReal_2_1(link);
5549371c9d4SSatish Balay     else if (nPetscReal % 2 == 0) PackInit_RealType_PetscReal_2_0(link);
5559371c9d4SSatish Balay     else if (nPetscReal == 1) PackInit_RealType_PetscReal_1_1(link);
5569371c9d4SSatish Balay     else if (nPetscReal % 1 == 0) PackInit_RealType_PetscReal_1_0(link);
557b23bfdefSJunchao Zhang     link->bs        = nPetscReal;
558eb02082bSJunchao Zhang     link->unitbytes = nPetscReal * sizeof(PetscReal);
55940e23c03SJunchao Zhang     link->basicunit = MPIU_REAL;
5609371c9d4SSatish Balay     if (link->bs == 1) {
5619371c9d4SSatish Balay       link->isbuiltin = PETSC_TRUE;
5629371c9d4SSatish Balay       link->unit      = MPIU_REAL;
5639371c9d4SSatish Balay     }
564b23bfdefSJunchao Zhang   } else if (nPetscInt) {
5659371c9d4SSatish Balay     if (nPetscInt == 8) PackInit_IntegerType_PetscInt_8_1(link);
5669371c9d4SSatish Balay     else if (nPetscInt % 8 == 0) PackInit_IntegerType_PetscInt_8_0(link);
5679371c9d4SSatish Balay     else if (nPetscInt == 4) PackInit_IntegerType_PetscInt_4_1(link);
5689371c9d4SSatish Balay     else if (nPetscInt % 4 == 0) PackInit_IntegerType_PetscInt_4_0(link);
5699371c9d4SSatish Balay     else if (nPetscInt == 2) PackInit_IntegerType_PetscInt_2_1(link);
5709371c9d4SSatish Balay     else if (nPetscInt % 2 == 0) PackInit_IntegerType_PetscInt_2_0(link);
5719371c9d4SSatish Balay     else if (nPetscInt == 1) PackInit_IntegerType_PetscInt_1_1(link);
5729371c9d4SSatish Balay     else if (nPetscInt % 1 == 0) PackInit_IntegerType_PetscInt_1_0(link);
573b23bfdefSJunchao Zhang     link->bs        = nPetscInt;
574eb02082bSJunchao Zhang     link->unitbytes = nPetscInt * sizeof(PetscInt);
575b23bfdefSJunchao Zhang     link->basicunit = MPIU_INT;
5769371c9d4SSatish Balay     if (link->bs == 1) {
5779371c9d4SSatish Balay       link->isbuiltin = PETSC_TRUE;
5789371c9d4SSatish Balay       link->unit      = MPIU_INT;
5799371c9d4SSatish Balay     }
580b23bfdefSJunchao Zhang #if defined(PETSC_USE_64BIT_INDICES)
581b23bfdefSJunchao Zhang   } else if (nInt) {
5829371c9d4SSatish Balay     if (nInt == 8) PackInit_IntegerType_int_8_1(link);
5839371c9d4SSatish Balay     else if (nInt % 8 == 0) PackInit_IntegerType_int_8_0(link);
5849371c9d4SSatish Balay     else if (nInt == 4) PackInit_IntegerType_int_4_1(link);
5859371c9d4SSatish Balay     else if (nInt % 4 == 0) PackInit_IntegerType_int_4_0(link);
5869371c9d4SSatish Balay     else if (nInt == 2) PackInit_IntegerType_int_2_1(link);
5879371c9d4SSatish Balay     else if (nInt % 2 == 0) PackInit_IntegerType_int_2_0(link);
5889371c9d4SSatish Balay     else if (nInt == 1) PackInit_IntegerType_int_1_1(link);
5899371c9d4SSatish Balay     else if (nInt % 1 == 0) PackInit_IntegerType_int_1_0(link);
590b23bfdefSJunchao Zhang     link->bs        = nInt;
591eb02082bSJunchao Zhang     link->unitbytes = nInt * sizeof(int);
592b23bfdefSJunchao Zhang     link->basicunit = MPI_INT;
5939371c9d4SSatish Balay     if (link->bs == 1) {
5949371c9d4SSatish Balay       link->isbuiltin = PETSC_TRUE;
5959371c9d4SSatish Balay       link->unit      = MPI_INT;
5969371c9d4SSatish Balay     }
597b23bfdefSJunchao Zhang #endif
598b23bfdefSJunchao Zhang   } else if (nSignedChar) {
5999371c9d4SSatish Balay     if (nSignedChar == 8) PackInit_IntegerType_SignedChar_8_1(link);
6009371c9d4SSatish Balay     else if (nSignedChar % 8 == 0) PackInit_IntegerType_SignedChar_8_0(link);
6019371c9d4SSatish Balay     else if (nSignedChar == 4) PackInit_IntegerType_SignedChar_4_1(link);
6029371c9d4SSatish Balay     else if (nSignedChar % 4 == 0) PackInit_IntegerType_SignedChar_4_0(link);
6039371c9d4SSatish Balay     else if (nSignedChar == 2) PackInit_IntegerType_SignedChar_2_1(link);
6049371c9d4SSatish Balay     else if (nSignedChar % 2 == 0) PackInit_IntegerType_SignedChar_2_0(link);
6059371c9d4SSatish Balay     else if (nSignedChar == 1) PackInit_IntegerType_SignedChar_1_1(link);
6069371c9d4SSatish Balay     else if (nSignedChar % 1 == 0) PackInit_IntegerType_SignedChar_1_0(link);
607b23bfdefSJunchao Zhang     link->bs        = nSignedChar;
608eb02082bSJunchao Zhang     link->unitbytes = nSignedChar * sizeof(SignedChar);
609b23bfdefSJunchao Zhang     link->basicunit = MPI_SIGNED_CHAR;
6109371c9d4SSatish Balay     if (link->bs == 1) {
6119371c9d4SSatish Balay       link->isbuiltin = PETSC_TRUE;
6129371c9d4SSatish Balay       link->unit      = MPI_SIGNED_CHAR;
6139371c9d4SSatish Balay     }
614b23bfdefSJunchao Zhang   } else if (nUnsignedChar) {
6159371c9d4SSatish Balay     if (nUnsignedChar == 8) PackInit_IntegerType_UnsignedChar_8_1(link);
6169371c9d4SSatish Balay     else if (nUnsignedChar % 8 == 0) PackInit_IntegerType_UnsignedChar_8_0(link);
6179371c9d4SSatish Balay     else if (nUnsignedChar == 4) PackInit_IntegerType_UnsignedChar_4_1(link);
6189371c9d4SSatish Balay     else if (nUnsignedChar % 4 == 0) PackInit_IntegerType_UnsignedChar_4_0(link);
6199371c9d4SSatish Balay     else if (nUnsignedChar == 2) PackInit_IntegerType_UnsignedChar_2_1(link);
6209371c9d4SSatish Balay     else if (nUnsignedChar % 2 == 0) PackInit_IntegerType_UnsignedChar_2_0(link);
6219371c9d4SSatish Balay     else if (nUnsignedChar == 1) PackInit_IntegerType_UnsignedChar_1_1(link);
6229371c9d4SSatish Balay     else if (nUnsignedChar % 1 == 0) PackInit_IntegerType_UnsignedChar_1_0(link);
623b23bfdefSJunchao Zhang     link->bs        = nUnsignedChar;
624eb02082bSJunchao Zhang     link->unitbytes = nUnsignedChar * sizeof(UnsignedChar);
625b23bfdefSJunchao Zhang     link->basicunit = MPI_UNSIGNED_CHAR;
6269371c9d4SSatish Balay     if (link->bs == 1) {
6279371c9d4SSatish Balay       link->isbuiltin = PETSC_TRUE;
6289371c9d4SSatish Balay       link->unit      = MPI_UNSIGNED_CHAR;
6299371c9d4SSatish Balay     }
63040e23c03SJunchao Zhang #if defined(PETSC_HAVE_COMPLEX)
631b23bfdefSJunchao Zhang   } else if (nPetscComplex) {
6329371c9d4SSatish Balay     if (nPetscComplex == 8) PackInit_ComplexType_PetscComplex_8_1(link);
6339371c9d4SSatish Balay     else if (nPetscComplex % 8 == 0) PackInit_ComplexType_PetscComplex_8_0(link);
6349371c9d4SSatish Balay     else if (nPetscComplex == 4) PackInit_ComplexType_PetscComplex_4_1(link);
6359371c9d4SSatish Balay     else if (nPetscComplex % 4 == 0) PackInit_ComplexType_PetscComplex_4_0(link);
6369371c9d4SSatish Balay     else if (nPetscComplex == 2) PackInit_ComplexType_PetscComplex_2_1(link);
6379371c9d4SSatish Balay     else if (nPetscComplex % 2 == 0) PackInit_ComplexType_PetscComplex_2_0(link);
6389371c9d4SSatish Balay     else if (nPetscComplex == 1) PackInit_ComplexType_PetscComplex_1_1(link);
6399371c9d4SSatish Balay     else if (nPetscComplex % 1 == 0) PackInit_ComplexType_PetscComplex_1_0(link);
640b23bfdefSJunchao Zhang     link->bs        = nPetscComplex;
641eb02082bSJunchao Zhang     link->unitbytes = nPetscComplex * sizeof(PetscComplex);
64240e23c03SJunchao Zhang     link->basicunit = MPIU_COMPLEX;
6439371c9d4SSatish Balay     if (link->bs == 1) {
6449371c9d4SSatish Balay       link->isbuiltin = PETSC_TRUE;
6459371c9d4SSatish Balay       link->unit      = MPIU_COMPLEX;
6469371c9d4SSatish Balay     }
64740e23c03SJunchao Zhang #endif
64840e23c03SJunchao Zhang   } else {
649b23bfdefSJunchao Zhang     MPI_Aint lb, nbyte;
6509566063dSJacob Faibussowitsch     PetscCallMPI(MPI_Type_get_extent(unit, &lb, &nbyte));
65108401ef6SPierre Jolivet     PetscCheck(lb == 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "Datatype with nonzero lower bound %ld", (long)lb);
652eb02082bSJunchao Zhang     if (nbyte % sizeof(int)) { /* If the type size is not multiple of int */
6539371c9d4SSatish Balay       if (nbyte == 4) PackInit_DumbType_char_4_1(link);
6549371c9d4SSatish Balay       else if (nbyte % 4 == 0) PackInit_DumbType_char_4_0(link);
6559371c9d4SSatish Balay       else if (nbyte == 2) PackInit_DumbType_char_2_1(link);
6569371c9d4SSatish Balay       else if (nbyte % 2 == 0) PackInit_DumbType_char_2_0(link);
6579371c9d4SSatish Balay       else if (nbyte == 1) PackInit_DumbType_char_1_1(link);
6589371c9d4SSatish Balay       else if (nbyte % 1 == 0) PackInit_DumbType_char_1_0(link);
659eb02082bSJunchao Zhang       link->bs        = nbyte;
660b23bfdefSJunchao Zhang       link->unitbytes = nbyte;
661b23bfdefSJunchao Zhang       link->basicunit = MPI_BYTE;
66240e23c03SJunchao Zhang     } else {
663eb02082bSJunchao Zhang       nInt = nbyte / sizeof(int);
6649371c9d4SSatish Balay       if (nInt == 8) PackInit_DumbType_DumbInt_8_1(link);
6659371c9d4SSatish Balay       else if (nInt % 8 == 0) PackInit_DumbType_DumbInt_8_0(link);
6669371c9d4SSatish Balay       else if (nInt == 4) PackInit_DumbType_DumbInt_4_1(link);
6679371c9d4SSatish Balay       else if (nInt % 4 == 0) PackInit_DumbType_DumbInt_4_0(link);
6689371c9d4SSatish Balay       else if (nInt == 2) PackInit_DumbType_DumbInt_2_1(link);
6699371c9d4SSatish Balay       else if (nInt % 2 == 0) PackInit_DumbType_DumbInt_2_0(link);
6709371c9d4SSatish Balay       else if (nInt == 1) PackInit_DumbType_DumbInt_1_1(link);
6719371c9d4SSatish Balay       else if (nInt % 1 == 0) PackInit_DumbType_DumbInt_1_0(link);
672eb02082bSJunchao Zhang       link->bs        = nInt;
673b23bfdefSJunchao Zhang       link->unitbytes = nbyte;
67440e23c03SJunchao Zhang       link->basicunit = MPI_INT;
67540e23c03SJunchao Zhang     }
6765ad15460SJunchao Zhang     if (link->isbuiltin) link->unit = unit;
67740e23c03SJunchao Zhang   }
678b23bfdefSJunchao Zhang 
6799566063dSJacob Faibussowitsch   if (!link->isbuiltin) PetscCallMPI(MPI_Type_dup(unit, &link->unit));
68020c24465SJunchao Zhang 
68120c24465SJunchao Zhang   link->Memcpy = PetscSFLinkMemcpy_Host;
6823ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
68340e23c03SJunchao Zhang }
68440e23c03SJunchao Zhang 
685d71ae5a4SJacob Faibussowitsch PetscErrorCode PetscSFLinkGetUnpackAndOp(PetscSFLink link, PetscMemType mtype, MPI_Op op, PetscBool atomic, PetscErrorCode (**UnpackAndOp)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *))
686d71ae5a4SJacob Faibussowitsch {
68740e23c03SJunchao Zhang   PetscFunctionBegin;
68840e23c03SJunchao Zhang   *UnpackAndOp = NULL;
68971438e86SJunchao Zhang   if (PetscMemTypeHost(mtype)) {
69083df288dSJunchao Zhang     if (op == MPI_REPLACE) *UnpackAndOp = link->h_UnpackAndInsert;
691eb02082bSJunchao Zhang     else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->h_UnpackAndAdd;
692eb02082bSJunchao Zhang     else if (op == MPI_PROD) *UnpackAndOp = link->h_UnpackAndMult;
693eb02082bSJunchao Zhang     else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->h_UnpackAndMax;
694eb02082bSJunchao Zhang     else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->h_UnpackAndMin;
695eb02082bSJunchao Zhang     else if (op == MPI_LAND) *UnpackAndOp = link->h_UnpackAndLAND;
696eb02082bSJunchao Zhang     else if (op == MPI_BAND) *UnpackAndOp = link->h_UnpackAndBAND;
697eb02082bSJunchao Zhang     else if (op == MPI_LOR) *UnpackAndOp = link->h_UnpackAndLOR;
698eb02082bSJunchao Zhang     else if (op == MPI_BOR) *UnpackAndOp = link->h_UnpackAndBOR;
699eb02082bSJunchao Zhang     else if (op == MPI_LXOR) *UnpackAndOp = link->h_UnpackAndLXOR;
700eb02082bSJunchao Zhang     else if (op == MPI_BXOR) *UnpackAndOp = link->h_UnpackAndBXOR;
701eb02082bSJunchao Zhang     else if (op == MPI_MAXLOC) *UnpackAndOp = link->h_UnpackAndMaxloc;
702eb02082bSJunchao Zhang     else if (op == MPI_MINLOC) *UnpackAndOp = link->h_UnpackAndMinloc;
703eb02082bSJunchao Zhang   }
7047fd2d3dbSJunchao Zhang #if defined(PETSC_HAVE_DEVICE)
70571438e86SJunchao Zhang   else if (PetscMemTypeDevice(mtype) && !atomic) {
70683df288dSJunchao Zhang     if (op == MPI_REPLACE) *UnpackAndOp = link->d_UnpackAndInsert;
707eb02082bSJunchao Zhang     else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->d_UnpackAndAdd;
708eb02082bSJunchao Zhang     else if (op == MPI_PROD) *UnpackAndOp = link->d_UnpackAndMult;
709eb02082bSJunchao Zhang     else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->d_UnpackAndMax;
710eb02082bSJunchao Zhang     else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->d_UnpackAndMin;
711eb02082bSJunchao Zhang     else if (op == MPI_LAND) *UnpackAndOp = link->d_UnpackAndLAND;
712eb02082bSJunchao Zhang     else if (op == MPI_BAND) *UnpackAndOp = link->d_UnpackAndBAND;
713eb02082bSJunchao Zhang     else if (op == MPI_LOR) *UnpackAndOp = link->d_UnpackAndLOR;
714eb02082bSJunchao Zhang     else if (op == MPI_BOR) *UnpackAndOp = link->d_UnpackAndBOR;
715eb02082bSJunchao Zhang     else if (op == MPI_LXOR) *UnpackAndOp = link->d_UnpackAndLXOR;
716eb02082bSJunchao Zhang     else if (op == MPI_BXOR) *UnpackAndOp = link->d_UnpackAndBXOR;
717eb02082bSJunchao Zhang     else if (op == MPI_MAXLOC) *UnpackAndOp = link->d_UnpackAndMaxloc;
718eb02082bSJunchao Zhang     else if (op == MPI_MINLOC) *UnpackAndOp = link->d_UnpackAndMinloc;
71971438e86SJunchao Zhang   } else if (PetscMemTypeDevice(mtype) && atomic) {
72083df288dSJunchao Zhang     if (op == MPI_REPLACE) *UnpackAndOp = link->da_UnpackAndInsert;
721eb02082bSJunchao Zhang     else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->da_UnpackAndAdd;
722eb02082bSJunchao Zhang     else if (op == MPI_PROD) *UnpackAndOp = link->da_UnpackAndMult;
723eb02082bSJunchao Zhang     else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->da_UnpackAndMax;
724eb02082bSJunchao Zhang     else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->da_UnpackAndMin;
725eb02082bSJunchao Zhang     else if (op == MPI_LAND) *UnpackAndOp = link->da_UnpackAndLAND;
726eb02082bSJunchao Zhang     else if (op == MPI_BAND) *UnpackAndOp = link->da_UnpackAndBAND;
727eb02082bSJunchao Zhang     else if (op == MPI_LOR) *UnpackAndOp = link->da_UnpackAndLOR;
728eb02082bSJunchao Zhang     else if (op == MPI_BOR) *UnpackAndOp = link->da_UnpackAndBOR;
729eb02082bSJunchao Zhang     else if (op == MPI_LXOR) *UnpackAndOp = link->da_UnpackAndLXOR;
730eb02082bSJunchao Zhang     else if (op == MPI_BXOR) *UnpackAndOp = link->da_UnpackAndBXOR;
731eb02082bSJunchao Zhang     else if (op == MPI_MAXLOC) *UnpackAndOp = link->da_UnpackAndMaxloc;
732eb02082bSJunchao Zhang     else if (op == MPI_MINLOC) *UnpackAndOp = link->da_UnpackAndMinloc;
733eb02082bSJunchao Zhang   }
734eb02082bSJunchao Zhang #endif
7353ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
73640e23c03SJunchao Zhang }
73740e23c03SJunchao Zhang 
738d71ae5a4SJacob Faibussowitsch PetscErrorCode PetscSFLinkGetScatterAndOp(PetscSFLink link, PetscMemType mtype, MPI_Op op, PetscBool atomic, PetscErrorCode (**ScatterAndOp)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *))
739d71ae5a4SJacob Faibussowitsch {
74040e23c03SJunchao Zhang   PetscFunctionBegin;
741cd620004SJunchao Zhang   *ScatterAndOp = NULL;
74271438e86SJunchao Zhang   if (PetscMemTypeHost(mtype)) {
74383df288dSJunchao Zhang     if (op == MPI_REPLACE) *ScatterAndOp = link->h_ScatterAndInsert;
744cd620004SJunchao Zhang     else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->h_ScatterAndAdd;
745cd620004SJunchao Zhang     else if (op == MPI_PROD) *ScatterAndOp = link->h_ScatterAndMult;
746cd620004SJunchao Zhang     else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->h_ScatterAndMax;
747cd620004SJunchao Zhang     else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->h_ScatterAndMin;
748cd620004SJunchao Zhang     else if (op == MPI_LAND) *ScatterAndOp = link->h_ScatterAndLAND;
749cd620004SJunchao Zhang     else if (op == MPI_BAND) *ScatterAndOp = link->h_ScatterAndBAND;
750cd620004SJunchao Zhang     else if (op == MPI_LOR) *ScatterAndOp = link->h_ScatterAndLOR;
751cd620004SJunchao Zhang     else if (op == MPI_BOR) *ScatterAndOp = link->h_ScatterAndBOR;
752cd620004SJunchao Zhang     else if (op == MPI_LXOR) *ScatterAndOp = link->h_ScatterAndLXOR;
753cd620004SJunchao Zhang     else if (op == MPI_BXOR) *ScatterAndOp = link->h_ScatterAndBXOR;
754cd620004SJunchao Zhang     else if (op == MPI_MAXLOC) *ScatterAndOp = link->h_ScatterAndMaxloc;
755cd620004SJunchao Zhang     else if (op == MPI_MINLOC) *ScatterAndOp = link->h_ScatterAndMinloc;
756eb02082bSJunchao Zhang   }
7577fd2d3dbSJunchao Zhang #if defined(PETSC_HAVE_DEVICE)
75871438e86SJunchao Zhang   else if (PetscMemTypeDevice(mtype) && !atomic) {
75983df288dSJunchao Zhang     if (op == MPI_REPLACE) *ScatterAndOp = link->d_ScatterAndInsert;
760cd620004SJunchao Zhang     else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->d_ScatterAndAdd;
761cd620004SJunchao Zhang     else if (op == MPI_PROD) *ScatterAndOp = link->d_ScatterAndMult;
762cd620004SJunchao Zhang     else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->d_ScatterAndMax;
763cd620004SJunchao Zhang     else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->d_ScatterAndMin;
764cd620004SJunchao Zhang     else if (op == MPI_LAND) *ScatterAndOp = link->d_ScatterAndLAND;
765cd620004SJunchao Zhang     else if (op == MPI_BAND) *ScatterAndOp = link->d_ScatterAndBAND;
766cd620004SJunchao Zhang     else if (op == MPI_LOR) *ScatterAndOp = link->d_ScatterAndLOR;
767cd620004SJunchao Zhang     else if (op == MPI_BOR) *ScatterAndOp = link->d_ScatterAndBOR;
768cd620004SJunchao Zhang     else if (op == MPI_LXOR) *ScatterAndOp = link->d_ScatterAndLXOR;
769cd620004SJunchao Zhang     else if (op == MPI_BXOR) *ScatterAndOp = link->d_ScatterAndBXOR;
770cd620004SJunchao Zhang     else if (op == MPI_MAXLOC) *ScatterAndOp = link->d_ScatterAndMaxloc;
771cd620004SJunchao Zhang     else if (op == MPI_MINLOC) *ScatterAndOp = link->d_ScatterAndMinloc;
77271438e86SJunchao Zhang   } else if (PetscMemTypeDevice(mtype) && atomic) {
77383df288dSJunchao Zhang     if (op == MPI_REPLACE) *ScatterAndOp = link->da_ScatterAndInsert;
774cd620004SJunchao Zhang     else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->da_ScatterAndAdd;
775cd620004SJunchao Zhang     else if (op == MPI_PROD) *ScatterAndOp = link->da_ScatterAndMult;
776cd620004SJunchao Zhang     else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->da_ScatterAndMax;
777cd620004SJunchao Zhang     else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->da_ScatterAndMin;
778cd620004SJunchao Zhang     else if (op == MPI_LAND) *ScatterAndOp = link->da_ScatterAndLAND;
779cd620004SJunchao Zhang     else if (op == MPI_BAND) *ScatterAndOp = link->da_ScatterAndBAND;
780cd620004SJunchao Zhang     else if (op == MPI_LOR) *ScatterAndOp = link->da_ScatterAndLOR;
781cd620004SJunchao Zhang     else if (op == MPI_BOR) *ScatterAndOp = link->da_ScatterAndBOR;
782cd620004SJunchao Zhang     else if (op == MPI_LXOR) *ScatterAndOp = link->da_ScatterAndLXOR;
783cd620004SJunchao Zhang     else if (op == MPI_BXOR) *ScatterAndOp = link->da_ScatterAndBXOR;
784cd620004SJunchao Zhang     else if (op == MPI_MAXLOC) *ScatterAndOp = link->da_ScatterAndMaxloc;
785cd620004SJunchao Zhang     else if (op == MPI_MINLOC) *ScatterAndOp = link->da_ScatterAndMinloc;
786eb02082bSJunchao Zhang   }
787eb02082bSJunchao Zhang #endif
7883ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
789cd620004SJunchao Zhang }
790cd620004SJunchao Zhang 
791d71ae5a4SJacob Faibussowitsch PetscErrorCode PetscSFLinkGetFetchAndOp(PetscSFLink link, PetscMemType mtype, MPI_Op op, PetscBool atomic, PetscErrorCode (**FetchAndOp)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, void *))
792d71ae5a4SJacob Faibussowitsch {
793cd620004SJunchao Zhang   PetscFunctionBegin;
794cd620004SJunchao Zhang   *FetchAndOp = NULL;
795c9cc58a2SBarry Smith   PetscCheck(op == MPI_SUM || op == MPIU_SUM, PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for MPI_Op in FetchAndOp");
79671438e86SJunchao Zhang   if (PetscMemTypeHost(mtype)) *FetchAndOp = link->h_FetchAndAdd;
7977fd2d3dbSJunchao Zhang #if defined(PETSC_HAVE_DEVICE)
79871438e86SJunchao Zhang   else if (PetscMemTypeDevice(mtype) && !atomic) *FetchAndOp = link->d_FetchAndAdd;
79971438e86SJunchao Zhang   else if (PetscMemTypeDevice(mtype) && atomic) *FetchAndOp = link->da_FetchAndAdd;
800cd620004SJunchao Zhang #endif
8013ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
802cd620004SJunchao Zhang }
803cd620004SJunchao Zhang 
804d71ae5a4SJacob Faibussowitsch PetscErrorCode PetscSFLinkGetFetchAndOpLocal(PetscSFLink link, PetscMemType mtype, MPI_Op op, PetscBool atomic, PetscErrorCode (**FetchAndOpLocal)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, void *))
805d71ae5a4SJacob Faibussowitsch {
806cd620004SJunchao Zhang   PetscFunctionBegin;
807cd620004SJunchao Zhang   *FetchAndOpLocal = NULL;
808c9cc58a2SBarry Smith   PetscCheck(op == MPI_SUM || op == MPIU_SUM, PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for MPI_Op in FetchAndOp");
80971438e86SJunchao Zhang   if (PetscMemTypeHost(mtype)) *FetchAndOpLocal = link->h_FetchAndAddLocal;
8107fd2d3dbSJunchao Zhang #if defined(PETSC_HAVE_DEVICE)
81171438e86SJunchao Zhang   else if (PetscMemTypeDevice(mtype) && !atomic) *FetchAndOpLocal = link->d_FetchAndAddLocal;
81271438e86SJunchao Zhang   else if (PetscMemTypeDevice(mtype) && atomic) *FetchAndOpLocal = link->da_FetchAndAddLocal;
813cd620004SJunchao Zhang #endif
8143ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
815cd620004SJunchao Zhang }
816cd620004SJunchao Zhang 
817d71ae5a4SJacob Faibussowitsch static inline PetscErrorCode PetscSFLinkLogFlopsAfterUnpackRootData(PetscSF sf, PetscSFLink link, PetscSFScope scope, MPI_Op op)
818d71ae5a4SJacob Faibussowitsch {
819cd620004SJunchao Zhang   PetscLogDouble flops;
820cd620004SJunchao Zhang   PetscSF_Basic *bas = (PetscSF_Basic *)sf->data;
821cd620004SJunchao Zhang 
822cd620004SJunchao Zhang   PetscFunctionBegin;
82383df288dSJunchao Zhang   if (op != MPI_REPLACE && link->basicunit == MPIU_SCALAR) { /* op is a reduction on PetscScalars */
824cd620004SJunchao Zhang     flops = bas->rootbuflen[scope] * link->bs;               /* # of roots in buffer x # of scalars in unit */
8257fd2d3dbSJunchao Zhang #if defined(PETSC_HAVE_DEVICE)
8269371c9d4SSatish Balay     if (PetscMemTypeDevice(link->rootmtype)) PetscCall(PetscLogGpuFlops(flops));
8279371c9d4SSatish Balay     else
828cd620004SJunchao Zhang #endif
8299566063dSJacob Faibussowitsch       PetscCall(PetscLogFlops(flops));
830cd620004SJunchao Zhang   }
8313ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
832cd620004SJunchao Zhang }
833cd620004SJunchao Zhang 
834d71ae5a4SJacob Faibussowitsch static inline PetscErrorCode PetscSFLinkLogFlopsAfterUnpackLeafData(PetscSF sf, PetscSFLink link, PetscSFScope scope, MPI_Op op)
835d71ae5a4SJacob Faibussowitsch {
836cd620004SJunchao Zhang   PetscLogDouble flops;
837cd620004SJunchao Zhang 
838cd620004SJunchao Zhang   PetscFunctionBegin;
83983df288dSJunchao Zhang   if (op != MPI_REPLACE && link->basicunit == MPIU_SCALAR) { /* op is a reduction on PetscScalars */
840cd620004SJunchao Zhang     flops = sf->leafbuflen[scope] * link->bs;                /* # of roots in buffer x # of scalars in unit */
8417fd2d3dbSJunchao Zhang #if defined(PETSC_HAVE_DEVICE)
8429371c9d4SSatish Balay     if (PetscMemTypeDevice(link->leafmtype)) PetscCall(PetscLogGpuFlops(flops));
8439371c9d4SSatish Balay     else
844cd620004SJunchao Zhang #endif
8459566063dSJacob Faibussowitsch       PetscCall(PetscLogFlops(flops));
846cd620004SJunchao Zhang   }
8473ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
848cd620004SJunchao Zhang }
849cd620004SJunchao Zhang 
850cd620004SJunchao Zhang /* When SF could not find a proper UnpackAndOp() from link, it falls back to MPI_Reduce_local.
8514165533cSJose E. Roman   Input Parameters:
852cd620004SJunchao Zhang   +sf      - The StarForest
853cd620004SJunchao Zhang   .link    - The link
854cd620004SJunchao Zhang   .count   - Number of entries to unpack
855145b44c9SPierre Jolivet   .start   - The first index, significant when indices=NULL
856cd620004SJunchao Zhang   .indices - Indices of entries in <data>. If NULL, it means indices are contiguous and the first is given in <start>
857cd620004SJunchao Zhang   .buf     - A contiguous buffer to unpack from
858cd620004SJunchao Zhang   -op      - Operation after unpack
859cd620004SJunchao Zhang 
8604165533cSJose E. Roman   Output Parameters:
861cd620004SJunchao Zhang   .data    - The data to unpack to
862cd620004SJunchao Zhang */
863d71ae5a4SJacob Faibussowitsch static inline PetscErrorCode PetscSFLinkUnpackDataWithMPIReduceLocal(PetscSF sf, PetscSFLink link, PetscInt count, PetscInt start, const PetscInt *indices, void *data, const void *buf, MPI_Op op)
864d71ae5a4SJacob Faibussowitsch {
865cd620004SJunchao Zhang   PetscFunctionBegin;
866cd620004SJunchao Zhang #if defined(PETSC_HAVE_MPI_REDUCE_LOCAL)
867cd620004SJunchao Zhang   {
868cd620004SJunchao Zhang     PetscInt i;
869cd620004SJunchao Zhang     if (indices) {
870cd620004SJunchao Zhang       /* Note we use link->unit instead of link->basicunit. When op can be mapped to MPI_SUM etc, it operates on
871cd620004SJunchao Zhang          basic units of a root/leaf element-wisely. Otherwise, it is meant to operate on a whole root/leaf.
872cd620004SJunchao Zhang       */
8739566063dSJacob Faibussowitsch       for (i = 0; i < count; i++) PetscCallMPI(MPI_Reduce_local((const char *)buf + i * link->unitbytes, (char *)data + indices[i] * link->unitbytes, 1, link->unit, op));
874cd620004SJunchao Zhang     } else {
8759566063dSJacob Faibussowitsch       PetscCallMPI(MPIU_Reduce_local(buf, (char *)data + start * link->unitbytes, count, link->unit, op));
876cd620004SJunchao Zhang     }
877cd620004SJunchao Zhang   }
8783ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
879cd620004SJunchao Zhang #else
880cd620004SJunchao Zhang   SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No unpacking reduction operation for this MPI_Op");
881cd620004SJunchao Zhang #endif
882cd620004SJunchao Zhang }
883cd620004SJunchao Zhang 
884d71ae5a4SJacob Faibussowitsch static inline PetscErrorCode PetscSFLinkScatterDataWithMPIReduceLocal(PetscSF sf, PetscSFLink link, PetscInt count, PetscInt srcStart, const PetscInt *srcIdx, const void *src, PetscInt dstStart, const PetscInt *dstIdx, void *dst, MPI_Op op)
885d71ae5a4SJacob Faibussowitsch {
886cd620004SJunchao Zhang   PetscFunctionBegin;
887cd620004SJunchao Zhang #if defined(PETSC_HAVE_MPI_REDUCE_LOCAL)
888cd620004SJunchao Zhang   {
889cd620004SJunchao Zhang     PetscInt i, disp;
890fcc7397dSJunchao Zhang     if (!srcIdx) {
8919566063dSJacob Faibussowitsch       PetscCall(PetscSFLinkUnpackDataWithMPIReduceLocal(sf, link, count, dstStart, dstIdx, dst, (const char *)src + srcStart * link->unitbytes, op));
892fcc7397dSJunchao Zhang     } else {
893cd620004SJunchao Zhang       for (i = 0; i < count; i++) {
894fcc7397dSJunchao Zhang         disp = dstIdx ? dstIdx[i] : dstStart + i;
8959566063dSJacob Faibussowitsch         PetscCallMPI(MPIU_Reduce_local((const char *)src + srcIdx[i] * link->unitbytes, (char *)dst + disp * link->unitbytes, 1, link->unit, op));
896fcc7397dSJunchao Zhang       }
897cd620004SJunchao Zhang     }
898cd620004SJunchao Zhang   }
8993ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
900cd620004SJunchao Zhang #else
901cd620004SJunchao Zhang   SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No unpacking reduction operation for this MPI_Op");
902cd620004SJunchao Zhang #endif
903cd620004SJunchao Zhang }
904cd620004SJunchao Zhang 
905cd620004SJunchao Zhang /*=============================================================================
906cd620004SJunchao Zhang               Pack/Unpack/Fetch/Scatter routines
907cd620004SJunchao Zhang  ============================================================================*/
908cd620004SJunchao Zhang 
909cd620004SJunchao Zhang /* Pack rootdata to rootbuf
9104165533cSJose E. Roman   Input Parameters:
911cd620004SJunchao Zhang   + sf       - The SF this packing works on.
912cd620004SJunchao Zhang   . link     - It gives the memtype of the roots and also provides root buffer.
913cd620004SJunchao Zhang   . scope    - PETSCSF_LOCAL or PETSCSF_REMOTE. Note SF has the ability to do local and remote communications separately.
914cd620004SJunchao Zhang   - rootdata - Where to read the roots.
915cd620004SJunchao Zhang 
916cd620004SJunchao Zhang   Notes:
917cd620004SJunchao Zhang   When rootdata can be directly used as root buffer, the routine is almost a no-op. After the call, root data is
91871438e86SJunchao Zhang   in a place where the underlying MPI is ready to access (use_gpu_aware_mpi or not)
919cd620004SJunchao Zhang  */
92066976f2fSJacob Faibussowitsch static PetscErrorCode PetscSFLinkPackRootData_Private(PetscSF sf, PetscSFLink link, PetscSFScope scope, const void *rootdata)
921d71ae5a4SJacob Faibussowitsch {
922cd620004SJunchao Zhang   const PetscInt *rootindices = NULL;
923cd620004SJunchao Zhang   PetscInt        count, start;
924fcc7397dSJunchao Zhang   PetscErrorCode (*Pack)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, void *) = NULL;
925cd620004SJunchao Zhang   PetscMemType   rootmtype                                                                                        = link->rootmtype;
926fcc7397dSJunchao Zhang   PetscSFPackOpt opt                                                                                              = NULL;
927fcc7397dSJunchao Zhang 
928cd620004SJunchao Zhang   PetscFunctionBegin;
9299566063dSJacob Faibussowitsch   PetscCall(PetscLogEventBegin(PETSCSF_Pack, sf, 0, 0, 0));
93071438e86SJunchao Zhang   if (!link->rootdirect[scope]) { /* If rootdata works directly as rootbuf, skip packing */
9319566063dSJacob Faibussowitsch     PetscCall(PetscSFLinkGetRootPackOptAndIndices(sf, link, rootmtype, scope, &count, &start, &opt, &rootindices));
9329566063dSJacob Faibussowitsch     PetscCall(PetscSFLinkGetPack(link, rootmtype, &Pack));
9339566063dSJacob Faibussowitsch     PetscCall((*Pack)(link, count, start, opt, rootindices, rootdata, link->rootbuf[scope][rootmtype]));
934cd620004SJunchao Zhang   }
9359566063dSJacob Faibussowitsch   PetscCall(PetscLogEventEnd(PETSCSF_Pack, sf, 0, 0, 0));
9363ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
937cd620004SJunchao Zhang }
938cd620004SJunchao Zhang 
939cd620004SJunchao Zhang /* Pack leafdata to leafbuf */
94066976f2fSJacob Faibussowitsch static PetscErrorCode PetscSFLinkPackLeafData_Private(PetscSF sf, PetscSFLink link, PetscSFScope scope, const void *leafdata)
941d71ae5a4SJacob Faibussowitsch {
942cd620004SJunchao Zhang   const PetscInt *leafindices = NULL;
943cd620004SJunchao Zhang   PetscInt        count, start;
944fcc7397dSJunchao Zhang   PetscErrorCode (*Pack)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, void *) = NULL;
945cd620004SJunchao Zhang   PetscMemType   leafmtype                                                                                        = link->leafmtype;
946fcc7397dSJunchao Zhang   PetscSFPackOpt opt                                                                                              = NULL;
947cd620004SJunchao Zhang 
948cd620004SJunchao Zhang   PetscFunctionBegin;
9499566063dSJacob Faibussowitsch   PetscCall(PetscLogEventBegin(PETSCSF_Pack, sf, 0, 0, 0));
95071438e86SJunchao Zhang   if (!link->leafdirect[scope]) { /* If leafdata works directly as rootbuf, skip packing */
9519566063dSJacob Faibussowitsch     PetscCall(PetscSFLinkGetLeafPackOptAndIndices(sf, link, leafmtype, scope, &count, &start, &opt, &leafindices));
9529566063dSJacob Faibussowitsch     PetscCall(PetscSFLinkGetPack(link, leafmtype, &Pack));
9539566063dSJacob Faibussowitsch     PetscCall((*Pack)(link, count, start, opt, leafindices, leafdata, link->leafbuf[scope][leafmtype]));
954cd620004SJunchao Zhang   }
9559566063dSJacob Faibussowitsch   PetscCall(PetscLogEventEnd(PETSCSF_Pack, sf, 0, 0, 0));
9563ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
957cd620004SJunchao Zhang }
958cd620004SJunchao Zhang 
95971438e86SJunchao Zhang /* Pack rootdata to rootbuf, which are in the same memory space */
960d71ae5a4SJacob Faibussowitsch PetscErrorCode PetscSFLinkPackRootData(PetscSF sf, PetscSFLink link, PetscSFScope scope, const void *rootdata)
961d71ae5a4SJacob Faibussowitsch {
96271438e86SJunchao Zhang   PetscSF_Basic *bas = (PetscSF_Basic *)sf->data;
96371438e86SJunchao Zhang 
96471438e86SJunchao Zhang   PetscFunctionBegin;
96571438e86SJunchao Zhang   if (scope == PETSCSF_REMOTE) { /* Sync the device if rootdata is not on petsc default stream */
9669566063dSJacob Faibussowitsch     if (PetscMemTypeDevice(link->rootmtype) && link->SyncDevice && sf->unknown_input_stream) PetscCall((*link->SyncDevice)(link));
9679566063dSJacob Faibussowitsch     if (link->PrePack) PetscCall((*link->PrePack)(sf, link, PETSCSF_ROOT2LEAF)); /* Used by SF nvshmem */
96871438e86SJunchao Zhang   }
9699566063dSJacob Faibussowitsch   PetscCall(PetscLogEventBegin(PETSCSF_Pack, sf, 0, 0, 0));
9709566063dSJacob Faibussowitsch   if (bas->rootbuflen[scope]) PetscCall(PetscSFLinkPackRootData_Private(sf, link, scope, rootdata));
9719566063dSJacob Faibussowitsch   PetscCall(PetscLogEventEnd(PETSCSF_Pack, sf, 0, 0, 0));
9723ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
97371438e86SJunchao Zhang }
97471438e86SJunchao Zhang /* Pack leafdata to leafbuf, which are in the same memory space */
975d71ae5a4SJacob Faibussowitsch PetscErrorCode PetscSFLinkPackLeafData(PetscSF sf, PetscSFLink link, PetscSFScope scope, const void *leafdata)
976d71ae5a4SJacob Faibussowitsch {
97771438e86SJunchao Zhang   PetscFunctionBegin;
97871438e86SJunchao Zhang   if (scope == PETSCSF_REMOTE) {
9799566063dSJacob Faibussowitsch     if (PetscMemTypeDevice(link->leafmtype) && link->SyncDevice && sf->unknown_input_stream) PetscCall((*link->SyncDevice)(link));
9809566063dSJacob Faibussowitsch     if (link->PrePack) PetscCall((*link->PrePack)(sf, link, PETSCSF_LEAF2ROOT)); /* Used by SF nvshmem */
98171438e86SJunchao Zhang   }
9829566063dSJacob Faibussowitsch   PetscCall(PetscLogEventBegin(PETSCSF_Pack, sf, 0, 0, 0));
9839566063dSJacob Faibussowitsch   if (sf->leafbuflen[scope]) PetscCall(PetscSFLinkPackLeafData_Private(sf, link, scope, leafdata));
9849566063dSJacob Faibussowitsch   PetscCall(PetscLogEventEnd(PETSCSF_Pack, sf, 0, 0, 0));
9853ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
98671438e86SJunchao Zhang }
98771438e86SJunchao Zhang 
98866976f2fSJacob Faibussowitsch static PetscErrorCode PetscSFLinkUnpackRootData_Private(PetscSF sf, PetscSFLink link, PetscSFScope scope, void *rootdata, MPI_Op op)
989d71ae5a4SJacob Faibussowitsch {
990cd620004SJunchao Zhang   const PetscInt *rootindices = NULL;
991cd620004SJunchao Zhang   PetscInt        count, start;
992cd620004SJunchao Zhang   PetscSF_Basic  *bas                                                                                                    = (PetscSF_Basic *)sf->data;
993fcc7397dSJunchao Zhang   PetscErrorCode (*UnpackAndOp)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *) = NULL;
994cd620004SJunchao Zhang   PetscMemType   rootmtype                                                                                               = link->rootmtype;
995fcc7397dSJunchao Zhang   PetscSFPackOpt opt                                                                                                     = NULL;
996cd620004SJunchao Zhang 
997cd620004SJunchao Zhang   PetscFunctionBegin;
99871438e86SJunchao Zhang   if (!link->rootdirect[scope]) { /* If rootdata works directly as rootbuf, skip unpacking */
9999566063dSJacob Faibussowitsch     PetscCall(PetscSFLinkGetUnpackAndOp(link, rootmtype, op, bas->rootdups[scope], &UnpackAndOp));
1000cd620004SJunchao Zhang     if (UnpackAndOp) {
10019566063dSJacob Faibussowitsch       PetscCall(PetscSFLinkGetRootPackOptAndIndices(sf, link, rootmtype, scope, &count, &start, &opt, &rootindices));
10029566063dSJacob Faibussowitsch       PetscCall((*UnpackAndOp)(link, count, start, opt, rootindices, rootdata, link->rootbuf[scope][rootmtype]));
1003cd620004SJunchao Zhang     } else {
10049566063dSJacob Faibussowitsch       PetscCall(PetscSFLinkGetRootPackOptAndIndices(sf, link, PETSC_MEMTYPE_HOST, scope, &count, &start, &opt, &rootindices));
10059566063dSJacob Faibussowitsch       PetscCall(PetscSFLinkUnpackDataWithMPIReduceLocal(sf, link, count, start, rootindices, rootdata, link->rootbuf[scope][rootmtype], op));
1006cd620004SJunchao Zhang     }
1007cd620004SJunchao Zhang   }
10089566063dSJacob Faibussowitsch   PetscCall(PetscSFLinkLogFlopsAfterUnpackRootData(sf, link, scope, op));
10093ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1010cd620004SJunchao Zhang }
1011cd620004SJunchao Zhang 
101266976f2fSJacob Faibussowitsch static PetscErrorCode PetscSFLinkUnpackLeafData_Private(PetscSF sf, PetscSFLink link, PetscSFScope scope, void *leafdata, MPI_Op op)
1013d71ae5a4SJacob Faibussowitsch {
1014cd620004SJunchao Zhang   const PetscInt *leafindices = NULL;
1015cd620004SJunchao Zhang   PetscInt        count, start;
1016fcc7397dSJunchao Zhang   PetscErrorCode (*UnpackAndOp)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *) = NULL;
1017cd620004SJunchao Zhang   PetscMemType   leafmtype                                                                                               = link->leafmtype;
1018fcc7397dSJunchao Zhang   PetscSFPackOpt opt                                                                                                     = NULL;
1019cd620004SJunchao Zhang 
1020cd620004SJunchao Zhang   PetscFunctionBegin;
102171438e86SJunchao Zhang   if (!link->leafdirect[scope]) { /* If leafdata works directly as rootbuf, skip unpacking */
10229566063dSJacob Faibussowitsch     PetscCall(PetscSFLinkGetUnpackAndOp(link, leafmtype, op, sf->leafdups[scope], &UnpackAndOp));
1023cd620004SJunchao Zhang     if (UnpackAndOp) {
10249566063dSJacob Faibussowitsch       PetscCall(PetscSFLinkGetLeafPackOptAndIndices(sf, link, leafmtype, scope, &count, &start, &opt, &leafindices));
10259566063dSJacob Faibussowitsch       PetscCall((*UnpackAndOp)(link, count, start, opt, leafindices, leafdata, link->leafbuf[scope][leafmtype]));
1026cd620004SJunchao Zhang     } else {
10279566063dSJacob Faibussowitsch       PetscCall(PetscSFLinkGetLeafPackOptAndIndices(sf, link, PETSC_MEMTYPE_HOST, scope, &count, &start, &opt, &leafindices));
10289566063dSJacob Faibussowitsch       PetscCall(PetscSFLinkUnpackDataWithMPIReduceLocal(sf, link, count, start, leafindices, leafdata, link->leafbuf[scope][leafmtype], op));
1029cd620004SJunchao Zhang     }
1030cd620004SJunchao Zhang   }
10319566063dSJacob Faibussowitsch   PetscCall(PetscSFLinkLogFlopsAfterUnpackLeafData(sf, link, scope, op));
10323ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
103371438e86SJunchao Zhang }
103471438e86SJunchao Zhang /* Unpack rootbuf to rootdata, which are in the same memory space */
1035d71ae5a4SJacob Faibussowitsch PetscErrorCode PetscSFLinkUnpackRootData(PetscSF sf, PetscSFLink link, PetscSFScope scope, void *rootdata, MPI_Op op)
1036d71ae5a4SJacob Faibussowitsch {
103771438e86SJunchao Zhang   PetscSF_Basic *bas = (PetscSF_Basic *)sf->data;
103871438e86SJunchao Zhang 
103971438e86SJunchao Zhang   PetscFunctionBegin;
10409566063dSJacob Faibussowitsch   PetscCall(PetscLogEventBegin(PETSCSF_Unpack, sf, 0, 0, 0));
10419566063dSJacob Faibussowitsch   if (bas->rootbuflen[scope]) PetscCall(PetscSFLinkUnpackRootData_Private(sf, link, scope, rootdata, op));
10429566063dSJacob Faibussowitsch   PetscCall(PetscLogEventEnd(PETSCSF_Unpack, sf, 0, 0, 0));
104371438e86SJunchao Zhang   if (scope == PETSCSF_REMOTE) {
10449566063dSJacob Faibussowitsch     if (link->PostUnpack) PetscCall((*link->PostUnpack)(sf, link, PETSCSF_LEAF2ROOT)); /* Used by SF nvshmem */
10459566063dSJacob Faibussowitsch     if (PetscMemTypeDevice(link->rootmtype) && link->SyncDevice && sf->unknown_input_stream) PetscCall((*link->SyncDevice)(link));
104671438e86SJunchao Zhang   }
10473ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1048cd620004SJunchao Zhang }
1049cd620004SJunchao Zhang 
105071438e86SJunchao Zhang /* Unpack leafbuf to leafdata for remote (common case) or local (rare case when rootmtype != leafmtype) */
1051d71ae5a4SJacob Faibussowitsch PetscErrorCode PetscSFLinkUnpackLeafData(PetscSF sf, PetscSFLink link, PetscSFScope scope, void *leafdata, MPI_Op op)
1052d71ae5a4SJacob Faibussowitsch {
105371438e86SJunchao Zhang   PetscFunctionBegin;
10549566063dSJacob Faibussowitsch   PetscCall(PetscLogEventBegin(PETSCSF_Unpack, sf, 0, 0, 0));
10559566063dSJacob Faibussowitsch   if (sf->leafbuflen[scope]) PetscCall(PetscSFLinkUnpackLeafData_Private(sf, link, scope, leafdata, op));
10569566063dSJacob Faibussowitsch   PetscCall(PetscLogEventEnd(PETSCSF_Unpack, sf, 0, 0, 0));
105771438e86SJunchao Zhang   if (scope == PETSCSF_REMOTE) {
10589566063dSJacob Faibussowitsch     if (link->PostUnpack) PetscCall((*link->PostUnpack)(sf, link, PETSCSF_ROOT2LEAF)); /* Used by SF nvshmem */
10599566063dSJacob Faibussowitsch     if (PetscMemTypeDevice(link->leafmtype) && link->SyncDevice && sf->unknown_input_stream) PetscCall((*link->SyncDevice)(link));
106071438e86SJunchao Zhang   }
10613ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
106271438e86SJunchao Zhang }
106371438e86SJunchao Zhang 
106471438e86SJunchao Zhang /* FetchAndOp rootdata with rootbuf, it is a kind of Unpack on rootdata, except it also updates rootbuf */
1065d71ae5a4SJacob Faibussowitsch PetscErrorCode PetscSFLinkFetchAndOpRemote(PetscSF sf, PetscSFLink link, void *rootdata, MPI_Op op)
1066d71ae5a4SJacob Faibussowitsch {
1067cd620004SJunchao Zhang   const PetscInt *rootindices = NULL;
1068cd620004SJunchao Zhang   PetscInt        count, start;
1069cd620004SJunchao Zhang   PetscSF_Basic  *bas                                                                                             = (PetscSF_Basic *)sf->data;
1070fcc7397dSJunchao Zhang   PetscErrorCode (*FetchAndOp)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, void *) = NULL;
1071cd620004SJunchao Zhang   PetscMemType   rootmtype                                                                                        = link->rootmtype;
1072fcc7397dSJunchao Zhang   PetscSFPackOpt opt                                                                                              = NULL;
1073cd620004SJunchao Zhang 
1074cd620004SJunchao Zhang   PetscFunctionBegin;
10759566063dSJacob Faibussowitsch   PetscCall(PetscLogEventBegin(PETSCSF_Unpack, sf, 0, 0, 0));
107671438e86SJunchao Zhang   if (bas->rootbuflen[PETSCSF_REMOTE]) {
1077cd620004SJunchao Zhang     /* Do FetchAndOp on rootdata with rootbuf */
10789566063dSJacob Faibussowitsch     PetscCall(PetscSFLinkGetFetchAndOp(link, rootmtype, op, bas->rootdups[PETSCSF_REMOTE], &FetchAndOp));
10799566063dSJacob Faibussowitsch     PetscCall(PetscSFLinkGetRootPackOptAndIndices(sf, link, rootmtype, PETSCSF_REMOTE, &count, &start, &opt, &rootindices));
10809566063dSJacob Faibussowitsch     PetscCall((*FetchAndOp)(link, count, start, opt, rootindices, rootdata, link->rootbuf[PETSCSF_REMOTE][rootmtype]));
1081cd620004SJunchao Zhang   }
10829566063dSJacob Faibussowitsch   PetscCall(PetscSFLinkLogFlopsAfterUnpackRootData(sf, link, PETSCSF_REMOTE, op));
10839566063dSJacob Faibussowitsch   PetscCall(PetscLogEventEnd(PETSCSF_Unpack, sf, 0, 0, 0));
10843ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1085cd620004SJunchao Zhang }
1086cd620004SJunchao Zhang 
1087d71ae5a4SJacob Faibussowitsch PetscErrorCode PetscSFLinkScatterLocal(PetscSF sf, PetscSFLink link, PetscSFDirection direction, void *rootdata, void *leafdata, MPI_Op op)
1088d71ae5a4SJacob Faibussowitsch {
1089cd620004SJunchao Zhang   const PetscInt *rootindices = NULL, *leafindices = NULL;
1090cd620004SJunchao Zhang   PetscInt        count, rootstart, leafstart;
1091cd620004SJunchao Zhang   PetscSF_Basic  *bas                                                                                                                                                 = (PetscSF_Basic *)sf->data;
1092fcc7397dSJunchao Zhang   PetscErrorCode (*ScatterAndOp)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *) = NULL;
109371438e86SJunchao Zhang   PetscMemType   rootmtype = link->rootmtype, leafmtype = link->leafmtype, srcmtype, dstmtype;
1094fcc7397dSJunchao Zhang   PetscSFPackOpt leafopt = NULL, rootopt = NULL;
109571438e86SJunchao Zhang   PetscInt       buflen = sf->leafbuflen[PETSCSF_LOCAL];
109671438e86SJunchao Zhang   char          *srcbuf = NULL, *dstbuf = NULL;
109771438e86SJunchao Zhang   PetscBool      dstdups;
1098cd620004SJunchao Zhang 
1099362febeeSStefano Zampini   PetscFunctionBegin;
11003ba16761SJacob Faibussowitsch   if (!buflen) PetscFunctionReturn(PETSC_SUCCESS);
110171438e86SJunchao Zhang   if (rootmtype != leafmtype) { /* The cross memory space local scatter is done by pack, copy and unpack */
110271438e86SJunchao Zhang     if (direction == PETSCSF_ROOT2LEAF) {
11039566063dSJacob Faibussowitsch       PetscCall(PetscSFLinkPackRootData(sf, link, PETSCSF_LOCAL, rootdata));
110471438e86SJunchao Zhang       srcmtype = rootmtype;
110571438e86SJunchao Zhang       srcbuf   = link->rootbuf[PETSCSF_LOCAL][rootmtype];
110671438e86SJunchao Zhang       dstmtype = leafmtype;
110771438e86SJunchao Zhang       dstbuf   = link->leafbuf[PETSCSF_LOCAL][leafmtype];
110871438e86SJunchao Zhang     } else {
11099566063dSJacob Faibussowitsch       PetscCall(PetscSFLinkPackLeafData(sf, link, PETSCSF_LOCAL, leafdata));
111071438e86SJunchao Zhang       srcmtype = leafmtype;
111171438e86SJunchao Zhang       srcbuf   = link->leafbuf[PETSCSF_LOCAL][leafmtype];
111271438e86SJunchao Zhang       dstmtype = rootmtype;
111371438e86SJunchao Zhang       dstbuf   = link->rootbuf[PETSCSF_LOCAL][rootmtype];
111471438e86SJunchao Zhang     }
11159566063dSJacob Faibussowitsch     PetscCall((*link->Memcpy)(link, dstmtype, dstbuf, srcmtype, srcbuf, buflen * link->unitbytes));
111671438e86SJunchao Zhang     /* If above is a device to host copy, we have to sync the stream before accessing the buffer on host */
11179566063dSJacob Faibussowitsch     if (PetscMemTypeHost(dstmtype)) PetscCall((*link->SyncStream)(link));
111871438e86SJunchao Zhang     if (direction == PETSCSF_ROOT2LEAF) {
11199566063dSJacob Faibussowitsch       PetscCall(PetscSFLinkUnpackLeafData(sf, link, PETSCSF_LOCAL, leafdata, op));
1120cd620004SJunchao Zhang     } else {
11219566063dSJacob Faibussowitsch       PetscCall(PetscSFLinkUnpackRootData(sf, link, PETSCSF_LOCAL, rootdata, op));
112271438e86SJunchao Zhang     }
112371438e86SJunchao Zhang   } else {
112471438e86SJunchao Zhang     dstdups  = (direction == PETSCSF_ROOT2LEAF) ? sf->leafdups[PETSCSF_LOCAL] : bas->rootdups[PETSCSF_LOCAL];
112571438e86SJunchao Zhang     dstmtype = (direction == PETSCSF_ROOT2LEAF) ? link->leafmtype : link->rootmtype;
11269566063dSJacob Faibussowitsch     PetscCall(PetscSFLinkGetScatterAndOp(link, dstmtype, op, dstdups, &ScatterAndOp));
1127cd620004SJunchao Zhang     if (ScatterAndOp) {
11289566063dSJacob Faibussowitsch       PetscCall(PetscSFLinkGetRootPackOptAndIndices(sf, link, rootmtype, PETSCSF_LOCAL, &count, &rootstart, &rootopt, &rootindices));
11299566063dSJacob Faibussowitsch       PetscCall(PetscSFLinkGetLeafPackOptAndIndices(sf, link, leafmtype, PETSCSF_LOCAL, &count, &leafstart, &leafopt, &leafindices));
113071438e86SJunchao Zhang       if (direction == PETSCSF_ROOT2LEAF) {
11319566063dSJacob Faibussowitsch         PetscCall((*ScatterAndOp)(link, count, rootstart, rootopt, rootindices, rootdata, leafstart, leafopt, leafindices, leafdata));
1132cd620004SJunchao Zhang       } else {
11339566063dSJacob Faibussowitsch         PetscCall((*ScatterAndOp)(link, count, leafstart, leafopt, leafindices, leafdata, rootstart, rootopt, rootindices, rootdata));
113471438e86SJunchao Zhang       }
1135cd620004SJunchao Zhang     } else {
11369566063dSJacob Faibussowitsch       PetscCall(PetscSFLinkGetRootPackOptAndIndices(sf, link, PETSC_MEMTYPE_HOST, PETSCSF_LOCAL, &count, &rootstart, &rootopt, &rootindices));
11379566063dSJacob Faibussowitsch       PetscCall(PetscSFLinkGetLeafPackOptAndIndices(sf, link, PETSC_MEMTYPE_HOST, PETSCSF_LOCAL, &count, &leafstart, &leafopt, &leafindices));
113871438e86SJunchao Zhang       if (direction == PETSCSF_ROOT2LEAF) {
11399566063dSJacob Faibussowitsch         PetscCall(PetscSFLinkScatterDataWithMPIReduceLocal(sf, link, count, rootstart, rootindices, rootdata, leafstart, leafindices, leafdata, op));
114071438e86SJunchao Zhang       } else {
11419566063dSJacob Faibussowitsch         PetscCall(PetscSFLinkScatterDataWithMPIReduceLocal(sf, link, count, leafstart, leafindices, leafdata, rootstart, rootindices, rootdata, op));
1142cd620004SJunchao Zhang       }
1143cd620004SJunchao Zhang     }
114471438e86SJunchao Zhang   }
11453ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1146cd620004SJunchao Zhang }
1147cd620004SJunchao Zhang 
1148cd620004SJunchao Zhang /* Fetch rootdata to leafdata and leafupdate locally */
1149d71ae5a4SJacob Faibussowitsch PetscErrorCode PetscSFLinkFetchAndOpLocal(PetscSF sf, PetscSFLink link, void *rootdata, const void *leafdata, void *leafupdate, MPI_Op op)
1150d71ae5a4SJacob Faibussowitsch {
1151cd620004SJunchao Zhang   const PetscInt *rootindices = NULL, *leafindices = NULL;
1152cd620004SJunchao Zhang   PetscInt        count, rootstart, leafstart;
1153cd620004SJunchao Zhang   PetscSF_Basic  *bas                                                                                                                                                            = (PetscSF_Basic *)sf->data;
1154fcc7397dSJunchao Zhang   PetscErrorCode (*FetchAndOpLocal)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, void *) = NULL;
1155cd620004SJunchao Zhang   const PetscMemType rootmtype = link->rootmtype, leafmtype = link->leafmtype;
1156fcc7397dSJunchao Zhang   PetscSFPackOpt     leafopt = NULL, rootopt = NULL;
1157cd620004SJunchao Zhang 
1158cd620004SJunchao Zhang   PetscFunctionBegin;
11593ba16761SJacob Faibussowitsch   if (!bas->rootbuflen[PETSCSF_LOCAL]) PetscFunctionReturn(PETSC_SUCCESS);
1160cd620004SJunchao Zhang   if (rootmtype != leafmtype) {
1161cd620004SJunchao Zhang     /* The local communication has to go through pack and unpack */
1162cd620004SJunchao Zhang     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Doing PetscSFFetchAndOp with rootdata and leafdata on opposite side of CPU and GPU");
1163cd620004SJunchao Zhang   } else {
11649566063dSJacob Faibussowitsch     PetscCall(PetscSFLinkGetRootPackOptAndIndices(sf, link, rootmtype, PETSCSF_LOCAL, &count, &rootstart, &rootopt, &rootindices));
11659566063dSJacob Faibussowitsch     PetscCall(PetscSFLinkGetLeafPackOptAndIndices(sf, link, leafmtype, PETSCSF_LOCAL, &count, &leafstart, &leafopt, &leafindices));
11669566063dSJacob Faibussowitsch     PetscCall(PetscSFLinkGetFetchAndOpLocal(link, rootmtype, op, bas->rootdups[PETSCSF_LOCAL], &FetchAndOpLocal));
11679566063dSJacob Faibussowitsch     PetscCall((*FetchAndOpLocal)(link, count, rootstart, rootopt, rootindices, rootdata, leafstart, leafopt, leafindices, leafdata, leafupdate));
1168cd620004SJunchao Zhang   }
11693ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
117040e23c03SJunchao Zhang }
117140e23c03SJunchao Zhang 
117240e23c03SJunchao Zhang /*
1173cd620004SJunchao Zhang   Create per-rank pack/unpack optimizations based on indice patterns
117440e23c03SJunchao Zhang 
117540e23c03SJunchao Zhang    Input Parameters:
1176fcc7397dSJunchao Zhang   +  n       - Number of destination ranks
1177eb02082bSJunchao Zhang   .  offset  - [n+1] For the i-th rank, its associated indices are idx[offset[i], offset[i+1]). offset[0] needs not to be 0.
1178b23bfdefSJunchao Zhang   -  idx     - [*]   Array storing indices
117940e23c03SJunchao Zhang 
118040e23c03SJunchao Zhang    Output Parameters:
1181cd620004SJunchao Zhang   +  opt     - Pack optimizations. NULL if no optimizations.
118240e23c03SJunchao Zhang */
118366976f2fSJacob Faibussowitsch static PetscErrorCode PetscSFCreatePackOpt(PetscInt n, const PetscInt *offset, const PetscInt *idx, PetscSFPackOpt *out)
1184d71ae5a4SJacob Faibussowitsch {
1185fcc7397dSJunchao Zhang   PetscInt       r, p, start, i, j, k, dx, dy, dz, dydz, m, X, Y;
1186fcc7397dSJunchao Zhang   PetscBool      optimizable = PETSC_TRUE;
118740e23c03SJunchao Zhang   PetscSFPackOpt opt;
118840e23c03SJunchao Zhang 
118940e23c03SJunchao Zhang   PetscFunctionBegin;
11909566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(1, &opt));
11919566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(7 * n + 2, &opt->array));
1192fcc7397dSJunchao Zhang   opt->n = opt->array[0] = n;
1193fcc7397dSJunchao Zhang   opt->offset            = opt->array + 1;
1194fcc7397dSJunchao Zhang   opt->start             = opt->array + n + 2;
1195fcc7397dSJunchao Zhang   opt->dx                = opt->array + 2 * n + 2;
1196fcc7397dSJunchao Zhang   opt->dy                = opt->array + 3 * n + 2;
1197fcc7397dSJunchao Zhang   opt->dz                = opt->array + 4 * n + 2;
1198fcc7397dSJunchao Zhang   opt->X                 = opt->array + 5 * n + 2;
1199fcc7397dSJunchao Zhang   opt->Y                 = opt->array + 6 * n + 2;
1200fcc7397dSJunchao Zhang 
1201fcc7397dSJunchao Zhang   for (r = 0; r < n; r++) {            /* For each destination rank */
1202fcc7397dSJunchao Zhang     m     = offset[r + 1] - offset[r]; /* Total number of indices for this rank. We want to see if m can be factored into dx*dy*dz */
1203fcc7397dSJunchao Zhang     p     = offset[r];
1204fcc7397dSJunchao Zhang     start = idx[p]; /* First index for this rank */
1205fcc7397dSJunchao Zhang     p++;
1206fcc7397dSJunchao Zhang 
1207fcc7397dSJunchao Zhang     /* Search in X dimension */
1208fcc7397dSJunchao Zhang     for (dx = 1; dx < m; dx++, p++) {
1209fcc7397dSJunchao Zhang       if (start + dx != idx[p]) break;
1210b23bfdefSJunchao Zhang     }
1211b23bfdefSJunchao Zhang 
1212fcc7397dSJunchao Zhang     dydz = m / dx;
1213fcc7397dSJunchao Zhang     X    = dydz > 1 ? (idx[p] - start) : dx;
1214fcc7397dSJunchao Zhang     /* Not optimizable if m is not a multiple of dx, or some unrecognized pattern is found */
12159371c9d4SSatish Balay     if (m % dx || X <= 0) {
12169371c9d4SSatish Balay       optimizable = PETSC_FALSE;
12179371c9d4SSatish Balay       goto finish;
12189371c9d4SSatish Balay     }
1219fcc7397dSJunchao Zhang     for (dy = 1; dy < dydz; dy++) { /* Search in Y dimension */
1220fcc7397dSJunchao Zhang       for (i = 0; i < dx; i++, p++) {
1221fcc7397dSJunchao Zhang         if (start + X * dy + i != idx[p]) {
12229371c9d4SSatish Balay           if (i) {
12239371c9d4SSatish Balay             optimizable = PETSC_FALSE;
12249371c9d4SSatish Balay             goto finish;
12259371c9d4SSatish Balay           } /* The pattern is violated in the middle of an x-walk */
12269371c9d4SSatish Balay           else
12279371c9d4SSatish Balay             goto Z_dimension;
122840e23c03SJunchao Zhang         }
122940e23c03SJunchao Zhang       }
123040e23c03SJunchao Zhang     }
123140e23c03SJunchao Zhang 
1232fcc7397dSJunchao Zhang   Z_dimension:
1233fcc7397dSJunchao Zhang     dz = m / (dx * dy);
1234fcc7397dSJunchao Zhang     Y  = dz > 1 ? (idx[p] - start) / X : dy;
1235fcc7397dSJunchao Zhang     /* Not optimizable if m is not a multiple of dx*dy, or some unrecognized pattern is found */
12369371c9d4SSatish Balay     if (m % (dx * dy) || Y <= 0) {
12379371c9d4SSatish Balay       optimizable = PETSC_FALSE;
12389371c9d4SSatish Balay       goto finish;
12399371c9d4SSatish Balay     }
1240fcc7397dSJunchao Zhang     for (k = 1; k < dz; k++) { /* Go through Z dimension to see if remaining indices follow the pattern */
1241fcc7397dSJunchao Zhang       for (j = 0; j < dy; j++) {
1242fcc7397dSJunchao Zhang         for (i = 0; i < dx; i++, p++) {
12439371c9d4SSatish Balay           if (start + X * Y * k + X * j + i != idx[p]) {
12449371c9d4SSatish Balay             optimizable = PETSC_FALSE;
12459371c9d4SSatish Balay             goto finish;
12469371c9d4SSatish Balay           }
124740e23c03SJunchao Zhang         }
124840e23c03SJunchao Zhang       }
124940e23c03SJunchao Zhang     }
1250fcc7397dSJunchao Zhang     opt->start[r] = start;
1251fcc7397dSJunchao Zhang     opt->dx[r]    = dx;
1252fcc7397dSJunchao Zhang     opt->dy[r]    = dy;
1253fcc7397dSJunchao Zhang     opt->dz[r]    = dz;
1254fcc7397dSJunchao Zhang     opt->X[r]     = X;
1255fcc7397dSJunchao Zhang     opt->Y[r]     = Y;
125640e23c03SJunchao Zhang   }
125740e23c03SJunchao Zhang 
1258fcc7397dSJunchao Zhang finish:
1259fcc7397dSJunchao Zhang   /* If not optimizable, free arrays to save memory */
1260fcc7397dSJunchao Zhang   if (!n || !optimizable) {
12619566063dSJacob Faibussowitsch     PetscCall(PetscFree(opt->array));
12629566063dSJacob Faibussowitsch     PetscCall(PetscFree(opt));
126340e23c03SJunchao Zhang     *out = NULL;
1264fcc7397dSJunchao Zhang   } else {
1265fcc7397dSJunchao Zhang     opt->offset[0] = 0;
1266fcc7397dSJunchao Zhang     for (r = 0; r < n; r++) opt->offset[r + 1] = opt->offset[r] + opt->dx[r] * opt->dy[r] * opt->dz[r];
1267fcc7397dSJunchao Zhang     *out = opt;
1268fcc7397dSJunchao Zhang   }
12693ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
127040e23c03SJunchao Zhang }
127140e23c03SJunchao Zhang 
1272d71ae5a4SJacob Faibussowitsch static inline PetscErrorCode PetscSFDestroyPackOpt(PetscSF sf, PetscMemType mtype, PetscSFPackOpt *out)
1273d71ae5a4SJacob Faibussowitsch {
127440e23c03SJunchao Zhang   PetscSFPackOpt opt = *out;
127540e23c03SJunchao Zhang 
127640e23c03SJunchao Zhang   PetscFunctionBegin;
127740e23c03SJunchao Zhang   if (opt) {
12789566063dSJacob Faibussowitsch     PetscCall(PetscSFFree(sf, mtype, opt->array));
12799566063dSJacob Faibussowitsch     PetscCall(PetscFree(opt));
128040e23c03SJunchao Zhang     *out = NULL;
128140e23c03SJunchao Zhang   }
12823ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
128340e23c03SJunchao Zhang }
1284cd620004SJunchao Zhang 
1285d71ae5a4SJacob Faibussowitsch PetscErrorCode PetscSFSetUpPackFields(PetscSF sf)
1286d71ae5a4SJacob Faibussowitsch {
1287cd620004SJunchao Zhang   PetscSF_Basic *bas = (PetscSF_Basic *)sf->data;
1288cd620004SJunchao Zhang   PetscInt       i, j;
1289cd620004SJunchao Zhang 
1290cd620004SJunchao Zhang   PetscFunctionBegin;
1291cd620004SJunchao Zhang   /* [0] for PETSCSF_LOCAL and [1] for PETSCSF_REMOTE in the following */
1292cd620004SJunchao Zhang   for (i = 0; i < 2; i++) { /* Set defaults */
1293cd620004SJunchao Zhang     sf->leafstart[i]   = 0;
1294cd620004SJunchao Zhang     sf->leafcontig[i]  = PETSC_TRUE;
1295cd620004SJunchao Zhang     sf->leafdups[i]    = PETSC_FALSE;
1296cd620004SJunchao Zhang     bas->rootstart[i]  = 0;
1297cd620004SJunchao Zhang     bas->rootcontig[i] = PETSC_TRUE;
1298cd620004SJunchao Zhang     bas->rootdups[i]   = PETSC_FALSE;
1299cd620004SJunchao Zhang   }
1300cd620004SJunchao Zhang 
1301cd620004SJunchao Zhang   sf->leafbuflen[0] = sf->roffset[sf->ndranks];
1302cd620004SJunchao Zhang   sf->leafbuflen[1] = sf->roffset[sf->nranks] - sf->roffset[sf->ndranks];
1303cd620004SJunchao Zhang 
1304cd620004SJunchao Zhang   if (sf->leafbuflen[0]) sf->leafstart[0] = sf->rmine[0];
1305cd620004SJunchao Zhang   if (sf->leafbuflen[1]) sf->leafstart[1] = sf->rmine[sf->roffset[sf->ndranks]];
1306cd620004SJunchao Zhang 
1307cd620004SJunchao Zhang   /* Are leaf indices for self and remote contiguous? If yes, it is best for pack/unpack */
1308cd620004SJunchao Zhang   for (i = 0; i < sf->roffset[sf->ndranks]; i++) { /* self */
13099371c9d4SSatish Balay     if (sf->rmine[i] != sf->leafstart[0] + i) {
13109371c9d4SSatish Balay       sf->leafcontig[0] = PETSC_FALSE;
13119371c9d4SSatish Balay       break;
13129371c9d4SSatish Balay     }
1313cd620004SJunchao Zhang   }
1314cd620004SJunchao Zhang   for (i = sf->roffset[sf->ndranks], j = 0; i < sf->roffset[sf->nranks]; i++, j++) { /* remote */
13159371c9d4SSatish Balay     if (sf->rmine[i] != sf->leafstart[1] + j) {
13169371c9d4SSatish Balay       sf->leafcontig[1] = PETSC_FALSE;
13179371c9d4SSatish Balay       break;
13189371c9d4SSatish Balay     }
1319cd620004SJunchao Zhang   }
1320cd620004SJunchao Zhang 
1321cd620004SJunchao Zhang   /* If not, see if we can have per-rank optimizations by doing index analysis */
13229566063dSJacob Faibussowitsch   if (!sf->leafcontig[0]) PetscCall(PetscSFCreatePackOpt(sf->ndranks, sf->roffset, sf->rmine, &sf->leafpackopt[0]));
13239566063dSJacob Faibussowitsch   if (!sf->leafcontig[1]) PetscCall(PetscSFCreatePackOpt(sf->nranks - sf->ndranks, sf->roffset + sf->ndranks, sf->rmine, &sf->leafpackopt[1]));
1324cd620004SJunchao Zhang 
1325cd620004SJunchao Zhang   /* Are root indices for self and remote contiguous? */
1326cd620004SJunchao Zhang   bas->rootbuflen[0] = bas->ioffset[bas->ndiranks];
1327cd620004SJunchao Zhang   bas->rootbuflen[1] = bas->ioffset[bas->niranks] - bas->ioffset[bas->ndiranks];
1328cd620004SJunchao Zhang 
1329cd620004SJunchao Zhang   if (bas->rootbuflen[0]) bas->rootstart[0] = bas->irootloc[0];
1330cd620004SJunchao Zhang   if (bas->rootbuflen[1]) bas->rootstart[1] = bas->irootloc[bas->ioffset[bas->ndiranks]];
1331cd620004SJunchao Zhang 
1332cd620004SJunchao Zhang   for (i = 0; i < bas->ioffset[bas->ndiranks]; i++) {
13339371c9d4SSatish Balay     if (bas->irootloc[i] != bas->rootstart[0] + i) {
13349371c9d4SSatish Balay       bas->rootcontig[0] = PETSC_FALSE;
13359371c9d4SSatish Balay       break;
13369371c9d4SSatish Balay     }
1337cd620004SJunchao Zhang   }
1338cd620004SJunchao Zhang   for (i = bas->ioffset[bas->ndiranks], j = 0; i < bas->ioffset[bas->niranks]; i++, j++) {
13399371c9d4SSatish Balay     if (bas->irootloc[i] != bas->rootstart[1] + j) {
13409371c9d4SSatish Balay       bas->rootcontig[1] = PETSC_FALSE;
13419371c9d4SSatish Balay       break;
13429371c9d4SSatish Balay     }
1343cd620004SJunchao Zhang   }
1344cd620004SJunchao Zhang 
13459566063dSJacob Faibussowitsch   if (!bas->rootcontig[0]) PetscCall(PetscSFCreatePackOpt(bas->ndiranks, bas->ioffset, bas->irootloc, &bas->rootpackopt[0]));
13469566063dSJacob Faibussowitsch   if (!bas->rootcontig[1]) PetscCall(PetscSFCreatePackOpt(bas->niranks - bas->ndiranks, bas->ioffset + bas->ndiranks, bas->irootloc, &bas->rootpackopt[1]));
1347cd620004SJunchao Zhang 
1348cd620004SJunchao Zhang   /* Check dups in indices so that CUDA unpacking kernels can use cheaper regular instructions instead of atomics when they know there are no data race chances */
1349013b3241SStefano Zampini   if (PetscDefined(HAVE_DEVICE)) {
1350013b3241SStefano Zampini     PetscBool ismulti = (sf->multi == sf) ? PETSC_TRUE : PETSC_FALSE;
13519566063dSJacob Faibussowitsch     if (!sf->leafcontig[0] && !ismulti) PetscCall(PetscCheckDupsInt(sf->leafbuflen[0], sf->rmine, &sf->leafdups[0]));
13529566063dSJacob Faibussowitsch     if (!sf->leafcontig[1] && !ismulti) PetscCall(PetscCheckDupsInt(sf->leafbuflen[1], sf->rmine + sf->roffset[sf->ndranks], &sf->leafdups[1]));
13539566063dSJacob Faibussowitsch     if (!bas->rootcontig[0] && !ismulti) PetscCall(PetscCheckDupsInt(bas->rootbuflen[0], bas->irootloc, &bas->rootdups[0]));
13549566063dSJacob Faibussowitsch     if (!bas->rootcontig[1] && !ismulti) PetscCall(PetscCheckDupsInt(bas->rootbuflen[1], bas->irootloc + bas->ioffset[bas->ndiranks], &bas->rootdups[1]));
1355013b3241SStefano Zampini   }
13563ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1357cd620004SJunchao Zhang }
1358cd620004SJunchao Zhang 
1359d71ae5a4SJacob Faibussowitsch PetscErrorCode PetscSFResetPackFields(PetscSF sf)
1360d71ae5a4SJacob Faibussowitsch {
1361cd620004SJunchao Zhang   PetscSF_Basic *bas = (PetscSF_Basic *)sf->data;
1362cd620004SJunchao Zhang   PetscInt       i;
1363cd620004SJunchao Zhang 
1364cd620004SJunchao Zhang   PetscFunctionBegin;
1365cd620004SJunchao Zhang   for (i = PETSCSF_LOCAL; i <= PETSCSF_REMOTE; i++) {
13669566063dSJacob Faibussowitsch     PetscCall(PetscSFDestroyPackOpt(sf, PETSC_MEMTYPE_HOST, &sf->leafpackopt[i]));
13679566063dSJacob Faibussowitsch     PetscCall(PetscSFDestroyPackOpt(sf, PETSC_MEMTYPE_HOST, &bas->rootpackopt[i]));
13687fd2d3dbSJunchao Zhang #if defined(PETSC_HAVE_DEVICE)
13699566063dSJacob Faibussowitsch     PetscCall(PetscSFDestroyPackOpt(sf, PETSC_MEMTYPE_DEVICE, &sf->leafpackopt_d[i]));
13709566063dSJacob Faibussowitsch     PetscCall(PetscSFDestroyPackOpt(sf, PETSC_MEMTYPE_DEVICE, &bas->rootpackopt_d[i]));
13717fd2d3dbSJunchao Zhang #endif
1372cd620004SJunchao Zhang   }
13733ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1374cd620004SJunchao Zhang }
1375