146233b44SBarry Smith #include <petsc/private/sfimpl.h>
240e23c03SJunchao Zhang #include <../src/vec/is/sf/impls/basic/sfpack.h>
340e23c03SJunchao Zhang #include <../src/vec/is/sf/impls/basic/sfbasic.h>
440e23c03SJunchao Zhang
57fd2d3dbSJunchao Zhang /* This is a C file that contains packing facilities, with dispatches to device if enabled. */
67fd2d3dbSJunchao Zhang
740e23c03SJunchao Zhang /*
840e23c03SJunchao Zhang * MPI_Reduce_local is not really useful because it can't handle sparse data and it vectorizes "in the wrong direction",
940e23c03SJunchao Zhang * therefore we pack data types manually. This file defines packing routines for the standard data types.
1040e23c03SJunchao Zhang */
1140e23c03SJunchao Zhang
12cd620004SJunchao Zhang #define CPPJoin4(a, b, c, d) a##_##b##_##c##_##d
1340e23c03SJunchao Zhang
14cd620004SJunchao Zhang /* Operations working like s += t */
159371c9d4SSatish Balay #define OP_BINARY(op, s, t) \
16d71ae5a4SJacob Faibussowitsch do { \
17d71ae5a4SJacob Faibussowitsch (s) = (s)op(t); \
18d71ae5a4SJacob Faibussowitsch } while (0) /* binary ops in the middle such as +, *, && etc. */
199371c9d4SSatish Balay #define OP_FUNCTION(op, s, t) \
20d71ae5a4SJacob Faibussowitsch do { \
21d71ae5a4SJacob Faibussowitsch (s) = op((s), (t)); \
22d71ae5a4SJacob Faibussowitsch } while (0) /* ops like a function, such as PetscMax, PetscMin */
239371c9d4SSatish Balay #define OP_LXOR(op, s, t) \
24d71ae5a4SJacob Faibussowitsch do { \
25d71ae5a4SJacob Faibussowitsch (s) = (!(s)) != (!(t)); \
26d71ae5a4SJacob Faibussowitsch } while (0) /* logical exclusive OR */
279371c9d4SSatish Balay #define OP_ASSIGN(op, s, t) \
28d71ae5a4SJacob Faibussowitsch do { \
29d71ae5a4SJacob Faibussowitsch (s) = (t); \
30d71ae5a4SJacob Faibussowitsch } while (0)
31cd620004SJunchao Zhang /* Ref MPI MAXLOC */
32cd620004SJunchao Zhang #define OP_XLOC(op, s, t) \
33cd620004SJunchao Zhang do { \
34cd620004SJunchao Zhang if ((s).u == (t).u) (s).i = PetscMin((s).i, (t).i); \
35cd620004SJunchao Zhang else if (!((s).u op(t).u)) s = t; \
36cd620004SJunchao Zhang } while (0)
3740e23c03SJunchao Zhang
3840e23c03SJunchao Zhang /* DEF_PackFunc - macro defining a Pack routine
3940e23c03SJunchao Zhang
4040e23c03SJunchao Zhang Arguments of the macro:
41b23bfdefSJunchao Zhang +Type Type of the basic data in an entry, i.e., int, PetscInt, PetscReal etc. It is not the type of an entry.
42fcc7397dSJunchao Zhang .BS Block size for vectorization. It is a factor of bsz.
43b23bfdefSJunchao Zhang -EQ (bs == BS) ? 1 : 0. EQ is a compile-time const to help compiler optimizations. See below.
4440e23c03SJunchao Zhang
4540e23c03SJunchao Zhang Arguments of the Pack routine:
46cd620004SJunchao Zhang +count Number of indices in idx[].
47fcc7397dSJunchao Zhang .start When opt and idx are NULL, it means indices are contiguous & start is the first index; otherwise, not used.
48fcc7397dSJunchao Zhang .opt Per-pack optimization plan. NULL means no such plan.
49fcc7397dSJunchao Zhang .idx Indices of entries to packed.
50eb02082bSJunchao Zhang .link Provide a context for the current call, such as link->bs, number of basic types in an entry. Ex. if unit is MPI_2INT, then bs=2 and the basic type is int.
51cd620004SJunchao Zhang .unpacked Address of the unpacked data. The entries will be packed are unpacked[idx[i]],for i in [0,count).
52cd620004SJunchao Zhang -packed Address of the packed data.
5340e23c03SJunchao Zhang */
54b23bfdefSJunchao Zhang #define DEF_PackFunc(Type, BS, EQ) \
55d71ae5a4SJacob Faibussowitsch static PetscErrorCode CPPJoin4(Pack, Type, BS, EQ)(PetscSFLink link, PetscInt count, PetscInt start, PetscSFPackOpt opt, const PetscInt *idx, const void *unpacked, void *packed) \
56d71ae5a4SJacob Faibussowitsch { \
57b23bfdefSJunchao Zhang const Type *u = (const Type *)unpacked, *u2; \
58b23bfdefSJunchao Zhang Type *p = (Type *)packed, *p2; \
59fcc7397dSJunchao Zhang PetscInt i, j, k, X, Y, r, bs = link->bs; \
60fcc7397dSJunchao Zhang const PetscInt M = (EQ) ? 1 : bs / BS; /* If EQ, then M=1 enables compiler's const-propagation */ \
61b23bfdefSJunchao Zhang const PetscInt MBS = M * BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */ \
6240e23c03SJunchao Zhang PetscFunctionBegin; \
639566063dSJacob Faibussowitsch if (!idx) PetscCall(PetscArraycpy(p, u + start * MBS, MBS * count)); /* idx[] are contiguous */ \
649371c9d4SSatish Balay else if (opt) { /* has optimizations available */ p2 = p; \
65fcc7397dSJunchao Zhang for (r = 0; r < opt->n; r++) { \
66fcc7397dSJunchao Zhang u2 = u + opt->start[r] * MBS; \
67fcc7397dSJunchao Zhang X = opt->X[r]; \
68fcc7397dSJunchao Zhang Y = opt->Y[r]; \
69fcc7397dSJunchao Zhang for (k = 0; k < opt->dz[r]; k++) \
70fcc7397dSJunchao Zhang for (j = 0; j < opt->dy[r]; j++) { \
719566063dSJacob Faibussowitsch PetscCall(PetscArraycpy(p2, u2 + (X * Y * k + X * j) * MBS, opt->dx[r] * MBS)); \
72fcc7397dSJunchao Zhang p2 += opt->dx[r] * MBS; \
73fcc7397dSJunchao Zhang } \
74fcc7397dSJunchao Zhang } \
75fcc7397dSJunchao Zhang } else { \
76b23bfdefSJunchao Zhang for (i = 0; i < count; i++) \
77eb02082bSJunchao Zhang for (j = 0; j < M; j++) /* Decent compilers should eliminate this loop when M = const 1 */ \
78eb02082bSJunchao Zhang for (k = 0; k < BS; k++) /* Compiler either unrolls (BS=1) or vectorizes (BS=2,4,8,etc) this loop */ \
79b23bfdefSJunchao Zhang p[i * MBS + j * BS + k] = u[idx[i] * MBS + j * BS + k]; \
8040e23c03SJunchao Zhang } \
813ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); \
8240e23c03SJunchao Zhang }
8340e23c03SJunchao Zhang
84cd620004SJunchao Zhang /* DEF_Action - macro defining a UnpackAndInsert routine that unpacks data from a contiguous buffer
85cd620004SJunchao Zhang and inserts into a sparse array.
8640e23c03SJunchao Zhang
8740e23c03SJunchao Zhang Arguments:
88b23bfdefSJunchao Zhang .Type Type of the data
8940e23c03SJunchao Zhang .BS Block size for vectorization
90b23bfdefSJunchao Zhang .EQ (bs == BS) ? 1 : 0. EQ is a compile-time const.
9140e23c03SJunchao Zhang
9240e23c03SJunchao Zhang Notes:
9340e23c03SJunchao Zhang This macro is not combined with DEF_ActionAndOp because we want to use memcpy in this macro.
9440e23c03SJunchao Zhang */
95cd620004SJunchao Zhang #define DEF_UnpackFunc(Type, BS, EQ) \
96d71ae5a4SJacob Faibussowitsch static PetscErrorCode CPPJoin4(UnpackAndInsert, Type, BS, EQ)(PetscSFLink link, PetscInt count, PetscInt start, PetscSFPackOpt opt, const PetscInt *idx, void *unpacked, const void *packed) \
97d71ae5a4SJacob Faibussowitsch { \
98b23bfdefSJunchao Zhang Type *u = (Type *)unpacked, *u2; \
99fcc7397dSJunchao Zhang const Type *p = (const Type *)packed; \
100fcc7397dSJunchao Zhang PetscInt i, j, k, X, Y, r, bs = link->bs; \
101fcc7397dSJunchao Zhang const PetscInt M = (EQ) ? 1 : bs / BS; /* If EQ, then M=1 enables compiler's const-propagation */ \
102b23bfdefSJunchao Zhang const PetscInt MBS = M * BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */ \
10340e23c03SJunchao Zhang PetscFunctionBegin; \
104b23bfdefSJunchao Zhang if (!idx) { \
105fcc7397dSJunchao Zhang u += start * MBS; \
1069566063dSJacob Faibussowitsch if (u != p) PetscCall(PetscArraycpy(u, p, count * MBS)); \
107fcc7397dSJunchao Zhang } else if (opt) { /* has optimizations available */ \
108fcc7397dSJunchao Zhang for (r = 0; r < opt->n; r++) { \
109fcc7397dSJunchao Zhang u2 = u + opt->start[r] * MBS; \
110fcc7397dSJunchao Zhang X = opt->X[r]; \
111fcc7397dSJunchao Zhang Y = opt->Y[r]; \
112fcc7397dSJunchao Zhang for (k = 0; k < opt->dz[r]; k++) \
113fcc7397dSJunchao Zhang for (j = 0; j < opt->dy[r]; j++) { \
1149566063dSJacob Faibussowitsch PetscCall(PetscArraycpy(u2 + (X * Y * k + X * j) * MBS, p, opt->dx[r] * MBS)); \
115fcc7397dSJunchao Zhang p += opt->dx[r] * MBS; \
116fcc7397dSJunchao Zhang } \
117fcc7397dSJunchao Zhang } \
118fcc7397dSJunchao Zhang } else { \
119b23bfdefSJunchao Zhang for (i = 0; i < count; i++) \
120b23bfdefSJunchao Zhang for (j = 0; j < M; j++) \
121cd620004SJunchao Zhang for (k = 0; k < BS; k++) u[idx[i] * MBS + j * BS + k] = p[i * MBS + j * BS + k]; \
12240e23c03SJunchao Zhang } \
1233ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); \
12440e23c03SJunchao Zhang }
12540e23c03SJunchao Zhang
126cd620004SJunchao Zhang /* DEF_UnpackAndOp - macro defining a UnpackAndOp routine where Op should not be Insert
12740e23c03SJunchao Zhang
12840e23c03SJunchao Zhang Arguments:
129cd620004SJunchao Zhang +Opname Name of the Op, such as Add, Mult, LAND, etc.
130b23bfdefSJunchao Zhang .Type Type of the data
13140e23c03SJunchao Zhang .BS Block size for vectorization
132b23bfdefSJunchao Zhang .EQ (bs == BS) ? 1 : 0. EQ is a compile-time const.
133cd620004SJunchao Zhang .Op Operator for the op, such as +, *, &&, ||, PetscMax, PetscMin, etc.
134cd620004SJunchao Zhang .OpApply Macro defining application of the op. Could be OP_BINARY, OP_FUNCTION, OP_LXOR
13540e23c03SJunchao Zhang */
136cd620004SJunchao Zhang #define DEF_UnpackAndOp(Type, BS, EQ, Opname, Op, OpApply) \
137d71ae5a4SJacob Faibussowitsch static PetscErrorCode CPPJoin4(UnpackAnd##Opname, Type, BS, EQ)(PetscSFLink link, PetscInt count, PetscInt start, PetscSFPackOpt opt, const PetscInt *idx, void *unpacked, const void *packed) \
138d71ae5a4SJacob Faibussowitsch { \
139cd620004SJunchao Zhang Type *u = (Type *)unpacked, *u2; \
140fcc7397dSJunchao Zhang const Type *p = (const Type *)packed; \
141fcc7397dSJunchao Zhang PetscInt i, j, k, X, Y, r, bs = link->bs; \
142fcc7397dSJunchao Zhang const PetscInt M = (EQ) ? 1 : bs / BS; /* If EQ, then M=1 enables compiler's const-propagation */ \
143b23bfdefSJunchao Zhang const PetscInt MBS = M * BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */ \
14440e23c03SJunchao Zhang PetscFunctionBegin; \
145b23bfdefSJunchao Zhang if (!idx) { \
146fcc7397dSJunchao Zhang u += start * MBS; \
147cd620004SJunchao Zhang for (i = 0; i < count; i++) \
148cd620004SJunchao Zhang for (j = 0; j < M; j++) \
1499371c9d4SSatish Balay for (k = 0; k < BS; k++) OpApply(Op, u[i * MBS + j * BS + k], p[i * MBS + j * BS + k]); \
150fcc7397dSJunchao Zhang } else if (opt) { /* idx[] has patterns */ \
151fcc7397dSJunchao Zhang for (r = 0; r < opt->n; r++) { \
152fcc7397dSJunchao Zhang u2 = u + opt->start[r] * MBS; \
153fcc7397dSJunchao Zhang X = opt->X[r]; \
154fcc7397dSJunchao Zhang Y = opt->Y[r]; \
155fcc7397dSJunchao Zhang for (k = 0; k < opt->dz[r]; k++) \
156fcc7397dSJunchao Zhang for (j = 0; j < opt->dy[r]; j++) { \
157fcc7397dSJunchao Zhang for (i = 0; i < opt->dx[r] * MBS; i++) OpApply(Op, u2[(X * Y * k + X * j) * MBS + i], p[i]); \
158fcc7397dSJunchao Zhang p += opt->dx[r] * MBS; \
159fcc7397dSJunchao Zhang } \
160fcc7397dSJunchao Zhang } \
161fcc7397dSJunchao Zhang } else { \
162cd620004SJunchao Zhang for (i = 0; i < count; i++) \
163cd620004SJunchao Zhang for (j = 0; j < M; j++) \
1649371c9d4SSatish Balay for (k = 0; k < BS; k++) OpApply(Op, u[idx[i] * MBS + j * BS + k], p[i * MBS + j * BS + k]); \
165cd620004SJunchao Zhang } \
1663ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); \
167cd620004SJunchao Zhang }
168cd620004SJunchao Zhang
169cd620004SJunchao Zhang #define DEF_FetchAndOp(Type, BS, EQ, Opname, Op, OpApply) \
170d71ae5a4SJacob Faibussowitsch static PetscErrorCode CPPJoin4(FetchAnd##Opname, Type, BS, EQ)(PetscSFLink link, PetscInt count, PetscInt start, PetscSFPackOpt opt, const PetscInt *idx, void *unpacked, void *packed) \
171d71ae5a4SJacob Faibussowitsch { \
172fcc7397dSJunchao Zhang Type *u = (Type *)unpacked, *p = (Type *)packed, tmp; \
173fcc7397dSJunchao Zhang PetscInt i, j, k, r, l, bs = link->bs; \
174fcc7397dSJunchao Zhang const PetscInt M = (EQ) ? 1 : bs / BS; \
175fcc7397dSJunchao Zhang const PetscInt MBS = M * BS; \
176cd620004SJunchao Zhang PetscFunctionBegin; \
177fcc7397dSJunchao Zhang for (i = 0; i < count; i++) { \
178fcc7397dSJunchao Zhang r = (!idx ? start + i : idx[i]) * MBS; \
179fcc7397dSJunchao Zhang l = i * MBS; \
180b23bfdefSJunchao Zhang for (j = 0; j < M; j++) \
181b23bfdefSJunchao Zhang for (k = 0; k < BS; k++) { \
182fcc7397dSJunchao Zhang tmp = u[r + j * BS + k]; \
183fcc7397dSJunchao Zhang OpApply(Op, u[r + j * BS + k], p[l + j * BS + k]); \
184fcc7397dSJunchao Zhang p[l + j * BS + k] = tmp; \
185cd620004SJunchao Zhang } \
186cd620004SJunchao Zhang } \
1873ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); \
188cd620004SJunchao Zhang }
189cd620004SJunchao Zhang
190cd620004SJunchao Zhang #define DEF_ScatterAndOp(Type, BS, EQ, Opname, Op, OpApply) \
191d71ae5a4SJacob Faibussowitsch static PetscErrorCode CPPJoin4(ScatterAnd##Opname, Type, BS, EQ)(PetscSFLink link, PetscInt count, PetscInt srcStart, PetscSFPackOpt srcOpt, const PetscInt *srcIdx, const void *src, PetscInt dstStart, PetscSFPackOpt dstOpt, const PetscInt *dstIdx, void *dst) \
192d71ae5a4SJacob Faibussowitsch { \
193fcc7397dSJunchao Zhang const Type *u = (const Type *)src; \
194fcc7397dSJunchao Zhang Type *v = (Type *)dst; \
195fcc7397dSJunchao Zhang PetscInt i, j, k, s, t, X, Y, bs = link->bs; \
196cd620004SJunchao Zhang const PetscInt M = (EQ) ? 1 : bs / BS; \
197cd620004SJunchao Zhang const PetscInt MBS = M * BS; \
198cd620004SJunchao Zhang PetscFunctionBegin; \
199fcc7397dSJunchao Zhang if (!srcIdx) { /* src is contiguous */ \
200fcc7397dSJunchao Zhang u += srcStart * MBS; \
2019566063dSJacob Faibussowitsch PetscCall(CPPJoin4(UnpackAnd##Opname, Type, BS, EQ)(link, count, dstStart, dstOpt, dstIdx, dst, u)); \
202fcc7397dSJunchao Zhang } else if (srcOpt && !dstIdx) { /* src is 3D, dst is contiguous */ \
203fcc7397dSJunchao Zhang u += srcOpt->start[0] * MBS; \
204fcc7397dSJunchao Zhang v += dstStart * MBS; \
2059371c9d4SSatish Balay X = srcOpt->X[0]; \
2069371c9d4SSatish Balay Y = srcOpt->Y[0]; \
207fcc7397dSJunchao Zhang for (k = 0; k < srcOpt->dz[0]; k++) \
208fcc7397dSJunchao Zhang for (j = 0; j < srcOpt->dy[0]; j++) { \
209fcc7397dSJunchao Zhang for (i = 0; i < srcOpt->dx[0] * MBS; i++) OpApply(Op, v[i], u[(X * Y * k + X * j) * MBS + i]); \
210fcc7397dSJunchao Zhang v += srcOpt->dx[0] * MBS; \
211fcc7397dSJunchao Zhang } \
212fcc7397dSJunchao Zhang } else { /* all other cases */ \
213fcc7397dSJunchao Zhang for (i = 0; i < count; i++) { \
214fcc7397dSJunchao Zhang s = (!srcIdx ? srcStart + i : srcIdx[i]) * MBS; \
215fcc7397dSJunchao Zhang t = (!dstIdx ? dstStart + i : dstIdx[i]) * MBS; \
216cd620004SJunchao Zhang for (j = 0; j < M; j++) \
217fcc7397dSJunchao Zhang for (k = 0; k < BS; k++) OpApply(Op, v[t + j * BS + k], u[s + j * BS + k]); \
218fcc7397dSJunchao Zhang } \
219cd620004SJunchao Zhang } \
2203ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); \
221cd620004SJunchao Zhang }
222cd620004SJunchao Zhang
223cd620004SJunchao Zhang #define DEF_FetchAndOpLocal(Type, BS, EQ, Opname, Op, OpApply) \
224d71ae5a4SJacob Faibussowitsch static PetscErrorCode CPPJoin4(FetchAnd##Opname##Local, Type, BS, EQ)(PetscSFLink link, PetscInt count, PetscInt rootstart, PetscSFPackOpt rootopt, const PetscInt *rootidx, void *rootdata, PetscInt leafstart, PetscSFPackOpt leafopt, const PetscInt *leafidx, const void *leafdata, void *leafupdate) \
225d71ae5a4SJacob Faibussowitsch { \
226fcc7397dSJunchao Zhang Type *rdata = (Type *)rootdata, *lupdate = (Type *)leafupdate; \
227fcc7397dSJunchao Zhang const Type *ldata = (const Type *)leafdata; \
228fcc7397dSJunchao Zhang PetscInt i, j, k, r, l, bs = link->bs; \
229cd620004SJunchao Zhang const PetscInt M = (EQ) ? 1 : bs / BS; \
230cd620004SJunchao Zhang const PetscInt MBS = M * BS; \
231cd620004SJunchao Zhang PetscFunctionBegin; \
232fcc7397dSJunchao Zhang for (i = 0; i < count; i++) { \
233fcc7397dSJunchao Zhang r = (rootidx ? rootidx[i] : rootstart + i) * MBS; \
234fcc7397dSJunchao Zhang l = (leafidx ? leafidx[i] : leafstart + i) * MBS; \
235cd620004SJunchao Zhang for (j = 0; j < M; j++) \
236cd620004SJunchao Zhang for (k = 0; k < BS; k++) { \
237fcc7397dSJunchao Zhang lupdate[l + j * BS + k] = rdata[r + j * BS + k]; \
238fcc7397dSJunchao Zhang OpApply(Op, rdata[r + j * BS + k], ldata[l + j * BS + k]); \
23940e23c03SJunchao Zhang } \
24040e23c03SJunchao Zhang } \
2413ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); \
24240e23c03SJunchao Zhang }
24340e23c03SJunchao Zhang
244b23bfdefSJunchao Zhang /* Pack, Unpack/Fetch ops */
245b23bfdefSJunchao Zhang #define DEF_Pack(Type, BS, EQ) \
246d71ae5a4SJacob Faibussowitsch DEF_PackFunc(Type, BS, EQ) DEF_UnpackFunc(Type, BS, EQ) DEF_ScatterAndOp(Type, BS, EQ, Insert, =, OP_ASSIGN) static void CPPJoin4(PackInit_Pack, Type, BS, EQ)(PetscSFLink link) \
247d71ae5a4SJacob Faibussowitsch { \
248eb02082bSJunchao Zhang link->h_Pack = CPPJoin4(Pack, Type, BS, EQ); \
249eb02082bSJunchao Zhang link->h_UnpackAndInsert = CPPJoin4(UnpackAndInsert, Type, BS, EQ); \
250cd620004SJunchao Zhang link->h_ScatterAndInsert = CPPJoin4(ScatterAndInsert, Type, BS, EQ); \
25140e23c03SJunchao Zhang }
25240e23c03SJunchao Zhang
253b23bfdefSJunchao Zhang /* Add, Mult ops */
254b23bfdefSJunchao Zhang #define DEF_Add(Type, BS, EQ) \
255d71ae5a4SJacob Faibussowitsch DEF_UnpackAndOp(Type, BS, EQ, Add, +, OP_BINARY) DEF_UnpackAndOp(Type, BS, EQ, Mult, *, OP_BINARY) DEF_FetchAndOp(Type, BS, EQ, Add, +, OP_BINARY) DEF_ScatterAndOp(Type, BS, EQ, Add, +, OP_BINARY) DEF_ScatterAndOp(Type, BS, EQ, Mult, *, OP_BINARY) DEF_FetchAndOpLocal(Type, BS, EQ, Add, +, OP_BINARY) static void CPPJoin4(PackInit_Add, Type, BS, EQ)(PetscSFLink link) \
256d71ae5a4SJacob Faibussowitsch { \
257eb02082bSJunchao Zhang link->h_UnpackAndAdd = CPPJoin4(UnpackAndAdd, Type, BS, EQ); \
258eb02082bSJunchao Zhang link->h_UnpackAndMult = CPPJoin4(UnpackAndMult, Type, BS, EQ); \
259eb02082bSJunchao Zhang link->h_FetchAndAdd = CPPJoin4(FetchAndAdd, Type, BS, EQ); \
260cd620004SJunchao Zhang link->h_ScatterAndAdd = CPPJoin4(ScatterAndAdd, Type, BS, EQ); \
261cd620004SJunchao Zhang link->h_ScatterAndMult = CPPJoin4(ScatterAndMult, Type, BS, EQ); \
262cd620004SJunchao Zhang link->h_FetchAndAddLocal = CPPJoin4(FetchAndAddLocal, Type, BS, EQ); \
26340e23c03SJunchao Zhang }
26440e23c03SJunchao Zhang
265b23bfdefSJunchao Zhang /* Max, Min ops */
266b23bfdefSJunchao Zhang #define DEF_Cmp(Type, BS, EQ) \
267d71ae5a4SJacob Faibussowitsch DEF_UnpackAndOp(Type, BS, EQ, Max, PetscMax, OP_FUNCTION) DEF_UnpackAndOp(Type, BS, EQ, Min, PetscMin, OP_FUNCTION) DEF_ScatterAndOp(Type, BS, EQ, Max, PetscMax, OP_FUNCTION) DEF_ScatterAndOp(Type, BS, EQ, Min, PetscMin, OP_FUNCTION) static void CPPJoin4(PackInit_Compare, Type, BS, EQ)(PetscSFLink link) \
268d71ae5a4SJacob Faibussowitsch { \
269eb02082bSJunchao Zhang link->h_UnpackAndMax = CPPJoin4(UnpackAndMax, Type, BS, EQ); \
270eb02082bSJunchao Zhang link->h_UnpackAndMin = CPPJoin4(UnpackAndMin, Type, BS, EQ); \
271cd620004SJunchao Zhang link->h_ScatterAndMax = CPPJoin4(ScatterAndMax, Type, BS, EQ); \
272cd620004SJunchao Zhang link->h_ScatterAndMin = CPPJoin4(ScatterAndMin, Type, BS, EQ); \
273b23bfdefSJunchao Zhang }
274b23bfdefSJunchao Zhang
275b23bfdefSJunchao Zhang /* Logical ops.
276cd620004SJunchao Zhang The operator in OP_LXOR should be empty but is ||. It is not used. Put here to avoid
27740e23c03SJunchao Zhang the compilation warning "empty macro arguments are undefined in ISO C90"
27840e23c03SJunchao Zhang */
279b23bfdefSJunchao Zhang #define DEF_Log(Type, BS, EQ) \
280d71ae5a4SJacob Faibussowitsch DEF_UnpackAndOp(Type, BS, EQ, LAND, &&, OP_BINARY) DEF_UnpackAndOp(Type, BS, EQ, LOR, ||, OP_BINARY) DEF_UnpackAndOp(Type, BS, EQ, LXOR, ||, OP_LXOR) DEF_ScatterAndOp(Type, BS, EQ, LAND, &&, OP_BINARY) DEF_ScatterAndOp(Type, BS, EQ, LOR, ||, OP_BINARY) DEF_ScatterAndOp(Type, BS, EQ, LXOR, ||, OP_LXOR) static void CPPJoin4(PackInit_Logical, Type, BS, EQ)(PetscSFLink link) \
281d71ae5a4SJacob Faibussowitsch { \
282eb02082bSJunchao Zhang link->h_UnpackAndLAND = CPPJoin4(UnpackAndLAND, Type, BS, EQ); \
283eb02082bSJunchao Zhang link->h_UnpackAndLOR = CPPJoin4(UnpackAndLOR, Type, BS, EQ); \
284eb02082bSJunchao Zhang link->h_UnpackAndLXOR = CPPJoin4(UnpackAndLXOR, Type, BS, EQ); \
285cd620004SJunchao Zhang link->h_ScatterAndLAND = CPPJoin4(ScatterAndLAND, Type, BS, EQ); \
286cd620004SJunchao Zhang link->h_ScatterAndLOR = CPPJoin4(ScatterAndLOR, Type, BS, EQ); \
287cd620004SJunchao Zhang link->h_ScatterAndLXOR = CPPJoin4(ScatterAndLXOR, Type, BS, EQ); \
28840e23c03SJunchao Zhang }
28940e23c03SJunchao Zhang
290b23bfdefSJunchao Zhang /* Bitwise ops */
291b23bfdefSJunchao Zhang #define DEF_Bit(Type, BS, EQ) \
292d71ae5a4SJacob Faibussowitsch DEF_UnpackAndOp(Type, BS, EQ, BAND, &, OP_BINARY) DEF_UnpackAndOp(Type, BS, EQ, BOR, |, OP_BINARY) DEF_UnpackAndOp(Type, BS, EQ, BXOR, ^, OP_BINARY) DEF_ScatterAndOp(Type, BS, EQ, BAND, &, OP_BINARY) DEF_ScatterAndOp(Type, BS, EQ, BOR, |, OP_BINARY) DEF_ScatterAndOp(Type, BS, EQ, BXOR, ^, OP_BINARY) static void CPPJoin4(PackInit_Bitwise, Type, BS, EQ)(PetscSFLink link) \
293d71ae5a4SJacob Faibussowitsch { \
294eb02082bSJunchao Zhang link->h_UnpackAndBAND = CPPJoin4(UnpackAndBAND, Type, BS, EQ); \
295eb02082bSJunchao Zhang link->h_UnpackAndBOR = CPPJoin4(UnpackAndBOR, Type, BS, EQ); \
296eb02082bSJunchao Zhang link->h_UnpackAndBXOR = CPPJoin4(UnpackAndBXOR, Type, BS, EQ); \
297cd620004SJunchao Zhang link->h_ScatterAndBAND = CPPJoin4(ScatterAndBAND, Type, BS, EQ); \
298cd620004SJunchao Zhang link->h_ScatterAndBOR = CPPJoin4(ScatterAndBOR, Type, BS, EQ); \
299cd620004SJunchao Zhang link->h_ScatterAndBXOR = CPPJoin4(ScatterAndBXOR, Type, BS, EQ); \
30040e23c03SJunchao Zhang }
30140e23c03SJunchao Zhang
302cd620004SJunchao Zhang /* Maxloc, Minloc ops */
303cd620004SJunchao Zhang #define DEF_Xloc(Type, BS, EQ) \
304d71ae5a4SJacob Faibussowitsch DEF_UnpackAndOp(Type, BS, EQ, Max, >, OP_XLOC) DEF_UnpackAndOp(Type, BS, EQ, Min, <, OP_XLOC) DEF_ScatterAndOp(Type, BS, EQ, Max, >, OP_XLOC) DEF_ScatterAndOp(Type, BS, EQ, Min, <, OP_XLOC) static void CPPJoin4(PackInit_Xloc, Type, BS, EQ)(PetscSFLink link) \
305d71ae5a4SJacob Faibussowitsch { \
306cd620004SJunchao Zhang link->h_UnpackAndMaxloc = CPPJoin4(UnpackAndMax, Type, BS, EQ); \
307cd620004SJunchao Zhang link->h_UnpackAndMinloc = CPPJoin4(UnpackAndMin, Type, BS, EQ); \
308cd620004SJunchao Zhang link->h_ScatterAndMaxloc = CPPJoin4(ScatterAndMax, Type, BS, EQ); \
309cd620004SJunchao Zhang link->h_ScatterAndMinloc = CPPJoin4(ScatterAndMin, Type, BS, EQ); \
31040e23c03SJunchao Zhang }
31140e23c03SJunchao Zhang
312b23bfdefSJunchao Zhang #define DEF_IntegerType(Type, BS, EQ) \
313d71ae5a4SJacob Faibussowitsch DEF_Pack(Type, BS, EQ) DEF_Add(Type, BS, EQ) DEF_Cmp(Type, BS, EQ) DEF_Log(Type, BS, EQ) DEF_Bit(Type, BS, EQ) static void CPPJoin4(PackInit_IntegerType, Type, BS, EQ)(PetscSFLink link) \
314d71ae5a4SJacob Faibussowitsch { \
315b23bfdefSJunchao Zhang CPPJoin4(PackInit_Pack, Type, BS, EQ)(link); \
316b23bfdefSJunchao Zhang CPPJoin4(PackInit_Add, Type, BS, EQ)(link); \
317b23bfdefSJunchao Zhang CPPJoin4(PackInit_Compare, Type, BS, EQ)(link); \
318b23bfdefSJunchao Zhang CPPJoin4(PackInit_Logical, Type, BS, EQ)(link); \
319b23bfdefSJunchao Zhang CPPJoin4(PackInit_Bitwise, Type, BS, EQ)(link); \
32040e23c03SJunchao Zhang }
32140e23c03SJunchao Zhang
322b23bfdefSJunchao Zhang #define DEF_RealType(Type, BS, EQ) \
323d71ae5a4SJacob Faibussowitsch DEF_Pack(Type, BS, EQ) DEF_Add(Type, BS, EQ) DEF_Cmp(Type, BS, EQ) static void CPPJoin4(PackInit_RealType, Type, BS, EQ)(PetscSFLink link) \
324d71ae5a4SJacob Faibussowitsch { \
325b23bfdefSJunchao Zhang CPPJoin4(PackInit_Pack, Type, BS, EQ)(link); \
326b23bfdefSJunchao Zhang CPPJoin4(PackInit_Add, Type, BS, EQ)(link); \
327b23bfdefSJunchao Zhang CPPJoin4(PackInit_Compare, Type, BS, EQ)(link); \
328b23bfdefSJunchao Zhang }
32940e23c03SJunchao Zhang
33040e23c03SJunchao Zhang #if defined(PETSC_HAVE_COMPLEX)
331b23bfdefSJunchao Zhang #define DEF_ComplexType(Type, BS, EQ) \
332d71ae5a4SJacob Faibussowitsch DEF_Pack(Type, BS, EQ) DEF_Add(Type, BS, EQ) static void CPPJoin4(PackInit_ComplexType, Type, BS, EQ)(PetscSFLink link) \
333d71ae5a4SJacob Faibussowitsch { \
334b23bfdefSJunchao Zhang CPPJoin4(PackInit_Pack, Type, BS, EQ)(link); \
335b23bfdefSJunchao Zhang CPPJoin4(PackInit_Add, Type, BS, EQ)(link); \
336b23bfdefSJunchao Zhang }
33740e23c03SJunchao Zhang #endif
338b23bfdefSJunchao Zhang
339b23bfdefSJunchao Zhang #define DEF_DumbType(Type, BS, EQ) \
340d71ae5a4SJacob Faibussowitsch DEF_Pack(Type, BS, EQ) static void CPPJoin4(PackInit_DumbType, Type, BS, EQ)(PetscSFLink link) \
341d71ae5a4SJacob Faibussowitsch { \
342b23bfdefSJunchao Zhang CPPJoin4(PackInit_Pack, Type, BS, EQ)(link); \
343b23bfdefSJunchao Zhang }
344b23bfdefSJunchao Zhang
345b23bfdefSJunchao Zhang /* Maxloc, Minloc */
346cd620004SJunchao Zhang #define DEF_PairType(Type, BS, EQ) \
347d71ae5a4SJacob Faibussowitsch DEF_Pack(Type, BS, EQ) DEF_Xloc(Type, BS, EQ) static void CPPJoin4(PackInit_PairType, Type, BS, EQ)(PetscSFLink link) \
348d71ae5a4SJacob Faibussowitsch { \
349cd620004SJunchao Zhang CPPJoin4(PackInit_Pack, Type, BS, EQ)(link); \
350cd620004SJunchao Zhang CPPJoin4(PackInit_Xloc, Type, BS, EQ)(link); \
351b23bfdefSJunchao Zhang }
352b23bfdefSJunchao Zhang
353b23bfdefSJunchao Zhang DEF_IntegerType(PetscInt, 1, 1) /* unit = 1 MPIU_INT */
354b23bfdefSJunchao Zhang DEF_IntegerType(PetscInt, 2, 1) /* unit = 2 MPIU_INTs */
355b23bfdefSJunchao Zhang DEF_IntegerType(PetscInt, 4, 1) /* unit = 4 MPIU_INTs */
356b23bfdefSJunchao Zhang DEF_IntegerType(PetscInt, 8, 1) /* unit = 8 MPIU_INTs */
357b23bfdefSJunchao Zhang DEF_IntegerType(PetscInt, 1, 0) /* unit = 1*n MPIU_INTs, n>1 */
358b23bfdefSJunchao Zhang DEF_IntegerType(PetscInt, 2, 0) /* unit = 2*n MPIU_INTs, n>1 */
359b23bfdefSJunchao Zhang DEF_IntegerType(PetscInt, 4, 0) /* unit = 4*n MPIU_INTs, n>1 */
360b23bfdefSJunchao Zhang DEF_IntegerType(PetscInt, 8, 0) /* unit = 8*n MPIU_INTs, n>1. Routines with bigger BS are tried first. */
361b23bfdefSJunchao Zhang
362b23bfdefSJunchao Zhang #if defined(PETSC_USE_64BIT_INDICES) /* Do not need (though it is OK) to generate redundant functions if PetscInt is int */
3639371c9d4SSatish Balay DEF_IntegerType(int, 1, 1) DEF_IntegerType(int, 2, 1) DEF_IntegerType(int, 4, 1) DEF_IntegerType(int, 8, 1) DEF_IntegerType(int, 1, 0) DEF_IntegerType(int, 2, 0) DEF_IntegerType(int, 4, 0) DEF_IntegerType(int, 8, 0)
364b23bfdefSJunchao Zhang #endif
365b23bfdefSJunchao Zhang
366b23bfdefSJunchao Zhang /* The typedefs are used to get a typename without space that CPPJoin can handle */
367b23bfdefSJunchao Zhang typedef signed char SignedChar;
3689371c9d4SSatish Balay DEF_IntegerType(SignedChar, 1, 1) DEF_IntegerType(SignedChar, 2, 1) DEF_IntegerType(SignedChar, 4, 1) DEF_IntegerType(SignedChar, 8, 1) DEF_IntegerType(SignedChar, 1, 0) DEF_IntegerType(SignedChar, 2, 0) DEF_IntegerType(SignedChar, 4, 0) DEF_IntegerType(SignedChar, 8, 0)
369b23bfdefSJunchao Zhang
370b23bfdefSJunchao Zhang typedef unsigned char UnsignedChar;
3719371c9d4SSatish Balay DEF_IntegerType(UnsignedChar, 1, 1) DEF_IntegerType(UnsignedChar, 2, 1) DEF_IntegerType(UnsignedChar, 4, 1) DEF_IntegerType(UnsignedChar, 8, 1) DEF_IntegerType(UnsignedChar, 1, 0) DEF_IntegerType(UnsignedChar, 2, 0) DEF_IntegerType(UnsignedChar, 4, 0) DEF_IntegerType(UnsignedChar, 8, 0)
372b23bfdefSJunchao Zhang
3739371c9d4SSatish Balay DEF_RealType(PetscReal, 1, 1) DEF_RealType(PetscReal, 2, 1) DEF_RealType(PetscReal, 4, 1) DEF_RealType(PetscReal, 8, 1) DEF_RealType(PetscReal, 1, 0) DEF_RealType(PetscReal, 2, 0) DEF_RealType(PetscReal, 4, 0) DEF_RealType(PetscReal, 8, 0)
374b23bfdefSJunchao Zhang #if defined(PETSC_HAVE_COMPLEX)
3759371c9d4SSatish Balay DEF_ComplexType(PetscComplex, 1, 1) DEF_ComplexType(PetscComplex, 2, 1) DEF_ComplexType(PetscComplex, 4, 1) DEF_ComplexType(PetscComplex, 8, 1) DEF_ComplexType(PetscComplex, 1, 0) DEF_ComplexType(PetscComplex, 2, 0) DEF_ComplexType(PetscComplex, 4, 0) DEF_ComplexType(PetscComplex, 8, 0)
376b23bfdefSJunchao Zhang #endif
377b23bfdefSJunchao Zhang
378cd620004SJunchao Zhang #define PairType(Type1, Type2) Type1##_##Type2
3799371c9d4SSatish Balay typedef struct {
3809371c9d4SSatish Balay int u;
3819371c9d4SSatish Balay int i;
3829371c9d4SSatish Balay } PairType(int, int);
3839371c9d4SSatish Balay typedef struct {
3849371c9d4SSatish Balay PetscInt u;
3859371c9d4SSatish Balay PetscInt i;
3869371c9d4SSatish Balay } PairType(PetscInt, PetscInt);
3879371c9d4SSatish Balay DEF_PairType(PairType(int, int), 1, 1) DEF_PairType(PairType(PetscInt, PetscInt), 1, 1)
388b23bfdefSJunchao Zhang
389b23bfdefSJunchao Zhang /* If we don't know the basic type, we treat it as a stream of chars or ints */
3909371c9d4SSatish Balay DEF_DumbType(char, 1, 1) DEF_DumbType(char, 2, 1) DEF_DumbType(char, 4, 1) DEF_DumbType(char, 1, 0) DEF_DumbType(char, 2, 0) DEF_DumbType(char, 4, 0)
391b23bfdefSJunchao Zhang
392eb02082bSJunchao Zhang typedef int DumbInt; /* To have a different name than 'int' used above. The name is used to make routine names. */
3939371c9d4SSatish Balay DEF_DumbType(DumbInt, 1, 1) DEF_DumbType(DumbInt, 2, 1) DEF_DumbType(DumbInt, 4, 1) DEF_DumbType(DumbInt, 8, 1) DEF_DumbType(DumbInt, 1, 0) DEF_DumbType(DumbInt, 2, 0) DEF_DumbType(DumbInt, 4, 0) DEF_DumbType(DumbInt, 8, 0)
39440e23c03SJunchao Zhang
PetscSFLinkDestroy(PetscSF sf,PetscSFLink link)395d71ae5a4SJacob Faibussowitsch PetscErrorCode PetscSFLinkDestroy(PetscSF sf, PetscSFLink link)
396d71ae5a4SJacob Faibussowitsch {
397cd620004SJunchao Zhang PetscSF_Basic *bas = (PetscSF_Basic *)sf->data;
39871438e86SJunchao Zhang PetscInt i, nreqs = (bas->nrootreqs + sf->nleafreqs) * 8;
39971438e86SJunchao Zhang
40071438e86SJunchao Zhang PetscFunctionBegin;
40171438e86SJunchao Zhang /* Destroy device-specific fields */
4029566063dSJacob Faibussowitsch if (link->deviceinited) PetscCall((*link->Destroy)(sf, link));
40371438e86SJunchao Zhang
40471438e86SJunchao Zhang /* Destroy host related fields */
4059566063dSJacob Faibussowitsch if (!link->isbuiltin) PetscCallMPI(MPI_Type_free(&link->unit));
40671438e86SJunchao Zhang if (!link->use_nvshmem) {
40771438e86SJunchao Zhang for (i = 0; i < nreqs; i++) { /* Persistent reqs must be freed. */
4089566063dSJacob Faibussowitsch if (link->reqs[i] != MPI_REQUEST_NULL) PetscCallMPI(MPI_Request_free(&link->reqs[i]));
40971438e86SJunchao Zhang }
4109566063dSJacob Faibussowitsch PetscCall(PetscFree(link->reqs));
41171438e86SJunchao Zhang for (i = PETSCSF_LOCAL; i <= PETSCSF_REMOTE; i++) {
4129566063dSJacob Faibussowitsch PetscCall(PetscFree(link->rootbuf_alloc[i][PETSC_MEMTYPE_HOST]));
4139566063dSJacob Faibussowitsch PetscCall(PetscFree(link->leafbuf_alloc[i][PETSC_MEMTYPE_HOST]));
41471438e86SJunchao Zhang }
41571438e86SJunchao Zhang }
4169566063dSJacob Faibussowitsch PetscCall(PetscFree(link));
4173ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS);
41871438e86SJunchao Zhang }
41971438e86SJunchao Zhang
PetscSFLinkCreate(PetscSF sf,MPI_Datatype unit,PetscMemType rootmtype,const void * rootdata,PetscMemType leafmtype,const void * leafdata,MPI_Op op,PetscSFOperation sfop,PetscSFLink * mylink)420d71ae5a4SJacob Faibussowitsch PetscErrorCode PetscSFLinkCreate(PetscSF sf, MPI_Datatype unit, PetscMemType rootmtype, const void *rootdata, PetscMemType leafmtype, const void *leafdata, MPI_Op op, PetscSFOperation sfop, PetscSFLink *mylink)
421d71ae5a4SJacob Faibussowitsch {
422cd620004SJunchao Zhang PetscFunctionBegin;
4239566063dSJacob Faibussowitsch PetscCall(PetscSFSetErrorOnUnsupportedOverlap(sf, unit, rootdata, leafdata));
42471438e86SJunchao Zhang #if defined(PETSC_HAVE_NVSHMEM)
42571438e86SJunchao Zhang {
42671438e86SJunchao Zhang PetscBool use_nvshmem;
4279566063dSJacob Faibussowitsch PetscCall(PetscSFLinkNvshmemCheck(sf, rootmtype, rootdata, leafmtype, leafdata, &use_nvshmem));
42871438e86SJunchao Zhang if (use_nvshmem) {
4299566063dSJacob Faibussowitsch PetscCall(PetscSFLinkCreate_NVSHMEM(sf, unit, rootmtype, rootdata, leafmtype, leafdata, op, sfop, mylink));
4303ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS);
431cd620004SJunchao Zhang }
432cd620004SJunchao Zhang }
4337fd2d3dbSJunchao Zhang #endif
4349566063dSJacob Faibussowitsch PetscCall(PetscSFLinkCreate_MPI(sf, unit, rootmtype, rootdata, leafmtype, leafdata, op, sfop, mylink));
4353ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS);
436cd620004SJunchao Zhang }
437cd620004SJunchao Zhang
PetscSFLinkGetInUse(PetscSF sf,MPI_Datatype unit,const void * rootdata,const void * leafdata,PetscCopyMode cmode,PetscSFLink * mylink)438d71ae5a4SJacob Faibussowitsch PetscErrorCode PetscSFLinkGetInUse(PetscSF sf, MPI_Datatype unit, const void *rootdata, const void *leafdata, PetscCopyMode cmode, PetscSFLink *mylink)
439d71ae5a4SJacob Faibussowitsch {
440cd620004SJunchao Zhang PetscSFLink link, *p;
44140e23c03SJunchao Zhang PetscSF_Basic *bas = (PetscSF_Basic *)sf->data;
44240e23c03SJunchao Zhang
44340e23c03SJunchao Zhang PetscFunctionBegin;
44440e23c03SJunchao Zhang /* Look for types in cache */
44540e23c03SJunchao Zhang for (p = &bas->inuse; (link = *p); p = &link->next) {
44640e23c03SJunchao Zhang PetscBool match;
4479566063dSJacob Faibussowitsch PetscCall(MPIPetsc_Type_compare(unit, link->unit, &match));
448637e6665SJunchao Zhang if (match && (rootdata == link->rootdata) && (leafdata == link->leafdata)) {
44940e23c03SJunchao Zhang switch (cmode) {
450d71ae5a4SJacob Faibussowitsch case PETSC_OWN_POINTER:
451d71ae5a4SJacob Faibussowitsch *p = link->next;
452d71ae5a4SJacob Faibussowitsch break; /* Remove from inuse list */
453d71ae5a4SJacob Faibussowitsch case PETSC_USE_POINTER:
454d71ae5a4SJacob Faibussowitsch break;
455d71ae5a4SJacob Faibussowitsch default:
456d71ae5a4SJacob Faibussowitsch SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "invalid cmode");
45740e23c03SJunchao Zhang }
45840e23c03SJunchao Zhang *mylink = link;
4593ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS);
46040e23c03SJunchao Zhang }
46140e23c03SJunchao Zhang }
46240e23c03SJunchao Zhang SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Could not find pack");
46340e23c03SJunchao Zhang }
46440e23c03SJunchao Zhang
PetscSFLinkReclaim(PetscSF sf,PetscSFLink * mylink)465d71ae5a4SJacob Faibussowitsch PetscErrorCode PetscSFLinkReclaim(PetscSF sf, PetscSFLink *mylink)
466d71ae5a4SJacob Faibussowitsch {
46740e23c03SJunchao Zhang PetscSF_Basic *bas = (PetscSF_Basic *)sf->data;
46871438e86SJunchao Zhang PetscSFLink link = *mylink;
46940e23c03SJunchao Zhang
47040e23c03SJunchao Zhang PetscFunctionBegin;
47171438e86SJunchao Zhang link->rootdata = NULL;
47271438e86SJunchao Zhang link->leafdata = NULL;
47371438e86SJunchao Zhang link->next = bas->avail;
47471438e86SJunchao Zhang bas->avail = link;
47571438e86SJunchao Zhang *mylink = NULL;
4763ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS);
477eb02082bSJunchao Zhang }
478eb02082bSJunchao Zhang
4799d1c8addSJunchao Zhang /* Error out on unsupported overlapped communications */
PetscSFSetErrorOnUnsupportedOverlap(PetscSF sf,MPI_Datatype unit,const void * rootdata,const void * leafdata)480d71ae5a4SJacob Faibussowitsch PetscErrorCode PetscSFSetErrorOnUnsupportedOverlap(PetscSF sf, MPI_Datatype unit, const void *rootdata, const void *leafdata)
481d71ae5a4SJacob Faibussowitsch {
482cd620004SJunchao Zhang PetscSFLink link, *p;
4839d1c8addSJunchao Zhang PetscSF_Basic *bas = (PetscSF_Basic *)sf->data;
4849d1c8addSJunchao Zhang PetscBool match;
4859d1c8addSJunchao Zhang
4869d1c8addSJunchao Zhang PetscFunctionBegin;
487b458e8f1SJose E. Roman if (PetscDefined(USE_DEBUG)) {
48818fb5014SJunchao Zhang /* Look up links in use and error out if there is a match. When both rootdata and leafdata are NULL, ignore
48918fb5014SJunchao Zhang the potential overlapping since this process does not participate in communication. Overlapping is harmless.
49018fb5014SJunchao Zhang */
491637e6665SJunchao Zhang if (rootdata || leafdata) {
4929d1c8addSJunchao Zhang for (p = &bas->inuse; (link = *p); p = &link->next) {
4939566063dSJacob Faibussowitsch PetscCall(MPIPetsc_Type_compare(unit, link->unit, &match));
494c9cc58a2SBarry Smith PetscCheck(!match || rootdata != link->rootdata || leafdata != link->leafdata, PETSC_COMM_SELF, PETSC_ERR_SUP, "Overlapped PetscSF with the same rootdata(%p), leafdata(%p) and data type. Undo the overlapping to avoid the error.", rootdata, leafdata);
4959d1c8addSJunchao Zhang }
49618fb5014SJunchao Zhang }
497b458e8f1SJose E. Roman }
4983ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS);
4999d1c8addSJunchao Zhang }
5009d1c8addSJunchao Zhang
PetscSFLinkMemcpy_Host(PetscSFLink link,PetscMemType dstmtype,void * dst,PetscMemType srcmtype,const void * src,size_t n)501d71ae5a4SJacob Faibussowitsch static PetscErrorCode PetscSFLinkMemcpy_Host(PetscSFLink link, PetscMemType dstmtype, void *dst, PetscMemType srcmtype, const void *src, size_t n)
502d71ae5a4SJacob Faibussowitsch {
50320c24465SJunchao Zhang PetscFunctionBegin;
5049566063dSJacob Faibussowitsch if (n) PetscCall(PetscMemcpy(dst, src, n));
5053ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS);
50620c24465SJunchao Zhang }
50720c24465SJunchao Zhang
PetscSFLinkSetUp_Host(PetscSF sf,PetscSFLink link,MPI_Datatype unit)508d71ae5a4SJacob Faibussowitsch PetscErrorCode PetscSFLinkSetUp_Host(PetscSF sf, PetscSFLink link, MPI_Datatype unit)
509d71ae5a4SJacob Faibussowitsch {
510b23bfdefSJunchao Zhang PetscInt nSignedChar = 0, nUnsignedChar = 0, nInt = 0, nPetscInt = 0, nPetscReal = 0;
511b23bfdefSJunchao Zhang PetscBool is2Int, is2PetscInt;
512d279a5e3SJunchao Zhang MPIU_Count ni, na, nc, nd;
513d279a5e3SJunchao Zhang PetscMPIInt combiner;
51440e23c03SJunchao Zhang #if defined(PETSC_HAVE_COMPLEX)
515b23bfdefSJunchao Zhang PetscInt nPetscComplex = 0;
51640e23c03SJunchao Zhang #endif
51740e23c03SJunchao Zhang
51840e23c03SJunchao Zhang PetscFunctionBegin;
5199566063dSJacob Faibussowitsch PetscCall(MPIPetsc_Type_compare_contig(unit, MPI_SIGNED_CHAR, &nSignedChar));
5209566063dSJacob Faibussowitsch PetscCall(MPIPetsc_Type_compare_contig(unit, MPI_UNSIGNED_CHAR, &nUnsignedChar));
521b23bfdefSJunchao Zhang /* MPI_CHAR is treated below as a dumb type that does not support reduction according to MPI standard */
5229566063dSJacob Faibussowitsch PetscCall(MPIPetsc_Type_compare_contig(unit, MPI_INT, &nInt));
5239566063dSJacob Faibussowitsch PetscCall(MPIPetsc_Type_compare_contig(unit, MPIU_INT, &nPetscInt));
5249566063dSJacob Faibussowitsch PetscCall(MPIPetsc_Type_compare_contig(unit, MPIU_REAL, &nPetscReal));
52540e23c03SJunchao Zhang #if defined(PETSC_HAVE_COMPLEX)
5269566063dSJacob Faibussowitsch PetscCall(MPIPetsc_Type_compare_contig(unit, MPIU_COMPLEX, &nPetscComplex));
52740e23c03SJunchao Zhang #endif
5289566063dSJacob Faibussowitsch PetscCall(MPIPetsc_Type_compare(unit, MPI_2INT, &is2Int));
5299566063dSJacob Faibussowitsch PetscCall(MPIPetsc_Type_compare(unit, MPIU_2INT, &is2PetscInt));
5300dd791a8SStefano Zampini /* TODO: should we also handle Fortran MPI_2REAL? */
531d279a5e3SJunchao Zhang PetscCallMPI(MPIPetsc_Type_get_envelope(unit, &ni, &na, &nc, &nd, &combiner));
5325ad15460SJunchao Zhang link->isbuiltin = (combiner == MPI_COMBINER_NAMED) ? PETSC_TRUE : PETSC_FALSE; /* unit is MPI builtin */
533b23bfdefSJunchao Zhang link->bs = 1; /* default */
53440e23c03SJunchao Zhang
535eb02082bSJunchao Zhang if (is2Int) {
536cd620004SJunchao Zhang PackInit_PairType_int_int_1_1(link);
537eb02082bSJunchao Zhang link->bs = 1;
538eb02082bSJunchao Zhang link->unitbytes = 2 * sizeof(int);
5395ad15460SJunchao Zhang link->isbuiltin = PETSC_TRUE; /* unit is PETSc builtin */
540eb02082bSJunchao Zhang link->basicunit = MPI_2INT;
5415ad15460SJunchao Zhang link->unit = MPI_2INT;
542eb02082bSJunchao Zhang } else if (is2PetscInt) { /* TODO: when is2PetscInt and nPetscInt=2, we don't know which path to take. The two paths support different ops. */
543cd620004SJunchao Zhang PackInit_PairType_PetscInt_PetscInt_1_1(link);
544eb02082bSJunchao Zhang link->bs = 1;
545eb02082bSJunchao Zhang link->unitbytes = 2 * sizeof(PetscInt);
546eb02082bSJunchao Zhang link->basicunit = MPIU_2INT;
5475ad15460SJunchao Zhang link->isbuiltin = PETSC_TRUE; /* unit is PETSc builtin */
5485ad15460SJunchao Zhang link->unit = MPIU_2INT;
549eb02082bSJunchao Zhang } else if (nPetscReal) {
5509371c9d4SSatish Balay if (nPetscReal == 8) PackInit_RealType_PetscReal_8_1(link);
5519371c9d4SSatish Balay else if (nPetscReal % 8 == 0) PackInit_RealType_PetscReal_8_0(link);
5529371c9d4SSatish Balay else if (nPetscReal == 4) PackInit_RealType_PetscReal_4_1(link);
5539371c9d4SSatish Balay else if (nPetscReal % 4 == 0) PackInit_RealType_PetscReal_4_0(link);
5549371c9d4SSatish Balay else if (nPetscReal == 2) PackInit_RealType_PetscReal_2_1(link);
5559371c9d4SSatish Balay else if (nPetscReal % 2 == 0) PackInit_RealType_PetscReal_2_0(link);
5569371c9d4SSatish Balay else if (nPetscReal == 1) PackInit_RealType_PetscReal_1_1(link);
5579371c9d4SSatish Balay else if (nPetscReal % 1 == 0) PackInit_RealType_PetscReal_1_0(link);
558b23bfdefSJunchao Zhang link->bs = nPetscReal;
559eb02082bSJunchao Zhang link->unitbytes = nPetscReal * sizeof(PetscReal);
56040e23c03SJunchao Zhang link->basicunit = MPIU_REAL;
5619371c9d4SSatish Balay if (link->bs == 1) {
5629371c9d4SSatish Balay link->isbuiltin = PETSC_TRUE;
5639371c9d4SSatish Balay link->unit = MPIU_REAL;
5649371c9d4SSatish Balay }
565b23bfdefSJunchao Zhang } else if (nPetscInt) {
5669371c9d4SSatish Balay if (nPetscInt == 8) PackInit_IntegerType_PetscInt_8_1(link);
5679371c9d4SSatish Balay else if (nPetscInt % 8 == 0) PackInit_IntegerType_PetscInt_8_0(link);
5689371c9d4SSatish Balay else if (nPetscInt == 4) PackInit_IntegerType_PetscInt_4_1(link);
5699371c9d4SSatish Balay else if (nPetscInt % 4 == 0) PackInit_IntegerType_PetscInt_4_0(link);
5709371c9d4SSatish Balay else if (nPetscInt == 2) PackInit_IntegerType_PetscInt_2_1(link);
5719371c9d4SSatish Balay else if (nPetscInt % 2 == 0) PackInit_IntegerType_PetscInt_2_0(link);
5729371c9d4SSatish Balay else if (nPetscInt == 1) PackInit_IntegerType_PetscInt_1_1(link);
5739371c9d4SSatish Balay else if (nPetscInt % 1 == 0) PackInit_IntegerType_PetscInt_1_0(link);
574b23bfdefSJunchao Zhang link->bs = nPetscInt;
575eb02082bSJunchao Zhang link->unitbytes = nPetscInt * sizeof(PetscInt);
576b23bfdefSJunchao Zhang link->basicunit = MPIU_INT;
5779371c9d4SSatish Balay if (link->bs == 1) {
5789371c9d4SSatish Balay link->isbuiltin = PETSC_TRUE;
5799371c9d4SSatish Balay link->unit = MPIU_INT;
5809371c9d4SSatish Balay }
581b23bfdefSJunchao Zhang #if defined(PETSC_USE_64BIT_INDICES)
582b23bfdefSJunchao Zhang } else if (nInt) {
5839371c9d4SSatish Balay if (nInt == 8) PackInit_IntegerType_int_8_1(link);
5849371c9d4SSatish Balay else if (nInt % 8 == 0) PackInit_IntegerType_int_8_0(link);
5859371c9d4SSatish Balay else if (nInt == 4) PackInit_IntegerType_int_4_1(link);
5869371c9d4SSatish Balay else if (nInt % 4 == 0) PackInit_IntegerType_int_4_0(link);
5879371c9d4SSatish Balay else if (nInt == 2) PackInit_IntegerType_int_2_1(link);
5889371c9d4SSatish Balay else if (nInt % 2 == 0) PackInit_IntegerType_int_2_0(link);
5899371c9d4SSatish Balay else if (nInt == 1) PackInit_IntegerType_int_1_1(link);
5909371c9d4SSatish Balay else if (nInt % 1 == 0) PackInit_IntegerType_int_1_0(link);
591b23bfdefSJunchao Zhang link->bs = nInt;
592eb02082bSJunchao Zhang link->unitbytes = nInt * sizeof(int);
593b23bfdefSJunchao Zhang link->basicunit = MPI_INT;
5949371c9d4SSatish Balay if (link->bs == 1) {
5959371c9d4SSatish Balay link->isbuiltin = PETSC_TRUE;
5969371c9d4SSatish Balay link->unit = MPI_INT;
5979371c9d4SSatish Balay }
598b23bfdefSJunchao Zhang #endif
599b23bfdefSJunchao Zhang } else if (nSignedChar) {
6009371c9d4SSatish Balay if (nSignedChar == 8) PackInit_IntegerType_SignedChar_8_1(link);
6019371c9d4SSatish Balay else if (nSignedChar % 8 == 0) PackInit_IntegerType_SignedChar_8_0(link);
6029371c9d4SSatish Balay else if (nSignedChar == 4) PackInit_IntegerType_SignedChar_4_1(link);
6039371c9d4SSatish Balay else if (nSignedChar % 4 == 0) PackInit_IntegerType_SignedChar_4_0(link);
6049371c9d4SSatish Balay else if (nSignedChar == 2) PackInit_IntegerType_SignedChar_2_1(link);
6059371c9d4SSatish Balay else if (nSignedChar % 2 == 0) PackInit_IntegerType_SignedChar_2_0(link);
6069371c9d4SSatish Balay else if (nSignedChar == 1) PackInit_IntegerType_SignedChar_1_1(link);
6079371c9d4SSatish Balay else if (nSignedChar % 1 == 0) PackInit_IntegerType_SignedChar_1_0(link);
608b23bfdefSJunchao Zhang link->bs = nSignedChar;
609eb02082bSJunchao Zhang link->unitbytes = nSignedChar * sizeof(SignedChar);
610b23bfdefSJunchao Zhang link->basicunit = MPI_SIGNED_CHAR;
6119371c9d4SSatish Balay if (link->bs == 1) {
6129371c9d4SSatish Balay link->isbuiltin = PETSC_TRUE;
6139371c9d4SSatish Balay link->unit = MPI_SIGNED_CHAR;
6149371c9d4SSatish Balay }
615b23bfdefSJunchao Zhang } else if (nUnsignedChar) {
6169371c9d4SSatish Balay if (nUnsignedChar == 8) PackInit_IntegerType_UnsignedChar_8_1(link);
6179371c9d4SSatish Balay else if (nUnsignedChar % 8 == 0) PackInit_IntegerType_UnsignedChar_8_0(link);
6189371c9d4SSatish Balay else if (nUnsignedChar == 4) PackInit_IntegerType_UnsignedChar_4_1(link);
6199371c9d4SSatish Balay else if (nUnsignedChar % 4 == 0) PackInit_IntegerType_UnsignedChar_4_0(link);
6209371c9d4SSatish Balay else if (nUnsignedChar == 2) PackInit_IntegerType_UnsignedChar_2_1(link);
6219371c9d4SSatish Balay else if (nUnsignedChar % 2 == 0) PackInit_IntegerType_UnsignedChar_2_0(link);
6229371c9d4SSatish Balay else if (nUnsignedChar == 1) PackInit_IntegerType_UnsignedChar_1_1(link);
6239371c9d4SSatish Balay else if (nUnsignedChar % 1 == 0) PackInit_IntegerType_UnsignedChar_1_0(link);
624b23bfdefSJunchao Zhang link->bs = nUnsignedChar;
625eb02082bSJunchao Zhang link->unitbytes = nUnsignedChar * sizeof(UnsignedChar);
626b23bfdefSJunchao Zhang link->basicunit = MPI_UNSIGNED_CHAR;
6279371c9d4SSatish Balay if (link->bs == 1) {
6289371c9d4SSatish Balay link->isbuiltin = PETSC_TRUE;
6299371c9d4SSatish Balay link->unit = MPI_UNSIGNED_CHAR;
6309371c9d4SSatish Balay }
63140e23c03SJunchao Zhang #if defined(PETSC_HAVE_COMPLEX)
632b23bfdefSJunchao Zhang } else if (nPetscComplex) {
6339371c9d4SSatish Balay if (nPetscComplex == 8) PackInit_ComplexType_PetscComplex_8_1(link);
6349371c9d4SSatish Balay else if (nPetscComplex % 8 == 0) PackInit_ComplexType_PetscComplex_8_0(link);
6359371c9d4SSatish Balay else if (nPetscComplex == 4) PackInit_ComplexType_PetscComplex_4_1(link);
6369371c9d4SSatish Balay else if (nPetscComplex % 4 == 0) PackInit_ComplexType_PetscComplex_4_0(link);
6379371c9d4SSatish Balay else if (nPetscComplex == 2) PackInit_ComplexType_PetscComplex_2_1(link);
6389371c9d4SSatish Balay else if (nPetscComplex % 2 == 0) PackInit_ComplexType_PetscComplex_2_0(link);
6399371c9d4SSatish Balay else if (nPetscComplex == 1) PackInit_ComplexType_PetscComplex_1_1(link);
6409371c9d4SSatish Balay else if (nPetscComplex % 1 == 0) PackInit_ComplexType_PetscComplex_1_0(link);
641b23bfdefSJunchao Zhang link->bs = nPetscComplex;
642eb02082bSJunchao Zhang link->unitbytes = nPetscComplex * sizeof(PetscComplex);
64340e23c03SJunchao Zhang link->basicunit = MPIU_COMPLEX;
6449371c9d4SSatish Balay if (link->bs == 1) {
6459371c9d4SSatish Balay link->isbuiltin = PETSC_TRUE;
6469371c9d4SSatish Balay link->unit = MPIU_COMPLEX;
6479371c9d4SSatish Balay }
64840e23c03SJunchao Zhang #endif
64940e23c03SJunchao Zhang } else {
6506497c311SBarry Smith MPI_Aint lb, nbyte;
651e1187f0dSToby Isaac
6526497c311SBarry Smith PetscCallMPI(MPI_Type_get_extent(unit, &lb, &nbyte));
6536497c311SBarry Smith PetscCheck(lb == 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "Datatype with nonzero lower bound %ld", (long)lb);
654eb02082bSJunchao Zhang if (nbyte % sizeof(int)) { /* If the type size is not multiple of int */
6559371c9d4SSatish Balay if (nbyte == 4) PackInit_DumbType_char_4_1(link);
6569371c9d4SSatish Balay else if (nbyte % 4 == 0) PackInit_DumbType_char_4_0(link);
6579371c9d4SSatish Balay else if (nbyte == 2) PackInit_DumbType_char_2_1(link);
6589371c9d4SSatish Balay else if (nbyte % 2 == 0) PackInit_DumbType_char_2_0(link);
6599371c9d4SSatish Balay else if (nbyte == 1) PackInit_DumbType_char_1_1(link);
6609371c9d4SSatish Balay else if (nbyte % 1 == 0) PackInit_DumbType_char_1_0(link);
6616497c311SBarry Smith PetscCall(PetscIntCast(nbyte, &link->bs));
662b23bfdefSJunchao Zhang link->unitbytes = nbyte;
663b23bfdefSJunchao Zhang link->basicunit = MPI_BYTE;
66440e23c03SJunchao Zhang } else {
665d279a5e3SJunchao Zhang PetscCall(PetscIntCast(nbyte / sizeof(int), &nInt));
6669371c9d4SSatish Balay if (nInt == 8) PackInit_DumbType_DumbInt_8_1(link);
6679371c9d4SSatish Balay else if (nInt % 8 == 0) PackInit_DumbType_DumbInt_8_0(link);
6689371c9d4SSatish Balay else if (nInt == 4) PackInit_DumbType_DumbInt_4_1(link);
6699371c9d4SSatish Balay else if (nInt % 4 == 0) PackInit_DumbType_DumbInt_4_0(link);
6709371c9d4SSatish Balay else if (nInt == 2) PackInit_DumbType_DumbInt_2_1(link);
6719371c9d4SSatish Balay else if (nInt % 2 == 0) PackInit_DumbType_DumbInt_2_0(link);
6729371c9d4SSatish Balay else if (nInt == 1) PackInit_DumbType_DumbInt_1_1(link);
6739371c9d4SSatish Balay else if (nInt % 1 == 0) PackInit_DumbType_DumbInt_1_0(link);
674eb02082bSJunchao Zhang link->bs = nInt;
675b23bfdefSJunchao Zhang link->unitbytes = nbyte;
67640e23c03SJunchao Zhang link->basicunit = MPI_INT;
67740e23c03SJunchao Zhang }
6785ad15460SJunchao Zhang if (link->isbuiltin) link->unit = unit;
67940e23c03SJunchao Zhang }
680b23bfdefSJunchao Zhang
6819566063dSJacob Faibussowitsch if (!link->isbuiltin) PetscCallMPI(MPI_Type_dup(unit, &link->unit));
68220c24465SJunchao Zhang
68320c24465SJunchao Zhang link->Memcpy = PetscSFLinkMemcpy_Host;
6843ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS);
68540e23c03SJunchao Zhang }
68640e23c03SJunchao Zhang
PetscSFLinkGetUnpackAndOp(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (** UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt *,void *,const void *))687d71ae5a4SJacob Faibussowitsch PetscErrorCode PetscSFLinkGetUnpackAndOp(PetscSFLink link, PetscMemType mtype, MPI_Op op, PetscBool atomic, PetscErrorCode (**UnpackAndOp)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *))
688d71ae5a4SJacob Faibussowitsch {
68940e23c03SJunchao Zhang PetscFunctionBegin;
69040e23c03SJunchao Zhang *UnpackAndOp = NULL;
69171438e86SJunchao Zhang if (PetscMemTypeHost(mtype)) {
69283df288dSJunchao Zhang if (op == MPI_REPLACE) *UnpackAndOp = link->h_UnpackAndInsert;
693eb02082bSJunchao Zhang else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->h_UnpackAndAdd;
694eb02082bSJunchao Zhang else if (op == MPI_PROD) *UnpackAndOp = link->h_UnpackAndMult;
695eb02082bSJunchao Zhang else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->h_UnpackAndMax;
696eb02082bSJunchao Zhang else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->h_UnpackAndMin;
697eb02082bSJunchao Zhang else if (op == MPI_LAND) *UnpackAndOp = link->h_UnpackAndLAND;
698eb02082bSJunchao Zhang else if (op == MPI_BAND) *UnpackAndOp = link->h_UnpackAndBAND;
699eb02082bSJunchao Zhang else if (op == MPI_LOR) *UnpackAndOp = link->h_UnpackAndLOR;
700eb02082bSJunchao Zhang else if (op == MPI_BOR) *UnpackAndOp = link->h_UnpackAndBOR;
701eb02082bSJunchao Zhang else if (op == MPI_LXOR) *UnpackAndOp = link->h_UnpackAndLXOR;
702eb02082bSJunchao Zhang else if (op == MPI_BXOR) *UnpackAndOp = link->h_UnpackAndBXOR;
703eb02082bSJunchao Zhang else if (op == MPI_MAXLOC) *UnpackAndOp = link->h_UnpackAndMaxloc;
704eb02082bSJunchao Zhang else if (op == MPI_MINLOC) *UnpackAndOp = link->h_UnpackAndMinloc;
705eb02082bSJunchao Zhang }
7067fd2d3dbSJunchao Zhang #if defined(PETSC_HAVE_DEVICE)
70771438e86SJunchao Zhang else if (PetscMemTypeDevice(mtype) && !atomic) {
70883df288dSJunchao Zhang if (op == MPI_REPLACE) *UnpackAndOp = link->d_UnpackAndInsert;
709eb02082bSJunchao Zhang else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->d_UnpackAndAdd;
710eb02082bSJunchao Zhang else if (op == MPI_PROD) *UnpackAndOp = link->d_UnpackAndMult;
711eb02082bSJunchao Zhang else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->d_UnpackAndMax;
712eb02082bSJunchao Zhang else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->d_UnpackAndMin;
713eb02082bSJunchao Zhang else if (op == MPI_LAND) *UnpackAndOp = link->d_UnpackAndLAND;
714eb02082bSJunchao Zhang else if (op == MPI_BAND) *UnpackAndOp = link->d_UnpackAndBAND;
715eb02082bSJunchao Zhang else if (op == MPI_LOR) *UnpackAndOp = link->d_UnpackAndLOR;
716eb02082bSJunchao Zhang else if (op == MPI_BOR) *UnpackAndOp = link->d_UnpackAndBOR;
717eb02082bSJunchao Zhang else if (op == MPI_LXOR) *UnpackAndOp = link->d_UnpackAndLXOR;
718eb02082bSJunchao Zhang else if (op == MPI_BXOR) *UnpackAndOp = link->d_UnpackAndBXOR;
719eb02082bSJunchao Zhang else if (op == MPI_MAXLOC) *UnpackAndOp = link->d_UnpackAndMaxloc;
720eb02082bSJunchao Zhang else if (op == MPI_MINLOC) *UnpackAndOp = link->d_UnpackAndMinloc;
72171438e86SJunchao Zhang } else if (PetscMemTypeDevice(mtype) && atomic) {
72283df288dSJunchao Zhang if (op == MPI_REPLACE) *UnpackAndOp = link->da_UnpackAndInsert;
723eb02082bSJunchao Zhang else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->da_UnpackAndAdd;
724eb02082bSJunchao Zhang else if (op == MPI_PROD) *UnpackAndOp = link->da_UnpackAndMult;
725eb02082bSJunchao Zhang else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->da_UnpackAndMax;
726eb02082bSJunchao Zhang else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->da_UnpackAndMin;
727eb02082bSJunchao Zhang else if (op == MPI_LAND) *UnpackAndOp = link->da_UnpackAndLAND;
728eb02082bSJunchao Zhang else if (op == MPI_BAND) *UnpackAndOp = link->da_UnpackAndBAND;
729eb02082bSJunchao Zhang else if (op == MPI_LOR) *UnpackAndOp = link->da_UnpackAndLOR;
730eb02082bSJunchao Zhang else if (op == MPI_BOR) *UnpackAndOp = link->da_UnpackAndBOR;
731eb02082bSJunchao Zhang else if (op == MPI_LXOR) *UnpackAndOp = link->da_UnpackAndLXOR;
732eb02082bSJunchao Zhang else if (op == MPI_BXOR) *UnpackAndOp = link->da_UnpackAndBXOR;
733eb02082bSJunchao Zhang else if (op == MPI_MAXLOC) *UnpackAndOp = link->da_UnpackAndMaxloc;
734eb02082bSJunchao Zhang else if (op == MPI_MINLOC) *UnpackAndOp = link->da_UnpackAndMinloc;
735eb02082bSJunchao Zhang }
736eb02082bSJunchao Zhang #endif
7373ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS);
73840e23c03SJunchao Zhang }
73940e23c03SJunchao Zhang
PetscSFLinkGetScatterAndOp(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (** ScatterAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt *,const void *,PetscInt,PetscSFPackOpt,const PetscInt *,void *))740d71ae5a4SJacob Faibussowitsch PetscErrorCode PetscSFLinkGetScatterAndOp(PetscSFLink link, PetscMemType mtype, MPI_Op op, PetscBool atomic, PetscErrorCode (**ScatterAndOp)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *))
741d71ae5a4SJacob Faibussowitsch {
74240e23c03SJunchao Zhang PetscFunctionBegin;
743cd620004SJunchao Zhang *ScatterAndOp = NULL;
74471438e86SJunchao Zhang if (PetscMemTypeHost(mtype)) {
74583df288dSJunchao Zhang if (op == MPI_REPLACE) *ScatterAndOp = link->h_ScatterAndInsert;
746cd620004SJunchao Zhang else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->h_ScatterAndAdd;
747cd620004SJunchao Zhang else if (op == MPI_PROD) *ScatterAndOp = link->h_ScatterAndMult;
748cd620004SJunchao Zhang else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->h_ScatterAndMax;
749cd620004SJunchao Zhang else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->h_ScatterAndMin;
750cd620004SJunchao Zhang else if (op == MPI_LAND) *ScatterAndOp = link->h_ScatterAndLAND;
751cd620004SJunchao Zhang else if (op == MPI_BAND) *ScatterAndOp = link->h_ScatterAndBAND;
752cd620004SJunchao Zhang else if (op == MPI_LOR) *ScatterAndOp = link->h_ScatterAndLOR;
753cd620004SJunchao Zhang else if (op == MPI_BOR) *ScatterAndOp = link->h_ScatterAndBOR;
754cd620004SJunchao Zhang else if (op == MPI_LXOR) *ScatterAndOp = link->h_ScatterAndLXOR;
755cd620004SJunchao Zhang else if (op == MPI_BXOR) *ScatterAndOp = link->h_ScatterAndBXOR;
756cd620004SJunchao Zhang else if (op == MPI_MAXLOC) *ScatterAndOp = link->h_ScatterAndMaxloc;
757cd620004SJunchao Zhang else if (op == MPI_MINLOC) *ScatterAndOp = link->h_ScatterAndMinloc;
758eb02082bSJunchao Zhang }
7597fd2d3dbSJunchao Zhang #if defined(PETSC_HAVE_DEVICE)
76071438e86SJunchao Zhang else if (PetscMemTypeDevice(mtype) && !atomic) {
76183df288dSJunchao Zhang if (op == MPI_REPLACE) *ScatterAndOp = link->d_ScatterAndInsert;
762cd620004SJunchao Zhang else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->d_ScatterAndAdd;
763cd620004SJunchao Zhang else if (op == MPI_PROD) *ScatterAndOp = link->d_ScatterAndMult;
764cd620004SJunchao Zhang else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->d_ScatterAndMax;
765cd620004SJunchao Zhang else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->d_ScatterAndMin;
766cd620004SJunchao Zhang else if (op == MPI_LAND) *ScatterAndOp = link->d_ScatterAndLAND;
767cd620004SJunchao Zhang else if (op == MPI_BAND) *ScatterAndOp = link->d_ScatterAndBAND;
768cd620004SJunchao Zhang else if (op == MPI_LOR) *ScatterAndOp = link->d_ScatterAndLOR;
769cd620004SJunchao Zhang else if (op == MPI_BOR) *ScatterAndOp = link->d_ScatterAndBOR;
770cd620004SJunchao Zhang else if (op == MPI_LXOR) *ScatterAndOp = link->d_ScatterAndLXOR;
771cd620004SJunchao Zhang else if (op == MPI_BXOR) *ScatterAndOp = link->d_ScatterAndBXOR;
772cd620004SJunchao Zhang else if (op == MPI_MAXLOC) *ScatterAndOp = link->d_ScatterAndMaxloc;
773cd620004SJunchao Zhang else if (op == MPI_MINLOC) *ScatterAndOp = link->d_ScatterAndMinloc;
77471438e86SJunchao Zhang } else if (PetscMemTypeDevice(mtype) && atomic) {
77583df288dSJunchao Zhang if (op == MPI_REPLACE) *ScatterAndOp = link->da_ScatterAndInsert;
776cd620004SJunchao Zhang else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->da_ScatterAndAdd;
777cd620004SJunchao Zhang else if (op == MPI_PROD) *ScatterAndOp = link->da_ScatterAndMult;
778cd620004SJunchao Zhang else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->da_ScatterAndMax;
779cd620004SJunchao Zhang else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->da_ScatterAndMin;
780cd620004SJunchao Zhang else if (op == MPI_LAND) *ScatterAndOp = link->da_ScatterAndLAND;
781cd620004SJunchao Zhang else if (op == MPI_BAND) *ScatterAndOp = link->da_ScatterAndBAND;
782cd620004SJunchao Zhang else if (op == MPI_LOR) *ScatterAndOp = link->da_ScatterAndLOR;
783cd620004SJunchao Zhang else if (op == MPI_BOR) *ScatterAndOp = link->da_ScatterAndBOR;
784cd620004SJunchao Zhang else if (op == MPI_LXOR) *ScatterAndOp = link->da_ScatterAndLXOR;
785cd620004SJunchao Zhang else if (op == MPI_BXOR) *ScatterAndOp = link->da_ScatterAndBXOR;
786cd620004SJunchao Zhang else if (op == MPI_MAXLOC) *ScatterAndOp = link->da_ScatterAndMaxloc;
787cd620004SJunchao Zhang else if (op == MPI_MINLOC) *ScatterAndOp = link->da_ScatterAndMinloc;
788eb02082bSJunchao Zhang }
789eb02082bSJunchao Zhang #endif
7903ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS);
791cd620004SJunchao Zhang }
792cd620004SJunchao Zhang
PetscSFLinkGetFetchAndOp(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (** FetchAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt *,void *,void *))793d71ae5a4SJacob Faibussowitsch PetscErrorCode PetscSFLinkGetFetchAndOp(PetscSFLink link, PetscMemType mtype, MPI_Op op, PetscBool atomic, PetscErrorCode (**FetchAndOp)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, void *))
794d71ae5a4SJacob Faibussowitsch {
795cd620004SJunchao Zhang PetscFunctionBegin;
796cd620004SJunchao Zhang *FetchAndOp = NULL;
797c9cc58a2SBarry Smith PetscCheck(op == MPI_SUM || op == MPIU_SUM, PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for MPI_Op in FetchAndOp");
79871438e86SJunchao Zhang if (PetscMemTypeHost(mtype)) *FetchAndOp = link->h_FetchAndAdd;
7997fd2d3dbSJunchao Zhang #if defined(PETSC_HAVE_DEVICE)
80071438e86SJunchao Zhang else if (PetscMemTypeDevice(mtype) && !atomic) *FetchAndOp = link->d_FetchAndAdd;
80171438e86SJunchao Zhang else if (PetscMemTypeDevice(mtype) && atomic) *FetchAndOp = link->da_FetchAndAdd;
802cd620004SJunchao Zhang #endif
8033ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS);
804cd620004SJunchao Zhang }
805cd620004SJunchao Zhang
PetscSFLinkGetFetchAndOpLocal(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (** FetchAndOpLocal)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt *,void *,PetscInt,PetscSFPackOpt,const PetscInt *,const void *,void *))806d71ae5a4SJacob Faibussowitsch PetscErrorCode PetscSFLinkGetFetchAndOpLocal(PetscSFLink link, PetscMemType mtype, MPI_Op op, PetscBool atomic, PetscErrorCode (**FetchAndOpLocal)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, void *))
807d71ae5a4SJacob Faibussowitsch {
808cd620004SJunchao Zhang PetscFunctionBegin;
809cd620004SJunchao Zhang *FetchAndOpLocal = NULL;
810c9cc58a2SBarry Smith PetscCheck(op == MPI_SUM || op == MPIU_SUM, PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for MPI_Op in FetchAndOp");
81171438e86SJunchao Zhang if (PetscMemTypeHost(mtype)) *FetchAndOpLocal = link->h_FetchAndAddLocal;
8127fd2d3dbSJunchao Zhang #if defined(PETSC_HAVE_DEVICE)
81371438e86SJunchao Zhang else if (PetscMemTypeDevice(mtype) && !atomic) *FetchAndOpLocal = link->d_FetchAndAddLocal;
81471438e86SJunchao Zhang else if (PetscMemTypeDevice(mtype) && atomic) *FetchAndOpLocal = link->da_FetchAndAddLocal;
815cd620004SJunchao Zhang #endif
8163ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS);
817cd620004SJunchao Zhang }
818cd620004SJunchao Zhang
PetscSFLinkLogFlopsAfterUnpackRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,MPI_Op op)819d71ae5a4SJacob Faibussowitsch static inline PetscErrorCode PetscSFLinkLogFlopsAfterUnpackRootData(PetscSF sf, PetscSFLink link, PetscSFScope scope, MPI_Op op)
820d71ae5a4SJacob Faibussowitsch {
821cd620004SJunchao Zhang PetscSF_Basic *bas = (PetscSF_Basic *)sf->data;
822cd620004SJunchao Zhang
823cd620004SJunchao Zhang PetscFunctionBegin;
82483df288dSJunchao Zhang if (op != MPI_REPLACE && link->basicunit == MPIU_SCALAR) { /* op is a reduction on PetscScalars */
8257fd2d3dbSJunchao Zhang #if defined(PETSC_HAVE_DEVICE)
82652fb3346SBarry Smith if (PetscMemTypeDevice(link->rootmtype)) PetscCall(PetscLogGpuFlops(bas->rootbuflen[scope] * link->bs));
8279371c9d4SSatish Balay else
828cd620004SJunchao Zhang #endif
82952fb3346SBarry Smith PetscCall(PetscLogFlops(bas->rootbuflen[scope] * link->bs)); /* # of roots in buffer x # of scalars in unit */
830cd620004SJunchao Zhang }
8313ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS);
832cd620004SJunchao Zhang }
833cd620004SJunchao Zhang
PetscSFLinkLogFlopsAfterUnpackLeafData(PetscSF sf,PetscSFLink link,PetscSFScope scope,MPI_Op op)834d71ae5a4SJacob Faibussowitsch static inline PetscErrorCode PetscSFLinkLogFlopsAfterUnpackLeafData(PetscSF sf, PetscSFLink link, PetscSFScope scope, MPI_Op op)
835d71ae5a4SJacob Faibussowitsch {
836cd620004SJunchao Zhang PetscFunctionBegin;
83783df288dSJunchao Zhang if (op != MPI_REPLACE && link->basicunit == MPIU_SCALAR) { /* op is a reduction on PetscScalars */
8387fd2d3dbSJunchao Zhang #if defined(PETSC_HAVE_DEVICE)
83952fb3346SBarry Smith if (PetscMemTypeDevice(link->leafmtype)) PetscCall(PetscLogGpuFlops(sf->leafbuflen[scope] * link->bs)); /* # of roots in buffer x # of scalars in unit */
8409371c9d4SSatish Balay else
841cd620004SJunchao Zhang #endif
84252fb3346SBarry Smith PetscCall(PetscLogFlops(sf->leafbuflen[scope] * link->bs));
843cd620004SJunchao Zhang }
8443ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS);
845cd620004SJunchao Zhang }
846cd620004SJunchao Zhang
847cd620004SJunchao Zhang /* When SF could not find a proper UnpackAndOp() from link, it falls back to MPI_Reduce_local.
8484165533cSJose E. Roman Input Parameters:
849cd620004SJunchao Zhang +sf - The StarForest
850cd620004SJunchao Zhang .link - The link
851cd620004SJunchao Zhang .count - Number of entries to unpack
852145b44c9SPierre Jolivet .start - The first index, significant when indices=NULL
853cd620004SJunchao Zhang .indices - Indices of entries in <data>. If NULL, it means indices are contiguous and the first is given in <start>
854cd620004SJunchao Zhang .buf - A contiguous buffer to unpack from
855cd620004SJunchao Zhang -op - Operation after unpack
856cd620004SJunchao Zhang
8574165533cSJose E. Roman Output Parameters:
858cd620004SJunchao Zhang .data - The data to unpack to
859cd620004SJunchao Zhang */
PetscSFLinkUnpackDataWithMPIReduceLocal(PetscSF sf,PetscSFLink link,PetscInt count,PetscInt start,const PetscInt * indices,void * data,const void * buf,MPI_Op op)860d71ae5a4SJacob Faibussowitsch static inline PetscErrorCode PetscSFLinkUnpackDataWithMPIReduceLocal(PetscSF sf, PetscSFLink link, PetscInt count, PetscInt start, const PetscInt *indices, void *data, const void *buf, MPI_Op op)
861d71ae5a4SJacob Faibussowitsch {
862cd620004SJunchao Zhang PetscFunctionBegin;
863cd620004SJunchao Zhang #if defined(PETSC_HAVE_MPI_REDUCE_LOCAL)
864cd620004SJunchao Zhang {
865cd620004SJunchao Zhang PetscInt i;
866cd620004SJunchao Zhang if (indices) {
867cd620004SJunchao Zhang /* Note we use link->unit instead of link->basicunit. When op can be mapped to MPI_SUM etc, it operates on
868cd620004SJunchao Zhang basic units of a root/leaf element-wisely. Otherwise, it is meant to operate on a whole root/leaf.
869cd620004SJunchao Zhang */
8709566063dSJacob Faibussowitsch for (i = 0; i < count; i++) PetscCallMPI(MPI_Reduce_local((const char *)buf + i * link->unitbytes, (char *)data + indices[i] * link->unitbytes, 1, link->unit, op));
871cd620004SJunchao Zhang } else {
872*07e87b31SJunchao Zhang PetscCheck(op != MPI_REPLACE, PETSC_COMM_SELF, PETSC_ERR_PLIB, "MPI_REPLACE is not an allowed op in MPI_Reduce_local() per MPI standard");
8739566063dSJacob Faibussowitsch PetscCallMPI(MPIU_Reduce_local(buf, (char *)data + start * link->unitbytes, count, link->unit, op));
874cd620004SJunchao Zhang }
875cd620004SJunchao Zhang }
8763ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS);
877cd620004SJunchao Zhang #else
878cd620004SJunchao Zhang SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No unpacking reduction operation for this MPI_Op");
879cd620004SJunchao Zhang #endif
880cd620004SJunchao Zhang }
881cd620004SJunchao Zhang
PetscSFLinkScatterDataWithMPIReduceLocal(PetscSF sf,PetscSFLink link,PetscInt count,PetscInt srcStart,const PetscInt * srcIdx,const void * src,PetscInt dstStart,const PetscInt * dstIdx,void * dst,MPI_Op op)882d71ae5a4SJacob Faibussowitsch static inline PetscErrorCode PetscSFLinkScatterDataWithMPIReduceLocal(PetscSF sf, PetscSFLink link, PetscInt count, PetscInt srcStart, const PetscInt *srcIdx, const void *src, PetscInt dstStart, const PetscInt *dstIdx, void *dst, MPI_Op op)
883d71ae5a4SJacob Faibussowitsch {
884cd620004SJunchao Zhang PetscFunctionBegin;
885cd620004SJunchao Zhang #if defined(PETSC_HAVE_MPI_REDUCE_LOCAL)
886cd620004SJunchao Zhang {
887cd620004SJunchao Zhang PetscInt i, disp;
888fcc7397dSJunchao Zhang if (!srcIdx) {
8899566063dSJacob Faibussowitsch PetscCall(PetscSFLinkUnpackDataWithMPIReduceLocal(sf, link, count, dstStart, dstIdx, dst, (const char *)src + srcStart * link->unitbytes, op));
890fcc7397dSJunchao Zhang } else {
891*07e87b31SJunchao Zhang PetscCheck(op != MPI_REPLACE, PETSC_COMM_SELF, PETSC_ERR_PLIB, "MPI_REPLACE is not an allowed op in MPI_Reduce_local() per MPI standard");
892cd620004SJunchao Zhang for (i = 0; i < count; i++) {
893fcc7397dSJunchao Zhang disp = dstIdx ? dstIdx[i] : dstStart + i;
8949566063dSJacob Faibussowitsch PetscCallMPI(MPIU_Reduce_local((const char *)src + srcIdx[i] * link->unitbytes, (char *)dst + disp * link->unitbytes, 1, link->unit, op));
895fcc7397dSJunchao Zhang }
896cd620004SJunchao Zhang }
897cd620004SJunchao Zhang }
8983ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS);
899cd620004SJunchao Zhang #else
900cd620004SJunchao Zhang SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No unpacking reduction operation for this MPI_Op");
901cd620004SJunchao Zhang #endif
902cd620004SJunchao Zhang }
903cd620004SJunchao Zhang
904cd620004SJunchao Zhang /*=============================================================================
905cd620004SJunchao Zhang Pack/Unpack/Fetch/Scatter routines
906cd620004SJunchao Zhang ============================================================================*/
907cd620004SJunchao Zhang
908cd620004SJunchao Zhang /* Pack rootdata to rootbuf
9094165533cSJose E. Roman Input Parameters:
910cd620004SJunchao Zhang + sf - The SF this packing works on.
911cd620004SJunchao Zhang . link - It gives the memtype of the roots and also provides root buffer.
912cd620004SJunchao Zhang . scope - PETSCSF_LOCAL or PETSCSF_REMOTE. Note SF has the ability to do local and remote communications separately.
913cd620004SJunchao Zhang - rootdata - Where to read the roots.
914cd620004SJunchao Zhang
915cd620004SJunchao Zhang Notes:
916cd620004SJunchao Zhang When rootdata can be directly used as root buffer, the routine is almost a no-op. After the call, root data is
91771438e86SJunchao Zhang in a place where the underlying MPI is ready to access (use_gpu_aware_mpi or not)
918cd620004SJunchao Zhang */
PetscSFLinkPackRootData_Private(PetscSF sf,PetscSFLink link,PetscSFScope scope,const void * rootdata)91966976f2fSJacob Faibussowitsch static PetscErrorCode PetscSFLinkPackRootData_Private(PetscSF sf, PetscSFLink link, PetscSFScope scope, const void *rootdata)
920d71ae5a4SJacob Faibussowitsch {
921cd620004SJunchao Zhang const PetscInt *rootindices = NULL;
922cd620004SJunchao Zhang PetscInt count, start;
923cd620004SJunchao Zhang PetscMemType rootmtype = link->rootmtype;
924fcc7397dSJunchao Zhang PetscSFPackOpt opt = NULL;
9256497c311SBarry Smith PetscErrorCode (*Pack)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, void *) = NULL;
926fcc7397dSJunchao Zhang
927cd620004SJunchao Zhang PetscFunctionBegin;
92871438e86SJunchao Zhang if (!link->rootdirect[scope]) { /* If rootdata works directly as rootbuf, skip packing */
9299566063dSJacob Faibussowitsch PetscCall(PetscSFLinkGetRootPackOptAndIndices(sf, link, rootmtype, scope, &count, &start, &opt, &rootindices));
9309566063dSJacob Faibussowitsch PetscCall(PetscSFLinkGetPack(link, rootmtype, &Pack));
9319566063dSJacob Faibussowitsch PetscCall((*Pack)(link, count, start, opt, rootindices, rootdata, link->rootbuf[scope][rootmtype]));
932cd620004SJunchao Zhang }
9333ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS);
934cd620004SJunchao Zhang }
935cd620004SJunchao Zhang
936cd620004SJunchao Zhang /* Pack leafdata to leafbuf */
PetscSFLinkPackLeafData_Private(PetscSF sf,PetscSFLink link,PetscSFScope scope,const void * leafdata)93766976f2fSJacob Faibussowitsch static PetscErrorCode PetscSFLinkPackLeafData_Private(PetscSF sf, PetscSFLink link, PetscSFScope scope, const void *leafdata)
938d71ae5a4SJacob Faibussowitsch {
939cd620004SJunchao Zhang const PetscInt *leafindices = NULL;
940cd620004SJunchao Zhang PetscInt count, start;
941cd620004SJunchao Zhang PetscMemType leafmtype = link->leafmtype;
942fcc7397dSJunchao Zhang PetscSFPackOpt opt = NULL;
9436497c311SBarry Smith PetscErrorCode (*Pack)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, void *) = NULL;
944cd620004SJunchao Zhang
945cd620004SJunchao Zhang PetscFunctionBegin;
94671438e86SJunchao Zhang if (!link->leafdirect[scope]) { /* If leafdata works directly as rootbuf, skip packing */
9479566063dSJacob Faibussowitsch PetscCall(PetscSFLinkGetLeafPackOptAndIndices(sf, link, leafmtype, scope, &count, &start, &opt, &leafindices));
9489566063dSJacob Faibussowitsch PetscCall(PetscSFLinkGetPack(link, leafmtype, &Pack));
9499566063dSJacob Faibussowitsch PetscCall((*Pack)(link, count, start, opt, leafindices, leafdata, link->leafbuf[scope][leafmtype]));
950cd620004SJunchao Zhang }
9513ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS);
952cd620004SJunchao Zhang }
953cd620004SJunchao Zhang
95471438e86SJunchao Zhang /* Pack rootdata to rootbuf, which are in the same memory space */
PetscSFLinkPackRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,const void * rootdata)955d71ae5a4SJacob Faibussowitsch PetscErrorCode PetscSFLinkPackRootData(PetscSF sf, PetscSFLink link, PetscSFScope scope, const void *rootdata)
956d71ae5a4SJacob Faibussowitsch {
95771438e86SJunchao Zhang PetscSF_Basic *bas = (PetscSF_Basic *)sf->data;
95871438e86SJunchao Zhang
95971438e86SJunchao Zhang PetscFunctionBegin;
960f0b74427SPierre Jolivet if (scope == PETSCSF_REMOTE) { /* Sync the device if rootdata is not on PETSc default stream */
9619566063dSJacob Faibussowitsch if (PetscMemTypeDevice(link->rootmtype) && link->SyncDevice && sf->unknown_input_stream) PetscCall((*link->SyncDevice)(link));
9629566063dSJacob Faibussowitsch if (link->PrePack) PetscCall((*link->PrePack)(sf, link, PETSCSF_ROOT2LEAF)); /* Used by SF nvshmem */
96371438e86SJunchao Zhang }
9649566063dSJacob Faibussowitsch PetscCall(PetscLogEventBegin(PETSCSF_Pack, sf, 0, 0, 0));
9659566063dSJacob Faibussowitsch if (bas->rootbuflen[scope]) PetscCall(PetscSFLinkPackRootData_Private(sf, link, scope, rootdata));
9669566063dSJacob Faibussowitsch PetscCall(PetscLogEventEnd(PETSCSF_Pack, sf, 0, 0, 0));
9673ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS);
96871438e86SJunchao Zhang }
96971438e86SJunchao Zhang /* Pack leafdata to leafbuf, which are in the same memory space */
PetscSFLinkPackLeafData(PetscSF sf,PetscSFLink link,PetscSFScope scope,const void * leafdata)970d71ae5a4SJacob Faibussowitsch PetscErrorCode PetscSFLinkPackLeafData(PetscSF sf, PetscSFLink link, PetscSFScope scope, const void *leafdata)
971d71ae5a4SJacob Faibussowitsch {
97271438e86SJunchao Zhang PetscFunctionBegin;
97371438e86SJunchao Zhang if (scope == PETSCSF_REMOTE) {
9749566063dSJacob Faibussowitsch if (PetscMemTypeDevice(link->leafmtype) && link->SyncDevice && sf->unknown_input_stream) PetscCall((*link->SyncDevice)(link));
9759566063dSJacob Faibussowitsch if (link->PrePack) PetscCall((*link->PrePack)(sf, link, PETSCSF_LEAF2ROOT)); /* Used by SF nvshmem */
97671438e86SJunchao Zhang }
9779566063dSJacob Faibussowitsch PetscCall(PetscLogEventBegin(PETSCSF_Pack, sf, 0, 0, 0));
9789566063dSJacob Faibussowitsch if (sf->leafbuflen[scope]) PetscCall(PetscSFLinkPackLeafData_Private(sf, link, scope, leafdata));
9799566063dSJacob Faibussowitsch PetscCall(PetscLogEventEnd(PETSCSF_Pack, sf, 0, 0, 0));
9803ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS);
98171438e86SJunchao Zhang }
98271438e86SJunchao Zhang
PetscSFLinkUnpackRootData_Private(PetscSF sf,PetscSFLink link,PetscSFScope scope,void * rootdata,MPI_Op op)98366976f2fSJacob Faibussowitsch static PetscErrorCode PetscSFLinkUnpackRootData_Private(PetscSF sf, PetscSFLink link, PetscSFScope scope, void *rootdata, MPI_Op op)
984d71ae5a4SJacob Faibussowitsch {
985cd620004SJunchao Zhang const PetscInt *rootindices = NULL;
986cd620004SJunchao Zhang PetscInt count, start;
987cd620004SJunchao Zhang PetscSF_Basic *bas = (PetscSF_Basic *)sf->data;
988cd620004SJunchao Zhang PetscMemType rootmtype = link->rootmtype;
989fcc7397dSJunchao Zhang PetscSFPackOpt opt = NULL;
9906497c311SBarry Smith PetscErrorCode (*UnpackAndOp)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *) = NULL;
991cd620004SJunchao Zhang
992cd620004SJunchao Zhang PetscFunctionBegin;
99371438e86SJunchao Zhang if (!link->rootdirect[scope]) { /* If rootdata works directly as rootbuf, skip unpacking */
9949566063dSJacob Faibussowitsch PetscCall(PetscSFLinkGetUnpackAndOp(link, rootmtype, op, bas->rootdups[scope], &UnpackAndOp));
995cd620004SJunchao Zhang if (UnpackAndOp) {
9969566063dSJacob Faibussowitsch PetscCall(PetscSFLinkGetRootPackOptAndIndices(sf, link, rootmtype, scope, &count, &start, &opt, &rootindices));
9979566063dSJacob Faibussowitsch PetscCall((*UnpackAndOp)(link, count, start, opt, rootindices, rootdata, link->rootbuf[scope][rootmtype]));
998cd620004SJunchao Zhang } else {
9999566063dSJacob Faibussowitsch PetscCall(PetscSFLinkGetRootPackOptAndIndices(sf, link, PETSC_MEMTYPE_HOST, scope, &count, &start, &opt, &rootindices));
10009566063dSJacob Faibussowitsch PetscCall(PetscSFLinkUnpackDataWithMPIReduceLocal(sf, link, count, start, rootindices, rootdata, link->rootbuf[scope][rootmtype], op));
1001cd620004SJunchao Zhang }
1002cd620004SJunchao Zhang }
10039566063dSJacob Faibussowitsch PetscCall(PetscSFLinkLogFlopsAfterUnpackRootData(sf, link, scope, op));
10043ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS);
1005cd620004SJunchao Zhang }
1006cd620004SJunchao Zhang
PetscSFLinkUnpackLeafData_Private(PetscSF sf,PetscSFLink link,PetscSFScope scope,void * leafdata,MPI_Op op)100766976f2fSJacob Faibussowitsch static PetscErrorCode PetscSFLinkUnpackLeafData_Private(PetscSF sf, PetscSFLink link, PetscSFScope scope, void *leafdata, MPI_Op op)
1008d71ae5a4SJacob Faibussowitsch {
1009cd620004SJunchao Zhang const PetscInt *leafindices = NULL;
1010cd620004SJunchao Zhang PetscInt count, start;
1011fcc7397dSJunchao Zhang PetscErrorCode (*UnpackAndOp)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *) = NULL;
1012cd620004SJunchao Zhang PetscMemType leafmtype = link->leafmtype;
1013fcc7397dSJunchao Zhang PetscSFPackOpt opt = NULL;
1014cd620004SJunchao Zhang
1015cd620004SJunchao Zhang PetscFunctionBegin;
101671438e86SJunchao Zhang if (!link->leafdirect[scope]) { /* If leafdata works directly as rootbuf, skip unpacking */
10179566063dSJacob Faibussowitsch PetscCall(PetscSFLinkGetUnpackAndOp(link, leafmtype, op, sf->leafdups[scope], &UnpackAndOp));
1018cd620004SJunchao Zhang if (UnpackAndOp) {
10199566063dSJacob Faibussowitsch PetscCall(PetscSFLinkGetLeafPackOptAndIndices(sf, link, leafmtype, scope, &count, &start, &opt, &leafindices));
10209566063dSJacob Faibussowitsch PetscCall((*UnpackAndOp)(link, count, start, opt, leafindices, leafdata, link->leafbuf[scope][leafmtype]));
1021cd620004SJunchao Zhang } else {
10229566063dSJacob Faibussowitsch PetscCall(PetscSFLinkGetLeafPackOptAndIndices(sf, link, PETSC_MEMTYPE_HOST, scope, &count, &start, &opt, &leafindices));
10239566063dSJacob Faibussowitsch PetscCall(PetscSFLinkUnpackDataWithMPIReduceLocal(sf, link, count, start, leafindices, leafdata, link->leafbuf[scope][leafmtype], op));
1024cd620004SJunchao Zhang }
1025cd620004SJunchao Zhang }
10269566063dSJacob Faibussowitsch PetscCall(PetscSFLinkLogFlopsAfterUnpackLeafData(sf, link, scope, op));
10273ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS);
102871438e86SJunchao Zhang }
102971438e86SJunchao Zhang /* Unpack rootbuf to rootdata, which are in the same memory space */
PetscSFLinkUnpackRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,void * rootdata,MPI_Op op)1030d71ae5a4SJacob Faibussowitsch PetscErrorCode PetscSFLinkUnpackRootData(PetscSF sf, PetscSFLink link, PetscSFScope scope, void *rootdata, MPI_Op op)
1031d71ae5a4SJacob Faibussowitsch {
103271438e86SJunchao Zhang PetscSF_Basic *bas = (PetscSF_Basic *)sf->data;
103371438e86SJunchao Zhang
103471438e86SJunchao Zhang PetscFunctionBegin;
10354da51b66SZach Atkins PetscCall(PetscLogEventBegin(PETSCSF_Unpack, sf, 0, 0, 0)); // call it even no data is unpacked so that -log_sync can be done collectively
10364da51b66SZach Atkins if (bas->rootbuflen[scope] && !link->rootdirect[scope]) PetscCall(PetscSFLinkUnpackRootData_Private(sf, link, scope, rootdata, op));
10379566063dSJacob Faibussowitsch PetscCall(PetscLogEventEnd(PETSCSF_Unpack, sf, 0, 0, 0));
103871438e86SJunchao Zhang if (scope == PETSCSF_REMOTE) {
10399566063dSJacob Faibussowitsch if (link->PostUnpack) PetscCall((*link->PostUnpack)(sf, link, PETSCSF_LEAF2ROOT)); /* Used by SF nvshmem */
10409566063dSJacob Faibussowitsch if (PetscMemTypeDevice(link->rootmtype) && link->SyncDevice && sf->unknown_input_stream) PetscCall((*link->SyncDevice)(link));
104171438e86SJunchao Zhang }
10423ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS);
1043cd620004SJunchao Zhang }
1044cd620004SJunchao Zhang
104571438e86SJunchao Zhang /* Unpack leafbuf to leafdata for remote (common case) or local (rare case when rootmtype != leafmtype) */
PetscSFLinkUnpackLeafData(PetscSF sf,PetscSFLink link,PetscSFScope scope,void * leafdata,MPI_Op op)1046d71ae5a4SJacob Faibussowitsch PetscErrorCode PetscSFLinkUnpackLeafData(PetscSF sf, PetscSFLink link, PetscSFScope scope, void *leafdata, MPI_Op op)
1047d71ae5a4SJacob Faibussowitsch {
104871438e86SJunchao Zhang PetscFunctionBegin;
10499566063dSJacob Faibussowitsch PetscCall(PetscLogEventBegin(PETSCSF_Unpack, sf, 0, 0, 0));
10504da51b66SZach Atkins if (sf->leafbuflen[scope] && !link->leafdirect[scope]) PetscCall(PetscSFLinkUnpackLeafData_Private(sf, link, scope, leafdata, op));
10519566063dSJacob Faibussowitsch PetscCall(PetscLogEventEnd(PETSCSF_Unpack, sf, 0, 0, 0));
105271438e86SJunchao Zhang if (scope == PETSCSF_REMOTE) {
10539566063dSJacob Faibussowitsch if (link->PostUnpack) PetscCall((*link->PostUnpack)(sf, link, PETSCSF_ROOT2LEAF)); /* Used by SF nvshmem */
10549566063dSJacob Faibussowitsch if (PetscMemTypeDevice(link->leafmtype) && link->SyncDevice && sf->unknown_input_stream) PetscCall((*link->SyncDevice)(link));
105571438e86SJunchao Zhang }
10563ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS);
105771438e86SJunchao Zhang }
105871438e86SJunchao Zhang
105971438e86SJunchao Zhang /* FetchAndOp rootdata with rootbuf, it is a kind of Unpack on rootdata, except it also updates rootbuf */
PetscSFLinkFetchAndOpRemote(PetscSF sf,PetscSFLink link,void * rootdata,MPI_Op op)1060d71ae5a4SJacob Faibussowitsch PetscErrorCode PetscSFLinkFetchAndOpRemote(PetscSF sf, PetscSFLink link, void *rootdata, MPI_Op op)
1061d71ae5a4SJacob Faibussowitsch {
1062cd620004SJunchao Zhang const PetscInt *rootindices = NULL;
1063cd620004SJunchao Zhang PetscInt count, start;
1064cd620004SJunchao Zhang PetscSF_Basic *bas = (PetscSF_Basic *)sf->data;
1065fcc7397dSJunchao Zhang PetscErrorCode (*FetchAndOp)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, void *) = NULL;
1066cd620004SJunchao Zhang PetscMemType rootmtype = link->rootmtype;
1067fcc7397dSJunchao Zhang PetscSFPackOpt opt = NULL;
1068cd620004SJunchao Zhang
1069cd620004SJunchao Zhang PetscFunctionBegin;
10709566063dSJacob Faibussowitsch PetscCall(PetscLogEventBegin(PETSCSF_Unpack, sf, 0, 0, 0));
107171438e86SJunchao Zhang if (bas->rootbuflen[PETSCSF_REMOTE]) {
1072cd620004SJunchao Zhang /* Do FetchAndOp on rootdata with rootbuf */
10739566063dSJacob Faibussowitsch PetscCall(PetscSFLinkGetFetchAndOp(link, rootmtype, op, bas->rootdups[PETSCSF_REMOTE], &FetchAndOp));
10749566063dSJacob Faibussowitsch PetscCall(PetscSFLinkGetRootPackOptAndIndices(sf, link, rootmtype, PETSCSF_REMOTE, &count, &start, &opt, &rootindices));
10759566063dSJacob Faibussowitsch PetscCall((*FetchAndOp)(link, count, start, opt, rootindices, rootdata, link->rootbuf[PETSCSF_REMOTE][rootmtype]));
1076cd620004SJunchao Zhang }
10779566063dSJacob Faibussowitsch PetscCall(PetscSFLinkLogFlopsAfterUnpackRootData(sf, link, PETSCSF_REMOTE, op));
10789566063dSJacob Faibussowitsch PetscCall(PetscLogEventEnd(PETSCSF_Unpack, sf, 0, 0, 0));
10793ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS);
1080cd620004SJunchao Zhang }
1081cd620004SJunchao Zhang
PetscSFLinkScatterLocal(PetscSF sf,PetscSFLink link,PetscSFDirection direction,void * rootdata,void * leafdata,MPI_Op op)1082d71ae5a4SJacob Faibussowitsch PetscErrorCode PetscSFLinkScatterLocal(PetscSF sf, PetscSFLink link, PetscSFDirection direction, void *rootdata, void *leafdata, MPI_Op op)
1083d71ae5a4SJacob Faibussowitsch {
1084cd620004SJunchao Zhang const PetscInt *rootindices = NULL, *leafindices = NULL;
1085cd620004SJunchao Zhang PetscInt count, rootstart, leafstart;
1086cd620004SJunchao Zhang PetscSF_Basic *bas = (PetscSF_Basic *)sf->data;
108771438e86SJunchao Zhang PetscMemType rootmtype = link->rootmtype, leafmtype = link->leafmtype, srcmtype, dstmtype;
1088fcc7397dSJunchao Zhang PetscSFPackOpt leafopt = NULL, rootopt = NULL;
108971438e86SJunchao Zhang PetscInt buflen = sf->leafbuflen[PETSCSF_LOCAL];
109071438e86SJunchao Zhang char *srcbuf = NULL, *dstbuf = NULL;
109171438e86SJunchao Zhang PetscBool dstdups;
10926497c311SBarry Smith PetscErrorCode (*ScatterAndOp)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *) = NULL;
1093cd620004SJunchao Zhang
1094362febeeSStefano Zampini PetscFunctionBegin;
10953ba16761SJacob Faibussowitsch if (!buflen) PetscFunctionReturn(PETSC_SUCCESS);
109671438e86SJunchao Zhang if (rootmtype != leafmtype) { /* The cross memory space local scatter is done by pack, copy and unpack */
109771438e86SJunchao Zhang if (direction == PETSCSF_ROOT2LEAF) {
10989566063dSJacob Faibussowitsch PetscCall(PetscSFLinkPackRootData(sf, link, PETSCSF_LOCAL, rootdata));
109971438e86SJunchao Zhang srcmtype = rootmtype;
110071438e86SJunchao Zhang srcbuf = link->rootbuf[PETSCSF_LOCAL][rootmtype];
110171438e86SJunchao Zhang dstmtype = leafmtype;
110271438e86SJunchao Zhang dstbuf = link->leafbuf[PETSCSF_LOCAL][leafmtype];
110371438e86SJunchao Zhang } else {
11049566063dSJacob Faibussowitsch PetscCall(PetscSFLinkPackLeafData(sf, link, PETSCSF_LOCAL, leafdata));
110571438e86SJunchao Zhang srcmtype = leafmtype;
110671438e86SJunchao Zhang srcbuf = link->leafbuf[PETSCSF_LOCAL][leafmtype];
110771438e86SJunchao Zhang dstmtype = rootmtype;
110871438e86SJunchao Zhang dstbuf = link->rootbuf[PETSCSF_LOCAL][rootmtype];
110971438e86SJunchao Zhang }
11109566063dSJacob Faibussowitsch PetscCall((*link->Memcpy)(link, dstmtype, dstbuf, srcmtype, srcbuf, buflen * link->unitbytes));
111171438e86SJunchao Zhang /* If above is a device to host copy, we have to sync the stream before accessing the buffer on host */
11129566063dSJacob Faibussowitsch if (PetscMemTypeHost(dstmtype)) PetscCall((*link->SyncStream)(link));
111371438e86SJunchao Zhang if (direction == PETSCSF_ROOT2LEAF) {
11149566063dSJacob Faibussowitsch PetscCall(PetscSFLinkUnpackLeafData(sf, link, PETSCSF_LOCAL, leafdata, op));
1115cd620004SJunchao Zhang } else {
11169566063dSJacob Faibussowitsch PetscCall(PetscSFLinkUnpackRootData(sf, link, PETSCSF_LOCAL, rootdata, op));
111771438e86SJunchao Zhang }
111871438e86SJunchao Zhang } else {
111971438e86SJunchao Zhang dstdups = (direction == PETSCSF_ROOT2LEAF) ? sf->leafdups[PETSCSF_LOCAL] : bas->rootdups[PETSCSF_LOCAL];
112071438e86SJunchao Zhang dstmtype = (direction == PETSCSF_ROOT2LEAF) ? link->leafmtype : link->rootmtype;
11219566063dSJacob Faibussowitsch PetscCall(PetscSFLinkGetScatterAndOp(link, dstmtype, op, dstdups, &ScatterAndOp));
1122cd620004SJunchao Zhang if (ScatterAndOp) {
11239566063dSJacob Faibussowitsch PetscCall(PetscSFLinkGetRootPackOptAndIndices(sf, link, rootmtype, PETSCSF_LOCAL, &count, &rootstart, &rootopt, &rootindices));
11249566063dSJacob Faibussowitsch PetscCall(PetscSFLinkGetLeafPackOptAndIndices(sf, link, leafmtype, PETSCSF_LOCAL, &count, &leafstart, &leafopt, &leafindices));
112571438e86SJunchao Zhang if (direction == PETSCSF_ROOT2LEAF) {
11269566063dSJacob Faibussowitsch PetscCall((*ScatterAndOp)(link, count, rootstart, rootopt, rootindices, rootdata, leafstart, leafopt, leafindices, leafdata));
1127cd620004SJunchao Zhang } else {
11289566063dSJacob Faibussowitsch PetscCall((*ScatterAndOp)(link, count, leafstart, leafopt, leafindices, leafdata, rootstart, rootopt, rootindices, rootdata));
112971438e86SJunchao Zhang }
1130cd620004SJunchao Zhang } else {
11319566063dSJacob Faibussowitsch PetscCall(PetscSFLinkGetRootPackOptAndIndices(sf, link, PETSC_MEMTYPE_HOST, PETSCSF_LOCAL, &count, &rootstart, &rootopt, &rootindices));
11329566063dSJacob Faibussowitsch PetscCall(PetscSFLinkGetLeafPackOptAndIndices(sf, link, PETSC_MEMTYPE_HOST, PETSCSF_LOCAL, &count, &leafstart, &leafopt, &leafindices));
113371438e86SJunchao Zhang if (direction == PETSCSF_ROOT2LEAF) {
11349566063dSJacob Faibussowitsch PetscCall(PetscSFLinkScatterDataWithMPIReduceLocal(sf, link, count, rootstart, rootindices, rootdata, leafstart, leafindices, leafdata, op));
113571438e86SJunchao Zhang } else {
11369566063dSJacob Faibussowitsch PetscCall(PetscSFLinkScatterDataWithMPIReduceLocal(sf, link, count, leafstart, leafindices, leafdata, rootstart, rootindices, rootdata, op));
1137cd620004SJunchao Zhang }
1138cd620004SJunchao Zhang }
113971438e86SJunchao Zhang }
11403ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS);
1141cd620004SJunchao Zhang }
1142cd620004SJunchao Zhang
1143cd620004SJunchao Zhang /* Fetch rootdata to leafdata and leafupdate locally */
PetscSFLinkFetchAndOpLocal(PetscSF sf,PetscSFLink link,void * rootdata,const void * leafdata,void * leafupdate,MPI_Op op)1144d71ae5a4SJacob Faibussowitsch PetscErrorCode PetscSFLinkFetchAndOpLocal(PetscSF sf, PetscSFLink link, void *rootdata, const void *leafdata, void *leafupdate, MPI_Op op)
1145d71ae5a4SJacob Faibussowitsch {
1146cd620004SJunchao Zhang const PetscInt *rootindices = NULL, *leafindices = NULL;
1147cd620004SJunchao Zhang PetscInt count, rootstart, leafstart;
1148cd620004SJunchao Zhang PetscSF_Basic *bas = (PetscSF_Basic *)sf->data;
1149cd620004SJunchao Zhang const PetscMemType rootmtype = link->rootmtype, leafmtype = link->leafmtype;
1150fcc7397dSJunchao Zhang PetscSFPackOpt leafopt = NULL, rootopt = NULL;
11516497c311SBarry Smith PetscErrorCode (*FetchAndOpLocal)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, void *) = NULL;
1152cd620004SJunchao Zhang
1153cd620004SJunchao Zhang PetscFunctionBegin;
11543ba16761SJacob Faibussowitsch if (!bas->rootbuflen[PETSCSF_LOCAL]) PetscFunctionReturn(PETSC_SUCCESS);
1155cd620004SJunchao Zhang if (rootmtype != leafmtype) {
1156cd620004SJunchao Zhang /* The local communication has to go through pack and unpack */
1157cd620004SJunchao Zhang SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Doing PetscSFFetchAndOp with rootdata and leafdata on opposite side of CPU and GPU");
1158cd620004SJunchao Zhang } else {
11599566063dSJacob Faibussowitsch PetscCall(PetscSFLinkGetRootPackOptAndIndices(sf, link, rootmtype, PETSCSF_LOCAL, &count, &rootstart, &rootopt, &rootindices));
11609566063dSJacob Faibussowitsch PetscCall(PetscSFLinkGetLeafPackOptAndIndices(sf, link, leafmtype, PETSCSF_LOCAL, &count, &leafstart, &leafopt, &leafindices));
11619566063dSJacob Faibussowitsch PetscCall(PetscSFLinkGetFetchAndOpLocal(link, rootmtype, op, bas->rootdups[PETSCSF_LOCAL], &FetchAndOpLocal));
11629566063dSJacob Faibussowitsch PetscCall((*FetchAndOpLocal)(link, count, rootstart, rootopt, rootindices, rootdata, leafstart, leafopt, leafindices, leafdata, leafupdate));
1163cd620004SJunchao Zhang }
11643ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS);
116540e23c03SJunchao Zhang }
116640e23c03SJunchao Zhang
116740e23c03SJunchao Zhang /*
1168511e6246SStefano Zampini Create per-rank pack/unpack optimizations based on indices patterns
116940e23c03SJunchao Zhang
117040e23c03SJunchao Zhang Input Parameters:
1171fcc7397dSJunchao Zhang + n - Number of destination ranks
1172eb02082bSJunchao Zhang . offset - [n+1] For the i-th rank, its associated indices are idx[offset[i], offset[i+1]). offset[0] needs not to be 0.
1173b23bfdefSJunchao Zhang - idx - [*] Array storing indices
117440e23c03SJunchao Zhang
117540e23c03SJunchao Zhang Output Parameters:
1176cd620004SJunchao Zhang + opt - Pack optimizations. NULL if no optimizations.
117740e23c03SJunchao Zhang */
PetscSFCreatePackOpt(PetscInt n,const PetscInt * offset,const PetscInt * idx,PetscSFPackOpt * out)117866976f2fSJacob Faibussowitsch static PetscErrorCode PetscSFCreatePackOpt(PetscInt n, const PetscInt *offset, const PetscInt *idx, PetscSFPackOpt *out)
1179d71ae5a4SJacob Faibussowitsch {
1180fcc7397dSJunchao Zhang PetscInt r, p, start, i, j, k, dx, dy, dz, dydz, m, X, Y;
1181fcc7397dSJunchao Zhang PetscBool optimizable = PETSC_TRUE;
118240e23c03SJunchao Zhang PetscSFPackOpt opt;
118340e23c03SJunchao Zhang
118440e23c03SJunchao Zhang PetscFunctionBegin;
11859566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(1, &opt));
11869566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(7 * n + 2, &opt->array));
1187fcc7397dSJunchao Zhang opt->n = opt->array[0] = n;
1188fcc7397dSJunchao Zhang opt->offset = opt->array + 1;
1189fcc7397dSJunchao Zhang opt->start = opt->array + n + 2;
1190fcc7397dSJunchao Zhang opt->dx = opt->array + 2 * n + 2;
1191fcc7397dSJunchao Zhang opt->dy = opt->array + 3 * n + 2;
1192fcc7397dSJunchao Zhang opt->dz = opt->array + 4 * n + 2;
1193fcc7397dSJunchao Zhang opt->X = opt->array + 5 * n + 2;
1194fcc7397dSJunchao Zhang opt->Y = opt->array + 6 * n + 2;
1195fcc7397dSJunchao Zhang
1196fcc7397dSJunchao Zhang for (r = 0; r < n; r++) { /* For each destination rank */
1197fcc7397dSJunchao Zhang m = offset[r + 1] - offset[r]; /* Total number of indices for this rank. We want to see if m can be factored into dx*dy*dz */
1198fcc7397dSJunchao Zhang p = offset[r];
1199fcc7397dSJunchao Zhang start = idx[p]; /* First index for this rank */
1200fcc7397dSJunchao Zhang p++;
1201fcc7397dSJunchao Zhang
1202fcc7397dSJunchao Zhang /* Search in X dimension */
1203fcc7397dSJunchao Zhang for (dx = 1; dx < m; dx++, p++) {
1204fcc7397dSJunchao Zhang if (start + dx != idx[p]) break;
1205b23bfdefSJunchao Zhang }
1206b23bfdefSJunchao Zhang
1207fcc7397dSJunchao Zhang dydz = m / dx;
1208fcc7397dSJunchao Zhang X = dydz > 1 ? (idx[p] - start) : dx;
1209fcc7397dSJunchao Zhang /* Not optimizable if m is not a multiple of dx, or some unrecognized pattern is found */
12109371c9d4SSatish Balay if (m % dx || X <= 0) {
12119371c9d4SSatish Balay optimizable = PETSC_FALSE;
12129371c9d4SSatish Balay goto finish;
12139371c9d4SSatish Balay }
1214fcc7397dSJunchao Zhang for (dy = 1; dy < dydz; dy++) { /* Search in Y dimension */
1215fcc7397dSJunchao Zhang for (i = 0; i < dx; i++, p++) {
1216fcc7397dSJunchao Zhang if (start + X * dy + i != idx[p]) {
12179371c9d4SSatish Balay if (i) {
12189371c9d4SSatish Balay optimizable = PETSC_FALSE;
12199371c9d4SSatish Balay goto finish;
12209371c9d4SSatish Balay } /* The pattern is violated in the middle of an x-walk */
12219371c9d4SSatish Balay else
12229371c9d4SSatish Balay goto Z_dimension;
122340e23c03SJunchao Zhang }
122440e23c03SJunchao Zhang }
122540e23c03SJunchao Zhang }
122640e23c03SJunchao Zhang
1227fcc7397dSJunchao Zhang Z_dimension:
1228fcc7397dSJunchao Zhang dz = m / (dx * dy);
1229fcc7397dSJunchao Zhang Y = dz > 1 ? (idx[p] - start) / X : dy;
1230fcc7397dSJunchao Zhang /* Not optimizable if m is not a multiple of dx*dy, or some unrecognized pattern is found */
12319371c9d4SSatish Balay if (m % (dx * dy) || Y <= 0) {
12329371c9d4SSatish Balay optimizable = PETSC_FALSE;
12339371c9d4SSatish Balay goto finish;
12349371c9d4SSatish Balay }
1235fcc7397dSJunchao Zhang for (k = 1; k < dz; k++) { /* Go through Z dimension to see if remaining indices follow the pattern */
1236fcc7397dSJunchao Zhang for (j = 0; j < dy; j++) {
1237fcc7397dSJunchao Zhang for (i = 0; i < dx; i++, p++) {
12389371c9d4SSatish Balay if (start + X * Y * k + X * j + i != idx[p]) {
12399371c9d4SSatish Balay optimizable = PETSC_FALSE;
12409371c9d4SSatish Balay goto finish;
12419371c9d4SSatish Balay }
124240e23c03SJunchao Zhang }
124340e23c03SJunchao Zhang }
124440e23c03SJunchao Zhang }
1245fcc7397dSJunchao Zhang opt->start[r] = start;
1246fcc7397dSJunchao Zhang opt->dx[r] = dx;
1247fcc7397dSJunchao Zhang opt->dy[r] = dy;
1248fcc7397dSJunchao Zhang opt->dz[r] = dz;
1249fcc7397dSJunchao Zhang opt->X[r] = X;
1250fcc7397dSJunchao Zhang opt->Y[r] = Y;
125140e23c03SJunchao Zhang }
125240e23c03SJunchao Zhang
1253fcc7397dSJunchao Zhang finish:
1254fcc7397dSJunchao Zhang /* If not optimizable, free arrays to save memory */
1255fcc7397dSJunchao Zhang if (!n || !optimizable) {
12569566063dSJacob Faibussowitsch PetscCall(PetscFree(opt->array));
12579566063dSJacob Faibussowitsch PetscCall(PetscFree(opt));
125840e23c03SJunchao Zhang *out = NULL;
1259fcc7397dSJunchao Zhang } else {
1260fcc7397dSJunchao Zhang opt->offset[0] = 0;
1261fcc7397dSJunchao Zhang for (r = 0; r < n; r++) opt->offset[r + 1] = opt->offset[r] + opt->dx[r] * opt->dy[r] * opt->dz[r];
1262fcc7397dSJunchao Zhang *out = opt;
1263fcc7397dSJunchao Zhang }
12643ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS);
126540e23c03SJunchao Zhang }
126640e23c03SJunchao Zhang
PetscSFDestroyPackOpt(PetscSF sf,PetscMemType mtype,PetscSFPackOpt * out)1267d71ae5a4SJacob Faibussowitsch static inline PetscErrorCode PetscSFDestroyPackOpt(PetscSF sf, PetscMemType mtype, PetscSFPackOpt *out)
1268d71ae5a4SJacob Faibussowitsch {
126940e23c03SJunchao Zhang PetscSFPackOpt opt = *out;
127040e23c03SJunchao Zhang
127140e23c03SJunchao Zhang PetscFunctionBegin;
127240e23c03SJunchao Zhang if (opt) {
12739566063dSJacob Faibussowitsch PetscCall(PetscSFFree(sf, mtype, opt->array));
12749566063dSJacob Faibussowitsch PetscCall(PetscFree(opt));
127540e23c03SJunchao Zhang *out = NULL;
127640e23c03SJunchao Zhang }
12773ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS);
127840e23c03SJunchao Zhang }
1279cd620004SJunchao Zhang
PetscSFSetUpPackFields(PetscSF sf)1280d71ae5a4SJacob Faibussowitsch PetscErrorCode PetscSFSetUpPackFields(PetscSF sf)
1281d71ae5a4SJacob Faibussowitsch {
1282cd620004SJunchao Zhang PetscSF_Basic *bas = (PetscSF_Basic *)sf->data;
1283cd620004SJunchao Zhang PetscInt i, j;
1284cd620004SJunchao Zhang
1285cd620004SJunchao Zhang PetscFunctionBegin;
1286cd620004SJunchao Zhang /* [0] for PETSCSF_LOCAL and [1] for PETSCSF_REMOTE in the following */
1287cd620004SJunchao Zhang for (i = 0; i < 2; i++) { /* Set defaults */
1288cd620004SJunchao Zhang sf->leafstart[i] = 0;
1289cd620004SJunchao Zhang sf->leafcontig[i] = PETSC_TRUE;
1290cd620004SJunchao Zhang sf->leafdups[i] = PETSC_FALSE;
1291cd620004SJunchao Zhang bas->rootstart[i] = 0;
1292cd620004SJunchao Zhang bas->rootcontig[i] = PETSC_TRUE;
1293cd620004SJunchao Zhang bas->rootdups[i] = PETSC_FALSE;
1294cd620004SJunchao Zhang }
1295cd620004SJunchao Zhang
1296cd620004SJunchao Zhang sf->leafbuflen[0] = sf->roffset[sf->ndranks];
1297cd620004SJunchao Zhang sf->leafbuflen[1] = sf->roffset[sf->nranks] - sf->roffset[sf->ndranks];
1298cd620004SJunchao Zhang
1299cd620004SJunchao Zhang if (sf->leafbuflen[0]) sf->leafstart[0] = sf->rmine[0];
1300cd620004SJunchao Zhang if (sf->leafbuflen[1]) sf->leafstart[1] = sf->rmine[sf->roffset[sf->ndranks]];
1301cd620004SJunchao Zhang
1302cd620004SJunchao Zhang /* Are leaf indices for self and remote contiguous? If yes, it is best for pack/unpack */
1303cd620004SJunchao Zhang for (i = 0; i < sf->roffset[sf->ndranks]; i++) { /* self */
13049371c9d4SSatish Balay if (sf->rmine[i] != sf->leafstart[0] + i) {
13059371c9d4SSatish Balay sf->leafcontig[0] = PETSC_FALSE;
13069371c9d4SSatish Balay break;
13079371c9d4SSatish Balay }
1308cd620004SJunchao Zhang }
1309cd620004SJunchao Zhang for (i = sf->roffset[sf->ndranks], j = 0; i < sf->roffset[sf->nranks]; i++, j++) { /* remote */
13109371c9d4SSatish Balay if (sf->rmine[i] != sf->leafstart[1] + j) {
13119371c9d4SSatish Balay sf->leafcontig[1] = PETSC_FALSE;
13129371c9d4SSatish Balay break;
13139371c9d4SSatish Balay }
1314cd620004SJunchao Zhang }
1315cd620004SJunchao Zhang
1316cd620004SJunchao Zhang /* If not, see if we can have per-rank optimizations by doing index analysis */
13179566063dSJacob Faibussowitsch if (!sf->leafcontig[0]) PetscCall(PetscSFCreatePackOpt(sf->ndranks, sf->roffset, sf->rmine, &sf->leafpackopt[0]));
13189566063dSJacob Faibussowitsch if (!sf->leafcontig[1]) PetscCall(PetscSFCreatePackOpt(sf->nranks - sf->ndranks, sf->roffset + sf->ndranks, sf->rmine, &sf->leafpackopt[1]));
1319cd620004SJunchao Zhang
1320cd620004SJunchao Zhang /* Are root indices for self and remote contiguous? */
1321cd620004SJunchao Zhang bas->rootbuflen[0] = bas->ioffset[bas->ndiranks];
1322cd620004SJunchao Zhang bas->rootbuflen[1] = bas->ioffset[bas->niranks] - bas->ioffset[bas->ndiranks];
1323cd620004SJunchao Zhang
1324cd620004SJunchao Zhang if (bas->rootbuflen[0]) bas->rootstart[0] = bas->irootloc[0];
1325cd620004SJunchao Zhang if (bas->rootbuflen[1]) bas->rootstart[1] = bas->irootloc[bas->ioffset[bas->ndiranks]];
1326cd620004SJunchao Zhang
1327cd620004SJunchao Zhang for (i = 0; i < bas->ioffset[bas->ndiranks]; i++) {
13289371c9d4SSatish Balay if (bas->irootloc[i] != bas->rootstart[0] + i) {
13299371c9d4SSatish Balay bas->rootcontig[0] = PETSC_FALSE;
13309371c9d4SSatish Balay break;
13319371c9d4SSatish Balay }
1332cd620004SJunchao Zhang }
1333cd620004SJunchao Zhang for (i = bas->ioffset[bas->ndiranks], j = 0; i < bas->ioffset[bas->niranks]; i++, j++) {
13349371c9d4SSatish Balay if (bas->irootloc[i] != bas->rootstart[1] + j) {
13359371c9d4SSatish Balay bas->rootcontig[1] = PETSC_FALSE;
13369371c9d4SSatish Balay break;
13379371c9d4SSatish Balay }
1338cd620004SJunchao Zhang }
1339cd620004SJunchao Zhang
13409566063dSJacob Faibussowitsch if (!bas->rootcontig[0]) PetscCall(PetscSFCreatePackOpt(bas->ndiranks, bas->ioffset, bas->irootloc, &bas->rootpackopt[0]));
13419566063dSJacob Faibussowitsch if (!bas->rootcontig[1]) PetscCall(PetscSFCreatePackOpt(bas->niranks - bas->ndiranks, bas->ioffset + bas->ndiranks, bas->irootloc, &bas->rootpackopt[1]));
1342cd620004SJunchao Zhang
1343cd620004SJunchao Zhang /* Check dups in indices so that CUDA unpacking kernels can use cheaper regular instructions instead of atomics when they know there are no data race chances */
1344013b3241SStefano Zampini if (PetscDefined(HAVE_DEVICE)) {
1345013b3241SStefano Zampini PetscBool ismulti = (sf->multi == sf) ? PETSC_TRUE : PETSC_FALSE;
13469566063dSJacob Faibussowitsch if (!sf->leafcontig[0] && !ismulti) PetscCall(PetscCheckDupsInt(sf->leafbuflen[0], sf->rmine, &sf->leafdups[0]));
13479566063dSJacob Faibussowitsch if (!sf->leafcontig[1] && !ismulti) PetscCall(PetscCheckDupsInt(sf->leafbuflen[1], sf->rmine + sf->roffset[sf->ndranks], &sf->leafdups[1]));
13489566063dSJacob Faibussowitsch if (!bas->rootcontig[0] && !ismulti) PetscCall(PetscCheckDupsInt(bas->rootbuflen[0], bas->irootloc, &bas->rootdups[0]));
13499566063dSJacob Faibussowitsch if (!bas->rootcontig[1] && !ismulti) PetscCall(PetscCheckDupsInt(bas->rootbuflen[1], bas->irootloc + bas->ioffset[bas->ndiranks], &bas->rootdups[1]));
1350013b3241SStefano Zampini }
13513ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS);
1352cd620004SJunchao Zhang }
1353cd620004SJunchao Zhang
PetscSFResetPackFields(PetscSF sf)1354d71ae5a4SJacob Faibussowitsch PetscErrorCode PetscSFResetPackFields(PetscSF sf)
1355d71ae5a4SJacob Faibussowitsch {
1356cd620004SJunchao Zhang PetscSF_Basic *bas = (PetscSF_Basic *)sf->data;
1357cd620004SJunchao Zhang PetscInt i;
1358cd620004SJunchao Zhang
1359cd620004SJunchao Zhang PetscFunctionBegin;
1360cd620004SJunchao Zhang for (i = PETSCSF_LOCAL; i <= PETSCSF_REMOTE; i++) {
13619566063dSJacob Faibussowitsch PetscCall(PetscSFDestroyPackOpt(sf, PETSC_MEMTYPE_HOST, &sf->leafpackopt[i]));
13629566063dSJacob Faibussowitsch PetscCall(PetscSFDestroyPackOpt(sf, PETSC_MEMTYPE_HOST, &bas->rootpackopt[i]));
13637fd2d3dbSJunchao Zhang #if defined(PETSC_HAVE_DEVICE)
13649566063dSJacob Faibussowitsch PetscCall(PetscSFDestroyPackOpt(sf, PETSC_MEMTYPE_DEVICE, &sf->leafpackopt_d[i]));
13659566063dSJacob Faibussowitsch PetscCall(PetscSFDestroyPackOpt(sf, PETSC_MEMTYPE_DEVICE, &bas->rootpackopt_d[i]));
13667fd2d3dbSJunchao Zhang #endif
1367cd620004SJunchao Zhang }
13683ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS);
1369cd620004SJunchao Zhang }
1370