xref: /petsc/src/vec/is/sf/impls/basic/sfpack.c (revision 9566063d113dddea24716c546802770db7481bc0)
120c24465SJunchao Zhang #include "petsc/private/sfimpl.h"
240e23c03SJunchao Zhang #include <../src/vec/is/sf/impls/basic/sfpack.h>
340e23c03SJunchao Zhang #include <../src/vec/is/sf/impls/basic/sfbasic.h>
440e23c03SJunchao Zhang 
57fd2d3dbSJunchao Zhang /* This is a C file that contains packing facilities, with dispatches to device if enabled. */
67fd2d3dbSJunchao Zhang 
740e23c03SJunchao Zhang /*
840e23c03SJunchao Zhang  * MPI_Reduce_local is not really useful because it can't handle sparse data and it vectorizes "in the wrong direction",
940e23c03SJunchao Zhang  * therefore we pack data types manually. This file defines packing routines for the standard data types.
1040e23c03SJunchao Zhang  */
1140e23c03SJunchao Zhang 
12cd620004SJunchao Zhang #define CPPJoin4(a,b,c,d)  a##_##b##_##c##_##d
1340e23c03SJunchao Zhang 
14cd620004SJunchao Zhang /* Operations working like s += t */
15cd620004SJunchao Zhang #define OP_BINARY(op,s,t)   do {(s) = (s) op (t);  } while (0)      /* binary ops in the middle such as +, *, && etc. */
16cd620004SJunchao Zhang #define OP_FUNCTION(op,s,t) do {(s) = op((s),(t)); } while (0)      /* ops like a function, such as PetscMax, PetscMin */
17cd620004SJunchao Zhang #define OP_LXOR(op,s,t)     do {(s) = (!(s)) != (!(t));} while (0)  /* logical exclusive OR */
18cd620004SJunchao Zhang #define OP_ASSIGN(op,s,t)   do {(s) = (t);} while (0)
19cd620004SJunchao Zhang /* Ref MPI MAXLOC */
20cd620004SJunchao Zhang #define OP_XLOC(op,s,t) \
21cd620004SJunchao Zhang   do {                                       \
22cd620004SJunchao Zhang     if ((s).u == (t).u) (s).i = PetscMin((s).i,(t).i); \
23cd620004SJunchao Zhang     else if (!((s).u op (t).u)) s = t;           \
24cd620004SJunchao Zhang   } while (0)
2540e23c03SJunchao Zhang 
2640e23c03SJunchao Zhang /* DEF_PackFunc - macro defining a Pack routine
2740e23c03SJunchao Zhang 
2840e23c03SJunchao Zhang    Arguments of the macro:
29b23bfdefSJunchao Zhang    +Type      Type of the basic data in an entry, i.e., int, PetscInt, PetscReal etc. It is not the type of an entry.
30fcc7397dSJunchao Zhang    .BS        Block size for vectorization. It is a factor of bsz.
31b23bfdefSJunchao Zhang    -EQ        (bs == BS) ? 1 : 0. EQ is a compile-time const to help compiler optimizations. See below.
3240e23c03SJunchao Zhang 
3340e23c03SJunchao Zhang    Arguments of the Pack routine:
34cd620004SJunchao Zhang    +count     Number of indices in idx[].
35fcc7397dSJunchao Zhang    .start     When opt and idx are NULL, it means indices are contiguous & start is the first index; otherwise, not used.
36fcc7397dSJunchao Zhang    .opt       Per-pack optimization plan. NULL means no such plan.
37fcc7397dSJunchao Zhang    .idx       Indices of entries to packed.
38eb02082bSJunchao Zhang    .link      Provide a context for the current call, such as link->bs, number of basic types in an entry. Ex. if unit is MPI_2INT, then bs=2 and the basic type is int.
39cd620004SJunchao Zhang    .unpacked  Address of the unpacked data. The entries will be packed are unpacked[idx[i]],for i in [0,count).
40cd620004SJunchao Zhang    -packed    Address of the packed data.
4140e23c03SJunchao Zhang  */
42b23bfdefSJunchao Zhang #define DEF_PackFunc(Type,BS,EQ) \
43fcc7397dSJunchao Zhang   static PetscErrorCode CPPJoin4(Pack,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,const void *unpacked,void *packed) \
44b23bfdefSJunchao Zhang   {                                                                                                          \
45b23bfdefSJunchao Zhang     const Type     *u = (const Type*)unpacked,*u2;                                                           \
46b23bfdefSJunchao Zhang     Type           *p = (Type*)packed,*p2;                                                                   \
47fcc7397dSJunchao Zhang     PetscInt       i,j,k,X,Y,r,bs=link->bs;                                                                  \
48fcc7397dSJunchao Zhang     const PetscInt M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */          \
49b23bfdefSJunchao Zhang     const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */                \
5040e23c03SJunchao Zhang     PetscFunctionBegin;                                                                                      \
51*9566063dSJacob Faibussowitsch     if (!idx) PetscCall(PetscArraycpy(p,u+start*MBS,MBS*count));/* idx[] are contiguous */                     \
52fcc7397dSJunchao Zhang     else if (opt) { /* has optimizations available */                                                        \
53fcc7397dSJunchao Zhang       p2 = p;                                                                                                \
54fcc7397dSJunchao Zhang       for (r=0; r<opt->n; r++) {                                                                             \
55fcc7397dSJunchao Zhang         u2 = u + opt->start[r]*MBS;                                                                          \
56fcc7397dSJunchao Zhang         X  = opt->X[r];                                                                                      \
57fcc7397dSJunchao Zhang         Y  = opt->Y[r];                                                                                      \
58fcc7397dSJunchao Zhang         for (k=0; k<opt->dz[r]; k++)                                                                         \
59fcc7397dSJunchao Zhang           for (j=0; j<opt->dy[r]; j++) {                                                                     \
60*9566063dSJacob Faibussowitsch             PetscCall(PetscArraycpy(p2,u2+(X*Y*k+X*j)*MBS,opt->dx[r]*MBS));                                    \
61fcc7397dSJunchao Zhang             p2  += opt->dx[r]*MBS;                                                                           \
62fcc7397dSJunchao Zhang           }                                                                                                  \
63fcc7397dSJunchao Zhang       }                                                                                                      \
64fcc7397dSJunchao Zhang     } else {                                                                                                 \
65b23bfdefSJunchao Zhang       for (i=0; i<count; i++)                                                                                \
66eb02082bSJunchao Zhang         for (j=0; j<M; j++)     /* Decent compilers should eliminate this loop when M = const 1 */           \
67eb02082bSJunchao Zhang           for (k=0; k<BS; k++)  /* Compiler either unrolls (BS=1) or vectorizes (BS=2,4,8,etc) this loop */  \
68b23bfdefSJunchao Zhang             p[i*MBS+j*BS+k] = u[idx[i]*MBS+j*BS+k];                                                          \
6940e23c03SJunchao Zhang     }                                                                                                        \
7040e23c03SJunchao Zhang     PetscFunctionReturn(0);                                                                                  \
7140e23c03SJunchao Zhang   }
7240e23c03SJunchao Zhang 
73cd620004SJunchao Zhang /* DEF_Action - macro defining a UnpackAndInsert routine that unpacks data from a contiguous buffer
74cd620004SJunchao Zhang                 and inserts into a sparse array.
7540e23c03SJunchao Zhang 
7640e23c03SJunchao Zhang    Arguments:
77b23bfdefSJunchao Zhang   .Type       Type of the data
7840e23c03SJunchao Zhang   .BS         Block size for vectorization
79b23bfdefSJunchao Zhang   .EQ        (bs == BS) ? 1 : 0. EQ is a compile-time const.
8040e23c03SJunchao Zhang 
8140e23c03SJunchao Zhang   Notes:
8240e23c03SJunchao Zhang    This macro is not combined with DEF_ActionAndOp because we want to use memcpy in this macro.
8340e23c03SJunchao Zhang  */
84cd620004SJunchao Zhang #define DEF_UnpackFunc(Type,BS,EQ)               \
85fcc7397dSJunchao Zhang   static PetscErrorCode CPPJoin4(UnpackAndInsert,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,void *unpacked,const void *packed) \
86b23bfdefSJunchao Zhang   {                                                                                                          \
87b23bfdefSJunchao Zhang     Type           *u = (Type*)unpacked,*u2;                                                                 \
88fcc7397dSJunchao Zhang     const Type     *p = (const Type*)packed;                                                                 \
89fcc7397dSJunchao Zhang     PetscInt       i,j,k,X,Y,r,bs=link->bs;                                                                  \
90fcc7397dSJunchao Zhang     const PetscInt M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */          \
91b23bfdefSJunchao Zhang     const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */                \
9240e23c03SJunchao Zhang     PetscFunctionBegin;                                                                                      \
93b23bfdefSJunchao Zhang     if (!idx) {                                                                                              \
94fcc7397dSJunchao Zhang       u += start*MBS;                                                                                        \
95*9566063dSJacob Faibussowitsch       if (u != p) PetscCall(PetscArraycpy(u,p,count*MBS));                                                     \
96fcc7397dSJunchao Zhang     } else if (opt) { /* has optimizations available */                                                      \
97fcc7397dSJunchao Zhang       for (r=0; r<opt->n; r++) {                                                                             \
98fcc7397dSJunchao Zhang         u2 = u + opt->start[r]*MBS;                                                                          \
99fcc7397dSJunchao Zhang         X  = opt->X[r];                                                                                      \
100fcc7397dSJunchao Zhang         Y  = opt->Y[r];                                                                                      \
101fcc7397dSJunchao Zhang         for (k=0; k<opt->dz[r]; k++)                                                                         \
102fcc7397dSJunchao Zhang           for (j=0; j<opt->dy[r]; j++) {                                                                     \
103*9566063dSJacob Faibussowitsch             PetscCall(PetscArraycpy(u2+(X*Y*k+X*j)*MBS,p,opt->dx[r]*MBS));                                     \
104fcc7397dSJunchao Zhang             p   += opt->dx[r]*MBS;                                                                           \
105fcc7397dSJunchao Zhang           }                                                                                                  \
106fcc7397dSJunchao Zhang       }                                                                                                      \
107fcc7397dSJunchao Zhang     } else {                                                                                                 \
108b23bfdefSJunchao Zhang       for (i=0; i<count; i++)                                                                                \
109b23bfdefSJunchao Zhang         for (j=0; j<M; j++)                                                                                  \
110cd620004SJunchao Zhang           for (k=0; k<BS; k++) u[idx[i]*MBS+j*BS+k] = p[i*MBS+j*BS+k];                                       \
11140e23c03SJunchao Zhang     }                                                                                                        \
11240e23c03SJunchao Zhang     PetscFunctionReturn(0);                                                                                  \
11340e23c03SJunchao Zhang   }
11440e23c03SJunchao Zhang 
115cd620004SJunchao Zhang /* DEF_UnpackAndOp - macro defining a UnpackAndOp routine where Op should not be Insert
11640e23c03SJunchao Zhang 
11740e23c03SJunchao Zhang    Arguments:
118cd620004SJunchao Zhang   +Opname     Name of the Op, such as Add, Mult, LAND, etc.
119b23bfdefSJunchao Zhang   .Type       Type of the data
12040e23c03SJunchao Zhang   .BS         Block size for vectorization
121b23bfdefSJunchao Zhang   .EQ         (bs == BS) ? 1 : 0. EQ is a compile-time const.
122cd620004SJunchao Zhang   .Op         Operator for the op, such as +, *, &&, ||, PetscMax, PetscMin, etc.
123cd620004SJunchao Zhang   .OpApply    Macro defining application of the op. Could be OP_BINARY, OP_FUNCTION, OP_LXOR
12440e23c03SJunchao Zhang  */
125cd620004SJunchao Zhang #define DEF_UnpackAndOp(Type,BS,EQ,Opname,Op,OpApply) \
126fcc7397dSJunchao Zhang   static PetscErrorCode CPPJoin4(UnpackAnd##Opname,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,void *unpacked,const void *packed) \
127b23bfdefSJunchao Zhang   {                                                                                                          \
128cd620004SJunchao Zhang     Type           *u = (Type*)unpacked,*u2;                                                                 \
129fcc7397dSJunchao Zhang     const Type     *p = (const Type*)packed;                                                                 \
130fcc7397dSJunchao Zhang     PetscInt       i,j,k,X,Y,r,bs=link->bs;                                                                  \
131fcc7397dSJunchao Zhang     const PetscInt M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */          \
132b23bfdefSJunchao Zhang     const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */                \
13340e23c03SJunchao Zhang     PetscFunctionBegin;                                                                                      \
134b23bfdefSJunchao Zhang     if (!idx) {                                                                                              \
135fcc7397dSJunchao Zhang       u += start*MBS;                                                                                        \
136cd620004SJunchao Zhang       for (i=0; i<count; i++)                                                                                \
137cd620004SJunchao Zhang         for (j=0; j<M; j++)                                                                                  \
138cd620004SJunchao Zhang           for (k=0; k<BS; k++)                                                                               \
139cd620004SJunchao Zhang             OpApply(Op,u[i*MBS+j*BS+k],p[i*MBS+j*BS+k]);                                                     \
140fcc7397dSJunchao Zhang     } else if (opt) { /* idx[] has patterns */                                                               \
141fcc7397dSJunchao Zhang       for (r=0; r<opt->n; r++) {                                                                             \
142fcc7397dSJunchao Zhang         u2 = u + opt->start[r]*MBS;                                                                          \
143fcc7397dSJunchao Zhang         X  = opt->X[r];                                                                                      \
144fcc7397dSJunchao Zhang         Y  = opt->Y[r];                                                                                      \
145fcc7397dSJunchao Zhang         for (k=0; k<opt->dz[r]; k++)                                                                         \
146fcc7397dSJunchao Zhang           for (j=0; j<opt->dy[r]; j++) {                                                                     \
147fcc7397dSJunchao Zhang             for (i=0; i<opt->dx[r]*MBS; i++) OpApply(Op,u2[(X*Y*k+X*j)*MBS+i],p[i]);                         \
148fcc7397dSJunchao Zhang             p += opt->dx[r]*MBS;                                                                             \
149fcc7397dSJunchao Zhang           }                                                                                                  \
150fcc7397dSJunchao Zhang       }                                                                                                      \
151fcc7397dSJunchao Zhang     } else {                                                                                                 \
152cd620004SJunchao Zhang       for (i=0; i<count; i++)                                                                                \
153cd620004SJunchao Zhang         for (j=0; j<M; j++)                                                                                  \
154cd620004SJunchao Zhang           for (k=0; k<BS; k++)                                                                               \
155cd620004SJunchao Zhang             OpApply(Op,u[idx[i]*MBS+j*BS+k],p[i*MBS+j*BS+k]);                                                \
156cd620004SJunchao Zhang     }                                                                                                        \
157cd620004SJunchao Zhang     PetscFunctionReturn(0);                                                                                  \
158cd620004SJunchao Zhang   }
159cd620004SJunchao Zhang 
160cd620004SJunchao Zhang #define DEF_FetchAndOp(Type,BS,EQ,Opname,Op,OpApply) \
161fcc7397dSJunchao Zhang   static PetscErrorCode CPPJoin4(FetchAnd##Opname,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,void *unpacked,void *packed) \
162cd620004SJunchao Zhang   {                                                                                                          \
163fcc7397dSJunchao Zhang     Type           *u = (Type*)unpacked,*p = (Type*)packed,tmp;                                              \
164fcc7397dSJunchao Zhang     PetscInt       i,j,k,r,l,bs=link->bs;                                                                    \
165fcc7397dSJunchao Zhang     const PetscInt M = (EQ) ? 1 : bs/BS;                                                                     \
166fcc7397dSJunchao Zhang     const PetscInt MBS = M*BS;                                                                               \
167cd620004SJunchao Zhang     PetscFunctionBegin;                                                                                      \
168fcc7397dSJunchao Zhang     for (i=0; i<count; i++) {                                                                                \
169fcc7397dSJunchao Zhang       r = (!idx ? start+i : idx[i])*MBS;                                                                     \
170fcc7397dSJunchao Zhang       l = i*MBS;                                                                                             \
171b23bfdefSJunchao Zhang       for (j=0; j<M; j++)                                                                                    \
172b23bfdefSJunchao Zhang         for (k=0; k<BS; k++) {                                                                               \
173fcc7397dSJunchao Zhang           tmp = u[r+j*BS+k];                                                                                 \
174fcc7397dSJunchao Zhang           OpApply(Op,u[r+j*BS+k],p[l+j*BS+k]);                                                               \
175fcc7397dSJunchao Zhang           p[l+j*BS+k] = tmp;                                                                                 \
176cd620004SJunchao Zhang         }                                                                                                    \
177cd620004SJunchao Zhang     }                                                                                                        \
178cd620004SJunchao Zhang     PetscFunctionReturn(0);                                                                                  \
179cd620004SJunchao Zhang   }
180cd620004SJunchao Zhang 
181cd620004SJunchao Zhang #define DEF_ScatterAndOp(Type,BS,EQ,Opname,Op,OpApply) \
182fcc7397dSJunchao Zhang   static PetscErrorCode CPPJoin4(ScatterAnd##Opname,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt srcStart,PetscSFPackOpt srcOpt,const PetscInt *srcIdx,const void *src,PetscInt dstStart,PetscSFPackOpt dstOpt,const PetscInt *dstIdx,void *dst) \
183cd620004SJunchao Zhang   {                                                                                                          \
184fcc7397dSJunchao Zhang     const Type     *u = (const Type*)src;                                                                    \
185fcc7397dSJunchao Zhang     Type           *v = (Type*)dst;                                                                          \
186fcc7397dSJunchao Zhang     PetscInt       i,j,k,s,t,X,Y,bs = link->bs;                                                              \
187cd620004SJunchao Zhang     const PetscInt M = (EQ) ? 1 : bs/BS;                                                                     \
188cd620004SJunchao Zhang     const PetscInt MBS = M*BS;                                                                               \
189cd620004SJunchao Zhang     PetscFunctionBegin;                                                                                      \
190fcc7397dSJunchao Zhang     if (!srcIdx) { /* src is contiguous */                                                                   \
191fcc7397dSJunchao Zhang       u += srcStart*MBS;                                                                                     \
192*9566063dSJacob Faibussowitsch       PetscCall(CPPJoin4(UnpackAnd##Opname,Type,BS,EQ)(link,count,dstStart,dstOpt,dstIdx,dst,u));              \
193fcc7397dSJunchao Zhang     } else if (srcOpt && !dstIdx) { /* src is 3D, dst is contiguous */                                       \
194fcc7397dSJunchao Zhang       u += srcOpt->start[0]*MBS;                                                                             \
195fcc7397dSJunchao Zhang       v += dstStart*MBS;                                                                                     \
196fcc7397dSJunchao Zhang       X  = srcOpt->X[0]; Y = srcOpt->Y[0];                                                                   \
197fcc7397dSJunchao Zhang       for (k=0; k<srcOpt->dz[0]; k++)                                                                        \
198fcc7397dSJunchao Zhang         for (j=0; j<srcOpt->dy[0]; j++) {                                                                    \
199fcc7397dSJunchao Zhang           for (i=0; i<srcOpt->dx[0]*MBS; i++) OpApply(Op,v[i],u[(X*Y*k+X*j)*MBS+i]);                         \
200fcc7397dSJunchao Zhang           v += srcOpt->dx[0]*MBS;                                                                            \
201fcc7397dSJunchao Zhang         }                                                                                                    \
202fcc7397dSJunchao Zhang     } else { /* all other cases */                                                                           \
203fcc7397dSJunchao Zhang       for (i=0; i<count; i++) {                                                                              \
204fcc7397dSJunchao Zhang         s = (!srcIdx ? srcStart+i : srcIdx[i])*MBS;                                                          \
205fcc7397dSJunchao Zhang         t = (!dstIdx ? dstStart+i : dstIdx[i])*MBS;                                                          \
206cd620004SJunchao Zhang         for (j=0; j<M; j++)                                                                                  \
207fcc7397dSJunchao Zhang           for (k=0; k<BS; k++) OpApply(Op,v[t+j*BS+k],u[s+j*BS+k]);                                          \
208fcc7397dSJunchao Zhang       }                                                                                                      \
209cd620004SJunchao Zhang     }                                                                                                        \
210cd620004SJunchao Zhang     PetscFunctionReturn(0);                                                                                  \
211cd620004SJunchao Zhang   }
212cd620004SJunchao Zhang 
213cd620004SJunchao Zhang #define DEF_FetchAndOpLocal(Type,BS,EQ,Opname,Op,OpApply) \
214fcc7397dSJunchao Zhang   static PetscErrorCode CPPJoin4(FetchAnd##Opname##Local,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt rootstart,PetscSFPackOpt rootopt,const PetscInt *rootidx,void *rootdata,PetscInt leafstart,PetscSFPackOpt leafopt,const PetscInt *leafidx,const void *leafdata,void *leafupdate) \
215cd620004SJunchao Zhang   {                                                                                                          \
216fcc7397dSJunchao Zhang     Type           *rdata = (Type*)rootdata,*lupdate = (Type*)leafupdate;                                    \
217fcc7397dSJunchao Zhang     const Type     *ldata = (const Type*)leafdata;                                                           \
218fcc7397dSJunchao Zhang     PetscInt       i,j,k,r,l,bs = link->bs;                                                                  \
219cd620004SJunchao Zhang     const PetscInt M = (EQ) ? 1 : bs/BS;                                                                     \
220cd620004SJunchao Zhang     const PetscInt MBS = M*BS;                                                                               \
221cd620004SJunchao Zhang     PetscFunctionBegin;                                                                                      \
222fcc7397dSJunchao Zhang     for (i=0; i<count; i++) {                                                                                \
223fcc7397dSJunchao Zhang       r = (rootidx ? rootidx[i] : rootstart+i)*MBS;                                                          \
224fcc7397dSJunchao Zhang       l = (leafidx ? leafidx[i] : leafstart+i)*MBS;                                                          \
225cd620004SJunchao Zhang       for (j=0; j<M; j++)                                                                                    \
226cd620004SJunchao Zhang         for (k=0; k<BS; k++) {                                                                               \
227fcc7397dSJunchao Zhang           lupdate[l+j*BS+k] = rdata[r+j*BS+k];                                                               \
228fcc7397dSJunchao Zhang           OpApply(Op,rdata[r+j*BS+k],ldata[l+j*BS+k]);                                                       \
22940e23c03SJunchao Zhang         }                                                                                                    \
23040e23c03SJunchao Zhang     }                                                                                                        \
23140e23c03SJunchao Zhang     PetscFunctionReturn(0);                                                                                  \
23240e23c03SJunchao Zhang   }
23340e23c03SJunchao Zhang 
234b23bfdefSJunchao Zhang /* Pack, Unpack/Fetch ops */
235b23bfdefSJunchao Zhang #define DEF_Pack(Type,BS,EQ)                                                      \
236b23bfdefSJunchao Zhang   DEF_PackFunc(Type,BS,EQ)                                                        \
237cd620004SJunchao Zhang   DEF_UnpackFunc(Type,BS,EQ)                                                      \
238cd620004SJunchao Zhang   DEF_ScatterAndOp(Type,BS,EQ,Insert,=,OP_ASSIGN)                                 \
239cd620004SJunchao Zhang   static void CPPJoin4(PackInit_Pack,Type,BS,EQ)(PetscSFLink link) {              \
240eb02082bSJunchao Zhang     link->h_Pack            = CPPJoin4(Pack,           Type,BS,EQ);               \
241eb02082bSJunchao Zhang     link->h_UnpackAndInsert = CPPJoin4(UnpackAndInsert,Type,BS,EQ);               \
242cd620004SJunchao Zhang     link->h_ScatterAndInsert= CPPJoin4(ScatterAndInsert,Type,BS,EQ);              \
24340e23c03SJunchao Zhang   }
24440e23c03SJunchao Zhang 
245b23bfdefSJunchao Zhang /* Add, Mult ops */
246b23bfdefSJunchao Zhang #define DEF_Add(Type,BS,EQ)                                                       \
247cd620004SJunchao Zhang   DEF_UnpackAndOp    (Type,BS,EQ,Add, +,OP_BINARY)                                \
248cd620004SJunchao Zhang   DEF_UnpackAndOp    (Type,BS,EQ,Mult,*,OP_BINARY)                                \
249cd620004SJunchao Zhang   DEF_FetchAndOp     (Type,BS,EQ,Add, +,OP_BINARY)                                \
250cd620004SJunchao Zhang   DEF_ScatterAndOp   (Type,BS,EQ,Add, +,OP_BINARY)                                \
251cd620004SJunchao Zhang   DEF_ScatterAndOp   (Type,BS,EQ,Mult,*,OP_BINARY)                                \
252cd620004SJunchao Zhang   DEF_FetchAndOpLocal(Type,BS,EQ,Add, +,OP_BINARY)                                \
253cd620004SJunchao Zhang   static void CPPJoin4(PackInit_Add,Type,BS,EQ)(PetscSFLink link) {               \
254eb02082bSJunchao Zhang     link->h_UnpackAndAdd     = CPPJoin4(UnpackAndAdd,    Type,BS,EQ);             \
255eb02082bSJunchao Zhang     link->h_UnpackAndMult    = CPPJoin4(UnpackAndMult,   Type,BS,EQ);             \
256eb02082bSJunchao Zhang     link->h_FetchAndAdd      = CPPJoin4(FetchAndAdd,     Type,BS,EQ);             \
257cd620004SJunchao Zhang     link->h_ScatterAndAdd    = CPPJoin4(ScatterAndAdd,   Type,BS,EQ);             \
258cd620004SJunchao Zhang     link->h_ScatterAndMult   = CPPJoin4(ScatterAndMult,  Type,BS,EQ);             \
259cd620004SJunchao Zhang     link->h_FetchAndAddLocal = CPPJoin4(FetchAndAddLocal,Type,BS,EQ);             \
26040e23c03SJunchao Zhang   }
26140e23c03SJunchao Zhang 
262b23bfdefSJunchao Zhang /* Max, Min ops */
263b23bfdefSJunchao Zhang #define DEF_Cmp(Type,BS,EQ)                                                       \
264cd620004SJunchao Zhang   DEF_UnpackAndOp (Type,BS,EQ,Max,PetscMax,OP_FUNCTION)                           \
265cd620004SJunchao Zhang   DEF_UnpackAndOp (Type,BS,EQ,Min,PetscMin,OP_FUNCTION)                           \
266cd620004SJunchao Zhang   DEF_ScatterAndOp(Type,BS,EQ,Max,PetscMax,OP_FUNCTION)                           \
267cd620004SJunchao Zhang   DEF_ScatterAndOp(Type,BS,EQ,Min,PetscMin,OP_FUNCTION)                           \
268cd620004SJunchao Zhang   static void CPPJoin4(PackInit_Compare,Type,BS,EQ)(PetscSFLink link) {           \
269eb02082bSJunchao Zhang     link->h_UnpackAndMax    = CPPJoin4(UnpackAndMax,   Type,BS,EQ);               \
270eb02082bSJunchao Zhang     link->h_UnpackAndMin    = CPPJoin4(UnpackAndMin,   Type,BS,EQ);               \
271cd620004SJunchao Zhang     link->h_ScatterAndMax   = CPPJoin4(ScatterAndMax,  Type,BS,EQ);               \
272cd620004SJunchao Zhang     link->h_ScatterAndMin   = CPPJoin4(ScatterAndMin,  Type,BS,EQ);               \
273b23bfdefSJunchao Zhang   }
274b23bfdefSJunchao Zhang 
275b23bfdefSJunchao Zhang /* Logical ops.
276cd620004SJunchao Zhang   The operator in OP_LXOR should be empty but is ||. It is not used. Put here to avoid
27740e23c03SJunchao Zhang   the compilation warning "empty macro arguments are undefined in ISO C90"
27840e23c03SJunchao Zhang  */
279b23bfdefSJunchao Zhang #define DEF_Log(Type,BS,EQ)                                                       \
280cd620004SJunchao Zhang   DEF_UnpackAndOp (Type,BS,EQ,LAND,&&,OP_BINARY)                                  \
281cd620004SJunchao Zhang   DEF_UnpackAndOp (Type,BS,EQ,LOR, ||,OP_BINARY)                                  \
282cd620004SJunchao Zhang   DEF_UnpackAndOp (Type,BS,EQ,LXOR,||, OP_LXOR)                                   \
283cd620004SJunchao Zhang   DEF_ScatterAndOp(Type,BS,EQ,LAND,&&,OP_BINARY)                                  \
284cd620004SJunchao Zhang   DEF_ScatterAndOp(Type,BS,EQ,LOR, ||,OP_BINARY)                                  \
285cd620004SJunchao Zhang   DEF_ScatterAndOp(Type,BS,EQ,LXOR,||, OP_LXOR)                                   \
286cd620004SJunchao Zhang   static void CPPJoin4(PackInit_Logical,Type,BS,EQ)(PetscSFLink link) {           \
287eb02082bSJunchao Zhang     link->h_UnpackAndLAND   = CPPJoin4(UnpackAndLAND, Type,BS,EQ);                \
288eb02082bSJunchao Zhang     link->h_UnpackAndLOR    = CPPJoin4(UnpackAndLOR,  Type,BS,EQ);                \
289eb02082bSJunchao Zhang     link->h_UnpackAndLXOR   = CPPJoin4(UnpackAndLXOR, Type,BS,EQ);                \
290cd620004SJunchao Zhang     link->h_ScatterAndLAND  = CPPJoin4(ScatterAndLAND,Type,BS,EQ);                \
291cd620004SJunchao Zhang     link->h_ScatterAndLOR   = CPPJoin4(ScatterAndLOR, Type,BS,EQ);                \
292cd620004SJunchao Zhang     link->h_ScatterAndLXOR  = CPPJoin4(ScatterAndLXOR,Type,BS,EQ);                \
29340e23c03SJunchao Zhang   }
29440e23c03SJunchao Zhang 
295b23bfdefSJunchao Zhang /* Bitwise ops */
296b23bfdefSJunchao Zhang #define DEF_Bit(Type,BS,EQ)                                                       \
297cd620004SJunchao Zhang   DEF_UnpackAndOp (Type,BS,EQ,BAND,&,OP_BINARY)                                   \
298cd620004SJunchao Zhang   DEF_UnpackAndOp (Type,BS,EQ,BOR, |,OP_BINARY)                                   \
299cd620004SJunchao Zhang   DEF_UnpackAndOp (Type,BS,EQ,BXOR,^,OP_BINARY)                                   \
300cd620004SJunchao Zhang   DEF_ScatterAndOp(Type,BS,EQ,BAND,&,OP_BINARY)                                   \
301cd620004SJunchao Zhang   DEF_ScatterAndOp(Type,BS,EQ,BOR, |,OP_BINARY)                                   \
302cd620004SJunchao Zhang   DEF_ScatterAndOp(Type,BS,EQ,BXOR,^,OP_BINARY)                                   \
303cd620004SJunchao Zhang   static void CPPJoin4(PackInit_Bitwise,Type,BS,EQ)(PetscSFLink link) {           \
304eb02082bSJunchao Zhang     link->h_UnpackAndBAND   = CPPJoin4(UnpackAndBAND, Type,BS,EQ);                \
305eb02082bSJunchao Zhang     link->h_UnpackAndBOR    = CPPJoin4(UnpackAndBOR,  Type,BS,EQ);                \
306eb02082bSJunchao Zhang     link->h_UnpackAndBXOR   = CPPJoin4(UnpackAndBXOR, Type,BS,EQ);                \
307cd620004SJunchao Zhang     link->h_ScatterAndBAND  = CPPJoin4(ScatterAndBAND,Type,BS,EQ);                \
308cd620004SJunchao Zhang     link->h_ScatterAndBOR   = CPPJoin4(ScatterAndBOR, Type,BS,EQ);                \
309cd620004SJunchao Zhang     link->h_ScatterAndBXOR  = CPPJoin4(ScatterAndBXOR,Type,BS,EQ);                \
31040e23c03SJunchao Zhang   }
31140e23c03SJunchao Zhang 
312cd620004SJunchao Zhang /* Maxloc, Minloc ops */
313cd620004SJunchao Zhang #define DEF_Xloc(Type,BS,EQ)                                                      \
314cd620004SJunchao Zhang   DEF_UnpackAndOp (Type,BS,EQ,Max,>,OP_XLOC)                                      \
315cd620004SJunchao Zhang   DEF_UnpackAndOp (Type,BS,EQ,Min,<,OP_XLOC)                                      \
316cd620004SJunchao Zhang   DEF_ScatterAndOp(Type,BS,EQ,Max,>,OP_XLOC)                                      \
317cd620004SJunchao Zhang   DEF_ScatterAndOp(Type,BS,EQ,Min,<,OP_XLOC)                                      \
318cd620004SJunchao Zhang   static void CPPJoin4(PackInit_Xloc,Type,BS,EQ)(PetscSFLink link) {              \
319cd620004SJunchao Zhang     link->h_UnpackAndMaxloc  = CPPJoin4(UnpackAndMax, Type,BS,EQ);                \
320cd620004SJunchao Zhang     link->h_UnpackAndMinloc  = CPPJoin4(UnpackAndMin, Type,BS,EQ);                \
321cd620004SJunchao Zhang     link->h_ScatterAndMaxloc = CPPJoin4(ScatterAndMax,Type,BS,EQ);                \
322cd620004SJunchao Zhang     link->h_ScatterAndMinloc = CPPJoin4(ScatterAndMin,Type,BS,EQ);                \
32340e23c03SJunchao Zhang   }
32440e23c03SJunchao Zhang 
325b23bfdefSJunchao Zhang #define DEF_IntegerType(Type,BS,EQ)                                               \
326b23bfdefSJunchao Zhang   DEF_Pack(Type,BS,EQ)                                                            \
327b23bfdefSJunchao Zhang   DEF_Add(Type,BS,EQ)                                                             \
328b23bfdefSJunchao Zhang   DEF_Cmp(Type,BS,EQ)                                                             \
329b23bfdefSJunchao Zhang   DEF_Log(Type,BS,EQ)                                                             \
330b23bfdefSJunchao Zhang   DEF_Bit(Type,BS,EQ)                                                             \
331cd620004SJunchao Zhang   static void CPPJoin4(PackInit_IntegerType,Type,BS,EQ)(PetscSFLink link) {       \
332b23bfdefSJunchao Zhang     CPPJoin4(PackInit_Pack,Type,BS,EQ)(link);                                     \
333b23bfdefSJunchao Zhang     CPPJoin4(PackInit_Add,Type,BS,EQ)(link);                                      \
334b23bfdefSJunchao Zhang     CPPJoin4(PackInit_Compare,Type,BS,EQ)(link);                                  \
335b23bfdefSJunchao Zhang     CPPJoin4(PackInit_Logical,Type,BS,EQ)(link);                                  \
336b23bfdefSJunchao Zhang     CPPJoin4(PackInit_Bitwise,Type,BS,EQ)(link);                                  \
33740e23c03SJunchao Zhang   }
33840e23c03SJunchao Zhang 
339b23bfdefSJunchao Zhang #define DEF_RealType(Type,BS,EQ)                                                  \
340b23bfdefSJunchao Zhang   DEF_Pack(Type,BS,EQ)                                                            \
341b23bfdefSJunchao Zhang   DEF_Add(Type,BS,EQ)                                                             \
342b23bfdefSJunchao Zhang   DEF_Cmp(Type,BS,EQ)                                                             \
343cd620004SJunchao Zhang   static void CPPJoin4(PackInit_RealType,Type,BS,EQ)(PetscSFLink link) {          \
344b23bfdefSJunchao Zhang     CPPJoin4(PackInit_Pack,Type,BS,EQ)(link);                                     \
345b23bfdefSJunchao Zhang     CPPJoin4(PackInit_Add,Type,BS,EQ)(link);                                      \
346b23bfdefSJunchao Zhang     CPPJoin4(PackInit_Compare,Type,BS,EQ)(link);                                  \
347b23bfdefSJunchao Zhang   }
34840e23c03SJunchao Zhang 
34940e23c03SJunchao Zhang #if defined(PETSC_HAVE_COMPLEX)
350b23bfdefSJunchao Zhang #define DEF_ComplexType(Type,BS,EQ)                                               \
351b23bfdefSJunchao Zhang   DEF_Pack(Type,BS,EQ)                                                            \
352b23bfdefSJunchao Zhang   DEF_Add(Type,BS,EQ)                                                             \
353cd620004SJunchao Zhang   static void CPPJoin4(PackInit_ComplexType,Type,BS,EQ)(PetscSFLink link) {       \
354b23bfdefSJunchao Zhang     CPPJoin4(PackInit_Pack,Type,BS,EQ)(link);                                     \
355b23bfdefSJunchao Zhang     CPPJoin4(PackInit_Add,Type,BS,EQ)(link);                                      \
356b23bfdefSJunchao Zhang   }
35740e23c03SJunchao Zhang #endif
358b23bfdefSJunchao Zhang 
359b23bfdefSJunchao Zhang #define DEF_DumbType(Type,BS,EQ)                                                  \
360b23bfdefSJunchao Zhang   DEF_Pack(Type,BS,EQ)                                                            \
361cd620004SJunchao Zhang   static void CPPJoin4(PackInit_DumbType,Type,BS,EQ)(PetscSFLink link) {          \
362b23bfdefSJunchao Zhang     CPPJoin4(PackInit_Pack,Type,BS,EQ)(link);                                     \
363b23bfdefSJunchao Zhang   }
364b23bfdefSJunchao Zhang 
365b23bfdefSJunchao Zhang /* Maxloc, Minloc */
366cd620004SJunchao Zhang #define DEF_PairType(Type,BS,EQ)                                                  \
367cd620004SJunchao Zhang   DEF_Pack(Type,BS,EQ)                                                            \
368cd620004SJunchao Zhang   DEF_Xloc(Type,BS,EQ)                                                            \
369cd620004SJunchao Zhang   static void CPPJoin4(PackInit_PairType,Type,BS,EQ)(PetscSFLink link) {          \
370cd620004SJunchao Zhang     CPPJoin4(PackInit_Pack,Type,BS,EQ)(link);                                     \
371cd620004SJunchao Zhang     CPPJoin4(PackInit_Xloc,Type,BS,EQ)(link);                                     \
372b23bfdefSJunchao Zhang   }
373b23bfdefSJunchao Zhang 
374b23bfdefSJunchao Zhang DEF_IntegerType(PetscInt,1,1) /* unit = 1 MPIU_INT  */
375b23bfdefSJunchao Zhang DEF_IntegerType(PetscInt,2,1) /* unit = 2 MPIU_INTs */
376b23bfdefSJunchao Zhang DEF_IntegerType(PetscInt,4,1) /* unit = 4 MPIU_INTs */
377b23bfdefSJunchao Zhang DEF_IntegerType(PetscInt,8,1) /* unit = 8 MPIU_INTs */
378b23bfdefSJunchao Zhang DEF_IntegerType(PetscInt,1,0) /* unit = 1*n MPIU_INTs, n>1 */
379b23bfdefSJunchao Zhang DEF_IntegerType(PetscInt,2,0) /* unit = 2*n MPIU_INTs, n>1 */
380b23bfdefSJunchao Zhang DEF_IntegerType(PetscInt,4,0) /* unit = 4*n MPIU_INTs, n>1 */
381b23bfdefSJunchao Zhang DEF_IntegerType(PetscInt,8,0) /* unit = 8*n MPIU_INTs, n>1. Routines with bigger BS are tried first. */
382b23bfdefSJunchao Zhang 
383b23bfdefSJunchao Zhang #if defined(PETSC_USE_64BIT_INDICES) /* Do not need (though it is OK) to generate redundant functions if PetscInt is int */
384b23bfdefSJunchao Zhang DEF_IntegerType(int,1,1)
385b23bfdefSJunchao Zhang DEF_IntegerType(int,2,1)
386b23bfdefSJunchao Zhang DEF_IntegerType(int,4,1)
387b23bfdefSJunchao Zhang DEF_IntegerType(int,8,1)
388b23bfdefSJunchao Zhang DEF_IntegerType(int,1,0)
389b23bfdefSJunchao Zhang DEF_IntegerType(int,2,0)
390b23bfdefSJunchao Zhang DEF_IntegerType(int,4,0)
391b23bfdefSJunchao Zhang DEF_IntegerType(int,8,0)
392b23bfdefSJunchao Zhang #endif
393b23bfdefSJunchao Zhang 
394b23bfdefSJunchao Zhang /* The typedefs are used to get a typename without space that CPPJoin can handle */
395b23bfdefSJunchao Zhang typedef signed char SignedChar;
396b23bfdefSJunchao Zhang DEF_IntegerType(SignedChar,1,1)
397b23bfdefSJunchao Zhang DEF_IntegerType(SignedChar,2,1)
398b23bfdefSJunchao Zhang DEF_IntegerType(SignedChar,4,1)
399b23bfdefSJunchao Zhang DEF_IntegerType(SignedChar,8,1)
400b23bfdefSJunchao Zhang DEF_IntegerType(SignedChar,1,0)
401b23bfdefSJunchao Zhang DEF_IntegerType(SignedChar,2,0)
402b23bfdefSJunchao Zhang DEF_IntegerType(SignedChar,4,0)
403b23bfdefSJunchao Zhang DEF_IntegerType(SignedChar,8,0)
404b23bfdefSJunchao Zhang 
405b23bfdefSJunchao Zhang typedef unsigned char UnsignedChar;
406b23bfdefSJunchao Zhang DEF_IntegerType(UnsignedChar,1,1)
407b23bfdefSJunchao Zhang DEF_IntegerType(UnsignedChar,2,1)
408b23bfdefSJunchao Zhang DEF_IntegerType(UnsignedChar,4,1)
409b23bfdefSJunchao Zhang DEF_IntegerType(UnsignedChar,8,1)
410b23bfdefSJunchao Zhang DEF_IntegerType(UnsignedChar,1,0)
411b23bfdefSJunchao Zhang DEF_IntegerType(UnsignedChar,2,0)
412b23bfdefSJunchao Zhang DEF_IntegerType(UnsignedChar,4,0)
413b23bfdefSJunchao Zhang DEF_IntegerType(UnsignedChar,8,0)
414b23bfdefSJunchao Zhang 
415b23bfdefSJunchao Zhang DEF_RealType(PetscReal,1,1)
416b23bfdefSJunchao Zhang DEF_RealType(PetscReal,2,1)
417b23bfdefSJunchao Zhang DEF_RealType(PetscReal,4,1)
418b23bfdefSJunchao Zhang DEF_RealType(PetscReal,8,1)
419b23bfdefSJunchao Zhang DEF_RealType(PetscReal,1,0)
420b23bfdefSJunchao Zhang DEF_RealType(PetscReal,2,0)
421b23bfdefSJunchao Zhang DEF_RealType(PetscReal,4,0)
422b23bfdefSJunchao Zhang DEF_RealType(PetscReal,8,0)
423b23bfdefSJunchao Zhang 
424b23bfdefSJunchao Zhang #if defined(PETSC_HAVE_COMPLEX)
425b23bfdefSJunchao Zhang DEF_ComplexType(PetscComplex,1,1)
426b23bfdefSJunchao Zhang DEF_ComplexType(PetscComplex,2,1)
427b23bfdefSJunchao Zhang DEF_ComplexType(PetscComplex,4,1)
428b23bfdefSJunchao Zhang DEF_ComplexType(PetscComplex,8,1)
429b23bfdefSJunchao Zhang DEF_ComplexType(PetscComplex,1,0)
430b23bfdefSJunchao Zhang DEF_ComplexType(PetscComplex,2,0)
431b23bfdefSJunchao Zhang DEF_ComplexType(PetscComplex,4,0)
432b23bfdefSJunchao Zhang DEF_ComplexType(PetscComplex,8,0)
433b23bfdefSJunchao Zhang #endif
434b23bfdefSJunchao Zhang 
435cd620004SJunchao Zhang #define PairType(Type1,Type2) Type1##_##Type2
436cd620004SJunchao Zhang typedef struct {int u; int i;}           PairType(int,int);
437cd620004SJunchao Zhang typedef struct {PetscInt u; PetscInt i;} PairType(PetscInt,PetscInt);
438cd620004SJunchao Zhang DEF_PairType(PairType(int,int),1,1)
439cd620004SJunchao Zhang DEF_PairType(PairType(PetscInt,PetscInt),1,1)
440b23bfdefSJunchao Zhang 
441b23bfdefSJunchao Zhang /* If we don't know the basic type, we treat it as a stream of chars or ints */
442b23bfdefSJunchao Zhang DEF_DumbType(char,1,1)
443b23bfdefSJunchao Zhang DEF_DumbType(char,2,1)
444b23bfdefSJunchao Zhang DEF_DumbType(char,4,1)
445b23bfdefSJunchao Zhang DEF_DumbType(char,1,0)
446b23bfdefSJunchao Zhang DEF_DumbType(char,2,0)
447b23bfdefSJunchao Zhang DEF_DumbType(char,4,0)
448b23bfdefSJunchao Zhang 
449eb02082bSJunchao Zhang typedef int DumbInt; /* To have a different name than 'int' used above. The name is used to make routine names. */
450b23bfdefSJunchao Zhang DEF_DumbType(DumbInt,1,1)
451b23bfdefSJunchao Zhang DEF_DumbType(DumbInt,2,1)
452b23bfdefSJunchao Zhang DEF_DumbType(DumbInt,4,1)
453b23bfdefSJunchao Zhang DEF_DumbType(DumbInt,8,1)
454b23bfdefSJunchao Zhang DEF_DumbType(DumbInt,1,0)
455b23bfdefSJunchao Zhang DEF_DumbType(DumbInt,2,0)
456b23bfdefSJunchao Zhang DEF_DumbType(DumbInt,4,0)
457b23bfdefSJunchao Zhang DEF_DumbType(DumbInt,8,0)
45840e23c03SJunchao Zhang 
45971438e86SJunchao Zhang PetscErrorCode PetscSFLinkDestroy(PetscSF sf,PetscSFLink link)
46040e23c03SJunchao Zhang {
461cd620004SJunchao Zhang   PetscSF_Basic     *bas = (PetscSF_Basic*)sf->data;
46271438e86SJunchao Zhang   PetscInt          i,nreqs = (bas->nrootreqs+sf->nleafreqs)*8;
46371438e86SJunchao Zhang 
46471438e86SJunchao Zhang   PetscFunctionBegin;
46571438e86SJunchao Zhang   /* Destroy device-specific fields */
466*9566063dSJacob Faibussowitsch   if (link->deviceinited) PetscCall((*link->Destroy)(sf,link));
46771438e86SJunchao Zhang 
46871438e86SJunchao Zhang   /* Destroy host related fields */
469*9566063dSJacob Faibussowitsch   if (!link->isbuiltin) PetscCallMPI(MPI_Type_free(&link->unit));
47071438e86SJunchao Zhang   if (!link->use_nvshmem) {
47171438e86SJunchao Zhang     for (i=0; i<nreqs; i++) { /* Persistent reqs must be freed. */
472*9566063dSJacob Faibussowitsch       if (link->reqs[i] != MPI_REQUEST_NULL) PetscCallMPI(MPI_Request_free(&link->reqs[i]));
47371438e86SJunchao Zhang     }
474*9566063dSJacob Faibussowitsch     PetscCall(PetscFree(link->reqs));
47571438e86SJunchao Zhang     for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) {
476*9566063dSJacob Faibussowitsch       PetscCall(PetscFree(link->rootbuf_alloc[i][PETSC_MEMTYPE_HOST]));
477*9566063dSJacob Faibussowitsch       PetscCall(PetscFree(link->leafbuf_alloc[i][PETSC_MEMTYPE_HOST]));
47871438e86SJunchao Zhang     }
47971438e86SJunchao Zhang   }
480*9566063dSJacob Faibussowitsch   PetscCall(PetscFree(link));
48171438e86SJunchao Zhang   PetscFunctionReturn(0);
48271438e86SJunchao Zhang }
48371438e86SJunchao Zhang 
48471438e86SJunchao Zhang PetscErrorCode PetscSFLinkCreate(PetscSF sf,MPI_Datatype unit,PetscMemType rootmtype,const void *rootdata,PetscMemType leafmtype,const void *leafdata,MPI_Op op,PetscSFOperation sfop,PetscSFLink *mylink)
48571438e86SJunchao Zhang {
486cd620004SJunchao Zhang   PetscFunctionBegin;
487*9566063dSJacob Faibussowitsch   PetscCall(PetscSFSetErrorOnUnsupportedOverlap(sf,unit,rootdata,leafdata));
48871438e86SJunchao Zhang  #if defined(PETSC_HAVE_NVSHMEM)
48971438e86SJunchao Zhang   {
49071438e86SJunchao Zhang     PetscBool use_nvshmem;
491*9566063dSJacob Faibussowitsch     PetscCall(PetscSFLinkNvshmemCheck(sf,rootmtype,rootdata,leafmtype,leafdata,&use_nvshmem));
49271438e86SJunchao Zhang     if (use_nvshmem) {
493*9566063dSJacob Faibussowitsch       PetscCall(PetscSFLinkCreate_NVSHMEM(sf,unit,rootmtype,rootdata,leafmtype,leafdata,op,sfop,mylink));
49471438e86SJunchao Zhang       PetscFunctionReturn(0);
495cd620004SJunchao Zhang     }
496cd620004SJunchao Zhang   }
4977fd2d3dbSJunchao Zhang  #endif
498*9566063dSJacob Faibussowitsch   PetscCall(PetscSFLinkCreate_MPI(sf,unit,rootmtype,rootdata,leafmtype,leafdata,op,sfop,mylink));
499cd620004SJunchao Zhang   PetscFunctionReturn(0);
500cd620004SJunchao Zhang }
501cd620004SJunchao Zhang 
502cd620004SJunchao Zhang /* Return root/leaf buffers and MPI requests attached to the link for MPI communication in the given direction.
503cd620004SJunchao Zhang    If the sf uses persistent requests and the requests have not been initialized, then initialize them.
504cd620004SJunchao Zhang */
505cd620004SJunchao Zhang PetscErrorCode PetscSFLinkGetMPIBuffersAndRequests(PetscSF sf,PetscSFLink link,PetscSFDirection direction,void **rootbuf, void **leafbuf,MPI_Request **rootreqs,MPI_Request **leafreqs)
506cd620004SJunchao Zhang {
507cd620004SJunchao Zhang   PetscSF_Basic        *bas = (PetscSF_Basic*)sf->data;
508c87b50c4SJunchao Zhang   PetscInt             i,j,cnt,nrootranks,ndrootranks,nleafranks,ndleafranks;
509cd620004SJunchao Zhang   const PetscInt       *rootoffset,*leafoffset;
510cd620004SJunchao Zhang   MPI_Aint             disp;
511cd620004SJunchao Zhang   MPI_Comm             comm = PetscObjectComm((PetscObject)sf);
512cd620004SJunchao Zhang   MPI_Datatype         unit = link->unit;
513cd620004SJunchao Zhang   const PetscMemType   rootmtype_mpi = link->rootmtype_mpi,leafmtype_mpi = link->leafmtype_mpi; /* Used to select buffers passed to MPI */
514cd620004SJunchao Zhang   const PetscInt       rootdirect_mpi = link->rootdirect_mpi,leafdirect_mpi = link->leafdirect_mpi;
515cd620004SJunchao Zhang 
516cd620004SJunchao Zhang   PetscFunctionBegin;
517cd620004SJunchao Zhang   /* Init persistent MPI requests if not yet. Currently only SFBasic uses persistent MPI */
518cd620004SJunchao Zhang   if (sf->persistent) {
519cd620004SJunchao Zhang     if (rootreqs && bas->rootbuflen[PETSCSF_REMOTE] && !link->rootreqsinited[direction][rootmtype_mpi][rootdirect_mpi]) {
520*9566063dSJacob Faibussowitsch       PetscCall(PetscSFGetRootInfo_Basic(sf,&nrootranks,&ndrootranks,NULL,&rootoffset,NULL));
521cd620004SJunchao Zhang       if (direction == PETSCSF_LEAF2ROOT) {
522cd620004SJunchao Zhang         for (i=ndrootranks,j=0; i<nrootranks; i++,j++) {
523cd620004SJunchao Zhang           disp = (rootoffset[i] - rootoffset[ndrootranks])*link->unitbytes;
524c87b50c4SJunchao Zhang           cnt  = rootoffset[i+1]-rootoffset[i];
525*9566063dSJacob Faibussowitsch           PetscCallMPI(MPIU_Recv_init(link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi]+disp,cnt,unit,bas->iranks[i],link->tag,comm,link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi]+j));
526cd620004SJunchao Zhang         }
527cd620004SJunchao Zhang       } else { /* PETSCSF_ROOT2LEAF */
528cd620004SJunchao Zhang         for (i=ndrootranks,j=0; i<nrootranks; i++,j++) {
529cd620004SJunchao Zhang           disp = (rootoffset[i] - rootoffset[ndrootranks])*link->unitbytes;
530c87b50c4SJunchao Zhang           cnt  = rootoffset[i+1]-rootoffset[i];
531*9566063dSJacob Faibussowitsch           PetscCallMPI(MPIU_Send_init(link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi]+disp,cnt,unit,bas->iranks[i],link->tag,comm,link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi]+j));
532cd620004SJunchao Zhang         }
533cd620004SJunchao Zhang       }
534cd620004SJunchao Zhang       link->rootreqsinited[direction][rootmtype_mpi][rootdirect_mpi] = PETSC_TRUE;
535cd620004SJunchao Zhang     }
536cd620004SJunchao Zhang 
537cd620004SJunchao Zhang     if (leafreqs && sf->leafbuflen[PETSCSF_REMOTE] && !link->leafreqsinited[direction][leafmtype_mpi][leafdirect_mpi]) {
538*9566063dSJacob Faibussowitsch       PetscCall(PetscSFGetLeafInfo_Basic(sf,&nleafranks,&ndleafranks,NULL,&leafoffset,NULL,NULL));
539cd620004SJunchao Zhang       if (direction == PETSCSF_LEAF2ROOT) {
540cd620004SJunchao Zhang         for (i=ndleafranks,j=0; i<nleafranks; i++,j++) {
541cd620004SJunchao Zhang           disp = (leafoffset[i] - leafoffset[ndleafranks])*link->unitbytes;
542c87b50c4SJunchao Zhang           cnt  = leafoffset[i+1]-leafoffset[i];
543*9566063dSJacob Faibussowitsch           PetscCallMPI(MPIU_Send_init(link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi]+disp,cnt,unit,sf->ranks[i],link->tag,comm,link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi]+j));
544cd620004SJunchao Zhang         }
545cd620004SJunchao Zhang       } else { /* PETSCSF_ROOT2LEAF */
546cd620004SJunchao Zhang         for (i=ndleafranks,j=0; i<nleafranks; i++,j++) {
547cd620004SJunchao Zhang           disp = (leafoffset[i] - leafoffset[ndleafranks])*link->unitbytes;
548c87b50c4SJunchao Zhang           cnt  = leafoffset[i+1]-leafoffset[i];
549*9566063dSJacob Faibussowitsch           PetscCallMPI(MPIU_Recv_init(link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi]+disp,cnt,unit,sf->ranks[i],link->tag,comm,link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi]+j));
550cd620004SJunchao Zhang         }
551cd620004SJunchao Zhang       }
552cd620004SJunchao Zhang       link->leafreqsinited[direction][leafmtype_mpi][leafdirect_mpi] = PETSC_TRUE;
553cd620004SJunchao Zhang     }
554cd620004SJunchao Zhang   }
555cd620004SJunchao Zhang   if (rootbuf)  *rootbuf  = link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi];
556cd620004SJunchao Zhang   if (leafbuf)  *leafbuf  = link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi];
557cd620004SJunchao Zhang   if (rootreqs) *rootreqs = link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi];
558cd620004SJunchao Zhang   if (leafreqs) *leafreqs = link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi];
559cd620004SJunchao Zhang   PetscFunctionReturn(0);
560cd620004SJunchao Zhang }
561cd620004SJunchao Zhang 
562cd620004SJunchao Zhang PetscErrorCode PetscSFLinkGetInUse(PetscSF sf,MPI_Datatype unit,const void *rootdata,const void *leafdata,PetscCopyMode cmode,PetscSFLink *mylink)
563cd620004SJunchao Zhang {
564cd620004SJunchao Zhang   PetscSFLink       link,*p;
56540e23c03SJunchao Zhang   PetscSF_Basic     *bas=(PetscSF_Basic*)sf->data;
56640e23c03SJunchao Zhang 
56740e23c03SJunchao Zhang   PetscFunctionBegin;
56840e23c03SJunchao Zhang   /* Look for types in cache */
56940e23c03SJunchao Zhang   for (p=&bas->inuse; (link=*p); p=&link->next) {
57040e23c03SJunchao Zhang     PetscBool match;
571*9566063dSJacob Faibussowitsch     PetscCall(MPIPetsc_Type_compare(unit,link->unit,&match));
572637e6665SJunchao Zhang     if (match && (rootdata == link->rootdata) && (leafdata == link->leafdata)) {
57340e23c03SJunchao Zhang       switch (cmode) {
57440e23c03SJunchao Zhang       case PETSC_OWN_POINTER: *p = link->next; break; /* Remove from inuse list */
57540e23c03SJunchao Zhang       case PETSC_USE_POINTER: break;
57640e23c03SJunchao Zhang       default: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"invalid cmode");
57740e23c03SJunchao Zhang       }
57840e23c03SJunchao Zhang       *mylink = link;
57940e23c03SJunchao Zhang       PetscFunctionReturn(0);
58040e23c03SJunchao Zhang     }
58140e23c03SJunchao Zhang   }
58240e23c03SJunchao Zhang   SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Could not find pack");
58340e23c03SJunchao Zhang }
58440e23c03SJunchao Zhang 
58571438e86SJunchao Zhang PetscErrorCode PetscSFLinkReclaim(PetscSF sf,PetscSFLink *mylink)
58640e23c03SJunchao Zhang {
58740e23c03SJunchao Zhang   PetscSF_Basic     *bas = (PetscSF_Basic*)sf->data;
58871438e86SJunchao Zhang   PetscSFLink       link = *mylink;
58940e23c03SJunchao Zhang 
59040e23c03SJunchao Zhang   PetscFunctionBegin;
59171438e86SJunchao Zhang   link->rootdata = NULL;
59271438e86SJunchao Zhang   link->leafdata = NULL;
59371438e86SJunchao Zhang   link->next     = bas->avail;
59471438e86SJunchao Zhang   bas->avail     = link;
59571438e86SJunchao Zhang   *mylink        = NULL;
596eb02082bSJunchao Zhang   PetscFunctionReturn(0);
597eb02082bSJunchao Zhang }
598eb02082bSJunchao Zhang 
5999d1c8addSJunchao Zhang /* Error out on unsupported overlapped communications */
600cd620004SJunchao Zhang PetscErrorCode PetscSFSetErrorOnUnsupportedOverlap(PetscSF sf,MPI_Datatype unit,const void *rootdata,const void *leafdata)
6019d1c8addSJunchao Zhang {
602cd620004SJunchao Zhang   PetscSFLink       link,*p;
6039d1c8addSJunchao Zhang   PetscSF_Basic     *bas = (PetscSF_Basic*)sf->data;
6049d1c8addSJunchao Zhang   PetscBool         match;
6059d1c8addSJunchao Zhang 
6069d1c8addSJunchao Zhang   PetscFunctionBegin;
607b458e8f1SJose E. Roman   if (PetscDefined(USE_DEBUG)) {
60818fb5014SJunchao Zhang     /* Look up links in use and error out if there is a match. When both rootdata and leafdata are NULL, ignore
60918fb5014SJunchao Zhang        the potential overlapping since this process does not participate in communication. Overlapping is harmless.
61018fb5014SJunchao Zhang     */
611637e6665SJunchao Zhang     if (rootdata || leafdata) {
6129d1c8addSJunchao Zhang       for (p=&bas->inuse; (link=*p); p=&link->next) {
613*9566063dSJacob Faibussowitsch         PetscCall(MPIPetsc_Type_compare(unit,link->unit,&match));
6142c71b3e2SJacob Faibussowitsch         PetscCheckFalse(match && (rootdata == link->rootdata) && (leafdata == link->leafdata),PETSC_COMM_SELF,PETSC_ERR_SUP,"Overlapped PetscSF with the same rootdata(%p), leafdata(%p) and data type. Undo the overlapping to avoid the error.",rootdata,leafdata);
6159d1c8addSJunchao Zhang       }
61618fb5014SJunchao Zhang     }
617b458e8f1SJose E. Roman   }
6189d1c8addSJunchao Zhang   PetscFunctionReturn(0);
6199d1c8addSJunchao Zhang }
6209d1c8addSJunchao Zhang 
62120c24465SJunchao Zhang static PetscErrorCode PetscSFLinkMemcpy_Host(PetscSFLink link,PetscMemType dstmtype,void* dst,PetscMemType srcmtype,const void*src,size_t n)
62220c24465SJunchao Zhang {
62320c24465SJunchao Zhang   PetscFunctionBegin;
624*9566063dSJacob Faibussowitsch   if (n) PetscCall(PetscMemcpy(dst,src,n));
62520c24465SJunchao Zhang   PetscFunctionReturn(0);
62620c24465SJunchao Zhang }
62720c24465SJunchao Zhang 
628cd620004SJunchao Zhang PetscErrorCode PetscSFLinkSetUp_Host(PetscSF sf,PetscSFLink link,MPI_Datatype unit)
62940e23c03SJunchao Zhang {
630b23bfdefSJunchao Zhang   PetscInt       nSignedChar=0,nUnsignedChar=0,nInt=0,nPetscInt=0,nPetscReal=0;
631b23bfdefSJunchao Zhang   PetscBool      is2Int,is2PetscInt;
63240e23c03SJunchao Zhang   PetscMPIInt    ni,na,nd,combiner;
63340e23c03SJunchao Zhang #if defined(PETSC_HAVE_COMPLEX)
634b23bfdefSJunchao Zhang   PetscInt       nPetscComplex=0;
63540e23c03SJunchao Zhang #endif
63640e23c03SJunchao Zhang 
63740e23c03SJunchao Zhang   PetscFunctionBegin;
638*9566063dSJacob Faibussowitsch   PetscCall(MPIPetsc_Type_compare_contig(unit,MPI_SIGNED_CHAR,  &nSignedChar));
639*9566063dSJacob Faibussowitsch   PetscCall(MPIPetsc_Type_compare_contig(unit,MPI_UNSIGNED_CHAR,&nUnsignedChar));
640b23bfdefSJunchao Zhang   /* MPI_CHAR is treated below as a dumb type that does not support reduction according to MPI standard */
641*9566063dSJacob Faibussowitsch   PetscCall(MPIPetsc_Type_compare_contig(unit,MPI_INT,  &nInt));
642*9566063dSJacob Faibussowitsch   PetscCall(MPIPetsc_Type_compare_contig(unit,MPIU_INT, &nPetscInt));
643*9566063dSJacob Faibussowitsch   PetscCall(MPIPetsc_Type_compare_contig(unit,MPIU_REAL,&nPetscReal));
64440e23c03SJunchao Zhang #if defined(PETSC_HAVE_COMPLEX)
645*9566063dSJacob Faibussowitsch   PetscCall(MPIPetsc_Type_compare_contig(unit,MPIU_COMPLEX,&nPetscComplex));
64640e23c03SJunchao Zhang #endif
647*9566063dSJacob Faibussowitsch   PetscCall(MPIPetsc_Type_compare(unit,MPI_2INT,&is2Int));
648*9566063dSJacob Faibussowitsch   PetscCall(MPIPetsc_Type_compare(unit,MPIU_2INT,&is2PetscInt));
649b23bfdefSJunchao Zhang   /* TODO: shaell we also handle Fortran MPI_2REAL? */
650*9566063dSJacob Faibussowitsch   PetscCallMPI(MPI_Type_get_envelope(unit,&ni,&na,&nd,&combiner));
6515ad15460SJunchao Zhang   link->isbuiltin = (combiner == MPI_COMBINER_NAMED) ? PETSC_TRUE : PETSC_FALSE; /* unit is MPI builtin */
652b23bfdefSJunchao Zhang   link->bs = 1; /* default */
65340e23c03SJunchao Zhang 
654eb02082bSJunchao Zhang   if (is2Int) {
655cd620004SJunchao Zhang     PackInit_PairType_int_int_1_1(link);
656eb02082bSJunchao Zhang     link->bs        = 1;
657eb02082bSJunchao Zhang     link->unitbytes = 2*sizeof(int);
6585ad15460SJunchao Zhang     link->isbuiltin = PETSC_TRUE; /* unit is PETSc builtin */
659eb02082bSJunchao Zhang     link->basicunit = MPI_2INT;
6605ad15460SJunchao Zhang     link->unit      = MPI_2INT;
661eb02082bSJunchao Zhang   } else if (is2PetscInt) { /* TODO: when is2PetscInt and nPetscInt=2, we don't know which path to take. The two paths support different ops. */
662cd620004SJunchao Zhang     PackInit_PairType_PetscInt_PetscInt_1_1(link);
663eb02082bSJunchao Zhang     link->bs        = 1;
664eb02082bSJunchao Zhang     link->unitbytes = 2*sizeof(PetscInt);
665eb02082bSJunchao Zhang     link->basicunit = MPIU_2INT;
6665ad15460SJunchao Zhang     link->isbuiltin = PETSC_TRUE; /* unit is PETSc builtin */
6675ad15460SJunchao Zhang     link->unit      = MPIU_2INT;
668eb02082bSJunchao Zhang   } else if (nPetscReal) {
669b23bfdefSJunchao Zhang     if      (nPetscReal == 8) PackInit_RealType_PetscReal_8_1(link); else if (nPetscReal%8 == 0) PackInit_RealType_PetscReal_8_0(link);
670b23bfdefSJunchao Zhang     else if (nPetscReal == 4) PackInit_RealType_PetscReal_4_1(link); else if (nPetscReal%4 == 0) PackInit_RealType_PetscReal_4_0(link);
671b23bfdefSJunchao Zhang     else if (nPetscReal == 2) PackInit_RealType_PetscReal_2_1(link); else if (nPetscReal%2 == 0) PackInit_RealType_PetscReal_2_0(link);
672b23bfdefSJunchao Zhang     else if (nPetscReal == 1) PackInit_RealType_PetscReal_1_1(link); else if (nPetscReal%1 == 0) PackInit_RealType_PetscReal_1_0(link);
673b23bfdefSJunchao Zhang     link->bs        = nPetscReal;
674eb02082bSJunchao Zhang     link->unitbytes = nPetscReal*sizeof(PetscReal);
67540e23c03SJunchao Zhang     link->basicunit = MPIU_REAL;
6765ad15460SJunchao Zhang     if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPIU_REAL;}
677b23bfdefSJunchao Zhang   } else if (nPetscInt) {
678b23bfdefSJunchao Zhang     if      (nPetscInt == 8) PackInit_IntegerType_PetscInt_8_1(link); else if (nPetscInt%8 == 0) PackInit_IntegerType_PetscInt_8_0(link);
679b23bfdefSJunchao Zhang     else if (nPetscInt == 4) PackInit_IntegerType_PetscInt_4_1(link); else if (nPetscInt%4 == 0) PackInit_IntegerType_PetscInt_4_0(link);
680b23bfdefSJunchao Zhang     else if (nPetscInt == 2) PackInit_IntegerType_PetscInt_2_1(link); else if (nPetscInt%2 == 0) PackInit_IntegerType_PetscInt_2_0(link);
681b23bfdefSJunchao Zhang     else if (nPetscInt == 1) PackInit_IntegerType_PetscInt_1_1(link); else if (nPetscInt%1 == 0) PackInit_IntegerType_PetscInt_1_0(link);
682b23bfdefSJunchao Zhang     link->bs        = nPetscInt;
683eb02082bSJunchao Zhang     link->unitbytes = nPetscInt*sizeof(PetscInt);
684b23bfdefSJunchao Zhang     link->basicunit = MPIU_INT;
6855ad15460SJunchao Zhang     if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPIU_INT;}
686b23bfdefSJunchao Zhang #if defined(PETSC_USE_64BIT_INDICES)
687b23bfdefSJunchao Zhang   } else if (nInt) {
688b23bfdefSJunchao Zhang     if      (nInt == 8) PackInit_IntegerType_int_8_1(link); else if (nInt%8 == 0) PackInit_IntegerType_int_8_0(link);
689b23bfdefSJunchao Zhang     else if (nInt == 4) PackInit_IntegerType_int_4_1(link); else if (nInt%4 == 0) PackInit_IntegerType_int_4_0(link);
690b23bfdefSJunchao Zhang     else if (nInt == 2) PackInit_IntegerType_int_2_1(link); else if (nInt%2 == 0) PackInit_IntegerType_int_2_0(link);
691b23bfdefSJunchao Zhang     else if (nInt == 1) PackInit_IntegerType_int_1_1(link); else if (nInt%1 == 0) PackInit_IntegerType_int_1_0(link);
692b23bfdefSJunchao Zhang     link->bs        = nInt;
693eb02082bSJunchao Zhang     link->unitbytes = nInt*sizeof(int);
694b23bfdefSJunchao Zhang     link->basicunit = MPI_INT;
6955ad15460SJunchao Zhang     if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPI_INT;}
696b23bfdefSJunchao Zhang #endif
697b23bfdefSJunchao Zhang   } else if (nSignedChar) {
698b23bfdefSJunchao Zhang     if      (nSignedChar == 8) PackInit_IntegerType_SignedChar_8_1(link); else if (nSignedChar%8 == 0) PackInit_IntegerType_SignedChar_8_0(link);
699b23bfdefSJunchao Zhang     else if (nSignedChar == 4) PackInit_IntegerType_SignedChar_4_1(link); else if (nSignedChar%4 == 0) PackInit_IntegerType_SignedChar_4_0(link);
700b23bfdefSJunchao Zhang     else if (nSignedChar == 2) PackInit_IntegerType_SignedChar_2_1(link); else if (nSignedChar%2 == 0) PackInit_IntegerType_SignedChar_2_0(link);
701b23bfdefSJunchao Zhang     else if (nSignedChar == 1) PackInit_IntegerType_SignedChar_1_1(link); else if (nSignedChar%1 == 0) PackInit_IntegerType_SignedChar_1_0(link);
702b23bfdefSJunchao Zhang     link->bs        = nSignedChar;
703eb02082bSJunchao Zhang     link->unitbytes = nSignedChar*sizeof(SignedChar);
704b23bfdefSJunchao Zhang     link->basicunit = MPI_SIGNED_CHAR;
7055ad15460SJunchao Zhang     if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPI_SIGNED_CHAR;}
706b23bfdefSJunchao Zhang   }  else if (nUnsignedChar) {
707b23bfdefSJunchao Zhang     if      (nUnsignedChar == 8) PackInit_IntegerType_UnsignedChar_8_1(link); else if (nUnsignedChar%8 == 0) PackInit_IntegerType_UnsignedChar_8_0(link);
708b23bfdefSJunchao Zhang     else if (nUnsignedChar == 4) PackInit_IntegerType_UnsignedChar_4_1(link); else if (nUnsignedChar%4 == 0) PackInit_IntegerType_UnsignedChar_4_0(link);
709b23bfdefSJunchao Zhang     else if (nUnsignedChar == 2) PackInit_IntegerType_UnsignedChar_2_1(link); else if (nUnsignedChar%2 == 0) PackInit_IntegerType_UnsignedChar_2_0(link);
710b23bfdefSJunchao Zhang     else if (nUnsignedChar == 1) PackInit_IntegerType_UnsignedChar_1_1(link); else if (nUnsignedChar%1 == 0) PackInit_IntegerType_UnsignedChar_1_0(link);
711b23bfdefSJunchao Zhang     link->bs        = nUnsignedChar;
712eb02082bSJunchao Zhang     link->unitbytes = nUnsignedChar*sizeof(UnsignedChar);
713b23bfdefSJunchao Zhang     link->basicunit = MPI_UNSIGNED_CHAR;
7145ad15460SJunchao Zhang     if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPI_UNSIGNED_CHAR;}
71540e23c03SJunchao Zhang #if defined(PETSC_HAVE_COMPLEX)
716b23bfdefSJunchao Zhang   } else if (nPetscComplex) {
717b23bfdefSJunchao Zhang     if      (nPetscComplex == 8) PackInit_ComplexType_PetscComplex_8_1(link); else if (nPetscComplex%8 == 0) PackInit_ComplexType_PetscComplex_8_0(link);
718b23bfdefSJunchao Zhang     else if (nPetscComplex == 4) PackInit_ComplexType_PetscComplex_4_1(link); else if (nPetscComplex%4 == 0) PackInit_ComplexType_PetscComplex_4_0(link);
719b23bfdefSJunchao Zhang     else if (nPetscComplex == 2) PackInit_ComplexType_PetscComplex_2_1(link); else if (nPetscComplex%2 == 0) PackInit_ComplexType_PetscComplex_2_0(link);
720b23bfdefSJunchao Zhang     else if (nPetscComplex == 1) PackInit_ComplexType_PetscComplex_1_1(link); else if (nPetscComplex%1 == 0) PackInit_ComplexType_PetscComplex_1_0(link);
721b23bfdefSJunchao Zhang     link->bs        = nPetscComplex;
722eb02082bSJunchao Zhang     link->unitbytes = nPetscComplex*sizeof(PetscComplex);
72340e23c03SJunchao Zhang     link->basicunit = MPIU_COMPLEX;
7245ad15460SJunchao Zhang     if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPIU_COMPLEX;}
72540e23c03SJunchao Zhang #endif
72640e23c03SJunchao Zhang   } else {
727b23bfdefSJunchao Zhang     MPI_Aint lb,nbyte;
728*9566063dSJacob Faibussowitsch     PetscCallMPI(MPI_Type_get_extent(unit,&lb,&nbyte));
7292c71b3e2SJacob Faibussowitsch     PetscCheckFalse(lb != 0,PETSC_COMM_SELF,PETSC_ERR_SUP,"Datatype with nonzero lower bound %ld",(long)lb);
730eb02082bSJunchao Zhang     if (nbyte % sizeof(int)) { /* If the type size is not multiple of int */
731eb02082bSJunchao Zhang       if      (nbyte == 4) PackInit_DumbType_char_4_1(link); else if (nbyte%4 == 0) PackInit_DumbType_char_4_0(link);
732eb02082bSJunchao Zhang       else if (nbyte == 2) PackInit_DumbType_char_2_1(link); else if (nbyte%2 == 0) PackInit_DumbType_char_2_0(link);
733eb02082bSJunchao Zhang       else if (nbyte == 1) PackInit_DumbType_char_1_1(link); else if (nbyte%1 == 0) PackInit_DumbType_char_1_0(link);
734eb02082bSJunchao Zhang       link->bs        = nbyte;
735b23bfdefSJunchao Zhang       link->unitbytes = nbyte;
736b23bfdefSJunchao Zhang       link->basicunit = MPI_BYTE;
73740e23c03SJunchao Zhang     } else {
738eb02082bSJunchao Zhang       nInt = nbyte / sizeof(int);
739eb02082bSJunchao Zhang       if      (nInt == 8) PackInit_DumbType_DumbInt_8_1(link); else if (nInt%8 == 0) PackInit_DumbType_DumbInt_8_0(link);
740eb02082bSJunchao Zhang       else if (nInt == 4) PackInit_DumbType_DumbInt_4_1(link); else if (nInt%4 == 0) PackInit_DumbType_DumbInt_4_0(link);
741eb02082bSJunchao Zhang       else if (nInt == 2) PackInit_DumbType_DumbInt_2_1(link); else if (nInt%2 == 0) PackInit_DumbType_DumbInt_2_0(link);
742eb02082bSJunchao Zhang       else if (nInt == 1) PackInit_DumbType_DumbInt_1_1(link); else if (nInt%1 == 0) PackInit_DumbType_DumbInt_1_0(link);
743eb02082bSJunchao Zhang       link->bs        = nInt;
744b23bfdefSJunchao Zhang       link->unitbytes = nbyte;
74540e23c03SJunchao Zhang       link->basicunit = MPI_INT;
74640e23c03SJunchao Zhang     }
7475ad15460SJunchao Zhang     if (link->isbuiltin) link->unit = unit;
74840e23c03SJunchao Zhang   }
749b23bfdefSJunchao Zhang 
750*9566063dSJacob Faibussowitsch   if (!link->isbuiltin) PetscCallMPI(MPI_Type_dup(unit,&link->unit));
75120c24465SJunchao Zhang 
75220c24465SJunchao Zhang   link->Memcpy = PetscSFLinkMemcpy_Host;
75340e23c03SJunchao Zhang   PetscFunctionReturn(0);
75440e23c03SJunchao Zhang }
75540e23c03SJunchao Zhang 
756fcc7397dSJunchao Zhang PetscErrorCode PetscSFLinkGetUnpackAndOp(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*))
75740e23c03SJunchao Zhang {
75840e23c03SJunchao Zhang   PetscFunctionBegin;
75940e23c03SJunchao Zhang   *UnpackAndOp = NULL;
76071438e86SJunchao Zhang   if (PetscMemTypeHost(mtype)) {
76183df288dSJunchao Zhang     if      (op == MPI_REPLACE)               *UnpackAndOp = link->h_UnpackAndInsert;
762eb02082bSJunchao Zhang     else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->h_UnpackAndAdd;
763eb02082bSJunchao Zhang     else if (op == MPI_PROD)                  *UnpackAndOp = link->h_UnpackAndMult;
764eb02082bSJunchao Zhang     else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->h_UnpackAndMax;
765eb02082bSJunchao Zhang     else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->h_UnpackAndMin;
766eb02082bSJunchao Zhang     else if (op == MPI_LAND)                  *UnpackAndOp = link->h_UnpackAndLAND;
767eb02082bSJunchao Zhang     else if (op == MPI_BAND)                  *UnpackAndOp = link->h_UnpackAndBAND;
768eb02082bSJunchao Zhang     else if (op == MPI_LOR)                   *UnpackAndOp = link->h_UnpackAndLOR;
769eb02082bSJunchao Zhang     else if (op == MPI_BOR)                   *UnpackAndOp = link->h_UnpackAndBOR;
770eb02082bSJunchao Zhang     else if (op == MPI_LXOR)                  *UnpackAndOp = link->h_UnpackAndLXOR;
771eb02082bSJunchao Zhang     else if (op == MPI_BXOR)                  *UnpackAndOp = link->h_UnpackAndBXOR;
772eb02082bSJunchao Zhang     else if (op == MPI_MAXLOC)                *UnpackAndOp = link->h_UnpackAndMaxloc;
773eb02082bSJunchao Zhang     else if (op == MPI_MINLOC)                *UnpackAndOp = link->h_UnpackAndMinloc;
774eb02082bSJunchao Zhang   }
7757fd2d3dbSJunchao Zhang #if defined(PETSC_HAVE_DEVICE)
77671438e86SJunchao Zhang   else if (PetscMemTypeDevice(mtype) && !atomic) {
77783df288dSJunchao Zhang     if      (op == MPI_REPLACE)               *UnpackAndOp = link->d_UnpackAndInsert;
778eb02082bSJunchao Zhang     else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->d_UnpackAndAdd;
779eb02082bSJunchao Zhang     else if (op == MPI_PROD)                  *UnpackAndOp = link->d_UnpackAndMult;
780eb02082bSJunchao Zhang     else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->d_UnpackAndMax;
781eb02082bSJunchao Zhang     else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->d_UnpackAndMin;
782eb02082bSJunchao Zhang     else if (op == MPI_LAND)                  *UnpackAndOp = link->d_UnpackAndLAND;
783eb02082bSJunchao Zhang     else if (op == MPI_BAND)                  *UnpackAndOp = link->d_UnpackAndBAND;
784eb02082bSJunchao Zhang     else if (op == MPI_LOR)                   *UnpackAndOp = link->d_UnpackAndLOR;
785eb02082bSJunchao Zhang     else if (op == MPI_BOR)                   *UnpackAndOp = link->d_UnpackAndBOR;
786eb02082bSJunchao Zhang     else if (op == MPI_LXOR)                  *UnpackAndOp = link->d_UnpackAndLXOR;
787eb02082bSJunchao Zhang     else if (op == MPI_BXOR)                  *UnpackAndOp = link->d_UnpackAndBXOR;
788eb02082bSJunchao Zhang     else if (op == MPI_MAXLOC)                *UnpackAndOp = link->d_UnpackAndMaxloc;
789eb02082bSJunchao Zhang     else if (op == MPI_MINLOC)                *UnpackAndOp = link->d_UnpackAndMinloc;
79071438e86SJunchao Zhang   } else if (PetscMemTypeDevice(mtype) && atomic) {
79183df288dSJunchao Zhang     if      (op == MPI_REPLACE)               *UnpackAndOp = link->da_UnpackAndInsert;
792eb02082bSJunchao Zhang     else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->da_UnpackAndAdd;
793eb02082bSJunchao Zhang     else if (op == MPI_PROD)                  *UnpackAndOp = link->da_UnpackAndMult;
794eb02082bSJunchao Zhang     else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->da_UnpackAndMax;
795eb02082bSJunchao Zhang     else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->da_UnpackAndMin;
796eb02082bSJunchao Zhang     else if (op == MPI_LAND)                  *UnpackAndOp = link->da_UnpackAndLAND;
797eb02082bSJunchao Zhang     else if (op == MPI_BAND)                  *UnpackAndOp = link->da_UnpackAndBAND;
798eb02082bSJunchao Zhang     else if (op == MPI_LOR)                   *UnpackAndOp = link->da_UnpackAndLOR;
799eb02082bSJunchao Zhang     else if (op == MPI_BOR)                   *UnpackAndOp = link->da_UnpackAndBOR;
800eb02082bSJunchao Zhang     else if (op == MPI_LXOR)                  *UnpackAndOp = link->da_UnpackAndLXOR;
801eb02082bSJunchao Zhang     else if (op == MPI_BXOR)                  *UnpackAndOp = link->da_UnpackAndBXOR;
802eb02082bSJunchao Zhang     else if (op == MPI_MAXLOC)                *UnpackAndOp = link->da_UnpackAndMaxloc;
803eb02082bSJunchao Zhang     else if (op == MPI_MINLOC)                *UnpackAndOp = link->da_UnpackAndMinloc;
804eb02082bSJunchao Zhang   }
805eb02082bSJunchao Zhang #endif
80640e23c03SJunchao Zhang   PetscFunctionReturn(0);
80740e23c03SJunchao Zhang }
80840e23c03SJunchao Zhang 
809fcc7397dSJunchao Zhang PetscErrorCode PetscSFLinkGetScatterAndOp(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**ScatterAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*))
81040e23c03SJunchao Zhang {
81140e23c03SJunchao Zhang   PetscFunctionBegin;
812cd620004SJunchao Zhang   *ScatterAndOp = NULL;
81371438e86SJunchao Zhang   if (PetscMemTypeHost(mtype)) {
81483df288dSJunchao Zhang     if      (op == MPI_REPLACE)               *ScatterAndOp = link->h_ScatterAndInsert;
815cd620004SJunchao Zhang     else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->h_ScatterAndAdd;
816cd620004SJunchao Zhang     else if (op == MPI_PROD)                  *ScatterAndOp = link->h_ScatterAndMult;
817cd620004SJunchao Zhang     else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->h_ScatterAndMax;
818cd620004SJunchao Zhang     else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->h_ScatterAndMin;
819cd620004SJunchao Zhang     else if (op == MPI_LAND)                  *ScatterAndOp = link->h_ScatterAndLAND;
820cd620004SJunchao Zhang     else if (op == MPI_BAND)                  *ScatterAndOp = link->h_ScatterAndBAND;
821cd620004SJunchao Zhang     else if (op == MPI_LOR)                   *ScatterAndOp = link->h_ScatterAndLOR;
822cd620004SJunchao Zhang     else if (op == MPI_BOR)                   *ScatterAndOp = link->h_ScatterAndBOR;
823cd620004SJunchao Zhang     else if (op == MPI_LXOR)                  *ScatterAndOp = link->h_ScatterAndLXOR;
824cd620004SJunchao Zhang     else if (op == MPI_BXOR)                  *ScatterAndOp = link->h_ScatterAndBXOR;
825cd620004SJunchao Zhang     else if (op == MPI_MAXLOC)                *ScatterAndOp = link->h_ScatterAndMaxloc;
826cd620004SJunchao Zhang     else if (op == MPI_MINLOC)                *ScatterAndOp = link->h_ScatterAndMinloc;
827eb02082bSJunchao Zhang   }
8287fd2d3dbSJunchao Zhang #if defined(PETSC_HAVE_DEVICE)
82971438e86SJunchao Zhang   else if (PetscMemTypeDevice(mtype) && !atomic) {
83083df288dSJunchao Zhang     if      (op == MPI_REPLACE)               *ScatterAndOp = link->d_ScatterAndInsert;
831cd620004SJunchao Zhang     else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->d_ScatterAndAdd;
832cd620004SJunchao Zhang     else if (op == MPI_PROD)                  *ScatterAndOp = link->d_ScatterAndMult;
833cd620004SJunchao Zhang     else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->d_ScatterAndMax;
834cd620004SJunchao Zhang     else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->d_ScatterAndMin;
835cd620004SJunchao Zhang     else if (op == MPI_LAND)                  *ScatterAndOp = link->d_ScatterAndLAND;
836cd620004SJunchao Zhang     else if (op == MPI_BAND)                  *ScatterAndOp = link->d_ScatterAndBAND;
837cd620004SJunchao Zhang     else if (op == MPI_LOR)                   *ScatterAndOp = link->d_ScatterAndLOR;
838cd620004SJunchao Zhang     else if (op == MPI_BOR)                   *ScatterAndOp = link->d_ScatterAndBOR;
839cd620004SJunchao Zhang     else if (op == MPI_LXOR)                  *ScatterAndOp = link->d_ScatterAndLXOR;
840cd620004SJunchao Zhang     else if (op == MPI_BXOR)                  *ScatterAndOp = link->d_ScatterAndBXOR;
841cd620004SJunchao Zhang     else if (op == MPI_MAXLOC)                *ScatterAndOp = link->d_ScatterAndMaxloc;
842cd620004SJunchao Zhang     else if (op == MPI_MINLOC)                *ScatterAndOp = link->d_ScatterAndMinloc;
84371438e86SJunchao Zhang   } else if (PetscMemTypeDevice(mtype) && atomic) {
84483df288dSJunchao Zhang     if      (op == MPI_REPLACE)               *ScatterAndOp = link->da_ScatterAndInsert;
845cd620004SJunchao Zhang     else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->da_ScatterAndAdd;
846cd620004SJunchao Zhang     else if (op == MPI_PROD)                  *ScatterAndOp = link->da_ScatterAndMult;
847cd620004SJunchao Zhang     else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->da_ScatterAndMax;
848cd620004SJunchao Zhang     else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->da_ScatterAndMin;
849cd620004SJunchao Zhang     else if (op == MPI_LAND)                  *ScatterAndOp = link->da_ScatterAndLAND;
850cd620004SJunchao Zhang     else if (op == MPI_BAND)                  *ScatterAndOp = link->da_ScatterAndBAND;
851cd620004SJunchao Zhang     else if (op == MPI_LOR)                   *ScatterAndOp = link->da_ScatterAndLOR;
852cd620004SJunchao Zhang     else if (op == MPI_BOR)                   *ScatterAndOp = link->da_ScatterAndBOR;
853cd620004SJunchao Zhang     else if (op == MPI_LXOR)                  *ScatterAndOp = link->da_ScatterAndLXOR;
854cd620004SJunchao Zhang     else if (op == MPI_BXOR)                  *ScatterAndOp = link->da_ScatterAndBXOR;
855cd620004SJunchao Zhang     else if (op == MPI_MAXLOC)                *ScatterAndOp = link->da_ScatterAndMaxloc;
856cd620004SJunchao Zhang     else if (op == MPI_MINLOC)                *ScatterAndOp = link->da_ScatterAndMinloc;
857eb02082bSJunchao Zhang   }
858eb02082bSJunchao Zhang #endif
859cd620004SJunchao Zhang   PetscFunctionReturn(0);
860cd620004SJunchao Zhang }
861cd620004SJunchao Zhang 
862fcc7397dSJunchao Zhang PetscErrorCode PetscSFLinkGetFetchAndOp(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**FetchAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,void*))
863cd620004SJunchao Zhang {
864cd620004SJunchao Zhang   PetscFunctionBegin;
865cd620004SJunchao Zhang   *FetchAndOp = NULL;
8662c71b3e2SJacob Faibussowitsch   PetscCheckFalse(op != MPI_SUM && op != MPIU_SUM,PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for MPI_Op in FetchAndOp");
86771438e86SJunchao Zhang   if (PetscMemTypeHost(mtype)) *FetchAndOp = link->h_FetchAndAdd;
8687fd2d3dbSJunchao Zhang #if defined(PETSC_HAVE_DEVICE)
86971438e86SJunchao Zhang   else if (PetscMemTypeDevice(mtype) && !atomic) *FetchAndOp = link->d_FetchAndAdd;
87071438e86SJunchao Zhang   else if (PetscMemTypeDevice(mtype) && atomic)  *FetchAndOp = link->da_FetchAndAdd;
871cd620004SJunchao Zhang #endif
872cd620004SJunchao Zhang   PetscFunctionReturn(0);
873cd620004SJunchao Zhang }
874cd620004SJunchao Zhang 
875fcc7397dSJunchao Zhang PetscErrorCode PetscSFLinkGetFetchAndOpLocal(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**FetchAndOpLocal)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*))
876cd620004SJunchao Zhang {
877cd620004SJunchao Zhang   PetscFunctionBegin;
878cd620004SJunchao Zhang   *FetchAndOpLocal = NULL;
8792c71b3e2SJacob Faibussowitsch   PetscCheckFalse(op != MPI_SUM && op != MPIU_SUM,PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for MPI_Op in FetchAndOp");
88071438e86SJunchao Zhang   if (PetscMemTypeHost(mtype)) *FetchAndOpLocal = link->h_FetchAndAddLocal;
8817fd2d3dbSJunchao Zhang #if defined(PETSC_HAVE_DEVICE)
88271438e86SJunchao Zhang   else if (PetscMemTypeDevice(mtype) && !atomic) *FetchAndOpLocal = link->d_FetchAndAddLocal;
88371438e86SJunchao Zhang   else if (PetscMemTypeDevice(mtype) && atomic)  *FetchAndOpLocal = link->da_FetchAndAddLocal;
884cd620004SJunchao Zhang #endif
885cd620004SJunchao Zhang   PetscFunctionReturn(0);
886cd620004SJunchao Zhang }
887cd620004SJunchao Zhang 
8889fbee547SJacob Faibussowitsch static inline PetscErrorCode PetscSFLinkLogFlopsAfterUnpackRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,MPI_Op op)
889cd620004SJunchao Zhang {
890cd620004SJunchao Zhang   PetscLogDouble flops;
891cd620004SJunchao Zhang   PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
892cd620004SJunchao Zhang 
893cd620004SJunchao Zhang   PetscFunctionBegin;
89483df288dSJunchao Zhang   if (op != MPI_REPLACE && link->basicunit == MPIU_SCALAR) { /* op is a reduction on PetscScalars */
895cd620004SJunchao Zhang     flops = bas->rootbuflen[scope]*link->bs; /* # of roots in buffer x # of scalars in unit */
8967fd2d3dbSJunchao Zhang #if defined(PETSC_HAVE_DEVICE)
897*9566063dSJacob Faibussowitsch     if (PetscMemTypeDevice(link->rootmtype)) PetscCall(PetscLogGpuFlops(flops)); else
898cd620004SJunchao Zhang #endif
899*9566063dSJacob Faibussowitsch     PetscCall(PetscLogFlops(flops));
900cd620004SJunchao Zhang   }
901cd620004SJunchao Zhang   PetscFunctionReturn(0);
902cd620004SJunchao Zhang }
903cd620004SJunchao Zhang 
9049fbee547SJacob Faibussowitsch static inline PetscErrorCode PetscSFLinkLogFlopsAfterUnpackLeafData(PetscSF sf,PetscSFLink link,PetscSFScope scope,MPI_Op op)
905cd620004SJunchao Zhang {
906cd620004SJunchao Zhang   PetscLogDouble flops;
907cd620004SJunchao Zhang 
908cd620004SJunchao Zhang   PetscFunctionBegin;
90983df288dSJunchao Zhang   if (op != MPI_REPLACE && link->basicunit == MPIU_SCALAR) { /* op is a reduction on PetscScalars */
910cd620004SJunchao Zhang     flops = sf->leafbuflen[scope]*link->bs; /* # of roots in buffer x # of scalars in unit */
9117fd2d3dbSJunchao Zhang #if defined(PETSC_HAVE_DEVICE)
912*9566063dSJacob Faibussowitsch     if (PetscMemTypeDevice(link->leafmtype)) PetscCall(PetscLogGpuFlops(flops)); else
913cd620004SJunchao Zhang #endif
914*9566063dSJacob Faibussowitsch     PetscCall(PetscLogFlops(flops));
915cd620004SJunchao Zhang   }
916cd620004SJunchao Zhang   PetscFunctionReturn(0);
917cd620004SJunchao Zhang }
918cd620004SJunchao Zhang 
919cd620004SJunchao Zhang /* When SF could not find a proper UnpackAndOp() from link, it falls back to MPI_Reduce_local.
9204165533cSJose E. Roman   Input Parameters:
921cd620004SJunchao Zhang   +sf      - The StarForest
922cd620004SJunchao Zhang   .link    - The link
923cd620004SJunchao Zhang   .count   - Number of entries to unpack
924cd620004SJunchao Zhang   .start   - The first index, significent when indices=NULL
925cd620004SJunchao Zhang   .indices - Indices of entries in <data>. If NULL, it means indices are contiguous and the first is given in <start>
926cd620004SJunchao Zhang   .buf     - A contiguous buffer to unpack from
927cd620004SJunchao Zhang   -op      - Operation after unpack
928cd620004SJunchao Zhang 
9294165533cSJose E. Roman   Output Parameters:
930cd620004SJunchao Zhang   .data    - The data to unpack to
931cd620004SJunchao Zhang */
9329fbee547SJacob Faibussowitsch static inline PetscErrorCode PetscSFLinkUnpackDataWithMPIReduceLocal(PetscSF sf,PetscSFLink link,PetscInt count,PetscInt start,const PetscInt *indices,void *data,const void *buf,MPI_Op op)
933cd620004SJunchao Zhang {
934cd620004SJunchao Zhang   PetscFunctionBegin;
935cd620004SJunchao Zhang #if defined(PETSC_HAVE_MPI_REDUCE_LOCAL)
936cd620004SJunchao Zhang   {
937cd620004SJunchao Zhang     PetscInt       i;
938cd620004SJunchao Zhang     if (indices) {
939cd620004SJunchao Zhang       /* Note we use link->unit instead of link->basicunit. When op can be mapped to MPI_SUM etc, it operates on
940cd620004SJunchao Zhang          basic units of a root/leaf element-wisely. Otherwise, it is meant to operate on a whole root/leaf.
941cd620004SJunchao Zhang       */
942*9566063dSJacob Faibussowitsch       for (i=0; i<count; i++) PetscCallMPI(MPI_Reduce_local((const char*)buf+i*link->unitbytes,(char*)data+indices[i]*link->unitbytes,1,link->unit,op));
943cd620004SJunchao Zhang     } else {
944*9566063dSJacob Faibussowitsch       PetscCallMPI(MPIU_Reduce_local(buf,(char*)data+start*link->unitbytes,count,link->unit,op));
945cd620004SJunchao Zhang     }
946cd620004SJunchao Zhang   }
947b458e8f1SJose E. Roman   PetscFunctionReturn(0);
948cd620004SJunchao Zhang #else
949cd620004SJunchao Zhang   SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No unpacking reduction operation for this MPI_Op");
950cd620004SJunchao Zhang #endif
951cd620004SJunchao Zhang }
952cd620004SJunchao Zhang 
9539fbee547SJacob Faibussowitsch static inline PetscErrorCode PetscSFLinkScatterDataWithMPIReduceLocal(PetscSF sf,PetscSFLink link,PetscInt count,PetscInt srcStart,const PetscInt *srcIdx,const void *src,PetscInt dstStart,const PetscInt *dstIdx,void *dst,MPI_Op op)
954cd620004SJunchao Zhang {
955cd620004SJunchao Zhang   PetscFunctionBegin;
956cd620004SJunchao Zhang #if defined(PETSC_HAVE_MPI_REDUCE_LOCAL)
957cd620004SJunchao Zhang   {
958cd620004SJunchao Zhang     PetscInt       i,disp;
959fcc7397dSJunchao Zhang     if (!srcIdx) {
960*9566063dSJacob Faibussowitsch       PetscCall(PetscSFLinkUnpackDataWithMPIReduceLocal(sf,link,count,dstStart,dstIdx,dst,(const char*)src+srcStart*link->unitbytes,op));
961fcc7397dSJunchao Zhang     } else {
962cd620004SJunchao Zhang       for (i=0; i<count; i++) {
963fcc7397dSJunchao Zhang         disp = dstIdx? dstIdx[i] : dstStart + i;
964*9566063dSJacob Faibussowitsch         PetscCallMPI(MPIU_Reduce_local((const char*)src+srcIdx[i]*link->unitbytes,(char*)dst+disp*link->unitbytes,1,link->unit,op));
965fcc7397dSJunchao Zhang       }
966cd620004SJunchao Zhang     }
967cd620004SJunchao Zhang   }
968b458e8f1SJose E. Roman   PetscFunctionReturn(0);
969cd620004SJunchao Zhang #else
970cd620004SJunchao Zhang   SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No unpacking reduction operation for this MPI_Op");
971cd620004SJunchao Zhang #endif
972cd620004SJunchao Zhang }
973cd620004SJunchao Zhang 
974cd620004SJunchao Zhang /*=============================================================================
975cd620004SJunchao Zhang               Pack/Unpack/Fetch/Scatter routines
976cd620004SJunchao Zhang  ============================================================================*/
977cd620004SJunchao Zhang 
978cd620004SJunchao Zhang /* Pack rootdata to rootbuf
9794165533cSJose E. Roman   Input Parameters:
980cd620004SJunchao Zhang   + sf       - The SF this packing works on.
981cd620004SJunchao Zhang   . link     - It gives the memtype of the roots and also provides root buffer.
982cd620004SJunchao Zhang   . scope    - PETSCSF_LOCAL or PETSCSF_REMOTE. Note SF has the ability to do local and remote communications separately.
983cd620004SJunchao Zhang   - rootdata - Where to read the roots.
984cd620004SJunchao Zhang 
985cd620004SJunchao Zhang   Notes:
986cd620004SJunchao Zhang   When rootdata can be directly used as root buffer, the routine is almost a no-op. After the call, root data is
98771438e86SJunchao Zhang   in a place where the underlying MPI is ready to access (use_gpu_aware_mpi or not)
988cd620004SJunchao Zhang  */
98971438e86SJunchao Zhang PetscErrorCode PetscSFLinkPackRootData_Private(PetscSF sf,PetscSFLink link,PetscSFScope scope,const void *rootdata)
990cd620004SJunchao Zhang {
991cd620004SJunchao Zhang   const PetscInt   *rootindices = NULL;
992cd620004SJunchao Zhang   PetscInt         count,start;
993fcc7397dSJunchao Zhang   PetscErrorCode   (*Pack)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*) = NULL;
994cd620004SJunchao Zhang   PetscMemType     rootmtype = link->rootmtype;
995fcc7397dSJunchao Zhang   PetscSFPackOpt   opt = NULL;
996fcc7397dSJunchao Zhang 
997cd620004SJunchao Zhang   PetscFunctionBegin;
998*9566063dSJacob Faibussowitsch   PetscCall(PetscLogEventBegin(PETSCSF_Pack,sf,0,0,0));
99971438e86SJunchao Zhang   if (!link->rootdirect[scope]) { /* If rootdata works directly as rootbuf, skip packing */
1000*9566063dSJacob Faibussowitsch     PetscCall(PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,scope,&count,&start,&opt,&rootindices));
1001*9566063dSJacob Faibussowitsch     PetscCall(PetscSFLinkGetPack(link,rootmtype,&Pack));
1002*9566063dSJacob Faibussowitsch     PetscCall((*Pack)(link,count,start,opt,rootindices,rootdata,link->rootbuf[scope][rootmtype]));
1003cd620004SJunchao Zhang   }
1004*9566063dSJacob Faibussowitsch   PetscCall(PetscLogEventEnd(PETSCSF_Pack,sf,0,0,0));
1005cd620004SJunchao Zhang   PetscFunctionReturn(0);
1006cd620004SJunchao Zhang }
1007cd620004SJunchao Zhang 
1008cd620004SJunchao Zhang /* Pack leafdata to leafbuf */
100971438e86SJunchao Zhang PetscErrorCode PetscSFLinkPackLeafData_Private(PetscSF sf,PetscSFLink link,PetscSFScope scope,const void *leafdata)
1010cd620004SJunchao Zhang {
1011cd620004SJunchao Zhang   const PetscInt   *leafindices = NULL;
1012cd620004SJunchao Zhang   PetscInt         count,start;
1013fcc7397dSJunchao Zhang   PetscErrorCode   (*Pack)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*) = NULL;
1014cd620004SJunchao Zhang   PetscMemType     leafmtype = link->leafmtype;
1015fcc7397dSJunchao Zhang   PetscSFPackOpt   opt = NULL;
1016cd620004SJunchao Zhang 
1017cd620004SJunchao Zhang   PetscFunctionBegin;
1018*9566063dSJacob Faibussowitsch   PetscCall(PetscLogEventBegin(PETSCSF_Pack,sf,0,0,0));
101971438e86SJunchao Zhang   if (!link->leafdirect[scope]) { /* If leafdata works directly as rootbuf, skip packing */
1020*9566063dSJacob Faibussowitsch     PetscCall(PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,scope,&count,&start,&opt,&leafindices));
1021*9566063dSJacob Faibussowitsch     PetscCall(PetscSFLinkGetPack(link,leafmtype,&Pack));
1022*9566063dSJacob Faibussowitsch     PetscCall((*Pack)(link,count,start,opt,leafindices,leafdata,link->leafbuf[scope][leafmtype]));
1023cd620004SJunchao Zhang   }
1024*9566063dSJacob Faibussowitsch   PetscCall(PetscLogEventEnd(PETSCSF_Pack,sf,0,0,0));
1025cd620004SJunchao Zhang   PetscFunctionReturn(0);
1026cd620004SJunchao Zhang }
1027cd620004SJunchao Zhang 
102871438e86SJunchao Zhang /* Pack rootdata to rootbuf, which are in the same memory space */
102971438e86SJunchao Zhang PetscErrorCode PetscSFLinkPackRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,const void *rootdata)
103071438e86SJunchao Zhang {
103171438e86SJunchao Zhang   PetscSF_Basic    *bas = (PetscSF_Basic*)sf->data;
103271438e86SJunchao Zhang 
103371438e86SJunchao Zhang   PetscFunctionBegin;
103471438e86SJunchao Zhang   if (scope == PETSCSF_REMOTE) { /* Sync the device if rootdata is not on petsc default stream */
1035*9566063dSJacob Faibussowitsch     if (PetscMemTypeDevice(link->rootmtype) && link->SyncDevice && sf->unknown_input_stream) PetscCall((*link->SyncDevice)(link));
1036*9566063dSJacob Faibussowitsch     if (link->PrePack) PetscCall((*link->PrePack)(sf,link,PETSCSF_ROOT2LEAF)); /* Used by SF nvshmem */
103771438e86SJunchao Zhang   }
1038*9566063dSJacob Faibussowitsch   PetscCall(PetscLogEventBegin(PETSCSF_Pack,sf,0,0,0));
1039*9566063dSJacob Faibussowitsch   if (bas->rootbuflen[scope]) PetscCall(PetscSFLinkPackRootData_Private(sf,link,scope,rootdata));
1040*9566063dSJacob Faibussowitsch   PetscCall(PetscLogEventEnd(PETSCSF_Pack,sf,0,0,0));
104171438e86SJunchao Zhang   PetscFunctionReturn(0);
104271438e86SJunchao Zhang }
104371438e86SJunchao Zhang /* Pack leafdata to leafbuf, which are in the same memory space */
104471438e86SJunchao Zhang PetscErrorCode PetscSFLinkPackLeafData(PetscSF sf,PetscSFLink link,PetscSFScope scope,const void *leafdata)
104571438e86SJunchao Zhang {
104671438e86SJunchao Zhang   PetscFunctionBegin;
104771438e86SJunchao Zhang   if (scope == PETSCSF_REMOTE) {
1048*9566063dSJacob Faibussowitsch     if (PetscMemTypeDevice(link->leafmtype) && link->SyncDevice && sf->unknown_input_stream) PetscCall((*link->SyncDevice)(link));
1049*9566063dSJacob Faibussowitsch     if (link->PrePack) PetscCall((*link->PrePack)(sf,link,PETSCSF_LEAF2ROOT));  /* Used by SF nvshmem */
105071438e86SJunchao Zhang   }
1051*9566063dSJacob Faibussowitsch   PetscCall(PetscLogEventBegin(PETSCSF_Pack,sf,0,0,0));
1052*9566063dSJacob Faibussowitsch   if (sf->leafbuflen[scope]) PetscCall(PetscSFLinkPackLeafData_Private(sf,link,scope,leafdata));
1053*9566063dSJacob Faibussowitsch   PetscCall(PetscLogEventEnd(PETSCSF_Pack,sf,0,0,0));
105471438e86SJunchao Zhang   PetscFunctionReturn(0);
105571438e86SJunchao Zhang }
105671438e86SJunchao Zhang 
105771438e86SJunchao Zhang PetscErrorCode PetscSFLinkUnpackRootData_Private(PetscSF sf,PetscSFLink link,PetscSFScope scope,void *rootdata,MPI_Op op)
1058cd620004SJunchao Zhang {
1059cd620004SJunchao Zhang   const PetscInt   *rootindices = NULL;
1060cd620004SJunchao Zhang   PetscInt         count,start;
1061cd620004SJunchao Zhang   PetscSF_Basic    *bas = (PetscSF_Basic*)sf->data;
1062fcc7397dSJunchao Zhang   PetscErrorCode   (*UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*) = NULL;
1063cd620004SJunchao Zhang   PetscMemType     rootmtype = link->rootmtype;
1064fcc7397dSJunchao Zhang   PetscSFPackOpt   opt = NULL;
1065cd620004SJunchao Zhang 
1066cd620004SJunchao Zhang   PetscFunctionBegin;
106771438e86SJunchao Zhang   if (!link->rootdirect[scope]) { /* If rootdata works directly as rootbuf, skip unpacking */
1068*9566063dSJacob Faibussowitsch     PetscCall(PetscSFLinkGetUnpackAndOp(link,rootmtype,op,bas->rootdups[scope],&UnpackAndOp));
1069cd620004SJunchao Zhang     if (UnpackAndOp) {
1070*9566063dSJacob Faibussowitsch       PetscCall(PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,scope,&count,&start,&opt,&rootindices));
1071*9566063dSJacob Faibussowitsch       PetscCall((*UnpackAndOp)(link,count,start,opt,rootindices,rootdata,link->rootbuf[scope][rootmtype]));
1072cd620004SJunchao Zhang     } else {
1073*9566063dSJacob Faibussowitsch       PetscCall(PetscSFLinkGetRootPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,scope,&count,&start,&opt,&rootindices));
1074*9566063dSJacob Faibussowitsch       PetscCall(PetscSFLinkUnpackDataWithMPIReduceLocal(sf,link,count,start,rootindices,rootdata,link->rootbuf[scope][rootmtype],op));
1075cd620004SJunchao Zhang     }
1076cd620004SJunchao Zhang   }
1077*9566063dSJacob Faibussowitsch   PetscCall(PetscSFLinkLogFlopsAfterUnpackRootData(sf,link,scope,op));
1078cd620004SJunchao Zhang   PetscFunctionReturn(0);
1079cd620004SJunchao Zhang }
1080cd620004SJunchao Zhang 
108171438e86SJunchao Zhang PetscErrorCode PetscSFLinkUnpackLeafData_Private(PetscSF sf,PetscSFLink link,PetscSFScope scope,void *leafdata,MPI_Op op)
1082cd620004SJunchao Zhang {
1083cd620004SJunchao Zhang   const PetscInt   *leafindices = NULL;
1084cd620004SJunchao Zhang   PetscInt         count,start;
1085fcc7397dSJunchao Zhang   PetscErrorCode   (*UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*) = NULL;
1086cd620004SJunchao Zhang   PetscMemType     leafmtype = link->leafmtype;
1087fcc7397dSJunchao Zhang   PetscSFPackOpt   opt = NULL;
1088cd620004SJunchao Zhang 
1089cd620004SJunchao Zhang   PetscFunctionBegin;
109071438e86SJunchao Zhang   if (!link->leafdirect[scope]) { /* If leafdata works directly as rootbuf, skip unpacking */
1091*9566063dSJacob Faibussowitsch     PetscCall(PetscSFLinkGetUnpackAndOp(link,leafmtype,op,sf->leafdups[scope],&UnpackAndOp));
1092cd620004SJunchao Zhang     if (UnpackAndOp) {
1093*9566063dSJacob Faibussowitsch       PetscCall(PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,scope,&count,&start,&opt,&leafindices));
1094*9566063dSJacob Faibussowitsch       PetscCall((*UnpackAndOp)(link,count,start,opt,leafindices,leafdata,link->leafbuf[scope][leafmtype]));
1095cd620004SJunchao Zhang     } else {
1096*9566063dSJacob Faibussowitsch       PetscCall(PetscSFLinkGetLeafPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,scope,&count,&start,&opt,&leafindices));
1097*9566063dSJacob Faibussowitsch       PetscCall(PetscSFLinkUnpackDataWithMPIReduceLocal(sf,link,count,start,leafindices,leafdata,link->leafbuf[scope][leafmtype],op));
1098cd620004SJunchao Zhang     }
1099cd620004SJunchao Zhang   }
1100*9566063dSJacob Faibussowitsch   PetscCall(PetscSFLinkLogFlopsAfterUnpackLeafData(sf,link,scope,op));
110171438e86SJunchao Zhang   PetscFunctionReturn(0);
110271438e86SJunchao Zhang }
110371438e86SJunchao Zhang /* Unpack rootbuf to rootdata, which are in the same memory space */
110471438e86SJunchao Zhang PetscErrorCode PetscSFLinkUnpackRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,void *rootdata,MPI_Op op)
110571438e86SJunchao Zhang {
110671438e86SJunchao Zhang   PetscSF_Basic    *bas = (PetscSF_Basic*)sf->data;
110771438e86SJunchao Zhang 
110871438e86SJunchao Zhang   PetscFunctionBegin;
1109*9566063dSJacob Faibussowitsch   PetscCall(PetscLogEventBegin(PETSCSF_Unpack,sf,0,0,0));
1110*9566063dSJacob Faibussowitsch   if (bas->rootbuflen[scope]) PetscCall(PetscSFLinkUnpackRootData_Private(sf,link,scope,rootdata,op));
1111*9566063dSJacob Faibussowitsch   PetscCall(PetscLogEventEnd(PETSCSF_Unpack,sf,0,0,0));
111271438e86SJunchao Zhang   if (scope == PETSCSF_REMOTE) {
1113*9566063dSJacob Faibussowitsch     if (link->PostUnpack) PetscCall((*link->PostUnpack)(sf,link,PETSCSF_LEAF2ROOT));  /* Used by SF nvshmem */
1114*9566063dSJacob Faibussowitsch     if (PetscMemTypeDevice(link->rootmtype) && link->SyncDevice && sf->unknown_input_stream) PetscCall((*link->SyncDevice)(link));
111571438e86SJunchao Zhang   }
1116cd620004SJunchao Zhang   PetscFunctionReturn(0);
1117cd620004SJunchao Zhang }
1118cd620004SJunchao Zhang 
111971438e86SJunchao Zhang /* Unpack leafbuf to leafdata for remote (common case) or local (rare case when rootmtype != leafmtype) */
112071438e86SJunchao Zhang PetscErrorCode PetscSFLinkUnpackLeafData(PetscSF sf,PetscSFLink link,PetscSFScope scope,void *leafdata,MPI_Op op)
112171438e86SJunchao Zhang {
112271438e86SJunchao Zhang   PetscFunctionBegin;
1123*9566063dSJacob Faibussowitsch   PetscCall(PetscLogEventBegin(PETSCSF_Unpack,sf,0,0,0));
1124*9566063dSJacob Faibussowitsch   if (sf->leafbuflen[scope]) PetscCall(PetscSFLinkUnpackLeafData_Private(sf,link,scope,leafdata,op));
1125*9566063dSJacob Faibussowitsch   PetscCall(PetscLogEventEnd(PETSCSF_Unpack,sf,0,0,0));
112671438e86SJunchao Zhang   if (scope == PETSCSF_REMOTE) {
1127*9566063dSJacob Faibussowitsch     if (link->PostUnpack) PetscCall((*link->PostUnpack)(sf,link,PETSCSF_ROOT2LEAF));  /* Used by SF nvshmem */
1128*9566063dSJacob Faibussowitsch     if (PetscMemTypeDevice(link->leafmtype) && link->SyncDevice && sf->unknown_input_stream) PetscCall((*link->SyncDevice)(link));
112971438e86SJunchao Zhang   }
113071438e86SJunchao Zhang   PetscFunctionReturn(0);
113171438e86SJunchao Zhang }
113271438e86SJunchao Zhang 
113371438e86SJunchao Zhang /* FetchAndOp rootdata with rootbuf, it is a kind of Unpack on rootdata, except it also updates rootbuf */
113471438e86SJunchao Zhang PetscErrorCode PetscSFLinkFetchAndOpRemote(PetscSF sf,PetscSFLink link,void *rootdata,MPI_Op op)
1135cd620004SJunchao Zhang {
1136cd620004SJunchao Zhang   const PetscInt     *rootindices = NULL;
1137cd620004SJunchao Zhang   PetscInt           count,start;
1138cd620004SJunchao Zhang   PetscSF_Basic      *bas = (PetscSF_Basic*)sf->data;
1139fcc7397dSJunchao Zhang   PetscErrorCode     (*FetchAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,void*) = NULL;
1140cd620004SJunchao Zhang   PetscMemType       rootmtype = link->rootmtype;
1141fcc7397dSJunchao Zhang   PetscSFPackOpt     opt = NULL;
1142cd620004SJunchao Zhang 
1143cd620004SJunchao Zhang   PetscFunctionBegin;
1144*9566063dSJacob Faibussowitsch   PetscCall(PetscLogEventBegin(PETSCSF_Unpack,sf,0,0,0));
114571438e86SJunchao Zhang   if (bas->rootbuflen[PETSCSF_REMOTE]) {
1146cd620004SJunchao Zhang     /* Do FetchAndOp on rootdata with rootbuf */
1147*9566063dSJacob Faibussowitsch     PetscCall(PetscSFLinkGetFetchAndOp(link,rootmtype,op,bas->rootdups[PETSCSF_REMOTE],&FetchAndOp));
1148*9566063dSJacob Faibussowitsch     PetscCall(PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,PETSCSF_REMOTE,&count,&start,&opt,&rootindices));
1149*9566063dSJacob Faibussowitsch     PetscCall((*FetchAndOp)(link,count,start,opt,rootindices,rootdata,link->rootbuf[PETSCSF_REMOTE][rootmtype]));
1150cd620004SJunchao Zhang   }
1151*9566063dSJacob Faibussowitsch   PetscCall(PetscSFLinkLogFlopsAfterUnpackRootData(sf,link,PETSCSF_REMOTE,op));
1152*9566063dSJacob Faibussowitsch   PetscCall(PetscLogEventEnd(PETSCSF_Unpack,sf,0,0,0));
1153cd620004SJunchao Zhang   PetscFunctionReturn(0);
1154cd620004SJunchao Zhang }
1155cd620004SJunchao Zhang 
115671438e86SJunchao Zhang PetscErrorCode PetscSFLinkScatterLocal(PetscSF sf,PetscSFLink link,PetscSFDirection direction,void *rootdata,void *leafdata,MPI_Op op)
1157cd620004SJunchao Zhang {
1158cd620004SJunchao Zhang   const PetscInt       *rootindices = NULL,*leafindices = NULL;
1159cd620004SJunchao Zhang   PetscInt             count,rootstart,leafstart;
1160cd620004SJunchao Zhang   PetscSF_Basic        *bas = (PetscSF_Basic*)sf->data;
1161fcc7397dSJunchao Zhang   PetscErrorCode       (*ScatterAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*) = NULL;
116271438e86SJunchao Zhang   PetscMemType         rootmtype = link->rootmtype,leafmtype = link->leafmtype,srcmtype,dstmtype;
1163fcc7397dSJunchao Zhang   PetscSFPackOpt       leafopt = NULL,rootopt = NULL;
116471438e86SJunchao Zhang   PetscInt             buflen = sf->leafbuflen[PETSCSF_LOCAL];
116571438e86SJunchao Zhang   char                 *srcbuf = NULL,*dstbuf = NULL;
116671438e86SJunchao Zhang   PetscBool            dstdups;
1167cd620004SJunchao Zhang 
1168362febeeSStefano Zampini   PetscFunctionBegin;
116971438e86SJunchao Zhang   if (!buflen) PetscFunctionReturn(0);
117071438e86SJunchao Zhang   if (rootmtype != leafmtype) { /* The cross memory space local scatter is done by pack, copy and unpack */
117171438e86SJunchao Zhang     if (direction == PETSCSF_ROOT2LEAF) {
1172*9566063dSJacob Faibussowitsch       PetscCall(PetscSFLinkPackRootData(sf,link,PETSCSF_LOCAL,rootdata));
117371438e86SJunchao Zhang       srcmtype = rootmtype;
117471438e86SJunchao Zhang       srcbuf   = link->rootbuf[PETSCSF_LOCAL][rootmtype];
117571438e86SJunchao Zhang       dstmtype = leafmtype;
117671438e86SJunchao Zhang       dstbuf   = link->leafbuf[PETSCSF_LOCAL][leafmtype];
117771438e86SJunchao Zhang     } else {
1178*9566063dSJacob Faibussowitsch       PetscCall(PetscSFLinkPackLeafData(sf,link,PETSCSF_LOCAL,leafdata));
117971438e86SJunchao Zhang       srcmtype = leafmtype;
118071438e86SJunchao Zhang       srcbuf   = link->leafbuf[PETSCSF_LOCAL][leafmtype];
118171438e86SJunchao Zhang       dstmtype = rootmtype;
118271438e86SJunchao Zhang       dstbuf   = link->rootbuf[PETSCSF_LOCAL][rootmtype];
118371438e86SJunchao Zhang     }
1184*9566063dSJacob Faibussowitsch     PetscCall((*link->Memcpy)(link,dstmtype,dstbuf,srcmtype,srcbuf,buflen*link->unitbytes));
118571438e86SJunchao Zhang     /* If above is a device to host copy, we have to sync the stream before accessing the buffer on host */
1186*9566063dSJacob Faibussowitsch     if (PetscMemTypeHost(dstmtype)) PetscCall((*link->SyncStream)(link));
118771438e86SJunchao Zhang     if (direction == PETSCSF_ROOT2LEAF) {
1188*9566063dSJacob Faibussowitsch       PetscCall(PetscSFLinkUnpackLeafData(sf,link,PETSCSF_LOCAL,leafdata,op));
1189cd620004SJunchao Zhang     } else {
1190*9566063dSJacob Faibussowitsch       PetscCall(PetscSFLinkUnpackRootData(sf,link,PETSCSF_LOCAL,rootdata,op));
119171438e86SJunchao Zhang     }
119271438e86SJunchao Zhang   } else {
119371438e86SJunchao Zhang     dstdups  = (direction == PETSCSF_ROOT2LEAF) ? sf->leafdups[PETSCSF_LOCAL] : bas->rootdups[PETSCSF_LOCAL];
119471438e86SJunchao Zhang     dstmtype = (direction == PETSCSF_ROOT2LEAF) ? link->leafmtype : link->rootmtype;
1195*9566063dSJacob Faibussowitsch     PetscCall(PetscSFLinkGetScatterAndOp(link,dstmtype,op,dstdups,&ScatterAndOp));
1196cd620004SJunchao Zhang     if (ScatterAndOp) {
1197*9566063dSJacob Faibussowitsch       PetscCall(PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,PETSCSF_LOCAL,&count,&rootstart,&rootopt,&rootindices));
1198*9566063dSJacob Faibussowitsch       PetscCall(PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,PETSCSF_LOCAL,&count,&leafstart,&leafopt,&leafindices));
119971438e86SJunchao Zhang       if (direction == PETSCSF_ROOT2LEAF) {
1200*9566063dSJacob Faibussowitsch         PetscCall((*ScatterAndOp)(link,count,rootstart,rootopt,rootindices,rootdata,leafstart,leafopt,leafindices,leafdata));
1201cd620004SJunchao Zhang       } else {
1202*9566063dSJacob Faibussowitsch         PetscCall((*ScatterAndOp)(link,count,leafstart,leafopt,leafindices,leafdata,rootstart,rootopt,rootindices,rootdata));
120371438e86SJunchao Zhang       }
1204cd620004SJunchao Zhang     } else {
1205*9566063dSJacob Faibussowitsch       PetscCall(PetscSFLinkGetRootPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,PETSCSF_LOCAL,&count,&rootstart,&rootopt,&rootindices));
1206*9566063dSJacob Faibussowitsch       PetscCall(PetscSFLinkGetLeafPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,PETSCSF_LOCAL,&count,&leafstart,&leafopt,&leafindices));
120771438e86SJunchao Zhang       if (direction == PETSCSF_ROOT2LEAF) {
1208*9566063dSJacob Faibussowitsch         PetscCall(PetscSFLinkScatterDataWithMPIReduceLocal(sf,link,count,rootstart,rootindices,rootdata,leafstart,leafindices,leafdata,op));
120971438e86SJunchao Zhang       } else {
1210*9566063dSJacob Faibussowitsch         PetscCall(PetscSFLinkScatterDataWithMPIReduceLocal(sf,link,count,leafstart,leafindices,leafdata,rootstart,rootindices,rootdata,op));
1211cd620004SJunchao Zhang       }
1212cd620004SJunchao Zhang     }
121371438e86SJunchao Zhang   }
1214cd620004SJunchao Zhang   PetscFunctionReturn(0);
1215cd620004SJunchao Zhang }
1216cd620004SJunchao Zhang 
1217cd620004SJunchao Zhang /* Fetch rootdata to leafdata and leafupdate locally */
1218cd620004SJunchao Zhang PetscErrorCode PetscSFLinkFetchAndOpLocal(PetscSF sf,PetscSFLink link,void *rootdata,const void *leafdata,void *leafupdate,MPI_Op op)
1219cd620004SJunchao Zhang {
1220cd620004SJunchao Zhang   const PetscInt       *rootindices = NULL,*leafindices = NULL;
1221cd620004SJunchao Zhang   PetscInt             count,rootstart,leafstart;
1222cd620004SJunchao Zhang   PetscSF_Basic        *bas = (PetscSF_Basic*)sf->data;
1223fcc7397dSJunchao Zhang   PetscErrorCode       (*FetchAndOpLocal)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*) = NULL;
1224cd620004SJunchao Zhang   const PetscMemType   rootmtype = link->rootmtype,leafmtype = link->leafmtype;
1225fcc7397dSJunchao Zhang   PetscSFPackOpt       leafopt = NULL,rootopt = NULL;
1226cd620004SJunchao Zhang 
1227cd620004SJunchao Zhang   PetscFunctionBegin;
1228cd620004SJunchao Zhang   if (!bas->rootbuflen[PETSCSF_LOCAL]) PetscFunctionReturn(0);
1229cd620004SJunchao Zhang   if (rootmtype != leafmtype) {
1230cd620004SJunchao Zhang     /* The local communication has to go through pack and unpack */
1231cd620004SJunchao Zhang     SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Doing PetscSFFetchAndOp with rootdata and leafdata on opposite side of CPU and GPU");
1232cd620004SJunchao Zhang   } else {
1233*9566063dSJacob Faibussowitsch     PetscCall(PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,PETSCSF_LOCAL,&count,&rootstart,&rootopt,&rootindices));
1234*9566063dSJacob Faibussowitsch     PetscCall(PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,PETSCSF_LOCAL,&count,&leafstart,&leafopt,&leafindices));
1235*9566063dSJacob Faibussowitsch     PetscCall(PetscSFLinkGetFetchAndOpLocal(link,rootmtype,op,bas->rootdups[PETSCSF_LOCAL],&FetchAndOpLocal));
1236*9566063dSJacob Faibussowitsch     PetscCall((*FetchAndOpLocal)(link,count,rootstart,rootopt,rootindices,rootdata,leafstart,leafopt,leafindices,leafdata,leafupdate));
1237cd620004SJunchao Zhang   }
123840e23c03SJunchao Zhang   PetscFunctionReturn(0);
123940e23c03SJunchao Zhang }
124040e23c03SJunchao Zhang 
124140e23c03SJunchao Zhang /*
1242cd620004SJunchao Zhang   Create per-rank pack/unpack optimizations based on indice patterns
124340e23c03SJunchao Zhang 
124440e23c03SJunchao Zhang    Input Parameters:
1245fcc7397dSJunchao Zhang   +  n       - Number of destination ranks
1246eb02082bSJunchao Zhang   .  offset  - [n+1] For the i-th rank, its associated indices are idx[offset[i], offset[i+1]). offset[0] needs not to be 0.
1247b23bfdefSJunchao Zhang   -  idx     - [*]   Array storing indices
124840e23c03SJunchao Zhang 
124940e23c03SJunchao Zhang    Output Parameters:
1250cd620004SJunchao Zhang   +  opt     - Pack optimizations. NULL if no optimizations.
125140e23c03SJunchao Zhang */
1252cd620004SJunchao Zhang PetscErrorCode PetscSFCreatePackOpt(PetscInt n,const PetscInt *offset,const PetscInt *idx,PetscSFPackOpt *out)
125340e23c03SJunchao Zhang {
1254fcc7397dSJunchao Zhang   PetscInt       r,p,start,i,j,k,dx,dy,dz,dydz,m,X,Y;
1255fcc7397dSJunchao Zhang   PetscBool      optimizable = PETSC_TRUE;
125640e23c03SJunchao Zhang   PetscSFPackOpt opt;
125740e23c03SJunchao Zhang 
125840e23c03SJunchao Zhang   PetscFunctionBegin;
1259*9566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(1,&opt));
1260*9566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(7*n+2,&opt->array));
1261fcc7397dSJunchao Zhang   opt->n      = opt->array[0] = n;
1262fcc7397dSJunchao Zhang   opt->offset = opt->array + 1;
1263fcc7397dSJunchao Zhang   opt->start  = opt->array + n   + 2;
1264fcc7397dSJunchao Zhang   opt->dx     = opt->array + 2*n + 2;
1265fcc7397dSJunchao Zhang   opt->dy     = opt->array + 3*n + 2;
1266fcc7397dSJunchao Zhang   opt->dz     = opt->array + 4*n + 2;
1267fcc7397dSJunchao Zhang   opt->X      = opt->array + 5*n + 2;
1268fcc7397dSJunchao Zhang   opt->Y      = opt->array + 6*n + 2;
1269fcc7397dSJunchao Zhang 
1270fcc7397dSJunchao Zhang   for (r=0; r<n; r++) { /* For each destination rank */
1271fcc7397dSJunchao Zhang     m     = offset[r+1] - offset[r]; /* Total number of indices for this rank. We want to see if m can be factored into dx*dy*dz */
1272fcc7397dSJunchao Zhang     p     = offset[r];
1273fcc7397dSJunchao Zhang     start = idx[p]; /* First index for this rank */
1274fcc7397dSJunchao Zhang     p++;
1275fcc7397dSJunchao Zhang 
1276fcc7397dSJunchao Zhang     /* Search in X dimension */
1277fcc7397dSJunchao Zhang     for (dx=1; dx<m; dx++,p++) {
1278fcc7397dSJunchao Zhang       if (start+dx != idx[p]) break;
1279b23bfdefSJunchao Zhang     }
1280b23bfdefSJunchao Zhang 
1281fcc7397dSJunchao Zhang     dydz = m/dx;
1282fcc7397dSJunchao Zhang     X    = dydz > 1 ? (idx[p]-start) : dx;
1283fcc7397dSJunchao Zhang     /* Not optimizable if m is not a multiple of dx, or some unrecognized pattern is found */
1284fcc7397dSJunchao Zhang     if (m%dx || X <= 0) {optimizable = PETSC_FALSE; goto finish;}
1285fcc7397dSJunchao Zhang     for (dy=1; dy<dydz; dy++) { /* Search in Y dimension */
1286fcc7397dSJunchao Zhang       for (i=0; i<dx; i++,p++) {
1287fcc7397dSJunchao Zhang         if (start+X*dy+i != idx[p]) {
1288fcc7397dSJunchao Zhang           if (i) {optimizable = PETSC_FALSE; goto finish;} /* The pattern is violated in the middle of an x-walk */
1289fcc7397dSJunchao Zhang           else goto Z_dimension;
129040e23c03SJunchao Zhang         }
129140e23c03SJunchao Zhang       }
129240e23c03SJunchao Zhang     }
129340e23c03SJunchao Zhang 
1294fcc7397dSJunchao Zhang Z_dimension:
1295fcc7397dSJunchao Zhang     dz = m/(dx*dy);
1296fcc7397dSJunchao Zhang     Y  = dz > 1 ? (idx[p]-start)/X : dy;
1297fcc7397dSJunchao Zhang     /* Not optimizable if m is not a multiple of dx*dy, or some unrecognized pattern is found */
1298fcc7397dSJunchao Zhang     if (m%(dx*dy) || Y <= 0) {optimizable = PETSC_FALSE; goto finish;}
1299fcc7397dSJunchao Zhang     for (k=1; k<dz; k++) { /* Go through Z dimension to see if remaining indices follow the pattern */
1300fcc7397dSJunchao Zhang       for (j=0; j<dy; j++) {
1301fcc7397dSJunchao Zhang         for (i=0; i<dx; i++,p++) {
1302fcc7397dSJunchao Zhang           if (start+X*Y*k+X*j+i != idx[p]) {optimizable = PETSC_FALSE; goto finish;}
130340e23c03SJunchao Zhang         }
130440e23c03SJunchao Zhang       }
130540e23c03SJunchao Zhang     }
1306fcc7397dSJunchao Zhang     opt->start[r] = start;
1307fcc7397dSJunchao Zhang     opt->dx[r]    = dx;
1308fcc7397dSJunchao Zhang     opt->dy[r]    = dy;
1309fcc7397dSJunchao Zhang     opt->dz[r]    = dz;
1310fcc7397dSJunchao Zhang     opt->X[r]     = X;
1311fcc7397dSJunchao Zhang     opt->Y[r]     = Y;
131240e23c03SJunchao Zhang   }
131340e23c03SJunchao Zhang 
1314fcc7397dSJunchao Zhang finish:
1315fcc7397dSJunchao Zhang   /* If not optimizable, free arrays to save memory */
1316fcc7397dSJunchao Zhang   if (!n || !optimizable) {
1317*9566063dSJacob Faibussowitsch     PetscCall(PetscFree(opt->array));
1318*9566063dSJacob Faibussowitsch     PetscCall(PetscFree(opt));
131940e23c03SJunchao Zhang     *out = NULL;
1320fcc7397dSJunchao Zhang   } else {
1321fcc7397dSJunchao Zhang     opt->offset[0] = 0;
1322fcc7397dSJunchao Zhang     for (r=0; r<n; r++) opt->offset[r+1] = opt->offset[r] + opt->dx[r]*opt->dy[r]*opt->dz[r];
1323fcc7397dSJunchao Zhang     *out = opt;
1324fcc7397dSJunchao Zhang   }
132540e23c03SJunchao Zhang   PetscFunctionReturn(0);
132640e23c03SJunchao Zhang }
132740e23c03SJunchao Zhang 
13289fbee547SJacob Faibussowitsch static inline PetscErrorCode PetscSFDestroyPackOpt(PetscSF sf,PetscMemType mtype,PetscSFPackOpt *out)
132940e23c03SJunchao Zhang {
133040e23c03SJunchao Zhang   PetscSFPackOpt opt = *out;
133140e23c03SJunchao Zhang 
133240e23c03SJunchao Zhang   PetscFunctionBegin;
133340e23c03SJunchao Zhang   if (opt) {
1334*9566063dSJacob Faibussowitsch     PetscCall(PetscSFFree(sf,mtype,opt->array));
1335*9566063dSJacob Faibussowitsch     PetscCall(PetscFree(opt));
133640e23c03SJunchao Zhang     *out = NULL;
133740e23c03SJunchao Zhang   }
133840e23c03SJunchao Zhang   PetscFunctionReturn(0);
133940e23c03SJunchao Zhang }
1340cd620004SJunchao Zhang 
1341cd620004SJunchao Zhang PetscErrorCode PetscSFSetUpPackFields(PetscSF sf)
1342cd620004SJunchao Zhang {
1343cd620004SJunchao Zhang   PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
1344cd620004SJunchao Zhang   PetscInt       i,j;
1345cd620004SJunchao Zhang 
1346cd620004SJunchao Zhang   PetscFunctionBegin;
1347cd620004SJunchao Zhang   /* [0] for PETSCSF_LOCAL and [1] for PETSCSF_REMOTE in the following */
1348cd620004SJunchao Zhang   for (i=0; i<2; i++) { /* Set defaults */
1349cd620004SJunchao Zhang     sf->leafstart[i]   = 0;
1350cd620004SJunchao Zhang     sf->leafcontig[i]  = PETSC_TRUE;
1351cd620004SJunchao Zhang     sf->leafdups[i]    = PETSC_FALSE;
1352cd620004SJunchao Zhang     bas->rootstart[i]  = 0;
1353cd620004SJunchao Zhang     bas->rootcontig[i] = PETSC_TRUE;
1354cd620004SJunchao Zhang     bas->rootdups[i]   = PETSC_FALSE;
1355cd620004SJunchao Zhang   }
1356cd620004SJunchao Zhang 
1357cd620004SJunchao Zhang   sf->leafbuflen[0] = sf->roffset[sf->ndranks];
1358cd620004SJunchao Zhang   sf->leafbuflen[1] = sf->roffset[sf->nranks] - sf->roffset[sf->ndranks];
1359cd620004SJunchao Zhang 
1360cd620004SJunchao Zhang   if (sf->leafbuflen[0]) sf->leafstart[0] = sf->rmine[0];
1361cd620004SJunchao Zhang   if (sf->leafbuflen[1]) sf->leafstart[1] = sf->rmine[sf->roffset[sf->ndranks]];
1362cd620004SJunchao Zhang 
1363cd620004SJunchao Zhang   /* Are leaf indices for self and remote contiguous? If yes, it is best for pack/unpack */
1364cd620004SJunchao Zhang   for (i=0; i<sf->roffset[sf->ndranks]; i++) { /* self */
1365cd620004SJunchao Zhang     if (sf->rmine[i] != sf->leafstart[0]+i) {sf->leafcontig[0] = PETSC_FALSE; break;}
1366cd620004SJunchao Zhang   }
1367cd620004SJunchao Zhang   for (i=sf->roffset[sf->ndranks],j=0; i<sf->roffset[sf->nranks]; i++,j++) { /* remote */
1368cd620004SJunchao Zhang     if (sf->rmine[i] != sf->leafstart[1]+j) {sf->leafcontig[1] = PETSC_FALSE; break;}
1369cd620004SJunchao Zhang   }
1370cd620004SJunchao Zhang 
1371cd620004SJunchao Zhang   /* If not, see if we can have per-rank optimizations by doing index analysis */
1372*9566063dSJacob Faibussowitsch   if (!sf->leafcontig[0]) PetscCall(PetscSFCreatePackOpt(sf->ndranks,            sf->roffset,             sf->rmine, &sf->leafpackopt[0]));
1373*9566063dSJacob Faibussowitsch   if (!sf->leafcontig[1]) PetscCall(PetscSFCreatePackOpt(sf->nranks-sf->ndranks, sf->roffset+sf->ndranks, sf->rmine, &sf->leafpackopt[1]));
1374cd620004SJunchao Zhang 
1375cd620004SJunchao Zhang   /* Are root indices for self and remote contiguous? */
1376cd620004SJunchao Zhang   bas->rootbuflen[0] = bas->ioffset[bas->ndiranks];
1377cd620004SJunchao Zhang   bas->rootbuflen[1] = bas->ioffset[bas->niranks] - bas->ioffset[bas->ndiranks];
1378cd620004SJunchao Zhang 
1379cd620004SJunchao Zhang   if (bas->rootbuflen[0]) bas->rootstart[0] = bas->irootloc[0];
1380cd620004SJunchao Zhang   if (bas->rootbuflen[1]) bas->rootstart[1] = bas->irootloc[bas->ioffset[bas->ndiranks]];
1381cd620004SJunchao Zhang 
1382cd620004SJunchao Zhang   for (i=0; i<bas->ioffset[bas->ndiranks]; i++) {
1383cd620004SJunchao Zhang     if (bas->irootloc[i] != bas->rootstart[0]+i) {bas->rootcontig[0] = PETSC_FALSE; break;}
1384cd620004SJunchao Zhang   }
1385cd620004SJunchao Zhang   for (i=bas->ioffset[bas->ndiranks],j=0; i<bas->ioffset[bas->niranks]; i++,j++) {
1386cd620004SJunchao Zhang     if (bas->irootloc[i] != bas->rootstart[1]+j) {bas->rootcontig[1] = PETSC_FALSE; break;}
1387cd620004SJunchao Zhang   }
1388cd620004SJunchao Zhang 
1389*9566063dSJacob Faibussowitsch   if (!bas->rootcontig[0]) PetscCall(PetscSFCreatePackOpt(bas->ndiranks,              bas->ioffset,               bas->irootloc, &bas->rootpackopt[0]));
1390*9566063dSJacob Faibussowitsch   if (!bas->rootcontig[1]) PetscCall(PetscSFCreatePackOpt(bas->niranks-bas->ndiranks, bas->ioffset+bas->ndiranks, bas->irootloc, &bas->rootpackopt[1]));
1391cd620004SJunchao Zhang 
1392cd620004SJunchao Zhang     /* Check dups in indices so that CUDA unpacking kernels can use cheaper regular instructions instead of atomics when they know there are no data race chances */
1393013b3241SStefano Zampini   if (PetscDefined(HAVE_DEVICE)) {
1394013b3241SStefano Zampini     PetscBool ismulti = (sf->multi == sf) ? PETSC_TRUE : PETSC_FALSE;
1395*9566063dSJacob Faibussowitsch     if (!sf->leafcontig[0]  && !ismulti) PetscCall(PetscCheckDupsInt(sf->leafbuflen[0],  sf->rmine,                                 &sf->leafdups[0]));
1396*9566063dSJacob Faibussowitsch     if (!sf->leafcontig[1]  && !ismulti) PetscCall(PetscCheckDupsInt(sf->leafbuflen[1],  sf->rmine+sf->roffset[sf->ndranks],        &sf->leafdups[1]));
1397*9566063dSJacob Faibussowitsch     if (!bas->rootcontig[0] && !ismulti) PetscCall(PetscCheckDupsInt(bas->rootbuflen[0], bas->irootloc,                             &bas->rootdups[0]));
1398*9566063dSJacob Faibussowitsch     if (!bas->rootcontig[1] && !ismulti) PetscCall(PetscCheckDupsInt(bas->rootbuflen[1], bas->irootloc+bas->ioffset[bas->ndiranks], &bas->rootdups[1]));
1399013b3241SStefano Zampini   }
1400cd620004SJunchao Zhang   PetscFunctionReturn(0);
1401cd620004SJunchao Zhang }
1402cd620004SJunchao Zhang 
1403cd620004SJunchao Zhang PetscErrorCode PetscSFResetPackFields(PetscSF sf)
1404cd620004SJunchao Zhang {
1405cd620004SJunchao Zhang   PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
1406cd620004SJunchao Zhang   PetscInt       i;
1407cd620004SJunchao Zhang 
1408cd620004SJunchao Zhang   PetscFunctionBegin;
1409cd620004SJunchao Zhang   for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) {
1410*9566063dSJacob Faibussowitsch     PetscCall(PetscSFDestroyPackOpt(sf,PETSC_MEMTYPE_HOST,&sf->leafpackopt[i]));
1411*9566063dSJacob Faibussowitsch     PetscCall(PetscSFDestroyPackOpt(sf,PETSC_MEMTYPE_HOST,&bas->rootpackopt[i]));
14127fd2d3dbSJunchao Zhang    #if defined(PETSC_HAVE_DEVICE)
1413*9566063dSJacob Faibussowitsch     PetscCall(PetscSFDestroyPackOpt(sf,PETSC_MEMTYPE_DEVICE,&sf->leafpackopt_d[i]));
1414*9566063dSJacob Faibussowitsch     PetscCall(PetscSFDestroyPackOpt(sf,PETSC_MEMTYPE_DEVICE,&bas->rootpackopt_d[i]));
14157fd2d3dbSJunchao Zhang    #endif
1416cd620004SJunchao Zhang   }
1417cd620004SJunchao Zhang   PetscFunctionReturn(0);
1418cd620004SJunchao Zhang }
1419