xref: /petsc/src/vec/is/sf/impls/basic/sfpack.c (revision 4165533ce9fbcd1440639b2ad505faa054a49d0c)
120c24465SJunchao Zhang #include "petsc/private/sfimpl.h"
240e23c03SJunchao Zhang #include <../src/vec/is/sf/impls/basic/sfpack.h>
340e23c03SJunchao Zhang #include <../src/vec/is/sf/impls/basic/sfbasic.h>
440e23c03SJunchao Zhang 
57fd2d3dbSJunchao Zhang /* This is a C file that contains packing facilities, with dispatches to device if enabled. */
67fd2d3dbSJunchao Zhang 
740e23c03SJunchao Zhang /*
840e23c03SJunchao Zhang  * MPI_Reduce_local is not really useful because it can't handle sparse data and it vectorizes "in the wrong direction",
940e23c03SJunchao Zhang  * therefore we pack data types manually. This file defines packing routines for the standard data types.
1040e23c03SJunchao Zhang  */
1140e23c03SJunchao Zhang 
12cd620004SJunchao Zhang #define CPPJoin4(a,b,c,d)  a##_##b##_##c##_##d
1340e23c03SJunchao Zhang 
14cd620004SJunchao Zhang /* Operations working like s += t */
15cd620004SJunchao Zhang #define OP_BINARY(op,s,t)   do {(s) = (s) op (t);  } while (0)      /* binary ops in the middle such as +, *, && etc. */
16cd620004SJunchao Zhang #define OP_FUNCTION(op,s,t) do {(s) = op((s),(t)); } while (0)      /* ops like a function, such as PetscMax, PetscMin */
17cd620004SJunchao Zhang #define OP_LXOR(op,s,t)     do {(s) = (!(s)) != (!(t));} while (0)  /* logical exclusive OR */
18cd620004SJunchao Zhang #define OP_ASSIGN(op,s,t)   do {(s) = (t);} while (0)
19cd620004SJunchao Zhang /* Ref MPI MAXLOC */
20cd620004SJunchao Zhang #define OP_XLOC(op,s,t) \
21cd620004SJunchao Zhang   do {                                       \
22cd620004SJunchao Zhang     if ((s).u == (t).u) (s).i = PetscMin((s).i,(t).i); \
23cd620004SJunchao Zhang     else if (!((s).u op (t).u)) s = t;           \
24cd620004SJunchao Zhang   } while (0)
2540e23c03SJunchao Zhang 
2640e23c03SJunchao Zhang /* DEF_PackFunc - macro defining a Pack routine
2740e23c03SJunchao Zhang 
2840e23c03SJunchao Zhang    Arguments of the macro:
29b23bfdefSJunchao Zhang    +Type      Type of the basic data in an entry, i.e., int, PetscInt, PetscReal etc. It is not the type of an entry.
30fcc7397dSJunchao Zhang    .BS        Block size for vectorization. It is a factor of bsz.
31b23bfdefSJunchao Zhang    -EQ        (bs == BS) ? 1 : 0. EQ is a compile-time const to help compiler optimizations. See below.
3240e23c03SJunchao Zhang 
3340e23c03SJunchao Zhang    Arguments of the Pack routine:
34cd620004SJunchao Zhang    +count     Number of indices in idx[].
35fcc7397dSJunchao Zhang    .start     When opt and idx are NULL, it means indices are contiguous & start is the first index; otherwise, not used.
36fcc7397dSJunchao Zhang    .opt       Per-pack optimization plan. NULL means no such plan.
37fcc7397dSJunchao Zhang    .idx       Indices of entries to packed.
38eb02082bSJunchao Zhang    .link      Provide a context for the current call, such as link->bs, number of basic types in an entry. Ex. if unit is MPI_2INT, then bs=2 and the basic type is int.
39cd620004SJunchao Zhang    .unpacked  Address of the unpacked data. The entries will be packed are unpacked[idx[i]],for i in [0,count).
40cd620004SJunchao Zhang    -packed    Address of the packed data.
4140e23c03SJunchao Zhang  */
42b23bfdefSJunchao Zhang #define DEF_PackFunc(Type,BS,EQ) \
43fcc7397dSJunchao Zhang   static PetscErrorCode CPPJoin4(Pack,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,const void *unpacked,void *packed) \
44b23bfdefSJunchao Zhang   {                                                                                                          \
4540e23c03SJunchao Zhang     PetscErrorCode ierr;                                                                                     \
46b23bfdefSJunchao Zhang     const Type     *u = (const Type*)unpacked,*u2;                                                           \
47b23bfdefSJunchao Zhang     Type           *p = (Type*)packed,*p2;                                                                   \
48fcc7397dSJunchao Zhang     PetscInt       i,j,k,X,Y,r,bs=link->bs;                                                                  \
49fcc7397dSJunchao Zhang     const PetscInt M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */          \
50b23bfdefSJunchao Zhang     const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */                \
5140e23c03SJunchao Zhang     PetscFunctionBegin;                                                                                      \
52fcc7397dSJunchao Zhang     if (!idx) {ierr = PetscArraycpy(p,u+start*MBS,MBS*count);CHKERRQ(ierr);}/* idx[] are contiguous */       \
53fcc7397dSJunchao Zhang     else if (opt) { /* has optimizations available */                                                        \
54fcc7397dSJunchao Zhang       p2 = p;                                                                                                \
55fcc7397dSJunchao Zhang       for (r=0; r<opt->n; r++) {                                                                             \
56fcc7397dSJunchao Zhang         u2 = u + opt->start[r]*MBS;                                                                          \
57fcc7397dSJunchao Zhang         X  = opt->X[r];                                                                                      \
58fcc7397dSJunchao Zhang         Y  = opt->Y[r];                                                                                      \
59fcc7397dSJunchao Zhang         for (k=0; k<opt->dz[r]; k++)                                                                         \
60fcc7397dSJunchao Zhang           for (j=0; j<opt->dy[r]; j++) {                                                                     \
61fcc7397dSJunchao Zhang             ierr = PetscArraycpy(p2,u2+(X*Y*k+X*j)*MBS,opt->dx[r]*MBS);CHKERRQ(ierr);                        \
62fcc7397dSJunchao Zhang             p2  += opt->dx[r]*MBS;                                                                           \
63fcc7397dSJunchao Zhang           }                                                                                                  \
64fcc7397dSJunchao Zhang       }                                                                                                      \
65fcc7397dSJunchao Zhang     } else {                                                                                                 \
66b23bfdefSJunchao Zhang       for (i=0; i<count; i++)                                                                                \
67eb02082bSJunchao Zhang         for (j=0; j<M; j++)     /* Decent compilers should eliminate this loop when M = const 1 */           \
68eb02082bSJunchao Zhang           for (k=0; k<BS; k++)  /* Compiler either unrolls (BS=1) or vectorizes (BS=2,4,8,etc) this loop */  \
69b23bfdefSJunchao Zhang             p[i*MBS+j*BS+k] = u[idx[i]*MBS+j*BS+k];                                                          \
7040e23c03SJunchao Zhang     }                                                                                                        \
7140e23c03SJunchao Zhang     PetscFunctionReturn(0);                                                                                  \
7240e23c03SJunchao Zhang   }
7340e23c03SJunchao Zhang 
74cd620004SJunchao Zhang /* DEF_Action - macro defining a UnpackAndInsert routine that unpacks data from a contiguous buffer
75cd620004SJunchao Zhang                 and inserts into a sparse array.
7640e23c03SJunchao Zhang 
7740e23c03SJunchao Zhang    Arguments:
78b23bfdefSJunchao Zhang   .Type       Type of the data
7940e23c03SJunchao Zhang   .BS         Block size for vectorization
80b23bfdefSJunchao Zhang   .EQ        (bs == BS) ? 1 : 0. EQ is a compile-time const.
8140e23c03SJunchao Zhang 
8240e23c03SJunchao Zhang   Notes:
8340e23c03SJunchao Zhang    This macro is not combined with DEF_ActionAndOp because we want to use memcpy in this macro.
8440e23c03SJunchao Zhang  */
85cd620004SJunchao Zhang #define DEF_UnpackFunc(Type,BS,EQ)               \
86fcc7397dSJunchao Zhang   static PetscErrorCode CPPJoin4(UnpackAndInsert,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,void *unpacked,const void *packed) \
87b23bfdefSJunchao Zhang   {                                                                                                          \
8840e23c03SJunchao Zhang     PetscErrorCode ierr;                                                                                     \
89b23bfdefSJunchao Zhang     Type           *u = (Type*)unpacked,*u2;                                                                 \
90fcc7397dSJunchao Zhang     const Type     *p = (const Type*)packed;                                                                 \
91fcc7397dSJunchao Zhang     PetscInt       i,j,k,X,Y,r,bs=link->bs;                                                                  \
92fcc7397dSJunchao Zhang     const PetscInt M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */          \
93b23bfdefSJunchao Zhang     const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */                \
9440e23c03SJunchao Zhang     PetscFunctionBegin;                                                                                      \
95b23bfdefSJunchao Zhang     if (!idx) {                                                                                              \
96fcc7397dSJunchao Zhang       u += start*MBS;                                                                                        \
97fcc7397dSJunchao Zhang       if (u != p) {ierr = PetscArraycpy(u,p,count*MBS);CHKERRQ(ierr);}                                       \
98fcc7397dSJunchao Zhang     } else if (opt) { /* has optimizations available */                                                      \
99fcc7397dSJunchao Zhang       for (r=0; r<opt->n; r++) {                                                                             \
100fcc7397dSJunchao Zhang         u2 = u + opt->start[r]*MBS;                                                                          \
101fcc7397dSJunchao Zhang         X  = opt->X[r];                                                                                      \
102fcc7397dSJunchao Zhang         Y  = opt->Y[r];                                                                                      \
103fcc7397dSJunchao Zhang         for (k=0; k<opt->dz[r]; k++)                                                                         \
104fcc7397dSJunchao Zhang           for (j=0; j<opt->dy[r]; j++) {                                                                     \
105fcc7397dSJunchao Zhang             ierr = PetscArraycpy(u2+(X*Y*k+X*j)*MBS,p,opt->dx[r]*MBS);CHKERRQ(ierr);                         \
106fcc7397dSJunchao Zhang             p   += opt->dx[r]*MBS;                                                                           \
107fcc7397dSJunchao Zhang           }                                                                                                  \
108fcc7397dSJunchao Zhang       }                                                                                                      \
109fcc7397dSJunchao Zhang     } else {                                                                                                 \
110b23bfdefSJunchao Zhang       for (i=0; i<count; i++)                                                                                \
111b23bfdefSJunchao Zhang         for (j=0; j<M; j++)                                                                                  \
112cd620004SJunchao Zhang           for (k=0; k<BS; k++) u[idx[i]*MBS+j*BS+k] = p[i*MBS+j*BS+k];                                       \
11340e23c03SJunchao Zhang     }                                                                                                        \
11440e23c03SJunchao Zhang     PetscFunctionReturn(0);                                                                                  \
11540e23c03SJunchao Zhang   }
11640e23c03SJunchao Zhang 
117cd620004SJunchao Zhang /* DEF_UnpackAndOp - macro defining a UnpackAndOp routine where Op should not be Insert
11840e23c03SJunchao Zhang 
11940e23c03SJunchao Zhang    Arguments:
120cd620004SJunchao Zhang   +Opname     Name of the Op, such as Add, Mult, LAND, etc.
121b23bfdefSJunchao Zhang   .Type       Type of the data
12240e23c03SJunchao Zhang   .BS         Block size for vectorization
123b23bfdefSJunchao Zhang   .EQ         (bs == BS) ? 1 : 0. EQ is a compile-time const.
124cd620004SJunchao Zhang   .Op         Operator for the op, such as +, *, &&, ||, PetscMax, PetscMin, etc.
125cd620004SJunchao Zhang   .OpApply    Macro defining application of the op. Could be OP_BINARY, OP_FUNCTION, OP_LXOR
12640e23c03SJunchao Zhang  */
127cd620004SJunchao Zhang #define DEF_UnpackAndOp(Type,BS,EQ,Opname,Op,OpApply) \
128fcc7397dSJunchao Zhang   static PetscErrorCode CPPJoin4(UnpackAnd##Opname,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,void *unpacked,const void *packed) \
129b23bfdefSJunchao Zhang   {                                                                                                          \
130cd620004SJunchao Zhang     Type           *u = (Type*)unpacked,*u2;                                                                 \
131fcc7397dSJunchao Zhang     const Type     *p = (const Type*)packed;                                                                 \
132fcc7397dSJunchao Zhang     PetscInt       i,j,k,X,Y,r,bs=link->bs;                                                                  \
133fcc7397dSJunchao Zhang     const PetscInt M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */          \
134b23bfdefSJunchao Zhang     const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */                \
13540e23c03SJunchao Zhang     PetscFunctionBegin;                                                                                      \
136b23bfdefSJunchao Zhang     if (!idx) {                                                                                              \
137fcc7397dSJunchao Zhang       u += start*MBS;                                                                                        \
138cd620004SJunchao Zhang       for (i=0; i<count; i++)                                                                                \
139cd620004SJunchao Zhang         for (j=0; j<M; j++)                                                                                  \
140cd620004SJunchao Zhang           for (k=0; k<BS; k++)                                                                               \
141cd620004SJunchao Zhang             OpApply(Op,u[i*MBS+j*BS+k],p[i*MBS+j*BS+k]);                                                     \
142fcc7397dSJunchao Zhang     } else if (opt) { /* idx[] has patterns */                                                               \
143fcc7397dSJunchao Zhang       for (r=0; r<opt->n; r++) {                                                                             \
144fcc7397dSJunchao Zhang         u2 = u + opt->start[r]*MBS;                                                                          \
145fcc7397dSJunchao Zhang         X  = opt->X[r];                                                                                      \
146fcc7397dSJunchao Zhang         Y  = opt->Y[r];                                                                                      \
147fcc7397dSJunchao Zhang         for (k=0; k<opt->dz[r]; k++)                                                                         \
148fcc7397dSJunchao Zhang           for (j=0; j<opt->dy[r]; j++) {                                                                     \
149fcc7397dSJunchao Zhang             for (i=0; i<opt->dx[r]*MBS; i++) OpApply(Op,u2[(X*Y*k+X*j)*MBS+i],p[i]);                         \
150fcc7397dSJunchao Zhang             p += opt->dx[r]*MBS;                                                                             \
151fcc7397dSJunchao Zhang           }                                                                                                  \
152fcc7397dSJunchao Zhang       }                                                                                                      \
153fcc7397dSJunchao Zhang     } else {                                                                                                 \
154cd620004SJunchao Zhang       for (i=0; i<count; i++)                                                                                \
155cd620004SJunchao Zhang         for (j=0; j<M; j++)                                                                                  \
156cd620004SJunchao Zhang           for (k=0; k<BS; k++)                                                                               \
157cd620004SJunchao Zhang             OpApply(Op,u[idx[i]*MBS+j*BS+k],p[i*MBS+j*BS+k]);                                                \
158cd620004SJunchao Zhang     }                                                                                                        \
159cd620004SJunchao Zhang     PetscFunctionReturn(0);                                                                                  \
160cd620004SJunchao Zhang   }
161cd620004SJunchao Zhang 
162cd620004SJunchao Zhang #define DEF_FetchAndOp(Type,BS,EQ,Opname,Op,OpApply) \
163fcc7397dSJunchao Zhang   static PetscErrorCode CPPJoin4(FetchAnd##Opname,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,void *unpacked,void *packed) \
164cd620004SJunchao Zhang   {                                                                                                          \
165fcc7397dSJunchao Zhang     Type           *u = (Type*)unpacked,*p = (Type*)packed,tmp;                                              \
166fcc7397dSJunchao Zhang     PetscInt       i,j,k,r,l,bs=link->bs;                                                                    \
167fcc7397dSJunchao Zhang     const PetscInt M = (EQ) ? 1 : bs/BS;                                                                     \
168fcc7397dSJunchao Zhang     const PetscInt MBS = M*BS;                                                                               \
169cd620004SJunchao Zhang     PetscFunctionBegin;                                                                                      \
170fcc7397dSJunchao Zhang     for (i=0; i<count; i++) {                                                                                \
171fcc7397dSJunchao Zhang       r = (!idx ? start+i : idx[i])*MBS;                                                                     \
172fcc7397dSJunchao Zhang       l = i*MBS;                                                                                             \
173b23bfdefSJunchao Zhang       for (j=0; j<M; j++)                                                                                    \
174b23bfdefSJunchao Zhang         for (k=0; k<BS; k++) {                                                                               \
175fcc7397dSJunchao Zhang           tmp = u[r+j*BS+k];                                                                                 \
176fcc7397dSJunchao Zhang           OpApply(Op,u[r+j*BS+k],p[l+j*BS+k]);                                                               \
177fcc7397dSJunchao Zhang           p[l+j*BS+k] = tmp;                                                                                 \
178cd620004SJunchao Zhang         }                                                                                                    \
179cd620004SJunchao Zhang     }                                                                                                        \
180cd620004SJunchao Zhang     PetscFunctionReturn(0);                                                                                  \
181cd620004SJunchao Zhang   }
182cd620004SJunchao Zhang 
183cd620004SJunchao Zhang #define DEF_ScatterAndOp(Type,BS,EQ,Opname,Op,OpApply) \
184fcc7397dSJunchao Zhang   static PetscErrorCode CPPJoin4(ScatterAnd##Opname,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt srcStart,PetscSFPackOpt srcOpt,const PetscInt *srcIdx,const void *src,PetscInt dstStart,PetscSFPackOpt dstOpt,const PetscInt *dstIdx,void *dst) \
185cd620004SJunchao Zhang   {                                                                                                          \
186fcc7397dSJunchao Zhang     PetscErrorCode ierr;                                                                                     \
187fcc7397dSJunchao Zhang     const Type     *u = (const Type*)src;                                                                    \
188fcc7397dSJunchao Zhang     Type           *v = (Type*)dst;                                                                          \
189fcc7397dSJunchao Zhang     PetscInt       i,j,k,s,t,X,Y,bs = link->bs;                                                              \
190cd620004SJunchao Zhang     const PetscInt M = (EQ) ? 1 : bs/BS;                                                                     \
191cd620004SJunchao Zhang     const PetscInt MBS = M*BS;                                                                               \
192cd620004SJunchao Zhang     PetscFunctionBegin;                                                                                      \
193fcc7397dSJunchao Zhang     if (!srcIdx) { /* src is contiguous */                                                                   \
194fcc7397dSJunchao Zhang       u += srcStart*MBS;                                                                                     \
195fcc7397dSJunchao Zhang       ierr = CPPJoin4(UnpackAnd##Opname,Type,BS,EQ)(link,count,dstStart,dstOpt,dstIdx,dst,u);CHKERRQ(ierr);  \
196fcc7397dSJunchao Zhang     } else if (srcOpt && !dstIdx) { /* src is 3D, dst is contiguous */                                       \
197fcc7397dSJunchao Zhang       u += srcOpt->start[0]*MBS;                                                                             \
198fcc7397dSJunchao Zhang       v += dstStart*MBS;                                                                                     \
199fcc7397dSJunchao Zhang       X  = srcOpt->X[0]; Y = srcOpt->Y[0];                                                                   \
200fcc7397dSJunchao Zhang       for (k=0; k<srcOpt->dz[0]; k++)                                                                        \
201fcc7397dSJunchao Zhang         for (j=0; j<srcOpt->dy[0]; j++) {                                                                    \
202fcc7397dSJunchao Zhang           for (i=0; i<srcOpt->dx[0]*MBS; i++) OpApply(Op,v[i],u[(X*Y*k+X*j)*MBS+i]);                         \
203fcc7397dSJunchao Zhang           v += srcOpt->dx[0]*MBS;                                                                            \
204fcc7397dSJunchao Zhang         }                                                                                                    \
205fcc7397dSJunchao Zhang     } else { /* all other cases */                                                                           \
206fcc7397dSJunchao Zhang       for (i=0; i<count; i++) {                                                                              \
207fcc7397dSJunchao Zhang         s = (!srcIdx ? srcStart+i : srcIdx[i])*MBS;                                                          \
208fcc7397dSJunchao Zhang         t = (!dstIdx ? dstStart+i : dstIdx[i])*MBS;                                                          \
209cd620004SJunchao Zhang         for (j=0; j<M; j++)                                                                                  \
210fcc7397dSJunchao Zhang           for (k=0; k<BS; k++) OpApply(Op,v[t+j*BS+k],u[s+j*BS+k]);                                          \
211fcc7397dSJunchao Zhang       }                                                                                                      \
212cd620004SJunchao Zhang     }                                                                                                        \
213cd620004SJunchao Zhang     PetscFunctionReturn(0);                                                                                  \
214cd620004SJunchao Zhang   }
215cd620004SJunchao Zhang 
216cd620004SJunchao Zhang #define DEF_FetchAndOpLocal(Type,BS,EQ,Opname,Op,OpApply) \
217fcc7397dSJunchao Zhang   static PetscErrorCode CPPJoin4(FetchAnd##Opname##Local,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt rootstart,PetscSFPackOpt rootopt,const PetscInt *rootidx,void *rootdata,PetscInt leafstart,PetscSFPackOpt leafopt,const PetscInt *leafidx,const void *leafdata,void *leafupdate) \
218cd620004SJunchao Zhang   {                                                                                                          \
219fcc7397dSJunchao Zhang     Type           *rdata = (Type*)rootdata,*lupdate = (Type*)leafupdate;                                    \
220fcc7397dSJunchao Zhang     const Type     *ldata = (const Type*)leafdata;                                                           \
221fcc7397dSJunchao Zhang     PetscInt       i,j,k,r,l,bs = link->bs;                                                                  \
222cd620004SJunchao Zhang     const PetscInt M = (EQ) ? 1 : bs/BS;                                                                     \
223cd620004SJunchao Zhang     const PetscInt MBS = M*BS;                                                                               \
224cd620004SJunchao Zhang     PetscFunctionBegin;                                                                                      \
225fcc7397dSJunchao Zhang     for (i=0; i<count; i++) {                                                                                \
226fcc7397dSJunchao Zhang       r = (rootidx ? rootidx[i] : rootstart+i)*MBS;                                                          \
227fcc7397dSJunchao Zhang       l = (leafidx ? leafidx[i] : leafstart+i)*MBS;                                                          \
228cd620004SJunchao Zhang       for (j=0; j<M; j++)                                                                                    \
229cd620004SJunchao Zhang         for (k=0; k<BS; k++) {                                                                               \
230fcc7397dSJunchao Zhang           lupdate[l+j*BS+k] = rdata[r+j*BS+k];                                                               \
231fcc7397dSJunchao Zhang           OpApply(Op,rdata[r+j*BS+k],ldata[l+j*BS+k]);                                                       \
23240e23c03SJunchao Zhang         }                                                                                                    \
23340e23c03SJunchao Zhang     }                                                                                                        \
23440e23c03SJunchao Zhang     PetscFunctionReturn(0);                                                                                  \
23540e23c03SJunchao Zhang   }
23640e23c03SJunchao Zhang 
237b23bfdefSJunchao Zhang /* Pack, Unpack/Fetch ops */
238b23bfdefSJunchao Zhang #define DEF_Pack(Type,BS,EQ)                                                      \
239b23bfdefSJunchao Zhang   DEF_PackFunc(Type,BS,EQ)                                                        \
240cd620004SJunchao Zhang   DEF_UnpackFunc(Type,BS,EQ)                                                      \
241cd620004SJunchao Zhang   DEF_ScatterAndOp(Type,BS,EQ,Insert,=,OP_ASSIGN)                                 \
242cd620004SJunchao Zhang   static void CPPJoin4(PackInit_Pack,Type,BS,EQ)(PetscSFLink link) {              \
243eb02082bSJunchao Zhang     link->h_Pack            = CPPJoin4(Pack,           Type,BS,EQ);               \
244eb02082bSJunchao Zhang     link->h_UnpackAndInsert = CPPJoin4(UnpackAndInsert,Type,BS,EQ);               \
245cd620004SJunchao Zhang     link->h_ScatterAndInsert= CPPJoin4(ScatterAndInsert,Type,BS,EQ);              \
24640e23c03SJunchao Zhang   }
24740e23c03SJunchao Zhang 
248b23bfdefSJunchao Zhang /* Add, Mult ops */
249b23bfdefSJunchao Zhang #define DEF_Add(Type,BS,EQ)                                                       \
250cd620004SJunchao Zhang   DEF_UnpackAndOp    (Type,BS,EQ,Add, +,OP_BINARY)                                \
251cd620004SJunchao Zhang   DEF_UnpackAndOp    (Type,BS,EQ,Mult,*,OP_BINARY)                                \
252cd620004SJunchao Zhang   DEF_FetchAndOp     (Type,BS,EQ,Add, +,OP_BINARY)                                \
253cd620004SJunchao Zhang   DEF_ScatterAndOp   (Type,BS,EQ,Add, +,OP_BINARY)                                \
254cd620004SJunchao Zhang   DEF_ScatterAndOp   (Type,BS,EQ,Mult,*,OP_BINARY)                                \
255cd620004SJunchao Zhang   DEF_FetchAndOpLocal(Type,BS,EQ,Add, +,OP_BINARY)                                \
256cd620004SJunchao Zhang   static void CPPJoin4(PackInit_Add,Type,BS,EQ)(PetscSFLink link) {               \
257eb02082bSJunchao Zhang     link->h_UnpackAndAdd     = CPPJoin4(UnpackAndAdd,    Type,BS,EQ);             \
258eb02082bSJunchao Zhang     link->h_UnpackAndMult    = CPPJoin4(UnpackAndMult,   Type,BS,EQ);             \
259eb02082bSJunchao Zhang     link->h_FetchAndAdd      = CPPJoin4(FetchAndAdd,     Type,BS,EQ);             \
260cd620004SJunchao Zhang     link->h_ScatterAndAdd    = CPPJoin4(ScatterAndAdd,   Type,BS,EQ);             \
261cd620004SJunchao Zhang     link->h_ScatterAndMult   = CPPJoin4(ScatterAndMult,  Type,BS,EQ);             \
262cd620004SJunchao Zhang     link->h_FetchAndAddLocal = CPPJoin4(FetchAndAddLocal,Type,BS,EQ);             \
26340e23c03SJunchao Zhang   }
26440e23c03SJunchao Zhang 
265b23bfdefSJunchao Zhang /* Max, Min ops */
266b23bfdefSJunchao Zhang #define DEF_Cmp(Type,BS,EQ)                                                       \
267cd620004SJunchao Zhang   DEF_UnpackAndOp (Type,BS,EQ,Max,PetscMax,OP_FUNCTION)                           \
268cd620004SJunchao Zhang   DEF_UnpackAndOp (Type,BS,EQ,Min,PetscMin,OP_FUNCTION)                           \
269cd620004SJunchao Zhang   DEF_ScatterAndOp(Type,BS,EQ,Max,PetscMax,OP_FUNCTION)                           \
270cd620004SJunchao Zhang   DEF_ScatterAndOp(Type,BS,EQ,Min,PetscMin,OP_FUNCTION)                           \
271cd620004SJunchao Zhang   static void CPPJoin4(PackInit_Compare,Type,BS,EQ)(PetscSFLink link) {           \
272eb02082bSJunchao Zhang     link->h_UnpackAndMax    = CPPJoin4(UnpackAndMax,   Type,BS,EQ);               \
273eb02082bSJunchao Zhang     link->h_UnpackAndMin    = CPPJoin4(UnpackAndMin,   Type,BS,EQ);               \
274cd620004SJunchao Zhang     link->h_ScatterAndMax   = CPPJoin4(ScatterAndMax,  Type,BS,EQ);               \
275cd620004SJunchao Zhang     link->h_ScatterAndMin   = CPPJoin4(ScatterAndMin,  Type,BS,EQ);               \
276b23bfdefSJunchao Zhang   }
277b23bfdefSJunchao Zhang 
278b23bfdefSJunchao Zhang /* Logical ops.
279cd620004SJunchao Zhang   The operator in OP_LXOR should be empty but is ||. It is not used. Put here to avoid
28040e23c03SJunchao Zhang   the compilation warning "empty macro arguments are undefined in ISO C90"
28140e23c03SJunchao Zhang  */
282b23bfdefSJunchao Zhang #define DEF_Log(Type,BS,EQ)                                                       \
283cd620004SJunchao Zhang   DEF_UnpackAndOp (Type,BS,EQ,LAND,&&,OP_BINARY)                                  \
284cd620004SJunchao Zhang   DEF_UnpackAndOp (Type,BS,EQ,LOR, ||,OP_BINARY)                                  \
285cd620004SJunchao Zhang   DEF_UnpackAndOp (Type,BS,EQ,LXOR,||, OP_LXOR)                                   \
286cd620004SJunchao Zhang   DEF_ScatterAndOp(Type,BS,EQ,LAND,&&,OP_BINARY)                                  \
287cd620004SJunchao Zhang   DEF_ScatterAndOp(Type,BS,EQ,LOR, ||,OP_BINARY)                                  \
288cd620004SJunchao Zhang   DEF_ScatterAndOp(Type,BS,EQ,LXOR,||, OP_LXOR)                                   \
289cd620004SJunchao Zhang   static void CPPJoin4(PackInit_Logical,Type,BS,EQ)(PetscSFLink link) {           \
290eb02082bSJunchao Zhang     link->h_UnpackAndLAND   = CPPJoin4(UnpackAndLAND, Type,BS,EQ);                \
291eb02082bSJunchao Zhang     link->h_UnpackAndLOR    = CPPJoin4(UnpackAndLOR,  Type,BS,EQ);                \
292eb02082bSJunchao Zhang     link->h_UnpackAndLXOR   = CPPJoin4(UnpackAndLXOR, Type,BS,EQ);                \
293cd620004SJunchao Zhang     link->h_ScatterAndLAND  = CPPJoin4(ScatterAndLAND,Type,BS,EQ);                \
294cd620004SJunchao Zhang     link->h_ScatterAndLOR   = CPPJoin4(ScatterAndLOR, Type,BS,EQ);                \
295cd620004SJunchao Zhang     link->h_ScatterAndLXOR  = CPPJoin4(ScatterAndLXOR,Type,BS,EQ);                \
29640e23c03SJunchao Zhang   }
29740e23c03SJunchao Zhang 
298b23bfdefSJunchao Zhang /* Bitwise ops */
299b23bfdefSJunchao Zhang #define DEF_Bit(Type,BS,EQ)                                                       \
300cd620004SJunchao Zhang   DEF_UnpackAndOp (Type,BS,EQ,BAND,&,OP_BINARY)                                   \
301cd620004SJunchao Zhang   DEF_UnpackAndOp (Type,BS,EQ,BOR, |,OP_BINARY)                                   \
302cd620004SJunchao Zhang   DEF_UnpackAndOp (Type,BS,EQ,BXOR,^,OP_BINARY)                                   \
303cd620004SJunchao Zhang   DEF_ScatterAndOp(Type,BS,EQ,BAND,&,OP_BINARY)                                   \
304cd620004SJunchao Zhang   DEF_ScatterAndOp(Type,BS,EQ,BOR, |,OP_BINARY)                                   \
305cd620004SJunchao Zhang   DEF_ScatterAndOp(Type,BS,EQ,BXOR,^,OP_BINARY)                                   \
306cd620004SJunchao Zhang   static void CPPJoin4(PackInit_Bitwise,Type,BS,EQ)(PetscSFLink link) {           \
307eb02082bSJunchao Zhang     link->h_UnpackAndBAND   = CPPJoin4(UnpackAndBAND, Type,BS,EQ);                \
308eb02082bSJunchao Zhang     link->h_UnpackAndBOR    = CPPJoin4(UnpackAndBOR,  Type,BS,EQ);                \
309eb02082bSJunchao Zhang     link->h_UnpackAndBXOR   = CPPJoin4(UnpackAndBXOR, Type,BS,EQ);                \
310cd620004SJunchao Zhang     link->h_ScatterAndBAND  = CPPJoin4(ScatterAndBAND,Type,BS,EQ);                \
311cd620004SJunchao Zhang     link->h_ScatterAndBOR   = CPPJoin4(ScatterAndBOR, Type,BS,EQ);                \
312cd620004SJunchao Zhang     link->h_ScatterAndBXOR  = CPPJoin4(ScatterAndBXOR,Type,BS,EQ);                \
31340e23c03SJunchao Zhang   }
31440e23c03SJunchao Zhang 
315cd620004SJunchao Zhang /* Maxloc, Minloc ops */
316cd620004SJunchao Zhang #define DEF_Xloc(Type,BS,EQ)                                                      \
317cd620004SJunchao Zhang   DEF_UnpackAndOp (Type,BS,EQ,Max,>,OP_XLOC)                                      \
318cd620004SJunchao Zhang   DEF_UnpackAndOp (Type,BS,EQ,Min,<,OP_XLOC)                                      \
319cd620004SJunchao Zhang   DEF_ScatterAndOp(Type,BS,EQ,Max,>,OP_XLOC)                                      \
320cd620004SJunchao Zhang   DEF_ScatterAndOp(Type,BS,EQ,Min,<,OP_XLOC)                                      \
321cd620004SJunchao Zhang   static void CPPJoin4(PackInit_Xloc,Type,BS,EQ)(PetscSFLink link) {              \
322cd620004SJunchao Zhang     link->h_UnpackAndMaxloc  = CPPJoin4(UnpackAndMax, Type,BS,EQ);                \
323cd620004SJunchao Zhang     link->h_UnpackAndMinloc  = CPPJoin4(UnpackAndMin, Type,BS,EQ);                \
324cd620004SJunchao Zhang     link->h_ScatterAndMaxloc = CPPJoin4(ScatterAndMax,Type,BS,EQ);                \
325cd620004SJunchao Zhang     link->h_ScatterAndMinloc = CPPJoin4(ScatterAndMin,Type,BS,EQ);                \
32640e23c03SJunchao Zhang   }
32740e23c03SJunchao Zhang 
328b23bfdefSJunchao Zhang #define DEF_IntegerType(Type,BS,EQ)                                               \
329b23bfdefSJunchao Zhang   DEF_Pack(Type,BS,EQ)                                                            \
330b23bfdefSJunchao Zhang   DEF_Add(Type,BS,EQ)                                                             \
331b23bfdefSJunchao Zhang   DEF_Cmp(Type,BS,EQ)                                                             \
332b23bfdefSJunchao Zhang   DEF_Log(Type,BS,EQ)                                                             \
333b23bfdefSJunchao Zhang   DEF_Bit(Type,BS,EQ)                                                             \
334cd620004SJunchao Zhang   static void CPPJoin4(PackInit_IntegerType,Type,BS,EQ)(PetscSFLink link) {       \
335b23bfdefSJunchao Zhang     CPPJoin4(PackInit_Pack,Type,BS,EQ)(link);                                     \
336b23bfdefSJunchao Zhang     CPPJoin4(PackInit_Add,Type,BS,EQ)(link);                                      \
337b23bfdefSJunchao Zhang     CPPJoin4(PackInit_Compare,Type,BS,EQ)(link);                                  \
338b23bfdefSJunchao Zhang     CPPJoin4(PackInit_Logical,Type,BS,EQ)(link);                                  \
339b23bfdefSJunchao Zhang     CPPJoin4(PackInit_Bitwise,Type,BS,EQ)(link);                                  \
34040e23c03SJunchao Zhang   }
34140e23c03SJunchao Zhang 
342b23bfdefSJunchao Zhang #define DEF_RealType(Type,BS,EQ)                                                  \
343b23bfdefSJunchao Zhang   DEF_Pack(Type,BS,EQ)                                                            \
344b23bfdefSJunchao Zhang   DEF_Add(Type,BS,EQ)                                                             \
345b23bfdefSJunchao Zhang   DEF_Cmp(Type,BS,EQ)                                                             \
346cd620004SJunchao Zhang   static void CPPJoin4(PackInit_RealType,Type,BS,EQ)(PetscSFLink link) {          \
347b23bfdefSJunchao Zhang     CPPJoin4(PackInit_Pack,Type,BS,EQ)(link);                                     \
348b23bfdefSJunchao Zhang     CPPJoin4(PackInit_Add,Type,BS,EQ)(link);                                      \
349b23bfdefSJunchao Zhang     CPPJoin4(PackInit_Compare,Type,BS,EQ)(link);                                  \
350b23bfdefSJunchao Zhang   }
35140e23c03SJunchao Zhang 
35240e23c03SJunchao Zhang #if defined(PETSC_HAVE_COMPLEX)
353b23bfdefSJunchao Zhang #define DEF_ComplexType(Type,BS,EQ)                                               \
354b23bfdefSJunchao Zhang   DEF_Pack(Type,BS,EQ)                                                            \
355b23bfdefSJunchao Zhang   DEF_Add(Type,BS,EQ)                                                             \
356cd620004SJunchao Zhang   static void CPPJoin4(PackInit_ComplexType,Type,BS,EQ)(PetscSFLink link) {       \
357b23bfdefSJunchao Zhang     CPPJoin4(PackInit_Pack,Type,BS,EQ)(link);                                     \
358b23bfdefSJunchao Zhang     CPPJoin4(PackInit_Add,Type,BS,EQ)(link);                                      \
359b23bfdefSJunchao Zhang   }
36040e23c03SJunchao Zhang #endif
361b23bfdefSJunchao Zhang 
362b23bfdefSJunchao Zhang #define DEF_DumbType(Type,BS,EQ)                                                  \
363b23bfdefSJunchao Zhang   DEF_Pack(Type,BS,EQ)                                                            \
364cd620004SJunchao Zhang   static void CPPJoin4(PackInit_DumbType,Type,BS,EQ)(PetscSFLink link) {          \
365b23bfdefSJunchao Zhang     CPPJoin4(PackInit_Pack,Type,BS,EQ)(link);                                     \
366b23bfdefSJunchao Zhang   }
367b23bfdefSJunchao Zhang 
368b23bfdefSJunchao Zhang /* Maxloc, Minloc */
369cd620004SJunchao Zhang #define DEF_PairType(Type,BS,EQ)                                                  \
370cd620004SJunchao Zhang   DEF_Pack(Type,BS,EQ)                                                            \
371cd620004SJunchao Zhang   DEF_Xloc(Type,BS,EQ)                                                            \
372cd620004SJunchao Zhang   static void CPPJoin4(PackInit_PairType,Type,BS,EQ)(PetscSFLink link) {          \
373cd620004SJunchao Zhang     CPPJoin4(PackInit_Pack,Type,BS,EQ)(link);                                     \
374cd620004SJunchao Zhang     CPPJoin4(PackInit_Xloc,Type,BS,EQ)(link);                                     \
375b23bfdefSJunchao Zhang   }
376b23bfdefSJunchao Zhang 
377b23bfdefSJunchao Zhang DEF_IntegerType(PetscInt,1,1) /* unit = 1 MPIU_INT  */
378b23bfdefSJunchao Zhang DEF_IntegerType(PetscInt,2,1) /* unit = 2 MPIU_INTs */
379b23bfdefSJunchao Zhang DEF_IntegerType(PetscInt,4,1) /* unit = 4 MPIU_INTs */
380b23bfdefSJunchao Zhang DEF_IntegerType(PetscInt,8,1) /* unit = 8 MPIU_INTs */
381b23bfdefSJunchao Zhang DEF_IntegerType(PetscInt,1,0) /* unit = 1*n MPIU_INTs, n>1 */
382b23bfdefSJunchao Zhang DEF_IntegerType(PetscInt,2,0) /* unit = 2*n MPIU_INTs, n>1 */
383b23bfdefSJunchao Zhang DEF_IntegerType(PetscInt,4,0) /* unit = 4*n MPIU_INTs, n>1 */
384b23bfdefSJunchao Zhang DEF_IntegerType(PetscInt,8,0) /* unit = 8*n MPIU_INTs, n>1. Routines with bigger BS are tried first. */
385b23bfdefSJunchao Zhang 
386b23bfdefSJunchao Zhang #if defined(PETSC_USE_64BIT_INDICES) /* Do not need (though it is OK) to generate redundant functions if PetscInt is int */
387b23bfdefSJunchao Zhang DEF_IntegerType(int,1,1)
388b23bfdefSJunchao Zhang DEF_IntegerType(int,2,1)
389b23bfdefSJunchao Zhang DEF_IntegerType(int,4,1)
390b23bfdefSJunchao Zhang DEF_IntegerType(int,8,1)
391b23bfdefSJunchao Zhang DEF_IntegerType(int,1,0)
392b23bfdefSJunchao Zhang DEF_IntegerType(int,2,0)
393b23bfdefSJunchao Zhang DEF_IntegerType(int,4,0)
394b23bfdefSJunchao Zhang DEF_IntegerType(int,8,0)
395b23bfdefSJunchao Zhang #endif
396b23bfdefSJunchao Zhang 
397b23bfdefSJunchao Zhang /* The typedefs are used to get a typename without space that CPPJoin can handle */
398b23bfdefSJunchao Zhang typedef signed char SignedChar;
399b23bfdefSJunchao Zhang DEF_IntegerType(SignedChar,1,1)
400b23bfdefSJunchao Zhang DEF_IntegerType(SignedChar,2,1)
401b23bfdefSJunchao Zhang DEF_IntegerType(SignedChar,4,1)
402b23bfdefSJunchao Zhang DEF_IntegerType(SignedChar,8,1)
403b23bfdefSJunchao Zhang DEF_IntegerType(SignedChar,1,0)
404b23bfdefSJunchao Zhang DEF_IntegerType(SignedChar,2,0)
405b23bfdefSJunchao Zhang DEF_IntegerType(SignedChar,4,0)
406b23bfdefSJunchao Zhang DEF_IntegerType(SignedChar,8,0)
407b23bfdefSJunchao Zhang 
408b23bfdefSJunchao Zhang typedef unsigned char UnsignedChar;
409b23bfdefSJunchao Zhang DEF_IntegerType(UnsignedChar,1,1)
410b23bfdefSJunchao Zhang DEF_IntegerType(UnsignedChar,2,1)
411b23bfdefSJunchao Zhang DEF_IntegerType(UnsignedChar,4,1)
412b23bfdefSJunchao Zhang DEF_IntegerType(UnsignedChar,8,1)
413b23bfdefSJunchao Zhang DEF_IntegerType(UnsignedChar,1,0)
414b23bfdefSJunchao Zhang DEF_IntegerType(UnsignedChar,2,0)
415b23bfdefSJunchao Zhang DEF_IntegerType(UnsignedChar,4,0)
416b23bfdefSJunchao Zhang DEF_IntegerType(UnsignedChar,8,0)
417b23bfdefSJunchao Zhang 
418b23bfdefSJunchao Zhang DEF_RealType(PetscReal,1,1)
419b23bfdefSJunchao Zhang DEF_RealType(PetscReal,2,1)
420b23bfdefSJunchao Zhang DEF_RealType(PetscReal,4,1)
421b23bfdefSJunchao Zhang DEF_RealType(PetscReal,8,1)
422b23bfdefSJunchao Zhang DEF_RealType(PetscReal,1,0)
423b23bfdefSJunchao Zhang DEF_RealType(PetscReal,2,0)
424b23bfdefSJunchao Zhang DEF_RealType(PetscReal,4,0)
425b23bfdefSJunchao Zhang DEF_RealType(PetscReal,8,0)
426b23bfdefSJunchao Zhang 
427b23bfdefSJunchao Zhang #if defined(PETSC_HAVE_COMPLEX)
428b23bfdefSJunchao Zhang DEF_ComplexType(PetscComplex,1,1)
429b23bfdefSJunchao Zhang DEF_ComplexType(PetscComplex,2,1)
430b23bfdefSJunchao Zhang DEF_ComplexType(PetscComplex,4,1)
431b23bfdefSJunchao Zhang DEF_ComplexType(PetscComplex,8,1)
432b23bfdefSJunchao Zhang DEF_ComplexType(PetscComplex,1,0)
433b23bfdefSJunchao Zhang DEF_ComplexType(PetscComplex,2,0)
434b23bfdefSJunchao Zhang DEF_ComplexType(PetscComplex,4,0)
435b23bfdefSJunchao Zhang DEF_ComplexType(PetscComplex,8,0)
436b23bfdefSJunchao Zhang #endif
437b23bfdefSJunchao Zhang 
438cd620004SJunchao Zhang #define PairType(Type1,Type2) Type1##_##Type2
439cd620004SJunchao Zhang typedef struct {int u; int i;}           PairType(int,int);
440cd620004SJunchao Zhang typedef struct {PetscInt u; PetscInt i;} PairType(PetscInt,PetscInt);
441cd620004SJunchao Zhang DEF_PairType(PairType(int,int),1,1)
442cd620004SJunchao Zhang DEF_PairType(PairType(PetscInt,PetscInt),1,1)
443b23bfdefSJunchao Zhang 
444b23bfdefSJunchao Zhang /* If we don't know the basic type, we treat it as a stream of chars or ints */
445b23bfdefSJunchao Zhang DEF_DumbType(char,1,1)
446b23bfdefSJunchao Zhang DEF_DumbType(char,2,1)
447b23bfdefSJunchao Zhang DEF_DumbType(char,4,1)
448b23bfdefSJunchao Zhang DEF_DumbType(char,1,0)
449b23bfdefSJunchao Zhang DEF_DumbType(char,2,0)
450b23bfdefSJunchao Zhang DEF_DumbType(char,4,0)
451b23bfdefSJunchao Zhang 
452eb02082bSJunchao Zhang typedef int DumbInt; /* To have a different name than 'int' used above. The name is used to make routine names. */
453b23bfdefSJunchao Zhang DEF_DumbType(DumbInt,1,1)
454b23bfdefSJunchao Zhang DEF_DumbType(DumbInt,2,1)
455b23bfdefSJunchao Zhang DEF_DumbType(DumbInt,4,1)
456b23bfdefSJunchao Zhang DEF_DumbType(DumbInt,8,1)
457b23bfdefSJunchao Zhang DEF_DumbType(DumbInt,1,0)
458b23bfdefSJunchao Zhang DEF_DumbType(DumbInt,2,0)
459b23bfdefSJunchao Zhang DEF_DumbType(DumbInt,4,0)
460b23bfdefSJunchao Zhang DEF_DumbType(DumbInt,8,0)
46140e23c03SJunchao Zhang 
46240e23c03SJunchao Zhang #if !defined(PETSC_HAVE_MPI_TYPE_DUP)
46340e23c03SJunchao Zhang PETSC_STATIC_INLINE int MPI_Type_dup(MPI_Datatype datatype,MPI_Datatype *newtype)
46440e23c03SJunchao Zhang {
46540e23c03SJunchao Zhang   int ierr;
46640e23c03SJunchao Zhang   ierr = MPI_Type_contiguous(1,datatype,newtype); if (ierr) return ierr;
46740e23c03SJunchao Zhang   ierr = MPI_Type_commit(newtype); if (ierr) return ierr;
46840e23c03SJunchao Zhang   return MPI_SUCCESS;
46940e23c03SJunchao Zhang }
47040e23c03SJunchao Zhang #endif
47140e23c03SJunchao Zhang 
47271438e86SJunchao Zhang PetscErrorCode PetscSFLinkDestroy(PetscSF sf,PetscSFLink link)
47340e23c03SJunchao Zhang {
47440e23c03SJunchao Zhang   PetscErrorCode    ierr;
475cd620004SJunchao Zhang   PetscSF_Basic     *bas = (PetscSF_Basic*)sf->data;
47671438e86SJunchao Zhang   PetscInt          i,nreqs = (bas->nrootreqs+sf->nleafreqs)*8;
47771438e86SJunchao Zhang 
47871438e86SJunchao Zhang   PetscFunctionBegin;
47971438e86SJunchao Zhang   /* Destroy device-specific fields */
48071438e86SJunchao Zhang   if (link->deviceinited) {ierr = (*link->Destroy)(sf,link);CHKERRQ(ierr);}
48171438e86SJunchao Zhang 
48271438e86SJunchao Zhang   /* Destroy host related fields */
48371438e86SJunchao Zhang   if (!link->isbuiltin) {ierr = MPI_Type_free(&link->unit);CHKERRMPI(ierr);}
48471438e86SJunchao Zhang   if (!link->use_nvshmem) {
48571438e86SJunchao Zhang     for (i=0; i<nreqs; i++) { /* Persistent reqs must be freed. */
48671438e86SJunchao Zhang       if (link->reqs[i] != MPI_REQUEST_NULL) {ierr = MPI_Request_free(&link->reqs[i]);CHKERRMPI(ierr);}
48771438e86SJunchao Zhang     }
48871438e86SJunchao Zhang     ierr = PetscFree(link->reqs);CHKERRQ(ierr);
48971438e86SJunchao Zhang     for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) {
49071438e86SJunchao Zhang       ierr = PetscFree(link->rootbuf_alloc[i][PETSC_MEMTYPE_HOST]);CHKERRQ(ierr);
49171438e86SJunchao Zhang       ierr = PetscFree(link->leafbuf_alloc[i][PETSC_MEMTYPE_HOST]);CHKERRQ(ierr);
49271438e86SJunchao Zhang     }
49371438e86SJunchao Zhang   }
49471438e86SJunchao Zhang   ierr = PetscFree(link);CHKERRQ(ierr);
49571438e86SJunchao Zhang   PetscFunctionReturn(0);
49671438e86SJunchao Zhang }
49771438e86SJunchao Zhang 
49871438e86SJunchao Zhang PetscErrorCode PetscSFLinkCreate(PetscSF sf,MPI_Datatype unit,PetscMemType rootmtype,const void *rootdata,PetscMemType leafmtype,const void *leafdata,MPI_Op op,PetscSFOperation sfop,PetscSFLink *mylink)
49971438e86SJunchao Zhang {
50071438e86SJunchao Zhang   PetscErrorCode    ierr;
501cd620004SJunchao Zhang 
502cd620004SJunchao Zhang   PetscFunctionBegin;
503cd620004SJunchao Zhang   ierr = PetscSFSetErrorOnUnsupportedOverlap(sf,unit,rootdata,leafdata);CHKERRQ(ierr);
50471438e86SJunchao Zhang  #if defined(PETSC_HAVE_NVSHMEM)
50571438e86SJunchao Zhang   {
50671438e86SJunchao Zhang     PetscBool use_nvshmem;
50771438e86SJunchao Zhang     ierr = PetscSFLinkNvshmemCheck(sf,rootmtype,rootdata,leafmtype,leafdata,&use_nvshmem);CHKERRQ(ierr);
50871438e86SJunchao Zhang     if (use_nvshmem) {
50971438e86SJunchao Zhang       ierr = PetscSFLinkCreate_NVSHMEM(sf,unit,rootmtype,rootdata,leafmtype,leafdata,op,sfop,mylink);CHKERRQ(ierr);
51071438e86SJunchao Zhang       PetscFunctionReturn(0);
511cd620004SJunchao Zhang     }
512cd620004SJunchao Zhang   }
5137fd2d3dbSJunchao Zhang  #endif
51471438e86SJunchao Zhang   ierr = PetscSFLinkCreate_MPI(sf,unit,rootmtype,rootdata,leafmtype,leafdata,op,sfop,mylink);CHKERRQ(ierr);
515cd620004SJunchao Zhang   PetscFunctionReturn(0);
516cd620004SJunchao Zhang }
517cd620004SJunchao Zhang 
518cd620004SJunchao Zhang /* Return root/leaf buffers and MPI requests attached to the link for MPI communication in the given direction.
519cd620004SJunchao Zhang    If the sf uses persistent requests and the requests have not been initialized, then initialize them.
520cd620004SJunchao Zhang */
521cd620004SJunchao Zhang PetscErrorCode PetscSFLinkGetMPIBuffersAndRequests(PetscSF sf,PetscSFLink link,PetscSFDirection direction,void **rootbuf, void **leafbuf,MPI_Request **rootreqs,MPI_Request **leafreqs)
522cd620004SJunchao Zhang {
523cd620004SJunchao Zhang   PetscErrorCode       ierr;
524cd620004SJunchao Zhang   PetscSF_Basic        *bas = (PetscSF_Basic*)sf->data;
525cd620004SJunchao Zhang   PetscInt             i,j,nrootranks,ndrootranks,nleafranks,ndleafranks;
526cd620004SJunchao Zhang   const PetscInt       *rootoffset,*leafoffset;
527cd620004SJunchao Zhang   PetscMPIInt          n;
528cd620004SJunchao Zhang   MPI_Aint             disp;
529cd620004SJunchao Zhang   MPI_Comm             comm = PetscObjectComm((PetscObject)sf);
530cd620004SJunchao Zhang   MPI_Datatype         unit = link->unit;
531cd620004SJunchao Zhang   const PetscMemType   rootmtype_mpi = link->rootmtype_mpi,leafmtype_mpi = link->leafmtype_mpi; /* Used to select buffers passed to MPI */
532cd620004SJunchao Zhang   const PetscInt       rootdirect_mpi = link->rootdirect_mpi,leafdirect_mpi = link->leafdirect_mpi;
533cd620004SJunchao Zhang 
534cd620004SJunchao Zhang   PetscFunctionBegin;
535cd620004SJunchao Zhang   /* Init persistent MPI requests if not yet. Currently only SFBasic uses persistent MPI */
536cd620004SJunchao Zhang   if (sf->persistent) {
537cd620004SJunchao Zhang     if (rootreqs && bas->rootbuflen[PETSCSF_REMOTE] && !link->rootreqsinited[direction][rootmtype_mpi][rootdirect_mpi]) {
538cd620004SJunchao Zhang       ierr = PetscSFGetRootInfo_Basic(sf,&nrootranks,&ndrootranks,NULL,&rootoffset,NULL);CHKERRQ(ierr);
539cd620004SJunchao Zhang       if (direction == PETSCSF_LEAF2ROOT) {
540cd620004SJunchao Zhang         for (i=ndrootranks,j=0; i<nrootranks; i++,j++) {
541cd620004SJunchao Zhang           disp = (rootoffset[i] - rootoffset[ndrootranks])*link->unitbytes;
542cd620004SJunchao Zhang           ierr = PetscMPIIntCast(rootoffset[i+1]-rootoffset[i],&n);CHKERRQ(ierr);
543ffc4695bSBarry Smith           ierr = MPI_Recv_init(link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi]+disp,n,unit,bas->iranks[i],link->tag,comm,link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi]+j);CHKERRMPI(ierr);
544cd620004SJunchao Zhang         }
545cd620004SJunchao Zhang       } else { /* PETSCSF_ROOT2LEAF */
546cd620004SJunchao Zhang         for (i=ndrootranks,j=0; i<nrootranks; i++,j++) {
547cd620004SJunchao Zhang           disp = (rootoffset[i] - rootoffset[ndrootranks])*link->unitbytes;
548cd620004SJunchao Zhang           ierr = PetscMPIIntCast(rootoffset[i+1]-rootoffset[i],&n);CHKERRQ(ierr);
549ffc4695bSBarry Smith           ierr = MPI_Send_init(link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi]+disp,n,unit,bas->iranks[i],link->tag,comm,link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi]+j);CHKERRMPI(ierr);
550cd620004SJunchao Zhang         }
551cd620004SJunchao Zhang       }
552cd620004SJunchao Zhang       link->rootreqsinited[direction][rootmtype_mpi][rootdirect_mpi] = PETSC_TRUE;
553cd620004SJunchao Zhang     }
554cd620004SJunchao Zhang 
555cd620004SJunchao Zhang     if (leafreqs && sf->leafbuflen[PETSCSF_REMOTE] && !link->leafreqsinited[direction][leafmtype_mpi][leafdirect_mpi]) {
556cd620004SJunchao Zhang       ierr = PetscSFGetLeafInfo_Basic(sf,&nleafranks,&ndleafranks,NULL,&leafoffset,NULL,NULL);CHKERRQ(ierr);
557cd620004SJunchao Zhang       if (direction == PETSCSF_LEAF2ROOT) {
558cd620004SJunchao Zhang         for (i=ndleafranks,j=0; i<nleafranks; i++,j++) {
559cd620004SJunchao Zhang           disp = (leafoffset[i] - leafoffset[ndleafranks])*link->unitbytes;
560cd620004SJunchao Zhang           ierr = PetscMPIIntCast(leafoffset[i+1]-leafoffset[i],&n);CHKERRQ(ierr);
561ffc4695bSBarry Smith           ierr = MPI_Send_init(link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi]+disp,n,unit,sf->ranks[i],link->tag,comm,link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi]+j);CHKERRMPI(ierr);
562cd620004SJunchao Zhang         }
563cd620004SJunchao Zhang       } else { /* PETSCSF_ROOT2LEAF */
564cd620004SJunchao Zhang         for (i=ndleafranks,j=0; i<nleafranks; i++,j++) {
565cd620004SJunchao Zhang           disp = (leafoffset[i] - leafoffset[ndleafranks])*link->unitbytes;
566cd620004SJunchao Zhang           ierr = PetscMPIIntCast(leafoffset[i+1]-leafoffset[i],&n);CHKERRQ(ierr);
567ffc4695bSBarry Smith           ierr = MPI_Recv_init(link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi]+disp,n,unit,sf->ranks[i],link->tag,comm,link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi]+j);CHKERRMPI(ierr);
568cd620004SJunchao Zhang         }
569cd620004SJunchao Zhang       }
570cd620004SJunchao Zhang       link->leafreqsinited[direction][leafmtype_mpi][leafdirect_mpi] = PETSC_TRUE;
571cd620004SJunchao Zhang     }
572cd620004SJunchao Zhang   }
573cd620004SJunchao Zhang   if (rootbuf)  *rootbuf  = link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi];
574cd620004SJunchao Zhang   if (leafbuf)  *leafbuf  = link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi];
575cd620004SJunchao Zhang   if (rootreqs) *rootreqs = link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi];
576cd620004SJunchao Zhang   if (leafreqs) *leafreqs = link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi];
577cd620004SJunchao Zhang   PetscFunctionReturn(0);
578cd620004SJunchao Zhang }
579cd620004SJunchao Zhang 
580cd620004SJunchao Zhang PetscErrorCode PetscSFLinkGetInUse(PetscSF sf,MPI_Datatype unit,const void *rootdata,const void *leafdata,PetscCopyMode cmode,PetscSFLink *mylink)
581cd620004SJunchao Zhang {
582cd620004SJunchao Zhang   PetscErrorCode    ierr;
583cd620004SJunchao Zhang   PetscSFLink       link,*p;
58440e23c03SJunchao Zhang   PetscSF_Basic     *bas=(PetscSF_Basic*)sf->data;
58540e23c03SJunchao Zhang 
58640e23c03SJunchao Zhang   PetscFunctionBegin;
58740e23c03SJunchao Zhang   /* Look for types in cache */
58840e23c03SJunchao Zhang   for (p=&bas->inuse; (link=*p); p=&link->next) {
58940e23c03SJunchao Zhang     PetscBool match;
59040e23c03SJunchao Zhang     ierr = MPIPetsc_Type_compare(unit,link->unit,&match);CHKERRQ(ierr);
591637e6665SJunchao Zhang     if (match && (rootdata == link->rootdata) && (leafdata == link->leafdata)) {
59240e23c03SJunchao Zhang       switch (cmode) {
59340e23c03SJunchao Zhang       case PETSC_OWN_POINTER: *p = link->next; break; /* Remove from inuse list */
59440e23c03SJunchao Zhang       case PETSC_USE_POINTER: break;
59540e23c03SJunchao Zhang       default: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"invalid cmode");
59640e23c03SJunchao Zhang       }
59740e23c03SJunchao Zhang       *mylink = link;
59840e23c03SJunchao Zhang       PetscFunctionReturn(0);
59940e23c03SJunchao Zhang     }
60040e23c03SJunchao Zhang   }
60140e23c03SJunchao Zhang   SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Could not find pack");
60240e23c03SJunchao Zhang }
60340e23c03SJunchao Zhang 
60471438e86SJunchao Zhang PetscErrorCode PetscSFLinkReclaim(PetscSF sf,PetscSFLink *mylink)
60540e23c03SJunchao Zhang {
60640e23c03SJunchao Zhang   PetscSF_Basic     *bas = (PetscSF_Basic*)sf->data;
60771438e86SJunchao Zhang   PetscSFLink       link = *mylink;
60840e23c03SJunchao Zhang 
60940e23c03SJunchao Zhang   PetscFunctionBegin;
61071438e86SJunchao Zhang   link->rootdata = NULL;
61171438e86SJunchao Zhang   link->leafdata = NULL;
61271438e86SJunchao Zhang   link->next     = bas->avail;
61371438e86SJunchao Zhang   bas->avail     = link;
61471438e86SJunchao Zhang   *mylink        = NULL;
615eb02082bSJunchao Zhang   PetscFunctionReturn(0);
616eb02082bSJunchao Zhang }
617eb02082bSJunchao Zhang 
6189d1c8addSJunchao Zhang /* Error out on unsupported overlapped communications */
619cd620004SJunchao Zhang PetscErrorCode PetscSFSetErrorOnUnsupportedOverlap(PetscSF sf,MPI_Datatype unit,const void *rootdata,const void *leafdata)
6209d1c8addSJunchao Zhang {
6219d1c8addSJunchao Zhang   PetscErrorCode    ierr;
622cd620004SJunchao Zhang   PetscSFLink       link,*p;
6239d1c8addSJunchao Zhang   PetscSF_Basic     *bas = (PetscSF_Basic*)sf->data;
6249d1c8addSJunchao Zhang   PetscBool         match;
6259d1c8addSJunchao Zhang 
6269d1c8addSJunchao Zhang   PetscFunctionBegin;
627b458e8f1SJose E. Roman   if (PetscDefined(USE_DEBUG)) {
62818fb5014SJunchao Zhang     /* Look up links in use and error out if there is a match. When both rootdata and leafdata are NULL, ignore
62918fb5014SJunchao Zhang        the potential overlapping since this process does not participate in communication. Overlapping is harmless.
63018fb5014SJunchao Zhang     */
631637e6665SJunchao Zhang     if (rootdata || leafdata) {
6329d1c8addSJunchao Zhang       for (p=&bas->inuse; (link=*p); p=&link->next) {
6339d1c8addSJunchao Zhang         ierr = MPIPetsc_Type_compare(unit,link->unit,&match);CHKERRQ(ierr);
634cd620004SJunchao Zhang         if (match && (rootdata == link->rootdata) && (leafdata == link->leafdata)) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_SUP,"Overlapped PetscSF with the same rootdata(%p), leafdata(%p) and data type. Undo the overlapping to avoid the error.",rootdata,leafdata);
6359d1c8addSJunchao Zhang       }
63618fb5014SJunchao Zhang     }
637b458e8f1SJose E. Roman   }
6389d1c8addSJunchao Zhang   PetscFunctionReturn(0);
6399d1c8addSJunchao Zhang }
6409d1c8addSJunchao Zhang 
64120c24465SJunchao Zhang static PetscErrorCode PetscSFLinkMemcpy_Host(PetscSFLink link,PetscMemType dstmtype,void* dst,PetscMemType srcmtype,const void*src,size_t n)
64220c24465SJunchao Zhang {
64320c24465SJunchao Zhang   PetscFunctionBegin;
64420c24465SJunchao Zhang   if (n) {PetscErrorCode ierr = PetscMemcpy(dst,src,n);CHKERRQ(ierr);}
64520c24465SJunchao Zhang   PetscFunctionReturn(0);
64620c24465SJunchao Zhang }
64720c24465SJunchao Zhang 
648cd620004SJunchao Zhang PetscErrorCode PetscSFLinkSetUp_Host(PetscSF sf,PetscSFLink link,MPI_Datatype unit)
64940e23c03SJunchao Zhang {
65040e23c03SJunchao Zhang   PetscErrorCode ierr;
651b23bfdefSJunchao Zhang   PetscInt       nSignedChar=0,nUnsignedChar=0,nInt=0,nPetscInt=0,nPetscReal=0;
652b23bfdefSJunchao Zhang   PetscBool      is2Int,is2PetscInt;
65340e23c03SJunchao Zhang   PetscMPIInt    ni,na,nd,combiner;
65440e23c03SJunchao Zhang #if defined(PETSC_HAVE_COMPLEX)
655b23bfdefSJunchao Zhang   PetscInt       nPetscComplex=0;
65640e23c03SJunchao Zhang #endif
65740e23c03SJunchao Zhang 
65840e23c03SJunchao Zhang   PetscFunctionBegin;
659b23bfdefSJunchao Zhang   ierr = MPIPetsc_Type_compare_contig(unit,MPI_SIGNED_CHAR,  &nSignedChar);CHKERRQ(ierr);
660b23bfdefSJunchao Zhang   ierr = MPIPetsc_Type_compare_contig(unit,MPI_UNSIGNED_CHAR,&nUnsignedChar);CHKERRQ(ierr);
661b23bfdefSJunchao Zhang   /* MPI_CHAR is treated below as a dumb type that does not support reduction according to MPI standard */
662b23bfdefSJunchao Zhang   ierr = MPIPetsc_Type_compare_contig(unit,MPI_INT,  &nInt);CHKERRQ(ierr);
663b23bfdefSJunchao Zhang   ierr = MPIPetsc_Type_compare_contig(unit,MPIU_INT, &nPetscInt);CHKERRQ(ierr);
664b23bfdefSJunchao Zhang   ierr = MPIPetsc_Type_compare_contig(unit,MPIU_REAL,&nPetscReal);CHKERRQ(ierr);
66540e23c03SJunchao Zhang #if defined(PETSC_HAVE_COMPLEX)
666b23bfdefSJunchao Zhang   ierr = MPIPetsc_Type_compare_contig(unit,MPIU_COMPLEX,&nPetscComplex);CHKERRQ(ierr);
66740e23c03SJunchao Zhang #endif
66840e23c03SJunchao Zhang   ierr = MPIPetsc_Type_compare(unit,MPI_2INT,&is2Int);CHKERRQ(ierr);
66940e23c03SJunchao Zhang   ierr = MPIPetsc_Type_compare(unit,MPIU_2INT,&is2PetscInt);CHKERRQ(ierr);
670b23bfdefSJunchao Zhang   /* TODO: shaell we also handle Fortran MPI_2REAL? */
671ffc4695bSBarry Smith   ierr = MPI_Type_get_envelope(unit,&ni,&na,&nd,&combiner);CHKERRMPI(ierr);
6725ad15460SJunchao Zhang   link->isbuiltin = (combiner == MPI_COMBINER_NAMED) ? PETSC_TRUE : PETSC_FALSE; /* unit is MPI builtin */
673b23bfdefSJunchao Zhang   link->bs = 1; /* default */
67440e23c03SJunchao Zhang 
675eb02082bSJunchao Zhang   if (is2Int) {
676cd620004SJunchao Zhang     PackInit_PairType_int_int_1_1(link);
677eb02082bSJunchao Zhang     link->bs        = 1;
678eb02082bSJunchao Zhang     link->unitbytes = 2*sizeof(int);
6795ad15460SJunchao Zhang     link->isbuiltin = PETSC_TRUE; /* unit is PETSc builtin */
680eb02082bSJunchao Zhang     link->basicunit = MPI_2INT;
6815ad15460SJunchao Zhang     link->unit      = MPI_2INT;
682eb02082bSJunchao Zhang   } else if (is2PetscInt) { /* TODO: when is2PetscInt and nPetscInt=2, we don't know which path to take. The two paths support different ops. */
683cd620004SJunchao Zhang     PackInit_PairType_PetscInt_PetscInt_1_1(link);
684eb02082bSJunchao Zhang     link->bs        = 1;
685eb02082bSJunchao Zhang     link->unitbytes = 2*sizeof(PetscInt);
686eb02082bSJunchao Zhang     link->basicunit = MPIU_2INT;
6875ad15460SJunchao Zhang     link->isbuiltin = PETSC_TRUE; /* unit is PETSc builtin */
6885ad15460SJunchao Zhang     link->unit      = MPIU_2INT;
689eb02082bSJunchao Zhang   } else if (nPetscReal) {
690b23bfdefSJunchao Zhang     if      (nPetscReal == 8) PackInit_RealType_PetscReal_8_1(link); else if (nPetscReal%8 == 0) PackInit_RealType_PetscReal_8_0(link);
691b23bfdefSJunchao Zhang     else if (nPetscReal == 4) PackInit_RealType_PetscReal_4_1(link); else if (nPetscReal%4 == 0) PackInit_RealType_PetscReal_4_0(link);
692b23bfdefSJunchao Zhang     else if (nPetscReal == 2) PackInit_RealType_PetscReal_2_1(link); else if (nPetscReal%2 == 0) PackInit_RealType_PetscReal_2_0(link);
693b23bfdefSJunchao Zhang     else if (nPetscReal == 1) PackInit_RealType_PetscReal_1_1(link); else if (nPetscReal%1 == 0) PackInit_RealType_PetscReal_1_0(link);
694b23bfdefSJunchao Zhang     link->bs        = nPetscReal;
695eb02082bSJunchao Zhang     link->unitbytes = nPetscReal*sizeof(PetscReal);
69640e23c03SJunchao Zhang     link->basicunit = MPIU_REAL;
6975ad15460SJunchao Zhang     if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPIU_REAL;}
698b23bfdefSJunchao Zhang   } else if (nPetscInt) {
699b23bfdefSJunchao Zhang     if      (nPetscInt == 8) PackInit_IntegerType_PetscInt_8_1(link); else if (nPetscInt%8 == 0) PackInit_IntegerType_PetscInt_8_0(link);
700b23bfdefSJunchao Zhang     else if (nPetscInt == 4) PackInit_IntegerType_PetscInt_4_1(link); else if (nPetscInt%4 == 0) PackInit_IntegerType_PetscInt_4_0(link);
701b23bfdefSJunchao Zhang     else if (nPetscInt == 2) PackInit_IntegerType_PetscInt_2_1(link); else if (nPetscInt%2 == 0) PackInit_IntegerType_PetscInt_2_0(link);
702b23bfdefSJunchao Zhang     else if (nPetscInt == 1) PackInit_IntegerType_PetscInt_1_1(link); else if (nPetscInt%1 == 0) PackInit_IntegerType_PetscInt_1_0(link);
703b23bfdefSJunchao Zhang     link->bs        = nPetscInt;
704eb02082bSJunchao Zhang     link->unitbytes = nPetscInt*sizeof(PetscInt);
705b23bfdefSJunchao Zhang     link->basicunit = MPIU_INT;
7065ad15460SJunchao Zhang     if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPIU_INT;}
707b23bfdefSJunchao Zhang #if defined(PETSC_USE_64BIT_INDICES)
708b23bfdefSJunchao Zhang   } else if (nInt) {
709b23bfdefSJunchao Zhang     if      (nInt == 8) PackInit_IntegerType_int_8_1(link); else if (nInt%8 == 0) PackInit_IntegerType_int_8_0(link);
710b23bfdefSJunchao Zhang     else if (nInt == 4) PackInit_IntegerType_int_4_1(link); else if (nInt%4 == 0) PackInit_IntegerType_int_4_0(link);
711b23bfdefSJunchao Zhang     else if (nInt == 2) PackInit_IntegerType_int_2_1(link); else if (nInt%2 == 0) PackInit_IntegerType_int_2_0(link);
712b23bfdefSJunchao Zhang     else if (nInt == 1) PackInit_IntegerType_int_1_1(link); else if (nInt%1 == 0) PackInit_IntegerType_int_1_0(link);
713b23bfdefSJunchao Zhang     link->bs        = nInt;
714eb02082bSJunchao Zhang     link->unitbytes = nInt*sizeof(int);
715b23bfdefSJunchao Zhang     link->basicunit = MPI_INT;
7165ad15460SJunchao Zhang     if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPI_INT;}
717b23bfdefSJunchao Zhang #endif
718b23bfdefSJunchao Zhang   } else if (nSignedChar) {
719b23bfdefSJunchao Zhang     if      (nSignedChar == 8) PackInit_IntegerType_SignedChar_8_1(link); else if (nSignedChar%8 == 0) PackInit_IntegerType_SignedChar_8_0(link);
720b23bfdefSJunchao Zhang     else if (nSignedChar == 4) PackInit_IntegerType_SignedChar_4_1(link); else if (nSignedChar%4 == 0) PackInit_IntegerType_SignedChar_4_0(link);
721b23bfdefSJunchao Zhang     else if (nSignedChar == 2) PackInit_IntegerType_SignedChar_2_1(link); else if (nSignedChar%2 == 0) PackInit_IntegerType_SignedChar_2_0(link);
722b23bfdefSJunchao Zhang     else if (nSignedChar == 1) PackInit_IntegerType_SignedChar_1_1(link); else if (nSignedChar%1 == 0) PackInit_IntegerType_SignedChar_1_0(link);
723b23bfdefSJunchao Zhang     link->bs        = nSignedChar;
724eb02082bSJunchao Zhang     link->unitbytes = nSignedChar*sizeof(SignedChar);
725b23bfdefSJunchao Zhang     link->basicunit = MPI_SIGNED_CHAR;
7265ad15460SJunchao Zhang     if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPI_SIGNED_CHAR;}
727b23bfdefSJunchao Zhang   }  else if (nUnsignedChar) {
728b23bfdefSJunchao Zhang     if      (nUnsignedChar == 8) PackInit_IntegerType_UnsignedChar_8_1(link); else if (nUnsignedChar%8 == 0) PackInit_IntegerType_UnsignedChar_8_0(link);
729b23bfdefSJunchao Zhang     else if (nUnsignedChar == 4) PackInit_IntegerType_UnsignedChar_4_1(link); else if (nUnsignedChar%4 == 0) PackInit_IntegerType_UnsignedChar_4_0(link);
730b23bfdefSJunchao Zhang     else if (nUnsignedChar == 2) PackInit_IntegerType_UnsignedChar_2_1(link); else if (nUnsignedChar%2 == 0) PackInit_IntegerType_UnsignedChar_2_0(link);
731b23bfdefSJunchao Zhang     else if (nUnsignedChar == 1) PackInit_IntegerType_UnsignedChar_1_1(link); else if (nUnsignedChar%1 == 0) PackInit_IntegerType_UnsignedChar_1_0(link);
732b23bfdefSJunchao Zhang     link->bs        = nUnsignedChar;
733eb02082bSJunchao Zhang     link->unitbytes = nUnsignedChar*sizeof(UnsignedChar);
734b23bfdefSJunchao Zhang     link->basicunit = MPI_UNSIGNED_CHAR;
7355ad15460SJunchao Zhang     if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPI_UNSIGNED_CHAR;}
73640e23c03SJunchao Zhang #if defined(PETSC_HAVE_COMPLEX)
737b23bfdefSJunchao Zhang   } else if (nPetscComplex) {
738b23bfdefSJunchao Zhang     if      (nPetscComplex == 8) PackInit_ComplexType_PetscComplex_8_1(link); else if (nPetscComplex%8 == 0) PackInit_ComplexType_PetscComplex_8_0(link);
739b23bfdefSJunchao Zhang     else if (nPetscComplex == 4) PackInit_ComplexType_PetscComplex_4_1(link); else if (nPetscComplex%4 == 0) PackInit_ComplexType_PetscComplex_4_0(link);
740b23bfdefSJunchao Zhang     else if (nPetscComplex == 2) PackInit_ComplexType_PetscComplex_2_1(link); else if (nPetscComplex%2 == 0) PackInit_ComplexType_PetscComplex_2_0(link);
741b23bfdefSJunchao Zhang     else if (nPetscComplex == 1) PackInit_ComplexType_PetscComplex_1_1(link); else if (nPetscComplex%1 == 0) PackInit_ComplexType_PetscComplex_1_0(link);
742b23bfdefSJunchao Zhang     link->bs        = nPetscComplex;
743eb02082bSJunchao Zhang     link->unitbytes = nPetscComplex*sizeof(PetscComplex);
74440e23c03SJunchao Zhang     link->basicunit = MPIU_COMPLEX;
7455ad15460SJunchao Zhang     if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPIU_COMPLEX;}
74640e23c03SJunchao Zhang #endif
74740e23c03SJunchao Zhang   } else {
748b23bfdefSJunchao Zhang     MPI_Aint lb,nbyte;
749ffc4695bSBarry Smith     ierr = MPI_Type_get_extent(unit,&lb,&nbyte);CHKERRMPI(ierr);
75040e23c03SJunchao Zhang     if (lb != 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Datatype with nonzero lower bound %ld\n",(long)lb);
751eb02082bSJunchao Zhang     if (nbyte % sizeof(int)) { /* If the type size is not multiple of int */
752eb02082bSJunchao Zhang       if      (nbyte == 4) PackInit_DumbType_char_4_1(link); else if (nbyte%4 == 0) PackInit_DumbType_char_4_0(link);
753eb02082bSJunchao Zhang       else if (nbyte == 2) PackInit_DumbType_char_2_1(link); else if (nbyte%2 == 0) PackInit_DumbType_char_2_0(link);
754eb02082bSJunchao Zhang       else if (nbyte == 1) PackInit_DumbType_char_1_1(link); else if (nbyte%1 == 0) PackInit_DumbType_char_1_0(link);
755eb02082bSJunchao Zhang       link->bs        = nbyte;
756b23bfdefSJunchao Zhang       link->unitbytes = nbyte;
757b23bfdefSJunchao Zhang       link->basicunit = MPI_BYTE;
75840e23c03SJunchao Zhang     } else {
759eb02082bSJunchao Zhang       nInt = nbyte / sizeof(int);
760eb02082bSJunchao Zhang       if      (nInt == 8) PackInit_DumbType_DumbInt_8_1(link); else if (nInt%8 == 0) PackInit_DumbType_DumbInt_8_0(link);
761eb02082bSJunchao Zhang       else if (nInt == 4) PackInit_DumbType_DumbInt_4_1(link); else if (nInt%4 == 0) PackInit_DumbType_DumbInt_4_0(link);
762eb02082bSJunchao Zhang       else if (nInt == 2) PackInit_DumbType_DumbInt_2_1(link); else if (nInt%2 == 0) PackInit_DumbType_DumbInt_2_0(link);
763eb02082bSJunchao Zhang       else if (nInt == 1) PackInit_DumbType_DumbInt_1_1(link); else if (nInt%1 == 0) PackInit_DumbType_DumbInt_1_0(link);
764eb02082bSJunchao Zhang       link->bs        = nInt;
765b23bfdefSJunchao Zhang       link->unitbytes = nbyte;
76640e23c03SJunchao Zhang       link->basicunit = MPI_INT;
76740e23c03SJunchao Zhang     }
7685ad15460SJunchao Zhang     if (link->isbuiltin) link->unit = unit;
76940e23c03SJunchao Zhang   }
770b23bfdefSJunchao Zhang 
771ffc4695bSBarry Smith   if (!link->isbuiltin) {ierr = MPI_Type_dup(unit,&link->unit);CHKERRMPI(ierr);}
77220c24465SJunchao Zhang 
77320c24465SJunchao Zhang   link->Memcpy = PetscSFLinkMemcpy_Host;
77440e23c03SJunchao Zhang   PetscFunctionReturn(0);
77540e23c03SJunchao Zhang }
77640e23c03SJunchao Zhang 
777fcc7397dSJunchao Zhang PetscErrorCode PetscSFLinkGetUnpackAndOp(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*))
77840e23c03SJunchao Zhang {
77940e23c03SJunchao Zhang   PetscFunctionBegin;
78040e23c03SJunchao Zhang   *UnpackAndOp = NULL;
78171438e86SJunchao Zhang   if (PetscMemTypeHost(mtype)) {
78283df288dSJunchao Zhang     if      (op == MPI_REPLACE)               *UnpackAndOp = link->h_UnpackAndInsert;
783eb02082bSJunchao Zhang     else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->h_UnpackAndAdd;
784eb02082bSJunchao Zhang     else if (op == MPI_PROD)                  *UnpackAndOp = link->h_UnpackAndMult;
785eb02082bSJunchao Zhang     else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->h_UnpackAndMax;
786eb02082bSJunchao Zhang     else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->h_UnpackAndMin;
787eb02082bSJunchao Zhang     else if (op == MPI_LAND)                  *UnpackAndOp = link->h_UnpackAndLAND;
788eb02082bSJunchao Zhang     else if (op == MPI_BAND)                  *UnpackAndOp = link->h_UnpackAndBAND;
789eb02082bSJunchao Zhang     else if (op == MPI_LOR)                   *UnpackAndOp = link->h_UnpackAndLOR;
790eb02082bSJunchao Zhang     else if (op == MPI_BOR)                   *UnpackAndOp = link->h_UnpackAndBOR;
791eb02082bSJunchao Zhang     else if (op == MPI_LXOR)                  *UnpackAndOp = link->h_UnpackAndLXOR;
792eb02082bSJunchao Zhang     else if (op == MPI_BXOR)                  *UnpackAndOp = link->h_UnpackAndBXOR;
793eb02082bSJunchao Zhang     else if (op == MPI_MAXLOC)                *UnpackAndOp = link->h_UnpackAndMaxloc;
794eb02082bSJunchao Zhang     else if (op == MPI_MINLOC)                *UnpackAndOp = link->h_UnpackAndMinloc;
795eb02082bSJunchao Zhang   }
7967fd2d3dbSJunchao Zhang #if defined(PETSC_HAVE_DEVICE)
79771438e86SJunchao Zhang   else if (PetscMemTypeDevice(mtype) && !atomic) {
79883df288dSJunchao Zhang     if      (op == MPI_REPLACE)               *UnpackAndOp = link->d_UnpackAndInsert;
799eb02082bSJunchao Zhang     else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->d_UnpackAndAdd;
800eb02082bSJunchao Zhang     else if (op == MPI_PROD)                  *UnpackAndOp = link->d_UnpackAndMult;
801eb02082bSJunchao Zhang     else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->d_UnpackAndMax;
802eb02082bSJunchao Zhang     else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->d_UnpackAndMin;
803eb02082bSJunchao Zhang     else if (op == MPI_LAND)                  *UnpackAndOp = link->d_UnpackAndLAND;
804eb02082bSJunchao Zhang     else if (op == MPI_BAND)                  *UnpackAndOp = link->d_UnpackAndBAND;
805eb02082bSJunchao Zhang     else if (op == MPI_LOR)                   *UnpackAndOp = link->d_UnpackAndLOR;
806eb02082bSJunchao Zhang     else if (op == MPI_BOR)                   *UnpackAndOp = link->d_UnpackAndBOR;
807eb02082bSJunchao Zhang     else if (op == MPI_LXOR)                  *UnpackAndOp = link->d_UnpackAndLXOR;
808eb02082bSJunchao Zhang     else if (op == MPI_BXOR)                  *UnpackAndOp = link->d_UnpackAndBXOR;
809eb02082bSJunchao Zhang     else if (op == MPI_MAXLOC)                *UnpackAndOp = link->d_UnpackAndMaxloc;
810eb02082bSJunchao Zhang     else if (op == MPI_MINLOC)                *UnpackAndOp = link->d_UnpackAndMinloc;
81171438e86SJunchao Zhang   } else if (PetscMemTypeDevice(mtype) && atomic) {
81283df288dSJunchao Zhang     if      (op == MPI_REPLACE)               *UnpackAndOp = link->da_UnpackAndInsert;
813eb02082bSJunchao Zhang     else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->da_UnpackAndAdd;
814eb02082bSJunchao Zhang     else if (op == MPI_PROD)                  *UnpackAndOp = link->da_UnpackAndMult;
815eb02082bSJunchao Zhang     else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->da_UnpackAndMax;
816eb02082bSJunchao Zhang     else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->da_UnpackAndMin;
817eb02082bSJunchao Zhang     else if (op == MPI_LAND)                  *UnpackAndOp = link->da_UnpackAndLAND;
818eb02082bSJunchao Zhang     else if (op == MPI_BAND)                  *UnpackAndOp = link->da_UnpackAndBAND;
819eb02082bSJunchao Zhang     else if (op == MPI_LOR)                   *UnpackAndOp = link->da_UnpackAndLOR;
820eb02082bSJunchao Zhang     else if (op == MPI_BOR)                   *UnpackAndOp = link->da_UnpackAndBOR;
821eb02082bSJunchao Zhang     else if (op == MPI_LXOR)                  *UnpackAndOp = link->da_UnpackAndLXOR;
822eb02082bSJunchao Zhang     else if (op == MPI_BXOR)                  *UnpackAndOp = link->da_UnpackAndBXOR;
823eb02082bSJunchao Zhang     else if (op == MPI_MAXLOC)                *UnpackAndOp = link->da_UnpackAndMaxloc;
824eb02082bSJunchao Zhang     else if (op == MPI_MINLOC)                *UnpackAndOp = link->da_UnpackAndMinloc;
825eb02082bSJunchao Zhang   }
826eb02082bSJunchao Zhang #endif
82740e23c03SJunchao Zhang   PetscFunctionReturn(0);
82840e23c03SJunchao Zhang }
82940e23c03SJunchao Zhang 
830fcc7397dSJunchao Zhang PetscErrorCode PetscSFLinkGetScatterAndOp(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**ScatterAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*))
83140e23c03SJunchao Zhang {
83240e23c03SJunchao Zhang   PetscFunctionBegin;
833cd620004SJunchao Zhang   *ScatterAndOp = NULL;
83471438e86SJunchao Zhang   if (PetscMemTypeHost(mtype)) {
83583df288dSJunchao Zhang     if      (op == MPI_REPLACE)               *ScatterAndOp = link->h_ScatterAndInsert;
836cd620004SJunchao Zhang     else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->h_ScatterAndAdd;
837cd620004SJunchao Zhang     else if (op == MPI_PROD)                  *ScatterAndOp = link->h_ScatterAndMult;
838cd620004SJunchao Zhang     else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->h_ScatterAndMax;
839cd620004SJunchao Zhang     else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->h_ScatterAndMin;
840cd620004SJunchao Zhang     else if (op == MPI_LAND)                  *ScatterAndOp = link->h_ScatterAndLAND;
841cd620004SJunchao Zhang     else if (op == MPI_BAND)                  *ScatterAndOp = link->h_ScatterAndBAND;
842cd620004SJunchao Zhang     else if (op == MPI_LOR)                   *ScatterAndOp = link->h_ScatterAndLOR;
843cd620004SJunchao Zhang     else if (op == MPI_BOR)                   *ScatterAndOp = link->h_ScatterAndBOR;
844cd620004SJunchao Zhang     else if (op == MPI_LXOR)                  *ScatterAndOp = link->h_ScatterAndLXOR;
845cd620004SJunchao Zhang     else if (op == MPI_BXOR)                  *ScatterAndOp = link->h_ScatterAndBXOR;
846cd620004SJunchao Zhang     else if (op == MPI_MAXLOC)                *ScatterAndOp = link->h_ScatterAndMaxloc;
847cd620004SJunchao Zhang     else if (op == MPI_MINLOC)                *ScatterAndOp = link->h_ScatterAndMinloc;
848eb02082bSJunchao Zhang   }
8497fd2d3dbSJunchao Zhang #if defined(PETSC_HAVE_DEVICE)
85071438e86SJunchao Zhang   else if (PetscMemTypeDevice(mtype) && !atomic) {
85183df288dSJunchao Zhang     if      (op == MPI_REPLACE)               *ScatterAndOp = link->d_ScatterAndInsert;
852cd620004SJunchao Zhang     else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->d_ScatterAndAdd;
853cd620004SJunchao Zhang     else if (op == MPI_PROD)                  *ScatterAndOp = link->d_ScatterAndMult;
854cd620004SJunchao Zhang     else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->d_ScatterAndMax;
855cd620004SJunchao Zhang     else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->d_ScatterAndMin;
856cd620004SJunchao Zhang     else if (op == MPI_LAND)                  *ScatterAndOp = link->d_ScatterAndLAND;
857cd620004SJunchao Zhang     else if (op == MPI_BAND)                  *ScatterAndOp = link->d_ScatterAndBAND;
858cd620004SJunchao Zhang     else if (op == MPI_LOR)                   *ScatterAndOp = link->d_ScatterAndLOR;
859cd620004SJunchao Zhang     else if (op == MPI_BOR)                   *ScatterAndOp = link->d_ScatterAndBOR;
860cd620004SJunchao Zhang     else if (op == MPI_LXOR)                  *ScatterAndOp = link->d_ScatterAndLXOR;
861cd620004SJunchao Zhang     else if (op == MPI_BXOR)                  *ScatterAndOp = link->d_ScatterAndBXOR;
862cd620004SJunchao Zhang     else if (op == MPI_MAXLOC)                *ScatterAndOp = link->d_ScatterAndMaxloc;
863cd620004SJunchao Zhang     else if (op == MPI_MINLOC)                *ScatterAndOp = link->d_ScatterAndMinloc;
86471438e86SJunchao Zhang   } else if (PetscMemTypeDevice(mtype) && atomic) {
86583df288dSJunchao Zhang     if      (op == MPI_REPLACE)               *ScatterAndOp = link->da_ScatterAndInsert;
866cd620004SJunchao Zhang     else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->da_ScatterAndAdd;
867cd620004SJunchao Zhang     else if (op == MPI_PROD)                  *ScatterAndOp = link->da_ScatterAndMult;
868cd620004SJunchao Zhang     else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->da_ScatterAndMax;
869cd620004SJunchao Zhang     else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->da_ScatterAndMin;
870cd620004SJunchao Zhang     else if (op == MPI_LAND)                  *ScatterAndOp = link->da_ScatterAndLAND;
871cd620004SJunchao Zhang     else if (op == MPI_BAND)                  *ScatterAndOp = link->da_ScatterAndBAND;
872cd620004SJunchao Zhang     else if (op == MPI_LOR)                   *ScatterAndOp = link->da_ScatterAndLOR;
873cd620004SJunchao Zhang     else if (op == MPI_BOR)                   *ScatterAndOp = link->da_ScatterAndBOR;
874cd620004SJunchao Zhang     else if (op == MPI_LXOR)                  *ScatterAndOp = link->da_ScatterAndLXOR;
875cd620004SJunchao Zhang     else if (op == MPI_BXOR)                  *ScatterAndOp = link->da_ScatterAndBXOR;
876cd620004SJunchao Zhang     else if (op == MPI_MAXLOC)                *ScatterAndOp = link->da_ScatterAndMaxloc;
877cd620004SJunchao Zhang     else if (op == MPI_MINLOC)                *ScatterAndOp = link->da_ScatterAndMinloc;
878eb02082bSJunchao Zhang   }
879eb02082bSJunchao Zhang #endif
880cd620004SJunchao Zhang   PetscFunctionReturn(0);
881cd620004SJunchao Zhang }
882cd620004SJunchao Zhang 
883fcc7397dSJunchao Zhang PetscErrorCode PetscSFLinkGetFetchAndOp(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**FetchAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,void*))
884cd620004SJunchao Zhang {
885cd620004SJunchao Zhang   PetscFunctionBegin;
886cd620004SJunchao Zhang   *FetchAndOp = NULL;
887cd620004SJunchao Zhang   if (op != MPI_SUM && op != MPIU_SUM) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for MPI_Op in FetchAndOp");
88871438e86SJunchao Zhang   if (PetscMemTypeHost(mtype)) *FetchAndOp = link->h_FetchAndAdd;
8897fd2d3dbSJunchao Zhang #if defined(PETSC_HAVE_DEVICE)
89071438e86SJunchao Zhang   else if (PetscMemTypeDevice(mtype) && !atomic) *FetchAndOp = link->d_FetchAndAdd;
89171438e86SJunchao Zhang   else if (PetscMemTypeDevice(mtype) && atomic)  *FetchAndOp = link->da_FetchAndAdd;
892cd620004SJunchao Zhang #endif
893cd620004SJunchao Zhang   PetscFunctionReturn(0);
894cd620004SJunchao Zhang }
895cd620004SJunchao Zhang 
896fcc7397dSJunchao Zhang PetscErrorCode PetscSFLinkGetFetchAndOpLocal(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**FetchAndOpLocal)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*))
897cd620004SJunchao Zhang {
898cd620004SJunchao Zhang   PetscFunctionBegin;
899cd620004SJunchao Zhang   *FetchAndOpLocal = NULL;
900cd620004SJunchao Zhang   if (op != MPI_SUM && op != MPIU_SUM) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for MPI_Op in FetchAndOp");
90171438e86SJunchao Zhang   if (PetscMemTypeHost(mtype)) *FetchAndOpLocal = link->h_FetchAndAddLocal;
9027fd2d3dbSJunchao Zhang #if defined(PETSC_HAVE_DEVICE)
90371438e86SJunchao Zhang   else if (PetscMemTypeDevice(mtype) && !atomic) *FetchAndOpLocal = link->d_FetchAndAddLocal;
90471438e86SJunchao Zhang   else if (PetscMemTypeDevice(mtype) && atomic)  *FetchAndOpLocal = link->da_FetchAndAddLocal;
905cd620004SJunchao Zhang #endif
906cd620004SJunchao Zhang   PetscFunctionReturn(0);
907cd620004SJunchao Zhang }
908cd620004SJunchao Zhang 
909cd620004SJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkLogFlopsAfterUnpackRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,MPI_Op op)
910cd620004SJunchao Zhang {
911cd620004SJunchao Zhang   PetscErrorCode ierr;
912cd620004SJunchao Zhang   PetscLogDouble flops;
913cd620004SJunchao Zhang   PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
914cd620004SJunchao Zhang 
915cd620004SJunchao Zhang   PetscFunctionBegin;
91683df288dSJunchao Zhang   if (op != MPI_REPLACE && link->basicunit == MPIU_SCALAR) { /* op is a reduction on PetscScalars */
917cd620004SJunchao Zhang     flops = bas->rootbuflen[scope]*link->bs; /* # of roots in buffer x # of scalars in unit */
9187fd2d3dbSJunchao Zhang #if defined(PETSC_HAVE_DEVICE)
91971438e86SJunchao Zhang     if (PetscMemTypeDevice(link->rootmtype)) {ierr = PetscLogGpuFlops(flops);CHKERRQ(ierr);} else
920cd620004SJunchao Zhang #endif
921cd620004SJunchao Zhang     {ierr = PetscLogFlops(flops);CHKERRQ(ierr);}
922cd620004SJunchao Zhang   }
923cd620004SJunchao Zhang   PetscFunctionReturn(0);
924cd620004SJunchao Zhang }
925cd620004SJunchao Zhang 
926cd620004SJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkLogFlopsAfterUnpackLeafData(PetscSF sf,PetscSFLink link,PetscSFScope scope,MPI_Op op)
927cd620004SJunchao Zhang {
928cd620004SJunchao Zhang   PetscLogDouble flops;
929cd620004SJunchao Zhang   PetscErrorCode ierr;
930cd620004SJunchao Zhang 
931cd620004SJunchao Zhang   PetscFunctionBegin;
93283df288dSJunchao Zhang   if (op != MPI_REPLACE && link->basicunit == MPIU_SCALAR) { /* op is a reduction on PetscScalars */
933cd620004SJunchao Zhang     flops = sf->leafbuflen[scope]*link->bs; /* # of roots in buffer x # of scalars in unit */
9347fd2d3dbSJunchao Zhang #if defined(PETSC_HAVE_DEVICE)
93571438e86SJunchao Zhang     if (PetscMemTypeDevice(link->leafmtype)) {ierr = PetscLogGpuFlops(flops);CHKERRQ(ierr);} else
936cd620004SJunchao Zhang #endif
937cd620004SJunchao Zhang     {ierr = PetscLogFlops(flops);CHKERRQ(ierr);}
938cd620004SJunchao Zhang   }
939cd620004SJunchao Zhang   PetscFunctionReturn(0);
940cd620004SJunchao Zhang }
941cd620004SJunchao Zhang 
942cd620004SJunchao Zhang /* When SF could not find a proper UnpackAndOp() from link, it falls back to MPI_Reduce_local.
943*4165533cSJose E. Roman   Input Parameters:
944cd620004SJunchao Zhang   +sf      - The StarForest
945cd620004SJunchao Zhang   .link    - The link
946cd620004SJunchao Zhang   .count   - Number of entries to unpack
947cd620004SJunchao Zhang   .start   - The first index, significent when indices=NULL
948cd620004SJunchao Zhang   .indices - Indices of entries in <data>. If NULL, it means indices are contiguous and the first is given in <start>
949cd620004SJunchao Zhang   .buf     - A contiguous buffer to unpack from
950cd620004SJunchao Zhang   -op      - Operation after unpack
951cd620004SJunchao Zhang 
952*4165533cSJose E. Roman   Output Parameters:
953cd620004SJunchao Zhang   .data    - The data to unpack to
954cd620004SJunchao Zhang */
955cd620004SJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkUnpackDataWithMPIReduceLocal(PetscSF sf,PetscSFLink link,PetscInt count,PetscInt start,const PetscInt *indices,void *data,const void *buf,MPI_Op op)
956cd620004SJunchao Zhang {
957cd620004SJunchao Zhang   PetscFunctionBegin;
958cd620004SJunchao Zhang #if defined(PETSC_HAVE_MPI_REDUCE_LOCAL)
959cd620004SJunchao Zhang   {
960cd620004SJunchao Zhang     PetscErrorCode ierr;
961cd620004SJunchao Zhang     PetscInt       i;
962cd620004SJunchao Zhang     PetscMPIInt    n;
963cd620004SJunchao Zhang     if (indices) {
964cd620004SJunchao Zhang       /* Note we use link->unit instead of link->basicunit. When op can be mapped to MPI_SUM etc, it operates on
965cd620004SJunchao Zhang          basic units of a root/leaf element-wisely. Otherwise, it is meant to operate on a whole root/leaf.
966cd620004SJunchao Zhang       */
967ffc4695bSBarry Smith       for (i=0; i<count; i++) {ierr = MPI_Reduce_local((const char*)buf+i*link->unitbytes,(char*)data+indices[i]*link->unitbytes,1,link->unit,op);CHKERRMPI(ierr);}
968cd620004SJunchao Zhang     } else {
969cd620004SJunchao Zhang       ierr = PetscMPIIntCast(count,&n);CHKERRQ(ierr);
970ffc4695bSBarry Smith       ierr = MPI_Reduce_local(buf,(char*)data+start*link->unitbytes,n,link->unit,op);CHKERRMPI(ierr);
971cd620004SJunchao Zhang     }
972cd620004SJunchao Zhang   }
973b458e8f1SJose E. Roman   PetscFunctionReturn(0);
974cd620004SJunchao Zhang #else
975cd620004SJunchao Zhang   SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No unpacking reduction operation for this MPI_Op");
976cd620004SJunchao Zhang #endif
977cd620004SJunchao Zhang }
978cd620004SJunchao Zhang 
979fcc7397dSJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkScatterDataWithMPIReduceLocal(PetscSF sf,PetscSFLink link,PetscInt count,PetscInt srcStart,const PetscInt *srcIdx,const void *src,PetscInt dstStart,const PetscInt *dstIdx,void *dst,MPI_Op op)
980cd620004SJunchao Zhang {
981cd620004SJunchao Zhang   PetscFunctionBegin;
982cd620004SJunchao Zhang #if defined(PETSC_HAVE_MPI_REDUCE_LOCAL)
983cd620004SJunchao Zhang   {
984cd620004SJunchao Zhang     PetscErrorCode ierr;
985cd620004SJunchao Zhang     PetscInt       i,disp;
986fcc7397dSJunchao Zhang     if (!srcIdx) {
987fcc7397dSJunchao Zhang       ierr = PetscSFLinkUnpackDataWithMPIReduceLocal(sf,link,count,dstStart,dstIdx,dst,(const char*)src+srcStart*link->unitbytes,op);CHKERRQ(ierr);
988fcc7397dSJunchao Zhang     } else {
989cd620004SJunchao Zhang       for (i=0; i<count; i++) {
990fcc7397dSJunchao Zhang         disp = dstIdx? dstIdx[i] : dstStart + i;
991ffc4695bSBarry Smith         ierr = MPI_Reduce_local((const char*)src+srcIdx[i]*link->unitbytes,(char*)dst+disp*link->unitbytes,1,link->unit,op);CHKERRMPI(ierr);
992fcc7397dSJunchao Zhang       }
993cd620004SJunchao Zhang     }
994cd620004SJunchao Zhang   }
995b458e8f1SJose E. Roman   PetscFunctionReturn(0);
996cd620004SJunchao Zhang #else
997cd620004SJunchao Zhang   SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No unpacking reduction operation for this MPI_Op");
998cd620004SJunchao Zhang #endif
999cd620004SJunchao Zhang }
1000cd620004SJunchao Zhang 
1001cd620004SJunchao Zhang /*=============================================================================
1002cd620004SJunchao Zhang               Pack/Unpack/Fetch/Scatter routines
1003cd620004SJunchao Zhang  ============================================================================*/
1004cd620004SJunchao Zhang 
1005cd620004SJunchao Zhang /* Pack rootdata to rootbuf
1006*4165533cSJose E. Roman   Input Parameters:
1007cd620004SJunchao Zhang   + sf       - The SF this packing works on.
1008cd620004SJunchao Zhang   . link     - It gives the memtype of the roots and also provides root buffer.
1009cd620004SJunchao Zhang   . scope    - PETSCSF_LOCAL or PETSCSF_REMOTE. Note SF has the ability to do local and remote communications separately.
1010cd620004SJunchao Zhang   - rootdata - Where to read the roots.
1011cd620004SJunchao Zhang 
1012cd620004SJunchao Zhang   Notes:
1013cd620004SJunchao Zhang   When rootdata can be directly used as root buffer, the routine is almost a no-op. After the call, root data is
101471438e86SJunchao Zhang   in a place where the underlying MPI is ready to access (use_gpu_aware_mpi or not)
1015cd620004SJunchao Zhang  */
101671438e86SJunchao Zhang PetscErrorCode PetscSFLinkPackRootData_Private(PetscSF sf,PetscSFLink link,PetscSFScope scope,const void *rootdata)
1017cd620004SJunchao Zhang {
1018cd620004SJunchao Zhang   PetscErrorCode   ierr;
1019cd620004SJunchao Zhang   const PetscInt   *rootindices = NULL;
1020cd620004SJunchao Zhang   PetscInt         count,start;
1021fcc7397dSJunchao Zhang   PetscErrorCode   (*Pack)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*) = NULL;
1022cd620004SJunchao Zhang   PetscMemType     rootmtype = link->rootmtype;
1023fcc7397dSJunchao Zhang   PetscSFPackOpt   opt = NULL;
1024fcc7397dSJunchao Zhang 
1025cd620004SJunchao Zhang   PetscFunctionBegin;
1026cd620004SJunchao Zhang   ierr = PetscLogEventBegin(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr);
102771438e86SJunchao Zhang   if (!link->rootdirect[scope]) { /* If rootdata works directly as rootbuf, skip packing */
1028fcc7397dSJunchao Zhang     ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,scope,&count,&start,&opt,&rootindices);CHKERRQ(ierr);
1029cd620004SJunchao Zhang     ierr = PetscSFLinkGetPack(link,rootmtype,&Pack);CHKERRQ(ierr);
1030fcc7397dSJunchao Zhang     ierr = (*Pack)(link,count,start,opt,rootindices,rootdata,link->rootbuf[scope][rootmtype]);CHKERRQ(ierr);
1031cd620004SJunchao Zhang   }
1032cd620004SJunchao Zhang   ierr = PetscLogEventEnd(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr);
1033cd620004SJunchao Zhang   PetscFunctionReturn(0);
1034cd620004SJunchao Zhang }
1035cd620004SJunchao Zhang 
1036cd620004SJunchao Zhang /* Pack leafdata to leafbuf */
103771438e86SJunchao Zhang PetscErrorCode PetscSFLinkPackLeafData_Private(PetscSF sf,PetscSFLink link,PetscSFScope scope,const void *leafdata)
1038cd620004SJunchao Zhang {
1039cd620004SJunchao Zhang   PetscErrorCode   ierr;
1040cd620004SJunchao Zhang   const PetscInt   *leafindices = NULL;
1041cd620004SJunchao Zhang   PetscInt         count,start;
1042fcc7397dSJunchao Zhang   PetscErrorCode   (*Pack)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*) = NULL;
1043cd620004SJunchao Zhang   PetscMemType     leafmtype = link->leafmtype;
1044fcc7397dSJunchao Zhang   PetscSFPackOpt   opt = NULL;
1045cd620004SJunchao Zhang 
1046cd620004SJunchao Zhang   PetscFunctionBegin;
1047cd620004SJunchao Zhang   ierr = PetscLogEventBegin(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr);
104871438e86SJunchao Zhang   if (!link->leafdirect[scope]) { /* If leafdata works directly as rootbuf, skip packing */
1049fcc7397dSJunchao Zhang     ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,scope,&count,&start,&opt,&leafindices);CHKERRQ(ierr);
1050cd620004SJunchao Zhang     ierr = PetscSFLinkGetPack(link,leafmtype,&Pack);CHKERRQ(ierr);
1051fcc7397dSJunchao Zhang     ierr = (*Pack)(link,count,start,opt,leafindices,leafdata,link->leafbuf[scope][leafmtype]);CHKERRQ(ierr);
1052cd620004SJunchao Zhang   }
1053cd620004SJunchao Zhang   ierr = PetscLogEventEnd(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr);
1054cd620004SJunchao Zhang   PetscFunctionReturn(0);
1055cd620004SJunchao Zhang }
1056cd620004SJunchao Zhang 
105771438e86SJunchao Zhang /* Pack rootdata to rootbuf, which are in the same memory space */
105871438e86SJunchao Zhang PetscErrorCode PetscSFLinkPackRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,const void *rootdata)
105971438e86SJunchao Zhang {
106071438e86SJunchao Zhang   PetscErrorCode   ierr;
106171438e86SJunchao Zhang   PetscSF_Basic    *bas = (PetscSF_Basic*)sf->data;
106271438e86SJunchao Zhang 
106371438e86SJunchao Zhang   PetscFunctionBegin;
106471438e86SJunchao Zhang   if (scope == PETSCSF_REMOTE) { /* Sync the device if rootdata is not on petsc default stream */
106571438e86SJunchao Zhang     if (PetscMemTypeDevice(link->rootmtype) && link->SyncDevice && sf->unknown_input_stream) {ierr = (*link->SyncDevice)(link);CHKERRQ(ierr);}
106671438e86SJunchao Zhang     if (link->PrePack) {ierr = (*link->PrePack)(sf,link,PETSCSF_ROOT2LEAF);CHKERRQ(ierr);} /* Used by SF nvshmem */
106771438e86SJunchao Zhang   }
106871438e86SJunchao Zhang   ierr = PetscLogEventBegin(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr);
106971438e86SJunchao Zhang   if (bas->rootbuflen[scope]) {ierr = PetscSFLinkPackRootData_Private(sf,link,scope,rootdata);CHKERRQ(ierr);}
107071438e86SJunchao Zhang   ierr = PetscLogEventEnd(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr);
107171438e86SJunchao Zhang   PetscFunctionReturn(0);
107271438e86SJunchao Zhang }
107371438e86SJunchao Zhang /* Pack leafdata to leafbuf, which are in the same memory space */
107471438e86SJunchao Zhang PetscErrorCode PetscSFLinkPackLeafData(PetscSF sf,PetscSFLink link,PetscSFScope scope,const void *leafdata)
107571438e86SJunchao Zhang {
107671438e86SJunchao Zhang   PetscErrorCode   ierr;
107771438e86SJunchao Zhang 
107871438e86SJunchao Zhang   PetscFunctionBegin;
107971438e86SJunchao Zhang   if (scope == PETSCSF_REMOTE) {
108071438e86SJunchao Zhang     if (PetscMemTypeDevice(link->leafmtype) && link->SyncDevice && sf->unknown_input_stream) {ierr = (*link->SyncDevice)(link);CHKERRQ(ierr);}
108171438e86SJunchao Zhang     if (link->PrePack) {ierr = (*link->PrePack)(sf,link,PETSCSF_LEAF2ROOT);CHKERRQ(ierr);}  /* Used by SF nvshmem */
108271438e86SJunchao Zhang   }
108371438e86SJunchao Zhang   ierr = PetscLogEventBegin(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr);
108471438e86SJunchao Zhang   if (sf->leafbuflen[scope]) {ierr = PetscSFLinkPackLeafData_Private(sf,link,scope,leafdata);CHKERRQ(ierr);}
108571438e86SJunchao Zhang   ierr = PetscLogEventEnd(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr);
108671438e86SJunchao Zhang   PetscFunctionReturn(0);
108771438e86SJunchao Zhang }
108871438e86SJunchao Zhang 
108971438e86SJunchao Zhang PetscErrorCode PetscSFLinkUnpackRootData_Private(PetscSF sf,PetscSFLink link,PetscSFScope scope,void *rootdata,MPI_Op op)
1090cd620004SJunchao Zhang {
1091cd620004SJunchao Zhang   PetscErrorCode   ierr;
1092cd620004SJunchao Zhang   const PetscInt   *rootindices = NULL;
1093cd620004SJunchao Zhang   PetscInt         count,start;
1094cd620004SJunchao Zhang   PetscSF_Basic    *bas = (PetscSF_Basic*)sf->data;
1095fcc7397dSJunchao Zhang   PetscErrorCode   (*UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*) = NULL;
1096cd620004SJunchao Zhang   PetscMemType     rootmtype = link->rootmtype;
1097fcc7397dSJunchao Zhang   PetscSFPackOpt   opt = NULL;
1098cd620004SJunchao Zhang 
1099cd620004SJunchao Zhang   PetscFunctionBegin;
110071438e86SJunchao Zhang   if (!link->rootdirect[scope]) { /* If rootdata works directly as rootbuf, skip unpacking */
1101cd620004SJunchao Zhang     ierr = PetscSFLinkGetUnpackAndOp(link,rootmtype,op,bas->rootdups[scope],&UnpackAndOp);CHKERRQ(ierr);
1102cd620004SJunchao Zhang     if (UnpackAndOp) {
1103fcc7397dSJunchao Zhang       ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,scope,&count,&start,&opt,&rootindices);CHKERRQ(ierr);
1104fcc7397dSJunchao Zhang       ierr = (*UnpackAndOp)(link,count,start,opt,rootindices,rootdata,link->rootbuf[scope][rootmtype]);CHKERRQ(ierr);
1105cd620004SJunchao Zhang     } else {
1106fcc7397dSJunchao Zhang       ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,scope,&count,&start,&opt,&rootindices);CHKERRQ(ierr);
1107cd620004SJunchao Zhang       ierr = PetscSFLinkUnpackDataWithMPIReduceLocal(sf,link,count,start,rootindices,rootdata,link->rootbuf[scope][rootmtype],op);CHKERRQ(ierr);
1108cd620004SJunchao Zhang     }
1109cd620004SJunchao Zhang   }
1110cd620004SJunchao Zhang   ierr = PetscSFLinkLogFlopsAfterUnpackRootData(sf,link,scope,op);CHKERRQ(ierr);
1111cd620004SJunchao Zhang   PetscFunctionReturn(0);
1112cd620004SJunchao Zhang }
1113cd620004SJunchao Zhang 
111471438e86SJunchao Zhang PetscErrorCode PetscSFLinkUnpackLeafData_Private(PetscSF sf,PetscSFLink link,PetscSFScope scope,void *leafdata,MPI_Op op)
1115cd620004SJunchao Zhang {
1116cd620004SJunchao Zhang   PetscErrorCode   ierr;
1117cd620004SJunchao Zhang   const PetscInt   *leafindices = NULL;
1118cd620004SJunchao Zhang   PetscInt         count,start;
1119fcc7397dSJunchao Zhang   PetscErrorCode   (*UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*) = NULL;
1120cd620004SJunchao Zhang   PetscMemType     leafmtype = link->leafmtype;
1121fcc7397dSJunchao Zhang   PetscSFPackOpt   opt = NULL;
1122cd620004SJunchao Zhang 
1123cd620004SJunchao Zhang   PetscFunctionBegin;
112471438e86SJunchao Zhang   if (!link->leafdirect[scope]) { /* If leafdata works directly as rootbuf, skip unpacking */
1125cd620004SJunchao Zhang     ierr = PetscSFLinkGetUnpackAndOp(link,leafmtype,op,sf->leafdups[scope],&UnpackAndOp);CHKERRQ(ierr);
1126cd620004SJunchao Zhang     if (UnpackAndOp) {
1127fcc7397dSJunchao Zhang       ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,scope,&count,&start,&opt,&leafindices);CHKERRQ(ierr);
1128fcc7397dSJunchao Zhang       ierr = (*UnpackAndOp)(link,count,start,opt,leafindices,leafdata,link->leafbuf[scope][leafmtype]);CHKERRQ(ierr);
1129cd620004SJunchao Zhang     } else {
1130fcc7397dSJunchao Zhang       ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,scope,&count,&start,&opt,&leafindices);CHKERRQ(ierr);
1131cd620004SJunchao Zhang       ierr = PetscSFLinkUnpackDataWithMPIReduceLocal(sf,link,count,start,leafindices,leafdata,link->leafbuf[scope][leafmtype],op);CHKERRQ(ierr);
1132cd620004SJunchao Zhang     }
1133cd620004SJunchao Zhang   }
1134cd620004SJunchao Zhang   ierr = PetscSFLinkLogFlopsAfterUnpackLeafData(sf,link,scope,op);CHKERRQ(ierr);
113571438e86SJunchao Zhang   PetscFunctionReturn(0);
113671438e86SJunchao Zhang }
113771438e86SJunchao Zhang /* Unpack rootbuf to rootdata, which are in the same memory space */
113871438e86SJunchao Zhang PetscErrorCode PetscSFLinkUnpackRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,void *rootdata,MPI_Op op)
113971438e86SJunchao Zhang {
114071438e86SJunchao Zhang   PetscErrorCode   ierr;
114171438e86SJunchao Zhang   PetscSF_Basic    *bas = (PetscSF_Basic*)sf->data;
114271438e86SJunchao Zhang 
114371438e86SJunchao Zhang   PetscFunctionBegin;
114471438e86SJunchao Zhang   ierr = PetscLogEventBegin(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr);
114571438e86SJunchao Zhang   if (bas->rootbuflen[scope]) {ierr = PetscSFLinkUnpackRootData_Private(sf,link,scope,rootdata,op);CHKERRQ(ierr);}
1146cd620004SJunchao Zhang   ierr = PetscLogEventEnd(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr);
114771438e86SJunchao Zhang   if (scope == PETSCSF_REMOTE) {
114871438e86SJunchao Zhang     if (link->PostUnpack) {ierr = (*link->PostUnpack)(sf,link,PETSCSF_LEAF2ROOT);CHKERRQ(ierr);}  /* Used by SF nvshmem */
114971438e86SJunchao Zhang     if (PetscMemTypeDevice(link->rootmtype) && link->SyncDevice && sf->unknown_input_stream) {ierr = (*link->SyncDevice)(link);CHKERRQ(ierr);}
115071438e86SJunchao Zhang   }
1151cd620004SJunchao Zhang   PetscFunctionReturn(0);
1152cd620004SJunchao Zhang }
1153cd620004SJunchao Zhang 
115471438e86SJunchao Zhang /* Unpack leafbuf to leafdata for remote (common case) or local (rare case when rootmtype != leafmtype) */
115571438e86SJunchao Zhang PetscErrorCode PetscSFLinkUnpackLeafData(PetscSF sf,PetscSFLink link,PetscSFScope scope,void *leafdata,MPI_Op op)
115671438e86SJunchao Zhang {
115771438e86SJunchao Zhang   PetscErrorCode   ierr;
115871438e86SJunchao Zhang 
115971438e86SJunchao Zhang   PetscFunctionBegin;
116071438e86SJunchao Zhang   ierr = PetscLogEventBegin(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr);
11611e1ea65dSPierre Jolivet   if (sf->leafbuflen[scope]) {ierr = PetscSFLinkUnpackLeafData_Private(sf,link,scope,leafdata,op);CHKERRQ(ierr);}
116271438e86SJunchao Zhang   ierr = PetscLogEventEnd(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr);
116371438e86SJunchao Zhang   if (scope == PETSCSF_REMOTE) {
116471438e86SJunchao Zhang     if (link->PostUnpack) {ierr = (*link->PostUnpack)(sf,link,PETSCSF_ROOT2LEAF);CHKERRQ(ierr);}  /* Used by SF nvshmem */
116571438e86SJunchao Zhang     if (PetscMemTypeDevice(link->leafmtype) && link->SyncDevice && sf->unknown_input_stream) {ierr = (*link->SyncDevice)(link);CHKERRQ(ierr);}
116671438e86SJunchao Zhang   }
116771438e86SJunchao Zhang   PetscFunctionReturn(0);
116871438e86SJunchao Zhang }
116971438e86SJunchao Zhang 
117071438e86SJunchao Zhang /* FetchAndOp rootdata with rootbuf, it is a kind of Unpack on rootdata, except it also updates rootbuf */
117171438e86SJunchao Zhang PetscErrorCode PetscSFLinkFetchAndOpRemote(PetscSF sf,PetscSFLink link,void *rootdata,MPI_Op op)
1172cd620004SJunchao Zhang {
1173cd620004SJunchao Zhang   PetscErrorCode     ierr;
1174cd620004SJunchao Zhang   const PetscInt     *rootindices = NULL;
1175cd620004SJunchao Zhang   PetscInt           count,start;
1176cd620004SJunchao Zhang   PetscSF_Basic      *bas = (PetscSF_Basic*)sf->data;
1177fcc7397dSJunchao Zhang   PetscErrorCode     (*FetchAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,void*) = NULL;
1178cd620004SJunchao Zhang   PetscMemType       rootmtype = link->rootmtype;
1179fcc7397dSJunchao Zhang   PetscSFPackOpt     opt = NULL;
1180cd620004SJunchao Zhang 
1181cd620004SJunchao Zhang   PetscFunctionBegin;
1182cd620004SJunchao Zhang   ierr = PetscLogEventBegin(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr);
118371438e86SJunchao Zhang   if (bas->rootbuflen[PETSCSF_REMOTE]) {
1184cd620004SJunchao Zhang     /* Do FetchAndOp on rootdata with rootbuf */
118571438e86SJunchao Zhang     ierr = PetscSFLinkGetFetchAndOp(link,rootmtype,op,bas->rootdups[PETSCSF_REMOTE],&FetchAndOp);CHKERRQ(ierr);
118671438e86SJunchao Zhang     ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,PETSCSF_REMOTE,&count,&start,&opt,&rootindices);CHKERRQ(ierr);
118771438e86SJunchao Zhang     ierr = (*FetchAndOp)(link,count,start,opt,rootindices,rootdata,link->rootbuf[PETSCSF_REMOTE][rootmtype]);CHKERRQ(ierr);
1188cd620004SJunchao Zhang   }
118971438e86SJunchao Zhang   ierr = PetscSFLinkLogFlopsAfterUnpackRootData(sf,link,PETSCSF_REMOTE,op);CHKERRQ(ierr);
1190cd620004SJunchao Zhang   ierr = PetscLogEventEnd(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr);
1191cd620004SJunchao Zhang   PetscFunctionReturn(0);
1192cd620004SJunchao Zhang }
1193cd620004SJunchao Zhang 
119471438e86SJunchao Zhang PetscErrorCode PetscSFLinkScatterLocal(PetscSF sf,PetscSFLink link,PetscSFDirection direction,void *rootdata,void *leafdata,MPI_Op op)
1195cd620004SJunchao Zhang {
1196cd620004SJunchao Zhang   PetscErrorCode       ierr;
1197cd620004SJunchao Zhang   const PetscInt       *rootindices = NULL,*leafindices = NULL;
1198cd620004SJunchao Zhang   PetscInt             count,rootstart,leafstart;
1199cd620004SJunchao Zhang   PetscSF_Basic        *bas = (PetscSF_Basic*)sf->data;
1200fcc7397dSJunchao Zhang   PetscErrorCode       (*ScatterAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*) = NULL;
120171438e86SJunchao Zhang   PetscMemType         rootmtype = link->rootmtype,leafmtype = link->leafmtype,srcmtype,dstmtype;
1202fcc7397dSJunchao Zhang   PetscSFPackOpt       leafopt = NULL,rootopt = NULL;
120371438e86SJunchao Zhang   PetscInt             buflen = sf->leafbuflen[PETSCSF_LOCAL];
120471438e86SJunchao Zhang   char                 *srcbuf = NULL,*dstbuf = NULL;
120571438e86SJunchao Zhang   PetscBool            dstdups;
1206cd620004SJunchao Zhang 
1207362febeeSStefano Zampini   PetscFunctionBegin;
120871438e86SJunchao Zhang   if (!buflen) PetscFunctionReturn(0);
120971438e86SJunchao Zhang   if (rootmtype != leafmtype) { /* The cross memory space local scatter is done by pack, copy and unpack */
121071438e86SJunchao Zhang     if (direction == PETSCSF_ROOT2LEAF) {
1211cd620004SJunchao Zhang       ierr     = PetscSFLinkPackRootData(sf,link,PETSCSF_LOCAL,rootdata);CHKERRQ(ierr);
121271438e86SJunchao Zhang       srcmtype = rootmtype;
121371438e86SJunchao Zhang       srcbuf   = link->rootbuf[PETSCSF_LOCAL][rootmtype];
121471438e86SJunchao Zhang       dstmtype = leafmtype;
121571438e86SJunchao Zhang       dstbuf   = link->leafbuf[PETSCSF_LOCAL][leafmtype];
121671438e86SJunchao Zhang     } else {
121771438e86SJunchao Zhang       ierr     = PetscSFLinkPackLeafData(sf,link,PETSCSF_LOCAL,leafdata);CHKERRQ(ierr);
121871438e86SJunchao Zhang       srcmtype = leafmtype;
121971438e86SJunchao Zhang       srcbuf   = link->leafbuf[PETSCSF_LOCAL][leafmtype];
122071438e86SJunchao Zhang       dstmtype = rootmtype;
122171438e86SJunchao Zhang       dstbuf   = link->rootbuf[PETSCSF_LOCAL][rootmtype];
122271438e86SJunchao Zhang     }
122371438e86SJunchao Zhang     ierr = (*link->Memcpy)(link,dstmtype,dstbuf,srcmtype,srcbuf,buflen*link->unitbytes);CHKERRQ(ierr);
122471438e86SJunchao Zhang     /* If above is a device to host copy, we have to sync the stream before accessing the buffer on host */
122571438e86SJunchao Zhang     if (PetscMemTypeHost(dstmtype)) {ierr = (*link->SyncStream)(link);CHKERRQ(ierr);}
122671438e86SJunchao Zhang     if (direction == PETSCSF_ROOT2LEAF) {
1227cd620004SJunchao Zhang       ierr = PetscSFLinkUnpackLeafData(sf,link,PETSCSF_LOCAL,leafdata,op);CHKERRQ(ierr);
1228cd620004SJunchao Zhang     } else {
122971438e86SJunchao Zhang       ierr = PetscSFLinkUnpackRootData(sf,link,PETSCSF_LOCAL,rootdata,op);CHKERRQ(ierr);
123071438e86SJunchao Zhang     }
123171438e86SJunchao Zhang   } else {
123271438e86SJunchao Zhang     dstdups  = (direction == PETSCSF_ROOT2LEAF) ? sf->leafdups[PETSCSF_LOCAL] : bas->rootdups[PETSCSF_LOCAL];
123371438e86SJunchao Zhang     dstmtype = (direction == PETSCSF_ROOT2LEAF) ? link->leafmtype : link->rootmtype;
123471438e86SJunchao Zhang     ierr = PetscSFLinkGetScatterAndOp(link,dstmtype,op,dstdups,&ScatterAndOp);CHKERRQ(ierr);
1235cd620004SJunchao Zhang     if (ScatterAndOp) {
1236fcc7397dSJunchao Zhang       ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,PETSCSF_LOCAL,&count,&rootstart,&rootopt,&rootindices);CHKERRQ(ierr);
1237fcc7397dSJunchao Zhang       ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,PETSCSF_LOCAL,&count,&leafstart,&leafopt,&leafindices);CHKERRQ(ierr);
123871438e86SJunchao Zhang       if (direction == PETSCSF_ROOT2LEAF) {
1239fcc7397dSJunchao Zhang         ierr = (*ScatterAndOp)(link,count,rootstart,rootopt,rootindices,rootdata,leafstart,leafopt,leafindices,leafdata);CHKERRQ(ierr);
1240cd620004SJunchao Zhang       } else {
1241fcc7397dSJunchao Zhang         ierr = (*ScatterAndOp)(link,count,leafstart,leafopt,leafindices,leafdata,rootstart,rootopt,rootindices,rootdata);CHKERRQ(ierr);
124271438e86SJunchao Zhang       }
1243cd620004SJunchao Zhang     } else {
1244fcc7397dSJunchao Zhang       ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,PETSCSF_LOCAL,&count,&rootstart,&rootopt,&rootindices);CHKERRQ(ierr);
1245fcc7397dSJunchao Zhang       ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,PETSCSF_LOCAL,&count,&leafstart,&leafopt,&leafindices);CHKERRQ(ierr);
124671438e86SJunchao Zhang       if (direction == PETSCSF_ROOT2LEAF) {
124771438e86SJunchao Zhang         ierr = PetscSFLinkScatterDataWithMPIReduceLocal(sf,link,count,rootstart,rootindices,rootdata,leafstart,leafindices,leafdata,op);CHKERRQ(ierr);
124871438e86SJunchao Zhang       } else {
1249fcc7397dSJunchao Zhang         ierr = PetscSFLinkScatterDataWithMPIReduceLocal(sf,link,count,leafstart,leafindices,leafdata,rootstart,rootindices,rootdata,op);CHKERRQ(ierr);
1250cd620004SJunchao Zhang       }
1251cd620004SJunchao Zhang     }
125271438e86SJunchao Zhang   }
1253cd620004SJunchao Zhang   PetscFunctionReturn(0);
1254cd620004SJunchao Zhang }
1255cd620004SJunchao Zhang 
1256cd620004SJunchao Zhang /* Fetch rootdata to leafdata and leafupdate locally */
1257cd620004SJunchao Zhang PetscErrorCode PetscSFLinkFetchAndOpLocal(PetscSF sf,PetscSFLink link,void *rootdata,const void *leafdata,void *leafupdate,MPI_Op op)
1258cd620004SJunchao Zhang {
1259cd620004SJunchao Zhang   PetscErrorCode       ierr;
1260cd620004SJunchao Zhang   const PetscInt       *rootindices = NULL,*leafindices = NULL;
1261cd620004SJunchao Zhang   PetscInt             count,rootstart,leafstart;
1262cd620004SJunchao Zhang   PetscSF_Basic        *bas = (PetscSF_Basic*)sf->data;
1263fcc7397dSJunchao Zhang   PetscErrorCode       (*FetchAndOpLocal)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*) = NULL;
1264cd620004SJunchao Zhang   const PetscMemType   rootmtype = link->rootmtype,leafmtype = link->leafmtype;
1265fcc7397dSJunchao Zhang   PetscSFPackOpt       leafopt = NULL,rootopt = NULL;
1266cd620004SJunchao Zhang 
1267cd620004SJunchao Zhang   PetscFunctionBegin;
1268cd620004SJunchao Zhang   if (!bas->rootbuflen[PETSCSF_LOCAL]) PetscFunctionReturn(0);
1269cd620004SJunchao Zhang   if (rootmtype != leafmtype) {
1270cd620004SJunchao Zhang     /* The local communication has to go through pack and unpack */
1271cd620004SJunchao Zhang     SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Doing PetscSFFetchAndOp with rootdata and leafdata on opposite side of CPU and GPU");
1272cd620004SJunchao Zhang   } else {
1273fcc7397dSJunchao Zhang     ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,PETSCSF_LOCAL,&count,&rootstart,&rootopt,&rootindices);CHKERRQ(ierr);
1274fcc7397dSJunchao Zhang     ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,PETSCSF_LOCAL,&count,&leafstart,&leafopt,&leafindices);CHKERRQ(ierr);
1275cd620004SJunchao Zhang     ierr = PetscSFLinkGetFetchAndOpLocal(link,rootmtype,op,bas->rootdups[PETSCSF_LOCAL],&FetchAndOpLocal);CHKERRQ(ierr);
1276fcc7397dSJunchao Zhang     ierr = (*FetchAndOpLocal)(link,count,rootstart,rootopt,rootindices,rootdata,leafstart,leafopt,leafindices,leafdata,leafupdate);CHKERRQ(ierr);
1277cd620004SJunchao Zhang   }
127840e23c03SJunchao Zhang   PetscFunctionReturn(0);
127940e23c03SJunchao Zhang }
128040e23c03SJunchao Zhang 
128140e23c03SJunchao Zhang /*
1282cd620004SJunchao Zhang   Create per-rank pack/unpack optimizations based on indice patterns
128340e23c03SJunchao Zhang 
128440e23c03SJunchao Zhang    Input Parameters:
1285fcc7397dSJunchao Zhang   +  n       - Number of destination ranks
1286eb02082bSJunchao Zhang   .  offset  - [n+1] For the i-th rank, its associated indices are idx[offset[i], offset[i+1]). offset[0] needs not to be 0.
1287b23bfdefSJunchao Zhang   -  idx     - [*]   Array storing indices
128840e23c03SJunchao Zhang 
128940e23c03SJunchao Zhang    Output Parameters:
1290cd620004SJunchao Zhang   +  opt     - Pack optimizations. NULL if no optimizations.
129140e23c03SJunchao Zhang */
1292cd620004SJunchao Zhang PetscErrorCode PetscSFCreatePackOpt(PetscInt n,const PetscInt *offset,const PetscInt *idx,PetscSFPackOpt *out)
129340e23c03SJunchao Zhang {
129440e23c03SJunchao Zhang   PetscErrorCode ierr;
1295fcc7397dSJunchao Zhang   PetscInt       r,p,start,i,j,k,dx,dy,dz,dydz,m,X,Y;
1296fcc7397dSJunchao Zhang   PetscBool      optimizable = PETSC_TRUE;
129740e23c03SJunchao Zhang   PetscSFPackOpt opt;
129840e23c03SJunchao Zhang 
129940e23c03SJunchao Zhang   PetscFunctionBegin;
1300fcc7397dSJunchao Zhang   ierr   = PetscMalloc1(1,&opt);CHKERRQ(ierr);
1301fcc7397dSJunchao Zhang   ierr   = PetscMalloc1(7*n+2,&opt->array);CHKERRQ(ierr);
1302fcc7397dSJunchao Zhang   opt->n      = opt->array[0] = n;
1303fcc7397dSJunchao Zhang   opt->offset = opt->array + 1;
1304fcc7397dSJunchao Zhang   opt->start  = opt->array + n   + 2;
1305fcc7397dSJunchao Zhang   opt->dx     = opt->array + 2*n + 2;
1306fcc7397dSJunchao Zhang   opt->dy     = opt->array + 3*n + 2;
1307fcc7397dSJunchao Zhang   opt->dz     = opt->array + 4*n + 2;
1308fcc7397dSJunchao Zhang   opt->X      = opt->array + 5*n + 2;
1309fcc7397dSJunchao Zhang   opt->Y      = opt->array + 6*n + 2;
1310fcc7397dSJunchao Zhang 
1311fcc7397dSJunchao Zhang   for (r=0; r<n; r++) { /* For each destination rank */
1312fcc7397dSJunchao Zhang     m     = offset[r+1] - offset[r]; /* Total number of indices for this rank. We want to see if m can be factored into dx*dy*dz */
1313fcc7397dSJunchao Zhang     p     = offset[r];
1314fcc7397dSJunchao Zhang     start = idx[p]; /* First index for this rank */
1315fcc7397dSJunchao Zhang     p++;
1316fcc7397dSJunchao Zhang 
1317fcc7397dSJunchao Zhang     /* Search in X dimension */
1318fcc7397dSJunchao Zhang     for (dx=1; dx<m; dx++,p++) {
1319fcc7397dSJunchao Zhang       if (start+dx != idx[p]) break;
1320b23bfdefSJunchao Zhang     }
1321b23bfdefSJunchao Zhang 
1322fcc7397dSJunchao Zhang     dydz = m/dx;
1323fcc7397dSJunchao Zhang     X    = dydz > 1 ? (idx[p]-start) : dx;
1324fcc7397dSJunchao Zhang     /* Not optimizable if m is not a multiple of dx, or some unrecognized pattern is found */
1325fcc7397dSJunchao Zhang     if (m%dx || X <= 0) {optimizable = PETSC_FALSE; goto finish;}
1326fcc7397dSJunchao Zhang     for (dy=1; dy<dydz; dy++) { /* Search in Y dimension */
1327fcc7397dSJunchao Zhang       for (i=0; i<dx; i++,p++) {
1328fcc7397dSJunchao Zhang         if (start+X*dy+i != idx[p]) {
1329fcc7397dSJunchao Zhang           if (i) {optimizable = PETSC_FALSE; goto finish;} /* The pattern is violated in the middle of an x-walk */
1330fcc7397dSJunchao Zhang           else goto Z_dimension;
133140e23c03SJunchao Zhang         }
133240e23c03SJunchao Zhang       }
133340e23c03SJunchao Zhang     }
133440e23c03SJunchao Zhang 
1335fcc7397dSJunchao Zhang Z_dimension:
1336fcc7397dSJunchao Zhang     dz = m/(dx*dy);
1337fcc7397dSJunchao Zhang     Y  = dz > 1 ? (idx[p]-start)/X : dy;
1338fcc7397dSJunchao Zhang     /* Not optimizable if m is not a multiple of dx*dy, or some unrecognized pattern is found */
1339fcc7397dSJunchao Zhang     if (m%(dx*dy) || Y <= 0) {optimizable = PETSC_FALSE; goto finish;}
1340fcc7397dSJunchao Zhang     for (k=1; k<dz; k++) { /* Go through Z dimension to see if remaining indices follow the pattern */
1341fcc7397dSJunchao Zhang       for (j=0; j<dy; j++) {
1342fcc7397dSJunchao Zhang         for (i=0; i<dx; i++,p++) {
1343fcc7397dSJunchao Zhang           if (start+X*Y*k+X*j+i != idx[p]) {optimizable = PETSC_FALSE; goto finish;}
134440e23c03SJunchao Zhang         }
134540e23c03SJunchao Zhang       }
134640e23c03SJunchao Zhang     }
1347fcc7397dSJunchao Zhang     opt->start[r] = start;
1348fcc7397dSJunchao Zhang     opt->dx[r]    = dx;
1349fcc7397dSJunchao Zhang     opt->dy[r]    = dy;
1350fcc7397dSJunchao Zhang     opt->dz[r]    = dz;
1351fcc7397dSJunchao Zhang     opt->X[r]     = X;
1352fcc7397dSJunchao Zhang     opt->Y[r]     = Y;
135340e23c03SJunchao Zhang   }
135440e23c03SJunchao Zhang 
1355fcc7397dSJunchao Zhang finish:
1356fcc7397dSJunchao Zhang   /* If not optimizable, free arrays to save memory */
1357fcc7397dSJunchao Zhang   if (!n || !optimizable) {
1358fcc7397dSJunchao Zhang     ierr = PetscFree(opt->array);CHKERRQ(ierr);
135940e23c03SJunchao Zhang     ierr = PetscFree(opt);CHKERRQ(ierr);
136040e23c03SJunchao Zhang     *out = NULL;
1361fcc7397dSJunchao Zhang   } else {
1362fcc7397dSJunchao Zhang     opt->offset[0] = 0;
1363fcc7397dSJunchao Zhang     for (r=0; r<n; r++) opt->offset[r+1] = opt->offset[r] + opt->dx[r]*opt->dy[r]*opt->dz[r];
1364fcc7397dSJunchao Zhang     *out = opt;
1365fcc7397dSJunchao Zhang   }
136640e23c03SJunchao Zhang   PetscFunctionReturn(0);
136740e23c03SJunchao Zhang }
136840e23c03SJunchao Zhang 
136920c24465SJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFDestroyPackOpt(PetscSF sf,PetscMemType mtype,PetscSFPackOpt *out)
137040e23c03SJunchao Zhang {
137140e23c03SJunchao Zhang   PetscErrorCode ierr;
137240e23c03SJunchao Zhang   PetscSFPackOpt opt = *out;
137340e23c03SJunchao Zhang 
137440e23c03SJunchao Zhang   PetscFunctionBegin;
137540e23c03SJunchao Zhang   if (opt) {
137620c24465SJunchao Zhang     ierr = PetscSFFree(sf,mtype,opt->array);CHKERRQ(ierr);
137740e23c03SJunchao Zhang     ierr = PetscFree(opt);CHKERRQ(ierr);
137840e23c03SJunchao Zhang     *out = NULL;
137940e23c03SJunchao Zhang   }
138040e23c03SJunchao Zhang   PetscFunctionReturn(0);
138140e23c03SJunchao Zhang }
1382cd620004SJunchao Zhang 
1383cd620004SJunchao Zhang PetscErrorCode PetscSFSetUpPackFields(PetscSF sf)
1384cd620004SJunchao Zhang {
1385cd620004SJunchao Zhang   PetscErrorCode ierr;
1386cd620004SJunchao Zhang   PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
1387cd620004SJunchao Zhang   PetscInt       i,j;
1388cd620004SJunchao Zhang 
1389cd620004SJunchao Zhang   PetscFunctionBegin;
1390cd620004SJunchao Zhang   /* [0] for PETSCSF_LOCAL and [1] for PETSCSF_REMOTE in the following */
1391cd620004SJunchao Zhang   for (i=0; i<2; i++) { /* Set defaults */
1392cd620004SJunchao Zhang     sf->leafstart[i]   = 0;
1393cd620004SJunchao Zhang     sf->leafcontig[i]  = PETSC_TRUE;
1394cd620004SJunchao Zhang     sf->leafdups[i]    = PETSC_FALSE;
1395cd620004SJunchao Zhang     bas->rootstart[i]  = 0;
1396cd620004SJunchao Zhang     bas->rootcontig[i] = PETSC_TRUE;
1397cd620004SJunchao Zhang     bas->rootdups[i]   = PETSC_FALSE;
1398cd620004SJunchao Zhang   }
1399cd620004SJunchao Zhang 
1400cd620004SJunchao Zhang   sf->leafbuflen[0] = sf->roffset[sf->ndranks];
1401cd620004SJunchao Zhang   sf->leafbuflen[1] = sf->roffset[sf->nranks] - sf->roffset[sf->ndranks];
1402cd620004SJunchao Zhang 
1403cd620004SJunchao Zhang   if (sf->leafbuflen[0]) sf->leafstart[0] = sf->rmine[0];
1404cd620004SJunchao Zhang   if (sf->leafbuflen[1]) sf->leafstart[1] = sf->rmine[sf->roffset[sf->ndranks]];
1405cd620004SJunchao Zhang 
1406cd620004SJunchao Zhang   /* Are leaf indices for self and remote contiguous? If yes, it is best for pack/unpack */
1407cd620004SJunchao Zhang   for (i=0; i<sf->roffset[sf->ndranks]; i++) { /* self */
1408cd620004SJunchao Zhang     if (sf->rmine[i] != sf->leafstart[0]+i) {sf->leafcontig[0] = PETSC_FALSE; break;}
1409cd620004SJunchao Zhang   }
1410cd620004SJunchao Zhang   for (i=sf->roffset[sf->ndranks],j=0; i<sf->roffset[sf->nranks]; i++,j++) { /* remote */
1411cd620004SJunchao Zhang     if (sf->rmine[i] != sf->leafstart[1]+j) {sf->leafcontig[1] = PETSC_FALSE; break;}
1412cd620004SJunchao Zhang   }
1413cd620004SJunchao Zhang 
1414cd620004SJunchao Zhang   /* If not, see if we can have per-rank optimizations by doing index analysis */
1415cd620004SJunchao Zhang   if (!sf->leafcontig[0]) {ierr = PetscSFCreatePackOpt(sf->ndranks,            sf->roffset,             sf->rmine, &sf->leafpackopt[0]);CHKERRQ(ierr);}
1416cd620004SJunchao Zhang   if (!sf->leafcontig[1]) {ierr = PetscSFCreatePackOpt(sf->nranks-sf->ndranks, sf->roffset+sf->ndranks, sf->rmine, &sf->leafpackopt[1]);CHKERRQ(ierr);}
1417cd620004SJunchao Zhang 
1418cd620004SJunchao Zhang   /* Are root indices for self and remote contiguous? */
1419cd620004SJunchao Zhang   bas->rootbuflen[0] = bas->ioffset[bas->ndiranks];
1420cd620004SJunchao Zhang   bas->rootbuflen[1] = bas->ioffset[bas->niranks] - bas->ioffset[bas->ndiranks];
1421cd620004SJunchao Zhang 
1422cd620004SJunchao Zhang   if (bas->rootbuflen[0]) bas->rootstart[0] = bas->irootloc[0];
1423cd620004SJunchao Zhang   if (bas->rootbuflen[1]) bas->rootstart[1] = bas->irootloc[bas->ioffset[bas->ndiranks]];
1424cd620004SJunchao Zhang 
1425cd620004SJunchao Zhang   for (i=0; i<bas->ioffset[bas->ndiranks]; i++) {
1426cd620004SJunchao Zhang     if (bas->irootloc[i] != bas->rootstart[0]+i) {bas->rootcontig[0] = PETSC_FALSE; break;}
1427cd620004SJunchao Zhang   }
1428cd620004SJunchao Zhang   for (i=bas->ioffset[bas->ndiranks],j=0; i<bas->ioffset[bas->niranks]; i++,j++) {
1429cd620004SJunchao Zhang     if (bas->irootloc[i] != bas->rootstart[1]+j) {bas->rootcontig[1] = PETSC_FALSE; break;}
1430cd620004SJunchao Zhang   }
1431cd620004SJunchao Zhang 
1432cd620004SJunchao Zhang   if (!bas->rootcontig[0]) {ierr = PetscSFCreatePackOpt(bas->ndiranks,              bas->ioffset,               bas->irootloc, &bas->rootpackopt[0]);CHKERRQ(ierr);}
1433cd620004SJunchao Zhang   if (!bas->rootcontig[1]) {ierr = PetscSFCreatePackOpt(bas->niranks-bas->ndiranks, bas->ioffset+bas->ndiranks, bas->irootloc, &bas->rootpackopt[1]);CHKERRQ(ierr);}
1434cd620004SJunchao Zhang 
14357fd2d3dbSJunchao Zhang  #if defined(PETSC_HAVE_DEVICE)
1436cd620004SJunchao Zhang     /* Check dups in indices so that CUDA unpacking kernels can use cheaper regular instructions instead of atomics when they know there are no data race chances */
1437013b3241SStefano Zampini   if (PetscDefined(HAVE_DEVICE)) {
1438013b3241SStefano Zampini     PetscBool ismulti = (sf->multi == sf) ? PETSC_TRUE : PETSC_FALSE;
1439013b3241SStefano Zampini     if (!sf->leafcontig[0]  && !ismulti) {ierr = PetscCheckDupsInt(sf->leafbuflen[0],  sf->rmine,                                 &sf->leafdups[0]);CHKERRQ(ierr);}
1440013b3241SStefano Zampini     if (!sf->leafcontig[1]  && !ismulti) {ierr = PetscCheckDupsInt(sf->leafbuflen[1],  sf->rmine+sf->roffset[sf->ndranks],        &sf->leafdups[1]);CHKERRQ(ierr);}
1441013b3241SStefano Zampini     if (!bas->rootcontig[0] && !ismulti) {ierr = PetscCheckDupsInt(bas->rootbuflen[0], bas->irootloc,                             &bas->rootdups[0]);CHKERRQ(ierr);}
1442013b3241SStefano Zampini     if (!bas->rootcontig[1] && !ismulti) {ierr = PetscCheckDupsInt(bas->rootbuflen[1], bas->irootloc+bas->ioffset[bas->ndiranks], &bas->rootdups[1]);CHKERRQ(ierr);}
1443013b3241SStefano Zampini   }
1444cd620004SJunchao Zhang #endif
1445cd620004SJunchao Zhang   PetscFunctionReturn(0);
1446cd620004SJunchao Zhang }
1447cd620004SJunchao Zhang 
1448cd620004SJunchao Zhang PetscErrorCode PetscSFResetPackFields(PetscSF sf)
1449cd620004SJunchao Zhang {
1450cd620004SJunchao Zhang   PetscErrorCode ierr;
1451cd620004SJunchao Zhang   PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
1452cd620004SJunchao Zhang   PetscInt       i;
1453cd620004SJunchao Zhang 
1454cd620004SJunchao Zhang   PetscFunctionBegin;
1455cd620004SJunchao Zhang   for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) {
145620c24465SJunchao Zhang     ierr = PetscSFDestroyPackOpt(sf,PETSC_MEMTYPE_HOST,&sf->leafpackopt[i]);CHKERRQ(ierr);
145720c24465SJunchao Zhang     ierr = PetscSFDestroyPackOpt(sf,PETSC_MEMTYPE_HOST,&bas->rootpackopt[i]);CHKERRQ(ierr);
14587fd2d3dbSJunchao Zhang    #if defined(PETSC_HAVE_DEVICE)
145920c24465SJunchao Zhang     ierr = PetscSFDestroyPackOpt(sf,PETSC_MEMTYPE_DEVICE,&sf->leafpackopt_d[i]);CHKERRQ(ierr);
146020c24465SJunchao Zhang     ierr = PetscSFDestroyPackOpt(sf,PETSC_MEMTYPE_DEVICE,&bas->rootpackopt_d[i]);CHKERRQ(ierr);
14617fd2d3dbSJunchao Zhang    #endif
1462cd620004SJunchao Zhang   }
1463cd620004SJunchao Zhang   PetscFunctionReturn(0);
1464cd620004SJunchao Zhang }
1465