xref: /petsc/src/vec/is/sf/impls/basic/sfpack.c (revision 362febeeeb69b91ebadcb4b2dc0a22cb6dfc4097)
120c24465SJunchao Zhang #include "petsc/private/sfimpl.h"
240e23c03SJunchao Zhang #include <../src/vec/is/sf/impls/basic/sfpack.h>
340e23c03SJunchao Zhang #include <../src/vec/is/sf/impls/basic/sfbasic.h>
440e23c03SJunchao Zhang 
57fd2d3dbSJunchao Zhang /* This is a C file that contains packing facilities, with dispatches to device if enabled. */
67fd2d3dbSJunchao Zhang 
7eb02082bSJunchao Zhang #if defined(PETSC_HAVE_CUDA)
8eb02082bSJunchao Zhang #include <cuda_runtime.h>
97fd2d3dbSJunchao Zhang #include <petsccublas.h>
10eb02082bSJunchao Zhang #endif
1159af0bd3SScott Kruger #if defined(PETSC_HAVE_HIP)
1259af0bd3SScott Kruger #include <hip/hip_runtime.h>
1359af0bd3SScott Kruger #include <petschipblas.h>
1459af0bd3SScott Kruger #endif
1540e23c03SJunchao Zhang /*
1640e23c03SJunchao Zhang  * MPI_Reduce_local is not really useful because it can't handle sparse data and it vectorizes "in the wrong direction",
1740e23c03SJunchao Zhang  * therefore we pack data types manually. This file defines packing routines for the standard data types.
1840e23c03SJunchao Zhang  */
1940e23c03SJunchao Zhang 
20cd620004SJunchao Zhang #define CPPJoin4(a,b,c,d)  a##_##b##_##c##_##d
2140e23c03SJunchao Zhang 
22cd620004SJunchao Zhang /* Operations working like s += t */
23cd620004SJunchao Zhang #define OP_BINARY(op,s,t)   do {(s) = (s) op (t);  } while (0)      /* binary ops in the middle such as +, *, && etc. */
24cd620004SJunchao Zhang #define OP_FUNCTION(op,s,t) do {(s) = op((s),(t)); } while (0)      /* ops like a function, such as PetscMax, PetscMin */
25cd620004SJunchao Zhang #define OP_LXOR(op,s,t)     do {(s) = (!(s)) != (!(t));} while (0)  /* logical exclusive OR */
26cd620004SJunchao Zhang #define OP_ASSIGN(op,s,t)   do {(s) = (t);} while (0)
27cd620004SJunchao Zhang /* Ref MPI MAXLOC */
28cd620004SJunchao Zhang #define OP_XLOC(op,s,t) \
29cd620004SJunchao Zhang   do {                                       \
30cd620004SJunchao Zhang     if ((s).u == (t).u) (s).i = PetscMin((s).i,(t).i); \
31cd620004SJunchao Zhang     else if (!((s).u op (t).u)) s = t;           \
32cd620004SJunchao Zhang   } while (0)
3340e23c03SJunchao Zhang 
3440e23c03SJunchao Zhang /* DEF_PackFunc - macro defining a Pack routine
3540e23c03SJunchao Zhang 
3640e23c03SJunchao Zhang    Arguments of the macro:
37b23bfdefSJunchao Zhang    +Type      Type of the basic data in an entry, i.e., int, PetscInt, PetscReal etc. It is not the type of an entry.
38fcc7397dSJunchao Zhang    .BS        Block size for vectorization. It is a factor of bsz.
39b23bfdefSJunchao Zhang    -EQ        (bs == BS) ? 1 : 0. EQ is a compile-time const to help compiler optimizations. See below.
4040e23c03SJunchao Zhang 
4140e23c03SJunchao Zhang    Arguments of the Pack routine:
42cd620004SJunchao Zhang    +count     Number of indices in idx[].
43fcc7397dSJunchao Zhang    .start     When opt and idx are NULL, it means indices are contiguous & start is the first index; otherwise, not used.
44fcc7397dSJunchao Zhang    .opt       Per-pack optimization plan. NULL means no such plan.
45fcc7397dSJunchao Zhang    .idx       Indices of entries to packed.
46eb02082bSJunchao Zhang    .link      Provide a context for the current call, such as link->bs, number of basic types in an entry. Ex. if unit is MPI_2INT, then bs=2 and the basic type is int.
47cd620004SJunchao Zhang    .unpacked  Address of the unpacked data. The entries will be packed are unpacked[idx[i]],for i in [0,count).
48cd620004SJunchao Zhang    -packed    Address of the packed data.
4940e23c03SJunchao Zhang  */
50b23bfdefSJunchao Zhang #define DEF_PackFunc(Type,BS,EQ) \
51fcc7397dSJunchao Zhang   static PetscErrorCode CPPJoin4(Pack,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,const void *unpacked,void *packed) \
52b23bfdefSJunchao Zhang   {                                                                                                          \
5340e23c03SJunchao Zhang     PetscErrorCode ierr;                                                                                     \
54b23bfdefSJunchao Zhang     const Type     *u = (const Type*)unpacked,*u2;                                                           \
55b23bfdefSJunchao Zhang     Type           *p = (Type*)packed,*p2;                                                                   \
56fcc7397dSJunchao Zhang     PetscInt       i,j,k,X,Y,r,bs=link->bs;                                                                  \
57fcc7397dSJunchao Zhang     const PetscInt M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */          \
58b23bfdefSJunchao Zhang     const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */                \
5940e23c03SJunchao Zhang     PetscFunctionBegin;                                                                                      \
60fcc7397dSJunchao Zhang     if (!idx) {ierr = PetscArraycpy(p,u+start*MBS,MBS*count);CHKERRQ(ierr);}/* idx[] are contiguous */       \
61fcc7397dSJunchao Zhang     else if (opt) { /* has optimizations available */                                                        \
62fcc7397dSJunchao Zhang       p2 = p;                                                                                                \
63fcc7397dSJunchao Zhang       for (r=0; r<opt->n; r++) {                                                                             \
64fcc7397dSJunchao Zhang         u2 = u + opt->start[r]*MBS;                                                                          \
65fcc7397dSJunchao Zhang         X  = opt->X[r];                                                                                      \
66fcc7397dSJunchao Zhang         Y  = opt->Y[r];                                                                                      \
67fcc7397dSJunchao Zhang         for (k=0; k<opt->dz[r]; k++)                                                                         \
68fcc7397dSJunchao Zhang           for (j=0; j<opt->dy[r]; j++) {                                                                     \
69fcc7397dSJunchao Zhang             ierr = PetscArraycpy(p2,u2+(X*Y*k+X*j)*MBS,opt->dx[r]*MBS);CHKERRQ(ierr);                        \
70fcc7397dSJunchao Zhang             p2  += opt->dx[r]*MBS;                                                                           \
71fcc7397dSJunchao Zhang           }                                                                                                  \
72fcc7397dSJunchao Zhang       }                                                                                                      \
73fcc7397dSJunchao Zhang     } else {                                                                                                 \
74b23bfdefSJunchao Zhang       for (i=0; i<count; i++)                                                                                \
75eb02082bSJunchao Zhang         for (j=0; j<M; j++)     /* Decent compilers should eliminate this loop when M = const 1 */           \
76eb02082bSJunchao Zhang           for (k=0; k<BS; k++)  /* Compiler either unrolls (BS=1) or vectorizes (BS=2,4,8,etc) this loop */  \
77b23bfdefSJunchao Zhang             p[i*MBS+j*BS+k] = u[idx[i]*MBS+j*BS+k];                                                          \
7840e23c03SJunchao Zhang     }                                                                                                        \
7940e23c03SJunchao Zhang     PetscFunctionReturn(0);                                                                                  \
8040e23c03SJunchao Zhang   }
8140e23c03SJunchao Zhang 
82cd620004SJunchao Zhang /* DEF_Action - macro defining a UnpackAndInsert routine that unpacks data from a contiguous buffer
83cd620004SJunchao Zhang                 and inserts into a sparse array.
8440e23c03SJunchao Zhang 
8540e23c03SJunchao Zhang    Arguments:
86b23bfdefSJunchao Zhang   .Type       Type of the data
8740e23c03SJunchao Zhang   .BS         Block size for vectorization
88b23bfdefSJunchao Zhang   .EQ        (bs == BS) ? 1 : 0. EQ is a compile-time const.
8940e23c03SJunchao Zhang 
9040e23c03SJunchao Zhang   Notes:
9140e23c03SJunchao Zhang    This macro is not combined with DEF_ActionAndOp because we want to use memcpy in this macro.
9240e23c03SJunchao Zhang  */
93cd620004SJunchao Zhang #define DEF_UnpackFunc(Type,BS,EQ)               \
94fcc7397dSJunchao Zhang   static PetscErrorCode CPPJoin4(UnpackAndInsert,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,void *unpacked,const void *packed) \
95b23bfdefSJunchao Zhang   {                                                                                                          \
9640e23c03SJunchao Zhang     PetscErrorCode ierr;                                                                                     \
97b23bfdefSJunchao Zhang     Type           *u = (Type*)unpacked,*u2;                                                                 \
98fcc7397dSJunchao Zhang     const Type     *p = (const Type*)packed;                                                                 \
99fcc7397dSJunchao Zhang     PetscInt       i,j,k,X,Y,r,bs=link->bs;                                                                  \
100fcc7397dSJunchao Zhang     const PetscInt M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */          \
101b23bfdefSJunchao Zhang     const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */                \
10240e23c03SJunchao Zhang     PetscFunctionBegin;                                                                                      \
103b23bfdefSJunchao Zhang     if (!idx) {                                                                                              \
104fcc7397dSJunchao Zhang       u += start*MBS;                                                                                        \
105fcc7397dSJunchao Zhang       if (u != p) {ierr = PetscArraycpy(u,p,count*MBS);CHKERRQ(ierr);}                                       \
106fcc7397dSJunchao Zhang     } else if (opt) { /* has optimizations available */                                                      \
107fcc7397dSJunchao Zhang       for (r=0; r<opt->n; r++) {                                                                             \
108fcc7397dSJunchao Zhang         u2 = u + opt->start[r]*MBS;                                                                          \
109fcc7397dSJunchao Zhang         X  = opt->X[r];                                                                                      \
110fcc7397dSJunchao Zhang         Y  = opt->Y[r];                                                                                      \
111fcc7397dSJunchao Zhang         for (k=0; k<opt->dz[r]; k++)                                                                         \
112fcc7397dSJunchao Zhang           for (j=0; j<opt->dy[r]; j++) {                                                                     \
113fcc7397dSJunchao Zhang             ierr = PetscArraycpy(u2+(X*Y*k+X*j)*MBS,p,opt->dx[r]*MBS);CHKERRQ(ierr);                         \
114fcc7397dSJunchao Zhang             p   += opt->dx[r]*MBS;                                                                           \
115fcc7397dSJunchao Zhang           }                                                                                                  \
116fcc7397dSJunchao Zhang       }                                                                                                      \
117fcc7397dSJunchao Zhang     } else {                                                                                                 \
118b23bfdefSJunchao Zhang       for (i=0; i<count; i++)                                                                                \
119b23bfdefSJunchao Zhang         for (j=0; j<M; j++)                                                                                  \
120cd620004SJunchao Zhang           for (k=0; k<BS; k++) u[idx[i]*MBS+j*BS+k] = p[i*MBS+j*BS+k];                                       \
12140e23c03SJunchao Zhang     }                                                                                                        \
12240e23c03SJunchao Zhang     PetscFunctionReturn(0);                                                                                  \
12340e23c03SJunchao Zhang   }
12440e23c03SJunchao Zhang 
125cd620004SJunchao Zhang /* DEF_UnpackAndOp - macro defining a UnpackAndOp routine where Op should not be Insert
12640e23c03SJunchao Zhang 
12740e23c03SJunchao Zhang    Arguments:
128cd620004SJunchao Zhang   +Opname     Name of the Op, such as Add, Mult, LAND, etc.
129b23bfdefSJunchao Zhang   .Type       Type of the data
13040e23c03SJunchao Zhang   .BS         Block size for vectorization
131b23bfdefSJunchao Zhang   .EQ         (bs == BS) ? 1 : 0. EQ is a compile-time const.
132cd620004SJunchao Zhang   .Op         Operator for the op, such as +, *, &&, ||, PetscMax, PetscMin, etc.
133cd620004SJunchao Zhang   .OpApply    Macro defining application of the op. Could be OP_BINARY, OP_FUNCTION, OP_LXOR
13440e23c03SJunchao Zhang  */
135cd620004SJunchao Zhang #define DEF_UnpackAndOp(Type,BS,EQ,Opname,Op,OpApply) \
136fcc7397dSJunchao Zhang   static PetscErrorCode CPPJoin4(UnpackAnd##Opname,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,void *unpacked,const void *packed) \
137b23bfdefSJunchao Zhang   {                                                                                                          \
138cd620004SJunchao Zhang     Type           *u = (Type*)unpacked,*u2;                                                                 \
139fcc7397dSJunchao Zhang     const Type     *p = (const Type*)packed;                                                                 \
140fcc7397dSJunchao Zhang     PetscInt       i,j,k,X,Y,r,bs=link->bs;                                                                  \
141fcc7397dSJunchao Zhang     const PetscInt M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */          \
142b23bfdefSJunchao Zhang     const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */                \
14340e23c03SJunchao Zhang     PetscFunctionBegin;                                                                                      \
144b23bfdefSJunchao Zhang     if (!idx) {                                                                                              \
145fcc7397dSJunchao Zhang       u += start*MBS;                                                                                        \
146cd620004SJunchao Zhang       for (i=0; i<count; i++)                                                                                \
147cd620004SJunchao Zhang         for (j=0; j<M; j++)                                                                                  \
148cd620004SJunchao Zhang           for (k=0; k<BS; k++)                                                                               \
149cd620004SJunchao Zhang             OpApply(Op,u[i*MBS+j*BS+k],p[i*MBS+j*BS+k]);                                                     \
150fcc7397dSJunchao Zhang     } else if (opt) { /* idx[] has patterns */                                                               \
151fcc7397dSJunchao Zhang       for (r=0; r<opt->n; r++) {                                                                             \
152fcc7397dSJunchao Zhang         u2 = u + opt->start[r]*MBS;                                                                          \
153fcc7397dSJunchao Zhang         X  = opt->X[r];                                                                                      \
154fcc7397dSJunchao Zhang         Y  = opt->Y[r];                                                                                      \
155fcc7397dSJunchao Zhang         for (k=0; k<opt->dz[r]; k++)                                                                         \
156fcc7397dSJunchao Zhang           for (j=0; j<opt->dy[r]; j++) {                                                                     \
157fcc7397dSJunchao Zhang             for (i=0; i<opt->dx[r]*MBS; i++) OpApply(Op,u2[(X*Y*k+X*j)*MBS+i],p[i]);                         \
158fcc7397dSJunchao Zhang             p += opt->dx[r]*MBS;                                                                             \
159fcc7397dSJunchao Zhang           }                                                                                                  \
160fcc7397dSJunchao Zhang       }                                                                                                      \
161fcc7397dSJunchao Zhang     } else {                                                                                                 \
162cd620004SJunchao Zhang       for (i=0; i<count; i++)                                                                                \
163cd620004SJunchao Zhang         for (j=0; j<M; j++)                                                                                  \
164cd620004SJunchao Zhang           for (k=0; k<BS; k++)                                                                               \
165cd620004SJunchao Zhang             OpApply(Op,u[idx[i]*MBS+j*BS+k],p[i*MBS+j*BS+k]);                                                \
166cd620004SJunchao Zhang     }                                                                                                        \
167cd620004SJunchao Zhang     PetscFunctionReturn(0);                                                                                  \
168cd620004SJunchao Zhang   }
169cd620004SJunchao Zhang 
170cd620004SJunchao Zhang #define DEF_FetchAndOp(Type,BS,EQ,Opname,Op,OpApply) \
171fcc7397dSJunchao Zhang   static PetscErrorCode CPPJoin4(FetchAnd##Opname,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,void *unpacked,void *packed) \
172cd620004SJunchao Zhang   {                                                                                                          \
173fcc7397dSJunchao Zhang     Type           *u = (Type*)unpacked,*p = (Type*)packed,tmp;                                              \
174fcc7397dSJunchao Zhang     PetscInt       i,j,k,r,l,bs=link->bs;                                                                    \
175fcc7397dSJunchao Zhang     const PetscInt M = (EQ) ? 1 : bs/BS;                                                                     \
176fcc7397dSJunchao Zhang     const PetscInt MBS = M*BS;                                                                               \
177cd620004SJunchao Zhang     PetscFunctionBegin;                                                                                      \
178fcc7397dSJunchao Zhang     for (i=0; i<count; i++) {                                                                                \
179fcc7397dSJunchao Zhang       r = (!idx ? start+i : idx[i])*MBS;                                                                     \
180fcc7397dSJunchao Zhang       l = i*MBS;                                                                                             \
181b23bfdefSJunchao Zhang       for (j=0; j<M; j++)                                                                                    \
182b23bfdefSJunchao Zhang         for (k=0; k<BS; k++) {                                                                               \
183fcc7397dSJunchao Zhang           tmp = u[r+j*BS+k];                                                                                 \
184fcc7397dSJunchao Zhang           OpApply(Op,u[r+j*BS+k],p[l+j*BS+k]);                                                               \
185fcc7397dSJunchao Zhang           p[l+j*BS+k] = tmp;                                                                                 \
186cd620004SJunchao Zhang         }                                                                                                    \
187cd620004SJunchao Zhang     }                                                                                                        \
188cd620004SJunchao Zhang     PetscFunctionReturn(0);                                                                                  \
189cd620004SJunchao Zhang   }
190cd620004SJunchao Zhang 
191cd620004SJunchao Zhang #define DEF_ScatterAndOp(Type,BS,EQ,Opname,Op,OpApply) \
192fcc7397dSJunchao Zhang   static PetscErrorCode CPPJoin4(ScatterAnd##Opname,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt srcStart,PetscSFPackOpt srcOpt,const PetscInt *srcIdx,const void *src,PetscInt dstStart,PetscSFPackOpt dstOpt,const PetscInt *dstIdx,void *dst) \
193cd620004SJunchao Zhang   {                                                                                                          \
194fcc7397dSJunchao Zhang     PetscErrorCode ierr;                                                                                     \
195fcc7397dSJunchao Zhang     const Type     *u = (const Type*)src;                                                                    \
196fcc7397dSJunchao Zhang     Type           *v = (Type*)dst;                                                                          \
197fcc7397dSJunchao Zhang     PetscInt       i,j,k,s,t,X,Y,bs = link->bs;                                                              \
198cd620004SJunchao Zhang     const PetscInt M = (EQ) ? 1 : bs/BS;                                                                     \
199cd620004SJunchao Zhang     const PetscInt MBS = M*BS;                                                                               \
200cd620004SJunchao Zhang     PetscFunctionBegin;                                                                                      \
201fcc7397dSJunchao Zhang     if (!srcIdx) { /* src is contiguous */                                                                   \
202fcc7397dSJunchao Zhang       u += srcStart*MBS;                                                                                     \
203fcc7397dSJunchao Zhang       ierr = CPPJoin4(UnpackAnd##Opname,Type,BS,EQ)(link,count,dstStart,dstOpt,dstIdx,dst,u);CHKERRQ(ierr);  \
204fcc7397dSJunchao Zhang     } else if (srcOpt && !dstIdx) { /* src is 3D, dst is contiguous */                                       \
205fcc7397dSJunchao Zhang       u += srcOpt->start[0]*MBS;                                                                             \
206fcc7397dSJunchao Zhang       v += dstStart*MBS;                                                                                     \
207fcc7397dSJunchao Zhang       X  = srcOpt->X[0]; Y = srcOpt->Y[0];                                                                   \
208fcc7397dSJunchao Zhang       for (k=0; k<srcOpt->dz[0]; k++)                                                                        \
209fcc7397dSJunchao Zhang         for (j=0; j<srcOpt->dy[0]; j++) {                                                                    \
210fcc7397dSJunchao Zhang           for (i=0; i<srcOpt->dx[0]*MBS; i++) OpApply(Op,v[i],u[(X*Y*k+X*j)*MBS+i]);                         \
211fcc7397dSJunchao Zhang           v += srcOpt->dx[0]*MBS;                                                                            \
212fcc7397dSJunchao Zhang         }                                                                                                    \
213fcc7397dSJunchao Zhang     } else { /* all other cases */                                                                           \
214fcc7397dSJunchao Zhang       for (i=0; i<count; i++) {                                                                              \
215fcc7397dSJunchao Zhang         s = (!srcIdx ? srcStart+i : srcIdx[i])*MBS;                                                          \
216fcc7397dSJunchao Zhang         t = (!dstIdx ? dstStart+i : dstIdx[i])*MBS;                                                          \
217cd620004SJunchao Zhang         for (j=0; j<M; j++)                                                                                  \
218fcc7397dSJunchao Zhang           for (k=0; k<BS; k++) OpApply(Op,v[t+j*BS+k],u[s+j*BS+k]);                                          \
219fcc7397dSJunchao Zhang       }                                                                                                      \
220cd620004SJunchao Zhang     }                                                                                                        \
221cd620004SJunchao Zhang     PetscFunctionReturn(0);                                                                                  \
222cd620004SJunchao Zhang   }
223cd620004SJunchao Zhang 
224cd620004SJunchao Zhang #define DEF_FetchAndOpLocal(Type,BS,EQ,Opname,Op,OpApply) \
225fcc7397dSJunchao Zhang   static PetscErrorCode CPPJoin4(FetchAnd##Opname##Local,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt rootstart,PetscSFPackOpt rootopt,const PetscInt *rootidx,void *rootdata,PetscInt leafstart,PetscSFPackOpt leafopt,const PetscInt *leafidx,const void *leafdata,void *leafupdate) \
226cd620004SJunchao Zhang   {                                                                                                          \
227fcc7397dSJunchao Zhang     Type           *rdata = (Type*)rootdata,*lupdate = (Type*)leafupdate;                                    \
228fcc7397dSJunchao Zhang     const Type     *ldata = (const Type*)leafdata;                                                           \
229fcc7397dSJunchao Zhang     PetscInt       i,j,k,r,l,bs = link->bs;                                                                  \
230cd620004SJunchao Zhang     const PetscInt M = (EQ) ? 1 : bs/BS;                                                                     \
231cd620004SJunchao Zhang     const PetscInt MBS = M*BS;                                                                               \
232cd620004SJunchao Zhang     PetscFunctionBegin;                                                                                      \
233fcc7397dSJunchao Zhang     for (i=0; i<count; i++) {                                                                                \
234fcc7397dSJunchao Zhang       r = (rootidx ? rootidx[i] : rootstart+i)*MBS;                                                          \
235fcc7397dSJunchao Zhang       l = (leafidx ? leafidx[i] : leafstart+i)*MBS;                                                          \
236cd620004SJunchao Zhang       for (j=0; j<M; j++)                                                                                    \
237cd620004SJunchao Zhang         for (k=0; k<BS; k++) {                                                                               \
238fcc7397dSJunchao Zhang           lupdate[l+j*BS+k] = rdata[r+j*BS+k];                                                               \
239fcc7397dSJunchao Zhang           OpApply(Op,rdata[r+j*BS+k],ldata[l+j*BS+k]);                                                       \
24040e23c03SJunchao Zhang         }                                                                                                    \
24140e23c03SJunchao Zhang     }                                                                                                        \
24240e23c03SJunchao Zhang     PetscFunctionReturn(0);                                                                                  \
24340e23c03SJunchao Zhang   }
24440e23c03SJunchao Zhang 
245b23bfdefSJunchao Zhang /* Pack, Unpack/Fetch ops */
246b23bfdefSJunchao Zhang #define DEF_Pack(Type,BS,EQ)                                                      \
247b23bfdefSJunchao Zhang   DEF_PackFunc(Type,BS,EQ)                                                        \
248cd620004SJunchao Zhang   DEF_UnpackFunc(Type,BS,EQ)                                                      \
249cd620004SJunchao Zhang   DEF_ScatterAndOp(Type,BS,EQ,Insert,=,OP_ASSIGN)                                 \
250cd620004SJunchao Zhang   static void CPPJoin4(PackInit_Pack,Type,BS,EQ)(PetscSFLink link) {              \
251eb02082bSJunchao Zhang     link->h_Pack            = CPPJoin4(Pack,           Type,BS,EQ);               \
252eb02082bSJunchao Zhang     link->h_UnpackAndInsert = CPPJoin4(UnpackAndInsert,Type,BS,EQ);               \
253cd620004SJunchao Zhang     link->h_ScatterAndInsert= CPPJoin4(ScatterAndInsert,Type,BS,EQ);              \
25440e23c03SJunchao Zhang   }
25540e23c03SJunchao Zhang 
256b23bfdefSJunchao Zhang /* Add, Mult ops */
257b23bfdefSJunchao Zhang #define DEF_Add(Type,BS,EQ)                                                       \
258cd620004SJunchao Zhang   DEF_UnpackAndOp    (Type,BS,EQ,Add, +,OP_BINARY)                                \
259cd620004SJunchao Zhang   DEF_UnpackAndOp    (Type,BS,EQ,Mult,*,OP_BINARY)                                \
260cd620004SJunchao Zhang   DEF_FetchAndOp     (Type,BS,EQ,Add, +,OP_BINARY)                                \
261cd620004SJunchao Zhang   DEF_ScatterAndOp   (Type,BS,EQ,Add, +,OP_BINARY)                                \
262cd620004SJunchao Zhang   DEF_ScatterAndOp   (Type,BS,EQ,Mult,*,OP_BINARY)                                \
263cd620004SJunchao Zhang   DEF_FetchAndOpLocal(Type,BS,EQ,Add, +,OP_BINARY)                                \
264cd620004SJunchao Zhang   static void CPPJoin4(PackInit_Add,Type,BS,EQ)(PetscSFLink link) {               \
265eb02082bSJunchao Zhang     link->h_UnpackAndAdd     = CPPJoin4(UnpackAndAdd,    Type,BS,EQ);             \
266eb02082bSJunchao Zhang     link->h_UnpackAndMult    = CPPJoin4(UnpackAndMult,   Type,BS,EQ);             \
267eb02082bSJunchao Zhang     link->h_FetchAndAdd      = CPPJoin4(FetchAndAdd,     Type,BS,EQ);             \
268cd620004SJunchao Zhang     link->h_ScatterAndAdd    = CPPJoin4(ScatterAndAdd,   Type,BS,EQ);             \
269cd620004SJunchao Zhang     link->h_ScatterAndMult   = CPPJoin4(ScatterAndMult,  Type,BS,EQ);             \
270cd620004SJunchao Zhang     link->h_FetchAndAddLocal = CPPJoin4(FetchAndAddLocal,Type,BS,EQ);             \
27140e23c03SJunchao Zhang   }
27240e23c03SJunchao Zhang 
273b23bfdefSJunchao Zhang /* Max, Min ops */
274b23bfdefSJunchao Zhang #define DEF_Cmp(Type,BS,EQ)                                                       \
275cd620004SJunchao Zhang   DEF_UnpackAndOp (Type,BS,EQ,Max,PetscMax,OP_FUNCTION)                           \
276cd620004SJunchao Zhang   DEF_UnpackAndOp (Type,BS,EQ,Min,PetscMin,OP_FUNCTION)                           \
277cd620004SJunchao Zhang   DEF_ScatterAndOp(Type,BS,EQ,Max,PetscMax,OP_FUNCTION)                           \
278cd620004SJunchao Zhang   DEF_ScatterAndOp(Type,BS,EQ,Min,PetscMin,OP_FUNCTION)                           \
279cd620004SJunchao Zhang   static void CPPJoin4(PackInit_Compare,Type,BS,EQ)(PetscSFLink link) {           \
280eb02082bSJunchao Zhang     link->h_UnpackAndMax    = CPPJoin4(UnpackAndMax,   Type,BS,EQ);               \
281eb02082bSJunchao Zhang     link->h_UnpackAndMin    = CPPJoin4(UnpackAndMin,   Type,BS,EQ);               \
282cd620004SJunchao Zhang     link->h_ScatterAndMax   = CPPJoin4(ScatterAndMax,  Type,BS,EQ);               \
283cd620004SJunchao Zhang     link->h_ScatterAndMin   = CPPJoin4(ScatterAndMin,  Type,BS,EQ);               \
284b23bfdefSJunchao Zhang   }
285b23bfdefSJunchao Zhang 
286b23bfdefSJunchao Zhang /* Logical ops.
287cd620004SJunchao Zhang   The operator in OP_LXOR should be empty but is ||. It is not used. Put here to avoid
28840e23c03SJunchao Zhang   the compilation warning "empty macro arguments are undefined in ISO C90"
28940e23c03SJunchao Zhang  */
290b23bfdefSJunchao Zhang #define DEF_Log(Type,BS,EQ)                                                       \
291cd620004SJunchao Zhang   DEF_UnpackAndOp (Type,BS,EQ,LAND,&&,OP_BINARY)                                  \
292cd620004SJunchao Zhang   DEF_UnpackAndOp (Type,BS,EQ,LOR, ||,OP_BINARY)                                  \
293cd620004SJunchao Zhang   DEF_UnpackAndOp (Type,BS,EQ,LXOR,||, OP_LXOR)                                   \
294cd620004SJunchao Zhang   DEF_ScatterAndOp(Type,BS,EQ,LAND,&&,OP_BINARY)                                  \
295cd620004SJunchao Zhang   DEF_ScatterAndOp(Type,BS,EQ,LOR, ||,OP_BINARY)                                  \
296cd620004SJunchao Zhang   DEF_ScatterAndOp(Type,BS,EQ,LXOR,||, OP_LXOR)                                   \
297cd620004SJunchao Zhang   static void CPPJoin4(PackInit_Logical,Type,BS,EQ)(PetscSFLink link) {           \
298eb02082bSJunchao Zhang     link->h_UnpackAndLAND   = CPPJoin4(UnpackAndLAND, Type,BS,EQ);                \
299eb02082bSJunchao Zhang     link->h_UnpackAndLOR    = CPPJoin4(UnpackAndLOR,  Type,BS,EQ);                \
300eb02082bSJunchao Zhang     link->h_UnpackAndLXOR   = CPPJoin4(UnpackAndLXOR, Type,BS,EQ);                \
301cd620004SJunchao Zhang     link->h_ScatterAndLAND  = CPPJoin4(ScatterAndLAND,Type,BS,EQ);                \
302cd620004SJunchao Zhang     link->h_ScatterAndLOR   = CPPJoin4(ScatterAndLOR, Type,BS,EQ);                \
303cd620004SJunchao Zhang     link->h_ScatterAndLXOR  = CPPJoin4(ScatterAndLXOR,Type,BS,EQ);                \
30440e23c03SJunchao Zhang   }
30540e23c03SJunchao Zhang 
306b23bfdefSJunchao Zhang /* Bitwise ops */
307b23bfdefSJunchao Zhang #define DEF_Bit(Type,BS,EQ)                                                       \
308cd620004SJunchao Zhang   DEF_UnpackAndOp (Type,BS,EQ,BAND,&,OP_BINARY)                                   \
309cd620004SJunchao Zhang   DEF_UnpackAndOp (Type,BS,EQ,BOR, |,OP_BINARY)                                   \
310cd620004SJunchao Zhang   DEF_UnpackAndOp (Type,BS,EQ,BXOR,^,OP_BINARY)                                   \
311cd620004SJunchao Zhang   DEF_ScatterAndOp(Type,BS,EQ,BAND,&,OP_BINARY)                                   \
312cd620004SJunchao Zhang   DEF_ScatterAndOp(Type,BS,EQ,BOR, |,OP_BINARY)                                   \
313cd620004SJunchao Zhang   DEF_ScatterAndOp(Type,BS,EQ,BXOR,^,OP_BINARY)                                   \
314cd620004SJunchao Zhang   static void CPPJoin4(PackInit_Bitwise,Type,BS,EQ)(PetscSFLink link) {           \
315eb02082bSJunchao Zhang     link->h_UnpackAndBAND   = CPPJoin4(UnpackAndBAND, Type,BS,EQ);                \
316eb02082bSJunchao Zhang     link->h_UnpackAndBOR    = CPPJoin4(UnpackAndBOR,  Type,BS,EQ);                \
317eb02082bSJunchao Zhang     link->h_UnpackAndBXOR   = CPPJoin4(UnpackAndBXOR, Type,BS,EQ);                \
318cd620004SJunchao Zhang     link->h_ScatterAndBAND  = CPPJoin4(ScatterAndBAND,Type,BS,EQ);                \
319cd620004SJunchao Zhang     link->h_ScatterAndBOR   = CPPJoin4(ScatterAndBOR, Type,BS,EQ);                \
320cd620004SJunchao Zhang     link->h_ScatterAndBXOR  = CPPJoin4(ScatterAndBXOR,Type,BS,EQ);                \
32140e23c03SJunchao Zhang   }
32240e23c03SJunchao Zhang 
323cd620004SJunchao Zhang /* Maxloc, Minloc ops */
324cd620004SJunchao Zhang #define DEF_Xloc(Type,BS,EQ)                                                      \
325cd620004SJunchao Zhang   DEF_UnpackAndOp (Type,BS,EQ,Max,>,OP_XLOC)                                      \
326cd620004SJunchao Zhang   DEF_UnpackAndOp (Type,BS,EQ,Min,<,OP_XLOC)                                      \
327cd620004SJunchao Zhang   DEF_ScatterAndOp(Type,BS,EQ,Max,>,OP_XLOC)                                      \
328cd620004SJunchao Zhang   DEF_ScatterAndOp(Type,BS,EQ,Min,<,OP_XLOC)                                      \
329cd620004SJunchao Zhang   static void CPPJoin4(PackInit_Xloc,Type,BS,EQ)(PetscSFLink link) {              \
330cd620004SJunchao Zhang     link->h_UnpackAndMaxloc  = CPPJoin4(UnpackAndMax, Type,BS,EQ);                \
331cd620004SJunchao Zhang     link->h_UnpackAndMinloc  = CPPJoin4(UnpackAndMin, Type,BS,EQ);                \
332cd620004SJunchao Zhang     link->h_ScatterAndMaxloc = CPPJoin4(ScatterAndMax,Type,BS,EQ);                \
333cd620004SJunchao Zhang     link->h_ScatterAndMinloc = CPPJoin4(ScatterAndMin,Type,BS,EQ);                \
33440e23c03SJunchao Zhang   }
33540e23c03SJunchao Zhang 
336b23bfdefSJunchao Zhang #define DEF_IntegerType(Type,BS,EQ)                                               \
337b23bfdefSJunchao Zhang   DEF_Pack(Type,BS,EQ)                                                            \
338b23bfdefSJunchao Zhang   DEF_Add(Type,BS,EQ)                                                             \
339b23bfdefSJunchao Zhang   DEF_Cmp(Type,BS,EQ)                                                             \
340b23bfdefSJunchao Zhang   DEF_Log(Type,BS,EQ)                                                             \
341b23bfdefSJunchao Zhang   DEF_Bit(Type,BS,EQ)                                                             \
342cd620004SJunchao Zhang   static void CPPJoin4(PackInit_IntegerType,Type,BS,EQ)(PetscSFLink link) {       \
343b23bfdefSJunchao Zhang     CPPJoin4(PackInit_Pack,Type,BS,EQ)(link);                                     \
344b23bfdefSJunchao Zhang     CPPJoin4(PackInit_Add,Type,BS,EQ)(link);                                      \
345b23bfdefSJunchao Zhang     CPPJoin4(PackInit_Compare,Type,BS,EQ)(link);                                  \
346b23bfdefSJunchao Zhang     CPPJoin4(PackInit_Logical,Type,BS,EQ)(link);                                  \
347b23bfdefSJunchao Zhang     CPPJoin4(PackInit_Bitwise,Type,BS,EQ)(link);                                  \
34840e23c03SJunchao Zhang   }
34940e23c03SJunchao Zhang 
350b23bfdefSJunchao Zhang #define DEF_RealType(Type,BS,EQ)                                                  \
351b23bfdefSJunchao Zhang   DEF_Pack(Type,BS,EQ)                                                            \
352b23bfdefSJunchao Zhang   DEF_Add(Type,BS,EQ)                                                             \
353b23bfdefSJunchao Zhang   DEF_Cmp(Type,BS,EQ)                                                             \
354cd620004SJunchao Zhang   static void CPPJoin4(PackInit_RealType,Type,BS,EQ)(PetscSFLink link) {          \
355b23bfdefSJunchao Zhang     CPPJoin4(PackInit_Pack,Type,BS,EQ)(link);                                     \
356b23bfdefSJunchao Zhang     CPPJoin4(PackInit_Add,Type,BS,EQ)(link);                                      \
357b23bfdefSJunchao Zhang     CPPJoin4(PackInit_Compare,Type,BS,EQ)(link);                                  \
358b23bfdefSJunchao Zhang   }
35940e23c03SJunchao Zhang 
36040e23c03SJunchao Zhang #if defined(PETSC_HAVE_COMPLEX)
361b23bfdefSJunchao Zhang #define DEF_ComplexType(Type,BS,EQ)                                               \
362b23bfdefSJunchao Zhang   DEF_Pack(Type,BS,EQ)                                                            \
363b23bfdefSJunchao Zhang   DEF_Add(Type,BS,EQ)                                                             \
364cd620004SJunchao Zhang   static void CPPJoin4(PackInit_ComplexType,Type,BS,EQ)(PetscSFLink link) {       \
365b23bfdefSJunchao Zhang     CPPJoin4(PackInit_Pack,Type,BS,EQ)(link);                                     \
366b23bfdefSJunchao Zhang     CPPJoin4(PackInit_Add,Type,BS,EQ)(link);                                      \
367b23bfdefSJunchao Zhang   }
36840e23c03SJunchao Zhang #endif
369b23bfdefSJunchao Zhang 
370b23bfdefSJunchao Zhang #define DEF_DumbType(Type,BS,EQ)                                                  \
371b23bfdefSJunchao Zhang   DEF_Pack(Type,BS,EQ)                                                            \
372cd620004SJunchao Zhang   static void CPPJoin4(PackInit_DumbType,Type,BS,EQ)(PetscSFLink link) {          \
373b23bfdefSJunchao Zhang     CPPJoin4(PackInit_Pack,Type,BS,EQ)(link);                                     \
374b23bfdefSJunchao Zhang   }
375b23bfdefSJunchao Zhang 
376b23bfdefSJunchao Zhang /* Maxloc, Minloc */
377cd620004SJunchao Zhang #define DEF_PairType(Type,BS,EQ)                                                  \
378cd620004SJunchao Zhang   DEF_Pack(Type,BS,EQ)                                                            \
379cd620004SJunchao Zhang   DEF_Xloc(Type,BS,EQ)                                                            \
380cd620004SJunchao Zhang   static void CPPJoin4(PackInit_PairType,Type,BS,EQ)(PetscSFLink link) {          \
381cd620004SJunchao Zhang     CPPJoin4(PackInit_Pack,Type,BS,EQ)(link);                                     \
382cd620004SJunchao Zhang     CPPJoin4(PackInit_Xloc,Type,BS,EQ)(link);                                     \
383b23bfdefSJunchao Zhang   }
384b23bfdefSJunchao Zhang 
385b23bfdefSJunchao Zhang DEF_IntegerType(PetscInt,1,1) /* unit = 1 MPIU_INT  */
386b23bfdefSJunchao Zhang DEF_IntegerType(PetscInt,2,1) /* unit = 2 MPIU_INTs */
387b23bfdefSJunchao Zhang DEF_IntegerType(PetscInt,4,1) /* unit = 4 MPIU_INTs */
388b23bfdefSJunchao Zhang DEF_IntegerType(PetscInt,8,1) /* unit = 8 MPIU_INTs */
389b23bfdefSJunchao Zhang DEF_IntegerType(PetscInt,1,0) /* unit = 1*n MPIU_INTs, n>1 */
390b23bfdefSJunchao Zhang DEF_IntegerType(PetscInt,2,0) /* unit = 2*n MPIU_INTs, n>1 */
391b23bfdefSJunchao Zhang DEF_IntegerType(PetscInt,4,0) /* unit = 4*n MPIU_INTs, n>1 */
392b23bfdefSJunchao Zhang DEF_IntegerType(PetscInt,8,0) /* unit = 8*n MPIU_INTs, n>1. Routines with bigger BS are tried first. */
393b23bfdefSJunchao Zhang 
394b23bfdefSJunchao Zhang #if defined(PETSC_USE_64BIT_INDICES) /* Do not need (though it is OK) to generate redundant functions if PetscInt is int */
395b23bfdefSJunchao Zhang DEF_IntegerType(int,1,1)
396b23bfdefSJunchao Zhang DEF_IntegerType(int,2,1)
397b23bfdefSJunchao Zhang DEF_IntegerType(int,4,1)
398b23bfdefSJunchao Zhang DEF_IntegerType(int,8,1)
399b23bfdefSJunchao Zhang DEF_IntegerType(int,1,0)
400b23bfdefSJunchao Zhang DEF_IntegerType(int,2,0)
401b23bfdefSJunchao Zhang DEF_IntegerType(int,4,0)
402b23bfdefSJunchao Zhang DEF_IntegerType(int,8,0)
403b23bfdefSJunchao Zhang #endif
404b23bfdefSJunchao Zhang 
405b23bfdefSJunchao Zhang /* The typedefs are used to get a typename without space that CPPJoin can handle */
406b23bfdefSJunchao Zhang typedef signed char SignedChar;
407b23bfdefSJunchao Zhang DEF_IntegerType(SignedChar,1,1)
408b23bfdefSJunchao Zhang DEF_IntegerType(SignedChar,2,1)
409b23bfdefSJunchao Zhang DEF_IntegerType(SignedChar,4,1)
410b23bfdefSJunchao Zhang DEF_IntegerType(SignedChar,8,1)
411b23bfdefSJunchao Zhang DEF_IntegerType(SignedChar,1,0)
412b23bfdefSJunchao Zhang DEF_IntegerType(SignedChar,2,0)
413b23bfdefSJunchao Zhang DEF_IntegerType(SignedChar,4,0)
414b23bfdefSJunchao Zhang DEF_IntegerType(SignedChar,8,0)
415b23bfdefSJunchao Zhang 
416b23bfdefSJunchao Zhang typedef unsigned char UnsignedChar;
417b23bfdefSJunchao Zhang DEF_IntegerType(UnsignedChar,1,1)
418b23bfdefSJunchao Zhang DEF_IntegerType(UnsignedChar,2,1)
419b23bfdefSJunchao Zhang DEF_IntegerType(UnsignedChar,4,1)
420b23bfdefSJunchao Zhang DEF_IntegerType(UnsignedChar,8,1)
421b23bfdefSJunchao Zhang DEF_IntegerType(UnsignedChar,1,0)
422b23bfdefSJunchao Zhang DEF_IntegerType(UnsignedChar,2,0)
423b23bfdefSJunchao Zhang DEF_IntegerType(UnsignedChar,4,0)
424b23bfdefSJunchao Zhang DEF_IntegerType(UnsignedChar,8,0)
425b23bfdefSJunchao Zhang 
426b23bfdefSJunchao Zhang DEF_RealType(PetscReal,1,1)
427b23bfdefSJunchao Zhang DEF_RealType(PetscReal,2,1)
428b23bfdefSJunchao Zhang DEF_RealType(PetscReal,4,1)
429b23bfdefSJunchao Zhang DEF_RealType(PetscReal,8,1)
430b23bfdefSJunchao Zhang DEF_RealType(PetscReal,1,0)
431b23bfdefSJunchao Zhang DEF_RealType(PetscReal,2,0)
432b23bfdefSJunchao Zhang DEF_RealType(PetscReal,4,0)
433b23bfdefSJunchao Zhang DEF_RealType(PetscReal,8,0)
434b23bfdefSJunchao Zhang 
435b23bfdefSJunchao Zhang #if defined(PETSC_HAVE_COMPLEX)
436b23bfdefSJunchao Zhang DEF_ComplexType(PetscComplex,1,1)
437b23bfdefSJunchao Zhang DEF_ComplexType(PetscComplex,2,1)
438b23bfdefSJunchao Zhang DEF_ComplexType(PetscComplex,4,1)
439b23bfdefSJunchao Zhang DEF_ComplexType(PetscComplex,8,1)
440b23bfdefSJunchao Zhang DEF_ComplexType(PetscComplex,1,0)
441b23bfdefSJunchao Zhang DEF_ComplexType(PetscComplex,2,0)
442b23bfdefSJunchao Zhang DEF_ComplexType(PetscComplex,4,0)
443b23bfdefSJunchao Zhang DEF_ComplexType(PetscComplex,8,0)
444b23bfdefSJunchao Zhang #endif
445b23bfdefSJunchao Zhang 
446cd620004SJunchao Zhang #define PairType(Type1,Type2) Type1##_##Type2
447cd620004SJunchao Zhang typedef struct {int u; int i;}           PairType(int,int);
448cd620004SJunchao Zhang typedef struct {PetscInt u; PetscInt i;} PairType(PetscInt,PetscInt);
449cd620004SJunchao Zhang DEF_PairType(PairType(int,int),1,1)
450cd620004SJunchao Zhang DEF_PairType(PairType(PetscInt,PetscInt),1,1)
451b23bfdefSJunchao Zhang 
452b23bfdefSJunchao Zhang /* If we don't know the basic type, we treat it as a stream of chars or ints */
453b23bfdefSJunchao Zhang DEF_DumbType(char,1,1)
454b23bfdefSJunchao Zhang DEF_DumbType(char,2,1)
455b23bfdefSJunchao Zhang DEF_DumbType(char,4,1)
456b23bfdefSJunchao Zhang DEF_DumbType(char,1,0)
457b23bfdefSJunchao Zhang DEF_DumbType(char,2,0)
458b23bfdefSJunchao Zhang DEF_DumbType(char,4,0)
459b23bfdefSJunchao Zhang 
460eb02082bSJunchao Zhang typedef int DumbInt; /* To have a different name than 'int' used above. The name is used to make routine names. */
461b23bfdefSJunchao Zhang DEF_DumbType(DumbInt,1,1)
462b23bfdefSJunchao Zhang DEF_DumbType(DumbInt,2,1)
463b23bfdefSJunchao Zhang DEF_DumbType(DumbInt,4,1)
464b23bfdefSJunchao Zhang DEF_DumbType(DumbInt,8,1)
465b23bfdefSJunchao Zhang DEF_DumbType(DumbInt,1,0)
466b23bfdefSJunchao Zhang DEF_DumbType(DumbInt,2,0)
467b23bfdefSJunchao Zhang DEF_DumbType(DumbInt,4,0)
468b23bfdefSJunchao Zhang DEF_DumbType(DumbInt,8,0)
46940e23c03SJunchao Zhang 
47040e23c03SJunchao Zhang #if !defined(PETSC_HAVE_MPI_TYPE_DUP)
47140e23c03SJunchao Zhang PETSC_STATIC_INLINE int MPI_Type_dup(MPI_Datatype datatype,MPI_Datatype *newtype)
47240e23c03SJunchao Zhang {
47340e23c03SJunchao Zhang   int ierr;
47440e23c03SJunchao Zhang   ierr = MPI_Type_contiguous(1,datatype,newtype); if (ierr) return ierr;
47540e23c03SJunchao Zhang   ierr = MPI_Type_commit(newtype); if (ierr) return ierr;
47640e23c03SJunchao Zhang   return MPI_SUCCESS;
47740e23c03SJunchao Zhang }
47840e23c03SJunchao Zhang #endif
47940e23c03SJunchao Zhang 
48071438e86SJunchao Zhang PetscErrorCode PetscSFLinkDestroy(PetscSF sf,PetscSFLink link)
48140e23c03SJunchao Zhang {
48240e23c03SJunchao Zhang   PetscErrorCode    ierr;
483cd620004SJunchao Zhang   PetscSF_Basic     *bas = (PetscSF_Basic*)sf->data;
48471438e86SJunchao Zhang   PetscInt          i,nreqs = (bas->nrootreqs+sf->nleafreqs)*8;
48571438e86SJunchao Zhang 
48671438e86SJunchao Zhang   PetscFunctionBegin;
48771438e86SJunchao Zhang   /* Destroy device-specific fields */
48871438e86SJunchao Zhang   if (link->deviceinited) {ierr = (*link->Destroy)(sf,link);CHKERRQ(ierr);}
48971438e86SJunchao Zhang 
49071438e86SJunchao Zhang   /* Destroy host related fields */
49171438e86SJunchao Zhang   if (!link->isbuiltin) {ierr = MPI_Type_free(&link->unit);CHKERRMPI(ierr);}
49271438e86SJunchao Zhang   if (!link->use_nvshmem) {
49371438e86SJunchao Zhang     for (i=0; i<nreqs; i++) { /* Persistent reqs must be freed. */
49471438e86SJunchao Zhang       if (link->reqs[i] != MPI_REQUEST_NULL) {ierr = MPI_Request_free(&link->reqs[i]);CHKERRMPI(ierr);}
49571438e86SJunchao Zhang     }
49671438e86SJunchao Zhang     ierr = PetscFree(link->reqs);CHKERRQ(ierr);
49771438e86SJunchao Zhang     for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) {
49871438e86SJunchao Zhang       ierr = PetscFree(link->rootbuf_alloc[i][PETSC_MEMTYPE_HOST]);CHKERRQ(ierr);
49971438e86SJunchao Zhang       ierr = PetscFree(link->leafbuf_alloc[i][PETSC_MEMTYPE_HOST]);CHKERRQ(ierr);
50071438e86SJunchao Zhang     }
50171438e86SJunchao Zhang   }
50271438e86SJunchao Zhang   ierr = PetscFree(link);CHKERRQ(ierr);
50371438e86SJunchao Zhang   PetscFunctionReturn(0);
50471438e86SJunchao Zhang }
50571438e86SJunchao Zhang 
50671438e86SJunchao Zhang PetscErrorCode PetscSFLinkCreate(PetscSF sf,MPI_Datatype unit,PetscMemType rootmtype,const void *rootdata,PetscMemType leafmtype,const void *leafdata,MPI_Op op,PetscSFOperation sfop,PetscSFLink *mylink)
50771438e86SJunchao Zhang {
50871438e86SJunchao Zhang   PetscErrorCode    ierr;
509cd620004SJunchao Zhang 
510cd620004SJunchao Zhang   PetscFunctionBegin;
511cd620004SJunchao Zhang   ierr = PetscSFSetErrorOnUnsupportedOverlap(sf,unit,rootdata,leafdata);CHKERRQ(ierr);
51271438e86SJunchao Zhang  #if defined(PETSC_HAVE_NVSHMEM)
51371438e86SJunchao Zhang   {
51471438e86SJunchao Zhang     PetscBool use_nvshmem;
51571438e86SJunchao Zhang     ierr = PetscSFLinkNvshmemCheck(sf,rootmtype,rootdata,leafmtype,leafdata,&use_nvshmem);CHKERRQ(ierr);
51671438e86SJunchao Zhang     if (use_nvshmem) {
51771438e86SJunchao Zhang       ierr = PetscSFLinkCreate_NVSHMEM(sf,unit,rootmtype,rootdata,leafmtype,leafdata,op,sfop,mylink);CHKERRQ(ierr);
51871438e86SJunchao Zhang       PetscFunctionReturn(0);
519cd620004SJunchao Zhang     }
520cd620004SJunchao Zhang   }
5217fd2d3dbSJunchao Zhang  #endif
52271438e86SJunchao Zhang   ierr = PetscSFLinkCreate_MPI(sf,unit,rootmtype,rootdata,leafmtype,leafdata,op,sfop,mylink);CHKERRQ(ierr);
523cd620004SJunchao Zhang   PetscFunctionReturn(0);
524cd620004SJunchao Zhang }
525cd620004SJunchao Zhang 
526cd620004SJunchao Zhang /* Return root/leaf buffers and MPI requests attached to the link for MPI communication in the given direction.
527cd620004SJunchao Zhang    If the sf uses persistent requests and the requests have not been initialized, then initialize them.
528cd620004SJunchao Zhang */
529cd620004SJunchao Zhang PetscErrorCode PetscSFLinkGetMPIBuffersAndRequests(PetscSF sf,PetscSFLink link,PetscSFDirection direction,void **rootbuf, void **leafbuf,MPI_Request **rootreqs,MPI_Request **leafreqs)
530cd620004SJunchao Zhang {
531cd620004SJunchao Zhang   PetscErrorCode       ierr;
532cd620004SJunchao Zhang   PetscSF_Basic        *bas = (PetscSF_Basic*)sf->data;
533cd620004SJunchao Zhang   PetscInt             i,j,nrootranks,ndrootranks,nleafranks,ndleafranks;
534cd620004SJunchao Zhang   const PetscInt       *rootoffset,*leafoffset;
535cd620004SJunchao Zhang   PetscMPIInt          n;
536cd620004SJunchao Zhang   MPI_Aint             disp;
537cd620004SJunchao Zhang   MPI_Comm             comm = PetscObjectComm((PetscObject)sf);
538cd620004SJunchao Zhang   MPI_Datatype         unit = link->unit;
539cd620004SJunchao Zhang   const PetscMemType   rootmtype_mpi = link->rootmtype_mpi,leafmtype_mpi = link->leafmtype_mpi; /* Used to select buffers passed to MPI */
540cd620004SJunchao Zhang   const PetscInt       rootdirect_mpi = link->rootdirect_mpi,leafdirect_mpi = link->leafdirect_mpi;
541cd620004SJunchao Zhang 
542cd620004SJunchao Zhang   PetscFunctionBegin;
543cd620004SJunchao Zhang   /* Init persistent MPI requests if not yet. Currently only SFBasic uses persistent MPI */
544cd620004SJunchao Zhang   if (sf->persistent) {
545cd620004SJunchao Zhang     if (rootreqs && bas->rootbuflen[PETSCSF_REMOTE] && !link->rootreqsinited[direction][rootmtype_mpi][rootdirect_mpi]) {
546cd620004SJunchao Zhang       ierr = PetscSFGetRootInfo_Basic(sf,&nrootranks,&ndrootranks,NULL,&rootoffset,NULL);CHKERRQ(ierr);
547cd620004SJunchao Zhang       if (direction == PETSCSF_LEAF2ROOT) {
548cd620004SJunchao Zhang         for (i=ndrootranks,j=0; i<nrootranks; i++,j++) {
549cd620004SJunchao Zhang           disp = (rootoffset[i] - rootoffset[ndrootranks])*link->unitbytes;
550cd620004SJunchao Zhang           ierr = PetscMPIIntCast(rootoffset[i+1]-rootoffset[i],&n);CHKERRQ(ierr);
551ffc4695bSBarry Smith           ierr = MPI_Recv_init(link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi]+disp,n,unit,bas->iranks[i],link->tag,comm,link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi]+j);CHKERRMPI(ierr);
552cd620004SJunchao Zhang         }
553cd620004SJunchao Zhang       } else { /* PETSCSF_ROOT2LEAF */
554cd620004SJunchao Zhang         for (i=ndrootranks,j=0; i<nrootranks; i++,j++) {
555cd620004SJunchao Zhang           disp = (rootoffset[i] - rootoffset[ndrootranks])*link->unitbytes;
556cd620004SJunchao Zhang           ierr = PetscMPIIntCast(rootoffset[i+1]-rootoffset[i],&n);CHKERRQ(ierr);
557ffc4695bSBarry Smith           ierr = MPI_Send_init(link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi]+disp,n,unit,bas->iranks[i],link->tag,comm,link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi]+j);CHKERRMPI(ierr);
558cd620004SJunchao Zhang         }
559cd620004SJunchao Zhang       }
560cd620004SJunchao Zhang       link->rootreqsinited[direction][rootmtype_mpi][rootdirect_mpi] = PETSC_TRUE;
561cd620004SJunchao Zhang     }
562cd620004SJunchao Zhang 
563cd620004SJunchao Zhang     if (leafreqs && sf->leafbuflen[PETSCSF_REMOTE] && !link->leafreqsinited[direction][leafmtype_mpi][leafdirect_mpi]) {
564cd620004SJunchao Zhang       ierr = PetscSFGetLeafInfo_Basic(sf,&nleafranks,&ndleafranks,NULL,&leafoffset,NULL,NULL);CHKERRQ(ierr);
565cd620004SJunchao Zhang       if (direction == PETSCSF_LEAF2ROOT) {
566cd620004SJunchao Zhang         for (i=ndleafranks,j=0; i<nleafranks; i++,j++) {
567cd620004SJunchao Zhang           disp = (leafoffset[i] - leafoffset[ndleafranks])*link->unitbytes;
568cd620004SJunchao Zhang           ierr = PetscMPIIntCast(leafoffset[i+1]-leafoffset[i],&n);CHKERRQ(ierr);
569ffc4695bSBarry Smith           ierr = MPI_Send_init(link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi]+disp,n,unit,sf->ranks[i],link->tag,comm,link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi]+j);CHKERRMPI(ierr);
570cd620004SJunchao Zhang         }
571cd620004SJunchao Zhang       } else { /* PETSCSF_ROOT2LEAF */
572cd620004SJunchao Zhang         for (i=ndleafranks,j=0; i<nleafranks; i++,j++) {
573cd620004SJunchao Zhang           disp = (leafoffset[i] - leafoffset[ndleafranks])*link->unitbytes;
574cd620004SJunchao Zhang           ierr = PetscMPIIntCast(leafoffset[i+1]-leafoffset[i],&n);CHKERRQ(ierr);
575ffc4695bSBarry Smith           ierr = MPI_Recv_init(link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi]+disp,n,unit,sf->ranks[i],link->tag,comm,link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi]+j);CHKERRMPI(ierr);
576cd620004SJunchao Zhang         }
577cd620004SJunchao Zhang       }
578cd620004SJunchao Zhang       link->leafreqsinited[direction][leafmtype_mpi][leafdirect_mpi] = PETSC_TRUE;
579cd620004SJunchao Zhang     }
580cd620004SJunchao Zhang   }
581cd620004SJunchao Zhang   if (rootbuf)  *rootbuf  = link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi];
582cd620004SJunchao Zhang   if (leafbuf)  *leafbuf  = link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi];
583cd620004SJunchao Zhang   if (rootreqs) *rootreqs = link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi];
584cd620004SJunchao Zhang   if (leafreqs) *leafreqs = link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi];
585cd620004SJunchao Zhang   PetscFunctionReturn(0);
586cd620004SJunchao Zhang }
587cd620004SJunchao Zhang 
588cd620004SJunchao Zhang PetscErrorCode PetscSFLinkGetInUse(PetscSF sf,MPI_Datatype unit,const void *rootdata,const void *leafdata,PetscCopyMode cmode,PetscSFLink *mylink)
589cd620004SJunchao Zhang {
590cd620004SJunchao Zhang   PetscErrorCode    ierr;
591cd620004SJunchao Zhang   PetscSFLink       link,*p;
59240e23c03SJunchao Zhang   PetscSF_Basic     *bas=(PetscSF_Basic*)sf->data;
59340e23c03SJunchao Zhang 
59440e23c03SJunchao Zhang   PetscFunctionBegin;
59540e23c03SJunchao Zhang   /* Look for types in cache */
59640e23c03SJunchao Zhang   for (p=&bas->inuse; (link=*p); p=&link->next) {
59740e23c03SJunchao Zhang     PetscBool match;
59840e23c03SJunchao Zhang     ierr = MPIPetsc_Type_compare(unit,link->unit,&match);CHKERRQ(ierr);
599637e6665SJunchao Zhang     if (match && (rootdata == link->rootdata) && (leafdata == link->leafdata)) {
60040e23c03SJunchao Zhang       switch (cmode) {
60140e23c03SJunchao Zhang       case PETSC_OWN_POINTER: *p = link->next; break; /* Remove from inuse list */
60240e23c03SJunchao Zhang       case PETSC_USE_POINTER: break;
60340e23c03SJunchao Zhang       default: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"invalid cmode");
60440e23c03SJunchao Zhang       }
60540e23c03SJunchao Zhang       *mylink = link;
60640e23c03SJunchao Zhang       PetscFunctionReturn(0);
60740e23c03SJunchao Zhang     }
60840e23c03SJunchao Zhang   }
60940e23c03SJunchao Zhang   SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Could not find pack");
61040e23c03SJunchao Zhang }
61140e23c03SJunchao Zhang 
61271438e86SJunchao Zhang PetscErrorCode PetscSFLinkReclaim(PetscSF sf,PetscSFLink *mylink)
61340e23c03SJunchao Zhang {
61440e23c03SJunchao Zhang   PetscSF_Basic     *bas = (PetscSF_Basic*)sf->data;
61571438e86SJunchao Zhang   PetscSFLink       link = *mylink;
61640e23c03SJunchao Zhang 
61740e23c03SJunchao Zhang   PetscFunctionBegin;
61871438e86SJunchao Zhang   link->rootdata = NULL;
61971438e86SJunchao Zhang   link->leafdata = NULL;
62071438e86SJunchao Zhang   link->next     = bas->avail;
62171438e86SJunchao Zhang   bas->avail     = link;
62271438e86SJunchao Zhang   *mylink        = NULL;
623eb02082bSJunchao Zhang   PetscFunctionReturn(0);
624eb02082bSJunchao Zhang }
625eb02082bSJunchao Zhang 
6269d1c8addSJunchao Zhang /* Error out on unsupported overlapped communications */
627cd620004SJunchao Zhang PetscErrorCode PetscSFSetErrorOnUnsupportedOverlap(PetscSF sf,MPI_Datatype unit,const void *rootdata,const void *leafdata)
6289d1c8addSJunchao Zhang {
6299d1c8addSJunchao Zhang   PetscErrorCode    ierr;
630cd620004SJunchao Zhang   PetscSFLink       link,*p;
6319d1c8addSJunchao Zhang   PetscSF_Basic     *bas = (PetscSF_Basic*)sf->data;
6329d1c8addSJunchao Zhang   PetscBool         match;
6339d1c8addSJunchao Zhang 
6349d1c8addSJunchao Zhang   PetscFunctionBegin;
635b458e8f1SJose E. Roman   if (PetscDefined(USE_DEBUG)) {
63618fb5014SJunchao Zhang     /* Look up links in use and error out if there is a match. When both rootdata and leafdata are NULL, ignore
63718fb5014SJunchao Zhang        the potential overlapping since this process does not participate in communication. Overlapping is harmless.
63818fb5014SJunchao Zhang     */
639637e6665SJunchao Zhang     if (rootdata || leafdata) {
6409d1c8addSJunchao Zhang       for (p=&bas->inuse; (link=*p); p=&link->next) {
6419d1c8addSJunchao Zhang         ierr = MPIPetsc_Type_compare(unit,link->unit,&match);CHKERRQ(ierr);
642cd620004SJunchao Zhang         if (match && (rootdata == link->rootdata) && (leafdata == link->leafdata)) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_SUP,"Overlapped PetscSF with the same rootdata(%p), leafdata(%p) and data type. Undo the overlapping to avoid the error.",rootdata,leafdata);
6439d1c8addSJunchao Zhang       }
64418fb5014SJunchao Zhang     }
645b458e8f1SJose E. Roman   }
6469d1c8addSJunchao Zhang   PetscFunctionReturn(0);
6479d1c8addSJunchao Zhang }
6489d1c8addSJunchao Zhang 
64920c24465SJunchao Zhang static PetscErrorCode PetscSFLinkMemcpy_Host(PetscSFLink link,PetscMemType dstmtype,void* dst,PetscMemType srcmtype,const void*src,size_t n)
65020c24465SJunchao Zhang {
65120c24465SJunchao Zhang   PetscFunctionBegin;
65220c24465SJunchao Zhang   if (n) {PetscErrorCode ierr = PetscMemcpy(dst,src,n);CHKERRQ(ierr);}
65320c24465SJunchao Zhang   PetscFunctionReturn(0);
65420c24465SJunchao Zhang }
65520c24465SJunchao Zhang 
656cd620004SJunchao Zhang PetscErrorCode PetscSFLinkSetUp_Host(PetscSF sf,PetscSFLink link,MPI_Datatype unit)
65740e23c03SJunchao Zhang {
65840e23c03SJunchao Zhang   PetscErrorCode ierr;
659b23bfdefSJunchao Zhang   PetscInt       nSignedChar=0,nUnsignedChar=0,nInt=0,nPetscInt=0,nPetscReal=0;
660b23bfdefSJunchao Zhang   PetscBool      is2Int,is2PetscInt;
66140e23c03SJunchao Zhang   PetscMPIInt    ni,na,nd,combiner;
66240e23c03SJunchao Zhang #if defined(PETSC_HAVE_COMPLEX)
663b23bfdefSJunchao Zhang   PetscInt       nPetscComplex=0;
66440e23c03SJunchao Zhang #endif
66540e23c03SJunchao Zhang 
66640e23c03SJunchao Zhang   PetscFunctionBegin;
667b23bfdefSJunchao Zhang   ierr = MPIPetsc_Type_compare_contig(unit,MPI_SIGNED_CHAR,  &nSignedChar);CHKERRQ(ierr);
668b23bfdefSJunchao Zhang   ierr = MPIPetsc_Type_compare_contig(unit,MPI_UNSIGNED_CHAR,&nUnsignedChar);CHKERRQ(ierr);
669b23bfdefSJunchao Zhang   /* MPI_CHAR is treated below as a dumb type that does not support reduction according to MPI standard */
670b23bfdefSJunchao Zhang   ierr = MPIPetsc_Type_compare_contig(unit,MPI_INT,  &nInt);CHKERRQ(ierr);
671b23bfdefSJunchao Zhang   ierr = MPIPetsc_Type_compare_contig(unit,MPIU_INT, &nPetscInt);CHKERRQ(ierr);
672b23bfdefSJunchao Zhang   ierr = MPIPetsc_Type_compare_contig(unit,MPIU_REAL,&nPetscReal);CHKERRQ(ierr);
67340e23c03SJunchao Zhang #if defined(PETSC_HAVE_COMPLEX)
674b23bfdefSJunchao Zhang   ierr = MPIPetsc_Type_compare_contig(unit,MPIU_COMPLEX,&nPetscComplex);CHKERRQ(ierr);
67540e23c03SJunchao Zhang #endif
67640e23c03SJunchao Zhang   ierr = MPIPetsc_Type_compare(unit,MPI_2INT,&is2Int);CHKERRQ(ierr);
67740e23c03SJunchao Zhang   ierr = MPIPetsc_Type_compare(unit,MPIU_2INT,&is2PetscInt);CHKERRQ(ierr);
678b23bfdefSJunchao Zhang   /* TODO: shaell we also handle Fortran MPI_2REAL? */
679ffc4695bSBarry Smith   ierr = MPI_Type_get_envelope(unit,&ni,&na,&nd,&combiner);CHKERRMPI(ierr);
6805ad15460SJunchao Zhang   link->isbuiltin = (combiner == MPI_COMBINER_NAMED) ? PETSC_TRUE : PETSC_FALSE; /* unit is MPI builtin */
681b23bfdefSJunchao Zhang   link->bs = 1; /* default */
68240e23c03SJunchao Zhang 
683eb02082bSJunchao Zhang   if (is2Int) {
684cd620004SJunchao Zhang     PackInit_PairType_int_int_1_1(link);
685eb02082bSJunchao Zhang     link->bs        = 1;
686eb02082bSJunchao Zhang     link->unitbytes = 2*sizeof(int);
6875ad15460SJunchao Zhang     link->isbuiltin = PETSC_TRUE; /* unit is PETSc builtin */
688eb02082bSJunchao Zhang     link->basicunit = MPI_2INT;
6895ad15460SJunchao Zhang     link->unit      = MPI_2INT;
690eb02082bSJunchao Zhang   } else if (is2PetscInt) { /* TODO: when is2PetscInt and nPetscInt=2, we don't know which path to take. The two paths support different ops. */
691cd620004SJunchao Zhang     PackInit_PairType_PetscInt_PetscInt_1_1(link);
692eb02082bSJunchao Zhang     link->bs        = 1;
693eb02082bSJunchao Zhang     link->unitbytes = 2*sizeof(PetscInt);
694eb02082bSJunchao Zhang     link->basicunit = MPIU_2INT;
6955ad15460SJunchao Zhang     link->isbuiltin = PETSC_TRUE; /* unit is PETSc builtin */
6965ad15460SJunchao Zhang     link->unit      = MPIU_2INT;
697eb02082bSJunchao Zhang   } else if (nPetscReal) {
698b23bfdefSJunchao Zhang     if      (nPetscReal == 8) PackInit_RealType_PetscReal_8_1(link); else if (nPetscReal%8 == 0) PackInit_RealType_PetscReal_8_0(link);
699b23bfdefSJunchao Zhang     else if (nPetscReal == 4) PackInit_RealType_PetscReal_4_1(link); else if (nPetscReal%4 == 0) PackInit_RealType_PetscReal_4_0(link);
700b23bfdefSJunchao Zhang     else if (nPetscReal == 2) PackInit_RealType_PetscReal_2_1(link); else if (nPetscReal%2 == 0) PackInit_RealType_PetscReal_2_0(link);
701b23bfdefSJunchao Zhang     else if (nPetscReal == 1) PackInit_RealType_PetscReal_1_1(link); else if (nPetscReal%1 == 0) PackInit_RealType_PetscReal_1_0(link);
702b23bfdefSJunchao Zhang     link->bs        = nPetscReal;
703eb02082bSJunchao Zhang     link->unitbytes = nPetscReal*sizeof(PetscReal);
70440e23c03SJunchao Zhang     link->basicunit = MPIU_REAL;
7055ad15460SJunchao Zhang     if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPIU_REAL;}
706b23bfdefSJunchao Zhang   } else if (nPetscInt) {
707b23bfdefSJunchao Zhang     if      (nPetscInt == 8) PackInit_IntegerType_PetscInt_8_1(link); else if (nPetscInt%8 == 0) PackInit_IntegerType_PetscInt_8_0(link);
708b23bfdefSJunchao Zhang     else if (nPetscInt == 4) PackInit_IntegerType_PetscInt_4_1(link); else if (nPetscInt%4 == 0) PackInit_IntegerType_PetscInt_4_0(link);
709b23bfdefSJunchao Zhang     else if (nPetscInt == 2) PackInit_IntegerType_PetscInt_2_1(link); else if (nPetscInt%2 == 0) PackInit_IntegerType_PetscInt_2_0(link);
710b23bfdefSJunchao Zhang     else if (nPetscInt == 1) PackInit_IntegerType_PetscInt_1_1(link); else if (nPetscInt%1 == 0) PackInit_IntegerType_PetscInt_1_0(link);
711b23bfdefSJunchao Zhang     link->bs        = nPetscInt;
712eb02082bSJunchao Zhang     link->unitbytes = nPetscInt*sizeof(PetscInt);
713b23bfdefSJunchao Zhang     link->basicunit = MPIU_INT;
7145ad15460SJunchao Zhang     if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPIU_INT;}
715b23bfdefSJunchao Zhang #if defined(PETSC_USE_64BIT_INDICES)
716b23bfdefSJunchao Zhang   } else if (nInt) {
717b23bfdefSJunchao Zhang     if      (nInt == 8) PackInit_IntegerType_int_8_1(link); else if (nInt%8 == 0) PackInit_IntegerType_int_8_0(link);
718b23bfdefSJunchao Zhang     else if (nInt == 4) PackInit_IntegerType_int_4_1(link); else if (nInt%4 == 0) PackInit_IntegerType_int_4_0(link);
719b23bfdefSJunchao Zhang     else if (nInt == 2) PackInit_IntegerType_int_2_1(link); else if (nInt%2 == 0) PackInit_IntegerType_int_2_0(link);
720b23bfdefSJunchao Zhang     else if (nInt == 1) PackInit_IntegerType_int_1_1(link); else if (nInt%1 == 0) PackInit_IntegerType_int_1_0(link);
721b23bfdefSJunchao Zhang     link->bs        = nInt;
722eb02082bSJunchao Zhang     link->unitbytes = nInt*sizeof(int);
723b23bfdefSJunchao Zhang     link->basicunit = MPI_INT;
7245ad15460SJunchao Zhang     if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPI_INT;}
725b23bfdefSJunchao Zhang #endif
726b23bfdefSJunchao Zhang   } else if (nSignedChar) {
727b23bfdefSJunchao Zhang     if      (nSignedChar == 8) PackInit_IntegerType_SignedChar_8_1(link); else if (nSignedChar%8 == 0) PackInit_IntegerType_SignedChar_8_0(link);
728b23bfdefSJunchao Zhang     else if (nSignedChar == 4) PackInit_IntegerType_SignedChar_4_1(link); else if (nSignedChar%4 == 0) PackInit_IntegerType_SignedChar_4_0(link);
729b23bfdefSJunchao Zhang     else if (nSignedChar == 2) PackInit_IntegerType_SignedChar_2_1(link); else if (nSignedChar%2 == 0) PackInit_IntegerType_SignedChar_2_0(link);
730b23bfdefSJunchao Zhang     else if (nSignedChar == 1) PackInit_IntegerType_SignedChar_1_1(link); else if (nSignedChar%1 == 0) PackInit_IntegerType_SignedChar_1_0(link);
731b23bfdefSJunchao Zhang     link->bs        = nSignedChar;
732eb02082bSJunchao Zhang     link->unitbytes = nSignedChar*sizeof(SignedChar);
733b23bfdefSJunchao Zhang     link->basicunit = MPI_SIGNED_CHAR;
7345ad15460SJunchao Zhang     if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPI_SIGNED_CHAR;}
735b23bfdefSJunchao Zhang   }  else if (nUnsignedChar) {
736b23bfdefSJunchao Zhang     if      (nUnsignedChar == 8) PackInit_IntegerType_UnsignedChar_8_1(link); else if (nUnsignedChar%8 == 0) PackInit_IntegerType_UnsignedChar_8_0(link);
737b23bfdefSJunchao Zhang     else if (nUnsignedChar == 4) PackInit_IntegerType_UnsignedChar_4_1(link); else if (nUnsignedChar%4 == 0) PackInit_IntegerType_UnsignedChar_4_0(link);
738b23bfdefSJunchao Zhang     else if (nUnsignedChar == 2) PackInit_IntegerType_UnsignedChar_2_1(link); else if (nUnsignedChar%2 == 0) PackInit_IntegerType_UnsignedChar_2_0(link);
739b23bfdefSJunchao Zhang     else if (nUnsignedChar == 1) PackInit_IntegerType_UnsignedChar_1_1(link); else if (nUnsignedChar%1 == 0) PackInit_IntegerType_UnsignedChar_1_0(link);
740b23bfdefSJunchao Zhang     link->bs        = nUnsignedChar;
741eb02082bSJunchao Zhang     link->unitbytes = nUnsignedChar*sizeof(UnsignedChar);
742b23bfdefSJunchao Zhang     link->basicunit = MPI_UNSIGNED_CHAR;
7435ad15460SJunchao Zhang     if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPI_UNSIGNED_CHAR;}
74440e23c03SJunchao Zhang #if defined(PETSC_HAVE_COMPLEX)
745b23bfdefSJunchao Zhang   } else if (nPetscComplex) {
746b23bfdefSJunchao Zhang     if      (nPetscComplex == 8) PackInit_ComplexType_PetscComplex_8_1(link); else if (nPetscComplex%8 == 0) PackInit_ComplexType_PetscComplex_8_0(link);
747b23bfdefSJunchao Zhang     else if (nPetscComplex == 4) PackInit_ComplexType_PetscComplex_4_1(link); else if (nPetscComplex%4 == 0) PackInit_ComplexType_PetscComplex_4_0(link);
748b23bfdefSJunchao Zhang     else if (nPetscComplex == 2) PackInit_ComplexType_PetscComplex_2_1(link); else if (nPetscComplex%2 == 0) PackInit_ComplexType_PetscComplex_2_0(link);
749b23bfdefSJunchao Zhang     else if (nPetscComplex == 1) PackInit_ComplexType_PetscComplex_1_1(link); else if (nPetscComplex%1 == 0) PackInit_ComplexType_PetscComplex_1_0(link);
750b23bfdefSJunchao Zhang     link->bs        = nPetscComplex;
751eb02082bSJunchao Zhang     link->unitbytes = nPetscComplex*sizeof(PetscComplex);
75240e23c03SJunchao Zhang     link->basicunit = MPIU_COMPLEX;
7535ad15460SJunchao Zhang     if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPIU_COMPLEX;}
75440e23c03SJunchao Zhang #endif
75540e23c03SJunchao Zhang   } else {
756b23bfdefSJunchao Zhang     MPI_Aint lb,nbyte;
757ffc4695bSBarry Smith     ierr = MPI_Type_get_extent(unit,&lb,&nbyte);CHKERRMPI(ierr);
75840e23c03SJunchao Zhang     if (lb != 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Datatype with nonzero lower bound %ld\n",(long)lb);
759eb02082bSJunchao Zhang     if (nbyte % sizeof(int)) { /* If the type size is not multiple of int */
760eb02082bSJunchao Zhang       if      (nbyte == 4) PackInit_DumbType_char_4_1(link); else if (nbyte%4 == 0) PackInit_DumbType_char_4_0(link);
761eb02082bSJunchao Zhang       else if (nbyte == 2) PackInit_DumbType_char_2_1(link); else if (nbyte%2 == 0) PackInit_DumbType_char_2_0(link);
762eb02082bSJunchao Zhang       else if (nbyte == 1) PackInit_DumbType_char_1_1(link); else if (nbyte%1 == 0) PackInit_DumbType_char_1_0(link);
763eb02082bSJunchao Zhang       link->bs        = nbyte;
764b23bfdefSJunchao Zhang       link->unitbytes = nbyte;
765b23bfdefSJunchao Zhang       link->basicunit = MPI_BYTE;
76640e23c03SJunchao Zhang     } else {
767eb02082bSJunchao Zhang       nInt = nbyte / sizeof(int);
768eb02082bSJunchao Zhang       if      (nInt == 8) PackInit_DumbType_DumbInt_8_1(link); else if (nInt%8 == 0) PackInit_DumbType_DumbInt_8_0(link);
769eb02082bSJunchao Zhang       else if (nInt == 4) PackInit_DumbType_DumbInt_4_1(link); else if (nInt%4 == 0) PackInit_DumbType_DumbInt_4_0(link);
770eb02082bSJunchao Zhang       else if (nInt == 2) PackInit_DumbType_DumbInt_2_1(link); else if (nInt%2 == 0) PackInit_DumbType_DumbInt_2_0(link);
771eb02082bSJunchao Zhang       else if (nInt == 1) PackInit_DumbType_DumbInt_1_1(link); else if (nInt%1 == 0) PackInit_DumbType_DumbInt_1_0(link);
772eb02082bSJunchao Zhang       link->bs        = nInt;
773b23bfdefSJunchao Zhang       link->unitbytes = nbyte;
77440e23c03SJunchao Zhang       link->basicunit = MPI_INT;
77540e23c03SJunchao Zhang     }
7765ad15460SJunchao Zhang     if (link->isbuiltin) link->unit = unit;
77740e23c03SJunchao Zhang   }
778b23bfdefSJunchao Zhang 
779ffc4695bSBarry Smith   if (!link->isbuiltin) {ierr = MPI_Type_dup(unit,&link->unit);CHKERRMPI(ierr);}
78020c24465SJunchao Zhang 
78120c24465SJunchao Zhang   link->Memcpy = PetscSFLinkMemcpy_Host;
78240e23c03SJunchao Zhang   PetscFunctionReturn(0);
78340e23c03SJunchao Zhang }
78440e23c03SJunchao Zhang 
785fcc7397dSJunchao Zhang PetscErrorCode PetscSFLinkGetUnpackAndOp(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*))
78640e23c03SJunchao Zhang {
78740e23c03SJunchao Zhang   PetscFunctionBegin;
78840e23c03SJunchao Zhang   *UnpackAndOp = NULL;
78971438e86SJunchao Zhang   if (PetscMemTypeHost(mtype)) {
79083df288dSJunchao Zhang     if      (op == MPI_REPLACE)               *UnpackAndOp = link->h_UnpackAndInsert;
791eb02082bSJunchao Zhang     else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->h_UnpackAndAdd;
792eb02082bSJunchao Zhang     else if (op == MPI_PROD)                  *UnpackAndOp = link->h_UnpackAndMult;
793eb02082bSJunchao Zhang     else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->h_UnpackAndMax;
794eb02082bSJunchao Zhang     else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->h_UnpackAndMin;
795eb02082bSJunchao Zhang     else if (op == MPI_LAND)                  *UnpackAndOp = link->h_UnpackAndLAND;
796eb02082bSJunchao Zhang     else if (op == MPI_BAND)                  *UnpackAndOp = link->h_UnpackAndBAND;
797eb02082bSJunchao Zhang     else if (op == MPI_LOR)                   *UnpackAndOp = link->h_UnpackAndLOR;
798eb02082bSJunchao Zhang     else if (op == MPI_BOR)                   *UnpackAndOp = link->h_UnpackAndBOR;
799eb02082bSJunchao Zhang     else if (op == MPI_LXOR)                  *UnpackAndOp = link->h_UnpackAndLXOR;
800eb02082bSJunchao Zhang     else if (op == MPI_BXOR)                  *UnpackAndOp = link->h_UnpackAndBXOR;
801eb02082bSJunchao Zhang     else if (op == MPI_MAXLOC)                *UnpackAndOp = link->h_UnpackAndMaxloc;
802eb02082bSJunchao Zhang     else if (op == MPI_MINLOC)                *UnpackAndOp = link->h_UnpackAndMinloc;
803eb02082bSJunchao Zhang   }
8047fd2d3dbSJunchao Zhang #if defined(PETSC_HAVE_DEVICE)
80571438e86SJunchao Zhang   else if (PetscMemTypeDevice(mtype) && !atomic) {
80683df288dSJunchao Zhang     if      (op == MPI_REPLACE)               *UnpackAndOp = link->d_UnpackAndInsert;
807eb02082bSJunchao Zhang     else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->d_UnpackAndAdd;
808eb02082bSJunchao Zhang     else if (op == MPI_PROD)                  *UnpackAndOp = link->d_UnpackAndMult;
809eb02082bSJunchao Zhang     else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->d_UnpackAndMax;
810eb02082bSJunchao Zhang     else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->d_UnpackAndMin;
811eb02082bSJunchao Zhang     else if (op == MPI_LAND)                  *UnpackAndOp = link->d_UnpackAndLAND;
812eb02082bSJunchao Zhang     else if (op == MPI_BAND)                  *UnpackAndOp = link->d_UnpackAndBAND;
813eb02082bSJunchao Zhang     else if (op == MPI_LOR)                   *UnpackAndOp = link->d_UnpackAndLOR;
814eb02082bSJunchao Zhang     else if (op == MPI_BOR)                   *UnpackAndOp = link->d_UnpackAndBOR;
815eb02082bSJunchao Zhang     else if (op == MPI_LXOR)                  *UnpackAndOp = link->d_UnpackAndLXOR;
816eb02082bSJunchao Zhang     else if (op == MPI_BXOR)                  *UnpackAndOp = link->d_UnpackAndBXOR;
817eb02082bSJunchao Zhang     else if (op == MPI_MAXLOC)                *UnpackAndOp = link->d_UnpackAndMaxloc;
818eb02082bSJunchao Zhang     else if (op == MPI_MINLOC)                *UnpackAndOp = link->d_UnpackAndMinloc;
81971438e86SJunchao Zhang   } else if (PetscMemTypeDevice(mtype) && atomic) {
82083df288dSJunchao Zhang     if      (op == MPI_REPLACE)               *UnpackAndOp = link->da_UnpackAndInsert;
821eb02082bSJunchao Zhang     else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->da_UnpackAndAdd;
822eb02082bSJunchao Zhang     else if (op == MPI_PROD)                  *UnpackAndOp = link->da_UnpackAndMult;
823eb02082bSJunchao Zhang     else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->da_UnpackAndMax;
824eb02082bSJunchao Zhang     else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->da_UnpackAndMin;
825eb02082bSJunchao Zhang     else if (op == MPI_LAND)                  *UnpackAndOp = link->da_UnpackAndLAND;
826eb02082bSJunchao Zhang     else if (op == MPI_BAND)                  *UnpackAndOp = link->da_UnpackAndBAND;
827eb02082bSJunchao Zhang     else if (op == MPI_LOR)                   *UnpackAndOp = link->da_UnpackAndLOR;
828eb02082bSJunchao Zhang     else if (op == MPI_BOR)                   *UnpackAndOp = link->da_UnpackAndBOR;
829eb02082bSJunchao Zhang     else if (op == MPI_LXOR)                  *UnpackAndOp = link->da_UnpackAndLXOR;
830eb02082bSJunchao Zhang     else if (op == MPI_BXOR)                  *UnpackAndOp = link->da_UnpackAndBXOR;
831eb02082bSJunchao Zhang     else if (op == MPI_MAXLOC)                *UnpackAndOp = link->da_UnpackAndMaxloc;
832eb02082bSJunchao Zhang     else if (op == MPI_MINLOC)                *UnpackAndOp = link->da_UnpackAndMinloc;
833eb02082bSJunchao Zhang   }
834eb02082bSJunchao Zhang #endif
83540e23c03SJunchao Zhang   PetscFunctionReturn(0);
83640e23c03SJunchao Zhang }
83740e23c03SJunchao Zhang 
838fcc7397dSJunchao Zhang PetscErrorCode PetscSFLinkGetScatterAndOp(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**ScatterAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*))
83940e23c03SJunchao Zhang {
84040e23c03SJunchao Zhang   PetscFunctionBegin;
841cd620004SJunchao Zhang   *ScatterAndOp = NULL;
84271438e86SJunchao Zhang   if (PetscMemTypeHost(mtype)) {
84383df288dSJunchao Zhang     if      (op == MPI_REPLACE)               *ScatterAndOp = link->h_ScatterAndInsert;
844cd620004SJunchao Zhang     else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->h_ScatterAndAdd;
845cd620004SJunchao Zhang     else if (op == MPI_PROD)                  *ScatterAndOp = link->h_ScatterAndMult;
846cd620004SJunchao Zhang     else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->h_ScatterAndMax;
847cd620004SJunchao Zhang     else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->h_ScatterAndMin;
848cd620004SJunchao Zhang     else if (op == MPI_LAND)                  *ScatterAndOp = link->h_ScatterAndLAND;
849cd620004SJunchao Zhang     else if (op == MPI_BAND)                  *ScatterAndOp = link->h_ScatterAndBAND;
850cd620004SJunchao Zhang     else if (op == MPI_LOR)                   *ScatterAndOp = link->h_ScatterAndLOR;
851cd620004SJunchao Zhang     else if (op == MPI_BOR)                   *ScatterAndOp = link->h_ScatterAndBOR;
852cd620004SJunchao Zhang     else if (op == MPI_LXOR)                  *ScatterAndOp = link->h_ScatterAndLXOR;
853cd620004SJunchao Zhang     else if (op == MPI_BXOR)                  *ScatterAndOp = link->h_ScatterAndBXOR;
854cd620004SJunchao Zhang     else if (op == MPI_MAXLOC)                *ScatterAndOp = link->h_ScatterAndMaxloc;
855cd620004SJunchao Zhang     else if (op == MPI_MINLOC)                *ScatterAndOp = link->h_ScatterAndMinloc;
856eb02082bSJunchao Zhang   }
8577fd2d3dbSJunchao Zhang #if defined(PETSC_HAVE_DEVICE)
85871438e86SJunchao Zhang   else if (PetscMemTypeDevice(mtype) && !atomic) {
85983df288dSJunchao Zhang     if      (op == MPI_REPLACE)               *ScatterAndOp = link->d_ScatterAndInsert;
860cd620004SJunchao Zhang     else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->d_ScatterAndAdd;
861cd620004SJunchao Zhang     else if (op == MPI_PROD)                  *ScatterAndOp = link->d_ScatterAndMult;
862cd620004SJunchao Zhang     else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->d_ScatterAndMax;
863cd620004SJunchao Zhang     else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->d_ScatterAndMin;
864cd620004SJunchao Zhang     else if (op == MPI_LAND)                  *ScatterAndOp = link->d_ScatterAndLAND;
865cd620004SJunchao Zhang     else if (op == MPI_BAND)                  *ScatterAndOp = link->d_ScatterAndBAND;
866cd620004SJunchao Zhang     else if (op == MPI_LOR)                   *ScatterAndOp = link->d_ScatterAndLOR;
867cd620004SJunchao Zhang     else if (op == MPI_BOR)                   *ScatterAndOp = link->d_ScatterAndBOR;
868cd620004SJunchao Zhang     else if (op == MPI_LXOR)                  *ScatterAndOp = link->d_ScatterAndLXOR;
869cd620004SJunchao Zhang     else if (op == MPI_BXOR)                  *ScatterAndOp = link->d_ScatterAndBXOR;
870cd620004SJunchao Zhang     else if (op == MPI_MAXLOC)                *ScatterAndOp = link->d_ScatterAndMaxloc;
871cd620004SJunchao Zhang     else if (op == MPI_MINLOC)                *ScatterAndOp = link->d_ScatterAndMinloc;
87271438e86SJunchao Zhang   } else if (PetscMemTypeDevice(mtype) && atomic) {
87383df288dSJunchao Zhang     if      (op == MPI_REPLACE)               *ScatterAndOp = link->da_ScatterAndInsert;
874cd620004SJunchao Zhang     else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->da_ScatterAndAdd;
875cd620004SJunchao Zhang     else if (op == MPI_PROD)                  *ScatterAndOp = link->da_ScatterAndMult;
876cd620004SJunchao Zhang     else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->da_ScatterAndMax;
877cd620004SJunchao Zhang     else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->da_ScatterAndMin;
878cd620004SJunchao Zhang     else if (op == MPI_LAND)                  *ScatterAndOp = link->da_ScatterAndLAND;
879cd620004SJunchao Zhang     else if (op == MPI_BAND)                  *ScatterAndOp = link->da_ScatterAndBAND;
880cd620004SJunchao Zhang     else if (op == MPI_LOR)                   *ScatterAndOp = link->da_ScatterAndLOR;
881cd620004SJunchao Zhang     else if (op == MPI_BOR)                   *ScatterAndOp = link->da_ScatterAndBOR;
882cd620004SJunchao Zhang     else if (op == MPI_LXOR)                  *ScatterAndOp = link->da_ScatterAndLXOR;
883cd620004SJunchao Zhang     else if (op == MPI_BXOR)                  *ScatterAndOp = link->da_ScatterAndBXOR;
884cd620004SJunchao Zhang     else if (op == MPI_MAXLOC)                *ScatterAndOp = link->da_ScatterAndMaxloc;
885cd620004SJunchao Zhang     else if (op == MPI_MINLOC)                *ScatterAndOp = link->da_ScatterAndMinloc;
886eb02082bSJunchao Zhang   }
887eb02082bSJunchao Zhang #endif
888cd620004SJunchao Zhang   PetscFunctionReturn(0);
889cd620004SJunchao Zhang }
890cd620004SJunchao Zhang 
891fcc7397dSJunchao Zhang PetscErrorCode PetscSFLinkGetFetchAndOp(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**FetchAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,void*))
892cd620004SJunchao Zhang {
893cd620004SJunchao Zhang   PetscFunctionBegin;
894cd620004SJunchao Zhang   *FetchAndOp = NULL;
895cd620004SJunchao Zhang   if (op != MPI_SUM && op != MPIU_SUM) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for MPI_Op in FetchAndOp");
89671438e86SJunchao Zhang   if (PetscMemTypeHost(mtype)) *FetchAndOp = link->h_FetchAndAdd;
8977fd2d3dbSJunchao Zhang #if defined(PETSC_HAVE_DEVICE)
89871438e86SJunchao Zhang   else if (PetscMemTypeDevice(mtype) && !atomic) *FetchAndOp = link->d_FetchAndAdd;
89971438e86SJunchao Zhang   else if (PetscMemTypeDevice(mtype) && atomic)  *FetchAndOp = link->da_FetchAndAdd;
900cd620004SJunchao Zhang #endif
901cd620004SJunchao Zhang   PetscFunctionReturn(0);
902cd620004SJunchao Zhang }
903cd620004SJunchao Zhang 
904fcc7397dSJunchao Zhang PetscErrorCode PetscSFLinkGetFetchAndOpLocal(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**FetchAndOpLocal)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*))
905cd620004SJunchao Zhang {
906cd620004SJunchao Zhang   PetscFunctionBegin;
907cd620004SJunchao Zhang   *FetchAndOpLocal = NULL;
908cd620004SJunchao Zhang   if (op != MPI_SUM && op != MPIU_SUM) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for MPI_Op in FetchAndOp");
90971438e86SJunchao Zhang   if (PetscMemTypeHost(mtype)) *FetchAndOpLocal = link->h_FetchAndAddLocal;
9107fd2d3dbSJunchao Zhang #if defined(PETSC_HAVE_DEVICE)
91171438e86SJunchao Zhang   else if (PetscMemTypeDevice(mtype) && !atomic) *FetchAndOpLocal = link->d_FetchAndAddLocal;
91271438e86SJunchao Zhang   else if (PetscMemTypeDevice(mtype) && atomic)  *FetchAndOpLocal = link->da_FetchAndAddLocal;
913cd620004SJunchao Zhang #endif
914cd620004SJunchao Zhang   PetscFunctionReturn(0);
915cd620004SJunchao Zhang }
916cd620004SJunchao Zhang 
917cd620004SJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkLogFlopsAfterUnpackRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,MPI_Op op)
918cd620004SJunchao Zhang {
919cd620004SJunchao Zhang   PetscErrorCode ierr;
920cd620004SJunchao Zhang   PetscLogDouble flops;
921cd620004SJunchao Zhang   PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
922cd620004SJunchao Zhang 
923cd620004SJunchao Zhang   PetscFunctionBegin;
92483df288dSJunchao Zhang   if (op != MPI_REPLACE && link->basicunit == MPIU_SCALAR) { /* op is a reduction on PetscScalars */
925cd620004SJunchao Zhang     flops = bas->rootbuflen[scope]*link->bs; /* # of roots in buffer x # of scalars in unit */
9267fd2d3dbSJunchao Zhang #if defined(PETSC_HAVE_DEVICE)
92771438e86SJunchao Zhang     if (PetscMemTypeDevice(link->rootmtype)) {ierr = PetscLogGpuFlops(flops);CHKERRQ(ierr);} else
928cd620004SJunchao Zhang #endif
929cd620004SJunchao Zhang     {ierr = PetscLogFlops(flops);CHKERRQ(ierr);}
930cd620004SJunchao Zhang   }
931cd620004SJunchao Zhang   PetscFunctionReturn(0);
932cd620004SJunchao Zhang }
933cd620004SJunchao Zhang 
934cd620004SJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkLogFlopsAfterUnpackLeafData(PetscSF sf,PetscSFLink link,PetscSFScope scope,MPI_Op op)
935cd620004SJunchao Zhang {
936cd620004SJunchao Zhang   PetscLogDouble flops;
937cd620004SJunchao Zhang   PetscErrorCode ierr;
938cd620004SJunchao Zhang 
939cd620004SJunchao Zhang   PetscFunctionBegin;
94083df288dSJunchao Zhang   if (op != MPI_REPLACE && link->basicunit == MPIU_SCALAR) { /* op is a reduction on PetscScalars */
941cd620004SJunchao Zhang     flops = sf->leafbuflen[scope]*link->bs; /* # of roots in buffer x # of scalars in unit */
9427fd2d3dbSJunchao Zhang #if defined(PETSC_HAVE_DEVICE)
94371438e86SJunchao Zhang     if (PetscMemTypeDevice(link->leafmtype)) {ierr = PetscLogGpuFlops(flops);CHKERRQ(ierr);} else
944cd620004SJunchao Zhang #endif
945cd620004SJunchao Zhang     {ierr = PetscLogFlops(flops);CHKERRQ(ierr);}
946cd620004SJunchao Zhang   }
947cd620004SJunchao Zhang   PetscFunctionReturn(0);
948cd620004SJunchao Zhang }
949cd620004SJunchao Zhang 
950cd620004SJunchao Zhang /* When SF could not find a proper UnpackAndOp() from link, it falls back to MPI_Reduce_local.
951cd620004SJunchao Zhang   Input Arguments:
952cd620004SJunchao Zhang   +sf      - The StarForest
953cd620004SJunchao Zhang   .link    - The link
954cd620004SJunchao Zhang   .count   - Number of entries to unpack
955cd620004SJunchao Zhang   .start   - The first index, significent when indices=NULL
956cd620004SJunchao Zhang   .indices - Indices of entries in <data>. If NULL, it means indices are contiguous and the first is given in <start>
957cd620004SJunchao Zhang   .buf     - A contiguous buffer to unpack from
958cd620004SJunchao Zhang   -op      - Operation after unpack
959cd620004SJunchao Zhang 
960cd620004SJunchao Zhang   Output Arguments:
961cd620004SJunchao Zhang   .data    - The data to unpack to
962cd620004SJunchao Zhang */
963cd620004SJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkUnpackDataWithMPIReduceLocal(PetscSF sf,PetscSFLink link,PetscInt count,PetscInt start,const PetscInt *indices,void *data,const void *buf,MPI_Op op)
964cd620004SJunchao Zhang {
965cd620004SJunchao Zhang   PetscFunctionBegin;
966cd620004SJunchao Zhang #if defined(PETSC_HAVE_MPI_REDUCE_LOCAL)
967cd620004SJunchao Zhang   {
968cd620004SJunchao Zhang     PetscErrorCode ierr;
969cd620004SJunchao Zhang     PetscInt       i;
970cd620004SJunchao Zhang     PetscMPIInt    n;
971cd620004SJunchao Zhang     if (indices) {
972cd620004SJunchao Zhang       /* Note we use link->unit instead of link->basicunit. When op can be mapped to MPI_SUM etc, it operates on
973cd620004SJunchao Zhang          basic units of a root/leaf element-wisely. Otherwise, it is meant to operate on a whole root/leaf.
974cd620004SJunchao Zhang       */
975ffc4695bSBarry Smith       for (i=0; i<count; i++) {ierr = MPI_Reduce_local((const char*)buf+i*link->unitbytes,(char*)data+indices[i]*link->unitbytes,1,link->unit,op);CHKERRMPI(ierr);}
976cd620004SJunchao Zhang     } else {
977cd620004SJunchao Zhang       ierr = PetscMPIIntCast(count,&n);CHKERRQ(ierr);
978ffc4695bSBarry Smith       ierr = MPI_Reduce_local(buf,(char*)data+start*link->unitbytes,n,link->unit,op);CHKERRMPI(ierr);
979cd620004SJunchao Zhang     }
980cd620004SJunchao Zhang   }
981b458e8f1SJose E. Roman   PetscFunctionReturn(0);
982cd620004SJunchao Zhang #else
983cd620004SJunchao Zhang   SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No unpacking reduction operation for this MPI_Op");
984cd620004SJunchao Zhang #endif
985cd620004SJunchao Zhang }
986cd620004SJunchao Zhang 
987fcc7397dSJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkScatterDataWithMPIReduceLocal(PetscSF sf,PetscSFLink link,PetscInt count,PetscInt srcStart,const PetscInt *srcIdx,const void *src,PetscInt dstStart,const PetscInt *dstIdx,void *dst,MPI_Op op)
988cd620004SJunchao Zhang {
989cd620004SJunchao Zhang   PetscFunctionBegin;
990cd620004SJunchao Zhang #if defined(PETSC_HAVE_MPI_REDUCE_LOCAL)
991cd620004SJunchao Zhang   {
992cd620004SJunchao Zhang     PetscErrorCode ierr;
993cd620004SJunchao Zhang     PetscInt       i,disp;
994fcc7397dSJunchao Zhang     if (!srcIdx) {
995fcc7397dSJunchao Zhang       ierr = PetscSFLinkUnpackDataWithMPIReduceLocal(sf,link,count,dstStart,dstIdx,dst,(const char*)src+srcStart*link->unitbytes,op);CHKERRQ(ierr);
996fcc7397dSJunchao Zhang     } else {
997cd620004SJunchao Zhang       for (i=0; i<count; i++) {
998fcc7397dSJunchao Zhang         disp = dstIdx? dstIdx[i] : dstStart + i;
999ffc4695bSBarry Smith         ierr = MPI_Reduce_local((const char*)src+srcIdx[i]*link->unitbytes,(char*)dst+disp*link->unitbytes,1,link->unit,op);CHKERRMPI(ierr);
1000fcc7397dSJunchao Zhang       }
1001cd620004SJunchao Zhang     }
1002cd620004SJunchao Zhang   }
1003b458e8f1SJose E. Roman   PetscFunctionReturn(0);
1004cd620004SJunchao Zhang #else
1005cd620004SJunchao Zhang   SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No unpacking reduction operation for this MPI_Op");
1006cd620004SJunchao Zhang #endif
1007cd620004SJunchao Zhang }
1008cd620004SJunchao Zhang 
1009cd620004SJunchao Zhang /*=============================================================================
1010cd620004SJunchao Zhang               Pack/Unpack/Fetch/Scatter routines
1011cd620004SJunchao Zhang  ============================================================================*/
1012cd620004SJunchao Zhang 
1013cd620004SJunchao Zhang /* Pack rootdata to rootbuf
1014cd620004SJunchao Zhang   Input Arguments:
1015cd620004SJunchao Zhang   + sf       - The SF this packing works on.
1016cd620004SJunchao Zhang   . link     - It gives the memtype of the roots and also provides root buffer.
1017cd620004SJunchao Zhang   . scope    - PETSCSF_LOCAL or PETSCSF_REMOTE. Note SF has the ability to do local and remote communications separately.
1018cd620004SJunchao Zhang   - rootdata - Where to read the roots.
1019cd620004SJunchao Zhang 
1020cd620004SJunchao Zhang   Notes:
1021cd620004SJunchao Zhang   When rootdata can be directly used as root buffer, the routine is almost a no-op. After the call, root data is
102271438e86SJunchao Zhang   in a place where the underlying MPI is ready to access (use_gpu_aware_mpi or not)
1023cd620004SJunchao Zhang  */
102471438e86SJunchao Zhang PetscErrorCode PetscSFLinkPackRootData_Private(PetscSF sf,PetscSFLink link,PetscSFScope scope,const void *rootdata)
1025cd620004SJunchao Zhang {
1026cd620004SJunchao Zhang   PetscErrorCode   ierr;
1027cd620004SJunchao Zhang   const PetscInt   *rootindices = NULL;
1028cd620004SJunchao Zhang   PetscInt         count,start;
1029fcc7397dSJunchao Zhang   PetscErrorCode   (*Pack)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*) = NULL;
1030cd620004SJunchao Zhang   PetscMemType     rootmtype = link->rootmtype;
1031fcc7397dSJunchao Zhang   PetscSFPackOpt   opt = NULL;
1032fcc7397dSJunchao Zhang 
1033cd620004SJunchao Zhang   PetscFunctionBegin;
1034cd620004SJunchao Zhang   ierr = PetscLogEventBegin(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr);
103571438e86SJunchao Zhang   if (!link->rootdirect[scope]) { /* If rootdata works directly as rootbuf, skip packing */
1036fcc7397dSJunchao Zhang     ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,scope,&count,&start,&opt,&rootindices);CHKERRQ(ierr);
1037cd620004SJunchao Zhang     ierr = PetscSFLinkGetPack(link,rootmtype,&Pack);CHKERRQ(ierr);
1038fcc7397dSJunchao Zhang     ierr = (*Pack)(link,count,start,opt,rootindices,rootdata,link->rootbuf[scope][rootmtype]);CHKERRQ(ierr);
1039cd620004SJunchao Zhang   }
1040cd620004SJunchao Zhang   ierr = PetscLogEventEnd(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr);
1041cd620004SJunchao Zhang   PetscFunctionReturn(0);
1042cd620004SJunchao Zhang }
1043cd620004SJunchao Zhang 
1044cd620004SJunchao Zhang /* Pack leafdata to leafbuf */
104571438e86SJunchao Zhang PetscErrorCode PetscSFLinkPackLeafData_Private(PetscSF sf,PetscSFLink link,PetscSFScope scope,const void *leafdata)
1046cd620004SJunchao Zhang {
1047cd620004SJunchao Zhang   PetscErrorCode   ierr;
1048cd620004SJunchao Zhang   const PetscInt   *leafindices = NULL;
1049cd620004SJunchao Zhang   PetscInt         count,start;
1050fcc7397dSJunchao Zhang   PetscErrorCode   (*Pack)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*) = NULL;
1051cd620004SJunchao Zhang   PetscMemType     leafmtype = link->leafmtype;
1052fcc7397dSJunchao Zhang   PetscSFPackOpt   opt = NULL;
1053cd620004SJunchao Zhang 
1054cd620004SJunchao Zhang   PetscFunctionBegin;
1055cd620004SJunchao Zhang   ierr = PetscLogEventBegin(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr);
105671438e86SJunchao Zhang   if (!link->leafdirect[scope]) { /* If leafdata works directly as rootbuf, skip packing */
1057fcc7397dSJunchao Zhang     ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,scope,&count,&start,&opt,&leafindices);CHKERRQ(ierr);
1058cd620004SJunchao Zhang     ierr = PetscSFLinkGetPack(link,leafmtype,&Pack);CHKERRQ(ierr);
1059fcc7397dSJunchao Zhang     ierr = (*Pack)(link,count,start,opt,leafindices,leafdata,link->leafbuf[scope][leafmtype]);CHKERRQ(ierr);
1060cd620004SJunchao Zhang   }
1061cd620004SJunchao Zhang   ierr = PetscLogEventEnd(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr);
1062cd620004SJunchao Zhang   PetscFunctionReturn(0);
1063cd620004SJunchao Zhang }
1064cd620004SJunchao Zhang 
106571438e86SJunchao Zhang /* Pack rootdata to rootbuf, which are in the same memory space */
106671438e86SJunchao Zhang PetscErrorCode PetscSFLinkPackRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,const void *rootdata)
106771438e86SJunchao Zhang {
106871438e86SJunchao Zhang   PetscErrorCode   ierr;
106971438e86SJunchao Zhang   PetscSF_Basic    *bas = (PetscSF_Basic*)sf->data;
107071438e86SJunchao Zhang 
107171438e86SJunchao Zhang   PetscFunctionBegin;
107271438e86SJunchao Zhang   if (scope == PETSCSF_REMOTE) { /* Sync the device if rootdata is not on petsc default stream */
107371438e86SJunchao Zhang     if (PetscMemTypeDevice(link->rootmtype) && link->SyncDevice && sf->unknown_input_stream) {ierr = (*link->SyncDevice)(link);CHKERRQ(ierr);}
107471438e86SJunchao Zhang     if (link->PrePack) {ierr = (*link->PrePack)(sf,link,PETSCSF_ROOT2LEAF);CHKERRQ(ierr);} /* Used by SF nvshmem */
107571438e86SJunchao Zhang   }
107671438e86SJunchao Zhang   ierr = PetscLogEventBegin(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr);
107771438e86SJunchao Zhang   if (bas->rootbuflen[scope]) {ierr = PetscSFLinkPackRootData_Private(sf,link,scope,rootdata);CHKERRQ(ierr);}
107871438e86SJunchao Zhang   ierr = PetscLogEventEnd(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr);
107971438e86SJunchao Zhang   PetscFunctionReturn(0);
108071438e86SJunchao Zhang }
108171438e86SJunchao Zhang /* Pack leafdata to leafbuf, which are in the same memory space */
108271438e86SJunchao Zhang PetscErrorCode PetscSFLinkPackLeafData(PetscSF sf,PetscSFLink link,PetscSFScope scope,const void *leafdata)
108371438e86SJunchao Zhang {
108471438e86SJunchao Zhang   PetscErrorCode   ierr;
108571438e86SJunchao Zhang 
108671438e86SJunchao Zhang   PetscFunctionBegin;
108771438e86SJunchao Zhang   if (scope == PETSCSF_REMOTE) {
108871438e86SJunchao Zhang     if (PetscMemTypeDevice(link->leafmtype) && link->SyncDevice && sf->unknown_input_stream) {ierr = (*link->SyncDevice)(link);CHKERRQ(ierr);}
108971438e86SJunchao Zhang     if (link->PrePack) {ierr = (*link->PrePack)(sf,link,PETSCSF_LEAF2ROOT);CHKERRQ(ierr);}  /* Used by SF nvshmem */
109071438e86SJunchao Zhang   }
109171438e86SJunchao Zhang   ierr = PetscLogEventBegin(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr);
109271438e86SJunchao Zhang   if (sf->leafbuflen[scope]) {ierr = PetscSFLinkPackLeafData_Private(sf,link,scope,leafdata);CHKERRQ(ierr);}
109371438e86SJunchao Zhang   ierr = PetscLogEventEnd(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr);
109471438e86SJunchao Zhang   PetscFunctionReturn(0);
109571438e86SJunchao Zhang }
109671438e86SJunchao Zhang 
109771438e86SJunchao Zhang PetscErrorCode PetscSFLinkUnpackRootData_Private(PetscSF sf,PetscSFLink link,PetscSFScope scope,void *rootdata,MPI_Op op)
1098cd620004SJunchao Zhang {
1099cd620004SJunchao Zhang   PetscErrorCode   ierr;
1100cd620004SJunchao Zhang   const PetscInt   *rootindices = NULL;
1101cd620004SJunchao Zhang   PetscInt         count,start;
1102cd620004SJunchao Zhang   PetscSF_Basic    *bas = (PetscSF_Basic*)sf->data;
1103fcc7397dSJunchao Zhang   PetscErrorCode   (*UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*) = NULL;
1104cd620004SJunchao Zhang   PetscMemType     rootmtype = link->rootmtype;
1105fcc7397dSJunchao Zhang   PetscSFPackOpt   opt = NULL;
1106cd620004SJunchao Zhang 
1107cd620004SJunchao Zhang   PetscFunctionBegin;
110871438e86SJunchao Zhang   if (!link->rootdirect[scope]) { /* If rootdata works directly as rootbuf, skip unpacking */
1109cd620004SJunchao Zhang     ierr = PetscSFLinkGetUnpackAndOp(link,rootmtype,op,bas->rootdups[scope],&UnpackAndOp);CHKERRQ(ierr);
1110cd620004SJunchao Zhang     if (UnpackAndOp) {
1111fcc7397dSJunchao Zhang       ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,scope,&count,&start,&opt,&rootindices);CHKERRQ(ierr);
1112fcc7397dSJunchao Zhang       ierr = (*UnpackAndOp)(link,count,start,opt,rootindices,rootdata,link->rootbuf[scope][rootmtype]);CHKERRQ(ierr);
1113cd620004SJunchao Zhang     } else {
1114fcc7397dSJunchao Zhang       ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,scope,&count,&start,&opt,&rootindices);CHKERRQ(ierr);
1115cd620004SJunchao Zhang       ierr = PetscSFLinkUnpackDataWithMPIReduceLocal(sf,link,count,start,rootindices,rootdata,link->rootbuf[scope][rootmtype],op);CHKERRQ(ierr);
1116cd620004SJunchao Zhang     }
1117cd620004SJunchao Zhang   }
1118cd620004SJunchao Zhang   ierr = PetscSFLinkLogFlopsAfterUnpackRootData(sf,link,scope,op);CHKERRQ(ierr);
1119cd620004SJunchao Zhang   PetscFunctionReturn(0);
1120cd620004SJunchao Zhang }
1121cd620004SJunchao Zhang 
112271438e86SJunchao Zhang PetscErrorCode PetscSFLinkUnpackLeafData_Private(PetscSF sf,PetscSFLink link,PetscSFScope scope,void *leafdata,MPI_Op op)
1123cd620004SJunchao Zhang {
1124cd620004SJunchao Zhang   PetscErrorCode   ierr;
1125cd620004SJunchao Zhang   const PetscInt   *leafindices = NULL;
1126cd620004SJunchao Zhang   PetscInt         count,start;
1127fcc7397dSJunchao Zhang   PetscErrorCode   (*UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*) = NULL;
1128cd620004SJunchao Zhang   PetscMemType     leafmtype = link->leafmtype;
1129fcc7397dSJunchao Zhang   PetscSFPackOpt   opt = NULL;
1130cd620004SJunchao Zhang 
1131cd620004SJunchao Zhang   PetscFunctionBegin;
113271438e86SJunchao Zhang   if (!link->leafdirect[scope]) { /* If leafdata works directly as rootbuf, skip unpacking */
1133cd620004SJunchao Zhang     ierr = PetscSFLinkGetUnpackAndOp(link,leafmtype,op,sf->leafdups[scope],&UnpackAndOp);CHKERRQ(ierr);
1134cd620004SJunchao Zhang     if (UnpackAndOp) {
1135fcc7397dSJunchao Zhang       ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,scope,&count,&start,&opt,&leafindices);CHKERRQ(ierr);
1136fcc7397dSJunchao Zhang       ierr = (*UnpackAndOp)(link,count,start,opt,leafindices,leafdata,link->leafbuf[scope][leafmtype]);CHKERRQ(ierr);
1137cd620004SJunchao Zhang     } else {
1138fcc7397dSJunchao Zhang       ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,scope,&count,&start,&opt,&leafindices);CHKERRQ(ierr);
1139cd620004SJunchao Zhang       ierr = PetscSFLinkUnpackDataWithMPIReduceLocal(sf,link,count,start,leafindices,leafdata,link->leafbuf[scope][leafmtype],op);CHKERRQ(ierr);
1140cd620004SJunchao Zhang     }
1141cd620004SJunchao Zhang   }
1142cd620004SJunchao Zhang   ierr = PetscSFLinkLogFlopsAfterUnpackLeafData(sf,link,scope,op);CHKERRQ(ierr);
114371438e86SJunchao Zhang   PetscFunctionReturn(0);
114471438e86SJunchao Zhang }
114571438e86SJunchao Zhang /* Unpack rootbuf to rootdata, which are in the same memory space */
114671438e86SJunchao Zhang PetscErrorCode PetscSFLinkUnpackRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,void *rootdata,MPI_Op op)
114771438e86SJunchao Zhang {
114871438e86SJunchao Zhang   PetscErrorCode   ierr;
114971438e86SJunchao Zhang   PetscSF_Basic    *bas = (PetscSF_Basic*)sf->data;
115071438e86SJunchao Zhang 
115171438e86SJunchao Zhang   PetscFunctionBegin;
115271438e86SJunchao Zhang   ierr = PetscLogEventBegin(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr);
115371438e86SJunchao Zhang   if (bas->rootbuflen[scope]) {ierr = PetscSFLinkUnpackRootData_Private(sf,link,scope,rootdata,op);CHKERRQ(ierr);}
1154cd620004SJunchao Zhang   ierr = PetscLogEventEnd(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr);
115571438e86SJunchao Zhang   if (scope == PETSCSF_REMOTE) {
115671438e86SJunchao Zhang     if (link->PostUnpack) {ierr = (*link->PostUnpack)(sf,link,PETSCSF_LEAF2ROOT);CHKERRQ(ierr);}  /* Used by SF nvshmem */
115771438e86SJunchao Zhang     if (PetscMemTypeDevice(link->rootmtype) && link->SyncDevice && sf->unknown_input_stream) {ierr = (*link->SyncDevice)(link);CHKERRQ(ierr);}
115871438e86SJunchao Zhang   }
1159cd620004SJunchao Zhang   PetscFunctionReturn(0);
1160cd620004SJunchao Zhang }
1161cd620004SJunchao Zhang 
116271438e86SJunchao Zhang /* Unpack leafbuf to leafdata for remote (common case) or local (rare case when rootmtype != leafmtype) */
116371438e86SJunchao Zhang PetscErrorCode PetscSFLinkUnpackLeafData(PetscSF sf,PetscSFLink link,PetscSFScope scope,void *leafdata,MPI_Op op)
116471438e86SJunchao Zhang {
116571438e86SJunchao Zhang   PetscErrorCode   ierr;
116671438e86SJunchao Zhang 
116771438e86SJunchao Zhang   PetscFunctionBegin;
116871438e86SJunchao Zhang   ierr = PetscLogEventBegin(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr);
116971438e86SJunchao Zhang   if (sf->leafbuflen[scope]) {ierr = PetscSFLinkUnpackLeafData_Private(sf,link,scope,leafdata,op);}
117071438e86SJunchao Zhang   ierr = PetscLogEventEnd(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr);
117171438e86SJunchao Zhang   if (scope == PETSCSF_REMOTE) {
117271438e86SJunchao Zhang     if (link->PostUnpack) {ierr = (*link->PostUnpack)(sf,link,PETSCSF_ROOT2LEAF);CHKERRQ(ierr);}  /* Used by SF nvshmem */
117371438e86SJunchao Zhang     if (PetscMemTypeDevice(link->leafmtype) && link->SyncDevice && sf->unknown_input_stream) {ierr = (*link->SyncDevice)(link);CHKERRQ(ierr);}
117471438e86SJunchao Zhang   }
117571438e86SJunchao Zhang   PetscFunctionReturn(0);
117671438e86SJunchao Zhang }
117771438e86SJunchao Zhang 
117871438e86SJunchao Zhang /* FetchAndOp rootdata with rootbuf, it is a kind of Unpack on rootdata, except it also updates rootbuf */
117971438e86SJunchao Zhang PetscErrorCode PetscSFLinkFetchAndOpRemote(PetscSF sf,PetscSFLink link,void *rootdata,MPI_Op op)
1180cd620004SJunchao Zhang {
1181cd620004SJunchao Zhang   PetscErrorCode     ierr;
1182cd620004SJunchao Zhang   const PetscInt     *rootindices = NULL;
1183cd620004SJunchao Zhang   PetscInt           count,start;
1184cd620004SJunchao Zhang   PetscSF_Basic      *bas = (PetscSF_Basic*)sf->data;
1185fcc7397dSJunchao Zhang   PetscErrorCode     (*FetchAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,void*) = NULL;
1186cd620004SJunchao Zhang   PetscMemType       rootmtype = link->rootmtype;
1187fcc7397dSJunchao Zhang   PetscSFPackOpt     opt = NULL;
1188cd620004SJunchao Zhang 
1189cd620004SJunchao Zhang   PetscFunctionBegin;
1190cd620004SJunchao Zhang   ierr = PetscLogEventBegin(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr);
119171438e86SJunchao Zhang   if (bas->rootbuflen[PETSCSF_REMOTE]) {
1192cd620004SJunchao Zhang     /* Do FetchAndOp on rootdata with rootbuf */
119371438e86SJunchao Zhang     ierr = PetscSFLinkGetFetchAndOp(link,rootmtype,op,bas->rootdups[PETSCSF_REMOTE],&FetchAndOp);CHKERRQ(ierr);
119471438e86SJunchao Zhang     ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,PETSCSF_REMOTE,&count,&start,&opt,&rootindices);CHKERRQ(ierr);
119571438e86SJunchao Zhang     ierr = (*FetchAndOp)(link,count,start,opt,rootindices,rootdata,link->rootbuf[PETSCSF_REMOTE][rootmtype]);CHKERRQ(ierr);
1196cd620004SJunchao Zhang   }
119771438e86SJunchao Zhang   ierr = PetscSFLinkLogFlopsAfterUnpackRootData(sf,link,PETSCSF_REMOTE,op);CHKERRQ(ierr);
1198cd620004SJunchao Zhang   ierr = PetscLogEventEnd(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr);
1199cd620004SJunchao Zhang   PetscFunctionReturn(0);
1200cd620004SJunchao Zhang }
1201cd620004SJunchao Zhang 
120271438e86SJunchao Zhang PetscErrorCode PetscSFLinkScatterLocal(PetscSF sf,PetscSFLink link,PetscSFDirection direction,void *rootdata,void *leafdata,MPI_Op op)
1203cd620004SJunchao Zhang {
1204cd620004SJunchao Zhang   PetscErrorCode       ierr;
1205cd620004SJunchao Zhang   const PetscInt       *rootindices = NULL,*leafindices = NULL;
1206cd620004SJunchao Zhang   PetscInt             count,rootstart,leafstart;
1207cd620004SJunchao Zhang   PetscSF_Basic        *bas = (PetscSF_Basic*)sf->data;
1208fcc7397dSJunchao Zhang   PetscErrorCode       (*ScatterAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*) = NULL;
120971438e86SJunchao Zhang   PetscMemType         rootmtype = link->rootmtype,leafmtype = link->leafmtype,srcmtype,dstmtype;
1210fcc7397dSJunchao Zhang   PetscSFPackOpt       leafopt = NULL,rootopt = NULL;
121171438e86SJunchao Zhang   PetscInt             buflen = sf->leafbuflen[PETSCSF_LOCAL];
121271438e86SJunchao Zhang   char                 *srcbuf = NULL,*dstbuf = NULL;
121371438e86SJunchao Zhang   PetscBool            dstdups;
1214cd620004SJunchao Zhang 
1215*362febeeSStefano Zampini   PetscFunctionBegin;
121671438e86SJunchao Zhang   if (!buflen) PetscFunctionReturn(0);
121771438e86SJunchao Zhang   if (rootmtype != leafmtype) { /* The cross memory space local scatter is done by pack, copy and unpack */
121871438e86SJunchao Zhang     if (direction == PETSCSF_ROOT2LEAF) {
1219cd620004SJunchao Zhang       ierr     = PetscSFLinkPackRootData(sf,link,PETSCSF_LOCAL,rootdata);CHKERRQ(ierr);
122071438e86SJunchao Zhang       srcmtype = rootmtype;
122171438e86SJunchao Zhang       srcbuf   = link->rootbuf[PETSCSF_LOCAL][rootmtype];
122271438e86SJunchao Zhang       dstmtype = leafmtype;
122371438e86SJunchao Zhang       dstbuf   = link->leafbuf[PETSCSF_LOCAL][leafmtype];
122471438e86SJunchao Zhang     } else {
122571438e86SJunchao Zhang       ierr     = PetscSFLinkPackLeafData(sf,link,PETSCSF_LOCAL,leafdata);CHKERRQ(ierr);
122671438e86SJunchao Zhang       srcmtype = leafmtype;
122771438e86SJunchao Zhang       srcbuf   = link->leafbuf[PETSCSF_LOCAL][leafmtype];
122871438e86SJunchao Zhang       dstmtype = rootmtype;
122971438e86SJunchao Zhang       dstbuf   = link->rootbuf[PETSCSF_LOCAL][rootmtype];
123071438e86SJunchao Zhang     }
123171438e86SJunchao Zhang     ierr = (*link->Memcpy)(link,dstmtype,dstbuf,srcmtype,srcbuf,buflen*link->unitbytes);CHKERRQ(ierr);
123271438e86SJunchao Zhang     /* If above is a device to host copy, we have to sync the stream before accessing the buffer on host */
123371438e86SJunchao Zhang     if (PetscMemTypeHost(dstmtype)) {ierr = (*link->SyncStream)(link);CHKERRQ(ierr);}
123471438e86SJunchao Zhang     if (direction == PETSCSF_ROOT2LEAF) {
1235cd620004SJunchao Zhang       ierr = PetscSFLinkUnpackLeafData(sf,link,PETSCSF_LOCAL,leafdata,op);CHKERRQ(ierr);
1236cd620004SJunchao Zhang     } else {
123771438e86SJunchao Zhang       ierr = PetscSFLinkUnpackRootData(sf,link,PETSCSF_LOCAL,rootdata,op);CHKERRQ(ierr);
123871438e86SJunchao Zhang     }
123971438e86SJunchao Zhang   } else {
124071438e86SJunchao Zhang     dstdups  = (direction == PETSCSF_ROOT2LEAF) ? sf->leafdups[PETSCSF_LOCAL] : bas->rootdups[PETSCSF_LOCAL];
124171438e86SJunchao Zhang     dstmtype = (direction == PETSCSF_ROOT2LEAF) ? link->leafmtype : link->rootmtype;
124271438e86SJunchao Zhang     ierr = PetscSFLinkGetScatterAndOp(link,dstmtype,op,dstdups,&ScatterAndOp);CHKERRQ(ierr);
1243cd620004SJunchao Zhang     if (ScatterAndOp) {
1244fcc7397dSJunchao Zhang       ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,PETSCSF_LOCAL,&count,&rootstart,&rootopt,&rootindices);CHKERRQ(ierr);
1245fcc7397dSJunchao Zhang       ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,PETSCSF_LOCAL,&count,&leafstart,&leafopt,&leafindices);CHKERRQ(ierr);
124671438e86SJunchao Zhang       if (direction == PETSCSF_ROOT2LEAF) {
1247fcc7397dSJunchao Zhang         ierr = (*ScatterAndOp)(link,count,rootstart,rootopt,rootindices,rootdata,leafstart,leafopt,leafindices,leafdata);CHKERRQ(ierr);
1248cd620004SJunchao Zhang       } else {
1249fcc7397dSJunchao Zhang         ierr = (*ScatterAndOp)(link,count,leafstart,leafopt,leafindices,leafdata,rootstart,rootopt,rootindices,rootdata);CHKERRQ(ierr);
125071438e86SJunchao Zhang       }
1251cd620004SJunchao Zhang     } else {
1252fcc7397dSJunchao Zhang       ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,PETSCSF_LOCAL,&count,&rootstart,&rootopt,&rootindices);CHKERRQ(ierr);
1253fcc7397dSJunchao Zhang       ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,PETSCSF_LOCAL,&count,&leafstart,&leafopt,&leafindices);CHKERRQ(ierr);
125471438e86SJunchao Zhang       if (direction == PETSCSF_ROOT2LEAF) {
125571438e86SJunchao Zhang         ierr = PetscSFLinkScatterDataWithMPIReduceLocal(sf,link,count,rootstart,rootindices,rootdata,leafstart,leafindices,leafdata,op);CHKERRQ(ierr);
125671438e86SJunchao Zhang       } else {
1257fcc7397dSJunchao Zhang         ierr = PetscSFLinkScatterDataWithMPIReduceLocal(sf,link,count,leafstart,leafindices,leafdata,rootstart,rootindices,rootdata,op);CHKERRQ(ierr);
1258cd620004SJunchao Zhang       }
1259cd620004SJunchao Zhang     }
126071438e86SJunchao Zhang   }
1261cd620004SJunchao Zhang   PetscFunctionReturn(0);
1262cd620004SJunchao Zhang }
1263cd620004SJunchao Zhang 
1264cd620004SJunchao Zhang /* Fetch rootdata to leafdata and leafupdate locally */
1265cd620004SJunchao Zhang PetscErrorCode PetscSFLinkFetchAndOpLocal(PetscSF sf,PetscSFLink link,void *rootdata,const void *leafdata,void *leafupdate,MPI_Op op)
1266cd620004SJunchao Zhang {
1267cd620004SJunchao Zhang   PetscErrorCode       ierr;
1268cd620004SJunchao Zhang   const PetscInt       *rootindices = NULL,*leafindices = NULL;
1269cd620004SJunchao Zhang   PetscInt             count,rootstart,leafstart;
1270cd620004SJunchao Zhang   PetscSF_Basic        *bas = (PetscSF_Basic*)sf->data;
1271fcc7397dSJunchao Zhang   PetscErrorCode       (*FetchAndOpLocal)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*) = NULL;
1272cd620004SJunchao Zhang   const PetscMemType   rootmtype = link->rootmtype,leafmtype = link->leafmtype;
1273fcc7397dSJunchao Zhang   PetscSFPackOpt       leafopt = NULL,rootopt = NULL;
1274cd620004SJunchao Zhang 
1275cd620004SJunchao Zhang   PetscFunctionBegin;
1276cd620004SJunchao Zhang   if (!bas->rootbuflen[PETSCSF_LOCAL]) PetscFunctionReturn(0);
1277cd620004SJunchao Zhang   if (rootmtype != leafmtype) {
1278cd620004SJunchao Zhang     /* The local communication has to go through pack and unpack */
1279cd620004SJunchao Zhang     SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Doing PetscSFFetchAndOp with rootdata and leafdata on opposite side of CPU and GPU");
1280cd620004SJunchao Zhang   } else {
1281fcc7397dSJunchao Zhang     ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,PETSCSF_LOCAL,&count,&rootstart,&rootopt,&rootindices);CHKERRQ(ierr);
1282fcc7397dSJunchao Zhang     ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,PETSCSF_LOCAL,&count,&leafstart,&leafopt,&leafindices);CHKERRQ(ierr);
1283cd620004SJunchao Zhang     ierr = PetscSFLinkGetFetchAndOpLocal(link,rootmtype,op,bas->rootdups[PETSCSF_LOCAL],&FetchAndOpLocal);CHKERRQ(ierr);
1284fcc7397dSJunchao Zhang     ierr = (*FetchAndOpLocal)(link,count,rootstart,rootopt,rootindices,rootdata,leafstart,leafopt,leafindices,leafdata,leafupdate);CHKERRQ(ierr);
1285cd620004SJunchao Zhang   }
128640e23c03SJunchao Zhang   PetscFunctionReturn(0);
128740e23c03SJunchao Zhang }
128840e23c03SJunchao Zhang 
128940e23c03SJunchao Zhang /*
1290cd620004SJunchao Zhang   Create per-rank pack/unpack optimizations based on indice patterns
129140e23c03SJunchao Zhang 
129240e23c03SJunchao Zhang    Input Parameters:
1293fcc7397dSJunchao Zhang   +  n       - Number of destination ranks
1294eb02082bSJunchao Zhang   .  offset  - [n+1] For the i-th rank, its associated indices are idx[offset[i], offset[i+1]). offset[0] needs not to be 0.
1295b23bfdefSJunchao Zhang   -  idx     - [*]   Array storing indices
129640e23c03SJunchao Zhang 
129740e23c03SJunchao Zhang    Output Parameters:
1298cd620004SJunchao Zhang   +  opt     - Pack optimizations. NULL if no optimizations.
129940e23c03SJunchao Zhang */
1300cd620004SJunchao Zhang PetscErrorCode PetscSFCreatePackOpt(PetscInt n,const PetscInt *offset,const PetscInt *idx,PetscSFPackOpt *out)
130140e23c03SJunchao Zhang {
130240e23c03SJunchao Zhang   PetscErrorCode ierr;
1303fcc7397dSJunchao Zhang   PetscInt       r,p,start,i,j,k,dx,dy,dz,dydz,m,X,Y;
1304fcc7397dSJunchao Zhang   PetscBool      optimizable = PETSC_TRUE;
130540e23c03SJunchao Zhang   PetscSFPackOpt opt;
130640e23c03SJunchao Zhang 
130740e23c03SJunchao Zhang   PetscFunctionBegin;
1308fcc7397dSJunchao Zhang   ierr   = PetscMalloc1(1,&opt);CHKERRQ(ierr);
1309fcc7397dSJunchao Zhang   ierr   = PetscMalloc1(7*n+2,&opt->array);CHKERRQ(ierr);
1310fcc7397dSJunchao Zhang   opt->n      = opt->array[0] = n;
1311fcc7397dSJunchao Zhang   opt->offset = opt->array + 1;
1312fcc7397dSJunchao Zhang   opt->start  = opt->array + n   + 2;
1313fcc7397dSJunchao Zhang   opt->dx     = opt->array + 2*n + 2;
1314fcc7397dSJunchao Zhang   opt->dy     = opt->array + 3*n + 2;
1315fcc7397dSJunchao Zhang   opt->dz     = opt->array + 4*n + 2;
1316fcc7397dSJunchao Zhang   opt->X      = opt->array + 5*n + 2;
1317fcc7397dSJunchao Zhang   opt->Y      = opt->array + 6*n + 2;
1318fcc7397dSJunchao Zhang 
1319fcc7397dSJunchao Zhang   for (r=0; r<n; r++) { /* For each destination rank */
1320fcc7397dSJunchao Zhang     m     = offset[r+1] - offset[r]; /* Total number of indices for this rank. We want to see if m can be factored into dx*dy*dz */
1321fcc7397dSJunchao Zhang     p     = offset[r];
1322fcc7397dSJunchao Zhang     start = idx[p]; /* First index for this rank */
1323fcc7397dSJunchao Zhang     p++;
1324fcc7397dSJunchao Zhang 
1325fcc7397dSJunchao Zhang     /* Search in X dimension */
1326fcc7397dSJunchao Zhang     for (dx=1; dx<m; dx++,p++) {
1327fcc7397dSJunchao Zhang       if (start+dx != idx[p]) break;
1328b23bfdefSJunchao Zhang     }
1329b23bfdefSJunchao Zhang 
1330fcc7397dSJunchao Zhang     dydz = m/dx;
1331fcc7397dSJunchao Zhang     X    = dydz > 1 ? (idx[p]-start) : dx;
1332fcc7397dSJunchao Zhang     /* Not optimizable if m is not a multiple of dx, or some unrecognized pattern is found */
1333fcc7397dSJunchao Zhang     if (m%dx || X <= 0) {optimizable = PETSC_FALSE; goto finish;}
1334fcc7397dSJunchao Zhang     for (dy=1; dy<dydz; dy++) { /* Search in Y dimension */
1335fcc7397dSJunchao Zhang       for (i=0; i<dx; i++,p++) {
1336fcc7397dSJunchao Zhang         if (start+X*dy+i != idx[p]) {
1337fcc7397dSJunchao Zhang           if (i) {optimizable = PETSC_FALSE; goto finish;} /* The pattern is violated in the middle of an x-walk */
1338fcc7397dSJunchao Zhang           else goto Z_dimension;
133940e23c03SJunchao Zhang         }
134040e23c03SJunchao Zhang       }
134140e23c03SJunchao Zhang     }
134240e23c03SJunchao Zhang 
1343fcc7397dSJunchao Zhang Z_dimension:
1344fcc7397dSJunchao Zhang     dz = m/(dx*dy);
1345fcc7397dSJunchao Zhang     Y  = dz > 1 ? (idx[p]-start)/X : dy;
1346fcc7397dSJunchao Zhang     /* Not optimizable if m is not a multiple of dx*dy, or some unrecognized pattern is found */
1347fcc7397dSJunchao Zhang     if (m%(dx*dy) || Y <= 0) {optimizable = PETSC_FALSE; goto finish;}
1348fcc7397dSJunchao Zhang     for (k=1; k<dz; k++) { /* Go through Z dimension to see if remaining indices follow the pattern */
1349fcc7397dSJunchao Zhang       for (j=0; j<dy; j++) {
1350fcc7397dSJunchao Zhang         for (i=0; i<dx; i++,p++) {
1351fcc7397dSJunchao Zhang           if (start+X*Y*k+X*j+i != idx[p]) {optimizable = PETSC_FALSE; goto finish;}
135240e23c03SJunchao Zhang         }
135340e23c03SJunchao Zhang       }
135440e23c03SJunchao Zhang     }
1355fcc7397dSJunchao Zhang     opt->start[r] = start;
1356fcc7397dSJunchao Zhang     opt->dx[r]    = dx;
1357fcc7397dSJunchao Zhang     opt->dy[r]    = dy;
1358fcc7397dSJunchao Zhang     opt->dz[r]    = dz;
1359fcc7397dSJunchao Zhang     opt->X[r]     = X;
1360fcc7397dSJunchao Zhang     opt->Y[r]     = Y;
136140e23c03SJunchao Zhang   }
136240e23c03SJunchao Zhang 
1363fcc7397dSJunchao Zhang finish:
1364fcc7397dSJunchao Zhang   /* If not optimizable, free arrays to save memory */
1365fcc7397dSJunchao Zhang   if (!n || !optimizable) {
1366fcc7397dSJunchao Zhang     ierr = PetscFree(opt->array);CHKERRQ(ierr);
136740e23c03SJunchao Zhang     ierr = PetscFree(opt);CHKERRQ(ierr);
136840e23c03SJunchao Zhang     *out = NULL;
1369fcc7397dSJunchao Zhang   } else {
1370fcc7397dSJunchao Zhang     opt->offset[0] = 0;
1371fcc7397dSJunchao Zhang     for (r=0; r<n; r++) opt->offset[r+1] = opt->offset[r] + opt->dx[r]*opt->dy[r]*opt->dz[r];
1372fcc7397dSJunchao Zhang     *out = opt;
1373fcc7397dSJunchao Zhang   }
137440e23c03SJunchao Zhang   PetscFunctionReturn(0);
137540e23c03SJunchao Zhang }
137640e23c03SJunchao Zhang 
137720c24465SJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFDestroyPackOpt(PetscSF sf,PetscMemType mtype,PetscSFPackOpt *out)
137840e23c03SJunchao Zhang {
137940e23c03SJunchao Zhang   PetscErrorCode ierr;
138040e23c03SJunchao Zhang   PetscSFPackOpt opt = *out;
138140e23c03SJunchao Zhang 
138240e23c03SJunchao Zhang   PetscFunctionBegin;
138340e23c03SJunchao Zhang   if (opt) {
138420c24465SJunchao Zhang     ierr = PetscSFFree(sf,mtype,opt->array);CHKERRQ(ierr);
138540e23c03SJunchao Zhang     ierr = PetscFree(opt);CHKERRQ(ierr);
138640e23c03SJunchao Zhang     *out = NULL;
138740e23c03SJunchao Zhang   }
138840e23c03SJunchao Zhang   PetscFunctionReturn(0);
138940e23c03SJunchao Zhang }
1390cd620004SJunchao Zhang 
1391cd620004SJunchao Zhang PetscErrorCode PetscSFSetUpPackFields(PetscSF sf)
1392cd620004SJunchao Zhang {
1393cd620004SJunchao Zhang   PetscErrorCode ierr;
1394cd620004SJunchao Zhang   PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
1395cd620004SJunchao Zhang   PetscInt       i,j;
1396cd620004SJunchao Zhang 
1397cd620004SJunchao Zhang   PetscFunctionBegin;
1398cd620004SJunchao Zhang   /* [0] for PETSCSF_LOCAL and [1] for PETSCSF_REMOTE in the following */
1399cd620004SJunchao Zhang   for (i=0; i<2; i++) { /* Set defaults */
1400cd620004SJunchao Zhang     sf->leafstart[i]   = 0;
1401cd620004SJunchao Zhang     sf->leafcontig[i]  = PETSC_TRUE;
1402cd620004SJunchao Zhang     sf->leafdups[i]    = PETSC_FALSE;
1403cd620004SJunchao Zhang     bas->rootstart[i]  = 0;
1404cd620004SJunchao Zhang     bas->rootcontig[i] = PETSC_TRUE;
1405cd620004SJunchao Zhang     bas->rootdups[i]   = PETSC_FALSE;
1406cd620004SJunchao Zhang   }
1407cd620004SJunchao Zhang 
1408cd620004SJunchao Zhang   sf->leafbuflen[0] = sf->roffset[sf->ndranks];
1409cd620004SJunchao Zhang   sf->leafbuflen[1] = sf->roffset[sf->nranks] - sf->roffset[sf->ndranks];
1410cd620004SJunchao Zhang 
1411cd620004SJunchao Zhang   if (sf->leafbuflen[0]) sf->leafstart[0] = sf->rmine[0];
1412cd620004SJunchao Zhang   if (sf->leafbuflen[1]) sf->leafstart[1] = sf->rmine[sf->roffset[sf->ndranks]];
1413cd620004SJunchao Zhang 
1414cd620004SJunchao Zhang   /* Are leaf indices for self and remote contiguous? If yes, it is best for pack/unpack */
1415cd620004SJunchao Zhang   for (i=0; i<sf->roffset[sf->ndranks]; i++) { /* self */
1416cd620004SJunchao Zhang     if (sf->rmine[i] != sf->leafstart[0]+i) {sf->leafcontig[0] = PETSC_FALSE; break;}
1417cd620004SJunchao Zhang   }
1418cd620004SJunchao Zhang   for (i=sf->roffset[sf->ndranks],j=0; i<sf->roffset[sf->nranks]; i++,j++) { /* remote */
1419cd620004SJunchao Zhang     if (sf->rmine[i] != sf->leafstart[1]+j) {sf->leafcontig[1] = PETSC_FALSE; break;}
1420cd620004SJunchao Zhang   }
1421cd620004SJunchao Zhang 
1422cd620004SJunchao Zhang   /* If not, see if we can have per-rank optimizations by doing index analysis */
1423cd620004SJunchao Zhang   if (!sf->leafcontig[0]) {ierr = PetscSFCreatePackOpt(sf->ndranks,            sf->roffset,             sf->rmine, &sf->leafpackopt[0]);CHKERRQ(ierr);}
1424cd620004SJunchao Zhang   if (!sf->leafcontig[1]) {ierr = PetscSFCreatePackOpt(sf->nranks-sf->ndranks, sf->roffset+sf->ndranks, sf->rmine, &sf->leafpackopt[1]);CHKERRQ(ierr);}
1425cd620004SJunchao Zhang 
1426cd620004SJunchao Zhang   /* Are root indices for self and remote contiguous? */
1427cd620004SJunchao Zhang   bas->rootbuflen[0] = bas->ioffset[bas->ndiranks];
1428cd620004SJunchao Zhang   bas->rootbuflen[1] = bas->ioffset[bas->niranks] - bas->ioffset[bas->ndiranks];
1429cd620004SJunchao Zhang 
1430cd620004SJunchao Zhang   if (bas->rootbuflen[0]) bas->rootstart[0] = bas->irootloc[0];
1431cd620004SJunchao Zhang   if (bas->rootbuflen[1]) bas->rootstart[1] = bas->irootloc[bas->ioffset[bas->ndiranks]];
1432cd620004SJunchao Zhang 
1433cd620004SJunchao Zhang   for (i=0; i<bas->ioffset[bas->ndiranks]; i++) {
1434cd620004SJunchao Zhang     if (bas->irootloc[i] != bas->rootstart[0]+i) {bas->rootcontig[0] = PETSC_FALSE; break;}
1435cd620004SJunchao Zhang   }
1436cd620004SJunchao Zhang   for (i=bas->ioffset[bas->ndiranks],j=0; i<bas->ioffset[bas->niranks]; i++,j++) {
1437cd620004SJunchao Zhang     if (bas->irootloc[i] != bas->rootstart[1]+j) {bas->rootcontig[1] = PETSC_FALSE; break;}
1438cd620004SJunchao Zhang   }
1439cd620004SJunchao Zhang 
1440cd620004SJunchao Zhang   if (!bas->rootcontig[0]) {ierr = PetscSFCreatePackOpt(bas->ndiranks,              bas->ioffset,               bas->irootloc, &bas->rootpackopt[0]);CHKERRQ(ierr);}
1441cd620004SJunchao Zhang   if (!bas->rootcontig[1]) {ierr = PetscSFCreatePackOpt(bas->niranks-bas->ndiranks, bas->ioffset+bas->ndiranks, bas->irootloc, &bas->rootpackopt[1]);CHKERRQ(ierr);}
1442cd620004SJunchao Zhang 
14437fd2d3dbSJunchao Zhang  #if defined(PETSC_HAVE_DEVICE)
1444cd620004SJunchao Zhang     /* Check dups in indices so that CUDA unpacking kernels can use cheaper regular instructions instead of atomics when they know there are no data race chances */
1445013b3241SStefano Zampini   if (PetscDefined(HAVE_DEVICE)) {
1446013b3241SStefano Zampini     PetscBool ismulti = (sf->multi == sf) ? PETSC_TRUE : PETSC_FALSE;
1447013b3241SStefano Zampini     if (!sf->leafcontig[0]  && !ismulti) {ierr = PetscCheckDupsInt(sf->leafbuflen[0],  sf->rmine,                                 &sf->leafdups[0]);CHKERRQ(ierr);}
1448013b3241SStefano Zampini     if (!sf->leafcontig[1]  && !ismulti) {ierr = PetscCheckDupsInt(sf->leafbuflen[1],  sf->rmine+sf->roffset[sf->ndranks],        &sf->leafdups[1]);CHKERRQ(ierr);}
1449013b3241SStefano Zampini     if (!bas->rootcontig[0] && !ismulti) {ierr = PetscCheckDupsInt(bas->rootbuflen[0], bas->irootloc,                             &bas->rootdups[0]);CHKERRQ(ierr);}
1450013b3241SStefano Zampini     if (!bas->rootcontig[1] && !ismulti) {ierr = PetscCheckDupsInt(bas->rootbuflen[1], bas->irootloc+bas->ioffset[bas->ndiranks], &bas->rootdups[1]);CHKERRQ(ierr);}
1451013b3241SStefano Zampini   }
1452cd620004SJunchao Zhang #endif
1453cd620004SJunchao Zhang   PetscFunctionReturn(0);
1454cd620004SJunchao Zhang }
1455cd620004SJunchao Zhang 
1456cd620004SJunchao Zhang PetscErrorCode PetscSFResetPackFields(PetscSF sf)
1457cd620004SJunchao Zhang {
1458cd620004SJunchao Zhang   PetscErrorCode ierr;
1459cd620004SJunchao Zhang   PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
1460cd620004SJunchao Zhang   PetscInt       i;
1461cd620004SJunchao Zhang 
1462cd620004SJunchao Zhang   PetscFunctionBegin;
1463cd620004SJunchao Zhang   for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) {
146420c24465SJunchao Zhang     ierr = PetscSFDestroyPackOpt(sf,PETSC_MEMTYPE_HOST,&sf->leafpackopt[i]);CHKERRQ(ierr);
146520c24465SJunchao Zhang     ierr = PetscSFDestroyPackOpt(sf,PETSC_MEMTYPE_HOST,&bas->rootpackopt[i]);CHKERRQ(ierr);
14667fd2d3dbSJunchao Zhang    #if defined(PETSC_HAVE_DEVICE)
146720c24465SJunchao Zhang     ierr = PetscSFDestroyPackOpt(sf,PETSC_MEMTYPE_DEVICE,&sf->leafpackopt_d[i]);CHKERRQ(ierr);
146820c24465SJunchao Zhang     ierr = PetscSFDestroyPackOpt(sf,PETSC_MEMTYPE_DEVICE,&bas->rootpackopt_d[i]);CHKERRQ(ierr);
14697fd2d3dbSJunchao Zhang    #endif
1470cd620004SJunchao Zhang   }
1471cd620004SJunchao Zhang   PetscFunctionReturn(0);
1472cd620004SJunchao Zhang }
1473