xref: /petsc/src/vec/is/sf/impls/basic/sfpack.c (revision 83df288d894f7c58fad30274dc66446f849f17dd)
120c24465SJunchao Zhang #include "petsc/private/sfimpl.h"
240e23c03SJunchao Zhang #include <../src/vec/is/sf/impls/basic/sfpack.h>
340e23c03SJunchao Zhang #include <../src/vec/is/sf/impls/basic/sfbasic.h>
440e23c03SJunchao Zhang 
57fd2d3dbSJunchao Zhang /* This is a C file that contains packing facilities, with dispatches to device if enabled. */
67fd2d3dbSJunchao Zhang 
7eb02082bSJunchao Zhang #if defined(PETSC_HAVE_CUDA)
8eb02082bSJunchao Zhang #include <cuda_runtime.h>
97fd2d3dbSJunchao Zhang #include <petsccublas.h>
10eb02082bSJunchao Zhang #endif
1159af0bd3SScott Kruger #if defined(PETSC_HAVE_HIP)
1259af0bd3SScott Kruger #include <hip/hip_runtime.h>
1359af0bd3SScott Kruger #include <petschipblas.h>
1459af0bd3SScott Kruger #endif
1540e23c03SJunchao Zhang /*
1640e23c03SJunchao Zhang  * MPI_Reduce_local is not really useful because it can't handle sparse data and it vectorizes "in the wrong direction",
1740e23c03SJunchao Zhang  * therefore we pack data types manually. This file defines packing routines for the standard data types.
1840e23c03SJunchao Zhang  */
1940e23c03SJunchao Zhang 
20cd620004SJunchao Zhang #define CPPJoin4(a,b,c,d)  a##_##b##_##c##_##d
2140e23c03SJunchao Zhang 
22cd620004SJunchao Zhang /* Operations working like s += t */
23cd620004SJunchao Zhang #define OP_BINARY(op,s,t)   do {(s) = (s) op (t);  } while (0)      /* binary ops in the middle such as +, *, && etc. */
24cd620004SJunchao Zhang #define OP_FUNCTION(op,s,t) do {(s) = op((s),(t)); } while (0)      /* ops like a function, such as PetscMax, PetscMin */
25cd620004SJunchao Zhang #define OP_LXOR(op,s,t)     do {(s) = (!(s)) != (!(t));} while (0)  /* logical exclusive OR */
26cd620004SJunchao Zhang #define OP_ASSIGN(op,s,t)   do {(s) = (t);} while (0)
27cd620004SJunchao Zhang /* Ref MPI MAXLOC */
28cd620004SJunchao Zhang #define OP_XLOC(op,s,t) \
29cd620004SJunchao Zhang   do {                                       \
30cd620004SJunchao Zhang     if ((s).u == (t).u) (s).i = PetscMin((s).i,(t).i); \
31cd620004SJunchao Zhang     else if (!((s).u op (t).u)) s = t;           \
32cd620004SJunchao Zhang   } while (0)
3340e23c03SJunchao Zhang 
3440e23c03SJunchao Zhang /* DEF_PackFunc - macro defining a Pack routine
3540e23c03SJunchao Zhang 
3640e23c03SJunchao Zhang    Arguments of the macro:
37b23bfdefSJunchao Zhang    +Type      Type of the basic data in an entry, i.e., int, PetscInt, PetscReal etc. It is not the type of an entry.
38fcc7397dSJunchao Zhang    .BS        Block size for vectorization. It is a factor of bsz.
39b23bfdefSJunchao Zhang    -EQ        (bs == BS) ? 1 : 0. EQ is a compile-time const to help compiler optimizations. See below.
4040e23c03SJunchao Zhang 
4140e23c03SJunchao Zhang    Arguments of the Pack routine:
42cd620004SJunchao Zhang    +count     Number of indices in idx[].
43fcc7397dSJunchao Zhang    .start     When opt and idx are NULL, it means indices are contiguous & start is the first index; otherwise, not used.
44fcc7397dSJunchao Zhang    .opt       Per-pack optimization plan. NULL means no such plan.
45fcc7397dSJunchao Zhang    .idx       Indices of entries to packed.
46eb02082bSJunchao Zhang    .link      Provide a context for the current call, such as link->bs, number of basic types in an entry. Ex. if unit is MPI_2INT, then bs=2 and the basic type is int.
47cd620004SJunchao Zhang    .unpacked  Address of the unpacked data. The entries will be packed are unpacked[idx[i]],for i in [0,count).
48cd620004SJunchao Zhang    -packed    Address of the packed data.
4940e23c03SJunchao Zhang  */
50b23bfdefSJunchao Zhang #define DEF_PackFunc(Type,BS,EQ) \
51fcc7397dSJunchao Zhang   static PetscErrorCode CPPJoin4(Pack,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,const void *unpacked,void *packed) \
52b23bfdefSJunchao Zhang   {                                                                                                          \
5340e23c03SJunchao Zhang     PetscErrorCode ierr;                                                                                     \
54b23bfdefSJunchao Zhang     const Type     *u = (const Type*)unpacked,*u2;                                                           \
55b23bfdefSJunchao Zhang     Type           *p = (Type*)packed,*p2;                                                                   \
56fcc7397dSJunchao Zhang     PetscInt       i,j,k,X,Y,r,bs=link->bs;                                                                  \
57fcc7397dSJunchao Zhang     const PetscInt M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */          \
58b23bfdefSJunchao Zhang     const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */                \
5940e23c03SJunchao Zhang     PetscFunctionBegin;                                                                                      \
60fcc7397dSJunchao Zhang     if (!idx) {ierr = PetscArraycpy(p,u+start*MBS,MBS*count);CHKERRQ(ierr);}/* idx[] are contiguous */       \
61fcc7397dSJunchao Zhang     else if (opt) { /* has optimizations available */                                                        \
62fcc7397dSJunchao Zhang       p2 = p;                                                                                                \
63fcc7397dSJunchao Zhang       for (r=0; r<opt->n; r++) {                                                                             \
64fcc7397dSJunchao Zhang         u2 = u + opt->start[r]*MBS;                                                                          \
65fcc7397dSJunchao Zhang         X  = opt->X[r];                                                                                      \
66fcc7397dSJunchao Zhang         Y  = opt->Y[r];                                                                                      \
67fcc7397dSJunchao Zhang         for (k=0; k<opt->dz[r]; k++)                                                                         \
68fcc7397dSJunchao Zhang           for (j=0; j<opt->dy[r]; j++) {                                                                     \
69fcc7397dSJunchao Zhang             ierr = PetscArraycpy(p2,u2+(X*Y*k+X*j)*MBS,opt->dx[r]*MBS);CHKERRQ(ierr);                        \
70fcc7397dSJunchao Zhang             p2  += opt->dx[r]*MBS;                                                                           \
71fcc7397dSJunchao Zhang           }                                                                                                  \
72fcc7397dSJunchao Zhang       }                                                                                                      \
73fcc7397dSJunchao Zhang     } else {                                                                                                 \
74b23bfdefSJunchao Zhang       for (i=0; i<count; i++)                                                                                \
75eb02082bSJunchao Zhang         for (j=0; j<M; j++)     /* Decent compilers should eliminate this loop when M = const 1 */           \
76eb02082bSJunchao Zhang           for (k=0; k<BS; k++)  /* Compiler either unrolls (BS=1) or vectorizes (BS=2,4,8,etc) this loop */  \
77b23bfdefSJunchao Zhang             p[i*MBS+j*BS+k] = u[idx[i]*MBS+j*BS+k];                                                          \
7840e23c03SJunchao Zhang     }                                                                                                        \
7940e23c03SJunchao Zhang     PetscFunctionReturn(0);                                                                                  \
8040e23c03SJunchao Zhang   }
8140e23c03SJunchao Zhang 
82cd620004SJunchao Zhang /* DEF_Action - macro defining a UnpackAndInsert routine that unpacks data from a contiguous buffer
83cd620004SJunchao Zhang                 and inserts into a sparse array.
8440e23c03SJunchao Zhang 
8540e23c03SJunchao Zhang    Arguments:
86b23bfdefSJunchao Zhang   .Type       Type of the data
8740e23c03SJunchao Zhang   .BS         Block size for vectorization
88b23bfdefSJunchao Zhang   .EQ        (bs == BS) ? 1 : 0. EQ is a compile-time const.
8940e23c03SJunchao Zhang 
9040e23c03SJunchao Zhang   Notes:
9140e23c03SJunchao Zhang    This macro is not combined with DEF_ActionAndOp because we want to use memcpy in this macro.
9240e23c03SJunchao Zhang  */
93cd620004SJunchao Zhang #define DEF_UnpackFunc(Type,BS,EQ)               \
94fcc7397dSJunchao Zhang   static PetscErrorCode CPPJoin4(UnpackAndInsert,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,void *unpacked,const void *packed) \
95b23bfdefSJunchao Zhang   {                                                                                                          \
9640e23c03SJunchao Zhang     PetscErrorCode ierr;                                                                                     \
97b23bfdefSJunchao Zhang     Type           *u = (Type*)unpacked,*u2;                                                                 \
98fcc7397dSJunchao Zhang     const Type     *p = (const Type*)packed;                                                                 \
99fcc7397dSJunchao Zhang     PetscInt       i,j,k,X,Y,r,bs=link->bs;                                                                  \
100fcc7397dSJunchao Zhang     const PetscInt M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */          \
101b23bfdefSJunchao Zhang     const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */                \
10240e23c03SJunchao Zhang     PetscFunctionBegin;                                                                                      \
103b23bfdefSJunchao Zhang     if (!idx) {                                                                                              \
104fcc7397dSJunchao Zhang       u += start*MBS;                                                                                        \
105fcc7397dSJunchao Zhang       if (u != p) {ierr = PetscArraycpy(u,p,count*MBS);CHKERRQ(ierr);}                                       \
106fcc7397dSJunchao Zhang     } else if (opt) { /* has optimizations available */                                                      \
107fcc7397dSJunchao Zhang       for (r=0; r<opt->n; r++) {                                                                             \
108fcc7397dSJunchao Zhang         u2 = u + opt->start[r]*MBS;                                                                          \
109fcc7397dSJunchao Zhang         X  = opt->X[r];                                                                                      \
110fcc7397dSJunchao Zhang         Y  = opt->Y[r];                                                                                      \
111fcc7397dSJunchao Zhang         for (k=0; k<opt->dz[r]; k++)                                                                         \
112fcc7397dSJunchao Zhang           for (j=0; j<opt->dy[r]; j++) {                                                                     \
113fcc7397dSJunchao Zhang             ierr = PetscArraycpy(u2+(X*Y*k+X*j)*MBS,p,opt->dx[r]*MBS);CHKERRQ(ierr);                         \
114fcc7397dSJunchao Zhang             p   += opt->dx[r]*MBS;                                                                           \
115fcc7397dSJunchao Zhang           }                                                                                                  \
116fcc7397dSJunchao Zhang       }                                                                                                      \
117fcc7397dSJunchao Zhang     } else {                                                                                                 \
118b23bfdefSJunchao Zhang       for (i=0; i<count; i++)                                                                                \
119b23bfdefSJunchao Zhang         for (j=0; j<M; j++)                                                                                  \
120cd620004SJunchao Zhang           for (k=0; k<BS; k++) u[idx[i]*MBS+j*BS+k] = p[i*MBS+j*BS+k];                                       \
12140e23c03SJunchao Zhang     }                                                                                                        \
12240e23c03SJunchao Zhang     PetscFunctionReturn(0);                                                                                  \
12340e23c03SJunchao Zhang   }
12440e23c03SJunchao Zhang 
125cd620004SJunchao Zhang /* DEF_UnpackAndOp - macro defining a UnpackAndOp routine where Op should not be Insert
12640e23c03SJunchao Zhang 
12740e23c03SJunchao Zhang    Arguments:
128cd620004SJunchao Zhang   +Opname     Name of the Op, such as Add, Mult, LAND, etc.
129b23bfdefSJunchao Zhang   .Type       Type of the data
13040e23c03SJunchao Zhang   .BS         Block size for vectorization
131b23bfdefSJunchao Zhang   .EQ         (bs == BS) ? 1 : 0. EQ is a compile-time const.
132cd620004SJunchao Zhang   .Op         Operator for the op, such as +, *, &&, ||, PetscMax, PetscMin, etc.
133cd620004SJunchao Zhang   .OpApply    Macro defining application of the op. Could be OP_BINARY, OP_FUNCTION, OP_LXOR
13440e23c03SJunchao Zhang  */
135cd620004SJunchao Zhang #define DEF_UnpackAndOp(Type,BS,EQ,Opname,Op,OpApply) \
136fcc7397dSJunchao Zhang   static PetscErrorCode CPPJoin4(UnpackAnd##Opname,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,void *unpacked,const void *packed) \
137b23bfdefSJunchao Zhang   {                                                                                                          \
138cd620004SJunchao Zhang     Type           *u = (Type*)unpacked,*u2;                                                                 \
139fcc7397dSJunchao Zhang     const Type     *p = (const Type*)packed;                                                                 \
140fcc7397dSJunchao Zhang     PetscInt       i,j,k,X,Y,r,bs=link->bs;                                                                  \
141fcc7397dSJunchao Zhang     const PetscInt M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */          \
142b23bfdefSJunchao Zhang     const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */                \
14340e23c03SJunchao Zhang     PetscFunctionBegin;                                                                                      \
144b23bfdefSJunchao Zhang     if (!idx) {                                                                                              \
145fcc7397dSJunchao Zhang       u += start*MBS;                                                                                        \
146cd620004SJunchao Zhang       for (i=0; i<count; i++)                                                                                \
147cd620004SJunchao Zhang         for (j=0; j<M; j++)                                                                                  \
148cd620004SJunchao Zhang           for (k=0; k<BS; k++)                                                                               \
149cd620004SJunchao Zhang             OpApply(Op,u[i*MBS+j*BS+k],p[i*MBS+j*BS+k]);                                                     \
150fcc7397dSJunchao Zhang     } else if (opt) { /* idx[] has patterns */                                                               \
151fcc7397dSJunchao Zhang       for (r=0; r<opt->n; r++) {                                                                             \
152fcc7397dSJunchao Zhang         u2 = u + opt->start[r]*MBS;                                                                          \
153fcc7397dSJunchao Zhang         X  = opt->X[r];                                                                                      \
154fcc7397dSJunchao Zhang         Y  = opt->Y[r];                                                                                      \
155fcc7397dSJunchao Zhang         for (k=0; k<opt->dz[r]; k++)                                                                         \
156fcc7397dSJunchao Zhang           for (j=0; j<opt->dy[r]; j++) {                                                                     \
157fcc7397dSJunchao Zhang             for (i=0; i<opt->dx[r]*MBS; i++) OpApply(Op,u2[(X*Y*k+X*j)*MBS+i],p[i]);                         \
158fcc7397dSJunchao Zhang             p += opt->dx[r]*MBS;                                                                             \
159fcc7397dSJunchao Zhang           }                                                                                                  \
160fcc7397dSJunchao Zhang       }                                                                                                      \
161fcc7397dSJunchao Zhang     } else {                                                                                                 \
162cd620004SJunchao Zhang       for (i=0; i<count; i++)                                                                                \
163cd620004SJunchao Zhang         for (j=0; j<M; j++)                                                                                  \
164cd620004SJunchao Zhang           for (k=0; k<BS; k++)                                                                               \
165cd620004SJunchao Zhang             OpApply(Op,u[idx[i]*MBS+j*BS+k],p[i*MBS+j*BS+k]);                                                \
166cd620004SJunchao Zhang     }                                                                                                        \
167cd620004SJunchao Zhang     PetscFunctionReturn(0);                                                                                  \
168cd620004SJunchao Zhang   }
169cd620004SJunchao Zhang 
170cd620004SJunchao Zhang #define DEF_FetchAndOp(Type,BS,EQ,Opname,Op,OpApply) \
171fcc7397dSJunchao Zhang   static PetscErrorCode CPPJoin4(FetchAnd##Opname,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,void *unpacked,void *packed) \
172cd620004SJunchao Zhang   {                                                                                                          \
173fcc7397dSJunchao Zhang     Type           *u = (Type*)unpacked,*p = (Type*)packed,tmp;                                              \
174fcc7397dSJunchao Zhang     PetscInt       i,j,k,r,l,bs=link->bs;                                                                    \
175fcc7397dSJunchao Zhang     const PetscInt M = (EQ) ? 1 : bs/BS;                                                                     \
176fcc7397dSJunchao Zhang     const PetscInt MBS = M*BS;                                                                               \
177cd620004SJunchao Zhang     PetscFunctionBegin;                                                                                      \
178fcc7397dSJunchao Zhang     for (i=0; i<count; i++) {                                                                                \
179fcc7397dSJunchao Zhang       r = (!idx ? start+i : idx[i])*MBS;                                                                     \
180fcc7397dSJunchao Zhang       l = i*MBS;                                                                                             \
181b23bfdefSJunchao Zhang       for (j=0; j<M; j++)                                                                                    \
182b23bfdefSJunchao Zhang         for (k=0; k<BS; k++) {                                                                               \
183fcc7397dSJunchao Zhang           tmp = u[r+j*BS+k];                                                                                 \
184fcc7397dSJunchao Zhang           OpApply(Op,u[r+j*BS+k],p[l+j*BS+k]);                                                               \
185fcc7397dSJunchao Zhang           p[l+j*BS+k] = tmp;                                                                                 \
186cd620004SJunchao Zhang         }                                                                                                    \
187cd620004SJunchao Zhang     }                                                                                                        \
188cd620004SJunchao Zhang     PetscFunctionReturn(0);                                                                                  \
189cd620004SJunchao Zhang   }
190cd620004SJunchao Zhang 
191cd620004SJunchao Zhang #define DEF_ScatterAndOp(Type,BS,EQ,Opname,Op,OpApply) \
192fcc7397dSJunchao Zhang   static PetscErrorCode CPPJoin4(ScatterAnd##Opname,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt srcStart,PetscSFPackOpt srcOpt,const PetscInt *srcIdx,const void *src,PetscInt dstStart,PetscSFPackOpt dstOpt,const PetscInt *dstIdx,void *dst) \
193cd620004SJunchao Zhang   {                                                                                                          \
194fcc7397dSJunchao Zhang     PetscErrorCode ierr;                                                                                     \
195fcc7397dSJunchao Zhang     const Type     *u = (const Type*)src;                                                                    \
196fcc7397dSJunchao Zhang     Type           *v = (Type*)dst;                                                                          \
197fcc7397dSJunchao Zhang     PetscInt       i,j,k,s,t,X,Y,bs = link->bs;                                                              \
198cd620004SJunchao Zhang     const PetscInt M = (EQ) ? 1 : bs/BS;                                                                     \
199cd620004SJunchao Zhang     const PetscInt MBS = M*BS;                                                                               \
200cd620004SJunchao Zhang     PetscFunctionBegin;                                                                                      \
201fcc7397dSJunchao Zhang     if (!srcIdx) { /* src is contiguous */                                                                   \
202fcc7397dSJunchao Zhang       u += srcStart*MBS;                                                                                     \
203fcc7397dSJunchao Zhang       ierr = CPPJoin4(UnpackAnd##Opname,Type,BS,EQ)(link,count,dstStart,dstOpt,dstIdx,dst,u);CHKERRQ(ierr);  \
204fcc7397dSJunchao Zhang     } else if (srcOpt && !dstIdx) { /* src is 3D, dst is contiguous */                                       \
205fcc7397dSJunchao Zhang       u += srcOpt->start[0]*MBS;                                                                             \
206fcc7397dSJunchao Zhang       v += dstStart*MBS;                                                                                     \
207fcc7397dSJunchao Zhang       X  = srcOpt->X[0]; Y = srcOpt->Y[0];                                                                   \
208fcc7397dSJunchao Zhang       for (k=0; k<srcOpt->dz[0]; k++)                                                                        \
209fcc7397dSJunchao Zhang         for (j=0; j<srcOpt->dy[0]; j++) {                                                                    \
210fcc7397dSJunchao Zhang           for (i=0; i<srcOpt->dx[0]*MBS; i++) OpApply(Op,v[i],u[(X*Y*k+X*j)*MBS+i]);                         \
211fcc7397dSJunchao Zhang           v += srcOpt->dx[0]*MBS;                                                                            \
212fcc7397dSJunchao Zhang         }                                                                                                    \
213fcc7397dSJunchao Zhang     } else { /* all other cases */                                                                           \
214fcc7397dSJunchao Zhang       for (i=0; i<count; i++) {                                                                              \
215fcc7397dSJunchao Zhang         s = (!srcIdx ? srcStart+i : srcIdx[i])*MBS;                                                          \
216fcc7397dSJunchao Zhang         t = (!dstIdx ? dstStart+i : dstIdx[i])*MBS;                                                          \
217cd620004SJunchao Zhang         for (j=0; j<M; j++)                                                                                  \
218fcc7397dSJunchao Zhang           for (k=0; k<BS; k++) OpApply(Op,v[t+j*BS+k],u[s+j*BS+k]);                                          \
219fcc7397dSJunchao Zhang       }                                                                                                      \
220cd620004SJunchao Zhang     }                                                                                                        \
221cd620004SJunchao Zhang     PetscFunctionReturn(0);                                                                                  \
222cd620004SJunchao Zhang   }
223cd620004SJunchao Zhang 
224cd620004SJunchao Zhang #define DEF_FetchAndOpLocal(Type,BS,EQ,Opname,Op,OpApply) \
225fcc7397dSJunchao Zhang   static PetscErrorCode CPPJoin4(FetchAnd##Opname##Local,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt rootstart,PetscSFPackOpt rootopt,const PetscInt *rootidx,void *rootdata,PetscInt leafstart,PetscSFPackOpt leafopt,const PetscInt *leafidx,const void *leafdata,void *leafupdate) \
226cd620004SJunchao Zhang   {                                                                                                          \
227fcc7397dSJunchao Zhang     Type           *rdata = (Type*)rootdata,*lupdate = (Type*)leafupdate;                                    \
228fcc7397dSJunchao Zhang     const Type     *ldata = (const Type*)leafdata;                                                           \
229fcc7397dSJunchao Zhang     PetscInt       i,j,k,r,l,bs = link->bs;                                                                  \
230cd620004SJunchao Zhang     const PetscInt M = (EQ) ? 1 : bs/BS;                                                                     \
231cd620004SJunchao Zhang     const PetscInt MBS = M*BS;                                                                               \
232cd620004SJunchao Zhang     PetscFunctionBegin;                                                                                      \
233fcc7397dSJunchao Zhang     for (i=0; i<count; i++) {                                                                                \
234fcc7397dSJunchao Zhang       r = (rootidx ? rootidx[i] : rootstart+i)*MBS;                                                          \
235fcc7397dSJunchao Zhang       l = (leafidx ? leafidx[i] : leafstart+i)*MBS;                                                          \
236cd620004SJunchao Zhang       for (j=0; j<M; j++)                                                                                    \
237cd620004SJunchao Zhang         for (k=0; k<BS; k++) {                                                                               \
238fcc7397dSJunchao Zhang           lupdate[l+j*BS+k] = rdata[r+j*BS+k];                                                               \
239fcc7397dSJunchao Zhang           OpApply(Op,rdata[r+j*BS+k],ldata[l+j*BS+k]);                                                       \
24040e23c03SJunchao Zhang         }                                                                                                    \
24140e23c03SJunchao Zhang     }                                                                                                        \
24240e23c03SJunchao Zhang     PetscFunctionReturn(0);                                                                                  \
24340e23c03SJunchao Zhang   }
24440e23c03SJunchao Zhang 
245b23bfdefSJunchao Zhang /* Pack, Unpack/Fetch ops */
246b23bfdefSJunchao Zhang #define DEF_Pack(Type,BS,EQ)                                                      \
247b23bfdefSJunchao Zhang   DEF_PackFunc(Type,BS,EQ)                                                        \
248cd620004SJunchao Zhang   DEF_UnpackFunc(Type,BS,EQ)                                                      \
249cd620004SJunchao Zhang   DEF_ScatterAndOp(Type,BS,EQ,Insert,=,OP_ASSIGN)                                 \
250cd620004SJunchao Zhang   static void CPPJoin4(PackInit_Pack,Type,BS,EQ)(PetscSFLink link) {              \
251eb02082bSJunchao Zhang     link->h_Pack            = CPPJoin4(Pack,           Type,BS,EQ);               \
252eb02082bSJunchao Zhang     link->h_UnpackAndInsert = CPPJoin4(UnpackAndInsert,Type,BS,EQ);               \
253cd620004SJunchao Zhang     link->h_ScatterAndInsert= CPPJoin4(ScatterAndInsert,Type,BS,EQ);              \
25440e23c03SJunchao Zhang   }
25540e23c03SJunchao Zhang 
256b23bfdefSJunchao Zhang /* Add, Mult ops */
257b23bfdefSJunchao Zhang #define DEF_Add(Type,BS,EQ)                                                       \
258cd620004SJunchao Zhang   DEF_UnpackAndOp    (Type,BS,EQ,Add, +,OP_BINARY)                                \
259cd620004SJunchao Zhang   DEF_UnpackAndOp    (Type,BS,EQ,Mult,*,OP_BINARY)                                \
260cd620004SJunchao Zhang   DEF_FetchAndOp     (Type,BS,EQ,Add, +,OP_BINARY)                                \
261cd620004SJunchao Zhang   DEF_ScatterAndOp   (Type,BS,EQ,Add, +,OP_BINARY)                                \
262cd620004SJunchao Zhang   DEF_ScatterAndOp   (Type,BS,EQ,Mult,*,OP_BINARY)                                \
263cd620004SJunchao Zhang   DEF_FetchAndOpLocal(Type,BS,EQ,Add, +,OP_BINARY)                                \
264cd620004SJunchao Zhang   static void CPPJoin4(PackInit_Add,Type,BS,EQ)(PetscSFLink link) {               \
265eb02082bSJunchao Zhang     link->h_UnpackAndAdd     = CPPJoin4(UnpackAndAdd,    Type,BS,EQ);             \
266eb02082bSJunchao Zhang     link->h_UnpackAndMult    = CPPJoin4(UnpackAndMult,   Type,BS,EQ);             \
267eb02082bSJunchao Zhang     link->h_FetchAndAdd      = CPPJoin4(FetchAndAdd,     Type,BS,EQ);             \
268cd620004SJunchao Zhang     link->h_ScatterAndAdd    = CPPJoin4(ScatterAndAdd,   Type,BS,EQ);             \
269cd620004SJunchao Zhang     link->h_ScatterAndMult   = CPPJoin4(ScatterAndMult,  Type,BS,EQ);             \
270cd620004SJunchao Zhang     link->h_FetchAndAddLocal = CPPJoin4(FetchAndAddLocal,Type,BS,EQ);             \
27140e23c03SJunchao Zhang   }
27240e23c03SJunchao Zhang 
273b23bfdefSJunchao Zhang /* Max, Min ops */
274b23bfdefSJunchao Zhang #define DEF_Cmp(Type,BS,EQ)                                                       \
275cd620004SJunchao Zhang   DEF_UnpackAndOp (Type,BS,EQ,Max,PetscMax,OP_FUNCTION)                           \
276cd620004SJunchao Zhang   DEF_UnpackAndOp (Type,BS,EQ,Min,PetscMin,OP_FUNCTION)                           \
277cd620004SJunchao Zhang   DEF_ScatterAndOp(Type,BS,EQ,Max,PetscMax,OP_FUNCTION)                           \
278cd620004SJunchao Zhang   DEF_ScatterAndOp(Type,BS,EQ,Min,PetscMin,OP_FUNCTION)                           \
279cd620004SJunchao Zhang   static void CPPJoin4(PackInit_Compare,Type,BS,EQ)(PetscSFLink link) {           \
280eb02082bSJunchao Zhang     link->h_UnpackAndMax    = CPPJoin4(UnpackAndMax,   Type,BS,EQ);               \
281eb02082bSJunchao Zhang     link->h_UnpackAndMin    = CPPJoin4(UnpackAndMin,   Type,BS,EQ);               \
282cd620004SJunchao Zhang     link->h_ScatterAndMax   = CPPJoin4(ScatterAndMax,  Type,BS,EQ);               \
283cd620004SJunchao Zhang     link->h_ScatterAndMin   = CPPJoin4(ScatterAndMin,  Type,BS,EQ);               \
284b23bfdefSJunchao Zhang   }
285b23bfdefSJunchao Zhang 
286b23bfdefSJunchao Zhang /* Logical ops.
287cd620004SJunchao Zhang   The operator in OP_LXOR should be empty but is ||. It is not used. Put here to avoid
28840e23c03SJunchao Zhang   the compilation warning "empty macro arguments are undefined in ISO C90"
28940e23c03SJunchao Zhang  */
290b23bfdefSJunchao Zhang #define DEF_Log(Type,BS,EQ)                                                       \
291cd620004SJunchao Zhang   DEF_UnpackAndOp (Type,BS,EQ,LAND,&&,OP_BINARY)                                  \
292cd620004SJunchao Zhang   DEF_UnpackAndOp (Type,BS,EQ,LOR, ||,OP_BINARY)                                  \
293cd620004SJunchao Zhang   DEF_UnpackAndOp (Type,BS,EQ,LXOR,||, OP_LXOR)                                   \
294cd620004SJunchao Zhang   DEF_ScatterAndOp(Type,BS,EQ,LAND,&&,OP_BINARY)                                  \
295cd620004SJunchao Zhang   DEF_ScatterAndOp(Type,BS,EQ,LOR, ||,OP_BINARY)                                  \
296cd620004SJunchao Zhang   DEF_ScatterAndOp(Type,BS,EQ,LXOR,||, OP_LXOR)                                   \
297cd620004SJunchao Zhang   static void CPPJoin4(PackInit_Logical,Type,BS,EQ)(PetscSFLink link) {           \
298eb02082bSJunchao Zhang     link->h_UnpackAndLAND   = CPPJoin4(UnpackAndLAND, Type,BS,EQ);                \
299eb02082bSJunchao Zhang     link->h_UnpackAndLOR    = CPPJoin4(UnpackAndLOR,  Type,BS,EQ);                \
300eb02082bSJunchao Zhang     link->h_UnpackAndLXOR   = CPPJoin4(UnpackAndLXOR, Type,BS,EQ);                \
301cd620004SJunchao Zhang     link->h_ScatterAndLAND  = CPPJoin4(ScatterAndLAND,Type,BS,EQ);                \
302cd620004SJunchao Zhang     link->h_ScatterAndLOR   = CPPJoin4(ScatterAndLOR, Type,BS,EQ);                \
303cd620004SJunchao Zhang     link->h_ScatterAndLXOR  = CPPJoin4(ScatterAndLXOR,Type,BS,EQ);                \
30440e23c03SJunchao Zhang   }
30540e23c03SJunchao Zhang 
306b23bfdefSJunchao Zhang /* Bitwise ops */
307b23bfdefSJunchao Zhang #define DEF_Bit(Type,BS,EQ)                                                       \
308cd620004SJunchao Zhang   DEF_UnpackAndOp (Type,BS,EQ,BAND,&,OP_BINARY)                                   \
309cd620004SJunchao Zhang   DEF_UnpackAndOp (Type,BS,EQ,BOR, |,OP_BINARY)                                   \
310cd620004SJunchao Zhang   DEF_UnpackAndOp (Type,BS,EQ,BXOR,^,OP_BINARY)                                   \
311cd620004SJunchao Zhang   DEF_ScatterAndOp(Type,BS,EQ,BAND,&,OP_BINARY)                                   \
312cd620004SJunchao Zhang   DEF_ScatterAndOp(Type,BS,EQ,BOR, |,OP_BINARY)                                   \
313cd620004SJunchao Zhang   DEF_ScatterAndOp(Type,BS,EQ,BXOR,^,OP_BINARY)                                   \
314cd620004SJunchao Zhang   static void CPPJoin4(PackInit_Bitwise,Type,BS,EQ)(PetscSFLink link) {           \
315eb02082bSJunchao Zhang     link->h_UnpackAndBAND   = CPPJoin4(UnpackAndBAND, Type,BS,EQ);                \
316eb02082bSJunchao Zhang     link->h_UnpackAndBOR    = CPPJoin4(UnpackAndBOR,  Type,BS,EQ);                \
317eb02082bSJunchao Zhang     link->h_UnpackAndBXOR   = CPPJoin4(UnpackAndBXOR, Type,BS,EQ);                \
318cd620004SJunchao Zhang     link->h_ScatterAndBAND  = CPPJoin4(ScatterAndBAND,Type,BS,EQ);                \
319cd620004SJunchao Zhang     link->h_ScatterAndBOR   = CPPJoin4(ScatterAndBOR, Type,BS,EQ);                \
320cd620004SJunchao Zhang     link->h_ScatterAndBXOR  = CPPJoin4(ScatterAndBXOR,Type,BS,EQ);                \
32140e23c03SJunchao Zhang   }
32240e23c03SJunchao Zhang 
323cd620004SJunchao Zhang /* Maxloc, Minloc ops */
324cd620004SJunchao Zhang #define DEF_Xloc(Type,BS,EQ)                                                      \
325cd620004SJunchao Zhang   DEF_UnpackAndOp (Type,BS,EQ,Max,>,OP_XLOC)                                      \
326cd620004SJunchao Zhang   DEF_UnpackAndOp (Type,BS,EQ,Min,<,OP_XLOC)                                      \
327cd620004SJunchao Zhang   DEF_ScatterAndOp(Type,BS,EQ,Max,>,OP_XLOC)                                      \
328cd620004SJunchao Zhang   DEF_ScatterAndOp(Type,BS,EQ,Min,<,OP_XLOC)                                      \
329cd620004SJunchao Zhang   static void CPPJoin4(PackInit_Xloc,Type,BS,EQ)(PetscSFLink link) {              \
330cd620004SJunchao Zhang     link->h_UnpackAndMaxloc  = CPPJoin4(UnpackAndMax, Type,BS,EQ);                \
331cd620004SJunchao Zhang     link->h_UnpackAndMinloc  = CPPJoin4(UnpackAndMin, Type,BS,EQ);                \
332cd620004SJunchao Zhang     link->h_ScatterAndMaxloc = CPPJoin4(ScatterAndMax,Type,BS,EQ);                \
333cd620004SJunchao Zhang     link->h_ScatterAndMinloc = CPPJoin4(ScatterAndMin,Type,BS,EQ);                \
33440e23c03SJunchao Zhang   }
33540e23c03SJunchao Zhang 
336b23bfdefSJunchao Zhang #define DEF_IntegerType(Type,BS,EQ)                                               \
337b23bfdefSJunchao Zhang   DEF_Pack(Type,BS,EQ)                                                            \
338b23bfdefSJunchao Zhang   DEF_Add(Type,BS,EQ)                                                             \
339b23bfdefSJunchao Zhang   DEF_Cmp(Type,BS,EQ)                                                             \
340b23bfdefSJunchao Zhang   DEF_Log(Type,BS,EQ)                                                             \
341b23bfdefSJunchao Zhang   DEF_Bit(Type,BS,EQ)                                                             \
342cd620004SJunchao Zhang   static void CPPJoin4(PackInit_IntegerType,Type,BS,EQ)(PetscSFLink link) {       \
343b23bfdefSJunchao Zhang     CPPJoin4(PackInit_Pack,Type,BS,EQ)(link);                                     \
344b23bfdefSJunchao Zhang     CPPJoin4(PackInit_Add,Type,BS,EQ)(link);                                      \
345b23bfdefSJunchao Zhang     CPPJoin4(PackInit_Compare,Type,BS,EQ)(link);                                  \
346b23bfdefSJunchao Zhang     CPPJoin4(PackInit_Logical,Type,BS,EQ)(link);                                  \
347b23bfdefSJunchao Zhang     CPPJoin4(PackInit_Bitwise,Type,BS,EQ)(link);                                  \
34840e23c03SJunchao Zhang   }
34940e23c03SJunchao Zhang 
350b23bfdefSJunchao Zhang #define DEF_RealType(Type,BS,EQ)                                                  \
351b23bfdefSJunchao Zhang   DEF_Pack(Type,BS,EQ)                                                            \
352b23bfdefSJunchao Zhang   DEF_Add(Type,BS,EQ)                                                             \
353b23bfdefSJunchao Zhang   DEF_Cmp(Type,BS,EQ)                                                             \
354cd620004SJunchao Zhang   static void CPPJoin4(PackInit_RealType,Type,BS,EQ)(PetscSFLink link) {          \
355b23bfdefSJunchao Zhang     CPPJoin4(PackInit_Pack,Type,BS,EQ)(link);                                     \
356b23bfdefSJunchao Zhang     CPPJoin4(PackInit_Add,Type,BS,EQ)(link);                                      \
357b23bfdefSJunchao Zhang     CPPJoin4(PackInit_Compare,Type,BS,EQ)(link);                                  \
358b23bfdefSJunchao Zhang   }
35940e23c03SJunchao Zhang 
36040e23c03SJunchao Zhang #if defined(PETSC_HAVE_COMPLEX)
361b23bfdefSJunchao Zhang #define DEF_ComplexType(Type,BS,EQ)                                               \
362b23bfdefSJunchao Zhang   DEF_Pack(Type,BS,EQ)                                                            \
363b23bfdefSJunchao Zhang   DEF_Add(Type,BS,EQ)                                                             \
364cd620004SJunchao Zhang   static void CPPJoin4(PackInit_ComplexType,Type,BS,EQ)(PetscSFLink link) {       \
365b23bfdefSJunchao Zhang     CPPJoin4(PackInit_Pack,Type,BS,EQ)(link);                                     \
366b23bfdefSJunchao Zhang     CPPJoin4(PackInit_Add,Type,BS,EQ)(link);                                      \
367b23bfdefSJunchao Zhang   }
36840e23c03SJunchao Zhang #endif
369b23bfdefSJunchao Zhang 
370b23bfdefSJunchao Zhang #define DEF_DumbType(Type,BS,EQ)                                                  \
371b23bfdefSJunchao Zhang   DEF_Pack(Type,BS,EQ)                                                            \
372cd620004SJunchao Zhang   static void CPPJoin4(PackInit_DumbType,Type,BS,EQ)(PetscSFLink link) {          \
373b23bfdefSJunchao Zhang     CPPJoin4(PackInit_Pack,Type,BS,EQ)(link);                                     \
374b23bfdefSJunchao Zhang   }
375b23bfdefSJunchao Zhang 
376b23bfdefSJunchao Zhang /* Maxloc, Minloc */
377cd620004SJunchao Zhang #define DEF_PairType(Type,BS,EQ)                                                  \
378cd620004SJunchao Zhang   DEF_Pack(Type,BS,EQ)                                                            \
379cd620004SJunchao Zhang   DEF_Xloc(Type,BS,EQ)                                                            \
380cd620004SJunchao Zhang   static void CPPJoin4(PackInit_PairType,Type,BS,EQ)(PetscSFLink link) {          \
381cd620004SJunchao Zhang     CPPJoin4(PackInit_Pack,Type,BS,EQ)(link);                                     \
382cd620004SJunchao Zhang     CPPJoin4(PackInit_Xloc,Type,BS,EQ)(link);                                     \
383b23bfdefSJunchao Zhang   }
384b23bfdefSJunchao Zhang 
385b23bfdefSJunchao Zhang DEF_IntegerType(PetscInt,1,1) /* unit = 1 MPIU_INT  */
386b23bfdefSJunchao Zhang DEF_IntegerType(PetscInt,2,1) /* unit = 2 MPIU_INTs */
387b23bfdefSJunchao Zhang DEF_IntegerType(PetscInt,4,1) /* unit = 4 MPIU_INTs */
388b23bfdefSJunchao Zhang DEF_IntegerType(PetscInt,8,1) /* unit = 8 MPIU_INTs */
389b23bfdefSJunchao Zhang DEF_IntegerType(PetscInt,1,0) /* unit = 1*n MPIU_INTs, n>1 */
390b23bfdefSJunchao Zhang DEF_IntegerType(PetscInt,2,0) /* unit = 2*n MPIU_INTs, n>1 */
391b23bfdefSJunchao Zhang DEF_IntegerType(PetscInt,4,0) /* unit = 4*n MPIU_INTs, n>1 */
392b23bfdefSJunchao Zhang DEF_IntegerType(PetscInt,8,0) /* unit = 8*n MPIU_INTs, n>1. Routines with bigger BS are tried first. */
393b23bfdefSJunchao Zhang 
394b23bfdefSJunchao Zhang #if defined(PETSC_USE_64BIT_INDICES) /* Do not need (though it is OK) to generate redundant functions if PetscInt is int */
395b23bfdefSJunchao Zhang DEF_IntegerType(int,1,1)
396b23bfdefSJunchao Zhang DEF_IntegerType(int,2,1)
397b23bfdefSJunchao Zhang DEF_IntegerType(int,4,1)
398b23bfdefSJunchao Zhang DEF_IntegerType(int,8,1)
399b23bfdefSJunchao Zhang DEF_IntegerType(int,1,0)
400b23bfdefSJunchao Zhang DEF_IntegerType(int,2,0)
401b23bfdefSJunchao Zhang DEF_IntegerType(int,4,0)
402b23bfdefSJunchao Zhang DEF_IntegerType(int,8,0)
403b23bfdefSJunchao Zhang #endif
404b23bfdefSJunchao Zhang 
405b23bfdefSJunchao Zhang /* The typedefs are used to get a typename without space that CPPJoin can handle */
406b23bfdefSJunchao Zhang typedef signed char SignedChar;
407b23bfdefSJunchao Zhang DEF_IntegerType(SignedChar,1,1)
408b23bfdefSJunchao Zhang DEF_IntegerType(SignedChar,2,1)
409b23bfdefSJunchao Zhang DEF_IntegerType(SignedChar,4,1)
410b23bfdefSJunchao Zhang DEF_IntegerType(SignedChar,8,1)
411b23bfdefSJunchao Zhang DEF_IntegerType(SignedChar,1,0)
412b23bfdefSJunchao Zhang DEF_IntegerType(SignedChar,2,0)
413b23bfdefSJunchao Zhang DEF_IntegerType(SignedChar,4,0)
414b23bfdefSJunchao Zhang DEF_IntegerType(SignedChar,8,0)
415b23bfdefSJunchao Zhang 
416b23bfdefSJunchao Zhang typedef unsigned char UnsignedChar;
417b23bfdefSJunchao Zhang DEF_IntegerType(UnsignedChar,1,1)
418b23bfdefSJunchao Zhang DEF_IntegerType(UnsignedChar,2,1)
419b23bfdefSJunchao Zhang DEF_IntegerType(UnsignedChar,4,1)
420b23bfdefSJunchao Zhang DEF_IntegerType(UnsignedChar,8,1)
421b23bfdefSJunchao Zhang DEF_IntegerType(UnsignedChar,1,0)
422b23bfdefSJunchao Zhang DEF_IntegerType(UnsignedChar,2,0)
423b23bfdefSJunchao Zhang DEF_IntegerType(UnsignedChar,4,0)
424b23bfdefSJunchao Zhang DEF_IntegerType(UnsignedChar,8,0)
425b23bfdefSJunchao Zhang 
426b23bfdefSJunchao Zhang DEF_RealType(PetscReal,1,1)
427b23bfdefSJunchao Zhang DEF_RealType(PetscReal,2,1)
428b23bfdefSJunchao Zhang DEF_RealType(PetscReal,4,1)
429b23bfdefSJunchao Zhang DEF_RealType(PetscReal,8,1)
430b23bfdefSJunchao Zhang DEF_RealType(PetscReal,1,0)
431b23bfdefSJunchao Zhang DEF_RealType(PetscReal,2,0)
432b23bfdefSJunchao Zhang DEF_RealType(PetscReal,4,0)
433b23bfdefSJunchao Zhang DEF_RealType(PetscReal,8,0)
434b23bfdefSJunchao Zhang 
435b23bfdefSJunchao Zhang #if defined(PETSC_HAVE_COMPLEX)
436b23bfdefSJunchao Zhang DEF_ComplexType(PetscComplex,1,1)
437b23bfdefSJunchao Zhang DEF_ComplexType(PetscComplex,2,1)
438b23bfdefSJunchao Zhang DEF_ComplexType(PetscComplex,4,1)
439b23bfdefSJunchao Zhang DEF_ComplexType(PetscComplex,8,1)
440b23bfdefSJunchao Zhang DEF_ComplexType(PetscComplex,1,0)
441b23bfdefSJunchao Zhang DEF_ComplexType(PetscComplex,2,0)
442b23bfdefSJunchao Zhang DEF_ComplexType(PetscComplex,4,0)
443b23bfdefSJunchao Zhang DEF_ComplexType(PetscComplex,8,0)
444b23bfdefSJunchao Zhang #endif
445b23bfdefSJunchao Zhang 
446cd620004SJunchao Zhang #define PairType(Type1,Type2) Type1##_##Type2
447cd620004SJunchao Zhang typedef struct {int u; int i;}           PairType(int,int);
448cd620004SJunchao Zhang typedef struct {PetscInt u; PetscInt i;} PairType(PetscInt,PetscInt);
449cd620004SJunchao Zhang DEF_PairType(PairType(int,int),1,1)
450cd620004SJunchao Zhang DEF_PairType(PairType(PetscInt,PetscInt),1,1)
451b23bfdefSJunchao Zhang 
452b23bfdefSJunchao Zhang /* If we don't know the basic type, we treat it as a stream of chars or ints */
453b23bfdefSJunchao Zhang DEF_DumbType(char,1,1)
454b23bfdefSJunchao Zhang DEF_DumbType(char,2,1)
455b23bfdefSJunchao Zhang DEF_DumbType(char,4,1)
456b23bfdefSJunchao Zhang DEF_DumbType(char,1,0)
457b23bfdefSJunchao Zhang DEF_DumbType(char,2,0)
458b23bfdefSJunchao Zhang DEF_DumbType(char,4,0)
459b23bfdefSJunchao Zhang 
460eb02082bSJunchao Zhang typedef int DumbInt; /* To have a different name than 'int' used above. The name is used to make routine names. */
461b23bfdefSJunchao Zhang DEF_DumbType(DumbInt,1,1)
462b23bfdefSJunchao Zhang DEF_DumbType(DumbInt,2,1)
463b23bfdefSJunchao Zhang DEF_DumbType(DumbInt,4,1)
464b23bfdefSJunchao Zhang DEF_DumbType(DumbInt,8,1)
465b23bfdefSJunchao Zhang DEF_DumbType(DumbInt,1,0)
466b23bfdefSJunchao Zhang DEF_DumbType(DumbInt,2,0)
467b23bfdefSJunchao Zhang DEF_DumbType(DumbInt,4,0)
468b23bfdefSJunchao Zhang DEF_DumbType(DumbInt,8,0)
46940e23c03SJunchao Zhang 
47040e23c03SJunchao Zhang #if !defined(PETSC_HAVE_MPI_TYPE_DUP)
47140e23c03SJunchao Zhang PETSC_STATIC_INLINE int MPI_Type_dup(MPI_Datatype datatype,MPI_Datatype *newtype)
47240e23c03SJunchao Zhang {
47340e23c03SJunchao Zhang   int ierr;
47440e23c03SJunchao Zhang   ierr = MPI_Type_contiguous(1,datatype,newtype); if (ierr) return ierr;
47540e23c03SJunchao Zhang   ierr = MPI_Type_commit(newtype); if (ierr) return ierr;
47640e23c03SJunchao Zhang   return MPI_SUCCESS;
47740e23c03SJunchao Zhang }
47840e23c03SJunchao Zhang #endif
47940e23c03SJunchao Zhang 
480cd620004SJunchao Zhang /*
481cd620004SJunchao Zhang    The routine Creates a communication link for the given operation. It first looks up its link cache. If
482cd620004SJunchao Zhang    there is a free & suitable one, it uses it. Otherwise it creates a new one.
483cd620004SJunchao Zhang 
484cd620004SJunchao Zhang    A link contains buffers and MPI requests for send/recv. It also contains pack/unpack routines to pack/unpack
485cd620004SJunchao Zhang    root/leafdata to/from these buffers. Buffers are allocated at our discretion. When we find root/leafata
486cd620004SJunchao Zhang    can be directly passed to MPI, we won't allocate them. Even we allocate buffers, we only allocate
487cd620004SJunchao Zhang    those that are needed by the given `sfop` and `op`, in other words, we do lazy memory-allocation.
488cd620004SJunchao Zhang 
489cd620004SJunchao Zhang    The routine also allocates buffers on CPU when one does not use gpu-aware MPI but data is on GPU.
490cd620004SJunchao Zhang 
491cd620004SJunchao Zhang    In SFBasic, MPI requests are persistent. They are init'ed until we try to get requests from a link.
492cd620004SJunchao Zhang 
493cd620004SJunchao Zhang    The routine is shared by SFBasic and SFNeighbor based on the fact they all deal with sparse graphs and
494cd620004SJunchao Zhang    need pack/unpack data.
495cd620004SJunchao Zhang */
496a247e58cSJunchao Zhang PetscErrorCode PetscSFLinkCreate(PetscSF sf,MPI_Datatype unit,PetscMemType xrootmtype,const void *rootdata,PetscMemType xleafmtype,const void *leafdata,MPI_Op op,PetscSFOperation sfop,PetscSFLink *mylink)
49740e23c03SJunchao Zhang {
49840e23c03SJunchao Zhang   PetscErrorCode    ierr;
499cd620004SJunchao Zhang   PetscSF_Basic     *bas = (PetscSF_Basic*)sf->data;
500cd620004SJunchao Zhang   PetscInt          i,j,k,nrootreqs,nleafreqs,nreqs;
501cd620004SJunchao Zhang   PetscSFLink       *p,link;
502cd620004SJunchao Zhang   PetscSFDirection  direction;
503cd620004SJunchao Zhang   MPI_Request       *reqs = NULL;
504cd620004SJunchao Zhang   PetscBool         match,rootdirect[2],leafdirect[2];
505a247e58cSJunchao Zhang   PetscMemType      rootmtype = PetscMemTypeHost(xrootmtype) ? PETSC_MEMTYPE_HOST : PETSC_MEMTYPE_DEVICE; /* Convert to 0/1*/
506a247e58cSJunchao Zhang   PetscMemType      leafmtype = PetscMemTypeHost(xleafmtype) ? PETSC_MEMTYPE_HOST : PETSC_MEMTYPE_DEVICE;
507cd620004SJunchao Zhang   PetscMemType      rootmtype_mpi,leafmtype_mpi;   /* mtypes seen by MPI */
508cd620004SJunchao Zhang   PetscInt          rootdirect_mpi,leafdirect_mpi; /* root/leafdirect seen by MPI*/
509cd620004SJunchao Zhang 
510cd620004SJunchao Zhang   PetscFunctionBegin;
511cd620004SJunchao Zhang   ierr = PetscSFSetErrorOnUnsupportedOverlap(sf,unit,rootdata,leafdata);CHKERRQ(ierr);
512cd620004SJunchao Zhang 
513cd620004SJunchao Zhang   /* Can we directly use root/leafdirect with the given sf, sfop and op? */
514cd620004SJunchao Zhang   for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) {
515cd620004SJunchao Zhang     if (sfop == PETSCSF_BCAST) {
516cd620004SJunchao Zhang       rootdirect[i] = bas->rootcontig[i]; /* Pack roots */
517*83df288dSJunchao Zhang       leafdirect[i] = (sf->leafcontig[i] && op == MPI_REPLACE) ? PETSC_TRUE : PETSC_FALSE;  /* Unpack leaves */
518cd620004SJunchao Zhang     } else if (sfop == PETSCSF_REDUCE) {
519cd620004SJunchao Zhang       leafdirect[i] = sf->leafcontig[i];  /* Pack leaves */
520*83df288dSJunchao Zhang       rootdirect[i] = (bas->rootcontig[i] && op == MPI_REPLACE) ? PETSC_TRUE : PETSC_FALSE; /* Unpack roots */
521cd620004SJunchao Zhang     } else { /* PETSCSF_FETCH */
522cd620004SJunchao Zhang       rootdirect[i] = PETSC_FALSE; /* FETCH always need a separate rootbuf */
523cd620004SJunchao Zhang       leafdirect[i] = PETSC_FALSE; /* We also force allocating a separate leafbuf so that leafdata and leafupdate can share mpi requests */
524cd620004SJunchao Zhang     }
525cd620004SJunchao Zhang   }
526cd620004SJunchao Zhang 
527c2a741eeSJunchao Zhang   if (sf->use_gpu_aware_mpi) {
528cd620004SJunchao Zhang     rootmtype_mpi = rootmtype;
529cd620004SJunchao Zhang     leafmtype_mpi = leafmtype;
530cd620004SJunchao Zhang   } else {
531cd620004SJunchao Zhang     rootmtype_mpi = leafmtype_mpi = PETSC_MEMTYPE_HOST;
532cd620004SJunchao Zhang   }
533cd620004SJunchao Zhang   /* Will root/leafdata be directly accessed by MPI?  Without use_gpu_aware_mpi, device data is bufferred on host and then passed to MPI */
534cd620004SJunchao Zhang   rootdirect_mpi = rootdirect[PETSCSF_REMOTE] && (rootmtype_mpi == rootmtype)? 1 : 0;
535cd620004SJunchao Zhang   leafdirect_mpi = leafdirect[PETSCSF_REMOTE] && (leafmtype_mpi == leafmtype)? 1 : 0;
536cd620004SJunchao Zhang 
537cd620004SJunchao Zhang   direction = (sfop == PETSCSF_BCAST)? PETSCSF_ROOT2LEAF : PETSCSF_LEAF2ROOT;
538cd620004SJunchao Zhang   nrootreqs = bas->nrootreqs;
539cd620004SJunchao Zhang   nleafreqs = sf->nleafreqs;
540cd620004SJunchao Zhang 
541cd620004SJunchao Zhang   /* Look for free links in cache */
542cd620004SJunchao Zhang   for (p=&bas->avail; (link=*p); p=&link->next) {
543cd620004SJunchao Zhang     ierr = MPIPetsc_Type_compare(unit,link->unit,&match);CHKERRQ(ierr);
544cd620004SJunchao Zhang     if (match) {
545cd620004SJunchao Zhang       /* If root/leafdata will be directly passed to MPI, test if the data used to initialized the MPI requests matches with current.
546cd620004SJunchao Zhang          If not, free old requests. New requests will be lazily init'ed until one calls PetscSFLinkGetMPIBuffersAndRequests().
547cd620004SJunchao Zhang       */
548cd620004SJunchao Zhang       if (rootdirect_mpi && sf->persistent && link->rootreqsinited[direction][rootmtype][1] && link->rootdatadirect[direction][rootmtype] != rootdata) {
549cd620004SJunchao Zhang         reqs = link->rootreqs[direction][rootmtype][1]; /* Here, rootmtype = rootmtype_mpi */
550ffc4695bSBarry Smith         for (i=0; i<nrootreqs; i++) {if (reqs[i] != MPI_REQUEST_NULL) {ierr = MPI_Request_free(&reqs[i]);CHKERRMPI(ierr);}}
551cd620004SJunchao Zhang         link->rootreqsinited[direction][rootmtype][1] = PETSC_FALSE;
552cd620004SJunchao Zhang       }
553cd620004SJunchao Zhang       if (leafdirect_mpi && sf->persistent && link->leafreqsinited[direction][leafmtype][1] && link->leafdatadirect[direction][leafmtype] != leafdata) {
554cd620004SJunchao Zhang         reqs = link->leafreqs[direction][leafmtype][1];
555ffc4695bSBarry Smith         for (i=0; i<nleafreqs; i++) {if (reqs[i] != MPI_REQUEST_NULL) {ierr = MPI_Request_free(&reqs[i]);CHKERRMPI(ierr);}}
556cd620004SJunchao Zhang         link->leafreqsinited[direction][leafmtype][1] = PETSC_FALSE;
557cd620004SJunchao Zhang       }
558cd620004SJunchao Zhang       *p = link->next; /* Remove from available list */
559cd620004SJunchao Zhang       goto found;
560cd620004SJunchao Zhang     }
561cd620004SJunchao Zhang   }
562cd620004SJunchao Zhang 
563cd620004SJunchao Zhang   ierr = PetscNew(&link);CHKERRQ(ierr);
564cd620004SJunchao Zhang   ierr = PetscSFLinkSetUp_Host(sf,link,unit);CHKERRQ(ierr);
565cd620004SJunchao Zhang   ierr = PetscCommGetNewTag(PetscObjectComm((PetscObject)sf),&link->tag);CHKERRQ(ierr); /* One tag per link */
566cd620004SJunchao Zhang 
567cd620004SJunchao Zhang   nreqs = (nrootreqs+nleafreqs)*8;
568cd620004SJunchao Zhang   ierr  = PetscMalloc1(nreqs,&link->reqs);CHKERRQ(ierr);
569cd620004SJunchao Zhang   for (i=0; i<nreqs; i++) link->reqs[i] = MPI_REQUEST_NULL; /* Initialized to NULL so that we know which need to be freed in Destroy */
570cd620004SJunchao Zhang 
571cd620004SJunchao Zhang   for (i=0; i<2; i++) { /* Two communication directions */
572cd620004SJunchao Zhang     for (j=0; j<2; j++) { /* Two memory types */
573cd620004SJunchao Zhang       for (k=0; k<2; k++) { /* root/leafdirect 0 or 1 */
574cd620004SJunchao Zhang         link->rootreqs[i][j][k] = link->reqs + nrootreqs*(4*i+2*j+k);
575cd620004SJunchao Zhang         link->leafreqs[i][j][k] = link->reqs + nrootreqs*8 + nleafreqs*(4*i+2*j+k);
576cd620004SJunchao Zhang       }
577cd620004SJunchao Zhang     }
578cd620004SJunchao Zhang   }
579cd620004SJunchao Zhang 
580cd620004SJunchao Zhang found:
58120c24465SJunchao Zhang 
5827fd2d3dbSJunchao Zhang #if defined(PETSC_HAVE_DEVICE)
583a247e58cSJunchao Zhang   if ((PetscMemTypeDevice(xrootmtype) || PetscMemTypeDevice(xleafmtype)) && !link->deviceinited) {
58420c24465SJunchao Zhang     #if defined(PETSC_HAVE_CUDA)
58520c24465SJunchao Zhang       if (sf->backend == PETSCSF_BACKEND_CUDA)   {ierr = PetscSFLinkSetUp_Cuda(sf,link,unit);CHKERRQ(ierr);}
58620c24465SJunchao Zhang     #endif
58759af0bd3SScott Kruger     #if defined(PETSC_HAVE_HIP)
58859af0bd3SScott Kruger       if (sf->backend == PETSCSF_BACKEND_HIP)   {ierr = PetscSFLinkSetUp_Hip(sf,link,unit);CHKERRQ(ierr);}
58959af0bd3SScott Kruger     #endif
59020c24465SJunchao Zhang     #if defined(PETSC_HAVE_KOKKOS)
59120c24465SJunchao Zhang       if (sf->backend == PETSCSF_BACKEND_KOKKOS) {ierr = PetscSFLinkSetUp_Kokkos(sf,link,unit);CHKERRQ(ierr);}
59220c24465SJunchao Zhang     #endif
59320c24465SJunchao Zhang   }
5947fd2d3dbSJunchao Zhang #endif
595cd620004SJunchao Zhang 
596cd620004SJunchao Zhang   /* Allocate buffers along root/leafdata */
597cd620004SJunchao Zhang   for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) {
598cd620004SJunchao Zhang     /* For local communication, buffers are only needed when roots and leaves have different mtypes */
599cd620004SJunchao Zhang     if (i == PETSCSF_LOCAL && rootmtype == leafmtype) continue;
600cd620004SJunchao Zhang     if (bas->rootbuflen[i]) {
601cd620004SJunchao Zhang       if (rootdirect[i]) { /* Aha, we disguise rootdata as rootbuf */
602cd620004SJunchao Zhang         link->rootbuf[i][rootmtype] = (char*)rootdata + bas->rootstart[i]*link->unitbytes;
603cd620004SJunchao Zhang       } else { /* Have to have a separate rootbuf */
604cd620004SJunchao Zhang         if (!link->rootbuf_alloc[i][rootmtype]) {
60520c24465SJunchao Zhang           ierr = PetscSFMalloc(sf,rootmtype,bas->rootbuflen[i]*link->unitbytes,(void**)&link->rootbuf_alloc[i][rootmtype]);CHKERRQ(ierr);
606cd620004SJunchao Zhang         }
607cd620004SJunchao Zhang         link->rootbuf[i][rootmtype] = link->rootbuf_alloc[i][rootmtype];
608cd620004SJunchao Zhang       }
609cd620004SJunchao Zhang     }
610cd620004SJunchao Zhang 
611cd620004SJunchao Zhang     if (sf->leafbuflen[i]) {
612cd620004SJunchao Zhang       if (leafdirect[i]) {
613cd620004SJunchao Zhang         link->leafbuf[i][leafmtype] = (char*)leafdata + sf->leafstart[i]*link->unitbytes;
614cd620004SJunchao Zhang       } else {
615cd620004SJunchao Zhang         if (!link->leafbuf_alloc[i][leafmtype]) {
61620c24465SJunchao Zhang           ierr = PetscSFMalloc(sf,leafmtype,sf->leafbuflen[i]*link->unitbytes,(void**)&link->leafbuf_alloc[i][leafmtype]);CHKERRQ(ierr);
617cd620004SJunchao Zhang         }
618cd620004SJunchao Zhang         link->leafbuf[i][leafmtype] = link->leafbuf_alloc[i][leafmtype];
619cd620004SJunchao Zhang       }
620cd620004SJunchao Zhang     }
621cd620004SJunchao Zhang   }
622cd620004SJunchao Zhang 
6237fd2d3dbSJunchao Zhang #if defined(PETSC_HAVE_DEVICE)
624cd620004SJunchao Zhang   /* Allocate buffers on host for buffering data on device in cast not use_gpu_aware_mpi */
625cd620004SJunchao Zhang   if (rootmtype == PETSC_MEMTYPE_DEVICE && rootmtype_mpi == PETSC_MEMTYPE_HOST) {
626cd620004SJunchao Zhang     if (!link->rootbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]) {
627cd620004SJunchao Zhang       ierr = PetscMalloc(bas->rootbuflen[PETSCSF_REMOTE]*link->unitbytes,&link->rootbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]);CHKERRQ(ierr);
628cd620004SJunchao Zhang     }
629cd620004SJunchao Zhang     link->rootbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST] = link->rootbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST];
630cd620004SJunchao Zhang   }
631cd620004SJunchao Zhang   if (leafmtype == PETSC_MEMTYPE_DEVICE && leafmtype_mpi == PETSC_MEMTYPE_HOST) {
632cd620004SJunchao Zhang     if (!link->leafbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]) {
633cd620004SJunchao Zhang       ierr = PetscMalloc(sf->leafbuflen[PETSCSF_REMOTE]*link->unitbytes,&link->leafbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]);CHKERRQ(ierr);
634cd620004SJunchao Zhang     }
635cd620004SJunchao Zhang     link->leafbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST] = link->leafbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST];
636cd620004SJunchao Zhang   }
6377fd2d3dbSJunchao Zhang #endif
638cd620004SJunchao Zhang 
639cd620004SJunchao Zhang   /* Set `current` state of the link. They may change between different SF invocations with the same link */
640cd620004SJunchao Zhang   if (sf->persistent) { /* If data is directly passed to MPI and inits MPI requests, record the data for comparison on future invocations */
641cd620004SJunchao Zhang     if (rootdirect_mpi) link->rootdatadirect[direction][rootmtype] = rootdata;
642cd620004SJunchao Zhang     if (leafdirect_mpi) link->leafdatadirect[direction][leafmtype] = leafdata;
643cd620004SJunchao Zhang   }
644cd620004SJunchao Zhang 
645cd620004SJunchao Zhang   link->rootdata = rootdata; /* root/leafdata are keys to look up links in PetscSFXxxEnd */
646cd620004SJunchao Zhang   link->leafdata = leafdata;
647cd620004SJunchao Zhang   for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) {
648cd620004SJunchao Zhang     link->rootdirect[i] = rootdirect[i];
649cd620004SJunchao Zhang     link->leafdirect[i] = leafdirect[i];
650cd620004SJunchao Zhang   }
651cd620004SJunchao Zhang   link->rootdirect_mpi  = rootdirect_mpi;
652cd620004SJunchao Zhang   link->leafdirect_mpi  = leafdirect_mpi;
653cd620004SJunchao Zhang   link->rootmtype       = rootmtype;
654cd620004SJunchao Zhang   link->leafmtype       = leafmtype;
655cd620004SJunchao Zhang   link->rootmtype_mpi   = rootmtype_mpi;
656cd620004SJunchao Zhang   link->leafmtype_mpi   = leafmtype_mpi;
657cd620004SJunchao Zhang 
658cd620004SJunchao Zhang   link->next            = bas->inuse;
659cd620004SJunchao Zhang   bas->inuse            = link;
660cd620004SJunchao Zhang   *mylink               = link;
661cd620004SJunchao Zhang   PetscFunctionReturn(0);
662cd620004SJunchao Zhang }
663cd620004SJunchao Zhang 
664cd620004SJunchao Zhang /* Return root/leaf buffers and MPI requests attached to the link for MPI communication in the given direction.
665cd620004SJunchao Zhang    If the sf uses persistent requests and the requests have not been initialized, then initialize them.
666cd620004SJunchao Zhang */
667cd620004SJunchao Zhang PetscErrorCode PetscSFLinkGetMPIBuffersAndRequests(PetscSF sf,PetscSFLink link,PetscSFDirection direction,void **rootbuf, void **leafbuf,MPI_Request **rootreqs,MPI_Request **leafreqs)
668cd620004SJunchao Zhang {
669cd620004SJunchao Zhang   PetscErrorCode       ierr;
670cd620004SJunchao Zhang   PetscSF_Basic        *bas = (PetscSF_Basic*)sf->data;
671cd620004SJunchao Zhang   PetscInt             i,j,nrootranks,ndrootranks,nleafranks,ndleafranks;
672cd620004SJunchao Zhang   const PetscInt       *rootoffset,*leafoffset;
673cd620004SJunchao Zhang   PetscMPIInt          n;
674cd620004SJunchao Zhang   MPI_Aint             disp;
675cd620004SJunchao Zhang   MPI_Comm             comm = PetscObjectComm((PetscObject)sf);
676cd620004SJunchao Zhang   MPI_Datatype         unit = link->unit;
677cd620004SJunchao Zhang   const PetscMemType   rootmtype_mpi = link->rootmtype_mpi,leafmtype_mpi = link->leafmtype_mpi; /* Used to select buffers passed to MPI */
678cd620004SJunchao Zhang   const PetscInt       rootdirect_mpi = link->rootdirect_mpi,leafdirect_mpi = link->leafdirect_mpi;
679cd620004SJunchao Zhang 
680cd620004SJunchao Zhang   PetscFunctionBegin;
681cd620004SJunchao Zhang   /* Init persistent MPI requests if not yet. Currently only SFBasic uses persistent MPI */
682cd620004SJunchao Zhang   if (sf->persistent) {
683cd620004SJunchao Zhang     if (rootreqs && bas->rootbuflen[PETSCSF_REMOTE] && !link->rootreqsinited[direction][rootmtype_mpi][rootdirect_mpi]) {
684cd620004SJunchao Zhang       ierr = PetscSFGetRootInfo_Basic(sf,&nrootranks,&ndrootranks,NULL,&rootoffset,NULL);CHKERRQ(ierr);
685cd620004SJunchao Zhang       if (direction == PETSCSF_LEAF2ROOT) {
686cd620004SJunchao Zhang         for (i=ndrootranks,j=0; i<nrootranks; i++,j++) {
687cd620004SJunchao Zhang           disp = (rootoffset[i] - rootoffset[ndrootranks])*link->unitbytes;
688cd620004SJunchao Zhang           ierr = PetscMPIIntCast(rootoffset[i+1]-rootoffset[i],&n);CHKERRQ(ierr);
689ffc4695bSBarry Smith           ierr = MPI_Recv_init(link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi]+disp,n,unit,bas->iranks[i],link->tag,comm,link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi]+j);CHKERRMPI(ierr);
690cd620004SJunchao Zhang         }
691cd620004SJunchao Zhang       } else { /* PETSCSF_ROOT2LEAF */
692cd620004SJunchao Zhang         for (i=ndrootranks,j=0; i<nrootranks; i++,j++) {
693cd620004SJunchao Zhang           disp = (rootoffset[i] - rootoffset[ndrootranks])*link->unitbytes;
694cd620004SJunchao Zhang           ierr = PetscMPIIntCast(rootoffset[i+1]-rootoffset[i],&n);CHKERRQ(ierr);
695ffc4695bSBarry Smith           ierr = MPI_Send_init(link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi]+disp,n,unit,bas->iranks[i],link->tag,comm,link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi]+j);CHKERRMPI(ierr);
696cd620004SJunchao Zhang         }
697cd620004SJunchao Zhang       }
698cd620004SJunchao Zhang       link->rootreqsinited[direction][rootmtype_mpi][rootdirect_mpi] = PETSC_TRUE;
699cd620004SJunchao Zhang     }
700cd620004SJunchao Zhang 
701cd620004SJunchao Zhang     if (leafreqs && sf->leafbuflen[PETSCSF_REMOTE] && !link->leafreqsinited[direction][leafmtype_mpi][leafdirect_mpi]) {
702cd620004SJunchao Zhang       ierr = PetscSFGetLeafInfo_Basic(sf,&nleafranks,&ndleafranks,NULL,&leafoffset,NULL,NULL);CHKERRQ(ierr);
703cd620004SJunchao Zhang       if (direction == PETSCSF_LEAF2ROOT) {
704cd620004SJunchao Zhang         for (i=ndleafranks,j=0; i<nleafranks; i++,j++) {
705cd620004SJunchao Zhang           disp = (leafoffset[i] - leafoffset[ndleafranks])*link->unitbytes;
706cd620004SJunchao Zhang           ierr = PetscMPIIntCast(leafoffset[i+1]-leafoffset[i],&n);CHKERRQ(ierr);
707ffc4695bSBarry Smith           ierr = MPI_Send_init(link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi]+disp,n,unit,sf->ranks[i],link->tag,comm,link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi]+j);CHKERRMPI(ierr);
708cd620004SJunchao Zhang         }
709cd620004SJunchao Zhang       } else { /* PETSCSF_ROOT2LEAF */
710cd620004SJunchao Zhang         for (i=ndleafranks,j=0; i<nleafranks; i++,j++) {
711cd620004SJunchao Zhang           disp = (leafoffset[i] - leafoffset[ndleafranks])*link->unitbytes;
712cd620004SJunchao Zhang           ierr = PetscMPIIntCast(leafoffset[i+1]-leafoffset[i],&n);CHKERRQ(ierr);
713ffc4695bSBarry Smith           ierr = MPI_Recv_init(link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi]+disp,n,unit,sf->ranks[i],link->tag,comm,link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi]+j);CHKERRMPI(ierr);
714cd620004SJunchao Zhang         }
715cd620004SJunchao Zhang       }
716cd620004SJunchao Zhang       link->leafreqsinited[direction][leafmtype_mpi][leafdirect_mpi] = PETSC_TRUE;
717cd620004SJunchao Zhang     }
718cd620004SJunchao Zhang   }
719cd620004SJunchao Zhang   if (rootbuf)  *rootbuf  = link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi];
720cd620004SJunchao Zhang   if (leafbuf)  *leafbuf  = link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi];
721cd620004SJunchao Zhang   if (rootreqs) *rootreqs = link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi];
722cd620004SJunchao Zhang   if (leafreqs) *leafreqs = link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi];
723cd620004SJunchao Zhang   PetscFunctionReturn(0);
724cd620004SJunchao Zhang }
725cd620004SJunchao Zhang 
726cd620004SJunchao Zhang 
727cd620004SJunchao Zhang PetscErrorCode PetscSFLinkGetInUse(PetscSF sf,MPI_Datatype unit,const void *rootdata,const void *leafdata,PetscCopyMode cmode,PetscSFLink *mylink)
728cd620004SJunchao Zhang {
729cd620004SJunchao Zhang   PetscErrorCode    ierr;
730cd620004SJunchao Zhang   PetscSFLink       link,*p;
73140e23c03SJunchao Zhang   PetscSF_Basic     *bas=(PetscSF_Basic*)sf->data;
73240e23c03SJunchao Zhang 
73340e23c03SJunchao Zhang   PetscFunctionBegin;
73440e23c03SJunchao Zhang   /* Look for types in cache */
73540e23c03SJunchao Zhang   for (p=&bas->inuse; (link=*p); p=&link->next) {
73640e23c03SJunchao Zhang     PetscBool match;
73740e23c03SJunchao Zhang     ierr = MPIPetsc_Type_compare(unit,link->unit,&match);CHKERRQ(ierr);
738637e6665SJunchao Zhang     if (match && (rootdata == link->rootdata) && (leafdata == link->leafdata)) {
73940e23c03SJunchao Zhang       switch (cmode) {
74040e23c03SJunchao Zhang       case PETSC_OWN_POINTER: *p = link->next; break; /* Remove from inuse list */
74140e23c03SJunchao Zhang       case PETSC_USE_POINTER: break;
74240e23c03SJunchao Zhang       default: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"invalid cmode");
74340e23c03SJunchao Zhang       }
74440e23c03SJunchao Zhang       *mylink = link;
74540e23c03SJunchao Zhang       PetscFunctionReturn(0);
74640e23c03SJunchao Zhang     }
74740e23c03SJunchao Zhang   }
74840e23c03SJunchao Zhang   SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Could not find pack");
74940e23c03SJunchao Zhang }
75040e23c03SJunchao Zhang 
751cd620004SJunchao Zhang PetscErrorCode PetscSFLinkReclaim(PetscSF sf,PetscSFLink *link)
75240e23c03SJunchao Zhang {
75340e23c03SJunchao Zhang   PetscSF_Basic     *bas = (PetscSF_Basic*)sf->data;
75440e23c03SJunchao Zhang 
75540e23c03SJunchao Zhang   PetscFunctionBegin;
756637e6665SJunchao Zhang   (*link)->rootdata = NULL;
757637e6665SJunchao Zhang   (*link)->leafdata = NULL;
75840e23c03SJunchao Zhang   (*link)->next     = bas->avail;
75940e23c03SJunchao Zhang   bas->avail        = *link;
76040e23c03SJunchao Zhang   *link             = NULL;
76140e23c03SJunchao Zhang   PetscFunctionReturn(0);
76240e23c03SJunchao Zhang }
76340e23c03SJunchao Zhang 
764cd620004SJunchao Zhang /* Destroy all links chained in 'avail' */
765cd620004SJunchao Zhang PetscErrorCode PetscSFLinkDestroy(PetscSF sf,PetscSFLink *avail)
766eb02082bSJunchao Zhang {
767eb02082bSJunchao Zhang   PetscErrorCode    ierr;
768cd620004SJunchao Zhang   PetscSFLink       link = *avail,next;
769cd620004SJunchao Zhang   PetscSF_Basic     *bas = (PetscSF_Basic*)sf->data;
770cd620004SJunchao Zhang   PetscInt          i,nreqs = (bas->nrootreqs+sf->nleafreqs)*8;
771eb02082bSJunchao Zhang 
772eb02082bSJunchao Zhang   PetscFunctionBegin;
773eb02082bSJunchao Zhang   for (; link; link=next) {
774eb02082bSJunchao Zhang     next = link->next;
775ffc4695bSBarry Smith     if (!link->isbuiltin) {ierr = MPI_Type_free(&link->unit);CHKERRMPI(ierr);}
776cd620004SJunchao Zhang     for (i=0; i<nreqs; i++) { /* Persistent reqs must be freed. */
777ffc4695bSBarry Smith       if (link->reqs[i] != MPI_REQUEST_NULL) {ierr = MPI_Request_free(&link->reqs[i]);CHKERRMPI(ierr);}
778eb02082bSJunchao Zhang     }
779eb02082bSJunchao Zhang     ierr = PetscFree(link->reqs);CHKERRQ(ierr);
780cd620004SJunchao Zhang     for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) {
781cd620004SJunchao Zhang       ierr = PetscFree(link->rootbuf_alloc[i][PETSC_MEMTYPE_HOST]);CHKERRQ(ierr);
782cd620004SJunchao Zhang       ierr = PetscFree(link->leafbuf_alloc[i][PETSC_MEMTYPE_HOST]);CHKERRQ(ierr);
7837fd2d3dbSJunchao Zhang       #if defined(PETSC_HAVE_DEVICE)
78420c24465SJunchao Zhang       ierr = PetscSFFree(sf,PETSC_MEMTYPE_DEVICE,link->rootbuf_alloc[i][PETSC_MEMTYPE_DEVICE]);CHKERRQ(ierr);
78520c24465SJunchao Zhang       ierr = PetscSFFree(sf,PETSC_MEMTYPE_DEVICE,link->leafbuf_alloc[i][PETSC_MEMTYPE_DEVICE]);CHKERRQ(ierr);
7867fd2d3dbSJunchao Zhang       #endif
787cd620004SJunchao Zhang     }
7887fd2d3dbSJunchao Zhang   #if defined(PETSC_HAVE_DEVICE)
7897fd2d3dbSJunchao Zhang     if (link->Destroy) {ierr = (*link->Destroy)(link);CHKERRQ(ierr);}
79059af0bd3SScott Kruger    /*TODO:  Make runtime */
7917fd2d3dbSJunchao Zhang    #if defined(PETSC_HAVE_CUDA)
7927fd2d3dbSJunchao Zhang     if (link->stream) {cudaError_t cerr = cudaStreamDestroy(link->stream);CHKERRCUDA(cerr); link->stream = NULL;}
7937fd2d3dbSJunchao Zhang    #elif defined(PETSC_HAVE_HIP)
79459af0bd3SScott Kruger     if (link->stream) {hipError_t  cerr = hipStreamDestroy(link->stream);CHKERRHIP(cerr); link->stream = NULL;}
7957fd2d3dbSJunchao Zhang    #endif
7967fd2d3dbSJunchao Zhang   #endif
797eb02082bSJunchao Zhang     ierr = PetscFree(link);CHKERRQ(ierr);
798eb02082bSJunchao Zhang   }
799eb02082bSJunchao Zhang   *avail = NULL;
800eb02082bSJunchao Zhang   PetscFunctionReturn(0);
801eb02082bSJunchao Zhang }
802eb02082bSJunchao Zhang 
8039d1c8addSJunchao Zhang /* Error out on unsupported overlapped communications */
804cd620004SJunchao Zhang PetscErrorCode PetscSFSetErrorOnUnsupportedOverlap(PetscSF sf,MPI_Datatype unit,const void *rootdata,const void *leafdata)
8059d1c8addSJunchao Zhang {
8069d1c8addSJunchao Zhang   PetscErrorCode    ierr;
807cd620004SJunchao Zhang   PetscSFLink       link,*p;
8089d1c8addSJunchao Zhang   PetscSF_Basic     *bas = (PetscSF_Basic*)sf->data;
8099d1c8addSJunchao Zhang   PetscBool         match;
8109d1c8addSJunchao Zhang 
8119d1c8addSJunchao Zhang   PetscFunctionBegin;
812b458e8f1SJose E. Roman   if (PetscDefined(USE_DEBUG)) {
81318fb5014SJunchao Zhang     /* Look up links in use and error out if there is a match. When both rootdata and leafdata are NULL, ignore
81418fb5014SJunchao Zhang        the potential overlapping since this process does not participate in communication. Overlapping is harmless.
81518fb5014SJunchao Zhang     */
816637e6665SJunchao Zhang     if (rootdata || leafdata) {
8179d1c8addSJunchao Zhang       for (p=&bas->inuse; (link=*p); p=&link->next) {
8189d1c8addSJunchao Zhang         ierr = MPIPetsc_Type_compare(unit,link->unit,&match);CHKERRQ(ierr);
819cd620004SJunchao Zhang         if (match && (rootdata == link->rootdata) && (leafdata == link->leafdata)) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_SUP,"Overlapped PetscSF with the same rootdata(%p), leafdata(%p) and data type. Undo the overlapping to avoid the error.",rootdata,leafdata);
8209d1c8addSJunchao Zhang       }
82118fb5014SJunchao Zhang     }
822b458e8f1SJose E. Roman   }
8239d1c8addSJunchao Zhang   PetscFunctionReturn(0);
8249d1c8addSJunchao Zhang }
8259d1c8addSJunchao Zhang 
82620c24465SJunchao Zhang static PetscErrorCode PetscSFLinkMemcpy_Host(PetscSFLink link,PetscMemType dstmtype,void* dst,PetscMemType srcmtype,const void*src,size_t n)
82720c24465SJunchao Zhang {
82820c24465SJunchao Zhang   PetscFunctionBegin;
82920c24465SJunchao Zhang   if (n) {PetscErrorCode ierr = PetscMemcpy(dst,src,n);CHKERRQ(ierr);}
83020c24465SJunchao Zhang   PetscFunctionReturn(0);
83120c24465SJunchao Zhang }
83220c24465SJunchao Zhang 
83320c24465SJunchao Zhang 
834cd620004SJunchao Zhang PetscErrorCode PetscSFLinkSetUp_Host(PetscSF sf,PetscSFLink link,MPI_Datatype unit)
83540e23c03SJunchao Zhang {
83640e23c03SJunchao Zhang   PetscErrorCode ierr;
837b23bfdefSJunchao Zhang   PetscInt       nSignedChar=0,nUnsignedChar=0,nInt=0,nPetscInt=0,nPetscReal=0;
838b23bfdefSJunchao Zhang   PetscBool      is2Int,is2PetscInt;
83940e23c03SJunchao Zhang   PetscMPIInt    ni,na,nd,combiner;
84040e23c03SJunchao Zhang #if defined(PETSC_HAVE_COMPLEX)
841b23bfdefSJunchao Zhang   PetscInt       nPetscComplex=0;
84240e23c03SJunchao Zhang #endif
84340e23c03SJunchao Zhang 
84440e23c03SJunchao Zhang   PetscFunctionBegin;
845b23bfdefSJunchao Zhang   ierr = MPIPetsc_Type_compare_contig(unit,MPI_SIGNED_CHAR,  &nSignedChar);CHKERRQ(ierr);
846b23bfdefSJunchao Zhang   ierr = MPIPetsc_Type_compare_contig(unit,MPI_UNSIGNED_CHAR,&nUnsignedChar);CHKERRQ(ierr);
847b23bfdefSJunchao Zhang   /* MPI_CHAR is treated below as a dumb type that does not support reduction according to MPI standard */
848b23bfdefSJunchao Zhang   ierr = MPIPetsc_Type_compare_contig(unit,MPI_INT,  &nInt);CHKERRQ(ierr);
849b23bfdefSJunchao Zhang   ierr = MPIPetsc_Type_compare_contig(unit,MPIU_INT, &nPetscInt);CHKERRQ(ierr);
850b23bfdefSJunchao Zhang   ierr = MPIPetsc_Type_compare_contig(unit,MPIU_REAL,&nPetscReal);CHKERRQ(ierr);
85140e23c03SJunchao Zhang #if defined(PETSC_HAVE_COMPLEX)
852b23bfdefSJunchao Zhang   ierr = MPIPetsc_Type_compare_contig(unit,MPIU_COMPLEX,&nPetscComplex);CHKERRQ(ierr);
85340e23c03SJunchao Zhang #endif
85440e23c03SJunchao Zhang   ierr = MPIPetsc_Type_compare(unit,MPI_2INT,&is2Int);CHKERRQ(ierr);
85540e23c03SJunchao Zhang   ierr = MPIPetsc_Type_compare(unit,MPIU_2INT,&is2PetscInt);CHKERRQ(ierr);
856b23bfdefSJunchao Zhang   /* TODO: shaell we also handle Fortran MPI_2REAL? */
857ffc4695bSBarry Smith   ierr = MPI_Type_get_envelope(unit,&ni,&na,&nd,&combiner);CHKERRMPI(ierr);
8585ad15460SJunchao Zhang   link->isbuiltin = (combiner == MPI_COMBINER_NAMED) ? PETSC_TRUE : PETSC_FALSE; /* unit is MPI builtin */
859b23bfdefSJunchao Zhang   link->bs = 1; /* default */
86040e23c03SJunchao Zhang 
861eb02082bSJunchao Zhang   if (is2Int) {
862cd620004SJunchao Zhang     PackInit_PairType_int_int_1_1(link);
863eb02082bSJunchao Zhang     link->bs        = 1;
864eb02082bSJunchao Zhang     link->unitbytes = 2*sizeof(int);
8655ad15460SJunchao Zhang     link->isbuiltin = PETSC_TRUE; /* unit is PETSc builtin */
866eb02082bSJunchao Zhang     link->basicunit = MPI_2INT;
8675ad15460SJunchao Zhang     link->unit      = MPI_2INT;
868eb02082bSJunchao Zhang   } else if (is2PetscInt) { /* TODO: when is2PetscInt and nPetscInt=2, we don't know which path to take. The two paths support different ops. */
869cd620004SJunchao Zhang     PackInit_PairType_PetscInt_PetscInt_1_1(link);
870eb02082bSJunchao Zhang     link->bs        = 1;
871eb02082bSJunchao Zhang     link->unitbytes = 2*sizeof(PetscInt);
872eb02082bSJunchao Zhang     link->basicunit = MPIU_2INT;
8735ad15460SJunchao Zhang     link->isbuiltin = PETSC_TRUE; /* unit is PETSc builtin */
8745ad15460SJunchao Zhang     link->unit      = MPIU_2INT;
875eb02082bSJunchao Zhang   } else if (nPetscReal) {
876b23bfdefSJunchao Zhang     if      (nPetscReal == 8) PackInit_RealType_PetscReal_8_1(link); else if (nPetscReal%8 == 0) PackInit_RealType_PetscReal_8_0(link);
877b23bfdefSJunchao Zhang     else if (nPetscReal == 4) PackInit_RealType_PetscReal_4_1(link); else if (nPetscReal%4 == 0) PackInit_RealType_PetscReal_4_0(link);
878b23bfdefSJunchao Zhang     else if (nPetscReal == 2) PackInit_RealType_PetscReal_2_1(link); else if (nPetscReal%2 == 0) PackInit_RealType_PetscReal_2_0(link);
879b23bfdefSJunchao Zhang     else if (nPetscReal == 1) PackInit_RealType_PetscReal_1_1(link); else if (nPetscReal%1 == 0) PackInit_RealType_PetscReal_1_0(link);
880b23bfdefSJunchao Zhang     link->bs        = nPetscReal;
881eb02082bSJunchao Zhang     link->unitbytes = nPetscReal*sizeof(PetscReal);
88240e23c03SJunchao Zhang     link->basicunit = MPIU_REAL;
8835ad15460SJunchao Zhang     if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPIU_REAL;}
884b23bfdefSJunchao Zhang   } else if (nPetscInt) {
885b23bfdefSJunchao Zhang     if      (nPetscInt == 8) PackInit_IntegerType_PetscInt_8_1(link); else if (nPetscInt%8 == 0) PackInit_IntegerType_PetscInt_8_0(link);
886b23bfdefSJunchao Zhang     else if (nPetscInt == 4) PackInit_IntegerType_PetscInt_4_1(link); else if (nPetscInt%4 == 0) PackInit_IntegerType_PetscInt_4_0(link);
887b23bfdefSJunchao Zhang     else if (nPetscInt == 2) PackInit_IntegerType_PetscInt_2_1(link); else if (nPetscInt%2 == 0) PackInit_IntegerType_PetscInt_2_0(link);
888b23bfdefSJunchao Zhang     else if (nPetscInt == 1) PackInit_IntegerType_PetscInt_1_1(link); else if (nPetscInt%1 == 0) PackInit_IntegerType_PetscInt_1_0(link);
889b23bfdefSJunchao Zhang     link->bs        = nPetscInt;
890eb02082bSJunchao Zhang     link->unitbytes = nPetscInt*sizeof(PetscInt);
891b23bfdefSJunchao Zhang     link->basicunit = MPIU_INT;
8925ad15460SJunchao Zhang     if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPIU_INT;}
893b23bfdefSJunchao Zhang #if defined(PETSC_USE_64BIT_INDICES)
894b23bfdefSJunchao Zhang   } else if (nInt) {
895b23bfdefSJunchao Zhang     if      (nInt == 8) PackInit_IntegerType_int_8_1(link); else if (nInt%8 == 0) PackInit_IntegerType_int_8_0(link);
896b23bfdefSJunchao Zhang     else if (nInt == 4) PackInit_IntegerType_int_4_1(link); else if (nInt%4 == 0) PackInit_IntegerType_int_4_0(link);
897b23bfdefSJunchao Zhang     else if (nInt == 2) PackInit_IntegerType_int_2_1(link); else if (nInt%2 == 0) PackInit_IntegerType_int_2_0(link);
898b23bfdefSJunchao Zhang     else if (nInt == 1) PackInit_IntegerType_int_1_1(link); else if (nInt%1 == 0) PackInit_IntegerType_int_1_0(link);
899b23bfdefSJunchao Zhang     link->bs        = nInt;
900eb02082bSJunchao Zhang     link->unitbytes = nInt*sizeof(int);
901b23bfdefSJunchao Zhang     link->basicunit = MPI_INT;
9025ad15460SJunchao Zhang     if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPI_INT;}
903b23bfdefSJunchao Zhang #endif
904b23bfdefSJunchao Zhang   } else if (nSignedChar) {
905b23bfdefSJunchao Zhang     if      (nSignedChar == 8) PackInit_IntegerType_SignedChar_8_1(link); else if (nSignedChar%8 == 0) PackInit_IntegerType_SignedChar_8_0(link);
906b23bfdefSJunchao Zhang     else if (nSignedChar == 4) PackInit_IntegerType_SignedChar_4_1(link); else if (nSignedChar%4 == 0) PackInit_IntegerType_SignedChar_4_0(link);
907b23bfdefSJunchao Zhang     else if (nSignedChar == 2) PackInit_IntegerType_SignedChar_2_1(link); else if (nSignedChar%2 == 0) PackInit_IntegerType_SignedChar_2_0(link);
908b23bfdefSJunchao Zhang     else if (nSignedChar == 1) PackInit_IntegerType_SignedChar_1_1(link); else if (nSignedChar%1 == 0) PackInit_IntegerType_SignedChar_1_0(link);
909b23bfdefSJunchao Zhang     link->bs        = nSignedChar;
910eb02082bSJunchao Zhang     link->unitbytes = nSignedChar*sizeof(SignedChar);
911b23bfdefSJunchao Zhang     link->basicunit = MPI_SIGNED_CHAR;
9125ad15460SJunchao Zhang     if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPI_SIGNED_CHAR;}
913b23bfdefSJunchao Zhang   }  else if (nUnsignedChar) {
914b23bfdefSJunchao Zhang     if      (nUnsignedChar == 8) PackInit_IntegerType_UnsignedChar_8_1(link); else if (nUnsignedChar%8 == 0) PackInit_IntegerType_UnsignedChar_8_0(link);
915b23bfdefSJunchao Zhang     else if (nUnsignedChar == 4) PackInit_IntegerType_UnsignedChar_4_1(link); else if (nUnsignedChar%4 == 0) PackInit_IntegerType_UnsignedChar_4_0(link);
916b23bfdefSJunchao Zhang     else if (nUnsignedChar == 2) PackInit_IntegerType_UnsignedChar_2_1(link); else if (nUnsignedChar%2 == 0) PackInit_IntegerType_UnsignedChar_2_0(link);
917b23bfdefSJunchao Zhang     else if (nUnsignedChar == 1) PackInit_IntegerType_UnsignedChar_1_1(link); else if (nUnsignedChar%1 == 0) PackInit_IntegerType_UnsignedChar_1_0(link);
918b23bfdefSJunchao Zhang     link->bs        = nUnsignedChar;
919eb02082bSJunchao Zhang     link->unitbytes = nUnsignedChar*sizeof(UnsignedChar);
920b23bfdefSJunchao Zhang     link->basicunit = MPI_UNSIGNED_CHAR;
9215ad15460SJunchao Zhang     if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPI_UNSIGNED_CHAR;}
92240e23c03SJunchao Zhang #if defined(PETSC_HAVE_COMPLEX)
923b23bfdefSJunchao Zhang   } else if (nPetscComplex) {
924b23bfdefSJunchao Zhang     if      (nPetscComplex == 8) PackInit_ComplexType_PetscComplex_8_1(link); else if (nPetscComplex%8 == 0) PackInit_ComplexType_PetscComplex_8_0(link);
925b23bfdefSJunchao Zhang     else if (nPetscComplex == 4) PackInit_ComplexType_PetscComplex_4_1(link); else if (nPetscComplex%4 == 0) PackInit_ComplexType_PetscComplex_4_0(link);
926b23bfdefSJunchao Zhang     else if (nPetscComplex == 2) PackInit_ComplexType_PetscComplex_2_1(link); else if (nPetscComplex%2 == 0) PackInit_ComplexType_PetscComplex_2_0(link);
927b23bfdefSJunchao Zhang     else if (nPetscComplex == 1) PackInit_ComplexType_PetscComplex_1_1(link); else if (nPetscComplex%1 == 0) PackInit_ComplexType_PetscComplex_1_0(link);
928b23bfdefSJunchao Zhang     link->bs        = nPetscComplex;
929eb02082bSJunchao Zhang     link->unitbytes = nPetscComplex*sizeof(PetscComplex);
93040e23c03SJunchao Zhang     link->basicunit = MPIU_COMPLEX;
9315ad15460SJunchao Zhang     if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPIU_COMPLEX;}
93240e23c03SJunchao Zhang #endif
93340e23c03SJunchao Zhang   } else {
934b23bfdefSJunchao Zhang     MPI_Aint lb,nbyte;
935ffc4695bSBarry Smith     ierr = MPI_Type_get_extent(unit,&lb,&nbyte);CHKERRMPI(ierr);
93640e23c03SJunchao Zhang     if (lb != 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Datatype with nonzero lower bound %ld\n",(long)lb);
937eb02082bSJunchao Zhang     if (nbyte % sizeof(int)) { /* If the type size is not multiple of int */
938eb02082bSJunchao Zhang       if      (nbyte == 4) PackInit_DumbType_char_4_1(link); else if (nbyte%4 == 0) PackInit_DumbType_char_4_0(link);
939eb02082bSJunchao Zhang       else if (nbyte == 2) PackInit_DumbType_char_2_1(link); else if (nbyte%2 == 0) PackInit_DumbType_char_2_0(link);
940eb02082bSJunchao Zhang       else if (nbyte == 1) PackInit_DumbType_char_1_1(link); else if (nbyte%1 == 0) PackInit_DumbType_char_1_0(link);
941eb02082bSJunchao Zhang       link->bs        = nbyte;
942b23bfdefSJunchao Zhang       link->unitbytes = nbyte;
943b23bfdefSJunchao Zhang       link->basicunit = MPI_BYTE;
94440e23c03SJunchao Zhang     } else {
945eb02082bSJunchao Zhang       nInt = nbyte / sizeof(int);
946eb02082bSJunchao Zhang       if      (nInt == 8) PackInit_DumbType_DumbInt_8_1(link); else if (nInt%8 == 0) PackInit_DumbType_DumbInt_8_0(link);
947eb02082bSJunchao Zhang       else if (nInt == 4) PackInit_DumbType_DumbInt_4_1(link); else if (nInt%4 == 0) PackInit_DumbType_DumbInt_4_0(link);
948eb02082bSJunchao Zhang       else if (nInt == 2) PackInit_DumbType_DumbInt_2_1(link); else if (nInt%2 == 0) PackInit_DumbType_DumbInt_2_0(link);
949eb02082bSJunchao Zhang       else if (nInt == 1) PackInit_DumbType_DumbInt_1_1(link); else if (nInt%1 == 0) PackInit_DumbType_DumbInt_1_0(link);
950eb02082bSJunchao Zhang       link->bs        = nInt;
951b23bfdefSJunchao Zhang       link->unitbytes = nbyte;
95240e23c03SJunchao Zhang       link->basicunit = MPI_INT;
95340e23c03SJunchao Zhang     }
9545ad15460SJunchao Zhang     if (link->isbuiltin) link->unit = unit;
95540e23c03SJunchao Zhang   }
956b23bfdefSJunchao Zhang 
957ffc4695bSBarry Smith   if (!link->isbuiltin) {ierr = MPI_Type_dup(unit,&link->unit);CHKERRMPI(ierr);}
95820c24465SJunchao Zhang 
95920c24465SJunchao Zhang   link->Memcpy = PetscSFLinkMemcpy_Host;
96040e23c03SJunchao Zhang   PetscFunctionReturn(0);
96140e23c03SJunchao Zhang }
96240e23c03SJunchao Zhang 
963fcc7397dSJunchao Zhang PetscErrorCode PetscSFLinkGetUnpackAndOp(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*))
96440e23c03SJunchao Zhang {
96540e23c03SJunchao Zhang   PetscFunctionBegin;
96640e23c03SJunchao Zhang   *UnpackAndOp = NULL;
967eb02082bSJunchao Zhang   if (mtype == PETSC_MEMTYPE_HOST) {
968*83df288dSJunchao Zhang     if      (op == MPI_REPLACE)               *UnpackAndOp = link->h_UnpackAndInsert;
969eb02082bSJunchao Zhang     else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->h_UnpackAndAdd;
970eb02082bSJunchao Zhang     else if (op == MPI_PROD)                  *UnpackAndOp = link->h_UnpackAndMult;
971eb02082bSJunchao Zhang     else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->h_UnpackAndMax;
972eb02082bSJunchao Zhang     else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->h_UnpackAndMin;
973eb02082bSJunchao Zhang     else if (op == MPI_LAND)                  *UnpackAndOp = link->h_UnpackAndLAND;
974eb02082bSJunchao Zhang     else if (op == MPI_BAND)                  *UnpackAndOp = link->h_UnpackAndBAND;
975eb02082bSJunchao Zhang     else if (op == MPI_LOR)                   *UnpackAndOp = link->h_UnpackAndLOR;
976eb02082bSJunchao Zhang     else if (op == MPI_BOR)                   *UnpackAndOp = link->h_UnpackAndBOR;
977eb02082bSJunchao Zhang     else if (op == MPI_LXOR)                  *UnpackAndOp = link->h_UnpackAndLXOR;
978eb02082bSJunchao Zhang     else if (op == MPI_BXOR)                  *UnpackAndOp = link->h_UnpackAndBXOR;
979eb02082bSJunchao Zhang     else if (op == MPI_MAXLOC)                *UnpackAndOp = link->h_UnpackAndMaxloc;
980eb02082bSJunchao Zhang     else if (op == MPI_MINLOC)                *UnpackAndOp = link->h_UnpackAndMinloc;
981eb02082bSJunchao Zhang   }
9827fd2d3dbSJunchao Zhang #if defined(PETSC_HAVE_DEVICE)
983eb02082bSJunchao Zhang   else if (mtype == PETSC_MEMTYPE_DEVICE && !atomic) {
984*83df288dSJunchao Zhang     if      (op == MPI_REPLACE)               *UnpackAndOp = link->d_UnpackAndInsert;
985eb02082bSJunchao Zhang     else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->d_UnpackAndAdd;
986eb02082bSJunchao Zhang     else if (op == MPI_PROD)                  *UnpackAndOp = link->d_UnpackAndMult;
987eb02082bSJunchao Zhang     else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->d_UnpackAndMax;
988eb02082bSJunchao Zhang     else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->d_UnpackAndMin;
989eb02082bSJunchao Zhang     else if (op == MPI_LAND)                  *UnpackAndOp = link->d_UnpackAndLAND;
990eb02082bSJunchao Zhang     else if (op == MPI_BAND)                  *UnpackAndOp = link->d_UnpackAndBAND;
991eb02082bSJunchao Zhang     else if (op == MPI_LOR)                   *UnpackAndOp = link->d_UnpackAndLOR;
992eb02082bSJunchao Zhang     else if (op == MPI_BOR)                   *UnpackAndOp = link->d_UnpackAndBOR;
993eb02082bSJunchao Zhang     else if (op == MPI_LXOR)                  *UnpackAndOp = link->d_UnpackAndLXOR;
994eb02082bSJunchao Zhang     else if (op == MPI_BXOR)                  *UnpackAndOp = link->d_UnpackAndBXOR;
995eb02082bSJunchao Zhang     else if (op == MPI_MAXLOC)                *UnpackAndOp = link->d_UnpackAndMaxloc;
996eb02082bSJunchao Zhang     else if (op == MPI_MINLOC)                *UnpackAndOp = link->d_UnpackAndMinloc;
997eb02082bSJunchao Zhang   } else if (mtype == PETSC_MEMTYPE_DEVICE && atomic) {
998*83df288dSJunchao Zhang     if      (op == MPI_REPLACE)               *UnpackAndOp = link->da_UnpackAndInsert;
999eb02082bSJunchao Zhang     else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->da_UnpackAndAdd;
1000eb02082bSJunchao Zhang     else if (op == MPI_PROD)                  *UnpackAndOp = link->da_UnpackAndMult;
1001eb02082bSJunchao Zhang     else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->da_UnpackAndMax;
1002eb02082bSJunchao Zhang     else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->da_UnpackAndMin;
1003eb02082bSJunchao Zhang     else if (op == MPI_LAND)                  *UnpackAndOp = link->da_UnpackAndLAND;
1004eb02082bSJunchao Zhang     else if (op == MPI_BAND)                  *UnpackAndOp = link->da_UnpackAndBAND;
1005eb02082bSJunchao Zhang     else if (op == MPI_LOR)                   *UnpackAndOp = link->da_UnpackAndLOR;
1006eb02082bSJunchao Zhang     else if (op == MPI_BOR)                   *UnpackAndOp = link->da_UnpackAndBOR;
1007eb02082bSJunchao Zhang     else if (op == MPI_LXOR)                  *UnpackAndOp = link->da_UnpackAndLXOR;
1008eb02082bSJunchao Zhang     else if (op == MPI_BXOR)                  *UnpackAndOp = link->da_UnpackAndBXOR;
1009eb02082bSJunchao Zhang     else if (op == MPI_MAXLOC)                *UnpackAndOp = link->da_UnpackAndMaxloc;
1010eb02082bSJunchao Zhang     else if (op == MPI_MINLOC)                *UnpackAndOp = link->da_UnpackAndMinloc;
1011eb02082bSJunchao Zhang   }
1012eb02082bSJunchao Zhang #endif
101340e23c03SJunchao Zhang   PetscFunctionReturn(0);
101440e23c03SJunchao Zhang }
101540e23c03SJunchao Zhang 
1016fcc7397dSJunchao Zhang PetscErrorCode PetscSFLinkGetScatterAndOp(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**ScatterAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*))
101740e23c03SJunchao Zhang {
101840e23c03SJunchao Zhang   PetscFunctionBegin;
1019cd620004SJunchao Zhang   *ScatterAndOp = NULL;
1020eb02082bSJunchao Zhang   if (mtype == PETSC_MEMTYPE_HOST) {
1021*83df288dSJunchao Zhang     if      (op == MPI_REPLACE)               *ScatterAndOp = link->h_ScatterAndInsert;
1022cd620004SJunchao Zhang     else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->h_ScatterAndAdd;
1023cd620004SJunchao Zhang     else if (op == MPI_PROD)                  *ScatterAndOp = link->h_ScatterAndMult;
1024cd620004SJunchao Zhang     else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->h_ScatterAndMax;
1025cd620004SJunchao Zhang     else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->h_ScatterAndMin;
1026cd620004SJunchao Zhang     else if (op == MPI_LAND)                  *ScatterAndOp = link->h_ScatterAndLAND;
1027cd620004SJunchao Zhang     else if (op == MPI_BAND)                  *ScatterAndOp = link->h_ScatterAndBAND;
1028cd620004SJunchao Zhang     else if (op == MPI_LOR)                   *ScatterAndOp = link->h_ScatterAndLOR;
1029cd620004SJunchao Zhang     else if (op == MPI_BOR)                   *ScatterAndOp = link->h_ScatterAndBOR;
1030cd620004SJunchao Zhang     else if (op == MPI_LXOR)                  *ScatterAndOp = link->h_ScatterAndLXOR;
1031cd620004SJunchao Zhang     else if (op == MPI_BXOR)                  *ScatterAndOp = link->h_ScatterAndBXOR;
1032cd620004SJunchao Zhang     else if (op == MPI_MAXLOC)                *ScatterAndOp = link->h_ScatterAndMaxloc;
1033cd620004SJunchao Zhang     else if (op == MPI_MINLOC)                *ScatterAndOp = link->h_ScatterAndMinloc;
1034eb02082bSJunchao Zhang   }
10357fd2d3dbSJunchao Zhang #if defined(PETSC_HAVE_DEVICE)
1036eb02082bSJunchao Zhang   else if (mtype == PETSC_MEMTYPE_DEVICE && !atomic) {
1037*83df288dSJunchao Zhang     if      (op == MPI_REPLACE)               *ScatterAndOp = link->d_ScatterAndInsert;
1038cd620004SJunchao Zhang     else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->d_ScatterAndAdd;
1039cd620004SJunchao Zhang     else if (op == MPI_PROD)                  *ScatterAndOp = link->d_ScatterAndMult;
1040cd620004SJunchao Zhang     else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->d_ScatterAndMax;
1041cd620004SJunchao Zhang     else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->d_ScatterAndMin;
1042cd620004SJunchao Zhang     else if (op == MPI_LAND)                  *ScatterAndOp = link->d_ScatterAndLAND;
1043cd620004SJunchao Zhang     else if (op == MPI_BAND)                  *ScatterAndOp = link->d_ScatterAndBAND;
1044cd620004SJunchao Zhang     else if (op == MPI_LOR)                   *ScatterAndOp = link->d_ScatterAndLOR;
1045cd620004SJunchao Zhang     else if (op == MPI_BOR)                   *ScatterAndOp = link->d_ScatterAndBOR;
1046cd620004SJunchao Zhang     else if (op == MPI_LXOR)                  *ScatterAndOp = link->d_ScatterAndLXOR;
1047cd620004SJunchao Zhang     else if (op == MPI_BXOR)                  *ScatterAndOp = link->d_ScatterAndBXOR;
1048cd620004SJunchao Zhang     else if (op == MPI_MAXLOC)                *ScatterAndOp = link->d_ScatterAndMaxloc;
1049cd620004SJunchao Zhang     else if (op == MPI_MINLOC)                *ScatterAndOp = link->d_ScatterAndMinloc;
1050eb02082bSJunchao Zhang   } else if (mtype == PETSC_MEMTYPE_DEVICE && atomic) {
1051*83df288dSJunchao Zhang     if      (op == MPI_REPLACE)               *ScatterAndOp = link->da_ScatterAndInsert;
1052cd620004SJunchao Zhang     else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->da_ScatterAndAdd;
1053cd620004SJunchao Zhang     else if (op == MPI_PROD)                  *ScatterAndOp = link->da_ScatterAndMult;
1054cd620004SJunchao Zhang     else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->da_ScatterAndMax;
1055cd620004SJunchao Zhang     else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->da_ScatterAndMin;
1056cd620004SJunchao Zhang     else if (op == MPI_LAND)                  *ScatterAndOp = link->da_ScatterAndLAND;
1057cd620004SJunchao Zhang     else if (op == MPI_BAND)                  *ScatterAndOp = link->da_ScatterAndBAND;
1058cd620004SJunchao Zhang     else if (op == MPI_LOR)                   *ScatterAndOp = link->da_ScatterAndLOR;
1059cd620004SJunchao Zhang     else if (op == MPI_BOR)                   *ScatterAndOp = link->da_ScatterAndBOR;
1060cd620004SJunchao Zhang     else if (op == MPI_LXOR)                  *ScatterAndOp = link->da_ScatterAndLXOR;
1061cd620004SJunchao Zhang     else if (op == MPI_BXOR)                  *ScatterAndOp = link->da_ScatterAndBXOR;
1062cd620004SJunchao Zhang     else if (op == MPI_MAXLOC)                *ScatterAndOp = link->da_ScatterAndMaxloc;
1063cd620004SJunchao Zhang     else if (op == MPI_MINLOC)                *ScatterAndOp = link->da_ScatterAndMinloc;
1064eb02082bSJunchao Zhang   }
1065eb02082bSJunchao Zhang #endif
1066cd620004SJunchao Zhang   PetscFunctionReturn(0);
1067cd620004SJunchao Zhang }
1068cd620004SJunchao Zhang 
1069fcc7397dSJunchao Zhang PetscErrorCode PetscSFLinkGetFetchAndOp(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**FetchAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,void*))
1070cd620004SJunchao Zhang {
1071cd620004SJunchao Zhang   PetscFunctionBegin;
1072cd620004SJunchao Zhang   *FetchAndOp = NULL;
1073cd620004SJunchao Zhang   if (op != MPI_SUM && op != MPIU_SUM) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for MPI_Op in FetchAndOp");
1074cd620004SJunchao Zhang   if (mtype == PETSC_MEMTYPE_HOST) *FetchAndOp = link->h_FetchAndAdd;
10757fd2d3dbSJunchao Zhang #if defined(PETSC_HAVE_DEVICE)
1076cd620004SJunchao Zhang   else if (mtype == PETSC_MEMTYPE_DEVICE && !atomic) *FetchAndOp = link->d_FetchAndAdd;
1077cd620004SJunchao Zhang   else if (mtype == PETSC_MEMTYPE_DEVICE && atomic)  *FetchAndOp = link->da_FetchAndAdd;
1078cd620004SJunchao Zhang #endif
1079cd620004SJunchao Zhang   PetscFunctionReturn(0);
1080cd620004SJunchao Zhang }
1081cd620004SJunchao Zhang 
1082fcc7397dSJunchao Zhang PetscErrorCode PetscSFLinkGetFetchAndOpLocal(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**FetchAndOpLocal)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*))
1083cd620004SJunchao Zhang {
1084cd620004SJunchao Zhang   PetscFunctionBegin;
1085cd620004SJunchao Zhang   *FetchAndOpLocal = NULL;
1086cd620004SJunchao Zhang   if (op != MPI_SUM && op != MPIU_SUM) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for MPI_Op in FetchAndOp");
1087cd620004SJunchao Zhang   if (mtype == PETSC_MEMTYPE_HOST) *FetchAndOpLocal = link->h_FetchAndAddLocal;
10887fd2d3dbSJunchao Zhang #if defined(PETSC_HAVE_DEVICE)
1089cd620004SJunchao Zhang   else if (mtype == PETSC_MEMTYPE_DEVICE && !atomic) *FetchAndOpLocal = link->d_FetchAndAddLocal;
1090cd620004SJunchao Zhang   else if (mtype == PETSC_MEMTYPE_DEVICE && atomic)  *FetchAndOpLocal = link->da_FetchAndAddLocal;
1091cd620004SJunchao Zhang #endif
1092cd620004SJunchao Zhang   PetscFunctionReturn(0);
1093cd620004SJunchao Zhang }
1094cd620004SJunchao Zhang 
1095cd620004SJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkLogFlopsAfterUnpackRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,MPI_Op op)
1096cd620004SJunchao Zhang {
1097cd620004SJunchao Zhang   PetscErrorCode ierr;
1098cd620004SJunchao Zhang   PetscLogDouble flops;
1099cd620004SJunchao Zhang   PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
1100cd620004SJunchao Zhang 
1101cd620004SJunchao Zhang 
1102cd620004SJunchao Zhang   PetscFunctionBegin;
1103*83df288dSJunchao Zhang   if (op != MPI_REPLACE && link->basicunit == MPIU_SCALAR) { /* op is a reduction on PetscScalars */
1104cd620004SJunchao Zhang     flops = bas->rootbuflen[scope]*link->bs; /* # of roots in buffer x # of scalars in unit */
11057fd2d3dbSJunchao Zhang #if defined(PETSC_HAVE_DEVICE)
1106cd620004SJunchao Zhang     if (link->rootmtype == PETSC_MEMTYPE_DEVICE) {ierr = PetscLogGpuFlops(flops);CHKERRQ(ierr);} else
1107cd620004SJunchao Zhang #endif
1108cd620004SJunchao Zhang     {ierr = PetscLogFlops(flops);CHKERRQ(ierr);}
1109cd620004SJunchao Zhang   }
1110cd620004SJunchao Zhang   PetscFunctionReturn(0);
1111cd620004SJunchao Zhang }
1112cd620004SJunchao Zhang 
1113cd620004SJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkLogFlopsAfterUnpackLeafData(PetscSF sf,PetscSFLink link,PetscSFScope scope,MPI_Op op)
1114cd620004SJunchao Zhang {
1115cd620004SJunchao Zhang   PetscLogDouble flops;
1116cd620004SJunchao Zhang   PetscErrorCode ierr;
1117cd620004SJunchao Zhang 
1118cd620004SJunchao Zhang   PetscFunctionBegin;
1119*83df288dSJunchao Zhang   if (op != MPI_REPLACE && link->basicunit == MPIU_SCALAR) { /* op is a reduction on PetscScalars */
1120cd620004SJunchao Zhang     flops = sf->leafbuflen[scope]*link->bs; /* # of roots in buffer x # of scalars in unit */
11217fd2d3dbSJunchao Zhang #if defined(PETSC_HAVE_DEVICE)
1122cd620004SJunchao Zhang     if (link->leafmtype == PETSC_MEMTYPE_DEVICE) {ierr = PetscLogGpuFlops(flops);CHKERRQ(ierr);} else
1123cd620004SJunchao Zhang #endif
1124cd620004SJunchao Zhang     {ierr = PetscLogFlops(flops);CHKERRQ(ierr);}
1125cd620004SJunchao Zhang   }
1126cd620004SJunchao Zhang   PetscFunctionReturn(0);
1127cd620004SJunchao Zhang }
1128cd620004SJunchao Zhang 
1129cd620004SJunchao Zhang /* When SF could not find a proper UnpackAndOp() from link, it falls back to MPI_Reduce_local.
1130cd620004SJunchao Zhang   Input Arguments:
1131cd620004SJunchao Zhang   +sf      - The StarForest
1132cd620004SJunchao Zhang   .link    - The link
1133cd620004SJunchao Zhang   .count   - Number of entries to unpack
1134cd620004SJunchao Zhang   .start   - The first index, significent when indices=NULL
1135cd620004SJunchao Zhang   .indices - Indices of entries in <data>. If NULL, it means indices are contiguous and the first is given in <start>
1136cd620004SJunchao Zhang   .buf     - A contiguous buffer to unpack from
1137cd620004SJunchao Zhang   -op      - Operation after unpack
1138cd620004SJunchao Zhang 
1139cd620004SJunchao Zhang   Output Arguments:
1140cd620004SJunchao Zhang   .data    - The data to unpack to
1141cd620004SJunchao Zhang */
1142cd620004SJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkUnpackDataWithMPIReduceLocal(PetscSF sf,PetscSFLink link,PetscInt count,PetscInt start,const PetscInt *indices,void *data,const void *buf,MPI_Op op)
1143cd620004SJunchao Zhang {
1144cd620004SJunchao Zhang   PetscFunctionBegin;
1145cd620004SJunchao Zhang #if defined(PETSC_HAVE_MPI_REDUCE_LOCAL)
1146cd620004SJunchao Zhang   {
1147cd620004SJunchao Zhang     PetscErrorCode ierr;
1148cd620004SJunchao Zhang     PetscInt       i;
1149cd620004SJunchao Zhang     PetscMPIInt    n;
1150cd620004SJunchao Zhang     if (indices) {
1151cd620004SJunchao Zhang       /* Note we use link->unit instead of link->basicunit. When op can be mapped to MPI_SUM etc, it operates on
1152cd620004SJunchao Zhang          basic units of a root/leaf element-wisely. Otherwise, it is meant to operate on a whole root/leaf.
1153cd620004SJunchao Zhang       */
1154ffc4695bSBarry Smith       for (i=0; i<count; i++) {ierr = MPI_Reduce_local((const char*)buf+i*link->unitbytes,(char*)data+indices[i]*link->unitbytes,1,link->unit,op);CHKERRMPI(ierr);}
1155cd620004SJunchao Zhang     } else {
1156cd620004SJunchao Zhang       ierr = PetscMPIIntCast(count,&n);CHKERRQ(ierr);
1157ffc4695bSBarry Smith       ierr = MPI_Reduce_local(buf,(char*)data+start*link->unitbytes,n,link->unit,op);CHKERRMPI(ierr);
1158cd620004SJunchao Zhang     }
1159cd620004SJunchao Zhang   }
1160b458e8f1SJose E. Roman   PetscFunctionReturn(0);
1161cd620004SJunchao Zhang #else
1162cd620004SJunchao Zhang   SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No unpacking reduction operation for this MPI_Op");
1163cd620004SJunchao Zhang #endif
1164cd620004SJunchao Zhang }
1165cd620004SJunchao Zhang 
1166fcc7397dSJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkScatterDataWithMPIReduceLocal(PetscSF sf,PetscSFLink link,PetscInt count,PetscInt srcStart,const PetscInt *srcIdx,const void *src,PetscInt dstStart,const PetscInt *dstIdx,void *dst,MPI_Op op)
1167cd620004SJunchao Zhang {
1168cd620004SJunchao Zhang   PetscFunctionBegin;
1169cd620004SJunchao Zhang #if defined(PETSC_HAVE_MPI_REDUCE_LOCAL)
1170cd620004SJunchao Zhang   {
1171cd620004SJunchao Zhang     PetscErrorCode ierr;
1172cd620004SJunchao Zhang     PetscInt       i,disp;
1173fcc7397dSJunchao Zhang     if (!srcIdx) {
1174fcc7397dSJunchao Zhang       ierr = PetscSFLinkUnpackDataWithMPIReduceLocal(sf,link,count,dstStart,dstIdx,dst,(const char*)src+srcStart*link->unitbytes,op);CHKERRQ(ierr);
1175fcc7397dSJunchao Zhang     } else {
1176cd620004SJunchao Zhang       for (i=0; i<count; i++) {
1177fcc7397dSJunchao Zhang         disp = dstIdx? dstIdx[i] : dstStart + i;
1178ffc4695bSBarry Smith         ierr = MPI_Reduce_local((const char*)src+srcIdx[i]*link->unitbytes,(char*)dst+disp*link->unitbytes,1,link->unit,op);CHKERRMPI(ierr);
1179fcc7397dSJunchao Zhang       }
1180cd620004SJunchao Zhang     }
1181cd620004SJunchao Zhang   }
1182b458e8f1SJose E. Roman   PetscFunctionReturn(0);
1183cd620004SJunchao Zhang #else
1184cd620004SJunchao Zhang   SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No unpacking reduction operation for this MPI_Op");
1185cd620004SJunchao Zhang #endif
1186cd620004SJunchao Zhang }
1187cd620004SJunchao Zhang 
1188cd620004SJunchao Zhang /*=============================================================================
1189cd620004SJunchao Zhang               Pack/Unpack/Fetch/Scatter routines
1190cd620004SJunchao Zhang  ============================================================================*/
1191cd620004SJunchao Zhang 
1192cd620004SJunchao Zhang /* Pack rootdata to rootbuf
1193cd620004SJunchao Zhang   Input Arguments:
1194cd620004SJunchao Zhang   + sf       - The SF this packing works on.
1195cd620004SJunchao Zhang   . link     - It gives the memtype of the roots and also provides root buffer.
1196cd620004SJunchao Zhang   . scope    - PETSCSF_LOCAL or PETSCSF_REMOTE. Note SF has the ability to do local and remote communications separately.
1197cd620004SJunchao Zhang   - rootdata - Where to read the roots.
1198cd620004SJunchao Zhang 
1199cd620004SJunchao Zhang   Notes:
1200cd620004SJunchao Zhang   When rootdata can be directly used as root buffer, the routine is almost a no-op. After the call, root data is
1201cd620004SJunchao Zhang   in a place where the underlying MPI is ready can access (use_gpu_aware_mpi or not)
1202cd620004SJunchao Zhang  */
1203cd620004SJunchao Zhang PetscErrorCode PetscSFLinkPackRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,const void *rootdata)
1204cd620004SJunchao Zhang {
1205cd620004SJunchao Zhang   PetscErrorCode   ierr;
1206cd620004SJunchao Zhang   PetscSF_Basic    *bas = (PetscSF_Basic*)sf->data;
1207cd620004SJunchao Zhang   const PetscInt   *rootindices = NULL;
1208cd620004SJunchao Zhang   PetscInt         count,start;
1209fcc7397dSJunchao Zhang   PetscErrorCode   (*Pack)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*) = NULL;
1210cd620004SJunchao Zhang   PetscMemType     rootmtype = link->rootmtype;
1211fcc7397dSJunchao Zhang   PetscSFPackOpt   opt = NULL;
1212fcc7397dSJunchao Zhang 
1213cd620004SJunchao Zhang   PetscFunctionBegin;
1214cd620004SJunchao Zhang   ierr = PetscLogEventBegin(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr);
1215cd620004SJunchao Zhang   if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkSyncDeviceBeforePackData(sf,link);CHKERRQ(ierr);}
1216cd620004SJunchao Zhang   if (!link->rootdirect[scope] && bas->rootbuflen[scope]) { /* If rootdata works directly as rootbuf, skip packing */
1217fcc7397dSJunchao Zhang     ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,scope,&count,&start,&opt,&rootindices);CHKERRQ(ierr);
1218cd620004SJunchao Zhang     ierr = PetscSFLinkGetPack(link,rootmtype,&Pack);CHKERRQ(ierr);
1219fcc7397dSJunchao Zhang     ierr = (*Pack)(link,count,start,opt,rootindices,rootdata,link->rootbuf[scope][rootmtype]);CHKERRQ(ierr);
1220cd620004SJunchao Zhang   }
1221cd620004SJunchao Zhang   if (scope == PETSCSF_REMOTE) {
1222cd620004SJunchao Zhang     ierr = PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_TRUE/*device2host*/);CHKERRQ(ierr);
1223cd620004SJunchao Zhang     ierr = PetscSFLinkSyncStreamAfterPackRootData(sf,link);CHKERRQ(ierr);
1224cd620004SJunchao Zhang   }
1225cd620004SJunchao Zhang   ierr = PetscLogEventEnd(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr);
1226cd620004SJunchao Zhang   PetscFunctionReturn(0);
1227cd620004SJunchao Zhang }
1228cd620004SJunchao Zhang 
1229cd620004SJunchao Zhang /* Pack leafdata to leafbuf */
1230cd620004SJunchao Zhang PetscErrorCode PetscSFLinkPackLeafData(PetscSF sf,PetscSFLink link,PetscSFScope scope,const void *leafdata)
1231cd620004SJunchao Zhang {
1232cd620004SJunchao Zhang   PetscErrorCode   ierr;
1233cd620004SJunchao Zhang   const PetscInt   *leafindices = NULL;
1234cd620004SJunchao Zhang   PetscInt         count,start;
1235fcc7397dSJunchao Zhang   PetscErrorCode   (*Pack)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*) = NULL;
1236cd620004SJunchao Zhang   PetscMemType     leafmtype = link->leafmtype;
1237fcc7397dSJunchao Zhang   PetscSFPackOpt   opt = NULL;
1238cd620004SJunchao Zhang 
1239cd620004SJunchao Zhang   PetscFunctionBegin;
1240cd620004SJunchao Zhang   ierr = PetscLogEventBegin(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr);
1241cd620004SJunchao Zhang   if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkSyncDeviceBeforePackData(sf,link);CHKERRQ(ierr);}
1242cd620004SJunchao Zhang   if (!link->leafdirect[scope] && sf->leafbuflen[scope]) { /* If leafdata works directly as rootbuf, skip packing */
1243fcc7397dSJunchao Zhang     ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,scope,&count,&start,&opt,&leafindices);CHKERRQ(ierr);
1244cd620004SJunchao Zhang     ierr = PetscSFLinkGetPack(link,leafmtype,&Pack);CHKERRQ(ierr);
1245fcc7397dSJunchao Zhang     ierr = (*Pack)(link,count,start,opt,leafindices,leafdata,link->leafbuf[scope][leafmtype]);CHKERRQ(ierr);
1246cd620004SJunchao Zhang   }
1247cd620004SJunchao Zhang   if (scope == PETSCSF_REMOTE) {
1248cd620004SJunchao Zhang     ierr = PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_TRUE/*device2host*/);CHKERRQ(ierr);
1249cd620004SJunchao Zhang     ierr = PetscSFLinkSyncStreamAfterPackLeafData(sf,link);CHKERRQ(ierr);
1250cd620004SJunchao Zhang   }
1251cd620004SJunchao Zhang   ierr = PetscLogEventEnd(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr);
1252cd620004SJunchao Zhang   PetscFunctionReturn(0);
1253cd620004SJunchao Zhang }
1254cd620004SJunchao Zhang 
1255cd620004SJunchao Zhang /* Unpack rootbuf to rootdata */
1256cd620004SJunchao Zhang PetscErrorCode PetscSFLinkUnpackRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,void *rootdata,MPI_Op op)
1257cd620004SJunchao Zhang {
1258cd620004SJunchao Zhang   PetscErrorCode   ierr;
1259cd620004SJunchao Zhang   const PetscInt   *rootindices = NULL;
1260cd620004SJunchao Zhang   PetscInt         count,start;
1261cd620004SJunchao Zhang   PetscSF_Basic    *bas = (PetscSF_Basic*)sf->data;
1262fcc7397dSJunchao Zhang   PetscErrorCode   (*UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*) = NULL;
1263cd620004SJunchao Zhang   PetscMemType     rootmtype = link->rootmtype;
1264fcc7397dSJunchao Zhang   PetscSFPackOpt   opt = NULL;
1265cd620004SJunchao Zhang 
1266cd620004SJunchao Zhang   PetscFunctionBegin;
1267cd620004SJunchao Zhang   ierr = PetscLogEventBegin(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr);
1268cd620004SJunchao Zhang   if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_FALSE);CHKERRQ(ierr);}
1269cd620004SJunchao Zhang   if (!link->rootdirect[scope] && bas->rootbuflen[scope]) { /* If rootdata works directly as rootbuf, skip unpacking */
1270cd620004SJunchao Zhang     ierr = PetscSFLinkGetUnpackAndOp(link,rootmtype,op,bas->rootdups[scope],&UnpackAndOp);CHKERRQ(ierr);
1271cd620004SJunchao Zhang     if (UnpackAndOp) {
1272fcc7397dSJunchao Zhang       ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,scope,&count,&start,&opt,&rootindices);CHKERRQ(ierr);
1273fcc7397dSJunchao Zhang       ierr = (*UnpackAndOp)(link,count,start,opt,rootindices,rootdata,link->rootbuf[scope][rootmtype]);CHKERRQ(ierr);
1274cd620004SJunchao Zhang     } else {
1275fcc7397dSJunchao Zhang       ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,scope,&count,&start,&opt,&rootindices);CHKERRQ(ierr);
1276cd620004SJunchao Zhang       ierr = PetscSFLinkUnpackDataWithMPIReduceLocal(sf,link,count,start,rootindices,rootdata,link->rootbuf[scope][rootmtype],op);CHKERRQ(ierr);
1277cd620004SJunchao Zhang     }
1278cd620004SJunchao Zhang   }
1279cd620004SJunchao Zhang   if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkSyncStreamAfterUnpackRootData(sf,link);CHKERRQ(ierr);}
1280cd620004SJunchao Zhang   ierr = PetscSFLinkLogFlopsAfterUnpackRootData(sf,link,scope,op);CHKERRQ(ierr);
1281cd620004SJunchao Zhang   ierr = PetscLogEventEnd(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr);
1282cd620004SJunchao Zhang   PetscFunctionReturn(0);
1283cd620004SJunchao Zhang }
1284cd620004SJunchao Zhang 
1285cd620004SJunchao Zhang /* Unpack leafbuf to leafdata */
1286cd620004SJunchao Zhang PetscErrorCode PetscSFLinkUnpackLeafData(PetscSF sf,PetscSFLink link,PetscSFScope scope,void *leafdata,MPI_Op op)
1287cd620004SJunchao Zhang {
1288cd620004SJunchao Zhang   PetscErrorCode   ierr;
1289cd620004SJunchao Zhang   const PetscInt   *leafindices = NULL;
1290cd620004SJunchao Zhang   PetscInt         count,start;
1291fcc7397dSJunchao Zhang   PetscErrorCode   (*UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*) = NULL;
1292cd620004SJunchao Zhang   PetscMemType     leafmtype = link->leafmtype;
1293fcc7397dSJunchao Zhang   PetscSFPackOpt   opt = NULL;
1294cd620004SJunchao Zhang 
1295cd620004SJunchao Zhang   PetscFunctionBegin;
1296cd620004SJunchao Zhang   ierr = PetscLogEventBegin(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr);
1297cd620004SJunchao Zhang   if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_FALSE);CHKERRQ(ierr);}
1298cd620004SJunchao Zhang   if (!link->leafdirect[scope] && sf->leafbuflen[scope]) { /* If leafdata works directly as rootbuf, skip unpacking */
1299cd620004SJunchao Zhang     ierr = PetscSFLinkGetUnpackAndOp(link,leafmtype,op,sf->leafdups[scope],&UnpackAndOp);CHKERRQ(ierr);
1300cd620004SJunchao Zhang     if (UnpackAndOp) {
1301fcc7397dSJunchao Zhang       ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,scope,&count,&start,&opt,&leafindices);CHKERRQ(ierr);
1302fcc7397dSJunchao Zhang       ierr = (*UnpackAndOp)(link,count,start,opt,leafindices,leafdata,link->leafbuf[scope][leafmtype]);CHKERRQ(ierr);
1303cd620004SJunchao Zhang     } else {
1304fcc7397dSJunchao Zhang       ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,scope,&count,&start,&opt,&leafindices);CHKERRQ(ierr);
1305cd620004SJunchao Zhang       ierr = PetscSFLinkUnpackDataWithMPIReduceLocal(sf,link,count,start,leafindices,leafdata,link->leafbuf[scope][leafmtype],op);CHKERRQ(ierr);
1306cd620004SJunchao Zhang     }
1307cd620004SJunchao Zhang   }
1308cd620004SJunchao Zhang   if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkSyncStreamAfterUnpackLeafData(sf,link);CHKERRQ(ierr);}
1309cd620004SJunchao Zhang   ierr = PetscSFLinkLogFlopsAfterUnpackLeafData(sf,link,scope,op);CHKERRQ(ierr);
1310cd620004SJunchao Zhang   ierr = PetscLogEventEnd(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr);
1311cd620004SJunchao Zhang   PetscFunctionReturn(0);
1312cd620004SJunchao Zhang }
1313cd620004SJunchao Zhang 
1314cd620004SJunchao Zhang /* FetchAndOp rootdata with rootbuf */
1315cd620004SJunchao Zhang PetscErrorCode PetscSFLinkFetchRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,void *rootdata,MPI_Op op)
1316cd620004SJunchao Zhang {
1317cd620004SJunchao Zhang   PetscErrorCode     ierr;
1318cd620004SJunchao Zhang   const PetscInt     *rootindices = NULL;
1319cd620004SJunchao Zhang   PetscInt           count,start;
1320cd620004SJunchao Zhang   PetscSF_Basic      *bas = (PetscSF_Basic*)sf->data;
1321fcc7397dSJunchao Zhang   PetscErrorCode     (*FetchAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,void*) = NULL;
1322cd620004SJunchao Zhang   PetscMemType       rootmtype = link->rootmtype;
1323fcc7397dSJunchao Zhang   PetscSFPackOpt     opt = NULL;
1324cd620004SJunchao Zhang 
1325cd620004SJunchao Zhang   PetscFunctionBegin;
1326cd620004SJunchao Zhang   ierr = PetscLogEventBegin(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr);
1327cd620004SJunchao Zhang   if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_FALSE);CHKERRQ(ierr);}
1328cd620004SJunchao Zhang   if (bas->rootbuflen[scope]) {
1329cd620004SJunchao Zhang     /* Do FetchAndOp on rootdata with rootbuf */
1330cd620004SJunchao Zhang     ierr = PetscSFLinkGetFetchAndOp(link,rootmtype,op,bas->rootdups[scope],&FetchAndOp);CHKERRQ(ierr);
1331fcc7397dSJunchao Zhang     ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,scope,&count,&start,&opt,&rootindices);CHKERRQ(ierr);
1332fcc7397dSJunchao Zhang     ierr = (*FetchAndOp)(link,count,start,opt,rootindices,rootdata,link->rootbuf[scope][rootmtype]);CHKERRQ(ierr);
1333cd620004SJunchao Zhang   }
1334cd620004SJunchao Zhang   if (scope == PETSCSF_REMOTE) {
1335cd620004SJunchao Zhang     ierr = PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_TRUE);CHKERRQ(ierr);
1336cd620004SJunchao Zhang     ierr = PetscSFLinkSyncStreamAfterUnpackRootData(sf,link);CHKERRQ(ierr);
1337cd620004SJunchao Zhang   }
1338cd620004SJunchao Zhang   ierr = PetscSFLinkLogFlopsAfterUnpackRootData(sf,link,scope,op);CHKERRQ(ierr);
1339cd620004SJunchao Zhang   ierr = PetscLogEventEnd(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr);
1340cd620004SJunchao Zhang   PetscFunctionReturn(0);
1341cd620004SJunchao Zhang }
1342cd620004SJunchao Zhang 
1343cd620004SJunchao Zhang /* Bcast rootdata to leafdata locally (i.e., only for local communication - PETSCSF_LOCAL) */
1344cd620004SJunchao Zhang PetscErrorCode PetscSFLinkBcastAndOpLocal(PetscSF sf,PetscSFLink link,const void *rootdata,void *leafdata,MPI_Op op)
1345cd620004SJunchao Zhang {
1346cd620004SJunchao Zhang   PetscErrorCode       ierr;
1347cd620004SJunchao Zhang   const PetscInt       *rootindices = NULL,*leafindices = NULL;
1348cd620004SJunchao Zhang   PetscInt             count,rootstart,leafstart;
1349cd620004SJunchao Zhang   PetscSF_Basic        *bas = (PetscSF_Basic*)sf->data;
1350fcc7397dSJunchao Zhang   PetscErrorCode       (*ScatterAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*) = NULL;
1351cd620004SJunchao Zhang   const PetscMemType   rootmtype = link->rootmtype,leafmtype = link->leafmtype;
1352fcc7397dSJunchao Zhang   PetscSFPackOpt       leafopt = NULL,rootopt = NULL;
1353cd620004SJunchao Zhang 
1354cd620004SJunchao Zhang   PetscFunctionBegin;
1355cd620004SJunchao Zhang   if (!bas->rootbuflen[PETSCSF_LOCAL]) PetscFunctionReturn(0);
1356cd620004SJunchao Zhang   if (rootmtype != leafmtype) { /* Uncommon case */
1357cd620004SJunchao Zhang      /* The local communication has to go through pack and unpack */
1358cd620004SJunchao Zhang     ierr = PetscSFLinkPackRootData(sf,link,PETSCSF_LOCAL,rootdata);CHKERRQ(ierr);
135920c24465SJunchao Zhang     ierr = (*link->Memcpy)(link,leafmtype,link->leafbuf[PETSCSF_LOCAL][leafmtype],rootmtype,link->rootbuf[PETSCSF_LOCAL][rootmtype],sf->leafbuflen[PETSCSF_LOCAL]*link->unitbytes);CHKERRQ(ierr);
1360cd620004SJunchao Zhang     ierr = PetscSFLinkUnpackLeafData(sf,link,PETSCSF_LOCAL,leafdata,op);CHKERRQ(ierr);
1361cd620004SJunchao Zhang   } else {
1362cd620004SJunchao Zhang     ierr = PetscSFLinkGetScatterAndOp(link,leafmtype,op,sf->leafdups[PETSCSF_LOCAL],&ScatterAndOp);CHKERRQ(ierr);
1363cd620004SJunchao Zhang     if (ScatterAndOp) {
1364fcc7397dSJunchao Zhang       ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,PETSCSF_LOCAL,&count,&rootstart,&rootopt,&rootindices);CHKERRQ(ierr);
1365fcc7397dSJunchao Zhang       ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,PETSCSF_LOCAL,&count,&leafstart,&leafopt,&leafindices);CHKERRQ(ierr);
1366fcc7397dSJunchao Zhang       ierr = (*ScatterAndOp)(link,count,rootstart,rootopt,rootindices,rootdata,leafstart,leafopt,leafindices,leafdata);CHKERRQ(ierr);
1367cd620004SJunchao Zhang     } else {
1368fcc7397dSJunchao Zhang       ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,PETSCSF_LOCAL,&count,&rootstart,&rootopt,&rootindices);CHKERRQ(ierr);
1369fcc7397dSJunchao Zhang       ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,PETSCSF_LOCAL,&count,&leafstart,&leafopt,&leafindices);CHKERRQ(ierr);
1370fcc7397dSJunchao Zhang       ierr = PetscSFLinkScatterDataWithMPIReduceLocal(sf,link,count,rootstart,rootindices,rootdata,leafstart,leafindices,leafdata,op);CHKERRQ(ierr);
1371cd620004SJunchao Zhang     }
1372cd620004SJunchao Zhang   }
1373cd620004SJunchao Zhang   PetscFunctionReturn(0);
1374cd620004SJunchao Zhang }
1375cd620004SJunchao Zhang 
1376cd620004SJunchao Zhang /* Reduce leafdata to rootdata locally */
1377cd620004SJunchao Zhang PetscErrorCode PetscSFLinkReduceLocal(PetscSF sf,PetscSFLink link,const void *leafdata,void *rootdata,MPI_Op op)
1378cd620004SJunchao Zhang {
1379cd620004SJunchao Zhang   PetscErrorCode       ierr;
1380cd620004SJunchao Zhang   const PetscInt       *rootindices = NULL,*leafindices = NULL;
1381cd620004SJunchao Zhang   PetscInt             count,rootstart,leafstart;
1382cd620004SJunchao Zhang   PetscSF_Basic        *bas = (PetscSF_Basic*)sf->data;
1383fcc7397dSJunchao Zhang   PetscErrorCode       (*ScatterAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*) = NULL;
1384cd620004SJunchao Zhang   const PetscMemType   rootmtype = link->rootmtype,leafmtype = link->leafmtype;
1385fcc7397dSJunchao Zhang   PetscSFPackOpt       leafopt = NULL,rootopt = NULL;
1386cd620004SJunchao Zhang 
1387cd620004SJunchao Zhang   PetscFunctionBegin;
1388cd620004SJunchao Zhang   if (!sf->leafbuflen[PETSCSF_LOCAL]) PetscFunctionReturn(0);
1389cd620004SJunchao Zhang   if (rootmtype != leafmtype) {
1390cd620004SJunchao Zhang     /* The local communication has to go through pack and unpack */
1391cd620004SJunchao Zhang     ierr = PetscSFLinkPackLeafData(sf,link,PETSCSF_LOCAL,leafdata);CHKERRQ(ierr);
139220c24465SJunchao Zhang     ierr = (*link->Memcpy)(link,rootmtype,link->rootbuf[PETSCSF_LOCAL][rootmtype],leafmtype,link->leafbuf[PETSCSF_LOCAL][leafmtype],bas->rootbuflen[PETSCSF_LOCAL]*link->unitbytes);CHKERRQ(ierr);
1393cd620004SJunchao Zhang     ierr = PetscSFLinkUnpackRootData(sf,link,PETSCSF_LOCAL,rootdata,op);CHKERRQ(ierr);
1394cd620004SJunchao Zhang   } else {
1395cd620004SJunchao Zhang     ierr = PetscSFLinkGetScatterAndOp(link,rootmtype,op,bas->rootdups[PETSCSF_LOCAL],&ScatterAndOp);CHKERRQ(ierr);
1396cd620004SJunchao Zhang     if (ScatterAndOp) {
1397fcc7397dSJunchao Zhang       ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,PETSCSF_LOCAL,&count,&rootstart,&rootopt,&rootindices);CHKERRQ(ierr);
1398fcc7397dSJunchao Zhang       ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,PETSCSF_LOCAL,&count,&leafstart,&leafopt,&leafindices);CHKERRQ(ierr);
1399fcc7397dSJunchao Zhang       ierr = (*ScatterAndOp)(link,count,leafstart,leafopt,leafindices,leafdata,rootstart,rootopt,rootindices,rootdata);CHKERRQ(ierr);
1400cd620004SJunchao Zhang     } else {
1401fcc7397dSJunchao Zhang       ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,PETSCSF_LOCAL,&count,&rootstart,&rootopt,&rootindices);CHKERRQ(ierr);
1402fcc7397dSJunchao Zhang       ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,PETSCSF_LOCAL,&count,&leafstart,&leafopt,&leafindices);CHKERRQ(ierr);
1403fcc7397dSJunchao Zhang       ierr = PetscSFLinkScatterDataWithMPIReduceLocal(sf,link,count,leafstart,leafindices,leafdata,rootstart,rootindices,rootdata,op);CHKERRQ(ierr);
1404cd620004SJunchao Zhang     }
1405cd620004SJunchao Zhang   }
1406cd620004SJunchao Zhang   PetscFunctionReturn(0);
1407cd620004SJunchao Zhang }
1408cd620004SJunchao Zhang 
1409cd620004SJunchao Zhang /* Fetch rootdata to leafdata and leafupdate locally */
1410cd620004SJunchao Zhang PetscErrorCode PetscSFLinkFetchAndOpLocal(PetscSF sf,PetscSFLink link,void *rootdata,const void *leafdata,void *leafupdate,MPI_Op op)
1411cd620004SJunchao Zhang {
1412cd620004SJunchao Zhang   PetscErrorCode       ierr;
1413cd620004SJunchao Zhang   const PetscInt       *rootindices = NULL,*leafindices = NULL;
1414cd620004SJunchao Zhang   PetscInt             count,rootstart,leafstart;
1415cd620004SJunchao Zhang   PetscSF_Basic        *bas = (PetscSF_Basic*)sf->data;
1416fcc7397dSJunchao Zhang   PetscErrorCode       (*FetchAndOpLocal)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*) = NULL;
1417cd620004SJunchao Zhang   const PetscMemType   rootmtype = link->rootmtype,leafmtype = link->leafmtype;
1418fcc7397dSJunchao Zhang   PetscSFPackOpt       leafopt = NULL,rootopt = NULL;
1419cd620004SJunchao Zhang 
1420cd620004SJunchao Zhang   PetscFunctionBegin;
1421cd620004SJunchao Zhang   if (!bas->rootbuflen[PETSCSF_LOCAL]) PetscFunctionReturn(0);
1422cd620004SJunchao Zhang   if (rootmtype != leafmtype) {
1423cd620004SJunchao Zhang    /* The local communication has to go through pack and unpack */
1424cd620004SJunchao Zhang     SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Doing PetscSFFetchAndOp with rootdata and leafdata on opposite side of CPU and GPU");
1425cd620004SJunchao Zhang   } else {
1426fcc7397dSJunchao Zhang     ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,PETSCSF_LOCAL,&count,&rootstart,&rootopt,&rootindices);CHKERRQ(ierr);
1427fcc7397dSJunchao Zhang     ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,PETSCSF_LOCAL,&count,&leafstart,&leafopt,&leafindices);CHKERRQ(ierr);
1428cd620004SJunchao Zhang     ierr = PetscSFLinkGetFetchAndOpLocal(link,rootmtype,op,bas->rootdups[PETSCSF_LOCAL],&FetchAndOpLocal);CHKERRQ(ierr);
1429fcc7397dSJunchao Zhang     ierr = (*FetchAndOpLocal)(link,count,rootstart,rootopt,rootindices,rootdata,leafstart,leafopt,leafindices,leafdata,leafupdate);CHKERRQ(ierr);
1430cd620004SJunchao Zhang   }
143140e23c03SJunchao Zhang   PetscFunctionReturn(0);
143240e23c03SJunchao Zhang }
143340e23c03SJunchao Zhang 
143440e23c03SJunchao Zhang /*
1435cd620004SJunchao Zhang   Create per-rank pack/unpack optimizations based on indice patterns
143640e23c03SJunchao Zhang 
143740e23c03SJunchao Zhang    Input Parameters:
1438fcc7397dSJunchao Zhang   +  n       - Number of destination ranks
1439eb02082bSJunchao Zhang   .  offset  - [n+1] For the i-th rank, its associated indices are idx[offset[i], offset[i+1]). offset[0] needs not to be 0.
1440b23bfdefSJunchao Zhang   -  idx     - [*]   Array storing indices
144140e23c03SJunchao Zhang 
144240e23c03SJunchao Zhang    Output Parameters:
1443cd620004SJunchao Zhang   +  opt     - Pack optimizations. NULL if no optimizations.
144440e23c03SJunchao Zhang */
1445cd620004SJunchao Zhang PetscErrorCode PetscSFCreatePackOpt(PetscInt n,const PetscInt *offset,const PetscInt *idx,PetscSFPackOpt *out)
144640e23c03SJunchao Zhang {
144740e23c03SJunchao Zhang   PetscErrorCode ierr;
1448fcc7397dSJunchao Zhang   PetscInt       r,p,start,i,j,k,dx,dy,dz,dydz,m,X,Y;
1449fcc7397dSJunchao Zhang   PetscBool      optimizable = PETSC_TRUE;
145040e23c03SJunchao Zhang   PetscSFPackOpt opt;
145140e23c03SJunchao Zhang 
145240e23c03SJunchao Zhang   PetscFunctionBegin;
1453fcc7397dSJunchao Zhang   ierr   = PetscMalloc1(1,&opt);CHKERRQ(ierr);
1454fcc7397dSJunchao Zhang   ierr   = PetscMalloc1(7*n+2,&opt->array);CHKERRQ(ierr);
1455fcc7397dSJunchao Zhang   opt->n      = opt->array[0] = n;
1456fcc7397dSJunchao Zhang   opt->offset = opt->array + 1;
1457fcc7397dSJunchao Zhang   opt->start  = opt->array + n   + 2;
1458fcc7397dSJunchao Zhang   opt->dx     = opt->array + 2*n + 2;
1459fcc7397dSJunchao Zhang   opt->dy     = opt->array + 3*n + 2;
1460fcc7397dSJunchao Zhang   opt->dz     = opt->array + 4*n + 2;
1461fcc7397dSJunchao Zhang   opt->X      = opt->array + 5*n + 2;
1462fcc7397dSJunchao Zhang   opt->Y      = opt->array + 6*n + 2;
1463fcc7397dSJunchao Zhang 
1464fcc7397dSJunchao Zhang   for (r=0; r<n; r++) { /* For each destination rank */
1465fcc7397dSJunchao Zhang     m     = offset[r+1] - offset[r]; /* Total number of indices for this rank. We want to see if m can be factored into dx*dy*dz */
1466fcc7397dSJunchao Zhang     p     = offset[r];
1467fcc7397dSJunchao Zhang     start = idx[p]; /* First index for this rank */
1468fcc7397dSJunchao Zhang     p++;
1469fcc7397dSJunchao Zhang 
1470fcc7397dSJunchao Zhang     /* Search in X dimension */
1471fcc7397dSJunchao Zhang     for (dx=1; dx<m; dx++,p++) {
1472fcc7397dSJunchao Zhang       if (start+dx != idx[p]) break;
1473b23bfdefSJunchao Zhang     }
1474b23bfdefSJunchao Zhang 
1475fcc7397dSJunchao Zhang     dydz = m/dx;
1476fcc7397dSJunchao Zhang     X    = dydz > 1 ? (idx[p]-start) : dx;
1477fcc7397dSJunchao Zhang     /* Not optimizable if m is not a multiple of dx, or some unrecognized pattern is found */
1478fcc7397dSJunchao Zhang     if (m%dx || X <= 0) {optimizable = PETSC_FALSE; goto finish;}
1479fcc7397dSJunchao Zhang     for (dy=1; dy<dydz; dy++) { /* Search in Y dimension */
1480fcc7397dSJunchao Zhang       for (i=0; i<dx; i++,p++) {
1481fcc7397dSJunchao Zhang         if (start+X*dy+i != idx[p]) {
1482fcc7397dSJunchao Zhang           if (i) {optimizable = PETSC_FALSE; goto finish;} /* The pattern is violated in the middle of an x-walk */
1483fcc7397dSJunchao Zhang           else goto Z_dimension;
148440e23c03SJunchao Zhang         }
148540e23c03SJunchao Zhang       }
148640e23c03SJunchao Zhang     }
148740e23c03SJunchao Zhang 
1488fcc7397dSJunchao Zhang Z_dimension:
1489fcc7397dSJunchao Zhang     dz = m/(dx*dy);
1490fcc7397dSJunchao Zhang     Y  = dz > 1 ? (idx[p]-start)/X : dy;
1491fcc7397dSJunchao Zhang     /* Not optimizable if m is not a multiple of dx*dy, or some unrecognized pattern is found */
1492fcc7397dSJunchao Zhang     if (m%(dx*dy) || Y <= 0) {optimizable = PETSC_FALSE; goto finish;}
1493fcc7397dSJunchao Zhang     for (k=1; k<dz; k++) { /* Go through Z dimension to see if remaining indices follow the pattern */
1494fcc7397dSJunchao Zhang       for (j=0; j<dy; j++) {
1495fcc7397dSJunchao Zhang         for (i=0; i<dx; i++,p++) {
1496fcc7397dSJunchao Zhang           if (start+X*Y*k+X*j+i != idx[p]) {optimizable = PETSC_FALSE; goto finish;}
149740e23c03SJunchao Zhang         }
149840e23c03SJunchao Zhang       }
149940e23c03SJunchao Zhang     }
1500fcc7397dSJunchao Zhang     opt->start[r] = start;
1501fcc7397dSJunchao Zhang     opt->dx[r]    = dx;
1502fcc7397dSJunchao Zhang     opt->dy[r]    = dy;
1503fcc7397dSJunchao Zhang     opt->dz[r]    = dz;
1504fcc7397dSJunchao Zhang     opt->X[r]     = X;
1505fcc7397dSJunchao Zhang     opt->Y[r]     = Y;
150640e23c03SJunchao Zhang   }
150740e23c03SJunchao Zhang 
1508fcc7397dSJunchao Zhang finish:
1509fcc7397dSJunchao Zhang   /* If not optimizable, free arrays to save memory */
1510fcc7397dSJunchao Zhang   if (!n || !optimizable) {
1511fcc7397dSJunchao Zhang     ierr = PetscFree(opt->array);CHKERRQ(ierr);
151240e23c03SJunchao Zhang     ierr = PetscFree(opt);CHKERRQ(ierr);
151340e23c03SJunchao Zhang     *out = NULL;
1514fcc7397dSJunchao Zhang   } else {
1515fcc7397dSJunchao Zhang     opt->offset[0] = 0;
1516fcc7397dSJunchao Zhang     for (r=0; r<n; r++) opt->offset[r+1] = opt->offset[r] + opt->dx[r]*opt->dy[r]*opt->dz[r];
1517fcc7397dSJunchao Zhang     *out = opt;
1518fcc7397dSJunchao Zhang   }
151940e23c03SJunchao Zhang   PetscFunctionReturn(0);
152040e23c03SJunchao Zhang }
152140e23c03SJunchao Zhang 
152220c24465SJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFDestroyPackOpt(PetscSF sf,PetscMemType mtype,PetscSFPackOpt *out)
152340e23c03SJunchao Zhang {
152440e23c03SJunchao Zhang   PetscErrorCode ierr;
152540e23c03SJunchao Zhang   PetscSFPackOpt opt = *out;
152640e23c03SJunchao Zhang 
152740e23c03SJunchao Zhang   PetscFunctionBegin;
152840e23c03SJunchao Zhang   if (opt) {
152920c24465SJunchao Zhang     ierr = PetscSFFree(sf,mtype,opt->array);CHKERRQ(ierr);
153040e23c03SJunchao Zhang     ierr = PetscFree(opt);CHKERRQ(ierr);
153140e23c03SJunchao Zhang     *out = NULL;
153240e23c03SJunchao Zhang   }
153340e23c03SJunchao Zhang   PetscFunctionReturn(0);
153440e23c03SJunchao Zhang }
1535cd620004SJunchao Zhang 
1536cd620004SJunchao Zhang PetscErrorCode PetscSFSetUpPackFields(PetscSF sf)
1537cd620004SJunchao Zhang {
1538cd620004SJunchao Zhang   PetscErrorCode ierr;
1539cd620004SJunchao Zhang   PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
1540cd620004SJunchao Zhang   PetscInt       i,j;
1541cd620004SJunchao Zhang 
1542cd620004SJunchao Zhang   PetscFunctionBegin;
1543cd620004SJunchao Zhang   /* [0] for PETSCSF_LOCAL and [1] for PETSCSF_REMOTE in the following */
1544cd620004SJunchao Zhang   for (i=0; i<2; i++) { /* Set defaults */
1545cd620004SJunchao Zhang     sf->leafstart[i]   = 0;
1546cd620004SJunchao Zhang     sf->leafcontig[i]  = PETSC_TRUE;
1547cd620004SJunchao Zhang     sf->leafdups[i]    = PETSC_FALSE;
1548cd620004SJunchao Zhang     bas->rootstart[i]  = 0;
1549cd620004SJunchao Zhang     bas->rootcontig[i] = PETSC_TRUE;
1550cd620004SJunchao Zhang     bas->rootdups[i]   = PETSC_FALSE;
1551cd620004SJunchao Zhang   }
1552cd620004SJunchao Zhang 
1553cd620004SJunchao Zhang   sf->leafbuflen[0] = sf->roffset[sf->ndranks];
1554cd620004SJunchao Zhang   sf->leafbuflen[1] = sf->roffset[sf->nranks] - sf->roffset[sf->ndranks];
1555cd620004SJunchao Zhang 
1556cd620004SJunchao Zhang   if (sf->leafbuflen[0]) sf->leafstart[0] = sf->rmine[0];
1557cd620004SJunchao Zhang   if (sf->leafbuflen[1]) sf->leafstart[1] = sf->rmine[sf->roffset[sf->ndranks]];
1558cd620004SJunchao Zhang 
1559cd620004SJunchao Zhang   /* Are leaf indices for self and remote contiguous? If yes, it is best for pack/unpack */
1560cd620004SJunchao Zhang   for (i=0; i<sf->roffset[sf->ndranks]; i++) { /* self */
1561cd620004SJunchao Zhang     if (sf->rmine[i] != sf->leafstart[0]+i) {sf->leafcontig[0] = PETSC_FALSE; break;}
1562cd620004SJunchao Zhang   }
1563cd620004SJunchao Zhang   for (i=sf->roffset[sf->ndranks],j=0; i<sf->roffset[sf->nranks]; i++,j++) { /* remote */
1564cd620004SJunchao Zhang     if (sf->rmine[i] != sf->leafstart[1]+j) {sf->leafcontig[1] = PETSC_FALSE; break;}
1565cd620004SJunchao Zhang   }
1566cd620004SJunchao Zhang 
1567cd620004SJunchao Zhang   /* If not, see if we can have per-rank optimizations by doing index analysis */
1568cd620004SJunchao Zhang   if (!sf->leafcontig[0]) {ierr = PetscSFCreatePackOpt(sf->ndranks,            sf->roffset,             sf->rmine, &sf->leafpackopt[0]);CHKERRQ(ierr);}
1569cd620004SJunchao Zhang   if (!sf->leafcontig[1]) {ierr = PetscSFCreatePackOpt(sf->nranks-sf->ndranks, sf->roffset+sf->ndranks, sf->rmine, &sf->leafpackopt[1]);CHKERRQ(ierr);}
1570cd620004SJunchao Zhang 
1571cd620004SJunchao Zhang   /* Are root indices for self and remote contiguous? */
1572cd620004SJunchao Zhang   bas->rootbuflen[0] = bas->ioffset[bas->ndiranks];
1573cd620004SJunchao Zhang   bas->rootbuflen[1] = bas->ioffset[bas->niranks] - bas->ioffset[bas->ndiranks];
1574cd620004SJunchao Zhang 
1575cd620004SJunchao Zhang   if (bas->rootbuflen[0]) bas->rootstart[0] = bas->irootloc[0];
1576cd620004SJunchao Zhang   if (bas->rootbuflen[1]) bas->rootstart[1] = bas->irootloc[bas->ioffset[bas->ndiranks]];
1577cd620004SJunchao Zhang 
1578cd620004SJunchao Zhang   for (i=0; i<bas->ioffset[bas->ndiranks]; i++) {
1579cd620004SJunchao Zhang     if (bas->irootloc[i] != bas->rootstart[0]+i) {bas->rootcontig[0] = PETSC_FALSE; break;}
1580cd620004SJunchao Zhang   }
1581cd620004SJunchao Zhang   for (i=bas->ioffset[bas->ndiranks],j=0; i<bas->ioffset[bas->niranks]; i++,j++) {
1582cd620004SJunchao Zhang     if (bas->irootloc[i] != bas->rootstart[1]+j) {bas->rootcontig[1] = PETSC_FALSE; break;}
1583cd620004SJunchao Zhang   }
1584cd620004SJunchao Zhang 
1585cd620004SJunchao Zhang   if (!bas->rootcontig[0]) {ierr = PetscSFCreatePackOpt(bas->ndiranks,              bas->ioffset,               bas->irootloc, &bas->rootpackopt[0]);CHKERRQ(ierr);}
1586cd620004SJunchao Zhang   if (!bas->rootcontig[1]) {ierr = PetscSFCreatePackOpt(bas->niranks-bas->ndiranks, bas->ioffset+bas->ndiranks, bas->irootloc, &bas->rootpackopt[1]);CHKERRQ(ierr);}
1587cd620004SJunchao Zhang 
15887fd2d3dbSJunchao Zhang #if defined(PETSC_HAVE_DEVICE)
1589cd620004SJunchao Zhang     /* Check dups in indices so that CUDA unpacking kernels can use cheaper regular instructions instead of atomics when they know there are no data race chances */
1590013b3241SStefano Zampini   if (PetscDefined(HAVE_DEVICE)) {
1591013b3241SStefano Zampini     PetscBool ismulti = (sf->multi == sf) ? PETSC_TRUE : PETSC_FALSE;
1592013b3241SStefano Zampini     if (!sf->leafcontig[0]  && !ismulti) {ierr = PetscCheckDupsInt(sf->leafbuflen[0],  sf->rmine,                                 &sf->leafdups[0]);CHKERRQ(ierr);}
1593013b3241SStefano Zampini     if (!sf->leafcontig[1]  && !ismulti) {ierr = PetscCheckDupsInt(sf->leafbuflen[1],  sf->rmine+sf->roffset[sf->ndranks],        &sf->leafdups[1]);CHKERRQ(ierr);}
1594013b3241SStefano Zampini     if (!bas->rootcontig[0] && !ismulti) {ierr = PetscCheckDupsInt(bas->rootbuflen[0], bas->irootloc,                             &bas->rootdups[0]);CHKERRQ(ierr);}
1595013b3241SStefano Zampini     if (!bas->rootcontig[1] && !ismulti) {ierr = PetscCheckDupsInt(bas->rootbuflen[1], bas->irootloc+bas->ioffset[bas->ndiranks], &bas->rootdups[1]);CHKERRQ(ierr);}
1596013b3241SStefano Zampini   }
1597cd620004SJunchao Zhang #endif
1598cd620004SJunchao Zhang 
1599cd620004SJunchao Zhang   PetscFunctionReturn(0);
1600cd620004SJunchao Zhang }
1601cd620004SJunchao Zhang 
1602cd620004SJunchao Zhang PetscErrorCode PetscSFResetPackFields(PetscSF sf)
1603cd620004SJunchao Zhang {
1604cd620004SJunchao Zhang   PetscErrorCode ierr;
1605cd620004SJunchao Zhang   PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
1606cd620004SJunchao Zhang   PetscInt       i;
1607cd620004SJunchao Zhang 
1608cd620004SJunchao Zhang   PetscFunctionBegin;
1609cd620004SJunchao Zhang   for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) {
161020c24465SJunchao Zhang     ierr = PetscSFDestroyPackOpt(sf,PETSC_MEMTYPE_HOST,&sf->leafpackopt[i]);CHKERRQ(ierr);
161120c24465SJunchao Zhang     ierr = PetscSFDestroyPackOpt(sf,PETSC_MEMTYPE_HOST,&bas->rootpackopt[i]);CHKERRQ(ierr);
16127fd2d3dbSJunchao Zhang #if defined(PETSC_HAVE_DEVICE)
161320c24465SJunchao Zhang     ierr = PetscSFDestroyPackOpt(sf,PETSC_MEMTYPE_DEVICE,&sf->leafpackopt_d[i]);CHKERRQ(ierr);
161420c24465SJunchao Zhang     ierr = PetscSFDestroyPackOpt(sf,PETSC_MEMTYPE_DEVICE,&bas->rootpackopt_d[i]);CHKERRQ(ierr);
16157fd2d3dbSJunchao Zhang #endif
1616cd620004SJunchao Zhang   }
1617cd620004SJunchao Zhang   PetscFunctionReturn(0);
1618cd620004SJunchao Zhang }
1619