xref: /petsc/src/vec/is/sf/impls/basic/sfpack.c (revision 7fd2d3dbf8105d3f2e002a5ca11f019cd0ad7420)
140e23c03SJunchao Zhang #include <../src/vec/is/sf/impls/basic/sfpack.h>
240e23c03SJunchao Zhang #include <../src/vec/is/sf/impls/basic/sfbasic.h>
340e23c03SJunchao Zhang 
4*7fd2d3dbSJunchao Zhang /* This is a C file that contains packing facilities, with dispatches to device if enabled. */
5*7fd2d3dbSJunchao Zhang 
6eb02082bSJunchao Zhang #if defined(PETSC_HAVE_CUDA)
7eb02082bSJunchao Zhang #include <cuda_runtime.h>
8*7fd2d3dbSJunchao Zhang #include <petsccublas.h>
9eb02082bSJunchao Zhang #endif
1040e23c03SJunchao Zhang /*
1140e23c03SJunchao Zhang  * MPI_Reduce_local is not really useful because it can't handle sparse data and it vectorizes "in the wrong direction",
1240e23c03SJunchao Zhang  * therefore we pack data types manually. This file defines packing routines for the standard data types.
1340e23c03SJunchao Zhang  */
1440e23c03SJunchao Zhang 
15cd620004SJunchao Zhang #define CPPJoin4(a,b,c,d)  a##_##b##_##c##_##d
1640e23c03SJunchao Zhang 
17cd620004SJunchao Zhang /* Operations working like s += t */
18cd620004SJunchao Zhang #define OP_BINARY(op,s,t)   do {(s) = (s) op (t);  } while (0)      /* binary ops in the middle such as +, *, && etc. */
19cd620004SJunchao Zhang #define OP_FUNCTION(op,s,t) do {(s) = op((s),(t)); } while (0)      /* ops like a function, such as PetscMax, PetscMin */
20cd620004SJunchao Zhang #define OP_LXOR(op,s,t)     do {(s) = (!(s)) != (!(t));} while (0)  /* logical exclusive OR */
21cd620004SJunchao Zhang #define OP_ASSIGN(op,s,t)   do {(s) = (t);} while (0)
22cd620004SJunchao Zhang /* Ref MPI MAXLOC */
23cd620004SJunchao Zhang #define OP_XLOC(op,s,t) \
24cd620004SJunchao Zhang   do {                                       \
25cd620004SJunchao Zhang     if ((s).u == (t).u) (s).i = PetscMin((s).i,(t).i); \
26cd620004SJunchao Zhang     else if (!((s).u op (t).u)) s = t;           \
27cd620004SJunchao Zhang   } while (0)
2840e23c03SJunchao Zhang 
2940e23c03SJunchao Zhang /* DEF_PackFunc - macro defining a Pack routine
3040e23c03SJunchao Zhang 
3140e23c03SJunchao Zhang    Arguments of the macro:
32b23bfdefSJunchao Zhang    +Type      Type of the basic data in an entry, i.e., int, PetscInt, PetscReal etc. It is not the type of an entry.
33fcc7397dSJunchao Zhang    .BS        Block size for vectorization. It is a factor of bsz.
34b23bfdefSJunchao Zhang    -EQ        (bs == BS) ? 1 : 0. EQ is a compile-time const to help compiler optimizations. See below.
3540e23c03SJunchao Zhang 
3640e23c03SJunchao Zhang    Arguments of the Pack routine:
37cd620004SJunchao Zhang    +count     Number of indices in idx[].
38fcc7397dSJunchao Zhang    .start     When opt and idx are NULL, it means indices are contiguous & start is the first index; otherwise, not used.
39fcc7397dSJunchao Zhang    .opt       Per-pack optimization plan. NULL means no such plan.
40fcc7397dSJunchao Zhang    .idx       Indices of entries to packed.
41eb02082bSJunchao Zhang    .link      Provide a context for the current call, such as link->bs, number of basic types in an entry. Ex. if unit is MPI_2INT, then bs=2 and the basic type is int.
42cd620004SJunchao Zhang    .unpacked  Address of the unpacked data. The entries will be packed are unpacked[idx[i]],for i in [0,count).
43cd620004SJunchao Zhang    -packed    Address of the packed data.
4440e23c03SJunchao Zhang  */
45b23bfdefSJunchao Zhang #define DEF_PackFunc(Type,BS,EQ) \
46fcc7397dSJunchao Zhang   static PetscErrorCode CPPJoin4(Pack,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,const void *unpacked,void *packed) \
47b23bfdefSJunchao Zhang   {                                                                                                          \
4840e23c03SJunchao Zhang     PetscErrorCode ierr;                                                                                     \
49b23bfdefSJunchao Zhang     const Type     *u = (const Type*)unpacked,*u2;                                                           \
50b23bfdefSJunchao Zhang     Type           *p = (Type*)packed,*p2;                                                                   \
51fcc7397dSJunchao Zhang     PetscInt       i,j,k,X,Y,r,bs=link->bs;                                                                  \
52fcc7397dSJunchao Zhang     const PetscInt M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */          \
53b23bfdefSJunchao Zhang     const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */                \
5440e23c03SJunchao Zhang     PetscFunctionBegin;                                                                                      \
55fcc7397dSJunchao Zhang     if (!idx) {ierr = PetscArraycpy(p,u+start*MBS,MBS*count);CHKERRQ(ierr);}/* idx[] are contiguous */       \
56fcc7397dSJunchao Zhang     else if (opt) { /* has optimizations available */                                                        \
57fcc7397dSJunchao Zhang       p2 = p;                                                                                                \
58fcc7397dSJunchao Zhang       for (r=0; r<opt->n; r++) {                                                                             \
59fcc7397dSJunchao Zhang         u2 = u + opt->start[r]*MBS;                                                                          \
60fcc7397dSJunchao Zhang         X  = opt->X[r];                                                                                      \
61fcc7397dSJunchao Zhang         Y  = opt->Y[r];                                                                                      \
62fcc7397dSJunchao Zhang         for (k=0; k<opt->dz[r]; k++)                                                                         \
63fcc7397dSJunchao Zhang           for (j=0; j<opt->dy[r]; j++) {                                                                     \
64fcc7397dSJunchao Zhang             ierr = PetscArraycpy(p2,u2+(X*Y*k+X*j)*MBS,opt->dx[r]*MBS);CHKERRQ(ierr);                        \
65fcc7397dSJunchao Zhang             p2  += opt->dx[r]*MBS;                                                                           \
66fcc7397dSJunchao Zhang           }                                                                                                  \
67fcc7397dSJunchao Zhang       }                                                                                                      \
68fcc7397dSJunchao Zhang     } else {                                                                                                 \
69b23bfdefSJunchao Zhang       for (i=0; i<count; i++)                                                                                \
70eb02082bSJunchao Zhang         for (j=0; j<M; j++)     /* Decent compilers should eliminate this loop when M = const 1 */           \
71eb02082bSJunchao Zhang           for (k=0; k<BS; k++)  /* Compiler either unrolls (BS=1) or vectorizes (BS=2,4,8,etc) this loop */  \
72b23bfdefSJunchao Zhang             p[i*MBS+j*BS+k] = u[idx[i]*MBS+j*BS+k];                                                          \
7340e23c03SJunchao Zhang     }                                                                                                        \
7440e23c03SJunchao Zhang     PetscFunctionReturn(0);                                                                                  \
7540e23c03SJunchao Zhang   }
7640e23c03SJunchao Zhang 
77cd620004SJunchao Zhang /* DEF_Action - macro defining a UnpackAndInsert routine that unpacks data from a contiguous buffer
78cd620004SJunchao Zhang                 and inserts into a sparse array.
7940e23c03SJunchao Zhang 
8040e23c03SJunchao Zhang    Arguments:
81b23bfdefSJunchao Zhang   .Type       Type of the data
8240e23c03SJunchao Zhang   .BS         Block size for vectorization
83b23bfdefSJunchao Zhang   .EQ        (bs == BS) ? 1 : 0. EQ is a compile-time const.
8440e23c03SJunchao Zhang 
8540e23c03SJunchao Zhang   Notes:
8640e23c03SJunchao Zhang    This macro is not combined with DEF_ActionAndOp because we want to use memcpy in this macro.
8740e23c03SJunchao Zhang  */
88cd620004SJunchao Zhang #define DEF_UnpackFunc(Type,BS,EQ)               \
89fcc7397dSJunchao Zhang   static PetscErrorCode CPPJoin4(UnpackAndInsert,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,void *unpacked,const void *packed) \
90b23bfdefSJunchao Zhang   {                                                                                                          \
9140e23c03SJunchao Zhang     PetscErrorCode ierr;                                                                                     \
92b23bfdefSJunchao Zhang     Type           *u = (Type*)unpacked,*u2;                                                                 \
93fcc7397dSJunchao Zhang     const Type     *p = (const Type*)packed;                                                                 \
94fcc7397dSJunchao Zhang     PetscInt       i,j,k,X,Y,r,bs=link->bs;                                                                  \
95fcc7397dSJunchao Zhang     const PetscInt M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */          \
96b23bfdefSJunchao Zhang     const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */                \
9740e23c03SJunchao Zhang     PetscFunctionBegin;                                                                                      \
98b23bfdefSJunchao Zhang     if (!idx) {                                                                                              \
99fcc7397dSJunchao Zhang       u += start*MBS;                                                                                        \
100fcc7397dSJunchao Zhang       if (u != p) {ierr = PetscArraycpy(u,p,count*MBS);CHKERRQ(ierr);}                                       \
101fcc7397dSJunchao Zhang     } else if (opt) { /* has optimizations available */                                                      \
102fcc7397dSJunchao Zhang       for (r=0; r<opt->n; r++) {                                                                             \
103fcc7397dSJunchao Zhang         u2 = u + opt->start[r]*MBS;                                                                          \
104fcc7397dSJunchao Zhang         X  = opt->X[r];                                                                                      \
105fcc7397dSJunchao Zhang         Y  = opt->Y[r];                                                                                      \
106fcc7397dSJunchao Zhang         for (k=0; k<opt->dz[r]; k++)                                                                         \
107fcc7397dSJunchao Zhang           for (j=0; j<opt->dy[r]; j++) {                                                                     \
108fcc7397dSJunchao Zhang             ierr = PetscArraycpy(u2+(X*Y*k+X*j)*MBS,p,opt->dx[r]*MBS);CHKERRQ(ierr);                         \
109fcc7397dSJunchao Zhang             p   += opt->dx[r]*MBS;                                                                           \
110fcc7397dSJunchao Zhang           }                                                                                                  \
111fcc7397dSJunchao Zhang       }                                                                                                      \
112fcc7397dSJunchao Zhang     } else {                                                                                                 \
113b23bfdefSJunchao Zhang       for (i=0; i<count; i++)                                                                                \
114b23bfdefSJunchao Zhang         for (j=0; j<M; j++)                                                                                  \
115cd620004SJunchao Zhang           for (k=0; k<BS; k++) u[idx[i]*MBS+j*BS+k] = p[i*MBS+j*BS+k];                                       \
11640e23c03SJunchao Zhang     }                                                                                                        \
11740e23c03SJunchao Zhang     PetscFunctionReturn(0);                                                                                  \
11840e23c03SJunchao Zhang   }
11940e23c03SJunchao Zhang 
120cd620004SJunchao Zhang /* DEF_UnpackAndOp - macro defining a UnpackAndOp routine where Op should not be Insert
12140e23c03SJunchao Zhang 
12240e23c03SJunchao Zhang    Arguments:
123cd620004SJunchao Zhang   +Opname     Name of the Op, such as Add, Mult, LAND, etc.
124b23bfdefSJunchao Zhang   .Type       Type of the data
12540e23c03SJunchao Zhang   .BS         Block size for vectorization
126b23bfdefSJunchao Zhang   .EQ         (bs == BS) ? 1 : 0. EQ is a compile-time const.
127cd620004SJunchao Zhang   .Op         Operator for the op, such as +, *, &&, ||, PetscMax, PetscMin, etc.
128cd620004SJunchao Zhang   .OpApply    Macro defining application of the op. Could be OP_BINARY, OP_FUNCTION, OP_LXOR
12940e23c03SJunchao Zhang  */
130cd620004SJunchao Zhang #define DEF_UnpackAndOp(Type,BS,EQ,Opname,Op,OpApply) \
131fcc7397dSJunchao Zhang   static PetscErrorCode CPPJoin4(UnpackAnd##Opname,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,void *unpacked,const void *packed) \
132b23bfdefSJunchao Zhang   {                                                                                                          \
133cd620004SJunchao Zhang     Type           *u = (Type*)unpacked,*u2;                                                                 \
134fcc7397dSJunchao Zhang     const Type     *p = (const Type*)packed;                                                                 \
135fcc7397dSJunchao Zhang     PetscInt       i,j,k,X,Y,r,bs=link->bs;                                                                  \
136fcc7397dSJunchao Zhang     const PetscInt M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */          \
137b23bfdefSJunchao Zhang     const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */                \
13840e23c03SJunchao Zhang     PetscFunctionBegin;                                                                                      \
139b23bfdefSJunchao Zhang     if (!idx) {                                                                                              \
140fcc7397dSJunchao Zhang       u += start*MBS;                                                                                        \
141cd620004SJunchao Zhang       for (i=0; i<count; i++)                                                                                \
142cd620004SJunchao Zhang         for (j=0; j<M; j++)                                                                                  \
143cd620004SJunchao Zhang           for (k=0; k<BS; k++)                                                                               \
144cd620004SJunchao Zhang             OpApply(Op,u[i*MBS+j*BS+k],p[i*MBS+j*BS+k]);                                                     \
145fcc7397dSJunchao Zhang     } else if (opt) { /* idx[] has patterns */                                                               \
146fcc7397dSJunchao Zhang       for (r=0; r<opt->n; r++) {                                                                             \
147fcc7397dSJunchao Zhang         u2 = u + opt->start[r]*MBS;                                                                          \
148fcc7397dSJunchao Zhang         X  = opt->X[r];                                                                                      \
149fcc7397dSJunchao Zhang         Y  = opt->Y[r];                                                                                      \
150fcc7397dSJunchao Zhang         for (k=0; k<opt->dz[r]; k++)                                                                         \
151fcc7397dSJunchao Zhang           for (j=0; j<opt->dy[r]; j++) {                                                                     \
152fcc7397dSJunchao Zhang             for (i=0; i<opt->dx[r]*MBS; i++) OpApply(Op,u2[(X*Y*k+X*j)*MBS+i],p[i]);                         \
153fcc7397dSJunchao Zhang             p += opt->dx[r]*MBS;                                                                             \
154fcc7397dSJunchao Zhang           }                                                                                                  \
155fcc7397dSJunchao Zhang       }                                                                                                      \
156fcc7397dSJunchao Zhang     } else {                                                                                                 \
157cd620004SJunchao Zhang       for (i=0; i<count; i++)                                                                                \
158cd620004SJunchao Zhang         for (j=0; j<M; j++)                                                                                  \
159cd620004SJunchao Zhang           for (k=0; k<BS; k++)                                                                               \
160cd620004SJunchao Zhang             OpApply(Op,u[idx[i]*MBS+j*BS+k],p[i*MBS+j*BS+k]);                                                \
161cd620004SJunchao Zhang     }                                                                                                        \
162cd620004SJunchao Zhang     PetscFunctionReturn(0);                                                                                  \
163cd620004SJunchao Zhang   }
164cd620004SJunchao Zhang 
165cd620004SJunchao Zhang #define DEF_FetchAndOp(Type,BS,EQ,Opname,Op,OpApply) \
166fcc7397dSJunchao Zhang   static PetscErrorCode CPPJoin4(FetchAnd##Opname,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,void *unpacked,void *packed) \
167cd620004SJunchao Zhang   {                                                                                                          \
168fcc7397dSJunchao Zhang     Type           *u = (Type*)unpacked,*p = (Type*)packed,tmp;                                              \
169fcc7397dSJunchao Zhang     PetscInt       i,j,k,r,l,bs=link->bs;                                                                    \
170fcc7397dSJunchao Zhang     const PetscInt M = (EQ) ? 1 : bs/BS;                                                                     \
171fcc7397dSJunchao Zhang     const PetscInt MBS = M*BS;                                                                               \
172cd620004SJunchao Zhang     PetscFunctionBegin;                                                                                      \
173fcc7397dSJunchao Zhang     for (i=0; i<count; i++) {                                                                                \
174fcc7397dSJunchao Zhang       r = (!idx ? start+i : idx[i])*MBS;                                                                     \
175fcc7397dSJunchao Zhang       l = i*MBS;                                                                                             \
176b23bfdefSJunchao Zhang       for (j=0; j<M; j++)                                                                                    \
177b23bfdefSJunchao Zhang         for (k=0; k<BS; k++) {                                                                               \
178fcc7397dSJunchao Zhang           tmp = u[r+j*BS+k];                                                                                 \
179fcc7397dSJunchao Zhang           OpApply(Op,u[r+j*BS+k],p[l+j*BS+k]);                                                               \
180fcc7397dSJunchao Zhang           p[l+j*BS+k] = tmp;                                                                                 \
181cd620004SJunchao Zhang         }                                                                                                    \
182cd620004SJunchao Zhang     }                                                                                                        \
183cd620004SJunchao Zhang     PetscFunctionReturn(0);                                                                                  \
184cd620004SJunchao Zhang   }
185cd620004SJunchao Zhang 
186cd620004SJunchao Zhang #define DEF_ScatterAndOp(Type,BS,EQ,Opname,Op,OpApply) \
187fcc7397dSJunchao Zhang   static PetscErrorCode CPPJoin4(ScatterAnd##Opname,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt srcStart,PetscSFPackOpt srcOpt,const PetscInt *srcIdx,const void *src,PetscInt dstStart,PetscSFPackOpt dstOpt,const PetscInt *dstIdx,void *dst) \
188cd620004SJunchao Zhang   {                                                                                                          \
189fcc7397dSJunchao Zhang     PetscErrorCode ierr;                                                                                     \
190fcc7397dSJunchao Zhang     const Type     *u = (const Type*)src;                                                                    \
191fcc7397dSJunchao Zhang     Type           *v = (Type*)dst;                                                                          \
192fcc7397dSJunchao Zhang     PetscInt       i,j,k,s,t,X,Y,bs = link->bs;                                                              \
193cd620004SJunchao Zhang     const PetscInt M = (EQ) ? 1 : bs/BS;                                                                     \
194cd620004SJunchao Zhang     const PetscInt MBS = M*BS;                                                                               \
195cd620004SJunchao Zhang     PetscFunctionBegin;                                                                                      \
196fcc7397dSJunchao Zhang     if (!srcIdx) { /* src is contiguous */                                                                   \
197fcc7397dSJunchao Zhang       u += srcStart*MBS;                                                                                     \
198fcc7397dSJunchao Zhang       ierr = CPPJoin4(UnpackAnd##Opname,Type,BS,EQ)(link,count,dstStart,dstOpt,dstIdx,dst,u);CHKERRQ(ierr);  \
199fcc7397dSJunchao Zhang     } else if (srcOpt && !dstIdx) { /* src is 3D, dst is contiguous */                                       \
200fcc7397dSJunchao Zhang       u += srcOpt->start[0]*MBS;                                                                             \
201fcc7397dSJunchao Zhang       v += dstStart*MBS;                                                                                     \
202fcc7397dSJunchao Zhang       X  = srcOpt->X[0]; Y = srcOpt->Y[0];                                                                   \
203fcc7397dSJunchao Zhang       for (k=0; k<srcOpt->dz[0]; k++)                                                                        \
204fcc7397dSJunchao Zhang         for (j=0; j<srcOpt->dy[0]; j++) {                                                                    \
205fcc7397dSJunchao Zhang           for (i=0; i<srcOpt->dx[0]*MBS; i++) OpApply(Op,v[i],u[(X*Y*k+X*j)*MBS+i]);                         \
206fcc7397dSJunchao Zhang           v += srcOpt->dx[0]*MBS;                                                                            \
207fcc7397dSJunchao Zhang         }                                                                                                    \
208fcc7397dSJunchao Zhang     } else { /* all other cases */                                                                           \
209fcc7397dSJunchao Zhang       for (i=0; i<count; i++) {                                                                              \
210fcc7397dSJunchao Zhang         s = (!srcIdx ? srcStart+i : srcIdx[i])*MBS;                                                          \
211fcc7397dSJunchao Zhang         t = (!dstIdx ? dstStart+i : dstIdx[i])*MBS;                                                          \
212cd620004SJunchao Zhang         for (j=0; j<M; j++)                                                                                  \
213fcc7397dSJunchao Zhang           for (k=0; k<BS; k++) OpApply(Op,v[t+j*BS+k],u[s+j*BS+k]);                                          \
214fcc7397dSJunchao Zhang       }                                                                                                      \
215cd620004SJunchao Zhang     }                                                                                                        \
216cd620004SJunchao Zhang     PetscFunctionReturn(0);                                                                                  \
217cd620004SJunchao Zhang   }
218cd620004SJunchao Zhang 
219cd620004SJunchao Zhang #define DEF_FetchAndOpLocal(Type,BS,EQ,Opname,Op,OpApply) \
220fcc7397dSJunchao Zhang   static PetscErrorCode CPPJoin4(FetchAnd##Opname##Local,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt rootstart,PetscSFPackOpt rootopt,const PetscInt *rootidx,void *rootdata,PetscInt leafstart,PetscSFPackOpt leafopt,const PetscInt *leafidx,const void *leafdata,void *leafupdate) \
221cd620004SJunchao Zhang   {                                                                                                          \
222fcc7397dSJunchao Zhang     Type           *rdata = (Type*)rootdata,*lupdate = (Type*)leafupdate;                                    \
223fcc7397dSJunchao Zhang     const Type     *ldata = (const Type*)leafdata;                                                           \
224fcc7397dSJunchao Zhang     PetscInt       i,j,k,r,l,bs = link->bs;                                                                  \
225cd620004SJunchao Zhang     const PetscInt M = (EQ) ? 1 : bs/BS;                                                                     \
226cd620004SJunchao Zhang     const PetscInt MBS = M*BS;                                                                               \
227cd620004SJunchao Zhang     PetscFunctionBegin;                                                                                      \
228fcc7397dSJunchao Zhang     for (i=0; i<count; i++) {                                                                                \
229fcc7397dSJunchao Zhang       r = (rootidx ? rootidx[i] : rootstart+i)*MBS;                                                          \
230fcc7397dSJunchao Zhang       l = (leafidx ? leafidx[i] : leafstart+i)*MBS;                                                          \
231cd620004SJunchao Zhang       for (j=0; j<M; j++)                                                                                    \
232cd620004SJunchao Zhang         for (k=0; k<BS; k++) {                                                                               \
233fcc7397dSJunchao Zhang           lupdate[l+j*BS+k] = rdata[r+j*BS+k];                                                               \
234fcc7397dSJunchao Zhang           OpApply(Op,rdata[r+j*BS+k],ldata[l+j*BS+k]);                                                       \
23540e23c03SJunchao Zhang         }                                                                                                    \
23640e23c03SJunchao Zhang     }                                                                                                        \
23740e23c03SJunchao Zhang     PetscFunctionReturn(0);                                                                                  \
23840e23c03SJunchao Zhang   }
23940e23c03SJunchao Zhang 
240b23bfdefSJunchao Zhang /* Pack, Unpack/Fetch ops */
241b23bfdefSJunchao Zhang #define DEF_Pack(Type,BS,EQ)                                                      \
242b23bfdefSJunchao Zhang   DEF_PackFunc(Type,BS,EQ)                                                        \
243cd620004SJunchao Zhang   DEF_UnpackFunc(Type,BS,EQ)                                                      \
244cd620004SJunchao Zhang   DEF_ScatterAndOp(Type,BS,EQ,Insert,=,OP_ASSIGN)                                 \
245cd620004SJunchao Zhang   static void CPPJoin4(PackInit_Pack,Type,BS,EQ)(PetscSFLink link) {              \
246eb02082bSJunchao Zhang     link->h_Pack            = CPPJoin4(Pack,           Type,BS,EQ);               \
247eb02082bSJunchao Zhang     link->h_UnpackAndInsert = CPPJoin4(UnpackAndInsert,Type,BS,EQ);               \
248cd620004SJunchao Zhang     link->h_ScatterAndInsert= CPPJoin4(ScatterAndInsert,Type,BS,EQ);              \
24940e23c03SJunchao Zhang   }
25040e23c03SJunchao Zhang 
251b23bfdefSJunchao Zhang /* Add, Mult ops */
252b23bfdefSJunchao Zhang #define DEF_Add(Type,BS,EQ)                                                       \
253cd620004SJunchao Zhang   DEF_UnpackAndOp    (Type,BS,EQ,Add, +,OP_BINARY)                                \
254cd620004SJunchao Zhang   DEF_UnpackAndOp    (Type,BS,EQ,Mult,*,OP_BINARY)                                \
255cd620004SJunchao Zhang   DEF_FetchAndOp     (Type,BS,EQ,Add, +,OP_BINARY)                                \
256cd620004SJunchao Zhang   DEF_ScatterAndOp   (Type,BS,EQ,Add, +,OP_BINARY)                                \
257cd620004SJunchao Zhang   DEF_ScatterAndOp   (Type,BS,EQ,Mult,*,OP_BINARY)                                \
258cd620004SJunchao Zhang   DEF_FetchAndOpLocal(Type,BS,EQ,Add, +,OP_BINARY)                                \
259cd620004SJunchao Zhang   static void CPPJoin4(PackInit_Add,Type,BS,EQ)(PetscSFLink link) {               \
260eb02082bSJunchao Zhang     link->h_UnpackAndAdd     = CPPJoin4(UnpackAndAdd,    Type,BS,EQ);             \
261eb02082bSJunchao Zhang     link->h_UnpackAndMult    = CPPJoin4(UnpackAndMult,   Type,BS,EQ);             \
262eb02082bSJunchao Zhang     link->h_FetchAndAdd      = CPPJoin4(FetchAndAdd,     Type,BS,EQ);             \
263cd620004SJunchao Zhang     link->h_ScatterAndAdd    = CPPJoin4(ScatterAndAdd,   Type,BS,EQ);             \
264cd620004SJunchao Zhang     link->h_ScatterAndMult   = CPPJoin4(ScatterAndMult,  Type,BS,EQ);             \
265cd620004SJunchao Zhang     link->h_FetchAndAddLocal = CPPJoin4(FetchAndAddLocal,Type,BS,EQ);             \
26640e23c03SJunchao Zhang   }
26740e23c03SJunchao Zhang 
268b23bfdefSJunchao Zhang /* Max, Min ops */
269b23bfdefSJunchao Zhang #define DEF_Cmp(Type,BS,EQ)                                                       \
270cd620004SJunchao Zhang   DEF_UnpackAndOp (Type,BS,EQ,Max,PetscMax,OP_FUNCTION)                           \
271cd620004SJunchao Zhang   DEF_UnpackAndOp (Type,BS,EQ,Min,PetscMin,OP_FUNCTION)                           \
272cd620004SJunchao Zhang   DEF_ScatterAndOp(Type,BS,EQ,Max,PetscMax,OP_FUNCTION)                           \
273cd620004SJunchao Zhang   DEF_ScatterAndOp(Type,BS,EQ,Min,PetscMin,OP_FUNCTION)                           \
274cd620004SJunchao Zhang   static void CPPJoin4(PackInit_Compare,Type,BS,EQ)(PetscSFLink link) {           \
275eb02082bSJunchao Zhang     link->h_UnpackAndMax    = CPPJoin4(UnpackAndMax,   Type,BS,EQ);               \
276eb02082bSJunchao Zhang     link->h_UnpackAndMin    = CPPJoin4(UnpackAndMin,   Type,BS,EQ);               \
277cd620004SJunchao Zhang     link->h_ScatterAndMax   = CPPJoin4(ScatterAndMax,  Type,BS,EQ);               \
278cd620004SJunchao Zhang     link->h_ScatterAndMin   = CPPJoin4(ScatterAndMin,  Type,BS,EQ);               \
279b23bfdefSJunchao Zhang   }
280b23bfdefSJunchao Zhang 
281b23bfdefSJunchao Zhang /* Logical ops.
282cd620004SJunchao Zhang   The operator in OP_LXOR should be empty but is ||. It is not used. Put here to avoid
28340e23c03SJunchao Zhang   the compilation warning "empty macro arguments are undefined in ISO C90"
28440e23c03SJunchao Zhang  */
285b23bfdefSJunchao Zhang #define DEF_Log(Type,BS,EQ)                                                       \
286cd620004SJunchao Zhang   DEF_UnpackAndOp (Type,BS,EQ,LAND,&&,OP_BINARY)                                  \
287cd620004SJunchao Zhang   DEF_UnpackAndOp (Type,BS,EQ,LOR, ||,OP_BINARY)                                  \
288cd620004SJunchao Zhang   DEF_UnpackAndOp (Type,BS,EQ,LXOR,||, OP_LXOR)                                   \
289cd620004SJunchao Zhang   DEF_ScatterAndOp(Type,BS,EQ,LAND,&&,OP_BINARY)                                  \
290cd620004SJunchao Zhang   DEF_ScatterAndOp(Type,BS,EQ,LOR, ||,OP_BINARY)                                  \
291cd620004SJunchao Zhang   DEF_ScatterAndOp(Type,BS,EQ,LXOR,||, OP_LXOR)                                   \
292cd620004SJunchao Zhang   static void CPPJoin4(PackInit_Logical,Type,BS,EQ)(PetscSFLink link) {           \
293eb02082bSJunchao Zhang     link->h_UnpackAndLAND   = CPPJoin4(UnpackAndLAND, Type,BS,EQ);                \
294eb02082bSJunchao Zhang     link->h_UnpackAndLOR    = CPPJoin4(UnpackAndLOR,  Type,BS,EQ);                \
295eb02082bSJunchao Zhang     link->h_UnpackAndLXOR   = CPPJoin4(UnpackAndLXOR, Type,BS,EQ);                \
296cd620004SJunchao Zhang     link->h_ScatterAndLAND  = CPPJoin4(ScatterAndLAND,Type,BS,EQ);                \
297cd620004SJunchao Zhang     link->h_ScatterAndLOR   = CPPJoin4(ScatterAndLOR, Type,BS,EQ);                \
298cd620004SJunchao Zhang     link->h_ScatterAndLXOR  = CPPJoin4(ScatterAndLXOR,Type,BS,EQ);                \
29940e23c03SJunchao Zhang   }
30040e23c03SJunchao Zhang 
301b23bfdefSJunchao Zhang /* Bitwise ops */
302b23bfdefSJunchao Zhang #define DEF_Bit(Type,BS,EQ)                                                       \
303cd620004SJunchao Zhang   DEF_UnpackAndOp (Type,BS,EQ,BAND,&,OP_BINARY)                                   \
304cd620004SJunchao Zhang   DEF_UnpackAndOp (Type,BS,EQ,BOR, |,OP_BINARY)                                   \
305cd620004SJunchao Zhang   DEF_UnpackAndOp (Type,BS,EQ,BXOR,^,OP_BINARY)                                   \
306cd620004SJunchao Zhang   DEF_ScatterAndOp(Type,BS,EQ,BAND,&,OP_BINARY)                                   \
307cd620004SJunchao Zhang   DEF_ScatterAndOp(Type,BS,EQ,BOR, |,OP_BINARY)                                   \
308cd620004SJunchao Zhang   DEF_ScatterAndOp(Type,BS,EQ,BXOR,^,OP_BINARY)                                   \
309cd620004SJunchao Zhang   static void CPPJoin4(PackInit_Bitwise,Type,BS,EQ)(PetscSFLink link) {           \
310eb02082bSJunchao Zhang     link->h_UnpackAndBAND   = CPPJoin4(UnpackAndBAND, Type,BS,EQ);                \
311eb02082bSJunchao Zhang     link->h_UnpackAndBOR    = CPPJoin4(UnpackAndBOR,  Type,BS,EQ);                \
312eb02082bSJunchao Zhang     link->h_UnpackAndBXOR   = CPPJoin4(UnpackAndBXOR, Type,BS,EQ);                \
313cd620004SJunchao Zhang     link->h_ScatterAndBAND  = CPPJoin4(ScatterAndBAND,Type,BS,EQ);                \
314cd620004SJunchao Zhang     link->h_ScatterAndBOR   = CPPJoin4(ScatterAndBOR, Type,BS,EQ);                \
315cd620004SJunchao Zhang     link->h_ScatterAndBXOR  = CPPJoin4(ScatterAndBXOR,Type,BS,EQ);                \
31640e23c03SJunchao Zhang   }
31740e23c03SJunchao Zhang 
318cd620004SJunchao Zhang /* Maxloc, Minloc ops */
319cd620004SJunchao Zhang #define DEF_Xloc(Type,BS,EQ)                                                      \
320cd620004SJunchao Zhang   DEF_UnpackAndOp (Type,BS,EQ,Max,>,OP_XLOC)                                      \
321cd620004SJunchao Zhang   DEF_UnpackAndOp (Type,BS,EQ,Min,<,OP_XLOC)                                      \
322cd620004SJunchao Zhang   DEF_ScatterAndOp(Type,BS,EQ,Max,>,OP_XLOC)                                      \
323cd620004SJunchao Zhang   DEF_ScatterAndOp(Type,BS,EQ,Min,<,OP_XLOC)                                      \
324cd620004SJunchao Zhang   static void CPPJoin4(PackInit_Xloc,Type,BS,EQ)(PetscSFLink link) {              \
325cd620004SJunchao Zhang     link->h_UnpackAndMaxloc  = CPPJoin4(UnpackAndMax, Type,BS,EQ);                \
326cd620004SJunchao Zhang     link->h_UnpackAndMinloc  = CPPJoin4(UnpackAndMin, Type,BS,EQ);                \
327cd620004SJunchao Zhang     link->h_ScatterAndMaxloc = CPPJoin4(ScatterAndMax,Type,BS,EQ);                \
328cd620004SJunchao Zhang     link->h_ScatterAndMinloc = CPPJoin4(ScatterAndMin,Type,BS,EQ);                \
32940e23c03SJunchao Zhang   }
33040e23c03SJunchao Zhang 
331b23bfdefSJunchao Zhang #define DEF_IntegerType(Type,BS,EQ)                                               \
332b23bfdefSJunchao Zhang   DEF_Pack(Type,BS,EQ)                                                            \
333b23bfdefSJunchao Zhang   DEF_Add(Type,BS,EQ)                                                             \
334b23bfdefSJunchao Zhang   DEF_Cmp(Type,BS,EQ)                                                             \
335b23bfdefSJunchao Zhang   DEF_Log(Type,BS,EQ)                                                             \
336b23bfdefSJunchao Zhang   DEF_Bit(Type,BS,EQ)                                                             \
337cd620004SJunchao Zhang   static void CPPJoin4(PackInit_IntegerType,Type,BS,EQ)(PetscSFLink link) {       \
338b23bfdefSJunchao Zhang     CPPJoin4(PackInit_Pack,Type,BS,EQ)(link);                                     \
339b23bfdefSJunchao Zhang     CPPJoin4(PackInit_Add,Type,BS,EQ)(link);                                      \
340b23bfdefSJunchao Zhang     CPPJoin4(PackInit_Compare,Type,BS,EQ)(link);                                  \
341b23bfdefSJunchao Zhang     CPPJoin4(PackInit_Logical,Type,BS,EQ)(link);                                  \
342b23bfdefSJunchao Zhang     CPPJoin4(PackInit_Bitwise,Type,BS,EQ)(link);                                  \
34340e23c03SJunchao Zhang   }
34440e23c03SJunchao Zhang 
345b23bfdefSJunchao Zhang #define DEF_RealType(Type,BS,EQ)                                                  \
346b23bfdefSJunchao Zhang   DEF_Pack(Type,BS,EQ)                                                            \
347b23bfdefSJunchao Zhang   DEF_Add(Type,BS,EQ)                                                             \
348b23bfdefSJunchao Zhang   DEF_Cmp(Type,BS,EQ)                                                             \
349cd620004SJunchao Zhang   static void CPPJoin4(PackInit_RealType,Type,BS,EQ)(PetscSFLink link) {          \
350b23bfdefSJunchao Zhang     CPPJoin4(PackInit_Pack,Type,BS,EQ)(link);                                     \
351b23bfdefSJunchao Zhang     CPPJoin4(PackInit_Add,Type,BS,EQ)(link);                                      \
352b23bfdefSJunchao Zhang     CPPJoin4(PackInit_Compare,Type,BS,EQ)(link);                                  \
353b23bfdefSJunchao Zhang   }
35440e23c03SJunchao Zhang 
35540e23c03SJunchao Zhang #if defined(PETSC_HAVE_COMPLEX)
356b23bfdefSJunchao Zhang #define DEF_ComplexType(Type,BS,EQ)                                               \
357b23bfdefSJunchao Zhang   DEF_Pack(Type,BS,EQ)                                                            \
358b23bfdefSJunchao Zhang   DEF_Add(Type,BS,EQ)                                                             \
359cd620004SJunchao Zhang   static void CPPJoin4(PackInit_ComplexType,Type,BS,EQ)(PetscSFLink link) {       \
360b23bfdefSJunchao Zhang     CPPJoin4(PackInit_Pack,Type,BS,EQ)(link);                                     \
361b23bfdefSJunchao Zhang     CPPJoin4(PackInit_Add,Type,BS,EQ)(link);                                      \
362b23bfdefSJunchao Zhang   }
36340e23c03SJunchao Zhang #endif
364b23bfdefSJunchao Zhang 
365b23bfdefSJunchao Zhang #define DEF_DumbType(Type,BS,EQ)                                                  \
366b23bfdefSJunchao Zhang   DEF_Pack(Type,BS,EQ)                                                            \
367cd620004SJunchao Zhang   static void CPPJoin4(PackInit_DumbType,Type,BS,EQ)(PetscSFLink link) {          \
368b23bfdefSJunchao Zhang     CPPJoin4(PackInit_Pack,Type,BS,EQ)(link);                                     \
369b23bfdefSJunchao Zhang   }
370b23bfdefSJunchao Zhang 
371b23bfdefSJunchao Zhang /* Maxloc, Minloc */
372cd620004SJunchao Zhang #define DEF_PairType(Type,BS,EQ)                                                  \
373cd620004SJunchao Zhang   DEF_Pack(Type,BS,EQ)                                                            \
374cd620004SJunchao Zhang   DEF_Xloc(Type,BS,EQ)                                                            \
375cd620004SJunchao Zhang   static void CPPJoin4(PackInit_PairType,Type,BS,EQ)(PetscSFLink link) {          \
376cd620004SJunchao Zhang     CPPJoin4(PackInit_Pack,Type,BS,EQ)(link);                                     \
377cd620004SJunchao Zhang     CPPJoin4(PackInit_Xloc,Type,BS,EQ)(link);                                     \
378b23bfdefSJunchao Zhang   }
379b23bfdefSJunchao Zhang 
380b23bfdefSJunchao Zhang DEF_IntegerType(PetscInt,1,1) /* unit = 1 MPIU_INT  */
381b23bfdefSJunchao Zhang DEF_IntegerType(PetscInt,2,1) /* unit = 2 MPIU_INTs */
382b23bfdefSJunchao Zhang DEF_IntegerType(PetscInt,4,1) /* unit = 4 MPIU_INTs */
383b23bfdefSJunchao Zhang DEF_IntegerType(PetscInt,8,1) /* unit = 8 MPIU_INTs */
384b23bfdefSJunchao Zhang DEF_IntegerType(PetscInt,1,0) /* unit = 1*n MPIU_INTs, n>1 */
385b23bfdefSJunchao Zhang DEF_IntegerType(PetscInt,2,0) /* unit = 2*n MPIU_INTs, n>1 */
386b23bfdefSJunchao Zhang DEF_IntegerType(PetscInt,4,0) /* unit = 4*n MPIU_INTs, n>1 */
387b23bfdefSJunchao Zhang DEF_IntegerType(PetscInt,8,0) /* unit = 8*n MPIU_INTs, n>1. Routines with bigger BS are tried first. */
388b23bfdefSJunchao Zhang 
389b23bfdefSJunchao Zhang #if defined(PETSC_USE_64BIT_INDICES) /* Do not need (though it is OK) to generate redundant functions if PetscInt is int */
390b23bfdefSJunchao Zhang DEF_IntegerType(int,1,1)
391b23bfdefSJunchao Zhang DEF_IntegerType(int,2,1)
392b23bfdefSJunchao Zhang DEF_IntegerType(int,4,1)
393b23bfdefSJunchao Zhang DEF_IntegerType(int,8,1)
394b23bfdefSJunchao Zhang DEF_IntegerType(int,1,0)
395b23bfdefSJunchao Zhang DEF_IntegerType(int,2,0)
396b23bfdefSJunchao Zhang DEF_IntegerType(int,4,0)
397b23bfdefSJunchao Zhang DEF_IntegerType(int,8,0)
398b23bfdefSJunchao Zhang #endif
399b23bfdefSJunchao Zhang 
400b23bfdefSJunchao Zhang /* The typedefs are used to get a typename without space that CPPJoin can handle */
401b23bfdefSJunchao Zhang typedef signed char SignedChar;
402b23bfdefSJunchao Zhang DEF_IntegerType(SignedChar,1,1)
403b23bfdefSJunchao Zhang DEF_IntegerType(SignedChar,2,1)
404b23bfdefSJunchao Zhang DEF_IntegerType(SignedChar,4,1)
405b23bfdefSJunchao Zhang DEF_IntegerType(SignedChar,8,1)
406b23bfdefSJunchao Zhang DEF_IntegerType(SignedChar,1,0)
407b23bfdefSJunchao Zhang DEF_IntegerType(SignedChar,2,0)
408b23bfdefSJunchao Zhang DEF_IntegerType(SignedChar,4,0)
409b23bfdefSJunchao Zhang DEF_IntegerType(SignedChar,8,0)
410b23bfdefSJunchao Zhang 
411b23bfdefSJunchao Zhang typedef unsigned char UnsignedChar;
412b23bfdefSJunchao Zhang DEF_IntegerType(UnsignedChar,1,1)
413b23bfdefSJunchao Zhang DEF_IntegerType(UnsignedChar,2,1)
414b23bfdefSJunchao Zhang DEF_IntegerType(UnsignedChar,4,1)
415b23bfdefSJunchao Zhang DEF_IntegerType(UnsignedChar,8,1)
416b23bfdefSJunchao Zhang DEF_IntegerType(UnsignedChar,1,0)
417b23bfdefSJunchao Zhang DEF_IntegerType(UnsignedChar,2,0)
418b23bfdefSJunchao Zhang DEF_IntegerType(UnsignedChar,4,0)
419b23bfdefSJunchao Zhang DEF_IntegerType(UnsignedChar,8,0)
420b23bfdefSJunchao Zhang 
421b23bfdefSJunchao Zhang DEF_RealType(PetscReal,1,1)
422b23bfdefSJunchao Zhang DEF_RealType(PetscReal,2,1)
423b23bfdefSJunchao Zhang DEF_RealType(PetscReal,4,1)
424b23bfdefSJunchao Zhang DEF_RealType(PetscReal,8,1)
425b23bfdefSJunchao Zhang DEF_RealType(PetscReal,1,0)
426b23bfdefSJunchao Zhang DEF_RealType(PetscReal,2,0)
427b23bfdefSJunchao Zhang DEF_RealType(PetscReal,4,0)
428b23bfdefSJunchao Zhang DEF_RealType(PetscReal,8,0)
429b23bfdefSJunchao Zhang 
430b23bfdefSJunchao Zhang #if defined(PETSC_HAVE_COMPLEX)
431b23bfdefSJunchao Zhang DEF_ComplexType(PetscComplex,1,1)
432b23bfdefSJunchao Zhang DEF_ComplexType(PetscComplex,2,1)
433b23bfdefSJunchao Zhang DEF_ComplexType(PetscComplex,4,1)
434b23bfdefSJunchao Zhang DEF_ComplexType(PetscComplex,8,1)
435b23bfdefSJunchao Zhang DEF_ComplexType(PetscComplex,1,0)
436b23bfdefSJunchao Zhang DEF_ComplexType(PetscComplex,2,0)
437b23bfdefSJunchao Zhang DEF_ComplexType(PetscComplex,4,0)
438b23bfdefSJunchao Zhang DEF_ComplexType(PetscComplex,8,0)
439b23bfdefSJunchao Zhang #endif
440b23bfdefSJunchao Zhang 
441cd620004SJunchao Zhang #define PairType(Type1,Type2) Type1##_##Type2
442cd620004SJunchao Zhang typedef struct {int u; int i;}           PairType(int,int);
443cd620004SJunchao Zhang typedef struct {PetscInt u; PetscInt i;} PairType(PetscInt,PetscInt);
444cd620004SJunchao Zhang DEF_PairType(PairType(int,int),1,1)
445cd620004SJunchao Zhang DEF_PairType(PairType(PetscInt,PetscInt),1,1)
446b23bfdefSJunchao Zhang 
447b23bfdefSJunchao Zhang /* If we don't know the basic type, we treat it as a stream of chars or ints */
448b23bfdefSJunchao Zhang DEF_DumbType(char,1,1)
449b23bfdefSJunchao Zhang DEF_DumbType(char,2,1)
450b23bfdefSJunchao Zhang DEF_DumbType(char,4,1)
451b23bfdefSJunchao Zhang DEF_DumbType(char,1,0)
452b23bfdefSJunchao Zhang DEF_DumbType(char,2,0)
453b23bfdefSJunchao Zhang DEF_DumbType(char,4,0)
454b23bfdefSJunchao Zhang 
455eb02082bSJunchao Zhang typedef int DumbInt; /* To have a different name than 'int' used above. The name is used to make routine names. */
456b23bfdefSJunchao Zhang DEF_DumbType(DumbInt,1,1)
457b23bfdefSJunchao Zhang DEF_DumbType(DumbInt,2,1)
458b23bfdefSJunchao Zhang DEF_DumbType(DumbInt,4,1)
459b23bfdefSJunchao Zhang DEF_DumbType(DumbInt,8,1)
460b23bfdefSJunchao Zhang DEF_DumbType(DumbInt,1,0)
461b23bfdefSJunchao Zhang DEF_DumbType(DumbInt,2,0)
462b23bfdefSJunchao Zhang DEF_DumbType(DumbInt,4,0)
463b23bfdefSJunchao Zhang DEF_DumbType(DumbInt,8,0)
46440e23c03SJunchao Zhang 
46540e23c03SJunchao Zhang #if !defined(PETSC_HAVE_MPI_TYPE_DUP)
46640e23c03SJunchao Zhang PETSC_STATIC_INLINE int MPI_Type_dup(MPI_Datatype datatype,MPI_Datatype *newtype)
46740e23c03SJunchao Zhang {
46840e23c03SJunchao Zhang   int ierr;
46940e23c03SJunchao Zhang   ierr = MPI_Type_contiguous(1,datatype,newtype); if (ierr) return ierr;
47040e23c03SJunchao Zhang   ierr = MPI_Type_commit(newtype); if (ierr) return ierr;
47140e23c03SJunchao Zhang   return MPI_SUCCESS;
47240e23c03SJunchao Zhang }
47340e23c03SJunchao Zhang #endif
47440e23c03SJunchao Zhang 
475cd620004SJunchao Zhang /*
476cd620004SJunchao Zhang    The routine Creates a communication link for the given operation. It first looks up its link cache. If
477cd620004SJunchao Zhang    there is a free & suitable one, it uses it. Otherwise it creates a new one.
478cd620004SJunchao Zhang 
479cd620004SJunchao Zhang    A link contains buffers and MPI requests for send/recv. It also contains pack/unpack routines to pack/unpack
480cd620004SJunchao Zhang    root/leafdata to/from these buffers. Buffers are allocated at our discretion. When we find root/leafata
481cd620004SJunchao Zhang    can be directly passed to MPI, we won't allocate them. Even we allocate buffers, we only allocate
482cd620004SJunchao Zhang    those that are needed by the given `sfop` and `op`, in other words, we do lazy memory-allocation.
483cd620004SJunchao Zhang 
484cd620004SJunchao Zhang    The routine also allocates buffers on CPU when one does not use gpu-aware MPI but data is on GPU.
485cd620004SJunchao Zhang 
486cd620004SJunchao Zhang    In SFBasic, MPI requests are persistent. They are init'ed until we try to get requests from a link.
487cd620004SJunchao Zhang 
488cd620004SJunchao Zhang    The routine is shared by SFBasic and SFNeighbor based on the fact they all deal with sparse graphs and
489cd620004SJunchao Zhang    need pack/unpack data.
490cd620004SJunchao Zhang */
491cd620004SJunchao Zhang PetscErrorCode PetscSFLinkCreate(PetscSF sf,MPI_Datatype unit,PetscMemType rootmtype,const void *rootdata,PetscMemType leafmtype,const void *leafdata,MPI_Op op,PetscSFOperation sfop,PetscSFLink *mylink)
49240e23c03SJunchao Zhang {
49340e23c03SJunchao Zhang   PetscErrorCode    ierr;
494cd620004SJunchao Zhang   PetscSF_Basic     *bas = (PetscSF_Basic*)sf->data;
495cd620004SJunchao Zhang   PetscInt          i,j,k,nrootreqs,nleafreqs,nreqs;
496cd620004SJunchao Zhang   PetscSFLink       *p,link;
497cd620004SJunchao Zhang   PetscSFDirection  direction;
498cd620004SJunchao Zhang   MPI_Request       *reqs = NULL;
499cd620004SJunchao Zhang   PetscBool         match,rootdirect[2],leafdirect[2];
500cd620004SJunchao Zhang   PetscMemType      rootmtype_mpi,leafmtype_mpi;   /* mtypes seen by MPI */
501cd620004SJunchao Zhang   PetscInt          rootdirect_mpi,leafdirect_mpi; /* root/leafdirect seen by MPI*/
502cd620004SJunchao Zhang 
503cd620004SJunchao Zhang   PetscFunctionBegin;
504cd620004SJunchao Zhang   ierr = PetscSFSetErrorOnUnsupportedOverlap(sf,unit,rootdata,leafdata);CHKERRQ(ierr);
505cd620004SJunchao Zhang 
506cd620004SJunchao Zhang   /* Can we directly use root/leafdirect with the given sf, sfop and op? */
507cd620004SJunchao Zhang   for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) {
508cd620004SJunchao Zhang     if (sfop == PETSCSF_BCAST) {
509cd620004SJunchao Zhang       rootdirect[i] = bas->rootcontig[i]; /* Pack roots */
510cd620004SJunchao Zhang       leafdirect[i] = (sf->leafcontig[i] && op == MPIU_REPLACE) ? PETSC_TRUE : PETSC_FALSE;  /* Unpack leaves */
511cd620004SJunchao Zhang     } else if (sfop == PETSCSF_REDUCE) {
512cd620004SJunchao Zhang       leafdirect[i] = sf->leafcontig[i];  /* Pack leaves */
513cd620004SJunchao Zhang       rootdirect[i] = (bas->rootcontig[i] && op == MPIU_REPLACE) ? PETSC_TRUE : PETSC_FALSE; /* Unpack roots */
514cd620004SJunchao Zhang     } else { /* PETSCSF_FETCH */
515cd620004SJunchao Zhang       rootdirect[i] = PETSC_FALSE; /* FETCH always need a separate rootbuf */
516cd620004SJunchao Zhang       leafdirect[i] = PETSC_FALSE; /* We also force allocating a separate leafbuf so that leafdata and leafupdate can share mpi requests */
517cd620004SJunchao Zhang     }
518cd620004SJunchao Zhang   }
519cd620004SJunchao Zhang 
520c2a741eeSJunchao Zhang   if (sf->use_gpu_aware_mpi) {
521cd620004SJunchao Zhang     rootmtype_mpi = rootmtype;
522cd620004SJunchao Zhang     leafmtype_mpi = leafmtype;
523cd620004SJunchao Zhang   } else {
524cd620004SJunchao Zhang     rootmtype_mpi = leafmtype_mpi = PETSC_MEMTYPE_HOST;
525cd620004SJunchao Zhang   }
526cd620004SJunchao Zhang   /* Will root/leafdata be directly accessed by MPI?  Without use_gpu_aware_mpi, device data is bufferred on host and then passed to MPI */
527cd620004SJunchao Zhang   rootdirect_mpi = rootdirect[PETSCSF_REMOTE] && (rootmtype_mpi == rootmtype)? 1 : 0;
528cd620004SJunchao Zhang   leafdirect_mpi = leafdirect[PETSCSF_REMOTE] && (leafmtype_mpi == leafmtype)? 1 : 0;
529cd620004SJunchao Zhang 
530cd620004SJunchao Zhang   direction = (sfop == PETSCSF_BCAST)? PETSCSF_ROOT2LEAF : PETSCSF_LEAF2ROOT;
531cd620004SJunchao Zhang   nrootreqs = bas->nrootreqs;
532cd620004SJunchao Zhang   nleafreqs = sf->nleafreqs;
533cd620004SJunchao Zhang 
534cd620004SJunchao Zhang   /* Look for free links in cache */
535cd620004SJunchao Zhang   for (p=&bas->avail; (link=*p); p=&link->next) {
536cd620004SJunchao Zhang     ierr = MPIPetsc_Type_compare(unit,link->unit,&match);CHKERRQ(ierr);
537cd620004SJunchao Zhang     if (match) {
538cd620004SJunchao Zhang       /* If root/leafdata will be directly passed to MPI, test if the data used to initialized the MPI requests matches with current.
539cd620004SJunchao Zhang          If not, free old requests. New requests will be lazily init'ed until one calls PetscSFLinkGetMPIBuffersAndRequests().
540cd620004SJunchao Zhang       */
541cd620004SJunchao Zhang       if (rootdirect_mpi && sf->persistent && link->rootreqsinited[direction][rootmtype][1] && link->rootdatadirect[direction][rootmtype] != rootdata) {
542cd620004SJunchao Zhang         reqs = link->rootreqs[direction][rootmtype][1]; /* Here, rootmtype = rootmtype_mpi */
543cd620004SJunchao Zhang         for (i=0; i<nrootreqs; i++) {if (reqs[i] != MPI_REQUEST_NULL) {ierr = MPI_Request_free(&reqs[i]);CHKERRQ(ierr);}}
544cd620004SJunchao Zhang         link->rootreqsinited[direction][rootmtype][1] = PETSC_FALSE;
545cd620004SJunchao Zhang       }
546cd620004SJunchao Zhang       if (leafdirect_mpi && sf->persistent && link->leafreqsinited[direction][leafmtype][1] && link->leafdatadirect[direction][leafmtype] != leafdata) {
547cd620004SJunchao Zhang         reqs = link->leafreqs[direction][leafmtype][1];
548cd620004SJunchao Zhang         for (i=0; i<nleafreqs; i++) {if (reqs[i] != MPI_REQUEST_NULL) {ierr = MPI_Request_free(&reqs[i]);CHKERRQ(ierr);}}
549cd620004SJunchao Zhang         link->leafreqsinited[direction][leafmtype][1] = PETSC_FALSE;
550cd620004SJunchao Zhang       }
551cd620004SJunchao Zhang       *p = link->next; /* Remove from available list */
552cd620004SJunchao Zhang       goto found;
553cd620004SJunchao Zhang     }
554cd620004SJunchao Zhang   }
555cd620004SJunchao Zhang 
556cd620004SJunchao Zhang   ierr = PetscNew(&link);CHKERRQ(ierr);
557cd620004SJunchao Zhang   ierr = PetscSFLinkSetUp_Host(sf,link,unit);CHKERRQ(ierr);
558cd620004SJunchao Zhang   ierr = PetscCommGetNewTag(PetscObjectComm((PetscObject)sf),&link->tag);CHKERRQ(ierr); /* One tag per link */
559cd620004SJunchao Zhang 
560cd620004SJunchao Zhang   nreqs = (nrootreqs+nleafreqs)*8;
561cd620004SJunchao Zhang   ierr  = PetscMalloc1(nreqs,&link->reqs);CHKERRQ(ierr);
562cd620004SJunchao Zhang   for (i=0; i<nreqs; i++) link->reqs[i] = MPI_REQUEST_NULL; /* Initialized to NULL so that we know which need to be freed in Destroy */
563cd620004SJunchao Zhang 
564cd620004SJunchao Zhang   for (i=0; i<2; i++) { /* Two communication directions */
565cd620004SJunchao Zhang     for (j=0; j<2; j++) { /* Two memory types */
566cd620004SJunchao Zhang       for (k=0; k<2; k++) { /* root/leafdirect 0 or 1 */
567cd620004SJunchao Zhang         link->rootreqs[i][j][k] = link->reqs + nrootreqs*(4*i+2*j+k);
568cd620004SJunchao Zhang         link->leafreqs[i][j][k] = link->reqs + nrootreqs*8 + nleafreqs*(4*i+2*j+k);
569cd620004SJunchao Zhang       }
570cd620004SJunchao Zhang     }
571cd620004SJunchao Zhang   }
572cd620004SJunchao Zhang 
573cd620004SJunchao Zhang found:
574*7fd2d3dbSJunchao Zhang #if defined(PETSC_HAVE_DEVICE)
575cd620004SJunchao Zhang   if ((rootmtype == PETSC_MEMTYPE_DEVICE || leafmtype == PETSC_MEMTYPE_DEVICE) && !link->deviceinited) {ierr = PetscSFLinkSetUp_Device(sf,link,unit);CHKERRQ(ierr);}
576*7fd2d3dbSJunchao Zhang #endif
577cd620004SJunchao Zhang 
578cd620004SJunchao Zhang   /* Allocate buffers along root/leafdata */
579cd620004SJunchao Zhang   for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) {
580cd620004SJunchao Zhang     /* For local communication, buffers are only needed when roots and leaves have different mtypes */
581cd620004SJunchao Zhang     if (i == PETSCSF_LOCAL && rootmtype == leafmtype) continue;
582cd620004SJunchao Zhang     if (bas->rootbuflen[i]) {
583cd620004SJunchao Zhang       if (rootdirect[i]) { /* Aha, we disguise rootdata as rootbuf */
584cd620004SJunchao Zhang         link->rootbuf[i][rootmtype] = (char*)rootdata + bas->rootstart[i]*link->unitbytes;
585cd620004SJunchao Zhang       } else { /* Have to have a separate rootbuf */
586cd620004SJunchao Zhang         if (!link->rootbuf_alloc[i][rootmtype]) {
587*7fd2d3dbSJunchao Zhang           ierr = PetscSFMalloc(rootmtype,bas->rootbuflen[i]*link->unitbytes,(void**)&link->rootbuf_alloc[i][rootmtype]);CHKERRQ(ierr);
588cd620004SJunchao Zhang         }
589cd620004SJunchao Zhang         link->rootbuf[i][rootmtype] = link->rootbuf_alloc[i][rootmtype];
590cd620004SJunchao Zhang       }
591cd620004SJunchao Zhang     }
592cd620004SJunchao Zhang 
593cd620004SJunchao Zhang     if (sf->leafbuflen[i]) {
594cd620004SJunchao Zhang       if (leafdirect[i]) {
595cd620004SJunchao Zhang         link->leafbuf[i][leafmtype] = (char*)leafdata + sf->leafstart[i]*link->unitbytes;
596cd620004SJunchao Zhang       } else {
597cd620004SJunchao Zhang         if (!link->leafbuf_alloc[i][leafmtype]) {
598*7fd2d3dbSJunchao Zhang           ierr = PetscSFMalloc(leafmtype,sf->leafbuflen[i]*link->unitbytes,(void**)&link->leafbuf_alloc[i][leafmtype]);CHKERRQ(ierr);
599cd620004SJunchao Zhang         }
600cd620004SJunchao Zhang         link->leafbuf[i][leafmtype] = link->leafbuf_alloc[i][leafmtype];
601cd620004SJunchao Zhang       }
602cd620004SJunchao Zhang     }
603cd620004SJunchao Zhang   }
604cd620004SJunchao Zhang 
605*7fd2d3dbSJunchao Zhang #if defined(PETSC_HAVE_DEVICE)
606cd620004SJunchao Zhang   /* Allocate buffers on host for buffering data on device in cast not use_gpu_aware_mpi */
607cd620004SJunchao Zhang   if (rootmtype == PETSC_MEMTYPE_DEVICE && rootmtype_mpi == PETSC_MEMTYPE_HOST) {
608cd620004SJunchao Zhang     if (!link->rootbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]) {
609cd620004SJunchao Zhang       ierr = PetscMalloc(bas->rootbuflen[PETSCSF_REMOTE]*link->unitbytes,&link->rootbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]);CHKERRQ(ierr);
610cd620004SJunchao Zhang     }
611cd620004SJunchao Zhang     link->rootbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST] = link->rootbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST];
612cd620004SJunchao Zhang   }
613cd620004SJunchao Zhang   if (leafmtype == PETSC_MEMTYPE_DEVICE && leafmtype_mpi == PETSC_MEMTYPE_HOST) {
614cd620004SJunchao Zhang     if (!link->leafbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]) {
615cd620004SJunchao Zhang       ierr = PetscMalloc(sf->leafbuflen[PETSCSF_REMOTE]*link->unitbytes,&link->leafbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]);CHKERRQ(ierr);
616cd620004SJunchao Zhang     }
617cd620004SJunchao Zhang     link->leafbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST] = link->leafbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST];
618cd620004SJunchao Zhang   }
619*7fd2d3dbSJunchao Zhang #endif
620cd620004SJunchao Zhang 
621cd620004SJunchao Zhang   /* Set `current` state of the link. They may change between different SF invocations with the same link */
622cd620004SJunchao Zhang   if (sf->persistent) { /* If data is directly passed to MPI and inits MPI requests, record the data for comparison on future invocations */
623cd620004SJunchao Zhang     if (rootdirect_mpi) link->rootdatadirect[direction][rootmtype] = rootdata;
624cd620004SJunchao Zhang     if (leafdirect_mpi) link->leafdatadirect[direction][leafmtype] = leafdata;
625cd620004SJunchao Zhang   }
626cd620004SJunchao Zhang 
627cd620004SJunchao Zhang   link->rootdata = rootdata; /* root/leafdata are keys to look up links in PetscSFXxxEnd */
628cd620004SJunchao Zhang   link->leafdata = leafdata;
629cd620004SJunchao Zhang   for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) {
630cd620004SJunchao Zhang     link->rootdirect[i] = rootdirect[i];
631cd620004SJunchao Zhang     link->leafdirect[i] = leafdirect[i];
632cd620004SJunchao Zhang   }
633cd620004SJunchao Zhang   link->rootdirect_mpi  = rootdirect_mpi;
634cd620004SJunchao Zhang   link->leafdirect_mpi  = leafdirect_mpi;
635cd620004SJunchao Zhang   link->rootmtype       = rootmtype;
636cd620004SJunchao Zhang   link->leafmtype       = leafmtype;
637cd620004SJunchao Zhang   link->rootmtype_mpi   = rootmtype_mpi;
638cd620004SJunchao Zhang   link->leafmtype_mpi   = leafmtype_mpi;
639cd620004SJunchao Zhang 
640cd620004SJunchao Zhang   link->next            = bas->inuse;
641cd620004SJunchao Zhang   bas->inuse            = link;
642cd620004SJunchao Zhang   *mylink               = link;
643cd620004SJunchao Zhang   PetscFunctionReturn(0);
644cd620004SJunchao Zhang }
645cd620004SJunchao Zhang 
646cd620004SJunchao Zhang /* Return root/leaf buffers and MPI requests attached to the link for MPI communication in the given direction.
647cd620004SJunchao Zhang    If the sf uses persistent requests and the requests have not been initialized, then initialize them.
648cd620004SJunchao Zhang */
649cd620004SJunchao Zhang PetscErrorCode PetscSFLinkGetMPIBuffersAndRequests(PetscSF sf,PetscSFLink link,PetscSFDirection direction,void **rootbuf, void **leafbuf,MPI_Request **rootreqs,MPI_Request **leafreqs)
650cd620004SJunchao Zhang {
651cd620004SJunchao Zhang   PetscErrorCode       ierr;
652cd620004SJunchao Zhang   PetscSF_Basic        *bas = (PetscSF_Basic*)sf->data;
653cd620004SJunchao Zhang   PetscInt             i,j,nrootranks,ndrootranks,nleafranks,ndleafranks;
654cd620004SJunchao Zhang   const PetscInt       *rootoffset,*leafoffset;
655cd620004SJunchao Zhang   PetscMPIInt          n;
656cd620004SJunchao Zhang   MPI_Aint             disp;
657cd620004SJunchao Zhang   MPI_Comm             comm = PetscObjectComm((PetscObject)sf);
658cd620004SJunchao Zhang   MPI_Datatype         unit = link->unit;
659cd620004SJunchao Zhang   const PetscMemType   rootmtype_mpi = link->rootmtype_mpi,leafmtype_mpi = link->leafmtype_mpi; /* Used to select buffers passed to MPI */
660cd620004SJunchao Zhang   const PetscInt       rootdirect_mpi = link->rootdirect_mpi,leafdirect_mpi = link->leafdirect_mpi;
661cd620004SJunchao Zhang 
662cd620004SJunchao Zhang   PetscFunctionBegin;
663cd620004SJunchao Zhang   /* Init persistent MPI requests if not yet. Currently only SFBasic uses persistent MPI */
664cd620004SJunchao Zhang   if (sf->persistent) {
665cd620004SJunchao Zhang     if (rootreqs && bas->rootbuflen[PETSCSF_REMOTE] && !link->rootreqsinited[direction][rootmtype_mpi][rootdirect_mpi]) {
666cd620004SJunchao Zhang       ierr = PetscSFGetRootInfo_Basic(sf,&nrootranks,&ndrootranks,NULL,&rootoffset,NULL);CHKERRQ(ierr);
667cd620004SJunchao Zhang       if (direction == PETSCSF_LEAF2ROOT) {
668cd620004SJunchao Zhang         for (i=ndrootranks,j=0; i<nrootranks; i++,j++) {
669cd620004SJunchao Zhang           disp = (rootoffset[i] - rootoffset[ndrootranks])*link->unitbytes;
670cd620004SJunchao Zhang           ierr = PetscMPIIntCast(rootoffset[i+1]-rootoffset[i],&n);CHKERRQ(ierr);
671cd620004SJunchao Zhang           ierr = MPI_Recv_init(link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi]+disp,n,unit,bas->iranks[i],link->tag,comm,link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi]+j);CHKERRQ(ierr);
672cd620004SJunchao Zhang         }
673cd620004SJunchao Zhang       } else { /* PETSCSF_ROOT2LEAF */
674cd620004SJunchao Zhang         for (i=ndrootranks,j=0; i<nrootranks; i++,j++) {
675cd620004SJunchao Zhang           disp = (rootoffset[i] - rootoffset[ndrootranks])*link->unitbytes;
676cd620004SJunchao Zhang           ierr = PetscMPIIntCast(rootoffset[i+1]-rootoffset[i],&n);CHKERRQ(ierr);
677cd620004SJunchao Zhang           ierr = MPI_Send_init(link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi]+disp,n,unit,bas->iranks[i],link->tag,comm,link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi]+j);CHKERRQ(ierr);
678cd620004SJunchao Zhang         }
679cd620004SJunchao Zhang       }
680cd620004SJunchao Zhang       link->rootreqsinited[direction][rootmtype_mpi][rootdirect_mpi] = PETSC_TRUE;
681cd620004SJunchao Zhang     }
682cd620004SJunchao Zhang 
683cd620004SJunchao Zhang     if (leafreqs && sf->leafbuflen[PETSCSF_REMOTE] && !link->leafreqsinited[direction][leafmtype_mpi][leafdirect_mpi]) {
684cd620004SJunchao Zhang       ierr = PetscSFGetLeafInfo_Basic(sf,&nleafranks,&ndleafranks,NULL,&leafoffset,NULL,NULL);CHKERRQ(ierr);
685cd620004SJunchao Zhang       if (direction == PETSCSF_LEAF2ROOT) {
686cd620004SJunchao Zhang         for (i=ndleafranks,j=0; i<nleafranks; i++,j++) {
687cd620004SJunchao Zhang           disp = (leafoffset[i] - leafoffset[ndleafranks])*link->unitbytes;
688cd620004SJunchao Zhang           ierr = PetscMPIIntCast(leafoffset[i+1]-leafoffset[i],&n);CHKERRQ(ierr);
689cd620004SJunchao Zhang           ierr = MPI_Send_init(link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi]+disp,n,unit,sf->ranks[i],link->tag,comm,link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi]+j);CHKERRQ(ierr);
690cd620004SJunchao Zhang         }
691cd620004SJunchao Zhang       } else { /* PETSCSF_ROOT2LEAF */
692cd620004SJunchao Zhang         for (i=ndleafranks,j=0; i<nleafranks; i++,j++) {
693cd620004SJunchao Zhang           disp = (leafoffset[i] - leafoffset[ndleafranks])*link->unitbytes;
694cd620004SJunchao Zhang           ierr = PetscMPIIntCast(leafoffset[i+1]-leafoffset[i],&n);CHKERRQ(ierr);
695cd620004SJunchao Zhang           ierr = MPI_Recv_init(link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi]+disp,n,unit,sf->ranks[i],link->tag,comm,link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi]+j);CHKERRQ(ierr);
696cd620004SJunchao Zhang         }
697cd620004SJunchao Zhang       }
698cd620004SJunchao Zhang       link->leafreqsinited[direction][leafmtype_mpi][leafdirect_mpi] = PETSC_TRUE;
699cd620004SJunchao Zhang     }
700cd620004SJunchao Zhang   }
701cd620004SJunchao Zhang   if (rootbuf)  *rootbuf  = link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi];
702cd620004SJunchao Zhang   if (leafbuf)  *leafbuf  = link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi];
703cd620004SJunchao Zhang   if (rootreqs) *rootreqs = link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi];
704cd620004SJunchao Zhang   if (leafreqs) *leafreqs = link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi];
705cd620004SJunchao Zhang   PetscFunctionReturn(0);
706cd620004SJunchao Zhang }
707cd620004SJunchao Zhang 
708cd620004SJunchao Zhang 
709cd620004SJunchao Zhang PetscErrorCode PetscSFLinkGetInUse(PetscSF sf,MPI_Datatype unit,const void *rootdata,const void *leafdata,PetscCopyMode cmode,PetscSFLink *mylink)
710cd620004SJunchao Zhang {
711cd620004SJunchao Zhang   PetscErrorCode    ierr;
712cd620004SJunchao Zhang   PetscSFLink       link,*p;
71340e23c03SJunchao Zhang   PetscSF_Basic     *bas=(PetscSF_Basic*)sf->data;
71440e23c03SJunchao Zhang 
71540e23c03SJunchao Zhang   PetscFunctionBegin;
71640e23c03SJunchao Zhang   /* Look for types in cache */
71740e23c03SJunchao Zhang   for (p=&bas->inuse; (link=*p); p=&link->next) {
71840e23c03SJunchao Zhang     PetscBool match;
71940e23c03SJunchao Zhang     ierr = MPIPetsc_Type_compare(unit,link->unit,&match);CHKERRQ(ierr);
720637e6665SJunchao Zhang     if (match && (rootdata == link->rootdata) && (leafdata == link->leafdata)) {
72140e23c03SJunchao Zhang       switch (cmode) {
72240e23c03SJunchao Zhang       case PETSC_OWN_POINTER: *p = link->next; break; /* Remove from inuse list */
72340e23c03SJunchao Zhang       case PETSC_USE_POINTER: break;
72440e23c03SJunchao Zhang       default: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"invalid cmode");
72540e23c03SJunchao Zhang       }
72640e23c03SJunchao Zhang       *mylink = link;
72740e23c03SJunchao Zhang       PetscFunctionReturn(0);
72840e23c03SJunchao Zhang     }
72940e23c03SJunchao Zhang   }
73040e23c03SJunchao Zhang   SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Could not find pack");
73140e23c03SJunchao Zhang   PetscFunctionReturn(0);
73240e23c03SJunchao Zhang }
73340e23c03SJunchao Zhang 
734cd620004SJunchao Zhang PetscErrorCode PetscSFLinkReclaim(PetscSF sf,PetscSFLink *link)
73540e23c03SJunchao Zhang {
73640e23c03SJunchao Zhang   PetscSF_Basic     *bas = (PetscSF_Basic*)sf->data;
73740e23c03SJunchao Zhang 
73840e23c03SJunchao Zhang   PetscFunctionBegin;
739637e6665SJunchao Zhang   (*link)->rootdata = NULL;
740637e6665SJunchao Zhang   (*link)->leafdata = NULL;
74140e23c03SJunchao Zhang   (*link)->next     = bas->avail;
74240e23c03SJunchao Zhang   bas->avail        = *link;
74340e23c03SJunchao Zhang   *link             = NULL;
74440e23c03SJunchao Zhang   PetscFunctionReturn(0);
74540e23c03SJunchao Zhang }
74640e23c03SJunchao Zhang 
747cd620004SJunchao Zhang /* Destroy all links chained in 'avail' */
748cd620004SJunchao Zhang PetscErrorCode PetscSFLinkDestroy(PetscSF sf,PetscSFLink *avail)
749eb02082bSJunchao Zhang {
750eb02082bSJunchao Zhang   PetscErrorCode    ierr;
751cd620004SJunchao Zhang   PetscSFLink       link = *avail,next;
752cd620004SJunchao Zhang   PetscSF_Basic     *bas = (PetscSF_Basic*)sf->data;
753cd620004SJunchao Zhang   PetscInt          i,nreqs = (bas->nrootreqs+sf->nleafreqs)*8;
754eb02082bSJunchao Zhang 
755eb02082bSJunchao Zhang   PetscFunctionBegin;
756eb02082bSJunchao Zhang   for (; link; link=next) {
757eb02082bSJunchao Zhang     next = link->next;
758eb02082bSJunchao Zhang     if (!link->isbuiltin) {ierr = MPI_Type_free(&link->unit);CHKERRQ(ierr);}
759cd620004SJunchao Zhang     for (i=0; i<nreqs; i++) { /* Persistent reqs must be freed. */
760eb02082bSJunchao Zhang       if (link->reqs[i] != MPI_REQUEST_NULL) {ierr = MPI_Request_free(&link->reqs[i]);CHKERRQ(ierr);}
761eb02082bSJunchao Zhang     }
762eb02082bSJunchao Zhang     ierr = PetscFree(link->reqs);CHKERRQ(ierr);
763cd620004SJunchao Zhang     for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) {
764cd620004SJunchao Zhang       ierr = PetscFree(link->rootbuf_alloc[i][PETSC_MEMTYPE_HOST]);CHKERRQ(ierr);
765cd620004SJunchao Zhang       ierr = PetscFree(link->leafbuf_alloc[i][PETSC_MEMTYPE_HOST]);CHKERRQ(ierr);
766*7fd2d3dbSJunchao Zhang #if defined(PETSC_HAVE_DEVICE)
767*7fd2d3dbSJunchao Zhang       ierr = PetscSFFree(PETSC_MEMTYPE_DEVICE,link->rootbuf_alloc[i][PETSC_MEMTYPE_DEVICE]);CHKERRQ(ierr);
768*7fd2d3dbSJunchao Zhang       ierr = PetscSFFree(PETSC_MEMTYPE_DEVICE,link->leafbuf_alloc[i][PETSC_MEMTYPE_DEVICE]);CHKERRQ(ierr);
769*7fd2d3dbSJunchao Zhang #endif
770cd620004SJunchao Zhang     }
771*7fd2d3dbSJunchao Zhang #if defined(PETSC_HAVE_DEVICE)
772*7fd2d3dbSJunchao Zhang     if (link->Destroy) {ierr = (*link->Destroy)(link);CHKERRQ(ierr);}
773*7fd2d3dbSJunchao Zhang   #if defined(PETSC_HAVE_CUDA)
774*7fd2d3dbSJunchao Zhang     if (link->stream) {cudaError_t cerr = cudaStreamDestroy(link->stream);CHKERRCUDA(cerr); link->stream = NULL;}
775*7fd2d3dbSJunchao Zhang   #elif defined(PETSC_HAVE_HIP)
776*7fd2d3dbSJunchao Zhang     if (link->stream) {hipError_t  cerr = hipStreamDestroy(link->stream);CHKERRQ(cerr); link->stream = NULL;} /* TODO: CHKERRHIP? */
777*7fd2d3dbSJunchao Zhang   #endif
778*7fd2d3dbSJunchao Zhang #endif
779eb02082bSJunchao Zhang     ierr = PetscFree(link);CHKERRQ(ierr);
780eb02082bSJunchao Zhang   }
781eb02082bSJunchao Zhang   *avail = NULL;
782eb02082bSJunchao Zhang   PetscFunctionReturn(0);
783eb02082bSJunchao Zhang }
784eb02082bSJunchao Zhang 
7859d1c8addSJunchao Zhang /* Error out on unsupported overlapped communications */
786cd620004SJunchao Zhang PetscErrorCode PetscSFSetErrorOnUnsupportedOverlap(PetscSF sf,MPI_Datatype unit,const void *rootdata,const void *leafdata)
7879d1c8addSJunchao Zhang {
7889d1c8addSJunchao Zhang   PetscErrorCode    ierr;
789cd620004SJunchao Zhang   PetscSFLink       link,*p;
7909d1c8addSJunchao Zhang   PetscSF_Basic     *bas = (PetscSF_Basic*)sf->data;
7919d1c8addSJunchao Zhang   PetscBool         match;
7929d1c8addSJunchao Zhang 
7939d1c8addSJunchao Zhang   PetscFunctionBegin;
7944e1ad211SJed Brown   if (!PetscDefined(USE_DEBUG)) PetscFunctionReturn(0);
79518fb5014SJunchao Zhang   /* Look up links in use and error out if there is a match. When both rootdata and leafdata are NULL, ignore
79618fb5014SJunchao Zhang      the potential overlapping since this process does not participate in communication. Overlapping is harmless.
79718fb5014SJunchao Zhang   */
798637e6665SJunchao Zhang   if (rootdata || leafdata) {
7999d1c8addSJunchao Zhang     for (p=&bas->inuse; (link=*p); p=&link->next) {
8009d1c8addSJunchao Zhang       ierr = MPIPetsc_Type_compare(unit,link->unit,&match);CHKERRQ(ierr);
801cd620004SJunchao Zhang       if (match && (rootdata == link->rootdata) && (leafdata == link->leafdata)) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_SUP,"Overlapped PetscSF with the same rootdata(%p), leafdata(%p) and data type. Undo the overlapping to avoid the error.",rootdata,leafdata);
8029d1c8addSJunchao Zhang     }
80318fb5014SJunchao Zhang   }
8049d1c8addSJunchao Zhang   PetscFunctionReturn(0);
8059d1c8addSJunchao Zhang }
8069d1c8addSJunchao Zhang 
807cd620004SJunchao Zhang PetscErrorCode PetscSFLinkSetUp_Host(PetscSF sf,PetscSFLink link,MPI_Datatype unit)
80840e23c03SJunchao Zhang {
80940e23c03SJunchao Zhang   PetscErrorCode ierr;
810b23bfdefSJunchao Zhang   PetscInt       nSignedChar=0,nUnsignedChar=0,nInt=0,nPetscInt=0,nPetscReal=0;
811b23bfdefSJunchao Zhang   PetscBool      is2Int,is2PetscInt;
81240e23c03SJunchao Zhang   PetscMPIInt    ni,na,nd,combiner;
81340e23c03SJunchao Zhang #if defined(PETSC_HAVE_COMPLEX)
814b23bfdefSJunchao Zhang   PetscInt       nPetscComplex=0;
81540e23c03SJunchao Zhang #endif
81640e23c03SJunchao Zhang 
81740e23c03SJunchao Zhang   PetscFunctionBegin;
818b23bfdefSJunchao Zhang   ierr = MPIPetsc_Type_compare_contig(unit,MPI_SIGNED_CHAR,  &nSignedChar);CHKERRQ(ierr);
819b23bfdefSJunchao Zhang   ierr = MPIPetsc_Type_compare_contig(unit,MPI_UNSIGNED_CHAR,&nUnsignedChar);CHKERRQ(ierr);
820b23bfdefSJunchao Zhang   /* MPI_CHAR is treated below as a dumb type that does not support reduction according to MPI standard */
821b23bfdefSJunchao Zhang   ierr = MPIPetsc_Type_compare_contig(unit,MPI_INT,  &nInt);CHKERRQ(ierr);
822b23bfdefSJunchao Zhang   ierr = MPIPetsc_Type_compare_contig(unit,MPIU_INT, &nPetscInt);CHKERRQ(ierr);
823b23bfdefSJunchao Zhang   ierr = MPIPetsc_Type_compare_contig(unit,MPIU_REAL,&nPetscReal);CHKERRQ(ierr);
82440e23c03SJunchao Zhang #if defined(PETSC_HAVE_COMPLEX)
825b23bfdefSJunchao Zhang   ierr = MPIPetsc_Type_compare_contig(unit,MPIU_COMPLEX,&nPetscComplex);CHKERRQ(ierr);
82640e23c03SJunchao Zhang #endif
82740e23c03SJunchao Zhang   ierr = MPIPetsc_Type_compare(unit,MPI_2INT,&is2Int);CHKERRQ(ierr);
82840e23c03SJunchao Zhang   ierr = MPIPetsc_Type_compare(unit,MPIU_2INT,&is2PetscInt);CHKERRQ(ierr);
829b23bfdefSJunchao Zhang   /* TODO: shaell we also handle Fortran MPI_2REAL? */
83040e23c03SJunchao Zhang   ierr = MPI_Type_get_envelope(unit,&ni,&na,&nd,&combiner);CHKERRQ(ierr);
8315ad15460SJunchao Zhang   link->isbuiltin = (combiner == MPI_COMBINER_NAMED) ? PETSC_TRUE : PETSC_FALSE; /* unit is MPI builtin */
832b23bfdefSJunchao Zhang   link->bs = 1; /* default */
83340e23c03SJunchao Zhang 
834eb02082bSJunchao Zhang   if (is2Int) {
835cd620004SJunchao Zhang     PackInit_PairType_int_int_1_1(link);
836eb02082bSJunchao Zhang     link->bs        = 1;
837eb02082bSJunchao Zhang     link->unitbytes = 2*sizeof(int);
8385ad15460SJunchao Zhang     link->isbuiltin = PETSC_TRUE; /* unit is PETSc builtin */
839eb02082bSJunchao Zhang     link->basicunit = MPI_2INT;
8405ad15460SJunchao Zhang     link->unit      = MPI_2INT;
841eb02082bSJunchao Zhang   } else if (is2PetscInt) { /* TODO: when is2PetscInt and nPetscInt=2, we don't know which path to take. The two paths support different ops. */
842cd620004SJunchao Zhang     PackInit_PairType_PetscInt_PetscInt_1_1(link);
843eb02082bSJunchao Zhang     link->bs        = 1;
844eb02082bSJunchao Zhang     link->unitbytes = 2*sizeof(PetscInt);
845eb02082bSJunchao Zhang     link->basicunit = MPIU_2INT;
8465ad15460SJunchao Zhang     link->isbuiltin = PETSC_TRUE; /* unit is PETSc builtin */
8475ad15460SJunchao Zhang     link->unit      = MPIU_2INT;
848eb02082bSJunchao Zhang   } else if (nPetscReal) {
849b23bfdefSJunchao Zhang     if      (nPetscReal == 8) PackInit_RealType_PetscReal_8_1(link); else if (nPetscReal%8 == 0) PackInit_RealType_PetscReal_8_0(link);
850b23bfdefSJunchao Zhang     else if (nPetscReal == 4) PackInit_RealType_PetscReal_4_1(link); else if (nPetscReal%4 == 0) PackInit_RealType_PetscReal_4_0(link);
851b23bfdefSJunchao Zhang     else if (nPetscReal == 2) PackInit_RealType_PetscReal_2_1(link); else if (nPetscReal%2 == 0) PackInit_RealType_PetscReal_2_0(link);
852b23bfdefSJunchao Zhang     else if (nPetscReal == 1) PackInit_RealType_PetscReal_1_1(link); else if (nPetscReal%1 == 0) PackInit_RealType_PetscReal_1_0(link);
853b23bfdefSJunchao Zhang     link->bs        = nPetscReal;
854eb02082bSJunchao Zhang     link->unitbytes = nPetscReal*sizeof(PetscReal);
85540e23c03SJunchao Zhang     link->basicunit = MPIU_REAL;
8565ad15460SJunchao Zhang     if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPIU_REAL;}
857b23bfdefSJunchao Zhang   } else if (nPetscInt) {
858b23bfdefSJunchao Zhang     if      (nPetscInt == 8) PackInit_IntegerType_PetscInt_8_1(link); else if (nPetscInt%8 == 0) PackInit_IntegerType_PetscInt_8_0(link);
859b23bfdefSJunchao Zhang     else if (nPetscInt == 4) PackInit_IntegerType_PetscInt_4_1(link); else if (nPetscInt%4 == 0) PackInit_IntegerType_PetscInt_4_0(link);
860b23bfdefSJunchao Zhang     else if (nPetscInt == 2) PackInit_IntegerType_PetscInt_2_1(link); else if (nPetscInt%2 == 0) PackInit_IntegerType_PetscInt_2_0(link);
861b23bfdefSJunchao Zhang     else if (nPetscInt == 1) PackInit_IntegerType_PetscInt_1_1(link); else if (nPetscInt%1 == 0) PackInit_IntegerType_PetscInt_1_0(link);
862b23bfdefSJunchao Zhang     link->bs        = nPetscInt;
863eb02082bSJunchao Zhang     link->unitbytes = nPetscInt*sizeof(PetscInt);
864b23bfdefSJunchao Zhang     link->basicunit = MPIU_INT;
8655ad15460SJunchao Zhang     if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPIU_INT;}
866b23bfdefSJunchao Zhang #if defined(PETSC_USE_64BIT_INDICES)
867b23bfdefSJunchao Zhang   } else if (nInt) {
868b23bfdefSJunchao Zhang     if      (nInt == 8) PackInit_IntegerType_int_8_1(link); else if (nInt%8 == 0) PackInit_IntegerType_int_8_0(link);
869b23bfdefSJunchao Zhang     else if (nInt == 4) PackInit_IntegerType_int_4_1(link); else if (nInt%4 == 0) PackInit_IntegerType_int_4_0(link);
870b23bfdefSJunchao Zhang     else if (nInt == 2) PackInit_IntegerType_int_2_1(link); else if (nInt%2 == 0) PackInit_IntegerType_int_2_0(link);
871b23bfdefSJunchao Zhang     else if (nInt == 1) PackInit_IntegerType_int_1_1(link); else if (nInt%1 == 0) PackInit_IntegerType_int_1_0(link);
872b23bfdefSJunchao Zhang     link->bs        = nInt;
873eb02082bSJunchao Zhang     link->unitbytes = nInt*sizeof(int);
874b23bfdefSJunchao Zhang     link->basicunit = MPI_INT;
8755ad15460SJunchao Zhang     if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPI_INT;}
876b23bfdefSJunchao Zhang #endif
877b23bfdefSJunchao Zhang   } else if (nSignedChar) {
878b23bfdefSJunchao Zhang     if      (nSignedChar == 8) PackInit_IntegerType_SignedChar_8_1(link); else if (nSignedChar%8 == 0) PackInit_IntegerType_SignedChar_8_0(link);
879b23bfdefSJunchao Zhang     else if (nSignedChar == 4) PackInit_IntegerType_SignedChar_4_1(link); else if (nSignedChar%4 == 0) PackInit_IntegerType_SignedChar_4_0(link);
880b23bfdefSJunchao Zhang     else if (nSignedChar == 2) PackInit_IntegerType_SignedChar_2_1(link); else if (nSignedChar%2 == 0) PackInit_IntegerType_SignedChar_2_0(link);
881b23bfdefSJunchao Zhang     else if (nSignedChar == 1) PackInit_IntegerType_SignedChar_1_1(link); else if (nSignedChar%1 == 0) PackInit_IntegerType_SignedChar_1_0(link);
882b23bfdefSJunchao Zhang     link->bs        = nSignedChar;
883eb02082bSJunchao Zhang     link->unitbytes = nSignedChar*sizeof(SignedChar);
884b23bfdefSJunchao Zhang     link->basicunit = MPI_SIGNED_CHAR;
8855ad15460SJunchao Zhang     if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPI_SIGNED_CHAR;}
886b23bfdefSJunchao Zhang   }  else if (nUnsignedChar) {
887b23bfdefSJunchao Zhang     if      (nUnsignedChar == 8) PackInit_IntegerType_UnsignedChar_8_1(link); else if (nUnsignedChar%8 == 0) PackInit_IntegerType_UnsignedChar_8_0(link);
888b23bfdefSJunchao Zhang     else if (nUnsignedChar == 4) PackInit_IntegerType_UnsignedChar_4_1(link); else if (nUnsignedChar%4 == 0) PackInit_IntegerType_UnsignedChar_4_0(link);
889b23bfdefSJunchao Zhang     else if (nUnsignedChar == 2) PackInit_IntegerType_UnsignedChar_2_1(link); else if (nUnsignedChar%2 == 0) PackInit_IntegerType_UnsignedChar_2_0(link);
890b23bfdefSJunchao Zhang     else if (nUnsignedChar == 1) PackInit_IntegerType_UnsignedChar_1_1(link); else if (nUnsignedChar%1 == 0) PackInit_IntegerType_UnsignedChar_1_0(link);
891b23bfdefSJunchao Zhang     link->bs        = nUnsignedChar;
892eb02082bSJunchao Zhang     link->unitbytes = nUnsignedChar*sizeof(UnsignedChar);
893b23bfdefSJunchao Zhang     link->basicunit = MPI_UNSIGNED_CHAR;
8945ad15460SJunchao Zhang     if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPI_UNSIGNED_CHAR;}
89540e23c03SJunchao Zhang #if defined(PETSC_HAVE_COMPLEX)
896b23bfdefSJunchao Zhang   } else if (nPetscComplex) {
897b23bfdefSJunchao Zhang     if      (nPetscComplex == 8) PackInit_ComplexType_PetscComplex_8_1(link); else if (nPetscComplex%8 == 0) PackInit_ComplexType_PetscComplex_8_0(link);
898b23bfdefSJunchao Zhang     else if (nPetscComplex == 4) PackInit_ComplexType_PetscComplex_4_1(link); else if (nPetscComplex%4 == 0) PackInit_ComplexType_PetscComplex_4_0(link);
899b23bfdefSJunchao Zhang     else if (nPetscComplex == 2) PackInit_ComplexType_PetscComplex_2_1(link); else if (nPetscComplex%2 == 0) PackInit_ComplexType_PetscComplex_2_0(link);
900b23bfdefSJunchao Zhang     else if (nPetscComplex == 1) PackInit_ComplexType_PetscComplex_1_1(link); else if (nPetscComplex%1 == 0) PackInit_ComplexType_PetscComplex_1_0(link);
901b23bfdefSJunchao Zhang     link->bs        = nPetscComplex;
902eb02082bSJunchao Zhang     link->unitbytes = nPetscComplex*sizeof(PetscComplex);
90340e23c03SJunchao Zhang     link->basicunit = MPIU_COMPLEX;
9045ad15460SJunchao Zhang     if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPIU_COMPLEX;}
90540e23c03SJunchao Zhang #endif
90640e23c03SJunchao Zhang   } else {
907b23bfdefSJunchao Zhang     MPI_Aint lb,nbyte;
908b23bfdefSJunchao Zhang     ierr = MPI_Type_get_extent(unit,&lb,&nbyte);CHKERRQ(ierr);
90940e23c03SJunchao Zhang     if (lb != 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Datatype with nonzero lower bound %ld\n",(long)lb);
910eb02082bSJunchao Zhang     if (nbyte % sizeof(int)) { /* If the type size is not multiple of int */
911eb02082bSJunchao Zhang       if      (nbyte == 4) PackInit_DumbType_char_4_1(link); else if (nbyte%4 == 0) PackInit_DumbType_char_4_0(link);
912eb02082bSJunchao Zhang       else if (nbyte == 2) PackInit_DumbType_char_2_1(link); else if (nbyte%2 == 0) PackInit_DumbType_char_2_0(link);
913eb02082bSJunchao Zhang       else if (nbyte == 1) PackInit_DumbType_char_1_1(link); else if (nbyte%1 == 0) PackInit_DumbType_char_1_0(link);
914eb02082bSJunchao Zhang       link->bs        = nbyte;
915b23bfdefSJunchao Zhang       link->unitbytes = nbyte;
916b23bfdefSJunchao Zhang       link->basicunit = MPI_BYTE;
91740e23c03SJunchao Zhang     } else {
918eb02082bSJunchao Zhang       nInt = nbyte / sizeof(int);
919eb02082bSJunchao Zhang       if      (nInt == 8) PackInit_DumbType_DumbInt_8_1(link); else if (nInt%8 == 0) PackInit_DumbType_DumbInt_8_0(link);
920eb02082bSJunchao Zhang       else if (nInt == 4) PackInit_DumbType_DumbInt_4_1(link); else if (nInt%4 == 0) PackInit_DumbType_DumbInt_4_0(link);
921eb02082bSJunchao Zhang       else if (nInt == 2) PackInit_DumbType_DumbInt_2_1(link); else if (nInt%2 == 0) PackInit_DumbType_DumbInt_2_0(link);
922eb02082bSJunchao Zhang       else if (nInt == 1) PackInit_DumbType_DumbInt_1_1(link); else if (nInt%1 == 0) PackInit_DumbType_DumbInt_1_0(link);
923eb02082bSJunchao Zhang       link->bs        = nInt;
924b23bfdefSJunchao Zhang       link->unitbytes = nbyte;
92540e23c03SJunchao Zhang       link->basicunit = MPI_INT;
92640e23c03SJunchao Zhang     }
9275ad15460SJunchao Zhang     if (link->isbuiltin) link->unit = unit;
92840e23c03SJunchao Zhang   }
929b23bfdefSJunchao Zhang 
9305ad15460SJunchao Zhang   if (!link->isbuiltin) {ierr = MPI_Type_dup(unit,&link->unit);CHKERRQ(ierr);}
93140e23c03SJunchao Zhang   PetscFunctionReturn(0);
93240e23c03SJunchao Zhang }
93340e23c03SJunchao Zhang 
934fcc7397dSJunchao Zhang PetscErrorCode PetscSFLinkGetUnpackAndOp(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*))
93540e23c03SJunchao Zhang {
93640e23c03SJunchao Zhang   PetscFunctionBegin;
93740e23c03SJunchao Zhang   *UnpackAndOp = NULL;
938eb02082bSJunchao Zhang   if (mtype == PETSC_MEMTYPE_HOST) {
939eb02082bSJunchao Zhang     if      (op == MPIU_REPLACE)              *UnpackAndOp = link->h_UnpackAndInsert;
940eb02082bSJunchao Zhang     else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->h_UnpackAndAdd;
941eb02082bSJunchao Zhang     else if (op == MPI_PROD)                  *UnpackAndOp = link->h_UnpackAndMult;
942eb02082bSJunchao Zhang     else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->h_UnpackAndMax;
943eb02082bSJunchao Zhang     else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->h_UnpackAndMin;
944eb02082bSJunchao Zhang     else if (op == MPI_LAND)                  *UnpackAndOp = link->h_UnpackAndLAND;
945eb02082bSJunchao Zhang     else if (op == MPI_BAND)                  *UnpackAndOp = link->h_UnpackAndBAND;
946eb02082bSJunchao Zhang     else if (op == MPI_LOR)                   *UnpackAndOp = link->h_UnpackAndLOR;
947eb02082bSJunchao Zhang     else if (op == MPI_BOR)                   *UnpackAndOp = link->h_UnpackAndBOR;
948eb02082bSJunchao Zhang     else if (op == MPI_LXOR)                  *UnpackAndOp = link->h_UnpackAndLXOR;
949eb02082bSJunchao Zhang     else if (op == MPI_BXOR)                  *UnpackAndOp = link->h_UnpackAndBXOR;
950eb02082bSJunchao Zhang     else if (op == MPI_MAXLOC)                *UnpackAndOp = link->h_UnpackAndMaxloc;
951eb02082bSJunchao Zhang     else if (op == MPI_MINLOC)                *UnpackAndOp = link->h_UnpackAndMinloc;
952eb02082bSJunchao Zhang   }
953*7fd2d3dbSJunchao Zhang #if defined(PETSC_HAVE_DEVICE)
954eb02082bSJunchao Zhang   else if (mtype == PETSC_MEMTYPE_DEVICE && !atomic) {
955eb02082bSJunchao Zhang     if      (op == MPIU_REPLACE)              *UnpackAndOp = link->d_UnpackAndInsert;
956eb02082bSJunchao Zhang     else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->d_UnpackAndAdd;
957eb02082bSJunchao Zhang     else if (op == MPI_PROD)                  *UnpackAndOp = link->d_UnpackAndMult;
958eb02082bSJunchao Zhang     else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->d_UnpackAndMax;
959eb02082bSJunchao Zhang     else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->d_UnpackAndMin;
960eb02082bSJunchao Zhang     else if (op == MPI_LAND)                  *UnpackAndOp = link->d_UnpackAndLAND;
961eb02082bSJunchao Zhang     else if (op == MPI_BAND)                  *UnpackAndOp = link->d_UnpackAndBAND;
962eb02082bSJunchao Zhang     else if (op == MPI_LOR)                   *UnpackAndOp = link->d_UnpackAndLOR;
963eb02082bSJunchao Zhang     else if (op == MPI_BOR)                   *UnpackAndOp = link->d_UnpackAndBOR;
964eb02082bSJunchao Zhang     else if (op == MPI_LXOR)                  *UnpackAndOp = link->d_UnpackAndLXOR;
965eb02082bSJunchao Zhang     else if (op == MPI_BXOR)                  *UnpackAndOp = link->d_UnpackAndBXOR;
966eb02082bSJunchao Zhang     else if (op == MPI_MAXLOC)                *UnpackAndOp = link->d_UnpackAndMaxloc;
967eb02082bSJunchao Zhang     else if (op == MPI_MINLOC)                *UnpackAndOp = link->d_UnpackAndMinloc;
968eb02082bSJunchao Zhang   } else if (mtype == PETSC_MEMTYPE_DEVICE && atomic) {
969eb02082bSJunchao Zhang     if      (op == MPIU_REPLACE)              *UnpackAndOp = link->da_UnpackAndInsert;
970eb02082bSJunchao Zhang     else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->da_UnpackAndAdd;
971eb02082bSJunchao Zhang     else if (op == MPI_PROD)                  *UnpackAndOp = link->da_UnpackAndMult;
972eb02082bSJunchao Zhang     else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->da_UnpackAndMax;
973eb02082bSJunchao Zhang     else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->da_UnpackAndMin;
974eb02082bSJunchao Zhang     else if (op == MPI_LAND)                  *UnpackAndOp = link->da_UnpackAndLAND;
975eb02082bSJunchao Zhang     else if (op == MPI_BAND)                  *UnpackAndOp = link->da_UnpackAndBAND;
976eb02082bSJunchao Zhang     else if (op == MPI_LOR)                   *UnpackAndOp = link->da_UnpackAndLOR;
977eb02082bSJunchao Zhang     else if (op == MPI_BOR)                   *UnpackAndOp = link->da_UnpackAndBOR;
978eb02082bSJunchao Zhang     else if (op == MPI_LXOR)                  *UnpackAndOp = link->da_UnpackAndLXOR;
979eb02082bSJunchao Zhang     else if (op == MPI_BXOR)                  *UnpackAndOp = link->da_UnpackAndBXOR;
980eb02082bSJunchao Zhang     else if (op == MPI_MAXLOC)                *UnpackAndOp = link->da_UnpackAndMaxloc;
981eb02082bSJunchao Zhang     else if (op == MPI_MINLOC)                *UnpackAndOp = link->da_UnpackAndMinloc;
982eb02082bSJunchao Zhang   }
983eb02082bSJunchao Zhang #endif
98440e23c03SJunchao Zhang   PetscFunctionReturn(0);
98540e23c03SJunchao Zhang }
98640e23c03SJunchao Zhang 
987fcc7397dSJunchao Zhang PetscErrorCode PetscSFLinkGetScatterAndOp(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**ScatterAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*))
98840e23c03SJunchao Zhang {
98940e23c03SJunchao Zhang   PetscFunctionBegin;
990cd620004SJunchao Zhang   *ScatterAndOp = NULL;
991eb02082bSJunchao Zhang   if (mtype == PETSC_MEMTYPE_HOST) {
992cd620004SJunchao Zhang     if      (op == MPIU_REPLACE)              *ScatterAndOp = link->h_ScatterAndInsert;
993cd620004SJunchao Zhang     else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->h_ScatterAndAdd;
994cd620004SJunchao Zhang     else if (op == MPI_PROD)                  *ScatterAndOp = link->h_ScatterAndMult;
995cd620004SJunchao Zhang     else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->h_ScatterAndMax;
996cd620004SJunchao Zhang     else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->h_ScatterAndMin;
997cd620004SJunchao Zhang     else if (op == MPI_LAND)                  *ScatterAndOp = link->h_ScatterAndLAND;
998cd620004SJunchao Zhang     else if (op == MPI_BAND)                  *ScatterAndOp = link->h_ScatterAndBAND;
999cd620004SJunchao Zhang     else if (op == MPI_LOR)                   *ScatterAndOp = link->h_ScatterAndLOR;
1000cd620004SJunchao Zhang     else if (op == MPI_BOR)                   *ScatterAndOp = link->h_ScatterAndBOR;
1001cd620004SJunchao Zhang     else if (op == MPI_LXOR)                  *ScatterAndOp = link->h_ScatterAndLXOR;
1002cd620004SJunchao Zhang     else if (op == MPI_BXOR)                  *ScatterAndOp = link->h_ScatterAndBXOR;
1003cd620004SJunchao Zhang     else if (op == MPI_MAXLOC)                *ScatterAndOp = link->h_ScatterAndMaxloc;
1004cd620004SJunchao Zhang     else if (op == MPI_MINLOC)                *ScatterAndOp = link->h_ScatterAndMinloc;
1005eb02082bSJunchao Zhang   }
1006*7fd2d3dbSJunchao Zhang #if defined(PETSC_HAVE_DEVICE)
1007eb02082bSJunchao Zhang   else if (mtype == PETSC_MEMTYPE_DEVICE && !atomic) {
1008cd620004SJunchao Zhang     if      (op == MPIU_REPLACE)              *ScatterAndOp = link->d_ScatterAndInsert;
1009cd620004SJunchao Zhang     else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->d_ScatterAndAdd;
1010cd620004SJunchao Zhang     else if (op == MPI_PROD)                  *ScatterAndOp = link->d_ScatterAndMult;
1011cd620004SJunchao Zhang     else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->d_ScatterAndMax;
1012cd620004SJunchao Zhang     else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->d_ScatterAndMin;
1013cd620004SJunchao Zhang     else if (op == MPI_LAND)                  *ScatterAndOp = link->d_ScatterAndLAND;
1014cd620004SJunchao Zhang     else if (op == MPI_BAND)                  *ScatterAndOp = link->d_ScatterAndBAND;
1015cd620004SJunchao Zhang     else if (op == MPI_LOR)                   *ScatterAndOp = link->d_ScatterAndLOR;
1016cd620004SJunchao Zhang     else if (op == MPI_BOR)                   *ScatterAndOp = link->d_ScatterAndBOR;
1017cd620004SJunchao Zhang     else if (op == MPI_LXOR)                  *ScatterAndOp = link->d_ScatterAndLXOR;
1018cd620004SJunchao Zhang     else if (op == MPI_BXOR)                  *ScatterAndOp = link->d_ScatterAndBXOR;
1019cd620004SJunchao Zhang     else if (op == MPI_MAXLOC)                *ScatterAndOp = link->d_ScatterAndMaxloc;
1020cd620004SJunchao Zhang     else if (op == MPI_MINLOC)                *ScatterAndOp = link->d_ScatterAndMinloc;
1021eb02082bSJunchao Zhang   } else if (mtype == PETSC_MEMTYPE_DEVICE && atomic) {
1022cd620004SJunchao Zhang     if      (op == MPIU_REPLACE)              *ScatterAndOp = link->da_ScatterAndInsert;
1023cd620004SJunchao Zhang     else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->da_ScatterAndAdd;
1024cd620004SJunchao Zhang     else if (op == MPI_PROD)                  *ScatterAndOp = link->da_ScatterAndMult;
1025cd620004SJunchao Zhang     else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->da_ScatterAndMax;
1026cd620004SJunchao Zhang     else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->da_ScatterAndMin;
1027cd620004SJunchao Zhang     else if (op == MPI_LAND)                  *ScatterAndOp = link->da_ScatterAndLAND;
1028cd620004SJunchao Zhang     else if (op == MPI_BAND)                  *ScatterAndOp = link->da_ScatterAndBAND;
1029cd620004SJunchao Zhang     else if (op == MPI_LOR)                   *ScatterAndOp = link->da_ScatterAndLOR;
1030cd620004SJunchao Zhang     else if (op == MPI_BOR)                   *ScatterAndOp = link->da_ScatterAndBOR;
1031cd620004SJunchao Zhang     else if (op == MPI_LXOR)                  *ScatterAndOp = link->da_ScatterAndLXOR;
1032cd620004SJunchao Zhang     else if (op == MPI_BXOR)                  *ScatterAndOp = link->da_ScatterAndBXOR;
1033cd620004SJunchao Zhang     else if (op == MPI_MAXLOC)                *ScatterAndOp = link->da_ScatterAndMaxloc;
1034cd620004SJunchao Zhang     else if (op == MPI_MINLOC)                *ScatterAndOp = link->da_ScatterAndMinloc;
1035eb02082bSJunchao Zhang   }
1036eb02082bSJunchao Zhang #endif
1037cd620004SJunchao Zhang   PetscFunctionReturn(0);
1038cd620004SJunchao Zhang }
1039cd620004SJunchao Zhang 
1040fcc7397dSJunchao Zhang PetscErrorCode PetscSFLinkGetFetchAndOp(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**FetchAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,void*))
1041cd620004SJunchao Zhang {
1042cd620004SJunchao Zhang   PetscFunctionBegin;
1043cd620004SJunchao Zhang   *FetchAndOp = NULL;
1044cd620004SJunchao Zhang   if (op != MPI_SUM && op != MPIU_SUM) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for MPI_Op in FetchAndOp");
1045cd620004SJunchao Zhang   if (mtype == PETSC_MEMTYPE_HOST) *FetchAndOp = link->h_FetchAndAdd;
1046*7fd2d3dbSJunchao Zhang #if defined(PETSC_HAVE_DEVICE)
1047cd620004SJunchao Zhang   else if (mtype == PETSC_MEMTYPE_DEVICE && !atomic) *FetchAndOp = link->d_FetchAndAdd;
1048cd620004SJunchao Zhang   else if (mtype == PETSC_MEMTYPE_DEVICE && atomic)  *FetchAndOp = link->da_FetchAndAdd;
1049cd620004SJunchao Zhang #endif
1050cd620004SJunchao Zhang   PetscFunctionReturn(0);
1051cd620004SJunchao Zhang }
1052cd620004SJunchao Zhang 
1053fcc7397dSJunchao Zhang PetscErrorCode PetscSFLinkGetFetchAndOpLocal(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**FetchAndOpLocal)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*))
1054cd620004SJunchao Zhang {
1055cd620004SJunchao Zhang   PetscFunctionBegin;
1056cd620004SJunchao Zhang   *FetchAndOpLocal = NULL;
1057cd620004SJunchao Zhang   if (op != MPI_SUM && op != MPIU_SUM) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for MPI_Op in FetchAndOp");
1058cd620004SJunchao Zhang   if (mtype == PETSC_MEMTYPE_HOST) *FetchAndOpLocal = link->h_FetchAndAddLocal;
1059*7fd2d3dbSJunchao Zhang #if defined(PETSC_HAVE_DEVICE)
1060cd620004SJunchao Zhang   else if (mtype == PETSC_MEMTYPE_DEVICE && !atomic) *FetchAndOpLocal = link->d_FetchAndAddLocal;
1061cd620004SJunchao Zhang   else if (mtype == PETSC_MEMTYPE_DEVICE && atomic)  *FetchAndOpLocal = link->da_FetchAndAddLocal;
1062cd620004SJunchao Zhang #endif
1063cd620004SJunchao Zhang   PetscFunctionReturn(0);
1064cd620004SJunchao Zhang }
1065cd620004SJunchao Zhang 
1066cd620004SJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkLogFlopsAfterUnpackRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,MPI_Op op)
1067cd620004SJunchao Zhang {
1068cd620004SJunchao Zhang   PetscErrorCode ierr;
1069cd620004SJunchao Zhang   PetscLogDouble flops;
1070cd620004SJunchao Zhang   PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
1071cd620004SJunchao Zhang 
1072cd620004SJunchao Zhang 
1073cd620004SJunchao Zhang   PetscFunctionBegin;
1074cd620004SJunchao Zhang   if (op != MPIU_REPLACE && link->basicunit == MPIU_SCALAR) { /* op is a reduction on PetscScalars */
1075cd620004SJunchao Zhang     flops = bas->rootbuflen[scope]*link->bs; /* # of roots in buffer x # of scalars in unit */
1076*7fd2d3dbSJunchao Zhang #if defined(PETSC_HAVE_DEVICE)
1077cd620004SJunchao Zhang     if (link->rootmtype == PETSC_MEMTYPE_DEVICE) {ierr = PetscLogGpuFlops(flops);CHKERRQ(ierr);} else
1078cd620004SJunchao Zhang #endif
1079cd620004SJunchao Zhang     {ierr = PetscLogFlops(flops);CHKERRQ(ierr);}
1080cd620004SJunchao Zhang   }
1081cd620004SJunchao Zhang   PetscFunctionReturn(0);
1082cd620004SJunchao Zhang }
1083cd620004SJunchao Zhang 
1084cd620004SJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkLogFlopsAfterUnpackLeafData(PetscSF sf,PetscSFLink link,PetscSFScope scope,MPI_Op op)
1085cd620004SJunchao Zhang {
1086cd620004SJunchao Zhang   PetscLogDouble flops;
1087cd620004SJunchao Zhang   PetscErrorCode ierr;
1088cd620004SJunchao Zhang 
1089cd620004SJunchao Zhang   PetscFunctionBegin;
1090cd620004SJunchao Zhang   if (op != MPIU_REPLACE && link->basicunit == MPIU_SCALAR) { /* op is a reduction on PetscScalars */
1091cd620004SJunchao Zhang     flops = sf->leafbuflen[scope]*link->bs; /* # of roots in buffer x # of scalars in unit */
1092*7fd2d3dbSJunchao Zhang #if defined(PETSC_HAVE_DEVICE)
1093cd620004SJunchao Zhang     if (link->leafmtype == PETSC_MEMTYPE_DEVICE) {ierr = PetscLogGpuFlops(flops);CHKERRQ(ierr);} else
1094cd620004SJunchao Zhang #endif
1095cd620004SJunchao Zhang     {ierr = PetscLogFlops(flops);CHKERRQ(ierr);}
1096cd620004SJunchao Zhang   }
1097cd620004SJunchao Zhang   PetscFunctionReturn(0);
1098cd620004SJunchao Zhang }
1099cd620004SJunchao Zhang 
1100cd620004SJunchao Zhang /* When SF could not find a proper UnpackAndOp() from link, it falls back to MPI_Reduce_local.
1101cd620004SJunchao Zhang   Input Arguments:
1102cd620004SJunchao Zhang   +sf      - The StarForest
1103cd620004SJunchao Zhang   .link    - The link
1104cd620004SJunchao Zhang   .count   - Number of entries to unpack
1105cd620004SJunchao Zhang   .start   - The first index, significent when indices=NULL
1106cd620004SJunchao Zhang   .indices - Indices of entries in <data>. If NULL, it means indices are contiguous and the first is given in <start>
1107cd620004SJunchao Zhang   .buf     - A contiguous buffer to unpack from
1108cd620004SJunchao Zhang   -op      - Operation after unpack
1109cd620004SJunchao Zhang 
1110cd620004SJunchao Zhang   Output Arguments:
1111cd620004SJunchao Zhang   .data    - The data to unpack to
1112cd620004SJunchao Zhang */
1113cd620004SJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkUnpackDataWithMPIReduceLocal(PetscSF sf,PetscSFLink link,PetscInt count,PetscInt start,const PetscInt *indices,void *data,const void *buf,MPI_Op op)
1114cd620004SJunchao Zhang {
1115cd620004SJunchao Zhang   PetscFunctionBegin;
1116cd620004SJunchao Zhang #if defined(PETSC_HAVE_MPI_REDUCE_LOCAL)
1117cd620004SJunchao Zhang   {
1118cd620004SJunchao Zhang     PetscErrorCode ierr;
1119cd620004SJunchao Zhang     PetscInt       i;
1120cd620004SJunchao Zhang     PetscMPIInt    n;
1121cd620004SJunchao Zhang     if (indices) {
1122cd620004SJunchao Zhang       /* Note we use link->unit instead of link->basicunit. When op can be mapped to MPI_SUM etc, it operates on
1123cd620004SJunchao Zhang          basic units of a root/leaf element-wisely. Otherwise, it is meant to operate on a whole root/leaf.
1124cd620004SJunchao Zhang       */
1125cd620004SJunchao Zhang       for (i=0; i<count; i++) {ierr = MPI_Reduce_local((const char*)buf+i*link->unitbytes,(char*)data+indices[i]*link->unitbytes,1,link->unit,op);CHKERRQ(ierr);}
1126cd620004SJunchao Zhang     } else {
1127cd620004SJunchao Zhang       ierr = PetscMPIIntCast(count,&n);CHKERRQ(ierr);
1128cd620004SJunchao Zhang       ierr = MPI_Reduce_local(buf,(char*)data+start*link->unitbytes,n,link->unit,op);CHKERRQ(ierr);
1129cd620004SJunchao Zhang     }
1130cd620004SJunchao Zhang   }
1131cd620004SJunchao Zhang #else
1132cd620004SJunchao Zhang   SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No unpacking reduction operation for this MPI_Op");
1133cd620004SJunchao Zhang #endif
1134cd620004SJunchao Zhang   PetscFunctionReturn(0);
1135cd620004SJunchao Zhang }
1136cd620004SJunchao Zhang 
1137fcc7397dSJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkScatterDataWithMPIReduceLocal(PetscSF sf,PetscSFLink link,PetscInt count,PetscInt srcStart,const PetscInt *srcIdx,const void *src,PetscInt dstStart,const PetscInt *dstIdx,void *dst,MPI_Op op)
1138cd620004SJunchao Zhang {
1139cd620004SJunchao Zhang   PetscFunctionBegin;
1140cd620004SJunchao Zhang #if defined(PETSC_HAVE_MPI_REDUCE_LOCAL)
1141cd620004SJunchao Zhang   {
1142cd620004SJunchao Zhang     PetscErrorCode ierr;
1143cd620004SJunchao Zhang     PetscInt       i,disp;
1144fcc7397dSJunchao Zhang     if (!srcIdx) {
1145fcc7397dSJunchao Zhang       ierr = PetscSFLinkUnpackDataWithMPIReduceLocal(sf,link,count,dstStart,dstIdx,dst,(const char*)src+srcStart*link->unitbytes,op);CHKERRQ(ierr);
1146fcc7397dSJunchao Zhang     } else {
1147cd620004SJunchao Zhang       for (i=0; i<count; i++) {
1148fcc7397dSJunchao Zhang         disp = dstIdx? dstIdx[i] : dstStart + i;
1149fcc7397dSJunchao Zhang         ierr = MPI_Reduce_local((const char*)src+srcIdx[i]*link->unitbytes,(char*)dst+disp*link->unitbytes,1,link->unit,op);CHKERRQ(ierr);
1150fcc7397dSJunchao Zhang       }
1151cd620004SJunchao Zhang     }
1152cd620004SJunchao Zhang   }
1153cd620004SJunchao Zhang #else
1154cd620004SJunchao Zhang   SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No unpacking reduction operation for this MPI_Op");
1155cd620004SJunchao Zhang #endif
1156cd620004SJunchao Zhang   PetscFunctionReturn(0);
1157cd620004SJunchao Zhang }
1158cd620004SJunchao Zhang 
1159cd620004SJunchao Zhang /*=============================================================================
1160cd620004SJunchao Zhang               Pack/Unpack/Fetch/Scatter routines
1161cd620004SJunchao Zhang  ============================================================================*/
1162cd620004SJunchao Zhang 
1163cd620004SJunchao Zhang /* Pack rootdata to rootbuf
1164cd620004SJunchao Zhang   Input Arguments:
1165cd620004SJunchao Zhang   + sf       - The SF this packing works on.
1166cd620004SJunchao Zhang   . link     - It gives the memtype of the roots and also provides root buffer.
1167cd620004SJunchao Zhang   . scope    - PETSCSF_LOCAL or PETSCSF_REMOTE. Note SF has the ability to do local and remote communications separately.
1168cd620004SJunchao Zhang   - rootdata - Where to read the roots.
1169cd620004SJunchao Zhang 
1170cd620004SJunchao Zhang   Notes:
1171cd620004SJunchao Zhang   When rootdata can be directly used as root buffer, the routine is almost a no-op. After the call, root data is
1172cd620004SJunchao Zhang   in a place where the underlying MPI is ready can access (use_gpu_aware_mpi or not)
1173cd620004SJunchao Zhang  */
1174cd620004SJunchao Zhang PetscErrorCode PetscSFLinkPackRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,const void *rootdata)
1175cd620004SJunchao Zhang {
1176cd620004SJunchao Zhang   PetscErrorCode   ierr;
1177cd620004SJunchao Zhang   PetscSF_Basic    *bas = (PetscSF_Basic*)sf->data;
1178cd620004SJunchao Zhang   const PetscInt   *rootindices = NULL;
1179cd620004SJunchao Zhang   PetscInt         count,start;
1180fcc7397dSJunchao Zhang   PetscErrorCode   (*Pack)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*) = NULL;
1181cd620004SJunchao Zhang   PetscMemType     rootmtype = link->rootmtype;
1182fcc7397dSJunchao Zhang   PetscSFPackOpt   opt = NULL;
1183fcc7397dSJunchao Zhang 
1184cd620004SJunchao Zhang   PetscFunctionBegin;
1185cd620004SJunchao Zhang   ierr = PetscLogEventBegin(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr);
1186cd620004SJunchao Zhang   if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkSyncDeviceBeforePackData(sf,link);CHKERRQ(ierr);}
1187cd620004SJunchao Zhang   if (!link->rootdirect[scope] && bas->rootbuflen[scope]) { /* If rootdata works directly as rootbuf, skip packing */
1188fcc7397dSJunchao Zhang     ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,scope,&count,&start,&opt,&rootindices);CHKERRQ(ierr);
1189cd620004SJunchao Zhang     ierr = PetscSFLinkGetPack(link,rootmtype,&Pack);CHKERRQ(ierr);
1190fcc7397dSJunchao Zhang     ierr = (*Pack)(link,count,start,opt,rootindices,rootdata,link->rootbuf[scope][rootmtype]);CHKERRQ(ierr);
1191cd620004SJunchao Zhang   }
1192cd620004SJunchao Zhang   if (scope == PETSCSF_REMOTE) {
1193cd620004SJunchao Zhang     ierr = PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_TRUE/*device2host*/);CHKERRQ(ierr);
1194cd620004SJunchao Zhang     ierr = PetscSFLinkSyncStreamAfterPackRootData(sf,link);CHKERRQ(ierr);
1195cd620004SJunchao Zhang   }
1196cd620004SJunchao Zhang   ierr = PetscLogEventEnd(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr);
1197cd620004SJunchao Zhang   PetscFunctionReturn(0);
1198cd620004SJunchao Zhang }
1199cd620004SJunchao Zhang 
1200cd620004SJunchao Zhang /* Pack leafdata to leafbuf */
1201cd620004SJunchao Zhang PetscErrorCode PetscSFLinkPackLeafData(PetscSF sf,PetscSFLink link,PetscSFScope scope,const void *leafdata)
1202cd620004SJunchao Zhang {
1203cd620004SJunchao Zhang   PetscErrorCode   ierr;
1204cd620004SJunchao Zhang   const PetscInt   *leafindices = NULL;
1205cd620004SJunchao Zhang   PetscInt         count,start;
1206fcc7397dSJunchao Zhang   PetscErrorCode   (*Pack)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*) = NULL;
1207cd620004SJunchao Zhang   PetscMemType     leafmtype = link->leafmtype;
1208fcc7397dSJunchao Zhang   PetscSFPackOpt   opt = NULL;
1209cd620004SJunchao Zhang 
1210cd620004SJunchao Zhang   PetscFunctionBegin;
1211cd620004SJunchao Zhang   ierr = PetscLogEventBegin(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr);
1212cd620004SJunchao Zhang   if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkSyncDeviceBeforePackData(sf,link);CHKERRQ(ierr);}
1213cd620004SJunchao Zhang   if (!link->leafdirect[scope] && sf->leafbuflen[scope]) { /* If leafdata works directly as rootbuf, skip packing */
1214fcc7397dSJunchao Zhang     ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,scope,&count,&start,&opt,&leafindices);CHKERRQ(ierr);
1215cd620004SJunchao Zhang     ierr = PetscSFLinkGetPack(link,leafmtype,&Pack);CHKERRQ(ierr);
1216fcc7397dSJunchao Zhang     ierr = (*Pack)(link,count,start,opt,leafindices,leafdata,link->leafbuf[scope][leafmtype]);CHKERRQ(ierr);
1217cd620004SJunchao Zhang   }
1218cd620004SJunchao Zhang   if (scope == PETSCSF_REMOTE) {
1219cd620004SJunchao Zhang     ierr = PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_TRUE/*device2host*/);CHKERRQ(ierr);
1220cd620004SJunchao Zhang     ierr = PetscSFLinkSyncStreamAfterPackLeafData(sf,link);CHKERRQ(ierr);
1221cd620004SJunchao Zhang   }
1222cd620004SJunchao Zhang   ierr = PetscLogEventEnd(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr);
1223cd620004SJunchao Zhang   PetscFunctionReturn(0);
1224cd620004SJunchao Zhang }
1225cd620004SJunchao Zhang 
1226cd620004SJunchao Zhang /* Unpack rootbuf to rootdata */
1227cd620004SJunchao Zhang PetscErrorCode PetscSFLinkUnpackRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,void *rootdata,MPI_Op op)
1228cd620004SJunchao Zhang {
1229cd620004SJunchao Zhang   PetscErrorCode   ierr;
1230cd620004SJunchao Zhang   const PetscInt   *rootindices = NULL;
1231cd620004SJunchao Zhang   PetscInt         count,start;
1232cd620004SJunchao Zhang   PetscSF_Basic    *bas = (PetscSF_Basic*)sf->data;
1233fcc7397dSJunchao Zhang   PetscErrorCode   (*UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*) = NULL;
1234cd620004SJunchao Zhang   PetscMemType     rootmtype = link->rootmtype;
1235fcc7397dSJunchao Zhang   PetscSFPackOpt   opt = NULL;
1236cd620004SJunchao Zhang 
1237cd620004SJunchao Zhang   PetscFunctionBegin;
1238cd620004SJunchao Zhang   ierr = PetscLogEventBegin(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr);
1239cd620004SJunchao Zhang   if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_FALSE);CHKERRQ(ierr);}
1240cd620004SJunchao Zhang   if (!link->rootdirect[scope] && bas->rootbuflen[scope]) { /* If rootdata works directly as rootbuf, skip unpacking */
1241cd620004SJunchao Zhang     ierr = PetscSFLinkGetUnpackAndOp(link,rootmtype,op,bas->rootdups[scope],&UnpackAndOp);CHKERRQ(ierr);
1242cd620004SJunchao Zhang     if (UnpackAndOp) {
1243fcc7397dSJunchao Zhang       ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,scope,&count,&start,&opt,&rootindices);CHKERRQ(ierr);
1244fcc7397dSJunchao Zhang       ierr = (*UnpackAndOp)(link,count,start,opt,rootindices,rootdata,link->rootbuf[scope][rootmtype]);CHKERRQ(ierr);
1245cd620004SJunchao Zhang     } else {
1246fcc7397dSJunchao Zhang       ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,scope,&count,&start,&opt,&rootindices);CHKERRQ(ierr);
1247cd620004SJunchao Zhang       ierr = PetscSFLinkUnpackDataWithMPIReduceLocal(sf,link,count,start,rootindices,rootdata,link->rootbuf[scope][rootmtype],op);CHKERRQ(ierr);
1248cd620004SJunchao Zhang     }
1249cd620004SJunchao Zhang   }
1250cd620004SJunchao Zhang   if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkSyncStreamAfterUnpackRootData(sf,link);CHKERRQ(ierr);}
1251cd620004SJunchao Zhang   ierr = PetscSFLinkLogFlopsAfterUnpackRootData(sf,link,scope,op);CHKERRQ(ierr);
1252cd620004SJunchao Zhang   ierr = PetscLogEventEnd(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr);
1253cd620004SJunchao Zhang   PetscFunctionReturn(0);
1254cd620004SJunchao Zhang }
1255cd620004SJunchao Zhang 
1256cd620004SJunchao Zhang /* Unpack leafbuf to leafdata */
1257cd620004SJunchao Zhang PetscErrorCode PetscSFLinkUnpackLeafData(PetscSF sf,PetscSFLink link,PetscSFScope scope,void *leafdata,MPI_Op op)
1258cd620004SJunchao Zhang {
1259cd620004SJunchao Zhang   PetscErrorCode   ierr;
1260cd620004SJunchao Zhang   const PetscInt   *leafindices = NULL;
1261cd620004SJunchao Zhang   PetscInt         count,start;
1262fcc7397dSJunchao Zhang   PetscErrorCode   (*UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*) = NULL;
1263cd620004SJunchao Zhang   PetscMemType     leafmtype = link->leafmtype;
1264fcc7397dSJunchao Zhang   PetscSFPackOpt   opt = NULL;
1265cd620004SJunchao Zhang 
1266cd620004SJunchao Zhang   PetscFunctionBegin;
1267cd620004SJunchao Zhang   ierr = PetscLogEventBegin(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr);
1268cd620004SJunchao Zhang   if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_FALSE);CHKERRQ(ierr);}
1269cd620004SJunchao Zhang   if (!link->leafdirect[scope] && sf->leafbuflen[scope]) { /* If leafdata works directly as rootbuf, skip unpacking */
1270cd620004SJunchao Zhang     ierr = PetscSFLinkGetUnpackAndOp(link,leafmtype,op,sf->leafdups[scope],&UnpackAndOp);CHKERRQ(ierr);
1271cd620004SJunchao Zhang     if (UnpackAndOp) {
1272fcc7397dSJunchao Zhang       ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,scope,&count,&start,&opt,&leafindices);CHKERRQ(ierr);
1273fcc7397dSJunchao Zhang       ierr = (*UnpackAndOp)(link,count,start,opt,leafindices,leafdata,link->leafbuf[scope][leafmtype]);CHKERRQ(ierr);
1274cd620004SJunchao Zhang     } else {
1275fcc7397dSJunchao Zhang       ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,scope,&count,&start,&opt,&leafindices);CHKERRQ(ierr);
1276cd620004SJunchao Zhang       ierr = PetscSFLinkUnpackDataWithMPIReduceLocal(sf,link,count,start,leafindices,leafdata,link->leafbuf[scope][leafmtype],op);CHKERRQ(ierr);
1277cd620004SJunchao Zhang     }
1278cd620004SJunchao Zhang   }
1279cd620004SJunchao Zhang   if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkSyncStreamAfterUnpackLeafData(sf,link);CHKERRQ(ierr);}
1280cd620004SJunchao Zhang   ierr = PetscSFLinkLogFlopsAfterUnpackLeafData(sf,link,scope,op);CHKERRQ(ierr);
1281cd620004SJunchao Zhang   ierr = PetscLogEventEnd(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr);
1282cd620004SJunchao Zhang   PetscFunctionReturn(0);
1283cd620004SJunchao Zhang }
1284cd620004SJunchao Zhang 
1285cd620004SJunchao Zhang /* FetchAndOp rootdata with rootbuf */
1286cd620004SJunchao Zhang PetscErrorCode PetscSFLinkFetchRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,void *rootdata,MPI_Op op)
1287cd620004SJunchao Zhang {
1288cd620004SJunchao Zhang   PetscErrorCode     ierr;
1289cd620004SJunchao Zhang   const PetscInt     *rootindices = NULL;
1290cd620004SJunchao Zhang   PetscInt           count,start;
1291cd620004SJunchao Zhang   PetscSF_Basic      *bas = (PetscSF_Basic*)sf->data;
1292fcc7397dSJunchao Zhang   PetscErrorCode     (*FetchAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,void*) = NULL;
1293cd620004SJunchao Zhang   PetscMemType       rootmtype = link->rootmtype;
1294fcc7397dSJunchao Zhang   PetscSFPackOpt     opt = NULL;
1295cd620004SJunchao Zhang 
1296cd620004SJunchao Zhang   PetscFunctionBegin;
1297cd620004SJunchao Zhang   ierr = PetscLogEventBegin(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr);
1298cd620004SJunchao Zhang   if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_FALSE);CHKERRQ(ierr);}
1299cd620004SJunchao Zhang   if (bas->rootbuflen[scope]) {
1300cd620004SJunchao Zhang     /* Do FetchAndOp on rootdata with rootbuf */
1301cd620004SJunchao Zhang     ierr = PetscSFLinkGetFetchAndOp(link,rootmtype,op,bas->rootdups[scope],&FetchAndOp);CHKERRQ(ierr);
1302fcc7397dSJunchao Zhang     ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,scope,&count,&start,&opt,&rootindices);CHKERRQ(ierr);
1303fcc7397dSJunchao Zhang     ierr = (*FetchAndOp)(link,count,start,opt,rootindices,rootdata,link->rootbuf[scope][rootmtype]);CHKERRQ(ierr);
1304cd620004SJunchao Zhang   }
1305cd620004SJunchao Zhang   if (scope == PETSCSF_REMOTE) {
1306cd620004SJunchao Zhang     ierr = PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_TRUE);CHKERRQ(ierr);
1307cd620004SJunchao Zhang     ierr = PetscSFLinkSyncStreamAfterUnpackRootData(sf,link);CHKERRQ(ierr);
1308cd620004SJunchao Zhang   }
1309cd620004SJunchao Zhang   ierr = PetscSFLinkLogFlopsAfterUnpackRootData(sf,link,scope,op);CHKERRQ(ierr);
1310cd620004SJunchao Zhang   ierr = PetscLogEventEnd(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr);
1311cd620004SJunchao Zhang   PetscFunctionReturn(0);
1312cd620004SJunchao Zhang }
1313cd620004SJunchao Zhang 
1314cd620004SJunchao Zhang /* Bcast rootdata to leafdata locally (i.e., only for local communication - PETSCSF_LOCAL) */
1315cd620004SJunchao Zhang PetscErrorCode PetscSFLinkBcastAndOpLocal(PetscSF sf,PetscSFLink link,const void *rootdata,void *leafdata,MPI_Op op)
1316cd620004SJunchao Zhang {
1317cd620004SJunchao Zhang   PetscErrorCode       ierr;
1318cd620004SJunchao Zhang   const PetscInt       *rootindices = NULL,*leafindices = NULL;
1319cd620004SJunchao Zhang   PetscInt             count,rootstart,leafstart;
1320cd620004SJunchao Zhang   PetscSF_Basic        *bas = (PetscSF_Basic*)sf->data;
1321fcc7397dSJunchao Zhang   PetscErrorCode       (*ScatterAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*) = NULL;
1322cd620004SJunchao Zhang   const PetscMemType   rootmtype = link->rootmtype,leafmtype = link->leafmtype;
1323fcc7397dSJunchao Zhang   PetscSFPackOpt       leafopt = NULL,rootopt = NULL;
1324cd620004SJunchao Zhang 
1325cd620004SJunchao Zhang   PetscFunctionBegin;
1326cd620004SJunchao Zhang   if (!bas->rootbuflen[PETSCSF_LOCAL]) PetscFunctionReturn(0);
1327cd620004SJunchao Zhang   if (rootmtype != leafmtype) { /* Uncommon case */
1328cd620004SJunchao Zhang      /* The local communication has to go through pack and unpack */
1329cd620004SJunchao Zhang     ierr = PetscSFLinkPackRootData(sf,link,PETSCSF_LOCAL,rootdata);CHKERRQ(ierr);
1330f01131f0SJunchao Zhang     ierr = PetscSFLinkMemcpy(sf,link,leafmtype,link->leafbuf[PETSCSF_LOCAL][leafmtype],rootmtype,link->rootbuf[PETSCSF_LOCAL][rootmtype],sf->leafbuflen[PETSCSF_LOCAL]*link->unitbytes);CHKERRQ(ierr);
1331cd620004SJunchao Zhang     ierr = PetscSFLinkUnpackLeafData(sf,link,PETSCSF_LOCAL,leafdata,op);CHKERRQ(ierr);
1332cd620004SJunchao Zhang   } else {
1333cd620004SJunchao Zhang     ierr = PetscSFLinkGetScatterAndOp(link,leafmtype,op,sf->leafdups[PETSCSF_LOCAL],&ScatterAndOp);CHKERRQ(ierr);
1334cd620004SJunchao Zhang     if (ScatterAndOp) {
1335fcc7397dSJunchao Zhang       ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,PETSCSF_LOCAL,&count,&rootstart,&rootopt,&rootindices);CHKERRQ(ierr);
1336fcc7397dSJunchao Zhang       ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,PETSCSF_LOCAL,&count,&leafstart,&leafopt,&leafindices);CHKERRQ(ierr);
1337fcc7397dSJunchao Zhang       ierr = (*ScatterAndOp)(link,count,rootstart,rootopt,rootindices,rootdata,leafstart,leafopt,leafindices,leafdata);CHKERRQ(ierr);
1338cd620004SJunchao Zhang     } else {
1339fcc7397dSJunchao Zhang       ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,PETSCSF_LOCAL,&count,&rootstart,&rootopt,&rootindices);CHKERRQ(ierr);
1340fcc7397dSJunchao Zhang       ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,PETSCSF_LOCAL,&count,&leafstart,&leafopt,&leafindices);CHKERRQ(ierr);
1341fcc7397dSJunchao Zhang       ierr = PetscSFLinkScatterDataWithMPIReduceLocal(sf,link,count,rootstart,rootindices,rootdata,leafstart,leafindices,leafdata,op);CHKERRQ(ierr);
1342cd620004SJunchao Zhang     }
1343cd620004SJunchao Zhang   }
1344cd620004SJunchao Zhang   PetscFunctionReturn(0);
1345cd620004SJunchao Zhang }
1346cd620004SJunchao Zhang 
1347cd620004SJunchao Zhang /* Reduce leafdata to rootdata locally */
1348cd620004SJunchao Zhang PetscErrorCode PetscSFLinkReduceLocal(PetscSF sf,PetscSFLink link,const void *leafdata,void *rootdata,MPI_Op op)
1349cd620004SJunchao Zhang {
1350cd620004SJunchao Zhang   PetscErrorCode       ierr;
1351cd620004SJunchao Zhang   const PetscInt       *rootindices = NULL,*leafindices = NULL;
1352cd620004SJunchao Zhang   PetscInt             count,rootstart,leafstart;
1353cd620004SJunchao Zhang   PetscSF_Basic        *bas = (PetscSF_Basic*)sf->data;
1354fcc7397dSJunchao Zhang   PetscErrorCode       (*ScatterAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*) = NULL;
1355cd620004SJunchao Zhang   const PetscMemType   rootmtype = link->rootmtype,leafmtype = link->leafmtype;
1356fcc7397dSJunchao Zhang   PetscSFPackOpt       leafopt = NULL,rootopt = NULL;
1357cd620004SJunchao Zhang 
1358cd620004SJunchao Zhang   PetscFunctionBegin;
1359cd620004SJunchao Zhang   if (!sf->leafbuflen[PETSCSF_LOCAL]) PetscFunctionReturn(0);
1360cd620004SJunchao Zhang   if (rootmtype != leafmtype) {
1361cd620004SJunchao Zhang     /* The local communication has to go through pack and unpack */
1362cd620004SJunchao Zhang     ierr = PetscSFLinkPackLeafData(sf,link,PETSCSF_LOCAL,leafdata);CHKERRQ(ierr);
1363f01131f0SJunchao Zhang     ierr = PetscSFLinkMemcpy(sf,link,rootmtype,link->rootbuf[PETSCSF_LOCAL][rootmtype],leafmtype,link->leafbuf[PETSCSF_LOCAL][leafmtype],bas->rootbuflen[PETSCSF_LOCAL]*link->unitbytes);CHKERRQ(ierr);
1364cd620004SJunchao Zhang     ierr = PetscSFLinkUnpackRootData(sf,link,PETSCSF_LOCAL,rootdata,op);CHKERRQ(ierr);
1365cd620004SJunchao Zhang   } else {
1366cd620004SJunchao Zhang     ierr = PetscSFLinkGetScatterAndOp(link,rootmtype,op,bas->rootdups[PETSCSF_LOCAL],&ScatterAndOp);CHKERRQ(ierr);
1367cd620004SJunchao Zhang     if (ScatterAndOp) {
1368fcc7397dSJunchao Zhang       ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,PETSCSF_LOCAL,&count,&rootstart,&rootopt,&rootindices);CHKERRQ(ierr);
1369fcc7397dSJunchao Zhang       ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,PETSCSF_LOCAL,&count,&leafstart,&leafopt,&leafindices);CHKERRQ(ierr);
1370fcc7397dSJunchao Zhang       ierr = (*ScatterAndOp)(link,count,leafstart,leafopt,leafindices,leafdata,rootstart,rootopt,rootindices,rootdata);CHKERRQ(ierr);
1371cd620004SJunchao Zhang     } else {
1372fcc7397dSJunchao Zhang       ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,PETSCSF_LOCAL,&count,&rootstart,&rootopt,&rootindices);CHKERRQ(ierr);
1373fcc7397dSJunchao Zhang       ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,PETSCSF_LOCAL,&count,&leafstart,&leafopt,&leafindices);CHKERRQ(ierr);
1374fcc7397dSJunchao Zhang       ierr = PetscSFLinkScatterDataWithMPIReduceLocal(sf,link,count,leafstart,leafindices,leafdata,rootstart,rootindices,rootdata,op);CHKERRQ(ierr);
1375cd620004SJunchao Zhang     }
1376cd620004SJunchao Zhang   }
1377cd620004SJunchao Zhang   PetscFunctionReturn(0);
1378cd620004SJunchao Zhang }
1379cd620004SJunchao Zhang 
1380cd620004SJunchao Zhang /* Fetch rootdata to leafdata and leafupdate locally */
1381cd620004SJunchao Zhang PetscErrorCode PetscSFLinkFetchAndOpLocal(PetscSF sf,PetscSFLink link,void *rootdata,const void *leafdata,void *leafupdate,MPI_Op op)
1382cd620004SJunchao Zhang {
1383cd620004SJunchao Zhang   PetscErrorCode       ierr;
1384cd620004SJunchao Zhang   const PetscInt       *rootindices = NULL,*leafindices = NULL;
1385cd620004SJunchao Zhang   PetscInt             count,rootstart,leafstart;
1386cd620004SJunchao Zhang   PetscSF_Basic        *bas = (PetscSF_Basic*)sf->data;
1387fcc7397dSJunchao Zhang   PetscErrorCode       (*FetchAndOpLocal)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*) = NULL;
1388cd620004SJunchao Zhang   const PetscMemType   rootmtype = link->rootmtype,leafmtype = link->leafmtype;
1389fcc7397dSJunchao Zhang   PetscSFPackOpt       leafopt = NULL,rootopt = NULL;
1390cd620004SJunchao Zhang 
1391cd620004SJunchao Zhang   PetscFunctionBegin;
1392cd620004SJunchao Zhang   if (!bas->rootbuflen[PETSCSF_LOCAL]) PetscFunctionReturn(0);
1393cd620004SJunchao Zhang   if (rootmtype != leafmtype) {
1394cd620004SJunchao Zhang    /* The local communication has to go through pack and unpack */
1395cd620004SJunchao Zhang     SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Doing PetscSFFetchAndOp with rootdata and leafdata on opposite side of CPU and GPU");
1396cd620004SJunchao Zhang   } else {
1397fcc7397dSJunchao Zhang     ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,PETSCSF_LOCAL,&count,&rootstart,&rootopt,&rootindices);CHKERRQ(ierr);
1398fcc7397dSJunchao Zhang     ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,PETSCSF_LOCAL,&count,&leafstart,&leafopt,&leafindices);CHKERRQ(ierr);
1399cd620004SJunchao Zhang     ierr = PetscSFLinkGetFetchAndOpLocal(link,rootmtype,op,bas->rootdups[PETSCSF_LOCAL],&FetchAndOpLocal);CHKERRQ(ierr);
1400fcc7397dSJunchao Zhang     ierr = (*FetchAndOpLocal)(link,count,rootstart,rootopt,rootindices,rootdata,leafstart,leafopt,leafindices,leafdata,leafupdate);CHKERRQ(ierr);
1401cd620004SJunchao Zhang   }
140240e23c03SJunchao Zhang   PetscFunctionReturn(0);
140340e23c03SJunchao Zhang }
140440e23c03SJunchao Zhang 
140540e23c03SJunchao Zhang /*
1406cd620004SJunchao Zhang   Create per-rank pack/unpack optimizations based on indice patterns
140740e23c03SJunchao Zhang 
140840e23c03SJunchao Zhang    Input Parameters:
1409fcc7397dSJunchao Zhang   +  n       - Number of destination ranks
1410eb02082bSJunchao Zhang   .  offset  - [n+1] For the i-th rank, its associated indices are idx[offset[i], offset[i+1]). offset[0] needs not to be 0.
1411b23bfdefSJunchao Zhang   -  idx     - [*]   Array storing indices
141240e23c03SJunchao Zhang 
141340e23c03SJunchao Zhang    Output Parameters:
1414cd620004SJunchao Zhang   +  opt     - Pack optimizations. NULL if no optimizations.
141540e23c03SJunchao Zhang */
1416cd620004SJunchao Zhang PetscErrorCode PetscSFCreatePackOpt(PetscInt n,const PetscInt *offset,const PetscInt *idx,PetscSFPackOpt *out)
141740e23c03SJunchao Zhang {
141840e23c03SJunchao Zhang   PetscErrorCode ierr;
1419fcc7397dSJunchao Zhang   PetscInt       r,p,start,i,j,k,dx,dy,dz,dydz,m,X,Y;
1420fcc7397dSJunchao Zhang   PetscBool      optimizable = PETSC_TRUE;
142140e23c03SJunchao Zhang   PetscSFPackOpt opt;
142240e23c03SJunchao Zhang 
142340e23c03SJunchao Zhang   PetscFunctionBegin;
1424fcc7397dSJunchao Zhang   ierr   = PetscMalloc1(1,&opt);CHKERRQ(ierr);
1425fcc7397dSJunchao Zhang   ierr   = PetscMalloc1(7*n+2,&opt->array);CHKERRQ(ierr);
1426fcc7397dSJunchao Zhang   opt->n      = opt->array[0] = n;
1427fcc7397dSJunchao Zhang   opt->offset = opt->array + 1;
1428fcc7397dSJunchao Zhang   opt->start  = opt->array + n   + 2;
1429fcc7397dSJunchao Zhang   opt->dx     = opt->array + 2*n + 2;
1430fcc7397dSJunchao Zhang   opt->dy     = opt->array + 3*n + 2;
1431fcc7397dSJunchao Zhang   opt->dz     = opt->array + 4*n + 2;
1432fcc7397dSJunchao Zhang   opt->X      = opt->array + 5*n + 2;
1433fcc7397dSJunchao Zhang   opt->Y      = opt->array + 6*n + 2;
1434fcc7397dSJunchao Zhang 
1435fcc7397dSJunchao Zhang   for (r=0; r<n; r++) { /* For each destination rank */
1436fcc7397dSJunchao Zhang     m     = offset[r+1] - offset[r]; /* Total number of indices for this rank. We want to see if m can be factored into dx*dy*dz */
1437fcc7397dSJunchao Zhang     p     = offset[r];
1438fcc7397dSJunchao Zhang     start = idx[p]; /* First index for this rank */
1439fcc7397dSJunchao Zhang     p++;
1440fcc7397dSJunchao Zhang 
1441fcc7397dSJunchao Zhang     /* Search in X dimension */
1442fcc7397dSJunchao Zhang     for (dx=1; dx<m; dx++,p++) {
1443fcc7397dSJunchao Zhang       if (start+dx != idx[p]) break;
1444b23bfdefSJunchao Zhang     }
1445b23bfdefSJunchao Zhang 
1446fcc7397dSJunchao Zhang     dydz = m/dx;
1447fcc7397dSJunchao Zhang     X    = dydz > 1 ? (idx[p]-start) : dx;
1448fcc7397dSJunchao Zhang     /* Not optimizable if m is not a multiple of dx, or some unrecognized pattern is found */
1449fcc7397dSJunchao Zhang     if (m%dx || X <= 0) {optimizable = PETSC_FALSE; goto finish;}
1450fcc7397dSJunchao Zhang     for (dy=1; dy<dydz; dy++) { /* Search in Y dimension */
1451fcc7397dSJunchao Zhang       for (i=0; i<dx; i++,p++) {
1452fcc7397dSJunchao Zhang         if (start+X*dy+i != idx[p]) {
1453fcc7397dSJunchao Zhang           if (i) {optimizable = PETSC_FALSE; goto finish;} /* The pattern is violated in the middle of an x-walk */
1454fcc7397dSJunchao Zhang           else goto Z_dimension;
145540e23c03SJunchao Zhang         }
145640e23c03SJunchao Zhang       }
145740e23c03SJunchao Zhang     }
145840e23c03SJunchao Zhang 
1459fcc7397dSJunchao Zhang Z_dimension:
1460fcc7397dSJunchao Zhang     dz = m/(dx*dy);
1461fcc7397dSJunchao Zhang     Y  = dz > 1 ? (idx[p]-start)/X : dy;
1462fcc7397dSJunchao Zhang     /* Not optimizable if m is not a multiple of dx*dy, or some unrecognized pattern is found */
1463fcc7397dSJunchao Zhang     if (m%(dx*dy) || Y <= 0) {optimizable = PETSC_FALSE; goto finish;}
1464fcc7397dSJunchao Zhang     for (k=1; k<dz; k++) { /* Go through Z dimension to see if remaining indices follow the pattern */
1465fcc7397dSJunchao Zhang       for (j=0; j<dy; j++) {
1466fcc7397dSJunchao Zhang         for (i=0; i<dx; i++,p++) {
1467fcc7397dSJunchao Zhang           if (start+X*Y*k+X*j+i != idx[p]) {optimizable = PETSC_FALSE; goto finish;}
146840e23c03SJunchao Zhang         }
146940e23c03SJunchao Zhang       }
147040e23c03SJunchao Zhang     }
1471fcc7397dSJunchao Zhang     opt->start[r] = start;
1472fcc7397dSJunchao Zhang     opt->dx[r]    = dx;
1473fcc7397dSJunchao Zhang     opt->dy[r]    = dy;
1474fcc7397dSJunchao Zhang     opt->dz[r]    = dz;
1475fcc7397dSJunchao Zhang     opt->X[r]     = X;
1476fcc7397dSJunchao Zhang     opt->Y[r]     = Y;
147740e23c03SJunchao Zhang   }
147840e23c03SJunchao Zhang 
1479fcc7397dSJunchao Zhang finish:
1480fcc7397dSJunchao Zhang   /* If not optimizable, free arrays to save memory */
1481fcc7397dSJunchao Zhang   if (!n || !optimizable) {
1482fcc7397dSJunchao Zhang     ierr = PetscFree(opt->array);CHKERRQ(ierr);
148340e23c03SJunchao Zhang     ierr = PetscFree(opt);CHKERRQ(ierr);
148440e23c03SJunchao Zhang     *out = NULL;
1485fcc7397dSJunchao Zhang   } else {
1486fcc7397dSJunchao Zhang     opt->offset[0] = 0;
1487fcc7397dSJunchao Zhang     for (r=0; r<n; r++) opt->offset[r+1] = opt->offset[r] + opt->dx[r]*opt->dy[r]*opt->dz[r];
1488fcc7397dSJunchao Zhang     *out = opt;
1489fcc7397dSJunchao Zhang   }
149040e23c03SJunchao Zhang   PetscFunctionReturn(0);
149140e23c03SJunchao Zhang }
149240e23c03SJunchao Zhang 
1493fcc7397dSJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFDestroyPackOpt(PetscMemType mtype,PetscSFPackOpt *out)
149440e23c03SJunchao Zhang {
149540e23c03SJunchao Zhang   PetscErrorCode ierr;
149640e23c03SJunchao Zhang   PetscSFPackOpt opt = *out;
149740e23c03SJunchao Zhang 
149840e23c03SJunchao Zhang   PetscFunctionBegin;
149940e23c03SJunchao Zhang   if (opt) {
1500*7fd2d3dbSJunchao Zhang     ierr = PetscSFFree(mtype,opt->array);CHKERRQ(ierr);
150140e23c03SJunchao Zhang     ierr = PetscFree(opt);CHKERRQ(ierr);
150240e23c03SJunchao Zhang     *out = NULL;
150340e23c03SJunchao Zhang   }
150440e23c03SJunchao Zhang   PetscFunctionReturn(0);
150540e23c03SJunchao Zhang }
1506cd620004SJunchao Zhang 
1507cd620004SJunchao Zhang PetscErrorCode PetscSFSetUpPackFields(PetscSF sf)
1508cd620004SJunchao Zhang {
1509cd620004SJunchao Zhang   PetscErrorCode ierr;
1510cd620004SJunchao Zhang   PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
1511cd620004SJunchao Zhang   PetscInt       i,j;
1512cd620004SJunchao Zhang 
1513cd620004SJunchao Zhang   PetscFunctionBegin;
1514cd620004SJunchao Zhang   /* [0] for PETSCSF_LOCAL and [1] for PETSCSF_REMOTE in the following */
1515cd620004SJunchao Zhang   for (i=0; i<2; i++) { /* Set defaults */
1516cd620004SJunchao Zhang     sf->leafstart[i]   = 0;
1517cd620004SJunchao Zhang     sf->leafcontig[i]  = PETSC_TRUE;
1518cd620004SJunchao Zhang     sf->leafdups[i]    = PETSC_FALSE;
1519cd620004SJunchao Zhang     bas->rootstart[i]  = 0;
1520cd620004SJunchao Zhang     bas->rootcontig[i] = PETSC_TRUE;
1521cd620004SJunchao Zhang     bas->rootdups[i]   = PETSC_FALSE;
1522cd620004SJunchao Zhang   }
1523cd620004SJunchao Zhang 
1524cd620004SJunchao Zhang   sf->leafbuflen[0] = sf->roffset[sf->ndranks];
1525cd620004SJunchao Zhang   sf->leafbuflen[1] = sf->roffset[sf->nranks] - sf->roffset[sf->ndranks];
1526cd620004SJunchao Zhang 
1527cd620004SJunchao Zhang   if (sf->leafbuflen[0]) sf->leafstart[0] = sf->rmine[0];
1528cd620004SJunchao Zhang   if (sf->leafbuflen[1]) sf->leafstart[1] = sf->rmine[sf->roffset[sf->ndranks]];
1529cd620004SJunchao Zhang 
1530cd620004SJunchao Zhang   /* Are leaf indices for self and remote contiguous? If yes, it is best for pack/unpack */
1531cd620004SJunchao Zhang   for (i=0; i<sf->roffset[sf->ndranks]; i++) { /* self */
1532cd620004SJunchao Zhang     if (sf->rmine[i] != sf->leafstart[0]+i) {sf->leafcontig[0] = PETSC_FALSE; break;}
1533cd620004SJunchao Zhang   }
1534cd620004SJunchao Zhang   for (i=sf->roffset[sf->ndranks],j=0; i<sf->roffset[sf->nranks]; i++,j++) { /* remote */
1535cd620004SJunchao Zhang     if (sf->rmine[i] != sf->leafstart[1]+j) {sf->leafcontig[1] = PETSC_FALSE; break;}
1536cd620004SJunchao Zhang   }
1537cd620004SJunchao Zhang 
1538cd620004SJunchao Zhang   /* If not, see if we can have per-rank optimizations by doing index analysis */
1539cd620004SJunchao Zhang   if (!sf->leafcontig[0]) {ierr = PetscSFCreatePackOpt(sf->ndranks,            sf->roffset,             sf->rmine, &sf->leafpackopt[0]);CHKERRQ(ierr);}
1540cd620004SJunchao Zhang   if (!sf->leafcontig[1]) {ierr = PetscSFCreatePackOpt(sf->nranks-sf->ndranks, sf->roffset+sf->ndranks, sf->rmine, &sf->leafpackopt[1]);CHKERRQ(ierr);}
1541cd620004SJunchao Zhang 
1542cd620004SJunchao Zhang   /* Are root indices for self and remote contiguous? */
1543cd620004SJunchao Zhang   bas->rootbuflen[0] = bas->ioffset[bas->ndiranks];
1544cd620004SJunchao Zhang   bas->rootbuflen[1] = bas->ioffset[bas->niranks] - bas->ioffset[bas->ndiranks];
1545cd620004SJunchao Zhang 
1546cd620004SJunchao Zhang   if (bas->rootbuflen[0]) bas->rootstart[0] = bas->irootloc[0];
1547cd620004SJunchao Zhang   if (bas->rootbuflen[1]) bas->rootstart[1] = bas->irootloc[bas->ioffset[bas->ndiranks]];
1548cd620004SJunchao Zhang 
1549cd620004SJunchao Zhang   for (i=0; i<bas->ioffset[bas->ndiranks]; i++) {
1550cd620004SJunchao Zhang     if (bas->irootloc[i] != bas->rootstart[0]+i) {bas->rootcontig[0] = PETSC_FALSE; break;}
1551cd620004SJunchao Zhang   }
1552cd620004SJunchao Zhang   for (i=bas->ioffset[bas->ndiranks],j=0; i<bas->ioffset[bas->niranks]; i++,j++) {
1553cd620004SJunchao Zhang     if (bas->irootloc[i] != bas->rootstart[1]+j) {bas->rootcontig[1] = PETSC_FALSE; break;}
1554cd620004SJunchao Zhang   }
1555cd620004SJunchao Zhang 
1556cd620004SJunchao Zhang   if (!bas->rootcontig[0]) {ierr = PetscSFCreatePackOpt(bas->ndiranks,              bas->ioffset,               bas->irootloc, &bas->rootpackopt[0]);CHKERRQ(ierr);}
1557cd620004SJunchao Zhang   if (!bas->rootcontig[1]) {ierr = PetscSFCreatePackOpt(bas->niranks-bas->ndiranks, bas->ioffset+bas->ndiranks, bas->irootloc, &bas->rootpackopt[1]);CHKERRQ(ierr);}
1558cd620004SJunchao Zhang 
1559*7fd2d3dbSJunchao Zhang #if defined(PETSC_HAVE_DEVICE)
1560cd620004SJunchao Zhang     /* Check dups in indices so that CUDA unpacking kernels can use cheaper regular instructions instead of atomics when they know there are no data race chances */
1561cd620004SJunchao Zhang   if (!sf->leafcontig[0])  {ierr = PetscCheckDupsInt(sf->leafbuflen[0],  sf->rmine,                                 &sf->leafdups[0]);CHKERRQ(ierr);}
1562cd620004SJunchao Zhang   if (!sf->leafcontig[1])  {ierr = PetscCheckDupsInt(sf->leafbuflen[1],  sf->rmine+sf->roffset[sf->ndranks],        &sf->leafdups[1]);CHKERRQ(ierr);}
1563cd620004SJunchao Zhang   if (!bas->rootcontig[0]) {ierr = PetscCheckDupsInt(bas->rootbuflen[0], bas->irootloc,                             &bas->rootdups[0]);CHKERRQ(ierr);}
1564cd620004SJunchao Zhang   if (!bas->rootcontig[1]) {ierr = PetscCheckDupsInt(bas->rootbuflen[1], bas->irootloc+bas->ioffset[bas->ndiranks], &bas->rootdups[1]);CHKERRQ(ierr);}
1565cd620004SJunchao Zhang #endif
1566cd620004SJunchao Zhang 
1567cd620004SJunchao Zhang   PetscFunctionReturn(0);
1568cd620004SJunchao Zhang }
1569cd620004SJunchao Zhang 
1570cd620004SJunchao Zhang PetscErrorCode PetscSFResetPackFields(PetscSF sf)
1571cd620004SJunchao Zhang {
1572cd620004SJunchao Zhang   PetscErrorCode ierr;
1573cd620004SJunchao Zhang   PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
1574cd620004SJunchao Zhang   PetscInt       i;
1575cd620004SJunchao Zhang 
1576cd620004SJunchao Zhang   PetscFunctionBegin;
1577cd620004SJunchao Zhang   for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) {
1578fcc7397dSJunchao Zhang     ierr = PetscSFDestroyPackOpt(PETSC_MEMTYPE_HOST,&sf->leafpackopt[i]);CHKERRQ(ierr);
1579fcc7397dSJunchao Zhang     ierr = PetscSFDestroyPackOpt(PETSC_MEMTYPE_HOST,&bas->rootpackopt[i]);CHKERRQ(ierr);
1580*7fd2d3dbSJunchao Zhang #if defined(PETSC_HAVE_DEVICE)
1581*7fd2d3dbSJunchao Zhang     ierr = PetscSFDestroyPackOpt(PETSC_MEMTYPE_DEVICE,&sf->leafpackopt_d[i]);CHKERRQ(ierr);
1582fcc7397dSJunchao Zhang     ierr = PetscSFDestroyPackOpt(PETSC_MEMTYPE_DEVICE,&bas->rootpackopt_d[i]);CHKERRQ(ierr);
1583*7fd2d3dbSJunchao Zhang #endif
1584cd620004SJunchao Zhang   }
1585cd620004SJunchao Zhang   PetscFunctionReturn(0);
1586cd620004SJunchao Zhang }
1587