xref: /petsc/src/vec/is/sf/impls/basic/sfpack.c (revision 4a3144197e09a67fc286772231bf9f1535f83912)
140e23c03SJunchao Zhang #include <../src/vec/is/sf/impls/basic/sfpack.h>
240e23c03SJunchao Zhang #include <../src/vec/is/sf/impls/basic/sfbasic.h>
340e23c03SJunchao Zhang 
4eb02082bSJunchao Zhang #if defined(PETSC_HAVE_CUDA)
5eb02082bSJunchao Zhang #include <cuda_runtime.h>
6eb02082bSJunchao Zhang #endif
740e23c03SJunchao Zhang /*
840e23c03SJunchao Zhang  * MPI_Reduce_local is not really useful because it can't handle sparse data and it vectorizes "in the wrong direction",
940e23c03SJunchao Zhang  * therefore we pack data types manually. This file defines packing routines for the standard data types.
1040e23c03SJunchao Zhang  */
1140e23c03SJunchao Zhang 
12cd620004SJunchao Zhang #define CPPJoin4(a,b,c,d)  a##_##b##_##c##_##d
1340e23c03SJunchao Zhang 
14cd620004SJunchao Zhang /* Operations working like s += t */
15cd620004SJunchao Zhang #define OP_BINARY(op,s,t)   do {(s) = (s) op (t);  } while(0)      /* binary ops in the middle such as +, *, && etc. */
16cd620004SJunchao Zhang #define OP_FUNCTION(op,s,t) do {(s) = op((s),(t)); } while(0)      /* ops like a function, such as PetscMax, PetscMin */
17cd620004SJunchao Zhang #define OP_LXOR(op,s,t)     do {(s) = (!(s)) != (!(t));} while(0)  /* logical exclusive OR */
18cd620004SJunchao Zhang #define OP_ASSIGN(op,s,t)   do {(s) = (t);} while(0)
19cd620004SJunchao Zhang /* Ref MPI MAXLOC */
20cd620004SJunchao Zhang #define OP_XLOC(op,s,t) \
21cd620004SJunchao Zhang   do {                                       \
22cd620004SJunchao Zhang     if ((s).u == (t).u) (s).i = PetscMin((s).i,(t).i); \
23cd620004SJunchao Zhang     else if (!((s).u op (t).u)) s = t;           \
24cd620004SJunchao Zhang   } while(0)
2540e23c03SJunchao Zhang 
2640e23c03SJunchao Zhang /* DEF_PackFunc - macro defining a Pack routine
2740e23c03SJunchao Zhang 
2840e23c03SJunchao Zhang    Arguments of the macro:
29b23bfdefSJunchao Zhang    +Type      Type of the basic data in an entry, i.e., int, PetscInt, PetscReal etc. It is not the type of an entry.
30fcc7397dSJunchao Zhang    .BS        Block size for vectorization. It is a factor of bsz.
31b23bfdefSJunchao Zhang    -EQ        (bs == BS) ? 1 : 0. EQ is a compile-time const to help compiler optimizations. See below.
3240e23c03SJunchao Zhang 
3340e23c03SJunchao Zhang    Arguments of the Pack routine:
34cd620004SJunchao Zhang    +count     Number of indices in idx[].
35fcc7397dSJunchao Zhang    .start     When opt and idx are NULL, it means indices are contiguous & start is the first index; otherwise, not used.
36fcc7397dSJunchao Zhang    .opt       Per-pack optimization plan. NULL means no such plan.
37fcc7397dSJunchao Zhang    .idx       Indices of entries to packed.
38eb02082bSJunchao Zhang    .link      Provide a context for the current call, such as link->bs, number of basic types in an entry. Ex. if unit is MPI_2INT, then bs=2 and the basic type is int.
39cd620004SJunchao Zhang    .unpacked  Address of the unpacked data. The entries will be packed are unpacked[idx[i]],for i in [0,count).
40cd620004SJunchao Zhang    -packed    Address of the packed data.
4140e23c03SJunchao Zhang  */
42b23bfdefSJunchao Zhang #define DEF_PackFunc(Type,BS,EQ) \
43fcc7397dSJunchao Zhang   static PetscErrorCode CPPJoin4(Pack,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,const void *unpacked,void *packed) \
44b23bfdefSJunchao Zhang   {                                                                                                          \
4540e23c03SJunchao Zhang     PetscErrorCode ierr;                                                                                     \
46b23bfdefSJunchao Zhang     const Type     *u = (const Type*)unpacked,*u2;                                                           \
47b23bfdefSJunchao Zhang     Type           *p = (Type*)packed,*p2;                                                                   \
48fcc7397dSJunchao Zhang     PetscInt       i,j,k,X,Y,r,bs=link->bs;                                                                  \
49fcc7397dSJunchao Zhang     const PetscInt M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */          \
50b23bfdefSJunchao Zhang     const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */                \
5140e23c03SJunchao Zhang     PetscFunctionBegin;                                                                                      \
52fcc7397dSJunchao Zhang     if (!idx) {ierr = PetscArraycpy(p,u+start*MBS,MBS*count);CHKERRQ(ierr);}/* idx[] are contiguous */       \
53fcc7397dSJunchao Zhang     else if (opt) { /* has optimizations available */                                                        \
54fcc7397dSJunchao Zhang       p2 = p;                                                                                                \
55fcc7397dSJunchao Zhang       for (r=0; r<opt->n; r++) {                                                                             \
56fcc7397dSJunchao Zhang         u2 = u + opt->start[r]*MBS;                                                                          \
57fcc7397dSJunchao Zhang         X  = opt->X[r];                                                                                      \
58fcc7397dSJunchao Zhang         Y  = opt->Y[r];                                                                                      \
59fcc7397dSJunchao Zhang         for (k=0; k<opt->dz[r]; k++)                                                                         \
60fcc7397dSJunchao Zhang           for (j=0; j<opt->dy[r]; j++) {                                                                     \
61fcc7397dSJunchao Zhang             ierr = PetscArraycpy(p2,u2+(X*Y*k+X*j)*MBS,opt->dx[r]*MBS);CHKERRQ(ierr);                        \
62fcc7397dSJunchao Zhang             p2  += opt->dx[r]*MBS;                                                                           \
63fcc7397dSJunchao Zhang           }                                                                                                  \
64fcc7397dSJunchao Zhang       }                                                                                                      \
65fcc7397dSJunchao Zhang     } else {                                                                                                 \
66b23bfdefSJunchao Zhang       for (i=0; i<count; i++)                                                                                \
67eb02082bSJunchao Zhang         for (j=0; j<M; j++)     /* Decent compilers should eliminate this loop when M = const 1 */           \
68eb02082bSJunchao Zhang           for (k=0; k<BS; k++)  /* Compiler either unrolls (BS=1) or vectorizes (BS=2,4,8,etc) this loop */  \
69b23bfdefSJunchao Zhang             p[i*MBS+j*BS+k] = u[idx[i]*MBS+j*BS+k];                                                          \
7040e23c03SJunchao Zhang     }                                                                                                        \
7140e23c03SJunchao Zhang     PetscFunctionReturn(0);                                                                                  \
7240e23c03SJunchao Zhang   }
7340e23c03SJunchao Zhang 
74cd620004SJunchao Zhang /* DEF_Action - macro defining a UnpackAndInsert routine that unpacks data from a contiguous buffer
75cd620004SJunchao Zhang                 and inserts into a sparse array.
7640e23c03SJunchao Zhang 
7740e23c03SJunchao Zhang    Arguments:
78b23bfdefSJunchao Zhang   .Type       Type of the data
7940e23c03SJunchao Zhang   .BS         Block size for vectorization
80b23bfdefSJunchao Zhang   .EQ        (bs == BS) ? 1 : 0. EQ is a compile-time const.
8140e23c03SJunchao Zhang 
8240e23c03SJunchao Zhang   Notes:
8340e23c03SJunchao Zhang    This macro is not combined with DEF_ActionAndOp because we want to use memcpy in this macro.
8440e23c03SJunchao Zhang  */
85cd620004SJunchao Zhang #define DEF_UnpackFunc(Type,BS,EQ)               \
86fcc7397dSJunchao Zhang   static PetscErrorCode CPPJoin4(UnpackAndInsert,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,void *unpacked,const void *packed) \
87b23bfdefSJunchao Zhang   {                                                                                                          \
8840e23c03SJunchao Zhang     PetscErrorCode ierr;                                                                                     \
89b23bfdefSJunchao Zhang     Type           *u = (Type*)unpacked,*u2;                                                                 \
90fcc7397dSJunchao Zhang     const Type     *p = (const Type*)packed;                                                                 \
91fcc7397dSJunchao Zhang     PetscInt       i,j,k,X,Y,r,bs=link->bs;                                                                  \
92fcc7397dSJunchao Zhang     const PetscInt M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */          \
93b23bfdefSJunchao Zhang     const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */                \
9440e23c03SJunchao Zhang     PetscFunctionBegin;                                                                                      \
95b23bfdefSJunchao Zhang     if (!idx) {                                                                                              \
96fcc7397dSJunchao Zhang       u += start*MBS;                                                                                        \
97fcc7397dSJunchao Zhang       if (u != p) {ierr = PetscArraycpy(u,p,count*MBS);CHKERRQ(ierr);}                                       \
98fcc7397dSJunchao Zhang     } else if (opt) { /* has optimizations available */                                                      \
99fcc7397dSJunchao Zhang       for (r=0; r<opt->n; r++) {                                                                             \
100fcc7397dSJunchao Zhang         u2 = u + opt->start[r]*MBS;                                                                          \
101fcc7397dSJunchao Zhang         X  = opt->X[r];                                                                                      \
102fcc7397dSJunchao Zhang         Y  = opt->Y[r];                                                                                      \
103fcc7397dSJunchao Zhang         for (k=0; k<opt->dz[r]; k++)                                                                         \
104fcc7397dSJunchao Zhang           for (j=0; j<opt->dy[r]; j++) {                                                                     \
105fcc7397dSJunchao Zhang             ierr = PetscArraycpy(u2+(X*Y*k+X*j)*MBS,p,opt->dx[r]*MBS);CHKERRQ(ierr);                         \
106fcc7397dSJunchao Zhang             p   += opt->dx[r]*MBS;                                                                           \
107fcc7397dSJunchao Zhang           }                                                                                                  \
108fcc7397dSJunchao Zhang       }                                                                                                      \
109fcc7397dSJunchao Zhang     } else {                                                                                                 \
110b23bfdefSJunchao Zhang       for (i=0; i<count; i++)                                                                                \
111b23bfdefSJunchao Zhang         for (j=0; j<M; j++)                                                                                  \
112cd620004SJunchao Zhang           for (k=0; k<BS; k++) u[idx[i]*MBS+j*BS+k] = p[i*MBS+j*BS+k];                                       \
11340e23c03SJunchao Zhang     }                                                                                                        \
11440e23c03SJunchao Zhang     PetscFunctionReturn(0);                                                                                  \
11540e23c03SJunchao Zhang   }
11640e23c03SJunchao Zhang 
117cd620004SJunchao Zhang /* DEF_UnpackAndOp - macro defining a UnpackAndOp routine where Op should not be Insert
11840e23c03SJunchao Zhang 
11940e23c03SJunchao Zhang    Arguments:
120cd620004SJunchao Zhang   +Opname     Name of the Op, such as Add, Mult, LAND, etc.
121b23bfdefSJunchao Zhang   .Type       Type of the data
12240e23c03SJunchao Zhang   .BS         Block size for vectorization
123b23bfdefSJunchao Zhang   .EQ         (bs == BS) ? 1 : 0. EQ is a compile-time const.
124cd620004SJunchao Zhang   .Op         Operator for the op, such as +, *, &&, ||, PetscMax, PetscMin, etc.
125cd620004SJunchao Zhang   .OpApply    Macro defining application of the op. Could be OP_BINARY, OP_FUNCTION, OP_LXOR
12640e23c03SJunchao Zhang  */
127cd620004SJunchao Zhang #define DEF_UnpackAndOp(Type,BS,EQ,Opname,Op,OpApply) \
128fcc7397dSJunchao Zhang   static PetscErrorCode CPPJoin4(UnpackAnd##Opname,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,void *unpacked,const void *packed) \
129b23bfdefSJunchao Zhang   {                                                                                                          \
130cd620004SJunchao Zhang     Type           *u = (Type*)unpacked,*u2;                                                                 \
131fcc7397dSJunchao Zhang     const Type     *p = (const Type*)packed;                                                                 \
132fcc7397dSJunchao Zhang     PetscInt       i,j,k,X,Y,r,bs=link->bs;                                                                  \
133fcc7397dSJunchao Zhang     const PetscInt M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */          \
134b23bfdefSJunchao Zhang     const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */                \
13540e23c03SJunchao Zhang     PetscFunctionBegin;                                                                                      \
136b23bfdefSJunchao Zhang     if (!idx) {                                                                                              \
137fcc7397dSJunchao Zhang       u += start*MBS;                                                                                        \
138cd620004SJunchao Zhang       for (i=0; i<count; i++)                                                                                \
139cd620004SJunchao Zhang         for (j=0; j<M; j++)                                                                                  \
140cd620004SJunchao Zhang           for (k=0; k<BS; k++)                                                                               \
141cd620004SJunchao Zhang             OpApply(Op,u[i*MBS+j*BS+k],p[i*MBS+j*BS+k]);                                                     \
142fcc7397dSJunchao Zhang     } else if (opt) { /* idx[] has patterns */                                                               \
143fcc7397dSJunchao Zhang       for (r=0; r<opt->n; r++) {                                                                             \
144fcc7397dSJunchao Zhang         u2 = u + opt->start[r]*MBS;                                                                          \
145fcc7397dSJunchao Zhang         X  = opt->X[r];                                                                                      \
146fcc7397dSJunchao Zhang         Y  = opt->Y[r];                                                                                      \
147fcc7397dSJunchao Zhang         for (k=0; k<opt->dz[r]; k++)                                                                         \
148fcc7397dSJunchao Zhang           for (j=0; j<opt->dy[r]; j++) {                                                                     \
149fcc7397dSJunchao Zhang             for (i=0; i<opt->dx[r]*MBS; i++) OpApply(Op,u2[(X*Y*k+X*j)*MBS+i],p[i]);                         \
150fcc7397dSJunchao Zhang             p += opt->dx[r]*MBS;                                                                             \
151fcc7397dSJunchao Zhang           }                                                                                                  \
152fcc7397dSJunchao Zhang       }                                                                                                      \
153fcc7397dSJunchao Zhang     } else {                                                                                                 \
154cd620004SJunchao Zhang       for (i=0; i<count; i++)                                                                                \
155cd620004SJunchao Zhang         for (j=0; j<M; j++)                                                                                  \
156cd620004SJunchao Zhang           for (k=0; k<BS; k++)                                                                               \
157cd620004SJunchao Zhang             OpApply(Op,u[idx[i]*MBS+j*BS+k],p[i*MBS+j*BS+k]);                                                \
158cd620004SJunchao Zhang     }                                                                                                        \
159cd620004SJunchao Zhang     PetscFunctionReturn(0);                                                                                  \
160cd620004SJunchao Zhang   }
161cd620004SJunchao Zhang 
162cd620004SJunchao Zhang #define DEF_FetchAndOp(Type,BS,EQ,Opname,Op,OpApply) \
163fcc7397dSJunchao Zhang   static PetscErrorCode CPPJoin4(FetchAnd##Opname,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,void *unpacked,void *packed) \
164cd620004SJunchao Zhang   {                                                                                                          \
165fcc7397dSJunchao Zhang     Type           *u = (Type*)unpacked,*p = (Type*)packed,tmp;                                              \
166fcc7397dSJunchao Zhang     PetscInt       i,j,k,r,l,bs=link->bs;                                                                    \
167fcc7397dSJunchao Zhang     const PetscInt M = (EQ) ? 1 : bs/BS;                                                                     \
168fcc7397dSJunchao Zhang     const PetscInt MBS = M*BS;                                                                               \
169cd620004SJunchao Zhang     PetscFunctionBegin;                                                                                      \
170fcc7397dSJunchao Zhang     for (i=0; i<count; i++) {                                                                                \
171fcc7397dSJunchao Zhang       r = (!idx ? start+i : idx[i])*MBS;                                                                     \
172fcc7397dSJunchao Zhang       l = i*MBS;                                                                                             \
173b23bfdefSJunchao Zhang       for (j=0; j<M; j++)                                                                                    \
174b23bfdefSJunchao Zhang         for (k=0; k<BS; k++) {                                                                               \
175fcc7397dSJunchao Zhang           tmp = u[r+j*BS+k];                                                                                 \
176fcc7397dSJunchao Zhang           OpApply(Op,u[r+j*BS+k],p[l+j*BS+k]);                                                               \
177fcc7397dSJunchao Zhang           p[l+j*BS+k] = tmp;                                                                                 \
178cd620004SJunchao Zhang         }                                                                                                    \
179cd620004SJunchao Zhang     }                                                                                                        \
180cd620004SJunchao Zhang     PetscFunctionReturn(0);                                                                                  \
181cd620004SJunchao Zhang   }
182cd620004SJunchao Zhang 
183cd620004SJunchao Zhang #define DEF_ScatterAndOp(Type,BS,EQ,Opname,Op,OpApply) \
184fcc7397dSJunchao Zhang   static PetscErrorCode CPPJoin4(ScatterAnd##Opname,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt srcStart,PetscSFPackOpt srcOpt,const PetscInt *srcIdx,const void *src,PetscInt dstStart,PetscSFPackOpt dstOpt,const PetscInt *dstIdx,void *dst) \
185cd620004SJunchao Zhang   {                                                                                                          \
186fcc7397dSJunchao Zhang     PetscErrorCode ierr;                                                                                     \
187fcc7397dSJunchao Zhang     const Type     *u = (const Type*)src;                                                                    \
188fcc7397dSJunchao Zhang     Type           *v = (Type*)dst;                                                                          \
189fcc7397dSJunchao Zhang     PetscInt       i,j,k,s,t,X,Y,bs = link->bs;                                                              \
190cd620004SJunchao Zhang     const PetscInt M = (EQ) ? 1 : bs/BS;                                                                     \
191cd620004SJunchao Zhang     const PetscInt MBS = M*BS;                                                                               \
192cd620004SJunchao Zhang     PetscFunctionBegin;                                                                                      \
193fcc7397dSJunchao Zhang     if (!srcIdx) { /* src is contiguous */                                                                   \
194fcc7397dSJunchao Zhang       u += srcStart*MBS;                                                                                     \
195fcc7397dSJunchao Zhang       ierr = CPPJoin4(UnpackAnd##Opname,Type,BS,EQ)(link,count,dstStart,dstOpt,dstIdx,dst,u);CHKERRQ(ierr);  \
196fcc7397dSJunchao Zhang     } else if (srcOpt && !dstIdx) { /* src is 3D, dst is contiguous */                                       \
197fcc7397dSJunchao Zhang       u += srcOpt->start[0]*MBS;                                                                             \
198fcc7397dSJunchao Zhang       v += dstStart*MBS;                                                                                     \
199fcc7397dSJunchao Zhang       X  = srcOpt->X[0]; Y = srcOpt->Y[0];                                                                   \
200fcc7397dSJunchao Zhang       for (k=0; k<srcOpt->dz[0]; k++)                                                                        \
201fcc7397dSJunchao Zhang         for (j=0; j<srcOpt->dy[0]; j++) {                                                                    \
202fcc7397dSJunchao Zhang           for (i=0; i<srcOpt->dx[0]*MBS; i++) OpApply(Op,v[i],u[(X*Y*k+X*j)*MBS+i]);                         \
203fcc7397dSJunchao Zhang           v += srcOpt->dx[0]*MBS;                                                                            \
204fcc7397dSJunchao Zhang         }                                                                                                    \
205fcc7397dSJunchao Zhang     } else { /* all other cases */                                                                           \
206fcc7397dSJunchao Zhang       for (i=0; i<count; i++) {                                                                              \
207fcc7397dSJunchao Zhang         s = (!srcIdx ? srcStart+i : srcIdx[i])*MBS;                                                          \
208fcc7397dSJunchao Zhang         t = (!dstIdx ? dstStart+i : dstIdx[i])*MBS;                                                          \
209cd620004SJunchao Zhang         for (j=0; j<M; j++)                                                                                  \
210fcc7397dSJunchao Zhang           for (k=0; k<BS; k++) OpApply(Op,v[t+j*BS+k],u[s+j*BS+k]);                                          \
211fcc7397dSJunchao Zhang       }                                                                                                      \
212cd620004SJunchao Zhang     }                                                                                                        \
213cd620004SJunchao Zhang     PetscFunctionReturn(0);                                                                                  \
214cd620004SJunchao Zhang   }
215cd620004SJunchao Zhang 
216cd620004SJunchao Zhang #define DEF_FetchAndOpLocal(Type,BS,EQ,Opname,Op,OpApply) \
217fcc7397dSJunchao Zhang   static PetscErrorCode CPPJoin4(FetchAnd##Opname##Local,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt rootstart,PetscSFPackOpt rootopt,const PetscInt *rootidx,void *rootdata,PetscInt leafstart,PetscSFPackOpt leafopt,const PetscInt *leafidx,const void *leafdata,void *leafupdate) \
218cd620004SJunchao Zhang   {                                                                                                          \
219fcc7397dSJunchao Zhang     Type           *rdata = (Type*)rootdata,*lupdate = (Type*)leafupdate;                                    \
220fcc7397dSJunchao Zhang     const Type     *ldata = (const Type*)leafdata;                                                           \
221fcc7397dSJunchao Zhang     PetscInt       i,j,k,r,l,bs = link->bs;                                                                  \
222cd620004SJunchao Zhang     const PetscInt M = (EQ) ? 1 : bs/BS;                                                                     \
223cd620004SJunchao Zhang     const PetscInt MBS = M*BS;                                                                               \
224cd620004SJunchao Zhang     PetscFunctionBegin;                                                                                      \
225fcc7397dSJunchao Zhang     for (i=0; i<count; i++) {                                                                                \
226fcc7397dSJunchao Zhang       r = (rootidx ? rootidx[i] : rootstart+i)*MBS;                                                          \
227fcc7397dSJunchao Zhang       l = (leafidx ? leafidx[i] : leafstart+i)*MBS;                                                          \
228cd620004SJunchao Zhang       for (j=0; j<M; j++)                                                                                    \
229cd620004SJunchao Zhang         for (k=0; k<BS; k++) {                                                                               \
230fcc7397dSJunchao Zhang           lupdate[l+j*BS+k] = rdata[r+j*BS+k];                                                               \
231fcc7397dSJunchao Zhang           OpApply(Op,rdata[r+j*BS+k],ldata[l+j*BS+k]);                                                       \
23240e23c03SJunchao Zhang         }                                                                                                    \
23340e23c03SJunchao Zhang     }                                                                                                        \
23440e23c03SJunchao Zhang     PetscFunctionReturn(0);                                                                                  \
23540e23c03SJunchao Zhang   }
23640e23c03SJunchao Zhang 
237b23bfdefSJunchao Zhang /* Pack, Unpack/Fetch ops */
238b23bfdefSJunchao Zhang #define DEF_Pack(Type,BS,EQ)                                                      \
239b23bfdefSJunchao Zhang   DEF_PackFunc(Type,BS,EQ)                                                        \
240cd620004SJunchao Zhang   DEF_UnpackFunc(Type,BS,EQ)                                                      \
241cd620004SJunchao Zhang   DEF_ScatterAndOp(Type,BS,EQ,Insert,=,OP_ASSIGN)                                 \
242cd620004SJunchao Zhang   static void CPPJoin4(PackInit_Pack,Type,BS,EQ)(PetscSFLink link) {              \
243eb02082bSJunchao Zhang     link->h_Pack            = CPPJoin4(Pack,           Type,BS,EQ);               \
244eb02082bSJunchao Zhang     link->h_UnpackAndInsert = CPPJoin4(UnpackAndInsert,Type,BS,EQ);               \
245cd620004SJunchao Zhang     link->h_ScatterAndInsert= CPPJoin4(ScatterAndInsert,Type,BS,EQ);              \
24640e23c03SJunchao Zhang   }
24740e23c03SJunchao Zhang 
248b23bfdefSJunchao Zhang /* Add, Mult ops */
249b23bfdefSJunchao Zhang #define DEF_Add(Type,BS,EQ)                                                       \
250cd620004SJunchao Zhang   DEF_UnpackAndOp    (Type,BS,EQ,Add, +,OP_BINARY)                                \
251cd620004SJunchao Zhang   DEF_UnpackAndOp    (Type,BS,EQ,Mult,*,OP_BINARY)                                \
252cd620004SJunchao Zhang   DEF_FetchAndOp     (Type,BS,EQ,Add, +,OP_BINARY)                                \
253cd620004SJunchao Zhang   DEF_ScatterAndOp   (Type,BS,EQ,Add, +,OP_BINARY)                                \
254cd620004SJunchao Zhang   DEF_ScatterAndOp   (Type,BS,EQ,Mult,*,OP_BINARY)                                \
255cd620004SJunchao Zhang   DEF_FetchAndOpLocal(Type,BS,EQ,Add, +,OP_BINARY)                                \
256cd620004SJunchao Zhang   static void CPPJoin4(PackInit_Add,Type,BS,EQ)(PetscSFLink link) {               \
257eb02082bSJunchao Zhang     link->h_UnpackAndAdd     = CPPJoin4(UnpackAndAdd,    Type,BS,EQ);             \
258eb02082bSJunchao Zhang     link->h_UnpackAndMult    = CPPJoin4(UnpackAndMult,   Type,BS,EQ);             \
259eb02082bSJunchao Zhang     link->h_FetchAndAdd      = CPPJoin4(FetchAndAdd,     Type,BS,EQ);             \
260cd620004SJunchao Zhang     link->h_ScatterAndAdd    = CPPJoin4(ScatterAndAdd,   Type,BS,EQ);             \
261cd620004SJunchao Zhang     link->h_ScatterAndMult   = CPPJoin4(ScatterAndMult,  Type,BS,EQ);             \
262cd620004SJunchao Zhang     link->h_FetchAndAddLocal = CPPJoin4(FetchAndAddLocal,Type,BS,EQ);             \
26340e23c03SJunchao Zhang   }
26440e23c03SJunchao Zhang 
265b23bfdefSJunchao Zhang /* Max, Min ops */
266b23bfdefSJunchao Zhang #define DEF_Cmp(Type,BS,EQ)                                                       \
267cd620004SJunchao Zhang   DEF_UnpackAndOp (Type,BS,EQ,Max,PetscMax,OP_FUNCTION)                           \
268cd620004SJunchao Zhang   DEF_UnpackAndOp (Type,BS,EQ,Min,PetscMin,OP_FUNCTION)                           \
269cd620004SJunchao Zhang   DEF_ScatterAndOp(Type,BS,EQ,Max,PetscMax,OP_FUNCTION)                           \
270cd620004SJunchao Zhang   DEF_ScatterAndOp(Type,BS,EQ,Min,PetscMin,OP_FUNCTION)                           \
271cd620004SJunchao Zhang   static void CPPJoin4(PackInit_Compare,Type,BS,EQ)(PetscSFLink link) {           \
272eb02082bSJunchao Zhang     link->h_UnpackAndMax    = CPPJoin4(UnpackAndMax,   Type,BS,EQ);               \
273eb02082bSJunchao Zhang     link->h_UnpackAndMin    = CPPJoin4(UnpackAndMin,   Type,BS,EQ);               \
274cd620004SJunchao Zhang     link->h_ScatterAndMax   = CPPJoin4(ScatterAndMax,  Type,BS,EQ);               \
275cd620004SJunchao Zhang     link->h_ScatterAndMin   = CPPJoin4(ScatterAndMin,  Type,BS,EQ);               \
276b23bfdefSJunchao Zhang   }
277b23bfdefSJunchao Zhang 
278b23bfdefSJunchao Zhang /* Logical ops.
279cd620004SJunchao Zhang   The operator in OP_LXOR should be empty but is ||. It is not used. Put here to avoid
28040e23c03SJunchao Zhang   the compilation warning "empty macro arguments are undefined in ISO C90"
28140e23c03SJunchao Zhang  */
282b23bfdefSJunchao Zhang #define DEF_Log(Type,BS,EQ)                                                       \
283cd620004SJunchao Zhang   DEF_UnpackAndOp (Type,BS,EQ,LAND,&&,OP_BINARY)                                  \
284cd620004SJunchao Zhang   DEF_UnpackAndOp (Type,BS,EQ,LOR, ||,OP_BINARY)                                  \
285cd620004SJunchao Zhang   DEF_UnpackAndOp (Type,BS,EQ,LXOR,||, OP_LXOR)                                   \
286cd620004SJunchao Zhang   DEF_ScatterAndOp(Type,BS,EQ,LAND,&&,OP_BINARY)                                  \
287cd620004SJunchao Zhang   DEF_ScatterAndOp(Type,BS,EQ,LOR, ||,OP_BINARY)                                  \
288cd620004SJunchao Zhang   DEF_ScatterAndOp(Type,BS,EQ,LXOR,||, OP_LXOR)                                   \
289cd620004SJunchao Zhang   static void CPPJoin4(PackInit_Logical,Type,BS,EQ)(PetscSFLink link) {           \
290eb02082bSJunchao Zhang     link->h_UnpackAndLAND   = CPPJoin4(UnpackAndLAND, Type,BS,EQ);                \
291eb02082bSJunchao Zhang     link->h_UnpackAndLOR    = CPPJoin4(UnpackAndLOR,  Type,BS,EQ);                \
292eb02082bSJunchao Zhang     link->h_UnpackAndLXOR   = CPPJoin4(UnpackAndLXOR, Type,BS,EQ);                \
293cd620004SJunchao Zhang     link->h_ScatterAndLAND  = CPPJoin4(ScatterAndLAND,Type,BS,EQ);                \
294cd620004SJunchao Zhang     link->h_ScatterAndLOR   = CPPJoin4(ScatterAndLOR, Type,BS,EQ);                \
295cd620004SJunchao Zhang     link->h_ScatterAndLXOR  = CPPJoin4(ScatterAndLXOR,Type,BS,EQ);                \
29640e23c03SJunchao Zhang   }
29740e23c03SJunchao Zhang 
298b23bfdefSJunchao Zhang /* Bitwise ops */
299b23bfdefSJunchao Zhang #define DEF_Bit(Type,BS,EQ)                                                       \
300cd620004SJunchao Zhang   DEF_UnpackAndOp (Type,BS,EQ,BAND,&,OP_BINARY)                                   \
301cd620004SJunchao Zhang   DEF_UnpackAndOp (Type,BS,EQ,BOR, |,OP_BINARY)                                   \
302cd620004SJunchao Zhang   DEF_UnpackAndOp (Type,BS,EQ,BXOR,^,OP_BINARY)                                   \
303cd620004SJunchao Zhang   DEF_ScatterAndOp(Type,BS,EQ,BAND,&,OP_BINARY)                                   \
304cd620004SJunchao Zhang   DEF_ScatterAndOp(Type,BS,EQ,BOR, |,OP_BINARY)                                   \
305cd620004SJunchao Zhang   DEF_ScatterAndOp(Type,BS,EQ,BXOR,^,OP_BINARY)                                   \
306cd620004SJunchao Zhang   static void CPPJoin4(PackInit_Bitwise,Type,BS,EQ)(PetscSFLink link) {           \
307eb02082bSJunchao Zhang     link->h_UnpackAndBAND   = CPPJoin4(UnpackAndBAND, Type,BS,EQ);                \
308eb02082bSJunchao Zhang     link->h_UnpackAndBOR    = CPPJoin4(UnpackAndBOR,  Type,BS,EQ);                \
309eb02082bSJunchao Zhang     link->h_UnpackAndBXOR   = CPPJoin4(UnpackAndBXOR, Type,BS,EQ);                \
310cd620004SJunchao Zhang     link->h_ScatterAndBAND  = CPPJoin4(ScatterAndBAND,Type,BS,EQ);                \
311cd620004SJunchao Zhang     link->h_ScatterAndBOR   = CPPJoin4(ScatterAndBOR, Type,BS,EQ);                \
312cd620004SJunchao Zhang     link->h_ScatterAndBXOR  = CPPJoin4(ScatterAndBXOR,Type,BS,EQ);                \
31340e23c03SJunchao Zhang   }
31440e23c03SJunchao Zhang 
315cd620004SJunchao Zhang /* Maxloc, Minloc ops */
316cd620004SJunchao Zhang #define DEF_Xloc(Type,BS,EQ)                                                      \
317cd620004SJunchao Zhang   DEF_UnpackAndOp (Type,BS,EQ,Max,>,OP_XLOC)                                      \
318cd620004SJunchao Zhang   DEF_UnpackAndOp (Type,BS,EQ,Min,<,OP_XLOC)                                      \
319cd620004SJunchao Zhang   DEF_ScatterAndOp(Type,BS,EQ,Max,>,OP_XLOC)                                      \
320cd620004SJunchao Zhang   DEF_ScatterAndOp(Type,BS,EQ,Min,<,OP_XLOC)                                      \
321cd620004SJunchao Zhang   static void CPPJoin4(PackInit_Xloc,Type,BS,EQ)(PetscSFLink link) {              \
322cd620004SJunchao Zhang     link->h_UnpackAndMaxloc  = CPPJoin4(UnpackAndMax, Type,BS,EQ);                \
323cd620004SJunchao Zhang     link->h_UnpackAndMinloc  = CPPJoin4(UnpackAndMin, Type,BS,EQ);                \
324cd620004SJunchao Zhang     link->h_ScatterAndMaxloc = CPPJoin4(ScatterAndMax,Type,BS,EQ);                \
325cd620004SJunchao Zhang     link->h_ScatterAndMinloc = CPPJoin4(ScatterAndMin,Type,BS,EQ);                \
32640e23c03SJunchao Zhang   }
32740e23c03SJunchao Zhang 
328b23bfdefSJunchao Zhang #define DEF_IntegerType(Type,BS,EQ)                                               \
329b23bfdefSJunchao Zhang   DEF_Pack(Type,BS,EQ)                                                            \
330b23bfdefSJunchao Zhang   DEF_Add(Type,BS,EQ)                                                             \
331b23bfdefSJunchao Zhang   DEF_Cmp(Type,BS,EQ)                                                             \
332b23bfdefSJunchao Zhang   DEF_Log(Type,BS,EQ)                                                             \
333b23bfdefSJunchao Zhang   DEF_Bit(Type,BS,EQ)                                                             \
334cd620004SJunchao Zhang   static void CPPJoin4(PackInit_IntegerType,Type,BS,EQ)(PetscSFLink link) {       \
335b23bfdefSJunchao Zhang     CPPJoin4(PackInit_Pack,Type,BS,EQ)(link);                                     \
336b23bfdefSJunchao Zhang     CPPJoin4(PackInit_Add,Type,BS,EQ)(link);                                      \
337b23bfdefSJunchao Zhang     CPPJoin4(PackInit_Compare,Type,BS,EQ)(link);                                  \
338b23bfdefSJunchao Zhang     CPPJoin4(PackInit_Logical,Type,BS,EQ)(link);                                  \
339b23bfdefSJunchao Zhang     CPPJoin4(PackInit_Bitwise,Type,BS,EQ)(link);                                  \
34040e23c03SJunchao Zhang   }
34140e23c03SJunchao Zhang 
342b23bfdefSJunchao Zhang #define DEF_RealType(Type,BS,EQ)                                                  \
343b23bfdefSJunchao Zhang   DEF_Pack(Type,BS,EQ)                                                            \
344b23bfdefSJunchao Zhang   DEF_Add(Type,BS,EQ)                                                             \
345b23bfdefSJunchao Zhang   DEF_Cmp(Type,BS,EQ)                                                             \
346cd620004SJunchao Zhang   static void CPPJoin4(PackInit_RealType,Type,BS,EQ)(PetscSFLink link) {          \
347b23bfdefSJunchao Zhang     CPPJoin4(PackInit_Pack,Type,BS,EQ)(link);                                     \
348b23bfdefSJunchao Zhang     CPPJoin4(PackInit_Add,Type,BS,EQ)(link);                                      \
349b23bfdefSJunchao Zhang     CPPJoin4(PackInit_Compare,Type,BS,EQ)(link);                                  \
350b23bfdefSJunchao Zhang   }
35140e23c03SJunchao Zhang 
35240e23c03SJunchao Zhang #if defined(PETSC_HAVE_COMPLEX)
353b23bfdefSJunchao Zhang #define DEF_ComplexType(Type,BS,EQ)                                               \
354b23bfdefSJunchao Zhang   DEF_Pack(Type,BS,EQ)                                                            \
355b23bfdefSJunchao Zhang   DEF_Add(Type,BS,EQ)                                                             \
356cd620004SJunchao Zhang   static void CPPJoin4(PackInit_ComplexType,Type,BS,EQ)(PetscSFLink link) {       \
357b23bfdefSJunchao Zhang     CPPJoin4(PackInit_Pack,Type,BS,EQ)(link);                                     \
358b23bfdefSJunchao Zhang     CPPJoin4(PackInit_Add,Type,BS,EQ)(link);                                      \
359b23bfdefSJunchao Zhang   }
36040e23c03SJunchao Zhang #endif
361b23bfdefSJunchao Zhang 
362b23bfdefSJunchao Zhang #define DEF_DumbType(Type,BS,EQ)                                                  \
363b23bfdefSJunchao Zhang   DEF_Pack(Type,BS,EQ)                                                            \
364cd620004SJunchao Zhang   static void CPPJoin4(PackInit_DumbType,Type,BS,EQ)(PetscSFLink link) {          \
365b23bfdefSJunchao Zhang     CPPJoin4(PackInit_Pack,Type,BS,EQ)(link);                                     \
366b23bfdefSJunchao Zhang   }
367b23bfdefSJunchao Zhang 
368b23bfdefSJunchao Zhang /* Maxloc, Minloc */
369cd620004SJunchao Zhang #define DEF_PairType(Type,BS,EQ)                                                  \
370cd620004SJunchao Zhang   DEF_Pack(Type,BS,EQ)                                                            \
371cd620004SJunchao Zhang   DEF_Xloc(Type,BS,EQ)                                                            \
372cd620004SJunchao Zhang   static void CPPJoin4(PackInit_PairType,Type,BS,EQ)(PetscSFLink link) {          \
373cd620004SJunchao Zhang     CPPJoin4(PackInit_Pack,Type,BS,EQ)(link);                                     \
374cd620004SJunchao Zhang     CPPJoin4(PackInit_Xloc,Type,BS,EQ)(link);                                     \
375b23bfdefSJunchao Zhang   }
376b23bfdefSJunchao Zhang 
377b23bfdefSJunchao Zhang DEF_IntegerType(PetscInt,1,1) /* unit = 1 MPIU_INT  */
378b23bfdefSJunchao Zhang DEF_IntegerType(PetscInt,2,1) /* unit = 2 MPIU_INTs */
379b23bfdefSJunchao Zhang DEF_IntegerType(PetscInt,4,1) /* unit = 4 MPIU_INTs */
380b23bfdefSJunchao Zhang DEF_IntegerType(PetscInt,8,1) /* unit = 8 MPIU_INTs */
381b23bfdefSJunchao Zhang DEF_IntegerType(PetscInt,1,0) /* unit = 1*n MPIU_INTs, n>1 */
382b23bfdefSJunchao Zhang DEF_IntegerType(PetscInt,2,0) /* unit = 2*n MPIU_INTs, n>1 */
383b23bfdefSJunchao Zhang DEF_IntegerType(PetscInt,4,0) /* unit = 4*n MPIU_INTs, n>1 */
384b23bfdefSJunchao Zhang DEF_IntegerType(PetscInt,8,0) /* unit = 8*n MPIU_INTs, n>1. Routines with bigger BS are tried first. */
385b23bfdefSJunchao Zhang 
386b23bfdefSJunchao Zhang #if defined(PETSC_USE_64BIT_INDICES) /* Do not need (though it is OK) to generate redundant functions if PetscInt is int */
387b23bfdefSJunchao Zhang DEF_IntegerType(int,1,1)
388b23bfdefSJunchao Zhang DEF_IntegerType(int,2,1)
389b23bfdefSJunchao Zhang DEF_IntegerType(int,4,1)
390b23bfdefSJunchao Zhang DEF_IntegerType(int,8,1)
391b23bfdefSJunchao Zhang DEF_IntegerType(int,1,0)
392b23bfdefSJunchao Zhang DEF_IntegerType(int,2,0)
393b23bfdefSJunchao Zhang DEF_IntegerType(int,4,0)
394b23bfdefSJunchao Zhang DEF_IntegerType(int,8,0)
395b23bfdefSJunchao Zhang #endif
396b23bfdefSJunchao Zhang 
397b23bfdefSJunchao Zhang /* The typedefs are used to get a typename without space that CPPJoin can handle */
398b23bfdefSJunchao Zhang typedef signed char SignedChar;
399b23bfdefSJunchao Zhang DEF_IntegerType(SignedChar,1,1)
400b23bfdefSJunchao Zhang DEF_IntegerType(SignedChar,2,1)
401b23bfdefSJunchao Zhang DEF_IntegerType(SignedChar,4,1)
402b23bfdefSJunchao Zhang DEF_IntegerType(SignedChar,8,1)
403b23bfdefSJunchao Zhang DEF_IntegerType(SignedChar,1,0)
404b23bfdefSJunchao Zhang DEF_IntegerType(SignedChar,2,0)
405b23bfdefSJunchao Zhang DEF_IntegerType(SignedChar,4,0)
406b23bfdefSJunchao Zhang DEF_IntegerType(SignedChar,8,0)
407b23bfdefSJunchao Zhang 
408b23bfdefSJunchao Zhang typedef unsigned char UnsignedChar;
409b23bfdefSJunchao Zhang DEF_IntegerType(UnsignedChar,1,1)
410b23bfdefSJunchao Zhang DEF_IntegerType(UnsignedChar,2,1)
411b23bfdefSJunchao Zhang DEF_IntegerType(UnsignedChar,4,1)
412b23bfdefSJunchao Zhang DEF_IntegerType(UnsignedChar,8,1)
413b23bfdefSJunchao Zhang DEF_IntegerType(UnsignedChar,1,0)
414b23bfdefSJunchao Zhang DEF_IntegerType(UnsignedChar,2,0)
415b23bfdefSJunchao Zhang DEF_IntegerType(UnsignedChar,4,0)
416b23bfdefSJunchao Zhang DEF_IntegerType(UnsignedChar,8,0)
417b23bfdefSJunchao Zhang 
418b23bfdefSJunchao Zhang DEF_RealType(PetscReal,1,1)
419b23bfdefSJunchao Zhang DEF_RealType(PetscReal,2,1)
420b23bfdefSJunchao Zhang DEF_RealType(PetscReal,4,1)
421b23bfdefSJunchao Zhang DEF_RealType(PetscReal,8,1)
422b23bfdefSJunchao Zhang DEF_RealType(PetscReal,1,0)
423b23bfdefSJunchao Zhang DEF_RealType(PetscReal,2,0)
424b23bfdefSJunchao Zhang DEF_RealType(PetscReal,4,0)
425b23bfdefSJunchao Zhang DEF_RealType(PetscReal,8,0)
426b23bfdefSJunchao Zhang 
427b23bfdefSJunchao Zhang #if defined(PETSC_HAVE_COMPLEX)
428b23bfdefSJunchao Zhang DEF_ComplexType(PetscComplex,1,1)
429b23bfdefSJunchao Zhang DEF_ComplexType(PetscComplex,2,1)
430b23bfdefSJunchao Zhang DEF_ComplexType(PetscComplex,4,1)
431b23bfdefSJunchao Zhang DEF_ComplexType(PetscComplex,8,1)
432b23bfdefSJunchao Zhang DEF_ComplexType(PetscComplex,1,0)
433b23bfdefSJunchao Zhang DEF_ComplexType(PetscComplex,2,0)
434b23bfdefSJunchao Zhang DEF_ComplexType(PetscComplex,4,0)
435b23bfdefSJunchao Zhang DEF_ComplexType(PetscComplex,8,0)
436b23bfdefSJunchao Zhang #endif
437b23bfdefSJunchao Zhang 
438cd620004SJunchao Zhang #define PairType(Type1,Type2) Type1##_##Type2
439cd620004SJunchao Zhang typedef struct {int u; int i;}           PairType(int,int);
440cd620004SJunchao Zhang typedef struct {PetscInt u; PetscInt i;} PairType(PetscInt,PetscInt);
441cd620004SJunchao Zhang DEF_PairType(PairType(int,int),1,1)
442cd620004SJunchao Zhang DEF_PairType(PairType(PetscInt,PetscInt),1,1)
443b23bfdefSJunchao Zhang 
444b23bfdefSJunchao Zhang /* If we don't know the basic type, we treat it as a stream of chars or ints */
445b23bfdefSJunchao Zhang DEF_DumbType(char,1,1)
446b23bfdefSJunchao Zhang DEF_DumbType(char,2,1)
447b23bfdefSJunchao Zhang DEF_DumbType(char,4,1)
448b23bfdefSJunchao Zhang DEF_DumbType(char,1,0)
449b23bfdefSJunchao Zhang DEF_DumbType(char,2,0)
450b23bfdefSJunchao Zhang DEF_DumbType(char,4,0)
451b23bfdefSJunchao Zhang 
452eb02082bSJunchao Zhang typedef int DumbInt; /* To have a different name than 'int' used above. The name is used to make routine names. */
453b23bfdefSJunchao Zhang DEF_DumbType(DumbInt,1,1)
454b23bfdefSJunchao Zhang DEF_DumbType(DumbInt,2,1)
455b23bfdefSJunchao Zhang DEF_DumbType(DumbInt,4,1)
456b23bfdefSJunchao Zhang DEF_DumbType(DumbInt,8,1)
457b23bfdefSJunchao Zhang DEF_DumbType(DumbInt,1,0)
458b23bfdefSJunchao Zhang DEF_DumbType(DumbInt,2,0)
459b23bfdefSJunchao Zhang DEF_DumbType(DumbInt,4,0)
460b23bfdefSJunchao Zhang DEF_DumbType(DumbInt,8,0)
46140e23c03SJunchao Zhang 
46240e23c03SJunchao Zhang #if !defined(PETSC_HAVE_MPI_TYPE_DUP)
46340e23c03SJunchao Zhang PETSC_STATIC_INLINE int MPI_Type_dup(MPI_Datatype datatype,MPI_Datatype *newtype)
46440e23c03SJunchao Zhang {
46540e23c03SJunchao Zhang   int ierr;
46640e23c03SJunchao Zhang   ierr = MPI_Type_contiguous(1,datatype,newtype); if (ierr) return ierr;
46740e23c03SJunchao Zhang   ierr = MPI_Type_commit(newtype); if (ierr) return ierr;
46840e23c03SJunchao Zhang   return MPI_SUCCESS;
46940e23c03SJunchao Zhang }
47040e23c03SJunchao Zhang #endif
47140e23c03SJunchao Zhang 
472cd620004SJunchao Zhang /*
473cd620004SJunchao Zhang    The routine Creates a communication link for the given operation. It first looks up its link cache. If
474cd620004SJunchao Zhang    there is a free & suitable one, it uses it. Otherwise it creates a new one.
475cd620004SJunchao Zhang 
476cd620004SJunchao Zhang    A link contains buffers and MPI requests for send/recv. It also contains pack/unpack routines to pack/unpack
477cd620004SJunchao Zhang    root/leafdata to/from these buffers. Buffers are allocated at our discretion. When we find root/leafata
478cd620004SJunchao Zhang    can be directly passed to MPI, we won't allocate them. Even we allocate buffers, we only allocate
479cd620004SJunchao Zhang    those that are needed by the given `sfop` and `op`, in other words, we do lazy memory-allocation.
480cd620004SJunchao Zhang 
481cd620004SJunchao Zhang    The routine also allocates buffers on CPU when one does not use gpu-aware MPI but data is on GPU.
482cd620004SJunchao Zhang 
483cd620004SJunchao Zhang    In SFBasic, MPI requests are persistent. They are init'ed until we try to get requests from a link.
484cd620004SJunchao Zhang 
485cd620004SJunchao Zhang    The routine is shared by SFBasic and SFNeighbor based on the fact they all deal with sparse graphs and
486cd620004SJunchao Zhang    need pack/unpack data.
487cd620004SJunchao Zhang */
488cd620004SJunchao Zhang PetscErrorCode PetscSFLinkCreate(PetscSF sf,MPI_Datatype unit,PetscMemType rootmtype,const void *rootdata,PetscMemType leafmtype,const void *leafdata,MPI_Op op,PetscSFOperation sfop,PetscSFLink *mylink)
48940e23c03SJunchao Zhang {
49040e23c03SJunchao Zhang   PetscErrorCode    ierr;
491cd620004SJunchao Zhang   PetscSF_Basic     *bas = (PetscSF_Basic*)sf->data;
492cd620004SJunchao Zhang   PetscInt          i,j,k,nrootreqs,nleafreqs,nreqs;
493cd620004SJunchao Zhang   PetscSFLink       *p,link;
494cd620004SJunchao Zhang   PetscSFDirection  direction;
495cd620004SJunchao Zhang   MPI_Request       *reqs = NULL;
496cd620004SJunchao Zhang   PetscBool         match,rootdirect[2],leafdirect[2];
497cd620004SJunchao Zhang   PetscMemType      rootmtype_mpi,leafmtype_mpi;   /* mtypes seen by MPI */
498cd620004SJunchao Zhang   PetscInt          rootdirect_mpi,leafdirect_mpi; /* root/leafdirect seen by MPI*/
499cd620004SJunchao Zhang 
500cd620004SJunchao Zhang   PetscFunctionBegin;
501cd620004SJunchao Zhang   ierr = PetscSFSetErrorOnUnsupportedOverlap(sf,unit,rootdata,leafdata);CHKERRQ(ierr);
502cd620004SJunchao Zhang 
503cd620004SJunchao Zhang   /* Can we directly use root/leafdirect with the given sf, sfop and op? */
504cd620004SJunchao Zhang   for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) {
505cd620004SJunchao Zhang     if (sfop == PETSCSF_BCAST) {
506cd620004SJunchao Zhang       rootdirect[i] = bas->rootcontig[i]; /* Pack roots */
507cd620004SJunchao Zhang       leafdirect[i] = (sf->leafcontig[i] && op == MPIU_REPLACE) ? PETSC_TRUE : PETSC_FALSE;  /* Unpack leaves */
508cd620004SJunchao Zhang     } else if (sfop == PETSCSF_REDUCE) {
509cd620004SJunchao Zhang       leafdirect[i] = sf->leafcontig[i];  /* Pack leaves */
510cd620004SJunchao Zhang       rootdirect[i] = (bas->rootcontig[i] && op == MPIU_REPLACE) ? PETSC_TRUE : PETSC_FALSE; /* Unpack roots */
511cd620004SJunchao Zhang     } else { /* PETSCSF_FETCH */
512cd620004SJunchao Zhang       rootdirect[i] = PETSC_FALSE; /* FETCH always need a separate rootbuf */
513cd620004SJunchao Zhang       leafdirect[i] = PETSC_FALSE; /* We also force allocating a separate leafbuf so that leafdata and leafupdate can share mpi requests */
514cd620004SJunchao Zhang     }
515cd620004SJunchao Zhang   }
516cd620004SJunchao Zhang 
517c2a741eeSJunchao Zhang   if (sf->use_gpu_aware_mpi) {
518cd620004SJunchao Zhang     rootmtype_mpi = rootmtype;
519cd620004SJunchao Zhang     leafmtype_mpi = leafmtype;
520cd620004SJunchao Zhang   } else {
521cd620004SJunchao Zhang     rootmtype_mpi = leafmtype_mpi = PETSC_MEMTYPE_HOST;
522cd620004SJunchao Zhang   }
523cd620004SJunchao Zhang   /* Will root/leafdata be directly accessed by MPI?  Without use_gpu_aware_mpi, device data is bufferred on host and then passed to MPI */
524cd620004SJunchao Zhang   rootdirect_mpi = rootdirect[PETSCSF_REMOTE] && (rootmtype_mpi == rootmtype)? 1 : 0;
525cd620004SJunchao Zhang   leafdirect_mpi = leafdirect[PETSCSF_REMOTE] && (leafmtype_mpi == leafmtype)? 1 : 0;
526cd620004SJunchao Zhang 
527cd620004SJunchao Zhang   direction = (sfop == PETSCSF_BCAST)? PETSCSF_ROOT2LEAF : PETSCSF_LEAF2ROOT;
528cd620004SJunchao Zhang   nrootreqs = bas->nrootreqs;
529cd620004SJunchao Zhang   nleafreqs = sf->nleafreqs;
530cd620004SJunchao Zhang 
531cd620004SJunchao Zhang   /* Look for free links in cache */
532cd620004SJunchao Zhang   for (p=&bas->avail; (link=*p); p=&link->next) {
533cd620004SJunchao Zhang     ierr = MPIPetsc_Type_compare(unit,link->unit,&match);CHKERRQ(ierr);
534cd620004SJunchao Zhang     if (match) {
535cd620004SJunchao Zhang       /* If root/leafdata will be directly passed to MPI, test if the data used to initialized the MPI requests matches with current.
536cd620004SJunchao Zhang          If not, free old requests. New requests will be lazily init'ed until one calls PetscSFLinkGetMPIBuffersAndRequests().
537cd620004SJunchao Zhang       */
538cd620004SJunchao Zhang       if (rootdirect_mpi && sf->persistent && link->rootreqsinited[direction][rootmtype][1] && link->rootdatadirect[direction][rootmtype] != rootdata) {
539cd620004SJunchao Zhang         reqs = link->rootreqs[direction][rootmtype][1]; /* Here, rootmtype = rootmtype_mpi */
540cd620004SJunchao Zhang         for (i=0; i<nrootreqs; i++) {if (reqs[i] != MPI_REQUEST_NULL) {ierr = MPI_Request_free(&reqs[i]);CHKERRQ(ierr);}}
541cd620004SJunchao Zhang         link->rootreqsinited[direction][rootmtype][1] = PETSC_FALSE;
542cd620004SJunchao Zhang       }
543cd620004SJunchao Zhang       if (leafdirect_mpi && sf->persistent && link->leafreqsinited[direction][leafmtype][1] && link->leafdatadirect[direction][leafmtype] != leafdata) {
544cd620004SJunchao Zhang         reqs = link->leafreqs[direction][leafmtype][1];
545cd620004SJunchao Zhang         for (i=0; i<nleafreqs; i++) {if (reqs[i] != MPI_REQUEST_NULL) {ierr = MPI_Request_free(&reqs[i]);CHKERRQ(ierr);}}
546cd620004SJunchao Zhang         link->leafreqsinited[direction][leafmtype][1] = PETSC_FALSE;
547cd620004SJunchao Zhang       }
548cd620004SJunchao Zhang       *p = link->next; /* Remove from available list */
549cd620004SJunchao Zhang       goto found;
550cd620004SJunchao Zhang     }
551cd620004SJunchao Zhang   }
552cd620004SJunchao Zhang 
553cd620004SJunchao Zhang   ierr = PetscNew(&link);CHKERRQ(ierr);
554cd620004SJunchao Zhang   ierr = PetscSFLinkSetUp_Host(sf,link,unit);CHKERRQ(ierr);
555cd620004SJunchao Zhang   ierr = PetscCommGetNewTag(PetscObjectComm((PetscObject)sf),&link->tag);CHKERRQ(ierr); /* One tag per link */
556cd620004SJunchao Zhang 
557cd620004SJunchao Zhang   nreqs = (nrootreqs+nleafreqs)*8;
558cd620004SJunchao Zhang   ierr  = PetscMalloc1(nreqs,&link->reqs);CHKERRQ(ierr);
559cd620004SJunchao Zhang   for (i=0; i<nreqs; i++) link->reqs[i] = MPI_REQUEST_NULL; /* Initialized to NULL so that we know which need to be freed in Destroy */
560cd620004SJunchao Zhang 
561cd620004SJunchao Zhang   for (i=0; i<2; i++) { /* Two communication directions */
562cd620004SJunchao Zhang     for (j=0; j<2; j++) { /* Two memory types */
563cd620004SJunchao Zhang       for (k=0; k<2; k++) { /* root/leafdirect 0 or 1 */
564cd620004SJunchao Zhang         link->rootreqs[i][j][k] = link->reqs + nrootreqs*(4*i+2*j+k);
565cd620004SJunchao Zhang         link->leafreqs[i][j][k] = link->reqs + nrootreqs*8 + nleafreqs*(4*i+2*j+k);
566cd620004SJunchao Zhang       }
567cd620004SJunchao Zhang     }
568cd620004SJunchao Zhang   }
569cd620004SJunchao Zhang 
570cd620004SJunchao Zhang found:
571cd620004SJunchao Zhang   if ((rootmtype == PETSC_MEMTYPE_DEVICE || leafmtype == PETSC_MEMTYPE_DEVICE) && !link->deviceinited) {ierr = PetscSFLinkSetUp_Device(sf,link,unit);CHKERRQ(ierr);}
572cd620004SJunchao Zhang 
573cd620004SJunchao Zhang   /* Allocate buffers along root/leafdata */
574cd620004SJunchao Zhang   for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) {
575cd620004SJunchao Zhang     /* For local communication, buffers are only needed when roots and leaves have different mtypes */
576cd620004SJunchao Zhang     if (i == PETSCSF_LOCAL && rootmtype == leafmtype) continue;
577cd620004SJunchao Zhang     if (bas->rootbuflen[i]) {
578cd620004SJunchao Zhang       if (rootdirect[i]) { /* Aha, we disguise rootdata as rootbuf */
579cd620004SJunchao Zhang         link->rootbuf[i][rootmtype] = (char*)rootdata + bas->rootstart[i]*link->unitbytes;
580cd620004SJunchao Zhang       } else { /* Have to have a separate rootbuf */
581cd620004SJunchao Zhang         if (!link->rootbuf_alloc[i][rootmtype]) {
582cd620004SJunchao Zhang           ierr = PetscMallocWithMemType(rootmtype,bas->rootbuflen[i]*link->unitbytes,(void**)&link->rootbuf_alloc[i][rootmtype]);CHKERRQ(ierr);
583cd620004SJunchao Zhang         }
584cd620004SJunchao Zhang         link->rootbuf[i][rootmtype] = link->rootbuf_alloc[i][rootmtype];
585cd620004SJunchao Zhang       }
586cd620004SJunchao Zhang     }
587cd620004SJunchao Zhang 
588cd620004SJunchao Zhang     if (sf->leafbuflen[i]) {
589cd620004SJunchao Zhang       if (leafdirect[i]) {
590cd620004SJunchao Zhang         link->leafbuf[i][leafmtype] = (char*)leafdata + sf->leafstart[i]*link->unitbytes;
591cd620004SJunchao Zhang       } else {
592cd620004SJunchao Zhang         if (!link->leafbuf_alloc[i][leafmtype]) {
593cd620004SJunchao Zhang           ierr = PetscMallocWithMemType(leafmtype,sf->leafbuflen[i]*link->unitbytes,(void**)&link->leafbuf_alloc[i][leafmtype]);CHKERRQ(ierr);
594cd620004SJunchao Zhang         }
595cd620004SJunchao Zhang         link->leafbuf[i][leafmtype] = link->leafbuf_alloc[i][leafmtype];
596cd620004SJunchao Zhang       }
597cd620004SJunchao Zhang     }
598cd620004SJunchao Zhang   }
599cd620004SJunchao Zhang 
600cd620004SJunchao Zhang   /* Allocate buffers on host for buffering data on device in cast not use_gpu_aware_mpi */
601cd620004SJunchao Zhang   if (rootmtype == PETSC_MEMTYPE_DEVICE && rootmtype_mpi == PETSC_MEMTYPE_HOST) {
602cd620004SJunchao Zhang     if(!link->rootbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]) {
603cd620004SJunchao Zhang       ierr = PetscMalloc(bas->rootbuflen[PETSCSF_REMOTE]*link->unitbytes,&link->rootbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]);CHKERRQ(ierr);
604cd620004SJunchao Zhang     }
605cd620004SJunchao Zhang     link->rootbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST] = link->rootbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST];
606cd620004SJunchao Zhang   }
607cd620004SJunchao Zhang   if (leafmtype == PETSC_MEMTYPE_DEVICE && leafmtype_mpi == PETSC_MEMTYPE_HOST) {
608cd620004SJunchao Zhang     if (!link->leafbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]) {
609cd620004SJunchao Zhang       ierr = PetscMalloc(sf->leafbuflen[PETSCSF_REMOTE]*link->unitbytes,&link->leafbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]);CHKERRQ(ierr);
610cd620004SJunchao Zhang     }
611cd620004SJunchao Zhang     link->leafbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST] = link->leafbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST];
612cd620004SJunchao Zhang   }
613cd620004SJunchao Zhang 
614cd620004SJunchao Zhang   /* Set `current` state of the link. They may change between different SF invocations with the same link */
615cd620004SJunchao Zhang   if (sf->persistent) { /* If data is directly passed to MPI and inits MPI requests, record the data for comparison on future invocations */
616cd620004SJunchao Zhang     if (rootdirect_mpi) link->rootdatadirect[direction][rootmtype] = rootdata;
617cd620004SJunchao Zhang     if (leafdirect_mpi) link->leafdatadirect[direction][leafmtype] = leafdata;
618cd620004SJunchao Zhang   }
619cd620004SJunchao Zhang 
620cd620004SJunchao Zhang   link->rootdata = rootdata; /* root/leafdata are keys to look up links in PetscSFXxxEnd */
621cd620004SJunchao Zhang   link->leafdata = leafdata;
622cd620004SJunchao Zhang   for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) {
623cd620004SJunchao Zhang     link->rootdirect[i] = rootdirect[i];
624cd620004SJunchao Zhang     link->leafdirect[i] = leafdirect[i];
625cd620004SJunchao Zhang   }
626cd620004SJunchao Zhang   link->rootdirect_mpi  = rootdirect_mpi;
627cd620004SJunchao Zhang   link->leafdirect_mpi  = leafdirect_mpi;
628cd620004SJunchao Zhang   link->rootmtype       = rootmtype;
629cd620004SJunchao Zhang   link->leafmtype       = leafmtype;
630cd620004SJunchao Zhang   link->rootmtype_mpi   = rootmtype_mpi;
631cd620004SJunchao Zhang   link->leafmtype_mpi   = leafmtype_mpi;
632cd620004SJunchao Zhang 
633cd620004SJunchao Zhang   link->next            = bas->inuse;
634cd620004SJunchao Zhang   bas->inuse            = link;
635cd620004SJunchao Zhang   *mylink               = link;
636cd620004SJunchao Zhang   PetscFunctionReturn(0);
637cd620004SJunchao Zhang }
638cd620004SJunchao Zhang 
639cd620004SJunchao Zhang /* Return root/leaf buffers and MPI requests attached to the link for MPI communication in the given direction.
640cd620004SJunchao Zhang    If the sf uses persistent requests and the requests have not been initialized, then initialize them.
641cd620004SJunchao Zhang */
642cd620004SJunchao Zhang PetscErrorCode PetscSFLinkGetMPIBuffersAndRequests(PetscSF sf,PetscSFLink link,PetscSFDirection direction,void **rootbuf, void **leafbuf,MPI_Request **rootreqs,MPI_Request **leafreqs)
643cd620004SJunchao Zhang {
644cd620004SJunchao Zhang   PetscErrorCode       ierr;
645cd620004SJunchao Zhang   PetscSF_Basic        *bas = (PetscSF_Basic*)sf->data;
646cd620004SJunchao Zhang   PetscInt             i,j,nrootranks,ndrootranks,nleafranks,ndleafranks;
647cd620004SJunchao Zhang   const PetscInt       *rootoffset,*leafoffset;
648cd620004SJunchao Zhang   PetscMPIInt          n;
649cd620004SJunchao Zhang   MPI_Aint             disp;
650cd620004SJunchao Zhang   MPI_Comm             comm = PetscObjectComm((PetscObject)sf);
651cd620004SJunchao Zhang   MPI_Datatype         unit = link->unit;
652cd620004SJunchao Zhang   const PetscMemType   rootmtype_mpi = link->rootmtype_mpi,leafmtype_mpi = link->leafmtype_mpi; /* Used to select buffers passed to MPI */
653cd620004SJunchao Zhang   const PetscInt       rootdirect_mpi = link->rootdirect_mpi,leafdirect_mpi = link->leafdirect_mpi;
654cd620004SJunchao Zhang 
655cd620004SJunchao Zhang   PetscFunctionBegin;
656cd620004SJunchao Zhang   /* Init persistent MPI requests if not yet. Currently only SFBasic uses persistent MPI */
657cd620004SJunchao Zhang   if (sf->persistent) {
658cd620004SJunchao Zhang     if (rootreqs && bas->rootbuflen[PETSCSF_REMOTE] && !link->rootreqsinited[direction][rootmtype_mpi][rootdirect_mpi]) {
659cd620004SJunchao Zhang       ierr = PetscSFGetRootInfo_Basic(sf,&nrootranks,&ndrootranks,NULL,&rootoffset,NULL);CHKERRQ(ierr);
660cd620004SJunchao Zhang       if (direction == PETSCSF_LEAF2ROOT) {
661cd620004SJunchao Zhang         for (i=ndrootranks,j=0; i<nrootranks; i++,j++) {
662cd620004SJunchao Zhang           disp = (rootoffset[i] - rootoffset[ndrootranks])*link->unitbytes;
663cd620004SJunchao Zhang           ierr = PetscMPIIntCast(rootoffset[i+1]-rootoffset[i],&n);CHKERRQ(ierr);
664cd620004SJunchao Zhang           ierr = MPI_Recv_init(link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi]+disp,n,unit,bas->iranks[i],link->tag,comm,link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi]+j);CHKERRQ(ierr);
665cd620004SJunchao Zhang         }
666cd620004SJunchao Zhang       } else { /* PETSCSF_ROOT2LEAF */
667cd620004SJunchao Zhang         for (i=ndrootranks,j=0; i<nrootranks; i++,j++) {
668cd620004SJunchao Zhang           disp = (rootoffset[i] - rootoffset[ndrootranks])*link->unitbytes;
669cd620004SJunchao Zhang           ierr = PetscMPIIntCast(rootoffset[i+1]-rootoffset[i],&n);CHKERRQ(ierr);
670cd620004SJunchao Zhang           ierr = MPI_Send_init(link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi]+disp,n,unit,bas->iranks[i],link->tag,comm,link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi]+j);CHKERRQ(ierr);
671cd620004SJunchao Zhang         }
672cd620004SJunchao Zhang       }
673cd620004SJunchao Zhang       link->rootreqsinited[direction][rootmtype_mpi][rootdirect_mpi] = PETSC_TRUE;
674cd620004SJunchao Zhang     }
675cd620004SJunchao Zhang 
676cd620004SJunchao Zhang     if (leafreqs && sf->leafbuflen[PETSCSF_REMOTE] && !link->leafreqsinited[direction][leafmtype_mpi][leafdirect_mpi]) {
677cd620004SJunchao Zhang       ierr = PetscSFGetLeafInfo_Basic(sf,&nleafranks,&ndleafranks,NULL,&leafoffset,NULL,NULL);CHKERRQ(ierr);
678cd620004SJunchao Zhang       if (direction == PETSCSF_LEAF2ROOT) {
679cd620004SJunchao Zhang         for (i=ndleafranks,j=0; i<nleafranks; i++,j++) {
680cd620004SJunchao Zhang           disp = (leafoffset[i] - leafoffset[ndleafranks])*link->unitbytes;
681cd620004SJunchao Zhang           ierr = PetscMPIIntCast(leafoffset[i+1]-leafoffset[i],&n);CHKERRQ(ierr);
682cd620004SJunchao Zhang           ierr = MPI_Send_init(link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi]+disp,n,unit,sf->ranks[i],link->tag,comm,link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi]+j);CHKERRQ(ierr);
683cd620004SJunchao Zhang         }
684cd620004SJunchao Zhang       } else { /* PETSCSF_ROOT2LEAF */
685cd620004SJunchao Zhang         for (i=ndleafranks,j=0; i<nleafranks; i++,j++) {
686cd620004SJunchao Zhang           disp = (leafoffset[i] - leafoffset[ndleafranks])*link->unitbytes;
687cd620004SJunchao Zhang           ierr = PetscMPIIntCast(leafoffset[i+1]-leafoffset[i],&n);CHKERRQ(ierr);
688cd620004SJunchao Zhang           ierr = MPI_Recv_init(link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi]+disp,n,unit,sf->ranks[i],link->tag,comm,link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi]+j);CHKERRQ(ierr);
689cd620004SJunchao Zhang         }
690cd620004SJunchao Zhang       }
691cd620004SJunchao Zhang       link->leafreqsinited[direction][leafmtype_mpi][leafdirect_mpi] = PETSC_TRUE;
692cd620004SJunchao Zhang     }
693cd620004SJunchao Zhang   }
694cd620004SJunchao Zhang   if (rootbuf)  *rootbuf  = link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi];
695cd620004SJunchao Zhang   if (leafbuf)  *leafbuf  = link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi];
696cd620004SJunchao Zhang   if (rootreqs) *rootreqs = link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi];
697cd620004SJunchao Zhang   if (leafreqs) *leafreqs = link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi];
698cd620004SJunchao Zhang   PetscFunctionReturn(0);
699cd620004SJunchao Zhang }
700cd620004SJunchao Zhang 
701cd620004SJunchao Zhang 
702cd620004SJunchao Zhang PetscErrorCode PetscSFLinkGetInUse(PetscSF sf,MPI_Datatype unit,const void *rootdata,const void *leafdata,PetscCopyMode cmode,PetscSFLink *mylink)
703cd620004SJunchao Zhang {
704cd620004SJunchao Zhang   PetscErrorCode    ierr;
705cd620004SJunchao Zhang   PetscSFLink       link,*p;
70640e23c03SJunchao Zhang   PetscSF_Basic     *bas=(PetscSF_Basic*)sf->data;
70740e23c03SJunchao Zhang 
70840e23c03SJunchao Zhang   PetscFunctionBegin;
70940e23c03SJunchao Zhang   /* Look for types in cache */
71040e23c03SJunchao Zhang   for (p=&bas->inuse; (link=*p); p=&link->next) {
71140e23c03SJunchao Zhang     PetscBool match;
71240e23c03SJunchao Zhang     ierr = MPIPetsc_Type_compare(unit,link->unit,&match);CHKERRQ(ierr);
713637e6665SJunchao Zhang     if (match && (rootdata == link->rootdata) && (leafdata == link->leafdata)) {
71440e23c03SJunchao Zhang       switch (cmode) {
71540e23c03SJunchao Zhang       case PETSC_OWN_POINTER: *p = link->next; break; /* Remove from inuse list */
71640e23c03SJunchao Zhang       case PETSC_USE_POINTER: break;
71740e23c03SJunchao Zhang       default: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"invalid cmode");
71840e23c03SJunchao Zhang       }
71940e23c03SJunchao Zhang       *mylink = link;
72040e23c03SJunchao Zhang       PetscFunctionReturn(0);
72140e23c03SJunchao Zhang     }
72240e23c03SJunchao Zhang   }
72340e23c03SJunchao Zhang   SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Could not find pack");
72440e23c03SJunchao Zhang   PetscFunctionReturn(0);
72540e23c03SJunchao Zhang }
72640e23c03SJunchao Zhang 
727cd620004SJunchao Zhang PetscErrorCode PetscSFLinkReclaim(PetscSF sf,PetscSFLink *link)
72840e23c03SJunchao Zhang {
72940e23c03SJunchao Zhang   PetscSF_Basic     *bas = (PetscSF_Basic*)sf->data;
73040e23c03SJunchao Zhang 
73140e23c03SJunchao Zhang   PetscFunctionBegin;
732637e6665SJunchao Zhang   (*link)->rootdata = NULL;
733637e6665SJunchao Zhang   (*link)->leafdata = NULL;
73440e23c03SJunchao Zhang   (*link)->next     = bas->avail;
73540e23c03SJunchao Zhang   bas->avail        = *link;
73640e23c03SJunchao Zhang   *link             = NULL;
73740e23c03SJunchao Zhang   PetscFunctionReturn(0);
73840e23c03SJunchao Zhang }
73940e23c03SJunchao Zhang 
740cd620004SJunchao Zhang /* Destroy all links chained in 'avail' */
741cd620004SJunchao Zhang PetscErrorCode PetscSFLinkDestroy(PetscSF sf,PetscSFLink *avail)
742eb02082bSJunchao Zhang {
743eb02082bSJunchao Zhang   PetscErrorCode    ierr;
744cd620004SJunchao Zhang   PetscSFLink       link = *avail,next;
745cd620004SJunchao Zhang   PetscSF_Basic     *bas = (PetscSF_Basic*)sf->data;
746cd620004SJunchao Zhang   PetscInt          i,nreqs = (bas->nrootreqs+sf->nleafreqs)*8;
747eb02082bSJunchao Zhang 
748eb02082bSJunchao Zhang   PetscFunctionBegin;
749eb02082bSJunchao Zhang   for (; link; link=next) {
750eb02082bSJunchao Zhang     next = link->next;
751eb02082bSJunchao Zhang     if (!link->isbuiltin) {ierr = MPI_Type_free(&link->unit);CHKERRQ(ierr);}
752cd620004SJunchao Zhang     for (i=0; i<nreqs; i++) { /* Persistent reqs must be freed. */
753eb02082bSJunchao Zhang       if (link->reqs[i] != MPI_REQUEST_NULL) {ierr = MPI_Request_free(&link->reqs[i]);CHKERRQ(ierr);}
754eb02082bSJunchao Zhang     }
755eb02082bSJunchao Zhang     ierr = PetscFree(link->reqs);CHKERRQ(ierr);
756cd620004SJunchao Zhang     for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) {
75751ccb202SJunchao Zhang #if defined(PETSC_HAVE_CUDA)
758cd620004SJunchao Zhang       ierr = PetscFreeWithMemType(PETSC_MEMTYPE_DEVICE,link->rootbuf_alloc[i][PETSC_MEMTYPE_DEVICE]);CHKERRQ(ierr);
759cd620004SJunchao Zhang       ierr = PetscFreeWithMemType(PETSC_MEMTYPE_DEVICE,link->leafbuf_alloc[i][PETSC_MEMTYPE_DEVICE]);CHKERRQ(ierr);
760eb02082bSJunchao Zhang       if (link->stream) {cudaError_t err =  cudaStreamDestroy(link->stream);CHKERRCUDA(err); link->stream = NULL;}
761eb02082bSJunchao Zhang #endif
762cd620004SJunchao Zhang       ierr = PetscFree(link->rootbuf_alloc[i][PETSC_MEMTYPE_HOST]);CHKERRQ(ierr);
763cd620004SJunchao Zhang       ierr = PetscFree(link->leafbuf_alloc[i][PETSC_MEMTYPE_HOST]);CHKERRQ(ierr);
764cd620004SJunchao Zhang     }
765eb02082bSJunchao Zhang     ierr = PetscFree(link);CHKERRQ(ierr);
766eb02082bSJunchao Zhang   }
767eb02082bSJunchao Zhang   *avail = NULL;
768eb02082bSJunchao Zhang   PetscFunctionReturn(0);
769eb02082bSJunchao Zhang }
770eb02082bSJunchao Zhang 
771cd620004SJunchao Zhang #if defined(PETSC_USE_DEBUG)
7729d1c8addSJunchao Zhang /* Error out on unsupported overlapped communications */
773cd620004SJunchao Zhang PetscErrorCode PetscSFSetErrorOnUnsupportedOverlap(PetscSF sf,MPI_Datatype unit,const void *rootdata,const void *leafdata)
7749d1c8addSJunchao Zhang {
7759d1c8addSJunchao Zhang   PetscErrorCode    ierr;
776cd620004SJunchao Zhang   PetscSFLink       link,*p;
7779d1c8addSJunchao Zhang   PetscSF_Basic     *bas = (PetscSF_Basic*)sf->data;
7789d1c8addSJunchao Zhang   PetscBool         match;
7799d1c8addSJunchao Zhang 
7809d1c8addSJunchao Zhang   PetscFunctionBegin;
78118fb5014SJunchao Zhang   /* Look up links in use and error out if there is a match. When both rootdata and leafdata are NULL, ignore
78218fb5014SJunchao Zhang      the potential overlapping since this process does not participate in communication. Overlapping is harmless.
78318fb5014SJunchao Zhang   */
784637e6665SJunchao Zhang   if (rootdata || leafdata) {
7859d1c8addSJunchao Zhang     for (p=&bas->inuse; (link=*p); p=&link->next) {
7869d1c8addSJunchao Zhang       ierr = MPIPetsc_Type_compare(unit,link->unit,&match);CHKERRQ(ierr);
787cd620004SJunchao Zhang       if (match && (rootdata == link->rootdata) && (leafdata == link->leafdata)) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_SUP,"Overlapped PetscSF with the same rootdata(%p), leafdata(%p) and data type. Undo the overlapping to avoid the error.",rootdata,leafdata);
7889d1c8addSJunchao Zhang     }
78918fb5014SJunchao Zhang   }
7909d1c8addSJunchao Zhang   PetscFunctionReturn(0);
7919d1c8addSJunchao Zhang }
792cd620004SJunchao Zhang #endif
7939d1c8addSJunchao Zhang 
794cd620004SJunchao Zhang PetscErrorCode PetscSFLinkSetUp_Host(PetscSF sf,PetscSFLink link,MPI_Datatype unit)
79540e23c03SJunchao Zhang {
79640e23c03SJunchao Zhang   PetscErrorCode ierr;
797b23bfdefSJunchao Zhang   PetscInt       nSignedChar=0,nUnsignedChar=0,nInt=0,nPetscInt=0,nPetscReal=0;
798b23bfdefSJunchao Zhang   PetscBool      is2Int,is2PetscInt;
79940e23c03SJunchao Zhang   PetscMPIInt    ni,na,nd,combiner;
80040e23c03SJunchao Zhang #if defined(PETSC_HAVE_COMPLEX)
801b23bfdefSJunchao Zhang   PetscInt       nPetscComplex=0;
80240e23c03SJunchao Zhang #endif
80340e23c03SJunchao Zhang 
80440e23c03SJunchao Zhang   PetscFunctionBegin;
805b23bfdefSJunchao Zhang   ierr = MPIPetsc_Type_compare_contig(unit,MPI_SIGNED_CHAR,  &nSignedChar);CHKERRQ(ierr);
806b23bfdefSJunchao Zhang   ierr = MPIPetsc_Type_compare_contig(unit,MPI_UNSIGNED_CHAR,&nUnsignedChar);CHKERRQ(ierr);
807b23bfdefSJunchao Zhang   /* MPI_CHAR is treated below as a dumb type that does not support reduction according to MPI standard */
808b23bfdefSJunchao Zhang   ierr = MPIPetsc_Type_compare_contig(unit,MPI_INT,  &nInt);CHKERRQ(ierr);
809b23bfdefSJunchao Zhang   ierr = MPIPetsc_Type_compare_contig(unit,MPIU_INT, &nPetscInt);CHKERRQ(ierr);
810b23bfdefSJunchao Zhang   ierr = MPIPetsc_Type_compare_contig(unit,MPIU_REAL,&nPetscReal);CHKERRQ(ierr);
81140e23c03SJunchao Zhang #if defined(PETSC_HAVE_COMPLEX)
812b23bfdefSJunchao Zhang   ierr = MPIPetsc_Type_compare_contig(unit,MPIU_COMPLEX,&nPetscComplex);CHKERRQ(ierr);
81340e23c03SJunchao Zhang #endif
81440e23c03SJunchao Zhang   ierr = MPIPetsc_Type_compare(unit,MPI_2INT,&is2Int);CHKERRQ(ierr);
81540e23c03SJunchao Zhang   ierr = MPIPetsc_Type_compare(unit,MPIU_2INT,&is2PetscInt);CHKERRQ(ierr);
816b23bfdefSJunchao Zhang   /* TODO: shaell we also handle Fortran MPI_2REAL? */
81740e23c03SJunchao Zhang   ierr = MPI_Type_get_envelope(unit,&ni,&na,&nd,&combiner);CHKERRQ(ierr);
8185ad15460SJunchao Zhang   link->isbuiltin = (combiner == MPI_COMBINER_NAMED) ? PETSC_TRUE : PETSC_FALSE; /* unit is MPI builtin */
819b23bfdefSJunchao Zhang   link->bs = 1; /* default */
82040e23c03SJunchao Zhang 
821eb02082bSJunchao Zhang   if (is2Int) {
822cd620004SJunchao Zhang     PackInit_PairType_int_int_1_1(link);
823eb02082bSJunchao Zhang     link->bs        = 1;
824eb02082bSJunchao Zhang     link->unitbytes = 2*sizeof(int);
8255ad15460SJunchao Zhang     link->isbuiltin = PETSC_TRUE; /* unit is PETSc builtin */
826eb02082bSJunchao Zhang     link->basicunit = MPI_2INT;
8275ad15460SJunchao Zhang     link->unit      = MPI_2INT;
828eb02082bSJunchao Zhang   } else if (is2PetscInt) { /* TODO: when is2PetscInt and nPetscInt=2, we don't know which path to take. The two paths support different ops. */
829cd620004SJunchao Zhang     PackInit_PairType_PetscInt_PetscInt_1_1(link);
830eb02082bSJunchao Zhang     link->bs        = 1;
831eb02082bSJunchao Zhang     link->unitbytes = 2*sizeof(PetscInt);
832eb02082bSJunchao Zhang     link->basicunit = MPIU_2INT;
8335ad15460SJunchao Zhang     link->isbuiltin = PETSC_TRUE; /* unit is PETSc builtin */
8345ad15460SJunchao Zhang     link->unit      = MPIU_2INT;
835eb02082bSJunchao Zhang   } else if (nPetscReal) {
836b23bfdefSJunchao Zhang     if      (nPetscReal == 8) PackInit_RealType_PetscReal_8_1(link); else if (nPetscReal%8 == 0) PackInit_RealType_PetscReal_8_0(link);
837b23bfdefSJunchao Zhang     else if (nPetscReal == 4) PackInit_RealType_PetscReal_4_1(link); else if (nPetscReal%4 == 0) PackInit_RealType_PetscReal_4_0(link);
838b23bfdefSJunchao Zhang     else if (nPetscReal == 2) PackInit_RealType_PetscReal_2_1(link); else if (nPetscReal%2 == 0) PackInit_RealType_PetscReal_2_0(link);
839b23bfdefSJunchao Zhang     else if (nPetscReal == 1) PackInit_RealType_PetscReal_1_1(link); else if (nPetscReal%1 == 0) PackInit_RealType_PetscReal_1_0(link);
840b23bfdefSJunchao Zhang     link->bs        = nPetscReal;
841eb02082bSJunchao Zhang     link->unitbytes = nPetscReal*sizeof(PetscReal);
84240e23c03SJunchao Zhang     link->basicunit = MPIU_REAL;
8435ad15460SJunchao Zhang     if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPIU_REAL;}
844b23bfdefSJunchao Zhang   } else if (nPetscInt) {
845b23bfdefSJunchao Zhang     if      (nPetscInt == 8) PackInit_IntegerType_PetscInt_8_1(link); else if (nPetscInt%8 == 0) PackInit_IntegerType_PetscInt_8_0(link);
846b23bfdefSJunchao Zhang     else if (nPetscInt == 4) PackInit_IntegerType_PetscInt_4_1(link); else if (nPetscInt%4 == 0) PackInit_IntegerType_PetscInt_4_0(link);
847b23bfdefSJunchao Zhang     else if (nPetscInt == 2) PackInit_IntegerType_PetscInt_2_1(link); else if (nPetscInt%2 == 0) PackInit_IntegerType_PetscInt_2_0(link);
848b23bfdefSJunchao Zhang     else if (nPetscInt == 1) PackInit_IntegerType_PetscInt_1_1(link); else if (nPetscInt%1 == 0) PackInit_IntegerType_PetscInt_1_0(link);
849b23bfdefSJunchao Zhang     link->bs        = nPetscInt;
850eb02082bSJunchao Zhang     link->unitbytes = nPetscInt*sizeof(PetscInt);
851b23bfdefSJunchao Zhang     link->basicunit = MPIU_INT;
8525ad15460SJunchao Zhang     if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPIU_INT;}
853b23bfdefSJunchao Zhang #if defined(PETSC_USE_64BIT_INDICES)
854b23bfdefSJunchao Zhang   } else if (nInt) {
855b23bfdefSJunchao Zhang     if      (nInt == 8) PackInit_IntegerType_int_8_1(link); else if (nInt%8 == 0) PackInit_IntegerType_int_8_0(link);
856b23bfdefSJunchao Zhang     else if (nInt == 4) PackInit_IntegerType_int_4_1(link); else if (nInt%4 == 0) PackInit_IntegerType_int_4_0(link);
857b23bfdefSJunchao Zhang     else if (nInt == 2) PackInit_IntegerType_int_2_1(link); else if (nInt%2 == 0) PackInit_IntegerType_int_2_0(link);
858b23bfdefSJunchao Zhang     else if (nInt == 1) PackInit_IntegerType_int_1_1(link); else if (nInt%1 == 0) PackInit_IntegerType_int_1_0(link);
859b23bfdefSJunchao Zhang     link->bs        = nInt;
860eb02082bSJunchao Zhang     link->unitbytes = nInt*sizeof(int);
861b23bfdefSJunchao Zhang     link->basicunit = MPI_INT;
8625ad15460SJunchao Zhang     if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPI_INT;}
863b23bfdefSJunchao Zhang #endif
864b23bfdefSJunchao Zhang   } else if (nSignedChar) {
865b23bfdefSJunchao Zhang     if      (nSignedChar == 8) PackInit_IntegerType_SignedChar_8_1(link); else if (nSignedChar%8 == 0) PackInit_IntegerType_SignedChar_8_0(link);
866b23bfdefSJunchao Zhang     else if (nSignedChar == 4) PackInit_IntegerType_SignedChar_4_1(link); else if (nSignedChar%4 == 0) PackInit_IntegerType_SignedChar_4_0(link);
867b23bfdefSJunchao Zhang     else if (nSignedChar == 2) PackInit_IntegerType_SignedChar_2_1(link); else if (nSignedChar%2 == 0) PackInit_IntegerType_SignedChar_2_0(link);
868b23bfdefSJunchao Zhang     else if (nSignedChar == 1) PackInit_IntegerType_SignedChar_1_1(link); else if (nSignedChar%1 == 0) PackInit_IntegerType_SignedChar_1_0(link);
869b23bfdefSJunchao Zhang     link->bs        = nSignedChar;
870eb02082bSJunchao Zhang     link->unitbytes = nSignedChar*sizeof(SignedChar);
871b23bfdefSJunchao Zhang     link->basicunit = MPI_SIGNED_CHAR;
8725ad15460SJunchao Zhang     if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPI_SIGNED_CHAR;}
873b23bfdefSJunchao Zhang   }  else if (nUnsignedChar) {
874b23bfdefSJunchao Zhang     if      (nUnsignedChar == 8) PackInit_IntegerType_UnsignedChar_8_1(link); else if (nUnsignedChar%8 == 0) PackInit_IntegerType_UnsignedChar_8_0(link);
875b23bfdefSJunchao Zhang     else if (nUnsignedChar == 4) PackInit_IntegerType_UnsignedChar_4_1(link); else if (nUnsignedChar%4 == 0) PackInit_IntegerType_UnsignedChar_4_0(link);
876b23bfdefSJunchao Zhang     else if (nUnsignedChar == 2) PackInit_IntegerType_UnsignedChar_2_1(link); else if (nUnsignedChar%2 == 0) PackInit_IntegerType_UnsignedChar_2_0(link);
877b23bfdefSJunchao Zhang     else if (nUnsignedChar == 1) PackInit_IntegerType_UnsignedChar_1_1(link); else if (nUnsignedChar%1 == 0) PackInit_IntegerType_UnsignedChar_1_0(link);
878b23bfdefSJunchao Zhang     link->bs        = nUnsignedChar;
879eb02082bSJunchao Zhang     link->unitbytes = nUnsignedChar*sizeof(UnsignedChar);
880b23bfdefSJunchao Zhang     link->basicunit = MPI_UNSIGNED_CHAR;
8815ad15460SJunchao Zhang     if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPI_UNSIGNED_CHAR;}
88240e23c03SJunchao Zhang #if defined(PETSC_HAVE_COMPLEX)
883b23bfdefSJunchao Zhang   } else if (nPetscComplex) {
884b23bfdefSJunchao Zhang     if      (nPetscComplex == 8) PackInit_ComplexType_PetscComplex_8_1(link); else if (nPetscComplex%8 == 0) PackInit_ComplexType_PetscComplex_8_0(link);
885b23bfdefSJunchao Zhang     else if (nPetscComplex == 4) PackInit_ComplexType_PetscComplex_4_1(link); else if (nPetscComplex%4 == 0) PackInit_ComplexType_PetscComplex_4_0(link);
886b23bfdefSJunchao Zhang     else if (nPetscComplex == 2) PackInit_ComplexType_PetscComplex_2_1(link); else if (nPetscComplex%2 == 0) PackInit_ComplexType_PetscComplex_2_0(link);
887b23bfdefSJunchao Zhang     else if (nPetscComplex == 1) PackInit_ComplexType_PetscComplex_1_1(link); else if (nPetscComplex%1 == 0) PackInit_ComplexType_PetscComplex_1_0(link);
888b23bfdefSJunchao Zhang     link->bs        = nPetscComplex;
889eb02082bSJunchao Zhang     link->unitbytes = nPetscComplex*sizeof(PetscComplex);
89040e23c03SJunchao Zhang     link->basicunit = MPIU_COMPLEX;
8915ad15460SJunchao Zhang     if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPIU_COMPLEX;}
89240e23c03SJunchao Zhang #endif
89340e23c03SJunchao Zhang   } else {
894b23bfdefSJunchao Zhang     MPI_Aint lb,nbyte;
895b23bfdefSJunchao Zhang     ierr = MPI_Type_get_extent(unit,&lb,&nbyte);CHKERRQ(ierr);
89640e23c03SJunchao Zhang     if (lb != 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Datatype with nonzero lower bound %ld\n",(long)lb);
897eb02082bSJunchao Zhang     if (nbyte % sizeof(int)) { /* If the type size is not multiple of int */
898eb02082bSJunchao Zhang       if      (nbyte == 4) PackInit_DumbType_char_4_1(link); else if (nbyte%4 == 0) PackInit_DumbType_char_4_0(link);
899eb02082bSJunchao Zhang       else if (nbyte == 2) PackInit_DumbType_char_2_1(link); else if (nbyte%2 == 0) PackInit_DumbType_char_2_0(link);
900eb02082bSJunchao Zhang       else if (nbyte == 1) PackInit_DumbType_char_1_1(link); else if (nbyte%1 == 0) PackInit_DumbType_char_1_0(link);
901eb02082bSJunchao Zhang       link->bs        = nbyte;
902b23bfdefSJunchao Zhang       link->unitbytes = nbyte;
903b23bfdefSJunchao Zhang       link->basicunit = MPI_BYTE;
90440e23c03SJunchao Zhang     } else {
905eb02082bSJunchao Zhang       nInt = nbyte / sizeof(int);
906eb02082bSJunchao Zhang       if      (nInt == 8) PackInit_DumbType_DumbInt_8_1(link); else if (nInt%8 == 0) PackInit_DumbType_DumbInt_8_0(link);
907eb02082bSJunchao Zhang       else if (nInt == 4) PackInit_DumbType_DumbInt_4_1(link); else if (nInt%4 == 0) PackInit_DumbType_DumbInt_4_0(link);
908eb02082bSJunchao Zhang       else if (nInt == 2) PackInit_DumbType_DumbInt_2_1(link); else if (nInt%2 == 0) PackInit_DumbType_DumbInt_2_0(link);
909eb02082bSJunchao Zhang       else if (nInt == 1) PackInit_DumbType_DumbInt_1_1(link); else if (nInt%1 == 0) PackInit_DumbType_DumbInt_1_0(link);
910eb02082bSJunchao Zhang       link->bs        = nInt;
911b23bfdefSJunchao Zhang       link->unitbytes = nbyte;
91240e23c03SJunchao Zhang       link->basicunit = MPI_INT;
91340e23c03SJunchao Zhang     }
9145ad15460SJunchao Zhang     if (link->isbuiltin) link->unit = unit;
91540e23c03SJunchao Zhang   }
916b23bfdefSJunchao Zhang 
9175ad15460SJunchao Zhang   if (!link->isbuiltin) {ierr = MPI_Type_dup(unit,&link->unit);CHKERRQ(ierr);}
91840e23c03SJunchao Zhang   PetscFunctionReturn(0);
91940e23c03SJunchao Zhang }
92040e23c03SJunchao Zhang 
921fcc7397dSJunchao Zhang PetscErrorCode PetscSFLinkGetUnpackAndOp(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*))
92240e23c03SJunchao Zhang {
92340e23c03SJunchao Zhang   PetscFunctionBegin;
92440e23c03SJunchao Zhang   *UnpackAndOp = NULL;
925eb02082bSJunchao Zhang   if (mtype == PETSC_MEMTYPE_HOST) {
926eb02082bSJunchao Zhang     if      (op == MPIU_REPLACE)              *UnpackAndOp = link->h_UnpackAndInsert;
927eb02082bSJunchao Zhang     else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->h_UnpackAndAdd;
928eb02082bSJunchao Zhang     else if (op == MPI_PROD)                  *UnpackAndOp = link->h_UnpackAndMult;
929eb02082bSJunchao Zhang     else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->h_UnpackAndMax;
930eb02082bSJunchao Zhang     else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->h_UnpackAndMin;
931eb02082bSJunchao Zhang     else if (op == MPI_LAND)                  *UnpackAndOp = link->h_UnpackAndLAND;
932eb02082bSJunchao Zhang     else if (op == MPI_BAND)                  *UnpackAndOp = link->h_UnpackAndBAND;
933eb02082bSJunchao Zhang     else if (op == MPI_LOR)                   *UnpackAndOp = link->h_UnpackAndLOR;
934eb02082bSJunchao Zhang     else if (op == MPI_BOR)                   *UnpackAndOp = link->h_UnpackAndBOR;
935eb02082bSJunchao Zhang     else if (op == MPI_LXOR)                  *UnpackAndOp = link->h_UnpackAndLXOR;
936eb02082bSJunchao Zhang     else if (op == MPI_BXOR)                  *UnpackAndOp = link->h_UnpackAndBXOR;
937eb02082bSJunchao Zhang     else if (op == MPI_MAXLOC)                *UnpackAndOp = link->h_UnpackAndMaxloc;
938eb02082bSJunchao Zhang     else if (op == MPI_MINLOC)                *UnpackAndOp = link->h_UnpackAndMinloc;
939eb02082bSJunchao Zhang   }
940eb02082bSJunchao Zhang #if defined(PETSC_HAVE_CUDA)
941eb02082bSJunchao Zhang   else if (mtype == PETSC_MEMTYPE_DEVICE && !atomic) {
942eb02082bSJunchao Zhang     if      (op == MPIU_REPLACE)              *UnpackAndOp = link->d_UnpackAndInsert;
943eb02082bSJunchao Zhang     else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->d_UnpackAndAdd;
944eb02082bSJunchao Zhang     else if (op == MPI_PROD)                  *UnpackAndOp = link->d_UnpackAndMult;
945eb02082bSJunchao Zhang     else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->d_UnpackAndMax;
946eb02082bSJunchao Zhang     else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->d_UnpackAndMin;
947eb02082bSJunchao Zhang     else if (op == MPI_LAND)                  *UnpackAndOp = link->d_UnpackAndLAND;
948eb02082bSJunchao Zhang     else if (op == MPI_BAND)                  *UnpackAndOp = link->d_UnpackAndBAND;
949eb02082bSJunchao Zhang     else if (op == MPI_LOR)                   *UnpackAndOp = link->d_UnpackAndLOR;
950eb02082bSJunchao Zhang     else if (op == MPI_BOR)                   *UnpackAndOp = link->d_UnpackAndBOR;
951eb02082bSJunchao Zhang     else if (op == MPI_LXOR)                  *UnpackAndOp = link->d_UnpackAndLXOR;
952eb02082bSJunchao Zhang     else if (op == MPI_BXOR)                  *UnpackAndOp = link->d_UnpackAndBXOR;
953eb02082bSJunchao Zhang     else if (op == MPI_MAXLOC)                *UnpackAndOp = link->d_UnpackAndMaxloc;
954eb02082bSJunchao Zhang     else if (op == MPI_MINLOC)                *UnpackAndOp = link->d_UnpackAndMinloc;
955eb02082bSJunchao Zhang   } else if (mtype == PETSC_MEMTYPE_DEVICE && atomic) {
956eb02082bSJunchao Zhang     if      (op == MPIU_REPLACE)              *UnpackAndOp = link->da_UnpackAndInsert;
957eb02082bSJunchao Zhang     else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->da_UnpackAndAdd;
958eb02082bSJunchao Zhang     else if (op == MPI_PROD)                  *UnpackAndOp = link->da_UnpackAndMult;
959eb02082bSJunchao Zhang     else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->da_UnpackAndMax;
960eb02082bSJunchao Zhang     else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->da_UnpackAndMin;
961eb02082bSJunchao Zhang     else if (op == MPI_LAND)                  *UnpackAndOp = link->da_UnpackAndLAND;
962eb02082bSJunchao Zhang     else if (op == MPI_BAND)                  *UnpackAndOp = link->da_UnpackAndBAND;
963eb02082bSJunchao Zhang     else if (op == MPI_LOR)                   *UnpackAndOp = link->da_UnpackAndLOR;
964eb02082bSJunchao Zhang     else if (op == MPI_BOR)                   *UnpackAndOp = link->da_UnpackAndBOR;
965eb02082bSJunchao Zhang     else if (op == MPI_LXOR)                  *UnpackAndOp = link->da_UnpackAndLXOR;
966eb02082bSJunchao Zhang     else if (op == MPI_BXOR)                  *UnpackAndOp = link->da_UnpackAndBXOR;
967eb02082bSJunchao Zhang     else if (op == MPI_MAXLOC)                *UnpackAndOp = link->da_UnpackAndMaxloc;
968eb02082bSJunchao Zhang     else if (op == MPI_MINLOC)                *UnpackAndOp = link->da_UnpackAndMinloc;
969eb02082bSJunchao Zhang   }
970eb02082bSJunchao Zhang #endif
97140e23c03SJunchao Zhang   PetscFunctionReturn(0);
97240e23c03SJunchao Zhang }
97340e23c03SJunchao Zhang 
974fcc7397dSJunchao Zhang PetscErrorCode PetscSFLinkGetScatterAndOp(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**ScatterAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*))
97540e23c03SJunchao Zhang {
97640e23c03SJunchao Zhang   PetscFunctionBegin;
977cd620004SJunchao Zhang   *ScatterAndOp = NULL;
978eb02082bSJunchao Zhang   if (mtype == PETSC_MEMTYPE_HOST) {
979cd620004SJunchao Zhang     if      (op == MPIU_REPLACE)              *ScatterAndOp = link->h_ScatterAndInsert;
980cd620004SJunchao Zhang     else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->h_ScatterAndAdd;
981cd620004SJunchao Zhang     else if (op == MPI_PROD)                  *ScatterAndOp = link->h_ScatterAndMult;
982cd620004SJunchao Zhang     else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->h_ScatterAndMax;
983cd620004SJunchao Zhang     else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->h_ScatterAndMin;
984cd620004SJunchao Zhang     else if (op == MPI_LAND)                  *ScatterAndOp = link->h_ScatterAndLAND;
985cd620004SJunchao Zhang     else if (op == MPI_BAND)                  *ScatterAndOp = link->h_ScatterAndBAND;
986cd620004SJunchao Zhang     else if (op == MPI_LOR)                   *ScatterAndOp = link->h_ScatterAndLOR;
987cd620004SJunchao Zhang     else if (op == MPI_BOR)                   *ScatterAndOp = link->h_ScatterAndBOR;
988cd620004SJunchao Zhang     else if (op == MPI_LXOR)                  *ScatterAndOp = link->h_ScatterAndLXOR;
989cd620004SJunchao Zhang     else if (op == MPI_BXOR)                  *ScatterAndOp = link->h_ScatterAndBXOR;
990cd620004SJunchao Zhang     else if (op == MPI_MAXLOC)                *ScatterAndOp = link->h_ScatterAndMaxloc;
991cd620004SJunchao Zhang     else if (op == MPI_MINLOC)                *ScatterAndOp = link->h_ScatterAndMinloc;
992eb02082bSJunchao Zhang   }
993eb02082bSJunchao Zhang #if defined(PETSC_HAVE_CUDA)
994eb02082bSJunchao Zhang   else if (mtype == PETSC_MEMTYPE_DEVICE && !atomic) {
995cd620004SJunchao Zhang     if      (op == MPIU_REPLACE)              *ScatterAndOp = link->d_ScatterAndInsert;
996cd620004SJunchao Zhang     else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->d_ScatterAndAdd;
997cd620004SJunchao Zhang     else if (op == MPI_PROD)                  *ScatterAndOp = link->d_ScatterAndMult;
998cd620004SJunchao Zhang     else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->d_ScatterAndMax;
999cd620004SJunchao Zhang     else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->d_ScatterAndMin;
1000cd620004SJunchao Zhang     else if (op == MPI_LAND)                  *ScatterAndOp = link->d_ScatterAndLAND;
1001cd620004SJunchao Zhang     else if (op == MPI_BAND)                  *ScatterAndOp = link->d_ScatterAndBAND;
1002cd620004SJunchao Zhang     else if (op == MPI_LOR)                   *ScatterAndOp = link->d_ScatterAndLOR;
1003cd620004SJunchao Zhang     else if (op == MPI_BOR)                   *ScatterAndOp = link->d_ScatterAndBOR;
1004cd620004SJunchao Zhang     else if (op == MPI_LXOR)                  *ScatterAndOp = link->d_ScatterAndLXOR;
1005cd620004SJunchao Zhang     else if (op == MPI_BXOR)                  *ScatterAndOp = link->d_ScatterAndBXOR;
1006cd620004SJunchao Zhang     else if (op == MPI_MAXLOC)                *ScatterAndOp = link->d_ScatterAndMaxloc;
1007cd620004SJunchao Zhang     else if (op == MPI_MINLOC)                *ScatterAndOp = link->d_ScatterAndMinloc;
1008eb02082bSJunchao Zhang   } else if (mtype == PETSC_MEMTYPE_DEVICE && atomic) {
1009cd620004SJunchao Zhang     if      (op == MPIU_REPLACE)              *ScatterAndOp = link->da_ScatterAndInsert;
1010cd620004SJunchao Zhang     else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->da_ScatterAndAdd;
1011cd620004SJunchao Zhang     else if (op == MPI_PROD)                  *ScatterAndOp = link->da_ScatterAndMult;
1012cd620004SJunchao Zhang     else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->da_ScatterAndMax;
1013cd620004SJunchao Zhang     else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->da_ScatterAndMin;
1014cd620004SJunchao Zhang     else if (op == MPI_LAND)                  *ScatterAndOp = link->da_ScatterAndLAND;
1015cd620004SJunchao Zhang     else if (op == MPI_BAND)                  *ScatterAndOp = link->da_ScatterAndBAND;
1016cd620004SJunchao Zhang     else if (op == MPI_LOR)                   *ScatterAndOp = link->da_ScatterAndLOR;
1017cd620004SJunchao Zhang     else if (op == MPI_BOR)                   *ScatterAndOp = link->da_ScatterAndBOR;
1018cd620004SJunchao Zhang     else if (op == MPI_LXOR)                  *ScatterAndOp = link->da_ScatterAndLXOR;
1019cd620004SJunchao Zhang     else if (op == MPI_BXOR)                  *ScatterAndOp = link->da_ScatterAndBXOR;
1020cd620004SJunchao Zhang     else if (op == MPI_MAXLOC)                *ScatterAndOp = link->da_ScatterAndMaxloc;
1021cd620004SJunchao Zhang     else if (op == MPI_MINLOC)                *ScatterAndOp = link->da_ScatterAndMinloc;
1022eb02082bSJunchao Zhang   }
1023eb02082bSJunchao Zhang #endif
1024cd620004SJunchao Zhang   PetscFunctionReturn(0);
1025cd620004SJunchao Zhang }
1026cd620004SJunchao Zhang 
1027fcc7397dSJunchao Zhang PetscErrorCode PetscSFLinkGetFetchAndOp(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**FetchAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,void*))
1028cd620004SJunchao Zhang {
1029cd620004SJunchao Zhang   PetscFunctionBegin;
1030cd620004SJunchao Zhang   *FetchAndOp = NULL;
1031cd620004SJunchao Zhang   if (op != MPI_SUM && op != MPIU_SUM) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for MPI_Op in FetchAndOp");
1032cd620004SJunchao Zhang   if (mtype == PETSC_MEMTYPE_HOST) *FetchAndOp = link->h_FetchAndAdd;
1033cd620004SJunchao Zhang #if defined(PETSC_HAVE_CUDA)
1034cd620004SJunchao Zhang   else if (mtype == PETSC_MEMTYPE_DEVICE && !atomic) *FetchAndOp = link->d_FetchAndAdd;
1035cd620004SJunchao Zhang   else if (mtype == PETSC_MEMTYPE_DEVICE && atomic)  *FetchAndOp = link->da_FetchAndAdd;
1036cd620004SJunchao Zhang #endif
1037cd620004SJunchao Zhang   PetscFunctionReturn(0);
1038cd620004SJunchao Zhang }
1039cd620004SJunchao Zhang 
1040fcc7397dSJunchao Zhang PetscErrorCode PetscSFLinkGetFetchAndOpLocal(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**FetchAndOpLocal)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*))
1041cd620004SJunchao Zhang {
1042cd620004SJunchao Zhang   PetscFunctionBegin;
1043cd620004SJunchao Zhang   *FetchAndOpLocal = NULL;
1044cd620004SJunchao Zhang   if (op != MPI_SUM && op != MPIU_SUM) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for MPI_Op in FetchAndOp");
1045cd620004SJunchao Zhang   if (mtype == PETSC_MEMTYPE_HOST) *FetchAndOpLocal = link->h_FetchAndAddLocal;
1046cd620004SJunchao Zhang #if defined(PETSC_HAVE_CUDA)
1047cd620004SJunchao Zhang   else if (mtype == PETSC_MEMTYPE_DEVICE && !atomic) *FetchAndOpLocal = link->d_FetchAndAddLocal;
1048cd620004SJunchao Zhang   else if (mtype == PETSC_MEMTYPE_DEVICE && atomic)  *FetchAndOpLocal = link->da_FetchAndAddLocal;
1049cd620004SJunchao Zhang #endif
1050cd620004SJunchao Zhang   PetscFunctionReturn(0);
1051cd620004SJunchao Zhang }
1052cd620004SJunchao Zhang 
1053cd620004SJunchao Zhang /*=============================================================================
1054cd620004SJunchao Zhang               A set of helper routines for Pack/Unpack/Scatter on GPUs
1055cd620004SJunchao Zhang  ============================================================================*/
1056cd620004SJunchao Zhang #if defined(PETSC_HAVE_CUDA)
1057cd620004SJunchao Zhang /* If SF does not know which stream root/leafdata is being computed on, it has to sync the device to
1058cd620004SJunchao Zhang    make sure the data is ready for packing.
1059cd620004SJunchao Zhang  */
1060cd620004SJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkSyncDeviceBeforePackData(PetscSF sf,PetscSFLink link)
1061cd620004SJunchao Zhang {
1062cd620004SJunchao Zhang   PetscFunctionBegin;
1063cd620004SJunchao Zhang   if (sf->use_default_stream) PetscFunctionReturn(0);
1064cd620004SJunchao Zhang   if (link->rootmtype == PETSC_MEMTYPE_DEVICE || link->leafmtype == PETSC_MEMTYPE_DEVICE) {
1065cd620004SJunchao Zhang     cudaError_t cerr = cudaDeviceSynchronize();CHKERRCUDA(cerr);
1066cd620004SJunchao Zhang   }
1067cd620004SJunchao Zhang   PetscFunctionReturn(0);
1068cd620004SJunchao Zhang }
1069cd620004SJunchao Zhang 
1070cd620004SJunchao Zhang /* PetscSFLinkSyncStreamAfterPackXxxData routines make sure root/leafbuf for the remote is ready for MPI */
1071cd620004SJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkSyncStreamAfterPackRootData(PetscSF sf,PetscSFLink link)
1072cd620004SJunchao Zhang {
1073cd620004SJunchao Zhang   PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
1074cd620004SJunchao Zhang 
1075cd620004SJunchao Zhang   PetscFunctionBegin;
1076b85e67b7SJunchao Zhang   /* Do nothing if we use stream aware mpi || has nothing for remote */
1077b85e67b7SJunchao Zhang   if (sf->use_stream_aware_mpi || link->rootmtype != PETSC_MEMTYPE_DEVICE || !bas->rootbuflen[PETSCSF_REMOTE]) PetscFunctionReturn(0);
1078b85e67b7SJunchao Zhang   /* If we called a packing kernel || we async-copied rootdata from device to host || No cudaDeviceSynchronize was called (since default stream is assumed) */
1079c2a741eeSJunchao Zhang   if (!link->rootdirect[PETSCSF_REMOTE] || !sf->use_gpu_aware_mpi || sf->use_default_stream) {
1080cd620004SJunchao Zhang     cudaError_t cerr = cudaStreamSynchronize(link->stream);CHKERRCUDA(cerr);
1081cd620004SJunchao Zhang   }
1082cd620004SJunchao Zhang   PetscFunctionReturn(0);
1083cd620004SJunchao Zhang }
1084cd620004SJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkSyncStreamAfterPackLeafData(PetscSF sf,PetscSFLink link)
1085cd620004SJunchao Zhang {
1086cd620004SJunchao Zhang   PetscFunctionBegin;
1087b85e67b7SJunchao Zhang   /* See comments above */
1088b85e67b7SJunchao Zhang   if (sf->use_stream_aware_mpi || link->leafmtype != PETSC_MEMTYPE_DEVICE || !sf->leafbuflen[PETSCSF_REMOTE]) PetscFunctionReturn(0);
1089c2a741eeSJunchao Zhang   if (!link->leafdirect[PETSCSF_REMOTE] || !sf->use_gpu_aware_mpi || sf->use_default_stream) {
1090cd620004SJunchao Zhang     cudaError_t cerr = cudaStreamSynchronize(link->stream);CHKERRCUDA(cerr);
1091cd620004SJunchao Zhang   }
1092cd620004SJunchao Zhang   PetscFunctionReturn(0);
1093cd620004SJunchao Zhang }
1094cd620004SJunchao Zhang 
1095cd620004SJunchao Zhang /* PetscSFLinkSyncStreamAfterUnpackXxx routines make sure root/leafdata (local & remote) is ready to use for SF callers, when SF
1096cd620004SJunchao Zhang    does not know which stream the callers will use.
1097cd620004SJunchao Zhang */
1098cd620004SJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkSyncStreamAfterUnpackRootData(PetscSF sf,PetscSFLink link)
1099cd620004SJunchao Zhang {
1100cd620004SJunchao Zhang   PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
1101*4a314419SJunchao Zhang   PetscBool      host2host = (link->rootmtype == PETSC_MEMTYPE_HOST) && (link->leafmtype == PETSC_MEMTYPE_HOST) ? PETSC_TRUE : PETSC_FALSE;
1102cd620004SJunchao Zhang 
1103cd620004SJunchao Zhang   PetscFunctionBegin;
1104*4a314419SJunchao Zhang   /* Do nothing if host2host OR we are allowed to asynchronously put rootdata on device through the default stream */
1105*4a314419SJunchao Zhang   if (host2host || (link->rootmtype == PETSC_MEMTYPE_DEVICE && sf->use_default_stream)) PetscFunctionReturn(0);
1106*4a314419SJunchao Zhang 
1107*4a314419SJunchao Zhang   /* If rootmtype is HOST or DEVICE:
1108*4a314419SJunchao Zhang      If we have data from local, then we called a scatter kernel (on link->stream), then we must sync it;
1109*4a314419SJunchao Zhang      If we have data from remote && no rootdirect(i.e., we called an unpack kernel), then we must also sycn it (if rootdirect,
1110*4a314419SJunchao Zhang      i.e., no unpack kernel after MPI, MPI guarentees rootbuf is ready to use so that we do not need the sync).
1111*4a314419SJunchao Zhang 
1112*4a314419SJunchao Zhang      Note a tricky case is when leafmtype=DEVICE, rootmtype=HOST on uni-processor, we must sync the stream otherwise
1113*4a314419SJunchao Zhang      CPU thread might use the yet-to-be-updated rootdata pending in the stream.
1114b85e67b7SJunchao Zhang    */
1115b85e67b7SJunchao Zhang   if (bas->rootbuflen[PETSCSF_LOCAL] || (bas->rootbuflen[PETSCSF_REMOTE] && !link->rootdirect[PETSCSF_REMOTE])) {
1116cd620004SJunchao Zhang     cudaError_t cerr = cudaStreamSynchronize(link->stream);CHKERRCUDA(cerr);
1117cd620004SJunchao Zhang   }
1118cd620004SJunchao Zhang   PetscFunctionReturn(0);
1119cd620004SJunchao Zhang }
1120cd620004SJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkSyncStreamAfterUnpackLeafData(PetscSF sf,PetscSFLink link)
1121cd620004SJunchao Zhang {
1122*4a314419SJunchao Zhang   PetscBool      host2host = (link->rootmtype == PETSC_MEMTYPE_HOST) && (link->leafmtype == PETSC_MEMTYPE_HOST) ? PETSC_TRUE : PETSC_FALSE;
1123*4a314419SJunchao Zhang 
1124cd620004SJunchao Zhang   PetscFunctionBegin;
1125*4a314419SJunchao Zhang   /* See comments in PetscSFLinkSyncStreamAfterUnpackRootData*/
1126*4a314419SJunchao Zhang   if (host2host || (link->leafmtype == PETSC_MEMTYPE_DEVICE && sf->use_default_stream)) PetscFunctionReturn(0);
1127b85e67b7SJunchao Zhang   if (sf->leafbuflen[PETSCSF_LOCAL] || (sf->leafbuflen[PETSCSF_REMOTE] && !link->leafdirect[PETSCSF_REMOTE])) {
1128cd620004SJunchao Zhang     cudaError_t cerr = cudaStreamSynchronize(link->stream);CHKERRCUDA(cerr);
1129cd620004SJunchao Zhang   }
1130cd620004SJunchao Zhang   PetscFunctionReturn(0);
1131cd620004SJunchao Zhang }
1132cd620004SJunchao Zhang 
1133cd620004SJunchao Zhang /* PetscSFLinkCopyXxxxBufferInCaseNotUseGpuAwareMPI routines are simple: if not use_gpu_aware_mpi, we need
1134cd620004SJunchao Zhang    to copy the buffer from GPU to CPU before MPI calls, and from CPU to GPU after MPI calls.
1135cd620004SJunchao Zhang */
1136cd620004SJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(PetscSF sf,PetscSFLink link,PetscBool device2host)
1137cd620004SJunchao Zhang {
1138cd620004SJunchao Zhang   PetscErrorCode ierr;
1139cd620004SJunchao Zhang   cudaError_t    cerr;
1140cd620004SJunchao Zhang   PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
1141cd620004SJunchao Zhang 
1142cd620004SJunchao Zhang   PetscFunctionBegin;
1143cd620004SJunchao Zhang   if (link->rootmtype == PETSC_MEMTYPE_DEVICE && (link->rootmtype_mpi != link->rootmtype) && bas->rootbuflen[PETSCSF_REMOTE]) {
1144cd620004SJunchao Zhang     void  *h_buf = link->rootbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST];
1145cd620004SJunchao Zhang     void  *d_buf = link->rootbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_DEVICE];
1146cd620004SJunchao Zhang     size_t count = bas->rootbuflen[PETSCSF_REMOTE]*link->unitbytes;
1147cd620004SJunchao Zhang     if (device2host) {
1148cd620004SJunchao Zhang       cerr = cudaMemcpyAsync(h_buf,d_buf,count,cudaMemcpyDeviceToHost,link->stream);CHKERRCUDA(cerr);
1149cd620004SJunchao Zhang       ierr = PetscLogGpuToCpu(count);CHKERRQ(ierr);
1150cd620004SJunchao Zhang     } else {
1151cd620004SJunchao Zhang       cerr = cudaMemcpyAsync(d_buf,h_buf,count,cudaMemcpyHostToDevice,link->stream);CHKERRCUDA(cerr);
1152cd620004SJunchao Zhang       ierr = PetscLogCpuToGpu(count);CHKERRQ(ierr);
1153cd620004SJunchao Zhang     }
1154cd620004SJunchao Zhang   }
1155cd620004SJunchao Zhang   PetscFunctionReturn(0);
1156cd620004SJunchao Zhang }
1157cd620004SJunchao Zhang 
1158cd620004SJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(PetscSF sf,PetscSFLink link,PetscBool device2host)
1159cd620004SJunchao Zhang {
1160cd620004SJunchao Zhang   PetscErrorCode ierr;
1161cd620004SJunchao Zhang   cudaError_t    cerr;
1162cd620004SJunchao Zhang 
1163cd620004SJunchao Zhang   PetscFunctionBegin;
1164cd620004SJunchao Zhang   if (link->leafmtype == PETSC_MEMTYPE_DEVICE && (link->leafmtype_mpi != link->leafmtype) && sf->leafbuflen[PETSCSF_REMOTE]) {
1165cd620004SJunchao Zhang     void  *h_buf = link->leafbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST];
1166cd620004SJunchao Zhang     void  *d_buf = link->leafbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_DEVICE];
1167cd620004SJunchao Zhang     size_t count = sf->leafbuflen[PETSCSF_REMOTE]*link->unitbytes;
1168cd620004SJunchao Zhang     if (device2host) {
1169cd620004SJunchao Zhang       cerr = cudaMemcpyAsync(h_buf,d_buf,count,cudaMemcpyDeviceToHost,link->stream);CHKERRCUDA(cerr);
1170cd620004SJunchao Zhang       ierr = PetscLogGpuToCpu(count);CHKERRQ(ierr);
1171cd620004SJunchao Zhang     } else {
1172cd620004SJunchao Zhang       cerr = cudaMemcpyAsync(d_buf,h_buf,count,cudaMemcpyHostToDevice,link->stream);CHKERRCUDA(cerr);
1173cd620004SJunchao Zhang       ierr = PetscLogCpuToGpu(count);CHKERRQ(ierr);
1174cd620004SJunchao Zhang     }
1175cd620004SJunchao Zhang   }
1176cd620004SJunchao Zhang   PetscFunctionReturn(0);
1177cd620004SJunchao Zhang }
1178cd620004SJunchao Zhang #else
1179cd620004SJunchao Zhang 
1180cd620004SJunchao Zhang #define PetscSFLinkSyncDeviceBeforePackData(a,b)                0
1181cd620004SJunchao Zhang #define PetscSFLinkSyncStreamAfterPackRootData(a,b)             0
1182cd620004SJunchao Zhang #define PetscSFLinkSyncStreamAfterPackLeafData(a,b)             0
1183cd620004SJunchao Zhang #define PetscSFLinkSyncStreamAfterUnpackRootData(a,b)           0
1184cd620004SJunchao Zhang #define PetscSFLinkSyncStreamAfterUnpackLeafData(a,b)           0
1185cd620004SJunchao Zhang #define PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(a,b,c) 0
1186cd620004SJunchao Zhang #define PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(a,b,c) 0
1187cd620004SJunchao Zhang 
1188cd620004SJunchao Zhang #endif
1189cd620004SJunchao Zhang 
1190cd620004SJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkLogFlopsAfterUnpackRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,MPI_Op op)
1191cd620004SJunchao Zhang {
1192cd620004SJunchao Zhang   PetscErrorCode ierr;
1193cd620004SJunchao Zhang   PetscLogDouble flops;
1194cd620004SJunchao Zhang   PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
1195cd620004SJunchao Zhang 
1196cd620004SJunchao Zhang 
1197cd620004SJunchao Zhang   PetscFunctionBegin;
1198cd620004SJunchao Zhang   if (op != MPIU_REPLACE && link->basicunit == MPIU_SCALAR) { /* op is a reduction on PetscScalars */
1199cd620004SJunchao Zhang     flops = bas->rootbuflen[scope]*link->bs; /* # of roots in buffer x # of scalars in unit */
1200cd620004SJunchao Zhang #if defined(PETSC_HAVE_CUDA)
1201cd620004SJunchao Zhang     if (link->rootmtype == PETSC_MEMTYPE_DEVICE) {ierr = PetscLogGpuFlops(flops);CHKERRQ(ierr);} else
1202cd620004SJunchao Zhang #endif
1203cd620004SJunchao Zhang     {ierr = PetscLogFlops(flops);CHKERRQ(ierr);}
1204cd620004SJunchao Zhang   }
1205cd620004SJunchao Zhang   PetscFunctionReturn(0);
1206cd620004SJunchao Zhang }
1207cd620004SJunchao Zhang 
1208cd620004SJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkLogFlopsAfterUnpackLeafData(PetscSF sf,PetscSFLink link,PetscSFScope scope,MPI_Op op)
1209cd620004SJunchao Zhang {
1210cd620004SJunchao Zhang   PetscLogDouble flops;
1211cd620004SJunchao Zhang   PetscErrorCode ierr;
1212cd620004SJunchao Zhang 
1213cd620004SJunchao Zhang   PetscFunctionBegin;
1214cd620004SJunchao Zhang   if (op != MPIU_REPLACE && link->basicunit == MPIU_SCALAR) { /* op is a reduction on PetscScalars */
1215cd620004SJunchao Zhang     flops = sf->leafbuflen[scope]*link->bs; /* # of roots in buffer x # of scalars in unit */
1216cd620004SJunchao Zhang #if defined(PETSC_HAVE_CUDA)
1217cd620004SJunchao Zhang     if (link->leafmtype == PETSC_MEMTYPE_DEVICE) {ierr = PetscLogGpuFlops(flops);CHKERRQ(ierr);} else
1218cd620004SJunchao Zhang #endif
1219cd620004SJunchao Zhang     {ierr = PetscLogFlops(flops);CHKERRQ(ierr);}
1220cd620004SJunchao Zhang   }
1221cd620004SJunchao Zhang   PetscFunctionReturn(0);
1222cd620004SJunchao Zhang }
1223cd620004SJunchao Zhang 
1224cd620004SJunchao Zhang /* When SF could not find a proper UnpackAndOp() from link, it falls back to MPI_Reduce_local.
1225cd620004SJunchao Zhang   Input Arguments:
1226cd620004SJunchao Zhang   +sf      - The StarForest
1227cd620004SJunchao Zhang   .link    - The link
1228cd620004SJunchao Zhang   .count   - Number of entries to unpack
1229cd620004SJunchao Zhang   .start   - The first index, significent when indices=NULL
1230cd620004SJunchao Zhang   .indices - Indices of entries in <data>. If NULL, it means indices are contiguous and the first is given in <start>
1231cd620004SJunchao Zhang   .buf     - A contiguous buffer to unpack from
1232cd620004SJunchao Zhang   -op      - Operation after unpack
1233cd620004SJunchao Zhang 
1234cd620004SJunchao Zhang   Output Arguments:
1235cd620004SJunchao Zhang   .data    - The data to unpack to
1236cd620004SJunchao Zhang */
1237cd620004SJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkUnpackDataWithMPIReduceLocal(PetscSF sf,PetscSFLink link,PetscInt count,PetscInt start,const PetscInt *indices,void *data,const void *buf,MPI_Op op)
1238cd620004SJunchao Zhang {
1239cd620004SJunchao Zhang   PetscFunctionBegin;
1240cd620004SJunchao Zhang #if defined(PETSC_HAVE_MPI_REDUCE_LOCAL)
1241cd620004SJunchao Zhang   {
1242cd620004SJunchao Zhang     PetscErrorCode ierr;
1243cd620004SJunchao Zhang     PetscInt       i;
1244cd620004SJunchao Zhang     PetscMPIInt    n;
1245cd620004SJunchao Zhang     if (indices) {
1246cd620004SJunchao Zhang       /* Note we use link->unit instead of link->basicunit. When op can be mapped to MPI_SUM etc, it operates on
1247cd620004SJunchao Zhang          basic units of a root/leaf element-wisely. Otherwise, it is meant to operate on a whole root/leaf.
1248cd620004SJunchao Zhang       */
1249cd620004SJunchao Zhang       for (i=0; i<count; i++) {ierr = MPI_Reduce_local((const char*)buf+i*link->unitbytes,(char*)data+indices[i]*link->unitbytes,1,link->unit,op);CHKERRQ(ierr);}
1250cd620004SJunchao Zhang     } else {
1251cd620004SJunchao Zhang       ierr = PetscMPIIntCast(count,&n);CHKERRQ(ierr);
1252cd620004SJunchao Zhang       ierr = MPI_Reduce_local(buf,(char*)data+start*link->unitbytes,n,link->unit,op);CHKERRQ(ierr);
1253cd620004SJunchao Zhang     }
1254cd620004SJunchao Zhang   }
1255cd620004SJunchao Zhang #else
1256cd620004SJunchao Zhang   SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No unpacking reduction operation for this MPI_Op");
1257cd620004SJunchao Zhang #endif
1258cd620004SJunchao Zhang   PetscFunctionReturn(0);
1259cd620004SJunchao Zhang }
1260cd620004SJunchao Zhang 
1261fcc7397dSJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkScatterDataWithMPIReduceLocal(PetscSF sf,PetscSFLink link,PetscInt count,PetscInt srcStart,const PetscInt *srcIdx,const void *src,PetscInt dstStart,const PetscInt *dstIdx,void *dst,MPI_Op op)
1262cd620004SJunchao Zhang {
1263cd620004SJunchao Zhang   PetscFunctionBegin;
1264cd620004SJunchao Zhang #if defined(PETSC_HAVE_MPI_REDUCE_LOCAL)
1265cd620004SJunchao Zhang   {
1266cd620004SJunchao Zhang     PetscErrorCode ierr;
1267cd620004SJunchao Zhang     PetscInt       i,disp;
1268fcc7397dSJunchao Zhang     if (!srcIdx) {
1269fcc7397dSJunchao Zhang       ierr = PetscSFLinkUnpackDataWithMPIReduceLocal(sf,link,count,dstStart,dstIdx,dst,(const char*)src+srcStart*link->unitbytes,op);CHKERRQ(ierr);
1270fcc7397dSJunchao Zhang     } else {
1271cd620004SJunchao Zhang       for (i=0; i<count; i++) {
1272fcc7397dSJunchao Zhang         disp = dstIdx? dstIdx[i] : dstStart + i;
1273fcc7397dSJunchao Zhang         ierr = MPI_Reduce_local((const char*)src+srcIdx[i]*link->unitbytes,(char*)dst+disp*link->unitbytes,1,link->unit,op);CHKERRQ(ierr);
1274fcc7397dSJunchao Zhang       }
1275cd620004SJunchao Zhang     }
1276cd620004SJunchao Zhang   }
1277cd620004SJunchao Zhang #else
1278cd620004SJunchao Zhang   SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No unpacking reduction operation for this MPI_Op");
1279cd620004SJunchao Zhang #endif
1280cd620004SJunchao Zhang   PetscFunctionReturn(0);
1281cd620004SJunchao Zhang }
1282cd620004SJunchao Zhang 
1283cd620004SJunchao Zhang /*=============================================================================
1284cd620004SJunchao Zhang               Pack/Unpack/Fetch/Scatter routines
1285cd620004SJunchao Zhang  ============================================================================*/
1286cd620004SJunchao Zhang 
1287cd620004SJunchao Zhang /* Pack rootdata to rootbuf
1288cd620004SJunchao Zhang   Input Arguments:
1289cd620004SJunchao Zhang   + sf       - The SF this packing works on.
1290cd620004SJunchao Zhang   . link     - It gives the memtype of the roots and also provides root buffer.
1291cd620004SJunchao Zhang   . scope    - PETSCSF_LOCAL or PETSCSF_REMOTE. Note SF has the ability to do local and remote communications separately.
1292cd620004SJunchao Zhang   - rootdata - Where to read the roots.
1293cd620004SJunchao Zhang 
1294cd620004SJunchao Zhang   Notes:
1295cd620004SJunchao Zhang   When rootdata can be directly used as root buffer, the routine is almost a no-op. After the call, root data is
1296cd620004SJunchao Zhang   in a place where the underlying MPI is ready can access (use_gpu_aware_mpi or not)
1297cd620004SJunchao Zhang  */
1298cd620004SJunchao Zhang PetscErrorCode PetscSFLinkPackRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,const void *rootdata)
1299cd620004SJunchao Zhang {
1300cd620004SJunchao Zhang   PetscErrorCode   ierr;
1301cd620004SJunchao Zhang   PetscSF_Basic    *bas = (PetscSF_Basic*)sf->data;
1302cd620004SJunchao Zhang   const PetscInt   *rootindices = NULL;
1303cd620004SJunchao Zhang   PetscInt         count,start;
1304fcc7397dSJunchao Zhang   PetscErrorCode   (*Pack)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*) = NULL;
1305cd620004SJunchao Zhang   PetscMemType     rootmtype = link->rootmtype;
1306fcc7397dSJunchao Zhang   PetscSFPackOpt   opt = NULL;
1307fcc7397dSJunchao Zhang 
1308cd620004SJunchao Zhang 
1309cd620004SJunchao Zhang   PetscFunctionBegin;
1310cd620004SJunchao Zhang   ierr = PetscLogEventBegin(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr);
1311cd620004SJunchao Zhang   if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkSyncDeviceBeforePackData(sf,link);CHKERRQ(ierr);}
1312cd620004SJunchao Zhang   if (!link->rootdirect[scope] && bas->rootbuflen[scope]) { /* If rootdata works directly as rootbuf, skip packing */
1313fcc7397dSJunchao Zhang     ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,scope,&count,&start,&opt,&rootindices);CHKERRQ(ierr);
1314cd620004SJunchao Zhang     ierr = PetscSFLinkGetPack(link,rootmtype,&Pack);CHKERRQ(ierr);
1315fcc7397dSJunchao Zhang     ierr = (*Pack)(link,count,start,opt,rootindices,rootdata,link->rootbuf[scope][rootmtype]);CHKERRQ(ierr);
1316cd620004SJunchao Zhang   }
1317cd620004SJunchao Zhang   if (scope == PETSCSF_REMOTE) {
1318cd620004SJunchao Zhang     ierr = PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_TRUE/*device2host*/);CHKERRQ(ierr);
1319cd620004SJunchao Zhang     ierr = PetscSFLinkSyncStreamAfterPackRootData(sf,link);CHKERRQ(ierr);
1320cd620004SJunchao Zhang   }
1321cd620004SJunchao Zhang   ierr = PetscLogEventEnd(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr);
1322cd620004SJunchao Zhang   PetscFunctionReturn(0);
1323cd620004SJunchao Zhang }
1324cd620004SJunchao Zhang 
1325cd620004SJunchao Zhang /* Pack leafdata to leafbuf */
1326cd620004SJunchao Zhang PetscErrorCode PetscSFLinkPackLeafData(PetscSF sf,PetscSFLink link,PetscSFScope scope,const void *leafdata)
1327cd620004SJunchao Zhang {
1328cd620004SJunchao Zhang   PetscErrorCode   ierr;
1329cd620004SJunchao Zhang   const PetscInt   *leafindices = NULL;
1330cd620004SJunchao Zhang   PetscInt         count,start;
1331fcc7397dSJunchao Zhang   PetscErrorCode   (*Pack)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*) = NULL;
1332cd620004SJunchao Zhang   PetscMemType     leafmtype = link->leafmtype;
1333fcc7397dSJunchao Zhang   PetscSFPackOpt   opt = NULL;
1334cd620004SJunchao Zhang 
1335cd620004SJunchao Zhang   PetscFunctionBegin;
1336cd620004SJunchao Zhang   ierr = PetscLogEventBegin(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr);
1337cd620004SJunchao Zhang   if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkSyncDeviceBeforePackData(sf,link);CHKERRQ(ierr);}
1338cd620004SJunchao Zhang   if (!link->leafdirect[scope] && sf->leafbuflen[scope]) { /* If leafdata works directly as rootbuf, skip packing */
1339fcc7397dSJunchao Zhang     ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,scope,&count,&start,&opt,&leafindices);CHKERRQ(ierr);
1340cd620004SJunchao Zhang     ierr = PetscSFLinkGetPack(link,leafmtype,&Pack);CHKERRQ(ierr);
1341fcc7397dSJunchao Zhang     ierr = (*Pack)(link,count,start,opt,leafindices,leafdata,link->leafbuf[scope][leafmtype]);CHKERRQ(ierr);
1342cd620004SJunchao Zhang   }
1343cd620004SJunchao Zhang   if (scope == PETSCSF_REMOTE) {
1344cd620004SJunchao Zhang     ierr = PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_TRUE/*device2host*/);CHKERRQ(ierr);
1345cd620004SJunchao Zhang     ierr = PetscSFLinkSyncStreamAfterPackLeafData(sf,link);CHKERRQ(ierr);
1346cd620004SJunchao Zhang   }
1347cd620004SJunchao Zhang   ierr = PetscLogEventEnd(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr);
1348cd620004SJunchao Zhang   PetscFunctionReturn(0);
1349cd620004SJunchao Zhang }
1350cd620004SJunchao Zhang 
1351cd620004SJunchao Zhang /* Unpack rootbuf to rootdata */
1352cd620004SJunchao Zhang PetscErrorCode PetscSFLinkUnpackRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,void *rootdata,MPI_Op op)
1353cd620004SJunchao Zhang {
1354cd620004SJunchao Zhang   PetscErrorCode   ierr;
1355cd620004SJunchao Zhang   const PetscInt   *rootindices = NULL;
1356cd620004SJunchao Zhang   PetscInt         count,start;
1357cd620004SJunchao Zhang   PetscSF_Basic    *bas = (PetscSF_Basic*)sf->data;
1358fcc7397dSJunchao Zhang   PetscErrorCode   (*UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*) = NULL;
1359cd620004SJunchao Zhang   PetscMemType     rootmtype = link->rootmtype;
1360fcc7397dSJunchao Zhang   PetscSFPackOpt   opt = NULL;
1361cd620004SJunchao Zhang 
1362cd620004SJunchao Zhang   PetscFunctionBegin;
1363cd620004SJunchao Zhang   ierr = PetscLogEventBegin(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr);
1364cd620004SJunchao Zhang   if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_FALSE);CHKERRQ(ierr);}
1365cd620004SJunchao Zhang   if (!link->rootdirect[scope] && bas->rootbuflen[scope]) { /* If rootdata works directly as rootbuf, skip unpacking */
1366cd620004SJunchao Zhang     ierr = PetscSFLinkGetUnpackAndOp(link,rootmtype,op,bas->rootdups[scope],&UnpackAndOp);CHKERRQ(ierr);
1367cd620004SJunchao Zhang     if (UnpackAndOp) {
1368fcc7397dSJunchao Zhang       ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,scope,&count,&start,&opt,&rootindices);CHKERRQ(ierr);
1369fcc7397dSJunchao Zhang       ierr = (*UnpackAndOp)(link,count,start,opt,rootindices,rootdata,link->rootbuf[scope][rootmtype]);CHKERRQ(ierr);
1370cd620004SJunchao Zhang     } else {
1371fcc7397dSJunchao Zhang       ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,scope,&count,&start,&opt,&rootindices);CHKERRQ(ierr);
1372cd620004SJunchao Zhang       ierr = PetscSFLinkUnpackDataWithMPIReduceLocal(sf,link,count,start,rootindices,rootdata,link->rootbuf[scope][rootmtype],op);CHKERRQ(ierr);
1373cd620004SJunchao Zhang     }
1374cd620004SJunchao Zhang   }
1375cd620004SJunchao Zhang   if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkSyncStreamAfterUnpackRootData(sf,link);CHKERRQ(ierr);}
1376cd620004SJunchao Zhang   ierr = PetscSFLinkLogFlopsAfterUnpackRootData(sf,link,scope,op);CHKERRQ(ierr);
1377cd620004SJunchao Zhang   ierr = PetscLogEventEnd(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr);
1378cd620004SJunchao Zhang   PetscFunctionReturn(0);
1379cd620004SJunchao Zhang }
1380cd620004SJunchao Zhang 
1381cd620004SJunchao Zhang /* Unpack leafbuf to leafdata */
1382cd620004SJunchao Zhang PetscErrorCode PetscSFLinkUnpackLeafData(PetscSF sf,PetscSFLink link,PetscSFScope scope,void *leafdata,MPI_Op op)
1383cd620004SJunchao Zhang {
1384cd620004SJunchao Zhang   PetscErrorCode   ierr;
1385cd620004SJunchao Zhang   const PetscInt   *leafindices = NULL;
1386cd620004SJunchao Zhang   PetscInt         count,start;
1387fcc7397dSJunchao Zhang   PetscErrorCode   (*UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*) = NULL;
1388cd620004SJunchao Zhang   PetscMemType     leafmtype = link->leafmtype;
1389fcc7397dSJunchao Zhang   PetscSFPackOpt   opt = NULL;
1390cd620004SJunchao Zhang 
1391cd620004SJunchao Zhang   PetscFunctionBegin;
1392cd620004SJunchao Zhang   ierr = PetscLogEventBegin(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr);
1393cd620004SJunchao Zhang   if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_FALSE);CHKERRQ(ierr);}
1394cd620004SJunchao Zhang   if (!link->leafdirect[scope] && sf->leafbuflen[scope]) { /* If leafdata works directly as rootbuf, skip unpacking */
1395cd620004SJunchao Zhang     ierr = PetscSFLinkGetUnpackAndOp(link,leafmtype,op,sf->leafdups[scope],&UnpackAndOp);CHKERRQ(ierr);
1396cd620004SJunchao Zhang     if (UnpackAndOp) {
1397fcc7397dSJunchao Zhang       ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,scope,&count,&start,&opt,&leafindices);CHKERRQ(ierr);
1398fcc7397dSJunchao Zhang       ierr = (*UnpackAndOp)(link,count,start,opt,leafindices,leafdata,link->leafbuf[scope][leafmtype]);CHKERRQ(ierr);
1399cd620004SJunchao Zhang     } else {
1400fcc7397dSJunchao Zhang       ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,scope,&count,&start,&opt,&leafindices);CHKERRQ(ierr);
1401cd620004SJunchao Zhang       ierr = PetscSFLinkUnpackDataWithMPIReduceLocal(sf,link,count,start,leafindices,leafdata,link->leafbuf[scope][leafmtype],op);CHKERRQ(ierr);
1402cd620004SJunchao Zhang     }
1403cd620004SJunchao Zhang   }
1404cd620004SJunchao Zhang   if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkSyncStreamAfterUnpackLeafData(sf,link);CHKERRQ(ierr);}
1405cd620004SJunchao Zhang   ierr = PetscSFLinkLogFlopsAfterUnpackLeafData(sf,link,scope,op);CHKERRQ(ierr);
1406cd620004SJunchao Zhang   ierr = PetscLogEventEnd(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr);
1407cd620004SJunchao Zhang   PetscFunctionReturn(0);
1408cd620004SJunchao Zhang }
1409cd620004SJunchao Zhang 
1410cd620004SJunchao Zhang /* FetchAndOp rootdata with rootbuf */
1411cd620004SJunchao Zhang PetscErrorCode PetscSFLinkFetchRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,void *rootdata,MPI_Op op)
1412cd620004SJunchao Zhang {
1413cd620004SJunchao Zhang   PetscErrorCode     ierr;
1414cd620004SJunchao Zhang   const PetscInt     *rootindices = NULL;
1415cd620004SJunchao Zhang   PetscInt           count,start;
1416cd620004SJunchao Zhang   PetscSF_Basic      *bas = (PetscSF_Basic*)sf->data;
1417fcc7397dSJunchao Zhang   PetscErrorCode     (*FetchAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,void*) = NULL;
1418cd620004SJunchao Zhang   PetscMemType       rootmtype = link->rootmtype;
1419fcc7397dSJunchao Zhang   PetscSFPackOpt     opt = NULL;
1420cd620004SJunchao Zhang 
1421cd620004SJunchao Zhang   PetscFunctionBegin;
1422cd620004SJunchao Zhang   ierr = PetscLogEventBegin(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr);
1423cd620004SJunchao Zhang   if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_FALSE);CHKERRQ(ierr);}
1424cd620004SJunchao Zhang   if (bas->rootbuflen[scope]) {
1425cd620004SJunchao Zhang     /* Do FetchAndOp on rootdata with rootbuf */
1426cd620004SJunchao Zhang     ierr = PetscSFLinkGetFetchAndOp(link,rootmtype,op,bas->rootdups[scope],&FetchAndOp);CHKERRQ(ierr);
1427fcc7397dSJunchao Zhang     ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,scope,&count,&start,&opt,&rootindices);CHKERRQ(ierr);
1428fcc7397dSJunchao Zhang     ierr = (*FetchAndOp)(link,count,start,opt,rootindices,rootdata,link->rootbuf[scope][rootmtype]);CHKERRQ(ierr);
1429cd620004SJunchao Zhang   }
1430cd620004SJunchao Zhang   if (scope == PETSCSF_REMOTE) {
1431cd620004SJunchao Zhang     ierr = PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_TRUE);CHKERRQ(ierr);
1432cd620004SJunchao Zhang     ierr = PetscSFLinkSyncStreamAfterUnpackRootData(sf,link);CHKERRQ(ierr);
1433cd620004SJunchao Zhang   }
1434cd620004SJunchao Zhang   ierr = PetscSFLinkLogFlopsAfterUnpackRootData(sf,link,scope,op);CHKERRQ(ierr);
1435cd620004SJunchao Zhang   ierr = PetscLogEventEnd(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr);
1436cd620004SJunchao Zhang   PetscFunctionReturn(0);
1437cd620004SJunchao Zhang }
1438cd620004SJunchao Zhang 
1439cd620004SJunchao Zhang /* Bcast rootdata to leafdata locally (i.e., only for local communication - PETSCSF_LOCAL) */
1440cd620004SJunchao Zhang PetscErrorCode PetscSFLinkBcastAndOpLocal(PetscSF sf,PetscSFLink link,const void *rootdata,void *leafdata,MPI_Op op)
1441cd620004SJunchao Zhang {
1442cd620004SJunchao Zhang   PetscErrorCode       ierr;
1443cd620004SJunchao Zhang   const PetscInt       *rootindices = NULL,*leafindices = NULL;
1444cd620004SJunchao Zhang   PetscInt             count,rootstart,leafstart;
1445cd620004SJunchao Zhang   PetscSF_Basic        *bas = (PetscSF_Basic*)sf->data;
1446fcc7397dSJunchao Zhang   PetscErrorCode       (*ScatterAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*) = NULL;
1447cd620004SJunchao Zhang   const PetscMemType   rootmtype = link->rootmtype,leafmtype = link->leafmtype;
1448fcc7397dSJunchao Zhang   PetscSFPackOpt       leafopt = NULL,rootopt = NULL;
1449cd620004SJunchao Zhang 
1450cd620004SJunchao Zhang   PetscFunctionBegin;
1451cd620004SJunchao Zhang   if (!bas->rootbuflen[PETSCSF_LOCAL]) PetscFunctionReturn(0);
1452cd620004SJunchao Zhang   if (rootmtype != leafmtype) { /* Uncommon case */
1453cd620004SJunchao Zhang      /* The local communication has to go through pack and unpack */
1454cd620004SJunchao Zhang     ierr = PetscSFLinkPackRootData(sf,link,PETSCSF_LOCAL,rootdata);CHKERRQ(ierr);
1455f01131f0SJunchao Zhang     ierr = PetscSFLinkMemcpy(sf,link,leafmtype,link->leafbuf[PETSCSF_LOCAL][leafmtype],rootmtype,link->rootbuf[PETSCSF_LOCAL][rootmtype],sf->leafbuflen[PETSCSF_LOCAL]*link->unitbytes);CHKERRQ(ierr);
1456cd620004SJunchao Zhang     ierr = PetscSFLinkUnpackLeafData(sf,link,PETSCSF_LOCAL,leafdata,op);CHKERRQ(ierr);
1457cd620004SJunchao Zhang   } else {
1458cd620004SJunchao Zhang     ierr = PetscSFLinkGetScatterAndOp(link,leafmtype,op,sf->leafdups[PETSCSF_LOCAL],&ScatterAndOp);CHKERRQ(ierr);
1459cd620004SJunchao Zhang     if (ScatterAndOp) {
1460fcc7397dSJunchao Zhang       ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,PETSCSF_LOCAL,&count,&rootstart,&rootopt,&rootindices);CHKERRQ(ierr);
1461fcc7397dSJunchao Zhang       ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,PETSCSF_LOCAL,&count,&leafstart,&leafopt,&leafindices);CHKERRQ(ierr);
1462fcc7397dSJunchao Zhang       ierr = (*ScatterAndOp)(link,count,rootstart,rootopt,rootindices,rootdata,leafstart,leafopt,leafindices,leafdata);CHKERRQ(ierr);
1463cd620004SJunchao Zhang     } else {
1464fcc7397dSJunchao Zhang       ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,PETSCSF_LOCAL,&count,&rootstart,&rootopt,&rootindices);CHKERRQ(ierr);
1465fcc7397dSJunchao Zhang       ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,PETSCSF_LOCAL,&count,&leafstart,&leafopt,&leafindices);CHKERRQ(ierr);
1466fcc7397dSJunchao Zhang       ierr = PetscSFLinkScatterDataWithMPIReduceLocal(sf,link,count,rootstart,rootindices,rootdata,leafstart,leafindices,leafdata,op);CHKERRQ(ierr);
1467cd620004SJunchao Zhang     }
1468cd620004SJunchao Zhang   }
1469cd620004SJunchao Zhang   PetscFunctionReturn(0);
1470cd620004SJunchao Zhang }
1471cd620004SJunchao Zhang 
1472cd620004SJunchao Zhang /* Reduce leafdata to rootdata locally */
1473cd620004SJunchao Zhang PetscErrorCode PetscSFLinkReduceLocal(PetscSF sf,PetscSFLink link,const void *leafdata,void *rootdata,MPI_Op op)
1474cd620004SJunchao Zhang {
1475cd620004SJunchao Zhang   PetscErrorCode       ierr;
1476cd620004SJunchao Zhang   const PetscInt       *rootindices = NULL,*leafindices = NULL;
1477cd620004SJunchao Zhang   PetscInt             count,rootstart,leafstart;
1478cd620004SJunchao Zhang   PetscSF_Basic        *bas = (PetscSF_Basic*)sf->data;
1479fcc7397dSJunchao Zhang   PetscErrorCode       (*ScatterAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*) = NULL;
1480cd620004SJunchao Zhang   const PetscMemType   rootmtype = link->rootmtype,leafmtype = link->leafmtype;
1481fcc7397dSJunchao Zhang   PetscSFPackOpt       leafopt = NULL,rootopt = NULL;
1482cd620004SJunchao Zhang 
1483cd620004SJunchao Zhang   PetscFunctionBegin;
1484cd620004SJunchao Zhang   if (!sf->leafbuflen[PETSCSF_LOCAL]) PetscFunctionReturn(0);
1485cd620004SJunchao Zhang   if (rootmtype != leafmtype) {
1486cd620004SJunchao Zhang     /* The local communication has to go through pack and unpack */
1487cd620004SJunchao Zhang     ierr = PetscSFLinkPackLeafData(sf,link,PETSCSF_LOCAL,leafdata);CHKERRQ(ierr);
1488f01131f0SJunchao Zhang     ierr = PetscSFLinkMemcpy(sf,link,rootmtype,link->rootbuf[PETSCSF_LOCAL][rootmtype],leafmtype,link->leafbuf[PETSCSF_LOCAL][leafmtype],bas->rootbuflen[PETSCSF_LOCAL]*link->unitbytes);CHKERRQ(ierr);
1489cd620004SJunchao Zhang     ierr = PetscSFLinkUnpackRootData(sf,link,PETSCSF_LOCAL,rootdata,op);CHKERRQ(ierr);
1490cd620004SJunchao Zhang   } else {
1491cd620004SJunchao Zhang     ierr = PetscSFLinkGetScatterAndOp(link,rootmtype,op,bas->rootdups[PETSCSF_LOCAL],&ScatterAndOp);CHKERRQ(ierr);
1492cd620004SJunchao Zhang     if (ScatterAndOp) {
1493fcc7397dSJunchao Zhang       ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,PETSCSF_LOCAL,&count,&rootstart,&rootopt,&rootindices);CHKERRQ(ierr);
1494fcc7397dSJunchao Zhang       ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,PETSCSF_LOCAL,&count,&leafstart,&leafopt,&leafindices);CHKERRQ(ierr);
1495fcc7397dSJunchao Zhang       ierr = (*ScatterAndOp)(link,count,leafstart,leafopt,leafindices,leafdata,rootstart,rootopt,rootindices,rootdata);CHKERRQ(ierr);
1496cd620004SJunchao Zhang     } else {
1497fcc7397dSJunchao Zhang       ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,PETSCSF_LOCAL,&count,&rootstart,&rootopt,&rootindices);CHKERRQ(ierr);
1498fcc7397dSJunchao Zhang       ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,PETSCSF_LOCAL,&count,&leafstart,&leafopt,&leafindices);CHKERRQ(ierr);
1499fcc7397dSJunchao Zhang       ierr = PetscSFLinkScatterDataWithMPIReduceLocal(sf,link,count,leafstart,leafindices,leafdata,rootstart,rootindices,rootdata,op);CHKERRQ(ierr);
1500cd620004SJunchao Zhang     }
1501cd620004SJunchao Zhang   }
1502cd620004SJunchao Zhang   PetscFunctionReturn(0);
1503cd620004SJunchao Zhang }
1504cd620004SJunchao Zhang 
1505cd620004SJunchao Zhang /* Fetch rootdata to leafdata and leafupdate locally */
1506cd620004SJunchao Zhang PetscErrorCode PetscSFLinkFetchAndOpLocal(PetscSF sf,PetscSFLink link,void *rootdata,const void *leafdata,void *leafupdate,MPI_Op op)
1507cd620004SJunchao Zhang {
1508cd620004SJunchao Zhang   PetscErrorCode       ierr;
1509cd620004SJunchao Zhang   const PetscInt       *rootindices = NULL,*leafindices = NULL;
1510cd620004SJunchao Zhang   PetscInt             count,rootstart,leafstart;
1511cd620004SJunchao Zhang   PetscSF_Basic        *bas = (PetscSF_Basic*)sf->data;
1512fcc7397dSJunchao Zhang   PetscErrorCode       (*FetchAndOpLocal)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*) = NULL;
1513cd620004SJunchao Zhang   const PetscMemType   rootmtype = link->rootmtype,leafmtype = link->leafmtype;
1514fcc7397dSJunchao Zhang   PetscSFPackOpt       leafopt = NULL,rootopt = NULL;
1515cd620004SJunchao Zhang 
1516cd620004SJunchao Zhang   PetscFunctionBegin;
1517cd620004SJunchao Zhang   if (!bas->rootbuflen[PETSCSF_LOCAL]) PetscFunctionReturn(0);
1518cd620004SJunchao Zhang   if (rootmtype != leafmtype) {
1519cd620004SJunchao Zhang    /* The local communication has to go through pack and unpack */
1520cd620004SJunchao Zhang     SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Doing PetscSFFetchAndOp with rootdata and leafdata on opposite side of CPU and GPU");
1521cd620004SJunchao Zhang   } else {
1522fcc7397dSJunchao Zhang     ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,PETSCSF_LOCAL,&count,&rootstart,&rootopt,&rootindices);CHKERRQ(ierr);
1523fcc7397dSJunchao Zhang     ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,PETSCSF_LOCAL,&count,&leafstart,&leafopt,&leafindices);CHKERRQ(ierr);
1524cd620004SJunchao Zhang     ierr = PetscSFLinkGetFetchAndOpLocal(link,rootmtype,op,bas->rootdups[PETSCSF_LOCAL],&FetchAndOpLocal);CHKERRQ(ierr);
1525fcc7397dSJunchao Zhang     ierr = (*FetchAndOpLocal)(link,count,rootstart,rootopt,rootindices,rootdata,leafstart,leafopt,leafindices,leafdata,leafupdate);CHKERRQ(ierr);
1526cd620004SJunchao Zhang   }
152740e23c03SJunchao Zhang   PetscFunctionReturn(0);
152840e23c03SJunchao Zhang }
152940e23c03SJunchao Zhang 
153040e23c03SJunchao Zhang /*
1531cd620004SJunchao Zhang   Create per-rank pack/unpack optimizations based on indice patterns
153240e23c03SJunchao Zhang 
153340e23c03SJunchao Zhang    Input Parameters:
1534fcc7397dSJunchao Zhang   +  n       - Number of destination ranks
1535eb02082bSJunchao Zhang   .  offset  - [n+1] For the i-th rank, its associated indices are idx[offset[i], offset[i+1]). offset[0] needs not to be 0.
1536b23bfdefSJunchao Zhang   -  idx     - [*]   Array storing indices
153740e23c03SJunchao Zhang 
153840e23c03SJunchao Zhang    Output Parameters:
1539cd620004SJunchao Zhang   +  opt     - Pack optimizations. NULL if no optimizations.
154040e23c03SJunchao Zhang */
1541cd620004SJunchao Zhang PetscErrorCode PetscSFCreatePackOpt(PetscInt n,const PetscInt *offset,const PetscInt *idx,PetscSFPackOpt *out)
154240e23c03SJunchao Zhang {
154340e23c03SJunchao Zhang   PetscErrorCode ierr;
1544fcc7397dSJunchao Zhang   PetscInt       r,p,start,i,j,k,dx,dy,dz,dydz,m,X,Y;
1545fcc7397dSJunchao Zhang   PetscBool      optimizable = PETSC_TRUE;
154640e23c03SJunchao Zhang   PetscSFPackOpt opt;
154740e23c03SJunchao Zhang 
154840e23c03SJunchao Zhang   PetscFunctionBegin;
1549fcc7397dSJunchao Zhang   ierr   = PetscMalloc1(1,&opt);CHKERRQ(ierr);
1550fcc7397dSJunchao Zhang   ierr   = PetscMalloc1(7*n+2,&opt->array);CHKERRQ(ierr);
1551fcc7397dSJunchao Zhang   opt->n      = opt->array[0] = n;
1552fcc7397dSJunchao Zhang   opt->offset = opt->array + 1;
1553fcc7397dSJunchao Zhang   opt->start  = opt->array + n   + 2;
1554fcc7397dSJunchao Zhang   opt->dx     = opt->array + 2*n + 2;
1555fcc7397dSJunchao Zhang   opt->dy     = opt->array + 3*n + 2;
1556fcc7397dSJunchao Zhang   opt->dz     = opt->array + 4*n + 2;
1557fcc7397dSJunchao Zhang   opt->X      = opt->array + 5*n + 2;
1558fcc7397dSJunchao Zhang   opt->Y      = opt->array + 6*n + 2;
1559fcc7397dSJunchao Zhang 
1560fcc7397dSJunchao Zhang   for (r=0; r<n; r++) { /* For each destination rank */
1561fcc7397dSJunchao Zhang     m     = offset[r+1] - offset[r]; /* Total number of indices for this rank. We want to see if m can be factored into dx*dy*dz */
1562fcc7397dSJunchao Zhang     p     = offset[r];
1563fcc7397dSJunchao Zhang     start = idx[p]; /* First index for this rank */
1564fcc7397dSJunchao Zhang     p++;
1565fcc7397dSJunchao Zhang 
1566fcc7397dSJunchao Zhang     /* Search in X dimension */
1567fcc7397dSJunchao Zhang     for (dx=1; dx<m; dx++,p++) {
1568fcc7397dSJunchao Zhang       if (start+dx != idx[p]) break;
1569b23bfdefSJunchao Zhang     }
1570b23bfdefSJunchao Zhang 
1571fcc7397dSJunchao Zhang     dydz = m/dx;
1572fcc7397dSJunchao Zhang     X    = dydz > 1 ? (idx[p]-start) : dx;
1573fcc7397dSJunchao Zhang     /* Not optimizable if m is not a multiple of dx, or some unrecognized pattern is found */
1574fcc7397dSJunchao Zhang     if (m%dx || X <= 0) {optimizable = PETSC_FALSE; goto finish;}
1575fcc7397dSJunchao Zhang     for (dy=1; dy<dydz; dy++) { /* Search in Y dimension */
1576fcc7397dSJunchao Zhang       for (i=0; i<dx; i++,p++) {
1577fcc7397dSJunchao Zhang         if (start+X*dy+i != idx[p]) {
1578fcc7397dSJunchao Zhang           if (i) {optimizable = PETSC_FALSE; goto finish;} /* The pattern is violated in the middle of an x-walk */
1579fcc7397dSJunchao Zhang           else goto Z_dimension;
158040e23c03SJunchao Zhang         }
158140e23c03SJunchao Zhang       }
158240e23c03SJunchao Zhang     }
158340e23c03SJunchao Zhang 
1584fcc7397dSJunchao Zhang Z_dimension:
1585fcc7397dSJunchao Zhang     dz = m/(dx*dy);
1586fcc7397dSJunchao Zhang     Y  = dz > 1 ? (idx[p]-start)/X : dy;
1587fcc7397dSJunchao Zhang     /* Not optimizable if m is not a multiple of dx*dy, or some unrecognized pattern is found */
1588fcc7397dSJunchao Zhang     if (m%(dx*dy) || Y <= 0) {optimizable = PETSC_FALSE; goto finish;}
1589fcc7397dSJunchao Zhang     for (k=1; k<dz; k++) { /* Go through Z dimension to see if remaining indices follow the pattern */
1590fcc7397dSJunchao Zhang       for (j=0; j<dy; j++) {
1591fcc7397dSJunchao Zhang         for (i=0; i<dx; i++,p++) {
1592fcc7397dSJunchao Zhang           if (start+X*Y*k+X*j+i != idx[p]) {optimizable = PETSC_FALSE; goto finish;}
159340e23c03SJunchao Zhang         }
159440e23c03SJunchao Zhang       }
159540e23c03SJunchao Zhang     }
1596fcc7397dSJunchao Zhang     opt->start[r] = start;
1597fcc7397dSJunchao Zhang     opt->dx[r]    = dx;
1598fcc7397dSJunchao Zhang     opt->dy[r]    = dy;
1599fcc7397dSJunchao Zhang     opt->dz[r]    = dz;
1600fcc7397dSJunchao Zhang     opt->X[r]     = X;
1601fcc7397dSJunchao Zhang     opt->Y[r]     = Y;
160240e23c03SJunchao Zhang   }
160340e23c03SJunchao Zhang 
1604fcc7397dSJunchao Zhang finish:
1605fcc7397dSJunchao Zhang   /* If not optimizable, free arrays to save memory */
1606fcc7397dSJunchao Zhang   if (!n || !optimizable) {
1607fcc7397dSJunchao Zhang     ierr = PetscFree(opt->array);CHKERRQ(ierr);
160840e23c03SJunchao Zhang     ierr = PetscFree(opt);CHKERRQ(ierr);
160940e23c03SJunchao Zhang     *out = NULL;
1610fcc7397dSJunchao Zhang   } else {
1611fcc7397dSJunchao Zhang     opt->offset[0] = 0;
1612fcc7397dSJunchao Zhang     for (r=0; r<n; r++) opt->offset[r+1] = opt->offset[r] + opt->dx[r]*opt->dy[r]*opt->dz[r];
1613fcc7397dSJunchao Zhang     *out = opt;
1614fcc7397dSJunchao Zhang   }
161540e23c03SJunchao Zhang   PetscFunctionReturn(0);
161640e23c03SJunchao Zhang }
161740e23c03SJunchao Zhang 
1618fcc7397dSJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFDestroyPackOpt(PetscMemType mtype,PetscSFPackOpt *out)
161940e23c03SJunchao Zhang {
162040e23c03SJunchao Zhang   PetscErrorCode ierr;
162140e23c03SJunchao Zhang   PetscSFPackOpt opt = *out;
162240e23c03SJunchao Zhang 
162340e23c03SJunchao Zhang   PetscFunctionBegin;
162440e23c03SJunchao Zhang   if (opt) {
1625fcc7397dSJunchao Zhang     if (mtype == PETSC_MEMTYPE_HOST) {ierr = PetscFree(opt->array);CHKERRQ(ierr);}
1626fcc7397dSJunchao Zhang #if defined(PETSC_HAVE_CUDA)
1627fcc7397dSJunchao Zhang     else {cudaError_t cerr = cudaFree(opt->array);CHKERRCUDA(cerr);opt->array=NULL;}
1628fcc7397dSJunchao Zhang #endif
162940e23c03SJunchao Zhang     ierr = PetscFree(opt);CHKERRQ(ierr);
163040e23c03SJunchao Zhang     *out = NULL;
163140e23c03SJunchao Zhang   }
163240e23c03SJunchao Zhang   PetscFunctionReturn(0);
163340e23c03SJunchao Zhang }
1634cd620004SJunchao Zhang 
1635cd620004SJunchao Zhang PetscErrorCode PetscSFSetUpPackFields(PetscSF sf)
1636cd620004SJunchao Zhang {
1637cd620004SJunchao Zhang   PetscErrorCode ierr;
1638cd620004SJunchao Zhang   PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
1639cd620004SJunchao Zhang   PetscInt       i,j;
1640cd620004SJunchao Zhang 
1641cd620004SJunchao Zhang   PetscFunctionBegin;
1642cd620004SJunchao Zhang   /* [0] for PETSCSF_LOCAL and [1] for PETSCSF_REMOTE in the following */
1643cd620004SJunchao Zhang   for (i=0; i<2; i++) { /* Set defaults */
1644cd620004SJunchao Zhang     sf->leafstart[i]   = 0;
1645cd620004SJunchao Zhang     sf->leafcontig[i]  = PETSC_TRUE;
1646cd620004SJunchao Zhang     sf->leafdups[i]    = PETSC_FALSE;
1647cd620004SJunchao Zhang     bas->rootstart[i]  = 0;
1648cd620004SJunchao Zhang     bas->rootcontig[i] = PETSC_TRUE;
1649cd620004SJunchao Zhang     bas->rootdups[i]   = PETSC_FALSE;
1650cd620004SJunchao Zhang   }
1651cd620004SJunchao Zhang 
1652cd620004SJunchao Zhang   sf->leafbuflen[0] = sf->roffset[sf->ndranks];
1653cd620004SJunchao Zhang   sf->leafbuflen[1] = sf->roffset[sf->nranks] - sf->roffset[sf->ndranks];
1654cd620004SJunchao Zhang 
1655cd620004SJunchao Zhang   if (sf->leafbuflen[0]) sf->leafstart[0] = sf->rmine[0];
1656cd620004SJunchao Zhang   if (sf->leafbuflen[1]) sf->leafstart[1] = sf->rmine[sf->roffset[sf->ndranks]];
1657cd620004SJunchao Zhang 
1658cd620004SJunchao Zhang   /* Are leaf indices for self and remote contiguous? If yes, it is best for pack/unpack */
1659cd620004SJunchao Zhang   for (i=0; i<sf->roffset[sf->ndranks]; i++) { /* self */
1660cd620004SJunchao Zhang     if (sf->rmine[i] != sf->leafstart[0]+i) {sf->leafcontig[0] = PETSC_FALSE; break;}
1661cd620004SJunchao Zhang   }
1662cd620004SJunchao Zhang   for (i=sf->roffset[sf->ndranks],j=0; i<sf->roffset[sf->nranks]; i++,j++) { /* remote */
1663cd620004SJunchao Zhang     if (sf->rmine[i] != sf->leafstart[1]+j) {sf->leafcontig[1] = PETSC_FALSE; break;}
1664cd620004SJunchao Zhang   }
1665cd620004SJunchao Zhang 
1666cd620004SJunchao Zhang   /* If not, see if we can have per-rank optimizations by doing index analysis */
1667cd620004SJunchao Zhang   if (!sf->leafcontig[0]) {ierr = PetscSFCreatePackOpt(sf->ndranks,            sf->roffset,             sf->rmine, &sf->leafpackopt[0]);CHKERRQ(ierr);}
1668cd620004SJunchao Zhang   if (!sf->leafcontig[1]) {ierr = PetscSFCreatePackOpt(sf->nranks-sf->ndranks, sf->roffset+sf->ndranks, sf->rmine, &sf->leafpackopt[1]);CHKERRQ(ierr);}
1669cd620004SJunchao Zhang 
1670cd620004SJunchao Zhang   /* Are root indices for self and remote contiguous? */
1671cd620004SJunchao Zhang   bas->rootbuflen[0] = bas->ioffset[bas->ndiranks];
1672cd620004SJunchao Zhang   bas->rootbuflen[1] = bas->ioffset[bas->niranks] - bas->ioffset[bas->ndiranks];
1673cd620004SJunchao Zhang 
1674cd620004SJunchao Zhang   if (bas->rootbuflen[0]) bas->rootstart[0] = bas->irootloc[0];
1675cd620004SJunchao Zhang   if (bas->rootbuflen[1]) bas->rootstart[1] = bas->irootloc[bas->ioffset[bas->ndiranks]];
1676cd620004SJunchao Zhang 
1677cd620004SJunchao Zhang   for (i=0; i<bas->ioffset[bas->ndiranks]; i++) {
1678cd620004SJunchao Zhang     if (bas->irootloc[i] != bas->rootstart[0]+i) {bas->rootcontig[0] = PETSC_FALSE; break;}
1679cd620004SJunchao Zhang   }
1680cd620004SJunchao Zhang   for (i=bas->ioffset[bas->ndiranks],j=0; i<bas->ioffset[bas->niranks]; i++,j++) {
1681cd620004SJunchao Zhang     if (bas->irootloc[i] != bas->rootstart[1]+j) {bas->rootcontig[1] = PETSC_FALSE; break;}
1682cd620004SJunchao Zhang   }
1683cd620004SJunchao Zhang 
1684cd620004SJunchao Zhang   if (!bas->rootcontig[0]) {ierr = PetscSFCreatePackOpt(bas->ndiranks,              bas->ioffset,               bas->irootloc, &bas->rootpackopt[0]);CHKERRQ(ierr);}
1685cd620004SJunchao Zhang   if (!bas->rootcontig[1]) {ierr = PetscSFCreatePackOpt(bas->niranks-bas->ndiranks, bas->ioffset+bas->ndiranks, bas->irootloc, &bas->rootpackopt[1]);CHKERRQ(ierr);}
1686cd620004SJunchao Zhang 
1687cd620004SJunchao Zhang #if defined(PETSC_HAVE_CUDA)
1688cd620004SJunchao Zhang     /* Check dups in indices so that CUDA unpacking kernels can use cheaper regular instructions instead of atomics when they know there are no data race chances */
1689cd620004SJunchao Zhang   if (!sf->leafcontig[0])  {ierr = PetscCheckDupsInt(sf->leafbuflen[0],  sf->rmine,                                 &sf->leafdups[0]);CHKERRQ(ierr);}
1690cd620004SJunchao Zhang   if (!sf->leafcontig[1])  {ierr = PetscCheckDupsInt(sf->leafbuflen[1],  sf->rmine+sf->roffset[sf->ndranks],        &sf->leafdups[1]);CHKERRQ(ierr);}
1691cd620004SJunchao Zhang   if (!bas->rootcontig[0]) {ierr = PetscCheckDupsInt(bas->rootbuflen[0], bas->irootloc,                             &bas->rootdups[0]);CHKERRQ(ierr);}
1692cd620004SJunchao Zhang   if (!bas->rootcontig[1]) {ierr = PetscCheckDupsInt(bas->rootbuflen[1], bas->irootloc+bas->ioffset[bas->ndiranks], &bas->rootdups[1]);CHKERRQ(ierr);}
1693cd620004SJunchao Zhang #endif
1694cd620004SJunchao Zhang 
1695cd620004SJunchao Zhang   PetscFunctionReturn(0);
1696cd620004SJunchao Zhang }
1697cd620004SJunchao Zhang 
1698cd620004SJunchao Zhang PetscErrorCode PetscSFResetPackFields(PetscSF sf)
1699cd620004SJunchao Zhang {
1700cd620004SJunchao Zhang   PetscErrorCode ierr;
1701cd620004SJunchao Zhang   PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
1702cd620004SJunchao Zhang   PetscInt       i;
1703cd620004SJunchao Zhang 
1704cd620004SJunchao Zhang   PetscFunctionBegin;
1705cd620004SJunchao Zhang   for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) {
1706fcc7397dSJunchao Zhang     ierr = PetscSFDestroyPackOpt(PETSC_MEMTYPE_HOST,&sf->leafpackopt[i]);CHKERRQ(ierr);
1707fcc7397dSJunchao Zhang     ierr = PetscSFDestroyPackOpt(PETSC_MEMTYPE_DEVICE,&sf->leafpackopt_d[i]);CHKERRQ(ierr);
1708fcc7397dSJunchao Zhang     ierr = PetscSFDestroyPackOpt(PETSC_MEMTYPE_HOST,&bas->rootpackopt[i]);CHKERRQ(ierr);
1709fcc7397dSJunchao Zhang     ierr = PetscSFDestroyPackOpt(PETSC_MEMTYPE_DEVICE,&bas->rootpackopt_d[i]);CHKERRQ(ierr);
1710cd620004SJunchao Zhang   }
1711cd620004SJunchao Zhang   PetscFunctionReturn(0);
1712cd620004SJunchao Zhang }
1713