xref: /petsc/src/vec/is/sf/impls/basic/sfpack.c (revision f01131f0670950e9d98c29f16b982bc6d3023227)
140e23c03SJunchao Zhang 
240e23c03SJunchao Zhang #include <../src/vec/is/sf/impls/basic/sfpack.h>
340e23c03SJunchao Zhang #include <../src/vec/is/sf/impls/basic/sfbasic.h>
440e23c03SJunchao Zhang 
5eb02082bSJunchao Zhang #if defined(PETSC_HAVE_CUDA)
6eb02082bSJunchao Zhang #include <cuda_runtime.h>
7eb02082bSJunchao Zhang #endif
840e23c03SJunchao Zhang /*
940e23c03SJunchao Zhang  * MPI_Reduce_local is not really useful because it can't handle sparse data and it vectorizes "in the wrong direction",
1040e23c03SJunchao Zhang  * therefore we pack data types manually. This file defines packing routines for the standard data types.
1140e23c03SJunchao Zhang  */
1240e23c03SJunchao Zhang 
13cd620004SJunchao Zhang #define CPPJoin4(a,b,c,d)  a##_##b##_##c##_##d
1440e23c03SJunchao Zhang 
15cd620004SJunchao Zhang /* Operations working like s += t */
16cd620004SJunchao Zhang #define OP_BINARY(op,s,t)   do {(s) = (s) op (t);  } while(0)      /* binary ops in the middle such as +, *, && etc. */
17cd620004SJunchao Zhang #define OP_FUNCTION(op,s,t) do {(s) = op((s),(t)); } while(0)      /* ops like a function, such as PetscMax, PetscMin */
18cd620004SJunchao Zhang #define OP_LXOR(op,s,t)     do {(s) = (!(s)) != (!(t));} while(0)  /* logical exclusive OR */
19cd620004SJunchao Zhang #define OP_ASSIGN(op,s,t)   do {(s) = (t);} while(0)
20cd620004SJunchao Zhang /* Ref MPI MAXLOC */
21cd620004SJunchao Zhang #define OP_XLOC(op,s,t) \
22cd620004SJunchao Zhang   do {                                       \
23cd620004SJunchao Zhang     if ((s).u == (t).u) (s).i = PetscMin((s).i,(t).i); \
24cd620004SJunchao Zhang     else if (!((s).u op (t).u)) s = t;           \
25cd620004SJunchao Zhang   } while(0)
2640e23c03SJunchao Zhang 
2740e23c03SJunchao Zhang /* DEF_PackFunc - macro defining a Pack routine
2840e23c03SJunchao Zhang 
2940e23c03SJunchao Zhang    Arguments of the macro:
30b23bfdefSJunchao Zhang    +Type      Type of the basic data in an entry, i.e., int, PetscInt, PetscReal etc. It is not the type of an entry.
31b23bfdefSJunchao Zhang    .BS        Block size for vectorization. It is a factor of bs.
32b23bfdefSJunchao Zhang    -EQ        (bs == BS) ? 1 : 0. EQ is a compile-time const to help compiler optimizations. See below.
3340e23c03SJunchao Zhang 
3440e23c03SJunchao Zhang    Arguments of the Pack routine:
35cd620004SJunchao Zhang    +count     Number of indices in idx[].
36cd620004SJunchao Zhang    .start     If indices are contiguous, it is the first index; otherwise, not used.
37cd620004SJunchao Zhang    .idx       Indices of entries to packed. NULL means contiguous indices, that is [start,start+count).
38eb02082bSJunchao Zhang    .link      Provide a context for the current call, such as link->bs, number of basic types in an entry. Ex. if unit is MPI_2INT, then bs=2 and the basic type is int.
39cd620004SJunchao Zhang    .opt       Per-remote pack optimization plan. NULL means no such plan.
40cd620004SJunchao Zhang    .unpacked  Address of the unpacked data. The entries will be packed are unpacked[idx[i]],for i in [0,count).
41cd620004SJunchao Zhang    -packed    Address of the packed data.
4240e23c03SJunchao Zhang  */
43b23bfdefSJunchao Zhang #define DEF_PackFunc(Type,BS,EQ) \
44cd620004SJunchao Zhang   static PetscErrorCode CPPJoin4(Pack,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,const PetscInt *idx,PetscSFPackOpt opt,const void *unpacked,void *packed) \
45b23bfdefSJunchao Zhang   {                                                                                                          \
4640e23c03SJunchao Zhang     PetscErrorCode ierr;                                                                                     \
47b23bfdefSJunchao Zhang     const Type     *u = (const Type*)unpacked,*u2;                                                           \
48b23bfdefSJunchao Zhang     Type           *p = (Type*)packed,*p2;                                                                   \
49eb02082bSJunchao Zhang     PetscInt       i,j,k,l,r,step,bs=link->bs;                                                               \
50b23bfdefSJunchao Zhang     const PetscInt *idx2,M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */    \
51b23bfdefSJunchao Zhang     const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */                \
5240e23c03SJunchao Zhang     PetscFunctionBegin;                                                                                      \
53cd620004SJunchao Zhang     if (!idx) {ierr = PetscArraycpy(p,u+start*bs,MBS*count);CHKERRQ(ierr);}  /* Indices are contiguous */    \
54b23bfdefSJunchao Zhang     else if (!opt) { /* No optimizations available */                                                        \
55b23bfdefSJunchao Zhang       for (i=0; i<count; i++)                                                                                \
56eb02082bSJunchao Zhang         for (j=0; j<M; j++)     /* Decent compilers should eliminate this loop when M = const 1 */           \
57eb02082bSJunchao Zhang           for (k=0; k<BS; k++)  /* Compiler either unrolls (BS=1) or vectorizes (BS=2,4,8,etc) this loop */  \
58b23bfdefSJunchao Zhang             p[i*MBS+j*BS+k] = u[idx[i]*MBS+j*BS+k];                                                          \
59b23bfdefSJunchao Zhang     } else {                                                                                                 \
60b23bfdefSJunchao Zhang       for (r=0; r<opt->n; r++) {                                                                             \
61b23bfdefSJunchao Zhang         p2  = p + opt->offset[r]*MBS;                                                                        \
62b23bfdefSJunchao Zhang         if (opt->type[r] == PETSCSF_PACKOPT_NONE) {                                                          \
63b23bfdefSJunchao Zhang           idx2 = idx + opt->offset[r];                                                                       \
64b23bfdefSJunchao Zhang           for (i=0; i<opt->offset[r+1]-opt->offset[r]; i++)                                                  \
65b23bfdefSJunchao Zhang             for (j=0; j<M; j++)                                                                              \
66b23bfdefSJunchao Zhang               for (k=0; k<BS; k++)                                                                           \
67b23bfdefSJunchao Zhang                 p2[i*MBS+j*BS+k] = u[idx2[i]*MBS+j*BS+k];                                                    \
68b23bfdefSJunchao Zhang         } else if (opt->type[r] == PETSCSF_PACKOPT_MULTICOPY) {                                              \
6940e23c03SJunchao Zhang           for (i=opt->copy_offset[r]; i<opt->copy_offset[r+1]; i++) {                                        \
70b23bfdefSJunchao Zhang             u2   = u + idx[opt->copy_start[i]]*MBS;                                                          \
71b23bfdefSJunchao Zhang             l    = opt->copy_length[i]*MBS; /* length in basic type such as MPI_INT */                       \
72b23bfdefSJunchao Zhang             ierr = PetscArraycpy(p2,u2,l);CHKERRQ(ierr);                                                     \
73b23bfdefSJunchao Zhang             p2  += l;                                                                                        \
7440e23c03SJunchao Zhang           }                                                                                                  \
75b23bfdefSJunchao Zhang         } else if (opt->type[r] == PETSCSF_PACKOPT_STRIDE) {                                                 \
76b23bfdefSJunchao Zhang           u2   = u + idx[opt->offset[r]]*MBS;                                                                \
7740e23c03SJunchao Zhang           step = opt->stride_step[r];                                                                        \
7840e23c03SJunchao Zhang           for (i=0; i<opt->stride_n[r]; i++)                                                                 \
79b23bfdefSJunchao Zhang             for (j=0; j<MBS; j++) p2[i*MBS+j] = u2[i*step*MBS+j];                                            \
80b23bfdefSJunchao Zhang         } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Unknown SFPack optimzation type %D",opt->type[r]);   \
8140e23c03SJunchao Zhang       }                                                                                                      \
8240e23c03SJunchao Zhang     }                                                                                                        \
8340e23c03SJunchao Zhang     PetscFunctionReturn(0);                                                                                  \
8440e23c03SJunchao Zhang   }
8540e23c03SJunchao Zhang 
86cd620004SJunchao Zhang /* DEF_Action - macro defining a UnpackAndInsert routine that unpacks data from a contiguous buffer
87cd620004SJunchao Zhang                 and inserts into a sparse array.
8840e23c03SJunchao Zhang 
8940e23c03SJunchao Zhang    Arguments:
90b23bfdefSJunchao Zhang   .Type       Type of the data
9140e23c03SJunchao Zhang   .BS         Block size for vectorization
92b23bfdefSJunchao Zhang   .EQ        (bs == BS) ? 1 : 0. EQ is a compile-time const.
9340e23c03SJunchao Zhang 
9440e23c03SJunchao Zhang   Notes:
9540e23c03SJunchao Zhang    This macro is not combined with DEF_ActionAndOp because we want to use memcpy in this macro.
9640e23c03SJunchao Zhang  */
97cd620004SJunchao Zhang #define DEF_UnpackFunc(Type,BS,EQ)               \
98cd620004SJunchao Zhang   static PetscErrorCode CPPJoin4(UnpackAndInsert,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,const PetscInt *idx,PetscSFPackOpt opt,void *unpacked,const void *packed) \
99b23bfdefSJunchao Zhang   {                                                                                                          \
10040e23c03SJunchao Zhang     PetscErrorCode ierr;                                                                                     \
101b23bfdefSJunchao Zhang     Type           *u = (Type*)unpacked,*u2;                                                                 \
102cd620004SJunchao Zhang     const Type     *p = (const Type*)packed,*p2;                                                             \
103eb02082bSJunchao Zhang     PetscInt       i,j,k,l,r,step,bs=link->bs;                                                               \
104b23bfdefSJunchao Zhang     const PetscInt *idx2,M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */    \
105b23bfdefSJunchao Zhang     const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */                \
10640e23c03SJunchao Zhang     PetscFunctionBegin;                                                                                      \
107b23bfdefSJunchao Zhang     if (!idx) {                                                                                              \
108cd620004SJunchao Zhang       u2 = u + start*bs;                                                                                     \
109cd620004SJunchao Zhang       if (u2 != p) {ierr = PetscArraycpy(u2,p,count*MBS);CHKERRQ(ierr);}                                     \
110b23bfdefSJunchao Zhang     } else if (!opt) { /* No optimizations available */                                                      \
111b23bfdefSJunchao Zhang       for (i=0; i<count; i++)                                                                                \
112b23bfdefSJunchao Zhang         for (j=0; j<M; j++)                                                                                  \
113cd620004SJunchao Zhang           for (k=0; k<BS; k++) u[idx[i]*MBS+j*BS+k] = p[i*MBS+j*BS+k];                                       \
114b23bfdefSJunchao Zhang     } else {                                                                                                 \
115b23bfdefSJunchao Zhang       for (r=0; r<opt->n; r++) {                                                                             \
116b23bfdefSJunchao Zhang         p2 = p + opt->offset[r]*MBS;                                                                         \
117b23bfdefSJunchao Zhang         if (opt->type[r] == PETSCSF_PACKOPT_NONE) {                                                          \
118b23bfdefSJunchao Zhang           idx2 = idx + opt->offset[r];                                                                       \
119b23bfdefSJunchao Zhang           for (i=0; i<opt->offset[r+1]-opt->offset[r]; i++)                                                  \
120b23bfdefSJunchao Zhang             for (j=0; j<M; j++)                                                                              \
121cd620004SJunchao Zhang               for (k=0; k<BS; k++) u[idx2[i]*MBS+j*BS+k] = p2[i*MBS+j*BS+k];                                 \
122b23bfdefSJunchao Zhang         } else if (opt->type[r] == PETSCSF_PACKOPT_MULTICOPY) {                                              \
12340e23c03SJunchao Zhang           for (i=opt->copy_offset[r]; i<opt->copy_offset[r+1]; i++) { /* i-th piece */                       \
124b23bfdefSJunchao Zhang             u2 = u + idx[opt->copy_start[i]]*MBS;                                                            \
125b23bfdefSJunchao Zhang             l  = opt->copy_length[i]*MBS;                                                                    \
126b23bfdefSJunchao Zhang             ierr = PetscArraycpy(u2,p2,l);CHKERRQ(ierr);                                                     \
127b23bfdefSJunchao Zhang             p2 += l;                                                                                         \
12840e23c03SJunchao Zhang           }                                                                                                  \
129b23bfdefSJunchao Zhang         } else if (opt->type[r] == PETSCSF_PACKOPT_STRIDE) {                                                 \
130b23bfdefSJunchao Zhang           u2   = u + idx[opt->offset[r]]*MBS;                                                                \
13140e23c03SJunchao Zhang           step = opt->stride_step[r];                                                                        \
13240e23c03SJunchao Zhang           for (i=0; i<opt->stride_n[r]; i++)                                                                 \
133b23bfdefSJunchao Zhang             for (j=0; j<M; j++)                                                                              \
134cd620004SJunchao Zhang               for (k=0; k<BS; k++) u2[i*step*MBS+j*BS+k] = p2[i*MBS+j*BS+k];                                 \
135b23bfdefSJunchao Zhang         } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Unknown SFPack optimzation type %D",opt->type[r]);   \
13640e23c03SJunchao Zhang       }                                                                                                      \
13740e23c03SJunchao Zhang     }                                                                                                        \
13840e23c03SJunchao Zhang     PetscFunctionReturn(0);                                                                                  \
13940e23c03SJunchao Zhang   }
14040e23c03SJunchao Zhang 
141cd620004SJunchao Zhang /* DEF_UnpackAndOp - macro defining a UnpackAndOp routine where Op should not be Insert
14240e23c03SJunchao Zhang 
14340e23c03SJunchao Zhang    Arguments:
144cd620004SJunchao Zhang   +Opname     Name of the Op, such as Add, Mult, LAND, etc.
145b23bfdefSJunchao Zhang   .Type       Type of the data
14640e23c03SJunchao Zhang   .BS         Block size for vectorization
147b23bfdefSJunchao Zhang   .EQ         (bs == BS) ? 1 : 0. EQ is a compile-time const.
148cd620004SJunchao Zhang   .Op         Operator for the op, such as +, *, &&, ||, PetscMax, PetscMin, etc.
149cd620004SJunchao Zhang   .OpApply    Macro defining application of the op. Could be OP_BINARY, OP_FUNCTION, OP_LXOR
15040e23c03SJunchao Zhang  */
151cd620004SJunchao Zhang #define DEF_UnpackAndOp(Type,BS,EQ,Opname,Op,OpApply) \
152cd620004SJunchao Zhang   static PetscErrorCode CPPJoin4(UnpackAnd##Opname,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,const PetscInt *idx,PetscSFPackOpt opt,void *unpacked,const void *packed) \
153b23bfdefSJunchao Zhang   {                                                                                                          \
154cd620004SJunchao Zhang     Type           *u = (Type*)unpacked,*u2;                                                                 \
155cd620004SJunchao Zhang     const Type     *p = (const Type*)packed,*p2;                                                             \
156eb02082bSJunchao Zhang     PetscInt       i,j,k,l,r,step,bs=link->bs;                                                               \
157b23bfdefSJunchao Zhang     const PetscInt *idx2,M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */    \
158b23bfdefSJunchao Zhang     const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */                \
15940e23c03SJunchao Zhang     PetscFunctionBegin;                                                                                      \
160b23bfdefSJunchao Zhang     if (!idx) {                                                                                              \
161cd620004SJunchao Zhang       u += start*bs;                                                                                         \
162cd620004SJunchao Zhang       for (i=0; i<count; i++)                                                                                \
163cd620004SJunchao Zhang         for (j=0; j<M; j++)                                                                                  \
164cd620004SJunchao Zhang           for (k=0; k<BS; k++)                                                                               \
165cd620004SJunchao Zhang             OpApply(Op,u[i*MBS+j*BS+k],p[i*MBS+j*BS+k]);                                                     \
166cd620004SJunchao Zhang     } else if (!opt) { /* No optimizations available */                                                      \
167cd620004SJunchao Zhang       for (i=0; i<count; i++)                                                                                \
168cd620004SJunchao Zhang         for (j=0; j<M; j++)                                                                                  \
169cd620004SJunchao Zhang           for (k=0; k<BS; k++)                                                                               \
170cd620004SJunchao Zhang               OpApply(Op,u[idx[i]*MBS+j*BS+k],p[i*MBS+j*BS+k]);                                              \
171cd620004SJunchao Zhang     } else {                                                                                                 \
172cd620004SJunchao Zhang       for (r=0; r<opt->n; r++) {                                                                             \
173cd620004SJunchao Zhang         p2 = p + opt->offset[r]*MBS;                                                                         \
174cd620004SJunchao Zhang         if (opt->type[r] == PETSCSF_PACKOPT_NONE) {                                                          \
175cd620004SJunchao Zhang           idx2 = idx + opt->offset[r];                                                                       \
176cd620004SJunchao Zhang           for (i=0; i<opt->offset[r+1]-opt->offset[r]; i++)                                                  \
177cd620004SJunchao Zhang             for (j=0; j<M; j++)                                                                              \
178cd620004SJunchao Zhang               for (k=0; k<BS; k++)                                                                           \
179cd620004SJunchao Zhang                 OpApply(Op,u[idx2[i]*MBS+j*BS+k],p2[i*MBS+j*BS+k]);                                          \
180cd620004SJunchao Zhang         } else if (opt->type[r] == PETSCSF_PACKOPT_MULTICOPY) {                                              \
181cd620004SJunchao Zhang           for (i=opt->copy_offset[r]; i<opt->copy_offset[r+1]; i++) { /* i-th piece */                       \
182cd620004SJunchao Zhang             u2 = u + idx[opt->copy_start[i]]*MBS;                                                            \
183cd620004SJunchao Zhang             l  = opt->copy_length[i]*MBS;                                                                    \
184cd620004SJunchao Zhang             for (j=0; j<l; j++) OpApply(Op,u2[j],p2[j]);                                                     \
185cd620004SJunchao Zhang             p2 += l;                                                                                         \
186cd620004SJunchao Zhang           }                                                                                                  \
187cd620004SJunchao Zhang         } else if (opt->type[r] == PETSCSF_PACKOPT_STRIDE) {                                                 \
188cd620004SJunchao Zhang           u2   = u + idx[opt->offset[r]]*MBS;                                                                \
189cd620004SJunchao Zhang           step = opt->stride_step[r];                                                                        \
190cd620004SJunchao Zhang           for (i=0; i<opt->stride_n[r]; i++)                                                                 \
191cd620004SJunchao Zhang             for (j=0; j<M; j++)                                                                              \
192cd620004SJunchao Zhang               for (k=0; k<BS; k++)                                                                           \
193cd620004SJunchao Zhang                 OpApply(Op,u2[i*step*MBS+j*BS+k],p2[i*MBS+j*BS+k]);                                          \
194cd620004SJunchao Zhang         } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Unknown SFPack optimzation type %D",opt->type[r]);   \
195cd620004SJunchao Zhang       }                                                                                                      \
196cd620004SJunchao Zhang     }                                                                                                        \
197cd620004SJunchao Zhang     PetscFunctionReturn(0);                                                                                  \
198cd620004SJunchao Zhang   }
199cd620004SJunchao Zhang 
200cd620004SJunchao Zhang #define DEF_FetchAndOp(Type,BS,EQ,Opname,Op,OpApply) \
201cd620004SJunchao Zhang   static PetscErrorCode CPPJoin4(FetchAnd##Opname,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,const PetscInt *idx,PetscSFPackOpt opt,void *unpacked,void *packed) \
202cd620004SJunchao Zhang   {                                                                                                          \
203cd620004SJunchao Zhang     Type           *u = (Type*)unpacked,*p = (Type*)packed,t;                                                \
204cd620004SJunchao Zhang     PetscInt       i,j,k,bs=link->bs;                                                                        \
205cd620004SJunchao Zhang     const PetscInt M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */          \
206cd620004SJunchao Zhang     const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */                \
207cd620004SJunchao Zhang     PetscFunctionBegin;                                                                                      \
208cd620004SJunchao Zhang     if (!idx) {                                                                                              \
209cd620004SJunchao Zhang       u += start*bs;                                                                                         \
210b23bfdefSJunchao Zhang       for (i=0; i<count; i++)                                                                                \
211b23bfdefSJunchao Zhang         for (j=0; j<M; j++)                                                                                  \
212b23bfdefSJunchao Zhang           for (k=0; k<BS; k++) {                                                                             \
213b23bfdefSJunchao Zhang             t = u[i*MBS+j*BS+k];                                                                             \
214cd620004SJunchao Zhang             OpApply(Op,u[i*MBS+j*BS+k],p[i*MBS+j*BS+k]);                                                     \
215cd620004SJunchao Zhang             p[i*MBS+j*BS+k] = t;                                                                             \
21640e23c03SJunchao Zhang           }                                                                                                  \
217cd620004SJunchao Zhang     } else {                                                                                                 \
218b23bfdefSJunchao Zhang       for (i=0; i<count; i++)                                                                                \
219b23bfdefSJunchao Zhang         for (j=0; j<M; j++)                                                                                  \
220b23bfdefSJunchao Zhang           for (k=0; k<BS; k++) {                                                                             \
221b23bfdefSJunchao Zhang               t = u[idx[i]*MBS+j*BS+k];                                                                      \
222cd620004SJunchao Zhang               OpApply(Op,u[idx[i]*MBS+j*BS+k],p[i*MBS+j*BS+k]);                                              \
223cd620004SJunchao Zhang               p[i*MBS+j*BS+k] = t;                                                                           \
224cd620004SJunchao Zhang           }                                                                                                  \
225cd620004SJunchao Zhang     }                                                                                                        \
226cd620004SJunchao Zhang     PetscFunctionReturn(0);                                                                                  \
227cd620004SJunchao Zhang   }
228cd620004SJunchao Zhang 
229cd620004SJunchao Zhang #define DEF_ScatterAndOp(Type,BS,EQ,Opname,Op,OpApply) \
230cd620004SJunchao Zhang   static PetscErrorCode CPPJoin4(ScatterAnd##Opname,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt startx,const PetscInt *idx,const void *xdata,PetscInt starty,const PetscInt *idy,void *ydata) \
231cd620004SJunchao Zhang   {                                                                                                          \
232cd620004SJunchao Zhang     const Type     *x = (const Type*)xdata;                                                                  \
233cd620004SJunchao Zhang     Type           *y = (Type*)ydata;                                                                        \
234cd620004SJunchao Zhang     PetscInt       i,j,k,bs = link->bs;                                                                      \
235cd620004SJunchao Zhang     const PetscInt M = (EQ) ? 1 : bs/BS;                                                                     \
236cd620004SJunchao Zhang     const PetscInt MBS = M*BS;                                                                               \
237cd620004SJunchao Zhang     PetscFunctionBegin;                                                                                      \
238cd620004SJunchao Zhang     if (!idx) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Petsc should use UnpackAndOp instead of ScatterAndOp");\
239cd620004SJunchao Zhang     if (!idy) {                                                                                              \
240cd620004SJunchao Zhang       y += starty*bs;                                                                                        \
241cd620004SJunchao Zhang       for (i=0; i<count; i++)                                                                                \
242cd620004SJunchao Zhang         for (j=0; j<M; j++)                                                                                  \
243cd620004SJunchao Zhang           for (k=0; k<BS; k++)                                                                               \
244cd620004SJunchao Zhang             OpApply(Op,y[i*MBS+j*BS+k],x[idx[i]*MBS+j*BS+k]);                                                \
245cd620004SJunchao Zhang     } else {                                                                                                 \
246cd620004SJunchao Zhang       for (i=0; i<count; i++)                                                                                \
247cd620004SJunchao Zhang         for (j=0; j<M; j++)                                                                                  \
248cd620004SJunchao Zhang           for (k=0; k<BS; k++)                                                                               \
249cd620004SJunchao Zhang             OpApply(Op,y[idy[i]*MBS+j*BS+k],x[idx[i]*MBS+j*BS+k]);                                           \
250cd620004SJunchao Zhang     }                                                                                                        \
251cd620004SJunchao Zhang     PetscFunctionReturn(0);                                                                                  \
252cd620004SJunchao Zhang   }
253cd620004SJunchao Zhang 
254cd620004SJunchao Zhang #define DEF_FetchAndOpLocal(Type,BS,EQ,Opname,Op,OpApply) \
255cd620004SJunchao Zhang   static PetscErrorCode CPPJoin4(FetchAnd##Opname##Local,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt rootstart,const PetscInt *rootindices,void *rootdata,PetscInt leafstart,const PetscInt *leafindices,const void *leafdata,void *leafupdate) \
256cd620004SJunchao Zhang   {                                                                                                          \
257cd620004SJunchao Zhang     Type           *x = (Type*)rootdata,*y2 = (Type*)leafupdate;                                             \
258cd620004SJunchao Zhang     const Type     *y = (const Type*)leafdata;                                                               \
259cd620004SJunchao Zhang     PetscInt       i,j,k,bs = link->bs;                                                                      \
260cd620004SJunchao Zhang     const PetscInt M = (EQ) ? 1 : bs/BS;                                                                     \
261cd620004SJunchao Zhang     const PetscInt MBS = M*BS;                                                                               \
262cd620004SJunchao Zhang     PetscFunctionBegin;                                                                                      \
263cd620004SJunchao Zhang     if (!rootindices) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Petsc should use FetchAndOp instead of FetchAndOpLocal");\
264cd620004SJunchao Zhang     if (!leafindices) {                                                                                      \
265cd620004SJunchao Zhang       y += leafstart*bs; y2 += leafstart*bs;                                                                 \
266cd620004SJunchao Zhang       for (i=0; i<count; i++)                                                                                \
267cd620004SJunchao Zhang         for (j=0; j<M; j++)                                                                                  \
268cd620004SJunchao Zhang           for (k=0; k<BS; k++) {                                                                             \
269cd620004SJunchao Zhang             y2[i*MBS+j*BS+k] = x[rootindices[i]*MBS+j*BS+k];                                                 \
270cd620004SJunchao Zhang             OpApply(Op,x[rootindices[i]*MBS+j*BS+k],y[i*MBS+j*BS+k]);                                        \
27140e23c03SJunchao Zhang           }                                                                                                  \
272b23bfdefSJunchao Zhang     } else {                                                                                                 \
273cd620004SJunchao Zhang       for (i=0; i<count; i++)                                                                                \
274b23bfdefSJunchao Zhang         for (j=0; j<M; j++)                                                                                  \
275b23bfdefSJunchao Zhang           for (k=0; k<BS; k++) {                                                                             \
276cd620004SJunchao Zhang             y2[leafindices[i]*MBS+j*BS+k] = x[rootindices[i]*MBS+j*BS+k];                                    \
277cd620004SJunchao Zhang             OpApply(Op,x[rootindices[i]*MBS+j*BS+k],y[leafindices[i]*MBS+j*BS+k]);                           \
27840e23c03SJunchao Zhang           }                                                                                                  \
27940e23c03SJunchao Zhang     }                                                                                                        \
28040e23c03SJunchao Zhang     PetscFunctionReturn(0);                                                                                  \
28140e23c03SJunchao Zhang   }
28240e23c03SJunchao Zhang 
28340e23c03SJunchao Zhang 
284b23bfdefSJunchao Zhang /* Pack, Unpack/Fetch ops */
285b23bfdefSJunchao Zhang #define DEF_Pack(Type,BS,EQ)                                                      \
286b23bfdefSJunchao Zhang   DEF_PackFunc(Type,BS,EQ)                                                        \
287cd620004SJunchao Zhang   DEF_UnpackFunc(Type,BS,EQ)                                                      \
288cd620004SJunchao Zhang   DEF_ScatterAndOp(Type,BS,EQ,Insert,=,OP_ASSIGN)                                 \
289cd620004SJunchao Zhang   static void CPPJoin4(PackInit_Pack,Type,BS,EQ)(PetscSFLink link) {              \
290eb02082bSJunchao Zhang     link->h_Pack            = CPPJoin4(Pack,           Type,BS,EQ);               \
291eb02082bSJunchao Zhang     link->h_UnpackAndInsert = CPPJoin4(UnpackAndInsert,Type,BS,EQ);               \
292cd620004SJunchao Zhang     link->h_ScatterAndInsert= CPPJoin4(ScatterAndInsert,Type,BS,EQ);              \
29340e23c03SJunchao Zhang   }
29440e23c03SJunchao Zhang 
295b23bfdefSJunchao Zhang /* Add, Mult ops */
296b23bfdefSJunchao Zhang #define DEF_Add(Type,BS,EQ)                                                       \
297cd620004SJunchao Zhang   DEF_UnpackAndOp    (Type,BS,EQ,Add, +,OP_BINARY)                                \
298cd620004SJunchao Zhang   DEF_UnpackAndOp    (Type,BS,EQ,Mult,*,OP_BINARY)                                \
299cd620004SJunchao Zhang   DEF_FetchAndOp     (Type,BS,EQ,Add, +,OP_BINARY)                                \
300cd620004SJunchao Zhang   DEF_ScatterAndOp   (Type,BS,EQ,Add, +,OP_BINARY)                                \
301cd620004SJunchao Zhang   DEF_ScatterAndOp   (Type,BS,EQ,Mult,*,OP_BINARY)                                \
302cd620004SJunchao Zhang   DEF_FetchAndOpLocal(Type,BS,EQ,Add, +,OP_BINARY)                                \
303cd620004SJunchao Zhang   static void CPPJoin4(PackInit_Add,Type,BS,EQ)(PetscSFLink link) {               \
304eb02082bSJunchao Zhang     link->h_UnpackAndAdd     = CPPJoin4(UnpackAndAdd,    Type,BS,EQ);             \
305eb02082bSJunchao Zhang     link->h_UnpackAndMult    = CPPJoin4(UnpackAndMult,   Type,BS,EQ);             \
306eb02082bSJunchao Zhang     link->h_FetchAndAdd      = CPPJoin4(FetchAndAdd,     Type,BS,EQ);             \
307cd620004SJunchao Zhang     link->h_ScatterAndAdd    = CPPJoin4(ScatterAndAdd,   Type,BS,EQ);             \
308cd620004SJunchao Zhang     link->h_ScatterAndMult   = CPPJoin4(ScatterAndMult,  Type,BS,EQ);             \
309cd620004SJunchao Zhang     link->h_FetchAndAddLocal = CPPJoin4(FetchAndAddLocal,Type,BS,EQ);             \
31040e23c03SJunchao Zhang   }
31140e23c03SJunchao Zhang 
312b23bfdefSJunchao Zhang /* Max, Min ops */
313b23bfdefSJunchao Zhang #define DEF_Cmp(Type,BS,EQ)                                                       \
314cd620004SJunchao Zhang   DEF_UnpackAndOp (Type,BS,EQ,Max,PetscMax,OP_FUNCTION)                           \
315cd620004SJunchao Zhang   DEF_UnpackAndOp (Type,BS,EQ,Min,PetscMin,OP_FUNCTION)                           \
316cd620004SJunchao Zhang   DEF_ScatterAndOp(Type,BS,EQ,Max,PetscMax,OP_FUNCTION)                           \
317cd620004SJunchao Zhang   DEF_ScatterAndOp(Type,BS,EQ,Min,PetscMin,OP_FUNCTION)                           \
318cd620004SJunchao Zhang   static void CPPJoin4(PackInit_Compare,Type,BS,EQ)(PetscSFLink link) {           \
319eb02082bSJunchao Zhang     link->h_UnpackAndMax    = CPPJoin4(UnpackAndMax,   Type,BS,EQ);               \
320eb02082bSJunchao Zhang     link->h_UnpackAndMin    = CPPJoin4(UnpackAndMin,   Type,BS,EQ);               \
321cd620004SJunchao Zhang     link->h_ScatterAndMax   = CPPJoin4(ScatterAndMax,  Type,BS,EQ);               \
322cd620004SJunchao Zhang     link->h_ScatterAndMin   = CPPJoin4(ScatterAndMin,  Type,BS,EQ);               \
323b23bfdefSJunchao Zhang   }
324b23bfdefSJunchao Zhang 
325b23bfdefSJunchao Zhang /* Logical ops.
326cd620004SJunchao Zhang   The operator in OP_LXOR should be empty but is ||. It is not used. Put here to avoid
32740e23c03SJunchao Zhang   the compilation warning "empty macro arguments are undefined in ISO C90"
32840e23c03SJunchao Zhang  */
329b23bfdefSJunchao Zhang #define DEF_Log(Type,BS,EQ)                                                       \
330cd620004SJunchao Zhang   DEF_UnpackAndOp (Type,BS,EQ,LAND,&&,OP_BINARY)                                  \
331cd620004SJunchao Zhang   DEF_UnpackAndOp (Type,BS,EQ,LOR, ||,OP_BINARY)                                  \
332cd620004SJunchao Zhang   DEF_UnpackAndOp (Type,BS,EQ,LXOR,||, OP_LXOR)                                   \
333cd620004SJunchao Zhang   DEF_ScatterAndOp(Type,BS,EQ,LAND,&&,OP_BINARY)                                  \
334cd620004SJunchao Zhang   DEF_ScatterAndOp(Type,BS,EQ,LOR, ||,OP_BINARY)                                  \
335cd620004SJunchao Zhang   DEF_ScatterAndOp(Type,BS,EQ,LXOR,||, OP_LXOR)                                   \
336cd620004SJunchao Zhang   static void CPPJoin4(PackInit_Logical,Type,BS,EQ)(PetscSFLink link) {           \
337eb02082bSJunchao Zhang     link->h_UnpackAndLAND   = CPPJoin4(UnpackAndLAND, Type,BS,EQ);                \
338eb02082bSJunchao Zhang     link->h_UnpackAndLOR    = CPPJoin4(UnpackAndLOR,  Type,BS,EQ);                \
339eb02082bSJunchao Zhang     link->h_UnpackAndLXOR   = CPPJoin4(UnpackAndLXOR, Type,BS,EQ);                \
340cd620004SJunchao Zhang     link->h_ScatterAndLAND  = CPPJoin4(ScatterAndLAND,Type,BS,EQ);                \
341cd620004SJunchao Zhang     link->h_ScatterAndLOR   = CPPJoin4(ScatterAndLOR, Type,BS,EQ);                \
342cd620004SJunchao Zhang     link->h_ScatterAndLXOR  = CPPJoin4(ScatterAndLXOR,Type,BS,EQ);                \
34340e23c03SJunchao Zhang   }
34440e23c03SJunchao Zhang 
345b23bfdefSJunchao Zhang /* Bitwise ops */
346b23bfdefSJunchao Zhang #define DEF_Bit(Type,BS,EQ)                                                       \
347cd620004SJunchao Zhang   DEF_UnpackAndOp (Type,BS,EQ,BAND,&,OP_BINARY)                                   \
348cd620004SJunchao Zhang   DEF_UnpackAndOp (Type,BS,EQ,BOR, |,OP_BINARY)                                   \
349cd620004SJunchao Zhang   DEF_UnpackAndOp (Type,BS,EQ,BXOR,^,OP_BINARY)                                   \
350cd620004SJunchao Zhang   DEF_ScatterAndOp(Type,BS,EQ,BAND,&,OP_BINARY)                                   \
351cd620004SJunchao Zhang   DEF_ScatterAndOp(Type,BS,EQ,BOR, |,OP_BINARY)                                   \
352cd620004SJunchao Zhang   DEF_ScatterAndOp(Type,BS,EQ,BXOR,^,OP_BINARY)                                   \
353cd620004SJunchao Zhang   static void CPPJoin4(PackInit_Bitwise,Type,BS,EQ)(PetscSFLink link) {           \
354eb02082bSJunchao Zhang     link->h_UnpackAndBAND   = CPPJoin4(UnpackAndBAND, Type,BS,EQ);                \
355eb02082bSJunchao Zhang     link->h_UnpackAndBOR    = CPPJoin4(UnpackAndBOR,  Type,BS,EQ);                \
356eb02082bSJunchao Zhang     link->h_UnpackAndBXOR   = CPPJoin4(UnpackAndBXOR, Type,BS,EQ);                \
357cd620004SJunchao Zhang     link->h_ScatterAndBAND  = CPPJoin4(ScatterAndBAND,Type,BS,EQ);                \
358cd620004SJunchao Zhang     link->h_ScatterAndBOR   = CPPJoin4(ScatterAndBOR, Type,BS,EQ);                \
359cd620004SJunchao Zhang     link->h_ScatterAndBXOR  = CPPJoin4(ScatterAndBXOR,Type,BS,EQ);                \
36040e23c03SJunchao Zhang   }
36140e23c03SJunchao Zhang 
362cd620004SJunchao Zhang /* Maxloc, Minloc ops */
363cd620004SJunchao Zhang #define DEF_Xloc(Type,BS,EQ)                                                      \
364cd620004SJunchao Zhang   DEF_UnpackAndOp (Type,BS,EQ,Max,>,OP_XLOC)                                      \
365cd620004SJunchao Zhang   DEF_UnpackAndOp (Type,BS,EQ,Min,<,OP_XLOC)                                      \
366cd620004SJunchao Zhang   DEF_ScatterAndOp(Type,BS,EQ,Max,>,OP_XLOC)                                      \
367cd620004SJunchao Zhang   DEF_ScatterAndOp(Type,BS,EQ,Min,<,OP_XLOC)                                      \
368cd620004SJunchao Zhang   static void CPPJoin4(PackInit_Xloc,Type,BS,EQ)(PetscSFLink link) {              \
369cd620004SJunchao Zhang     link->h_UnpackAndMaxloc  = CPPJoin4(UnpackAndMax, Type,BS,EQ);                \
370cd620004SJunchao Zhang     link->h_UnpackAndMinloc  = CPPJoin4(UnpackAndMin, Type,BS,EQ);                \
371cd620004SJunchao Zhang     link->h_ScatterAndMaxloc = CPPJoin4(ScatterAndMax,Type,BS,EQ);                \
372cd620004SJunchao Zhang     link->h_ScatterAndMinloc = CPPJoin4(ScatterAndMin,Type,BS,EQ);                \
37340e23c03SJunchao Zhang   }
37440e23c03SJunchao Zhang 
375b23bfdefSJunchao Zhang #define DEF_IntegerType(Type,BS,EQ)                                               \
376b23bfdefSJunchao Zhang   DEF_Pack(Type,BS,EQ)                                                            \
377b23bfdefSJunchao Zhang   DEF_Add(Type,BS,EQ)                                                             \
378b23bfdefSJunchao Zhang   DEF_Cmp(Type,BS,EQ)                                                             \
379b23bfdefSJunchao Zhang   DEF_Log(Type,BS,EQ)                                                             \
380b23bfdefSJunchao Zhang   DEF_Bit(Type,BS,EQ)                                                             \
381cd620004SJunchao Zhang   static void CPPJoin4(PackInit_IntegerType,Type,BS,EQ)(PetscSFLink link) {       \
382b23bfdefSJunchao Zhang     CPPJoin4(PackInit_Pack,Type,BS,EQ)(link);                                     \
383b23bfdefSJunchao Zhang     CPPJoin4(PackInit_Add,Type,BS,EQ)(link);                                      \
384b23bfdefSJunchao Zhang     CPPJoin4(PackInit_Compare,Type,BS,EQ)(link);                                  \
385b23bfdefSJunchao Zhang     CPPJoin4(PackInit_Logical,Type,BS,EQ)(link);                                  \
386b23bfdefSJunchao Zhang     CPPJoin4(PackInit_Bitwise,Type,BS,EQ)(link);                                  \
38740e23c03SJunchao Zhang   }
38840e23c03SJunchao Zhang 
389b23bfdefSJunchao Zhang #define DEF_RealType(Type,BS,EQ)                                                  \
390b23bfdefSJunchao Zhang   DEF_Pack(Type,BS,EQ)                                                            \
391b23bfdefSJunchao Zhang   DEF_Add(Type,BS,EQ)                                                             \
392b23bfdefSJunchao Zhang   DEF_Cmp(Type,BS,EQ)                                                             \
393cd620004SJunchao Zhang   static void CPPJoin4(PackInit_RealType,Type,BS,EQ)(PetscSFLink link) {          \
394b23bfdefSJunchao Zhang     CPPJoin4(PackInit_Pack,Type,BS,EQ)(link);                                     \
395b23bfdefSJunchao Zhang     CPPJoin4(PackInit_Add,Type,BS,EQ)(link);                                      \
396b23bfdefSJunchao Zhang     CPPJoin4(PackInit_Compare,Type,BS,EQ)(link);                                  \
397b23bfdefSJunchao Zhang   }
39840e23c03SJunchao Zhang 
39940e23c03SJunchao Zhang #if defined(PETSC_HAVE_COMPLEX)
400b23bfdefSJunchao Zhang #define DEF_ComplexType(Type,BS,EQ)                                               \
401b23bfdefSJunchao Zhang   DEF_Pack(Type,BS,EQ)                                                            \
402b23bfdefSJunchao Zhang   DEF_Add(Type,BS,EQ)                                                             \
403cd620004SJunchao Zhang   static void CPPJoin4(PackInit_ComplexType,Type,BS,EQ)(PetscSFLink link) {       \
404b23bfdefSJunchao Zhang     CPPJoin4(PackInit_Pack,Type,BS,EQ)(link);                                     \
405b23bfdefSJunchao Zhang     CPPJoin4(PackInit_Add,Type,BS,EQ)(link);                                      \
406b23bfdefSJunchao Zhang   }
40740e23c03SJunchao Zhang #endif
408b23bfdefSJunchao Zhang 
409b23bfdefSJunchao Zhang #define DEF_DumbType(Type,BS,EQ)                                                  \
410b23bfdefSJunchao Zhang   DEF_Pack(Type,BS,EQ)                                                            \
411cd620004SJunchao Zhang   static void CPPJoin4(PackInit_DumbType,Type,BS,EQ)(PetscSFLink link) {          \
412b23bfdefSJunchao Zhang     CPPJoin4(PackInit_Pack,Type,BS,EQ)(link);                                     \
413b23bfdefSJunchao Zhang   }
414b23bfdefSJunchao Zhang 
415b23bfdefSJunchao Zhang /* Maxloc, Minloc */
416cd620004SJunchao Zhang #define DEF_PairType(Type,BS,EQ)                                                  \
417cd620004SJunchao Zhang   DEF_Pack(Type,BS,EQ)                                                            \
418cd620004SJunchao Zhang   DEF_Xloc(Type,BS,EQ)                                                            \
419cd620004SJunchao Zhang   static void CPPJoin4(PackInit_PairType,Type,BS,EQ)(PetscSFLink link) {          \
420cd620004SJunchao Zhang     CPPJoin4(PackInit_Pack,Type,BS,EQ)(link);                                     \
421cd620004SJunchao Zhang     CPPJoin4(PackInit_Xloc,Type,BS,EQ)(link);                                     \
422b23bfdefSJunchao Zhang   }
423b23bfdefSJunchao Zhang 
424b23bfdefSJunchao Zhang DEF_IntegerType(PetscInt,1,1) /* unit = 1 MPIU_INT  */
425b23bfdefSJunchao Zhang DEF_IntegerType(PetscInt,2,1) /* unit = 2 MPIU_INTs */
426b23bfdefSJunchao Zhang DEF_IntegerType(PetscInt,4,1) /* unit = 4 MPIU_INTs */
427b23bfdefSJunchao Zhang DEF_IntegerType(PetscInt,8,1) /* unit = 8 MPIU_INTs */
428b23bfdefSJunchao Zhang DEF_IntegerType(PetscInt,1,0) /* unit = 1*n MPIU_INTs, n>1 */
429b23bfdefSJunchao Zhang DEF_IntegerType(PetscInt,2,0) /* unit = 2*n MPIU_INTs, n>1 */
430b23bfdefSJunchao Zhang DEF_IntegerType(PetscInt,4,0) /* unit = 4*n MPIU_INTs, n>1 */
431b23bfdefSJunchao Zhang DEF_IntegerType(PetscInt,8,0) /* unit = 8*n MPIU_INTs, n>1. Routines with bigger BS are tried first. */
432b23bfdefSJunchao Zhang 
433b23bfdefSJunchao Zhang #if defined(PETSC_USE_64BIT_INDICES) /* Do not need (though it is OK) to generate redundant functions if PetscInt is int */
434b23bfdefSJunchao Zhang DEF_IntegerType(int,1,1)
435b23bfdefSJunchao Zhang DEF_IntegerType(int,2,1)
436b23bfdefSJunchao Zhang DEF_IntegerType(int,4,1)
437b23bfdefSJunchao Zhang DEF_IntegerType(int,8,1)
438b23bfdefSJunchao Zhang DEF_IntegerType(int,1,0)
439b23bfdefSJunchao Zhang DEF_IntegerType(int,2,0)
440b23bfdefSJunchao Zhang DEF_IntegerType(int,4,0)
441b23bfdefSJunchao Zhang DEF_IntegerType(int,8,0)
442b23bfdefSJunchao Zhang #endif
443b23bfdefSJunchao Zhang 
444b23bfdefSJunchao Zhang /* The typedefs are used to get a typename without space that CPPJoin can handle */
445b23bfdefSJunchao Zhang typedef signed char SignedChar;
446b23bfdefSJunchao Zhang DEF_IntegerType(SignedChar,1,1)
447b23bfdefSJunchao Zhang DEF_IntegerType(SignedChar,2,1)
448b23bfdefSJunchao Zhang DEF_IntegerType(SignedChar,4,1)
449b23bfdefSJunchao Zhang DEF_IntegerType(SignedChar,8,1)
450b23bfdefSJunchao Zhang DEF_IntegerType(SignedChar,1,0)
451b23bfdefSJunchao Zhang DEF_IntegerType(SignedChar,2,0)
452b23bfdefSJunchao Zhang DEF_IntegerType(SignedChar,4,0)
453b23bfdefSJunchao Zhang DEF_IntegerType(SignedChar,8,0)
454b23bfdefSJunchao Zhang 
455b23bfdefSJunchao Zhang typedef unsigned char UnsignedChar;
456b23bfdefSJunchao Zhang DEF_IntegerType(UnsignedChar,1,1)
457b23bfdefSJunchao Zhang DEF_IntegerType(UnsignedChar,2,1)
458b23bfdefSJunchao Zhang DEF_IntegerType(UnsignedChar,4,1)
459b23bfdefSJunchao Zhang DEF_IntegerType(UnsignedChar,8,1)
460b23bfdefSJunchao Zhang DEF_IntegerType(UnsignedChar,1,0)
461b23bfdefSJunchao Zhang DEF_IntegerType(UnsignedChar,2,0)
462b23bfdefSJunchao Zhang DEF_IntegerType(UnsignedChar,4,0)
463b23bfdefSJunchao Zhang DEF_IntegerType(UnsignedChar,8,0)
464b23bfdefSJunchao Zhang 
465b23bfdefSJunchao Zhang DEF_RealType(PetscReal,1,1)
466b23bfdefSJunchao Zhang DEF_RealType(PetscReal,2,1)
467b23bfdefSJunchao Zhang DEF_RealType(PetscReal,4,1)
468b23bfdefSJunchao Zhang DEF_RealType(PetscReal,8,1)
469b23bfdefSJunchao Zhang DEF_RealType(PetscReal,1,0)
470b23bfdefSJunchao Zhang DEF_RealType(PetscReal,2,0)
471b23bfdefSJunchao Zhang DEF_RealType(PetscReal,4,0)
472b23bfdefSJunchao Zhang DEF_RealType(PetscReal,8,0)
473b23bfdefSJunchao Zhang 
474b23bfdefSJunchao Zhang #if defined(PETSC_HAVE_COMPLEX)
475b23bfdefSJunchao Zhang DEF_ComplexType(PetscComplex,1,1)
476b23bfdefSJunchao Zhang DEF_ComplexType(PetscComplex,2,1)
477b23bfdefSJunchao Zhang DEF_ComplexType(PetscComplex,4,1)
478b23bfdefSJunchao Zhang DEF_ComplexType(PetscComplex,8,1)
479b23bfdefSJunchao Zhang DEF_ComplexType(PetscComplex,1,0)
480b23bfdefSJunchao Zhang DEF_ComplexType(PetscComplex,2,0)
481b23bfdefSJunchao Zhang DEF_ComplexType(PetscComplex,4,0)
482b23bfdefSJunchao Zhang DEF_ComplexType(PetscComplex,8,0)
483b23bfdefSJunchao Zhang #endif
484b23bfdefSJunchao Zhang 
485cd620004SJunchao Zhang #define PairType(Type1,Type2) Type1##_##Type2
486cd620004SJunchao Zhang typedef struct {int u; int i;}           PairType(int,int);
487cd620004SJunchao Zhang typedef struct {PetscInt u; PetscInt i;} PairType(PetscInt,PetscInt);
488cd620004SJunchao Zhang DEF_PairType(PairType(int,int),1,1)
489cd620004SJunchao Zhang DEF_PairType(PairType(PetscInt,PetscInt),1,1)
490b23bfdefSJunchao Zhang 
491b23bfdefSJunchao Zhang /* If we don't know the basic type, we treat it as a stream of chars or ints */
492b23bfdefSJunchao Zhang DEF_DumbType(char,1,1)
493b23bfdefSJunchao Zhang DEF_DumbType(char,2,1)
494b23bfdefSJunchao Zhang DEF_DumbType(char,4,1)
495b23bfdefSJunchao Zhang DEF_DumbType(char,1,0)
496b23bfdefSJunchao Zhang DEF_DumbType(char,2,0)
497b23bfdefSJunchao Zhang DEF_DumbType(char,4,0)
498b23bfdefSJunchao Zhang 
499eb02082bSJunchao Zhang typedef int DumbInt; /* To have a different name than 'int' used above. The name is used to make routine names. */
500b23bfdefSJunchao Zhang DEF_DumbType(DumbInt,1,1)
501b23bfdefSJunchao Zhang DEF_DumbType(DumbInt,2,1)
502b23bfdefSJunchao Zhang DEF_DumbType(DumbInt,4,1)
503b23bfdefSJunchao Zhang DEF_DumbType(DumbInt,8,1)
504b23bfdefSJunchao Zhang DEF_DumbType(DumbInt,1,0)
505b23bfdefSJunchao Zhang DEF_DumbType(DumbInt,2,0)
506b23bfdefSJunchao Zhang DEF_DumbType(DumbInt,4,0)
507b23bfdefSJunchao Zhang DEF_DumbType(DumbInt,8,0)
50840e23c03SJunchao Zhang 
50940e23c03SJunchao Zhang #if !defined(PETSC_HAVE_MPI_TYPE_DUP)
51040e23c03SJunchao Zhang PETSC_STATIC_INLINE int MPI_Type_dup(MPI_Datatype datatype,MPI_Datatype *newtype)
51140e23c03SJunchao Zhang {
51240e23c03SJunchao Zhang   int ierr;
51340e23c03SJunchao Zhang   ierr = MPI_Type_contiguous(1,datatype,newtype); if (ierr) return ierr;
51440e23c03SJunchao Zhang   ierr = MPI_Type_commit(newtype); if (ierr) return ierr;
51540e23c03SJunchao Zhang   return MPI_SUCCESS;
51640e23c03SJunchao Zhang }
51740e23c03SJunchao Zhang #endif
51840e23c03SJunchao Zhang 
519cd620004SJunchao Zhang /*
520cd620004SJunchao Zhang    The routine Creates a communication link for the given operation. It first looks up its link cache. If
521cd620004SJunchao Zhang    there is a free & suitable one, it uses it. Otherwise it creates a new one.
522cd620004SJunchao Zhang 
523cd620004SJunchao Zhang    A link contains buffers and MPI requests for send/recv. It also contains pack/unpack routines to pack/unpack
524cd620004SJunchao Zhang    root/leafdata to/from these buffers. Buffers are allocated at our discretion. When we find root/leafata
525cd620004SJunchao Zhang    can be directly passed to MPI, we won't allocate them. Even we allocate buffers, we only allocate
526cd620004SJunchao Zhang    those that are needed by the given `sfop` and `op`, in other words, we do lazy memory-allocation.
527cd620004SJunchao Zhang 
528cd620004SJunchao Zhang    The routine also allocates buffers on CPU when one does not use gpu-aware MPI but data is on GPU.
529cd620004SJunchao Zhang 
530cd620004SJunchao Zhang    In SFBasic, MPI requests are persistent. They are init'ed until we try to get requests from a link.
531cd620004SJunchao Zhang 
532cd620004SJunchao Zhang    The routine is shared by SFBasic and SFNeighbor based on the fact they all deal with sparse graphs and
533cd620004SJunchao Zhang    need pack/unpack data.
534cd620004SJunchao Zhang */
535cd620004SJunchao Zhang PetscErrorCode PetscSFLinkCreate(PetscSF sf,MPI_Datatype unit,PetscMemType rootmtype,const void *rootdata,PetscMemType leafmtype,const void *leafdata,MPI_Op op,PetscSFOperation sfop,PetscSFLink *mylink)
53640e23c03SJunchao Zhang {
53740e23c03SJunchao Zhang   PetscErrorCode    ierr;
538cd620004SJunchao Zhang   PetscSF_Basic     *bas = (PetscSF_Basic*)sf->data;
539cd620004SJunchao Zhang   PetscInt          i,j,k,nrootreqs,nleafreqs,nreqs;
540cd620004SJunchao Zhang   PetscSFLink       *p,link;
541cd620004SJunchao Zhang   PetscSFDirection  direction;
542cd620004SJunchao Zhang   MPI_Request       *reqs = NULL;
543cd620004SJunchao Zhang   PetscBool         match,rootdirect[2],leafdirect[2];
544cd620004SJunchao Zhang   PetscMemType      rootmtype_mpi,leafmtype_mpi;   /* mtypes seen by MPI */
545cd620004SJunchao Zhang   PetscInt          rootdirect_mpi,leafdirect_mpi; /* root/leafdirect seen by MPI*/
546cd620004SJunchao Zhang 
547cd620004SJunchao Zhang   PetscFunctionBegin;
548cd620004SJunchao Zhang   ierr = PetscSFSetErrorOnUnsupportedOverlap(sf,unit,rootdata,leafdata);CHKERRQ(ierr);
549cd620004SJunchao Zhang 
550cd620004SJunchao Zhang   /* Can we directly use root/leafdirect with the given sf, sfop and op? */
551cd620004SJunchao Zhang   for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) {
552cd620004SJunchao Zhang     if (sfop == PETSCSF_BCAST) {
553cd620004SJunchao Zhang       rootdirect[i] = bas->rootcontig[i]; /* Pack roots */
554cd620004SJunchao Zhang       leafdirect[i] = (sf->leafcontig[i] && op == MPIU_REPLACE) ? PETSC_TRUE : PETSC_FALSE;  /* Unpack leaves */
555cd620004SJunchao Zhang     } else if (sfop == PETSCSF_REDUCE) {
556cd620004SJunchao Zhang       leafdirect[i] = sf->leafcontig[i];  /* Pack leaves */
557cd620004SJunchao Zhang       rootdirect[i] = (bas->rootcontig[i] && op == MPIU_REPLACE) ? PETSC_TRUE : PETSC_FALSE; /* Unpack roots */
558cd620004SJunchao Zhang     } else { /* PETSCSF_FETCH */
559cd620004SJunchao Zhang       rootdirect[i] = PETSC_FALSE; /* FETCH always need a separate rootbuf */
560cd620004SJunchao Zhang       leafdirect[i] = PETSC_FALSE; /* We also force allocating a separate leafbuf so that leafdata and leafupdate can share mpi requests */
561cd620004SJunchao Zhang     }
562cd620004SJunchao Zhang   }
563cd620004SJunchao Zhang 
564cd620004SJunchao Zhang   if (use_gpu_aware_mpi) {
565cd620004SJunchao Zhang     rootmtype_mpi = rootmtype;
566cd620004SJunchao Zhang     leafmtype_mpi = leafmtype;
567cd620004SJunchao Zhang   } else {
568cd620004SJunchao Zhang     rootmtype_mpi = leafmtype_mpi = PETSC_MEMTYPE_HOST;
569cd620004SJunchao Zhang   }
570cd620004SJunchao Zhang   /* Will root/leafdata be directly accessed by MPI?  Without use_gpu_aware_mpi, device data is bufferred on host and then passed to MPI */
571cd620004SJunchao Zhang   rootdirect_mpi = rootdirect[PETSCSF_REMOTE] && (rootmtype_mpi == rootmtype)? 1 : 0;
572cd620004SJunchao Zhang   leafdirect_mpi = leafdirect[PETSCSF_REMOTE] && (leafmtype_mpi == leafmtype)? 1 : 0;
573cd620004SJunchao Zhang 
574cd620004SJunchao Zhang   direction = (sfop == PETSCSF_BCAST)? PETSCSF_ROOT2LEAF : PETSCSF_LEAF2ROOT;
575cd620004SJunchao Zhang   nrootreqs = bas->nrootreqs;
576cd620004SJunchao Zhang   nleafreqs = sf->nleafreqs;
577cd620004SJunchao Zhang 
578cd620004SJunchao Zhang   /* Look for free links in cache */
579cd620004SJunchao Zhang   for (p=&bas->avail; (link=*p); p=&link->next) {
580cd620004SJunchao Zhang     ierr = MPIPetsc_Type_compare(unit,link->unit,&match);CHKERRQ(ierr);
581cd620004SJunchao Zhang     if (match) {
582cd620004SJunchao Zhang       /* If root/leafdata will be directly passed to MPI, test if the data used to initialized the MPI requests matches with current.
583cd620004SJunchao Zhang          If not, free old requests. New requests will be lazily init'ed until one calls PetscSFLinkGetMPIBuffersAndRequests().
584cd620004SJunchao Zhang       */
585cd620004SJunchao Zhang       if (rootdirect_mpi && sf->persistent && link->rootreqsinited[direction][rootmtype][1] && link->rootdatadirect[direction][rootmtype] != rootdata) {
586cd620004SJunchao Zhang         reqs = link->rootreqs[direction][rootmtype][1]; /* Here, rootmtype = rootmtype_mpi */
587cd620004SJunchao Zhang         for (i=0; i<nrootreqs; i++) {if (reqs[i] != MPI_REQUEST_NULL) {ierr = MPI_Request_free(&reqs[i]);CHKERRQ(ierr);}}
588cd620004SJunchao Zhang         link->rootreqsinited[direction][rootmtype][1] = PETSC_FALSE;
589cd620004SJunchao Zhang       }
590cd620004SJunchao Zhang       if (leafdirect_mpi && sf->persistent && link->leafreqsinited[direction][leafmtype][1] && link->leafdatadirect[direction][leafmtype] != leafdata) {
591cd620004SJunchao Zhang         reqs = link->leafreqs[direction][leafmtype][1];
592cd620004SJunchao Zhang         for (i=0; i<nleafreqs; i++) {if (reqs[i] != MPI_REQUEST_NULL) {ierr = MPI_Request_free(&reqs[i]);CHKERRQ(ierr);}}
593cd620004SJunchao Zhang         link->leafreqsinited[direction][leafmtype][1] = PETSC_FALSE;
594cd620004SJunchao Zhang       }
595cd620004SJunchao Zhang       *p = link->next; /* Remove from available list */
596cd620004SJunchao Zhang       goto found;
597cd620004SJunchao Zhang     }
598cd620004SJunchao Zhang   }
599cd620004SJunchao Zhang 
600cd620004SJunchao Zhang   ierr = PetscNew(&link);CHKERRQ(ierr);
601cd620004SJunchao Zhang   ierr = PetscSFLinkSetUp_Host(sf,link,unit);CHKERRQ(ierr);
602cd620004SJunchao Zhang   ierr = PetscCommGetNewTag(PetscObjectComm((PetscObject)sf),&link->tag);CHKERRQ(ierr); /* One tag per link */
603cd620004SJunchao Zhang 
604cd620004SJunchao Zhang   nreqs = (nrootreqs+nleafreqs)*8;
605cd620004SJunchao Zhang   ierr  = PetscMalloc1(nreqs,&link->reqs);CHKERRQ(ierr);
606cd620004SJunchao Zhang   for (i=0; i<nreqs; i++) link->reqs[i] = MPI_REQUEST_NULL; /* Initialized to NULL so that we know which need to be freed in Destroy */
607cd620004SJunchao Zhang 
608cd620004SJunchao Zhang   for (i=0; i<2; i++) { /* Two communication directions */
609cd620004SJunchao Zhang     for (j=0; j<2; j++) { /* Two memory types */
610cd620004SJunchao Zhang       for (k=0; k<2; k++) { /* root/leafdirect 0 or 1 */
611cd620004SJunchao Zhang         link->rootreqs[i][j][k] = link->reqs + nrootreqs*(4*i+2*j+k);
612cd620004SJunchao Zhang         link->leafreqs[i][j][k] = link->reqs + nrootreqs*8 + nleafreqs*(4*i+2*j+k);
613cd620004SJunchao Zhang       }
614cd620004SJunchao Zhang     }
615cd620004SJunchao Zhang   }
616cd620004SJunchao Zhang 
617cd620004SJunchao Zhang found:
618cd620004SJunchao Zhang   if ((rootmtype == PETSC_MEMTYPE_DEVICE || leafmtype == PETSC_MEMTYPE_DEVICE) && !link->deviceinited) {ierr = PetscSFLinkSetUp_Device(sf,link,unit);CHKERRQ(ierr);}
619cd620004SJunchao Zhang 
620cd620004SJunchao Zhang   /* Allocate buffers along root/leafdata */
621cd620004SJunchao Zhang   for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) {
622cd620004SJunchao Zhang     /* For local communication, buffers are only needed when roots and leaves have different mtypes */
623cd620004SJunchao Zhang     if (i == PETSCSF_LOCAL && rootmtype == leafmtype) continue;
624cd620004SJunchao Zhang     if (bas->rootbuflen[i]) {
625cd620004SJunchao Zhang       if (rootdirect[i]) { /* Aha, we disguise rootdata as rootbuf */
626cd620004SJunchao Zhang         link->rootbuf[i][rootmtype] = (char*)rootdata + bas->rootstart[i]*link->unitbytes;
627cd620004SJunchao Zhang       } else { /* Have to have a separate rootbuf */
628cd620004SJunchao Zhang         if (!link->rootbuf_alloc[i][rootmtype]) {
629cd620004SJunchao Zhang           ierr = PetscMallocWithMemType(rootmtype,bas->rootbuflen[i]*link->unitbytes,(void**)&link->rootbuf_alloc[i][rootmtype]);CHKERRQ(ierr);
630cd620004SJunchao Zhang         }
631cd620004SJunchao Zhang         link->rootbuf[i][rootmtype] = link->rootbuf_alloc[i][rootmtype];
632cd620004SJunchao Zhang       }
633cd620004SJunchao Zhang     }
634cd620004SJunchao Zhang 
635cd620004SJunchao Zhang     if (sf->leafbuflen[i]) {
636cd620004SJunchao Zhang       if (leafdirect[i]) {
637cd620004SJunchao Zhang         link->leafbuf[i][leafmtype] = (char*)leafdata + sf->leafstart[i]*link->unitbytes;
638cd620004SJunchao Zhang       } else {
639cd620004SJunchao Zhang         if (!link->leafbuf_alloc[i][leafmtype]) {
640cd620004SJunchao Zhang           ierr = PetscMallocWithMemType(leafmtype,sf->leafbuflen[i]*link->unitbytes,(void**)&link->leafbuf_alloc[i][leafmtype]);CHKERRQ(ierr);
641cd620004SJunchao Zhang         }
642cd620004SJunchao Zhang         link->leafbuf[i][leafmtype] = link->leafbuf_alloc[i][leafmtype];
643cd620004SJunchao Zhang       }
644cd620004SJunchao Zhang     }
645cd620004SJunchao Zhang   }
646cd620004SJunchao Zhang 
647cd620004SJunchao Zhang   /* Allocate buffers on host for buffering data on device in cast not use_gpu_aware_mpi */
648cd620004SJunchao Zhang   if (rootmtype == PETSC_MEMTYPE_DEVICE && rootmtype_mpi == PETSC_MEMTYPE_HOST) {
649cd620004SJunchao Zhang     if(!link->rootbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]) {
650cd620004SJunchao Zhang       ierr = PetscMalloc(bas->rootbuflen[PETSCSF_REMOTE]*link->unitbytes,&link->rootbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]);CHKERRQ(ierr);
651cd620004SJunchao Zhang     }
652cd620004SJunchao Zhang     link->rootbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST] = link->rootbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST];
653cd620004SJunchao Zhang   }
654cd620004SJunchao Zhang   if (leafmtype == PETSC_MEMTYPE_DEVICE && leafmtype_mpi == PETSC_MEMTYPE_HOST) {
655cd620004SJunchao Zhang     if (!link->leafbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]) {
656cd620004SJunchao Zhang       ierr = PetscMalloc(sf->leafbuflen[PETSCSF_REMOTE]*link->unitbytes,&link->leafbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]);CHKERRQ(ierr);
657cd620004SJunchao Zhang     }
658cd620004SJunchao Zhang     link->leafbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST] = link->leafbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST];
659cd620004SJunchao Zhang   }
660cd620004SJunchao Zhang 
661cd620004SJunchao Zhang   /* Set `current` state of the link. They may change between different SF invocations with the same link */
662cd620004SJunchao Zhang   if (sf->persistent) { /* If data is directly passed to MPI and inits MPI requests, record the data for comparison on future invocations */
663cd620004SJunchao Zhang     if (rootdirect_mpi) link->rootdatadirect[direction][rootmtype] = rootdata;
664cd620004SJunchao Zhang     if (leafdirect_mpi) link->leafdatadirect[direction][leafmtype] = leafdata;
665cd620004SJunchao Zhang   }
666cd620004SJunchao Zhang 
667cd620004SJunchao Zhang   link->rootdata = rootdata; /* root/leafdata are keys to look up links in PetscSFXxxEnd */
668cd620004SJunchao Zhang   link->leafdata = leafdata;
669cd620004SJunchao Zhang   for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) {
670cd620004SJunchao Zhang     link->rootdirect[i] = rootdirect[i];
671cd620004SJunchao Zhang     link->leafdirect[i] = leafdirect[i];
672cd620004SJunchao Zhang   }
673cd620004SJunchao Zhang   link->rootdirect_mpi  = rootdirect_mpi;
674cd620004SJunchao Zhang   link->leafdirect_mpi  = leafdirect_mpi;
675cd620004SJunchao Zhang   link->rootmtype       = rootmtype;
676cd620004SJunchao Zhang   link->leafmtype       = leafmtype;
677cd620004SJunchao Zhang   link->rootmtype_mpi   = rootmtype_mpi;
678cd620004SJunchao Zhang   link->leafmtype_mpi   = leafmtype_mpi;
679cd620004SJunchao Zhang 
680cd620004SJunchao Zhang   link->next            = bas->inuse;
681cd620004SJunchao Zhang   bas->inuse            = link;
682cd620004SJunchao Zhang   *mylink               = link;
683cd620004SJunchao Zhang   PetscFunctionReturn(0);
684cd620004SJunchao Zhang }
685cd620004SJunchao Zhang 
686cd620004SJunchao Zhang /* Return root/leaf buffers and MPI requests attached to the link for MPI communication in the given direction.
687cd620004SJunchao Zhang    If the sf uses persistent requests and the requests have not been initialized, then initialize them.
688cd620004SJunchao Zhang */
689cd620004SJunchao Zhang PetscErrorCode PetscSFLinkGetMPIBuffersAndRequests(PetscSF sf,PetscSFLink link,PetscSFDirection direction,void **rootbuf, void **leafbuf,MPI_Request **rootreqs,MPI_Request **leafreqs)
690cd620004SJunchao Zhang {
691cd620004SJunchao Zhang   PetscErrorCode       ierr;
692cd620004SJunchao Zhang   PetscSF_Basic        *bas = (PetscSF_Basic*)sf->data;
693cd620004SJunchao Zhang   PetscInt             i,j,nrootranks,ndrootranks,nleafranks,ndleafranks;
694cd620004SJunchao Zhang   const PetscInt       *rootoffset,*leafoffset;
695cd620004SJunchao Zhang   PetscMPIInt          n;
696cd620004SJunchao Zhang   MPI_Aint             disp;
697cd620004SJunchao Zhang   MPI_Comm             comm = PetscObjectComm((PetscObject)sf);
698cd620004SJunchao Zhang   MPI_Datatype         unit = link->unit;
699cd620004SJunchao Zhang   const PetscMemType   rootmtype_mpi = link->rootmtype_mpi,leafmtype_mpi = link->leafmtype_mpi; /* Used to select buffers passed to MPI */
700cd620004SJunchao Zhang   const PetscInt       rootdirect_mpi = link->rootdirect_mpi,leafdirect_mpi = link->leafdirect_mpi;
701cd620004SJunchao Zhang 
702cd620004SJunchao Zhang   PetscFunctionBegin;
703cd620004SJunchao Zhang   /* Init persistent MPI requests if not yet. Currently only SFBasic uses persistent MPI */
704cd620004SJunchao Zhang   if (sf->persistent) {
705cd620004SJunchao Zhang     if (rootreqs && bas->rootbuflen[PETSCSF_REMOTE] && !link->rootreqsinited[direction][rootmtype_mpi][rootdirect_mpi]) {
706cd620004SJunchao Zhang       ierr = PetscSFGetRootInfo_Basic(sf,&nrootranks,&ndrootranks,NULL,&rootoffset,NULL);CHKERRQ(ierr);
707cd620004SJunchao Zhang       if (direction == PETSCSF_LEAF2ROOT) {
708cd620004SJunchao Zhang         for (i=ndrootranks,j=0; i<nrootranks; i++,j++) {
709cd620004SJunchao Zhang           disp = (rootoffset[i] - rootoffset[ndrootranks])*link->unitbytes;
710cd620004SJunchao Zhang           ierr = PetscMPIIntCast(rootoffset[i+1]-rootoffset[i],&n);CHKERRQ(ierr);
711cd620004SJunchao Zhang           ierr = MPI_Recv_init(link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi]+disp,n,unit,bas->iranks[i],link->tag,comm,link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi]+j);CHKERRQ(ierr);
712cd620004SJunchao Zhang         }
713cd620004SJunchao Zhang       } else { /* PETSCSF_ROOT2LEAF */
714cd620004SJunchao Zhang         for (i=ndrootranks,j=0; i<nrootranks; i++,j++) {
715cd620004SJunchao Zhang           disp = (rootoffset[i] - rootoffset[ndrootranks])*link->unitbytes;
716cd620004SJunchao Zhang           ierr = PetscMPIIntCast(rootoffset[i+1]-rootoffset[i],&n);CHKERRQ(ierr);
717cd620004SJunchao Zhang           ierr = MPI_Send_init(link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi]+disp,n,unit,bas->iranks[i],link->tag,comm,link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi]+j);CHKERRQ(ierr);
718cd620004SJunchao Zhang         }
719cd620004SJunchao Zhang       }
720cd620004SJunchao Zhang       link->rootreqsinited[direction][rootmtype_mpi][rootdirect_mpi] = PETSC_TRUE;
721cd620004SJunchao Zhang     }
722cd620004SJunchao Zhang 
723cd620004SJunchao Zhang     if (leafreqs && sf->leafbuflen[PETSCSF_REMOTE] && !link->leafreqsinited[direction][leafmtype_mpi][leafdirect_mpi]) {
724cd620004SJunchao Zhang       ierr = PetscSFGetLeafInfo_Basic(sf,&nleafranks,&ndleafranks,NULL,&leafoffset,NULL,NULL);CHKERRQ(ierr);
725cd620004SJunchao Zhang       if (direction == PETSCSF_LEAF2ROOT) {
726cd620004SJunchao Zhang         for (i=ndleafranks,j=0; i<nleafranks; i++,j++) {
727cd620004SJunchao Zhang           disp = (leafoffset[i] - leafoffset[ndleafranks])*link->unitbytes;
728cd620004SJunchao Zhang           ierr = PetscMPIIntCast(leafoffset[i+1]-leafoffset[i],&n);CHKERRQ(ierr);
729cd620004SJunchao Zhang           ierr = MPI_Send_init(link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi]+disp,n,unit,sf->ranks[i],link->tag,comm,link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi]+j);CHKERRQ(ierr);
730cd620004SJunchao Zhang         }
731cd620004SJunchao Zhang       } else { /* PETSCSF_ROOT2LEAF */
732cd620004SJunchao Zhang         for (i=ndleafranks,j=0; i<nleafranks; i++,j++) {
733cd620004SJunchao Zhang           disp = (leafoffset[i] - leafoffset[ndleafranks])*link->unitbytes;
734cd620004SJunchao Zhang           ierr = PetscMPIIntCast(leafoffset[i+1]-leafoffset[i],&n);CHKERRQ(ierr);
735cd620004SJunchao Zhang           ierr = MPI_Recv_init(link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi]+disp,n,unit,sf->ranks[i],link->tag,comm,link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi]+j);CHKERRQ(ierr);
736cd620004SJunchao Zhang         }
737cd620004SJunchao Zhang       }
738cd620004SJunchao Zhang       link->leafreqsinited[direction][leafmtype_mpi][leafdirect_mpi] = PETSC_TRUE;
739cd620004SJunchao Zhang     }
740cd620004SJunchao Zhang   }
741cd620004SJunchao Zhang   if (rootbuf)  *rootbuf  = link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi];
742cd620004SJunchao Zhang   if (leafbuf)  *leafbuf  = link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi];
743cd620004SJunchao Zhang   if (rootreqs) *rootreqs = link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi];
744cd620004SJunchao Zhang   if (leafreqs) *leafreqs = link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi];
745cd620004SJunchao Zhang   PetscFunctionReturn(0);
746cd620004SJunchao Zhang }
747cd620004SJunchao Zhang 
748cd620004SJunchao Zhang 
749cd620004SJunchao Zhang PetscErrorCode PetscSFLinkGetInUse(PetscSF sf,MPI_Datatype unit,const void *rootdata,const void *leafdata,PetscCopyMode cmode,PetscSFLink *mylink)
750cd620004SJunchao Zhang {
751cd620004SJunchao Zhang   PetscErrorCode    ierr;
752cd620004SJunchao Zhang   PetscSFLink       link,*p;
75340e23c03SJunchao Zhang   PetscSF_Basic     *bas=(PetscSF_Basic*)sf->data;
75440e23c03SJunchao Zhang 
75540e23c03SJunchao Zhang   PetscFunctionBegin;
75640e23c03SJunchao Zhang   /* Look for types in cache */
75740e23c03SJunchao Zhang   for (p=&bas->inuse; (link=*p); p=&link->next) {
75840e23c03SJunchao Zhang     PetscBool match;
75940e23c03SJunchao Zhang     ierr = MPIPetsc_Type_compare(unit,link->unit,&match);CHKERRQ(ierr);
760637e6665SJunchao Zhang     if (match && (rootdata == link->rootdata) && (leafdata == link->leafdata)) {
76140e23c03SJunchao Zhang       switch (cmode) {
76240e23c03SJunchao Zhang       case PETSC_OWN_POINTER: *p = link->next; break; /* Remove from inuse list */
76340e23c03SJunchao Zhang       case PETSC_USE_POINTER: break;
76440e23c03SJunchao Zhang       default: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"invalid cmode");
76540e23c03SJunchao Zhang       }
76640e23c03SJunchao Zhang       *mylink = link;
76740e23c03SJunchao Zhang       PetscFunctionReturn(0);
76840e23c03SJunchao Zhang     }
76940e23c03SJunchao Zhang   }
77040e23c03SJunchao Zhang   SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Could not find pack");
77140e23c03SJunchao Zhang   PetscFunctionReturn(0);
77240e23c03SJunchao Zhang }
77340e23c03SJunchao Zhang 
774cd620004SJunchao Zhang PetscErrorCode PetscSFLinkReclaim(PetscSF sf,PetscSFLink *link)
77540e23c03SJunchao Zhang {
77640e23c03SJunchao Zhang   PetscSF_Basic     *bas = (PetscSF_Basic*)sf->data;
77740e23c03SJunchao Zhang 
77840e23c03SJunchao Zhang   PetscFunctionBegin;
779637e6665SJunchao Zhang   (*link)->rootdata = NULL;
780637e6665SJunchao Zhang   (*link)->leafdata = NULL;
78140e23c03SJunchao Zhang   (*link)->next     = bas->avail;
78240e23c03SJunchao Zhang   bas->avail        = *link;
78340e23c03SJunchao Zhang   *link             = NULL;
78440e23c03SJunchao Zhang   PetscFunctionReturn(0);
78540e23c03SJunchao Zhang }
78640e23c03SJunchao Zhang 
787cd620004SJunchao Zhang /* Destroy all links chained in 'avail' */
788cd620004SJunchao Zhang PetscErrorCode PetscSFLinkDestroy(PetscSF sf,PetscSFLink *avail)
789eb02082bSJunchao Zhang {
790eb02082bSJunchao Zhang   PetscErrorCode    ierr;
791cd620004SJunchao Zhang   PetscSFLink       link = *avail,next;
792cd620004SJunchao Zhang   PetscSF_Basic     *bas = (PetscSF_Basic*)sf->data;
793cd620004SJunchao Zhang   PetscInt          i,nreqs = (bas->nrootreqs+sf->nleafreqs)*8;
794eb02082bSJunchao Zhang 
795eb02082bSJunchao Zhang   PetscFunctionBegin;
796eb02082bSJunchao Zhang   for (; link; link=next) {
797eb02082bSJunchao Zhang     next = link->next;
798eb02082bSJunchao Zhang     if (!link->isbuiltin) {ierr = MPI_Type_free(&link->unit);CHKERRQ(ierr);}
799cd620004SJunchao Zhang     for (i=0; i<nreqs; i++) { /* Persistent reqs must be freed. */
800eb02082bSJunchao Zhang       if (link->reqs[i] != MPI_REQUEST_NULL) {ierr = MPI_Request_free(&link->reqs[i]);CHKERRQ(ierr);}
801eb02082bSJunchao Zhang     }
802eb02082bSJunchao Zhang     ierr = PetscFree(link->reqs);CHKERRQ(ierr);
803cd620004SJunchao Zhang     for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) {
80451ccb202SJunchao Zhang #if defined(PETSC_HAVE_CUDA)
805cd620004SJunchao Zhang       ierr = PetscFreeWithMemType(PETSC_MEMTYPE_DEVICE,link->rootbuf_alloc[i][PETSC_MEMTYPE_DEVICE]);CHKERRQ(ierr);
806cd620004SJunchao Zhang       ierr = PetscFreeWithMemType(PETSC_MEMTYPE_DEVICE,link->leafbuf_alloc[i][PETSC_MEMTYPE_DEVICE]);CHKERRQ(ierr);
807eb02082bSJunchao Zhang       if (link->stream) {cudaError_t err =  cudaStreamDestroy(link->stream);CHKERRCUDA(err); link->stream = NULL;}
808eb02082bSJunchao Zhang #endif
809cd620004SJunchao Zhang       ierr = PetscFree(link->rootbuf_alloc[i][PETSC_MEMTYPE_HOST]);CHKERRQ(ierr);
810cd620004SJunchao Zhang       ierr = PetscFree(link->leafbuf_alloc[i][PETSC_MEMTYPE_HOST]);CHKERRQ(ierr);
811cd620004SJunchao Zhang     }
812eb02082bSJunchao Zhang     ierr = PetscFree(link);CHKERRQ(ierr);
813eb02082bSJunchao Zhang   }
814eb02082bSJunchao Zhang   *avail = NULL;
815eb02082bSJunchao Zhang   PetscFunctionReturn(0);
816eb02082bSJunchao Zhang }
817eb02082bSJunchao Zhang 
818cd620004SJunchao Zhang #if defined(PETSC_USE_DEBUG)
8199d1c8addSJunchao Zhang /* Error out on unsupported overlapped communications */
820cd620004SJunchao Zhang PetscErrorCode PetscSFSetErrorOnUnsupportedOverlap(PetscSF sf,MPI_Datatype unit,const void *rootdata,const void *leafdata)
8219d1c8addSJunchao Zhang {
8229d1c8addSJunchao Zhang   PetscErrorCode    ierr;
823cd620004SJunchao Zhang   PetscSFLink       link,*p;
8249d1c8addSJunchao Zhang   PetscSF_Basic     *bas = (PetscSF_Basic*)sf->data;
8259d1c8addSJunchao Zhang   PetscBool         match;
8269d1c8addSJunchao Zhang 
8279d1c8addSJunchao Zhang   PetscFunctionBegin;
82818fb5014SJunchao Zhang   /* Look up links in use and error out if there is a match. When both rootdata and leafdata are NULL, ignore
82918fb5014SJunchao Zhang      the potential overlapping since this process does not participate in communication. Overlapping is harmless.
83018fb5014SJunchao Zhang   */
831637e6665SJunchao Zhang   if (rootdata || leafdata) {
8329d1c8addSJunchao Zhang     for (p=&bas->inuse; (link=*p); p=&link->next) {
8339d1c8addSJunchao Zhang       ierr = MPIPetsc_Type_compare(unit,link->unit,&match);CHKERRQ(ierr);
834cd620004SJunchao Zhang       if (match && (rootdata == link->rootdata) && (leafdata == link->leafdata)) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_SUP,"Overlapped PetscSF with the same rootdata(%p), leafdata(%p) and data type. Undo the overlapping to avoid the error.",rootdata,leafdata);
8359d1c8addSJunchao Zhang     }
83618fb5014SJunchao Zhang   }
8379d1c8addSJunchao Zhang   PetscFunctionReturn(0);
8389d1c8addSJunchao Zhang }
839cd620004SJunchao Zhang #endif
8409d1c8addSJunchao Zhang 
841cd620004SJunchao Zhang PetscErrorCode PetscSFLinkSetUp_Host(PetscSF sf,PetscSFLink link,MPI_Datatype unit)
84240e23c03SJunchao Zhang {
84340e23c03SJunchao Zhang   PetscErrorCode ierr;
844b23bfdefSJunchao Zhang   PetscInt       nSignedChar=0,nUnsignedChar=0,nInt=0,nPetscInt=0,nPetscReal=0;
845b23bfdefSJunchao Zhang   PetscBool      is2Int,is2PetscInt;
84640e23c03SJunchao Zhang   PetscMPIInt    ni,na,nd,combiner;
84740e23c03SJunchao Zhang #if defined(PETSC_HAVE_COMPLEX)
848b23bfdefSJunchao Zhang   PetscInt       nPetscComplex=0;
84940e23c03SJunchao Zhang #endif
85040e23c03SJunchao Zhang 
85140e23c03SJunchao Zhang   PetscFunctionBegin;
852b23bfdefSJunchao Zhang   ierr = MPIPetsc_Type_compare_contig(unit,MPI_SIGNED_CHAR,  &nSignedChar);CHKERRQ(ierr);
853b23bfdefSJunchao Zhang   ierr = MPIPetsc_Type_compare_contig(unit,MPI_UNSIGNED_CHAR,&nUnsignedChar);CHKERRQ(ierr);
854b23bfdefSJunchao Zhang   /* MPI_CHAR is treated below as a dumb type that does not support reduction according to MPI standard */
855b23bfdefSJunchao Zhang   ierr = MPIPetsc_Type_compare_contig(unit,MPI_INT,  &nInt);CHKERRQ(ierr);
856b23bfdefSJunchao Zhang   ierr = MPIPetsc_Type_compare_contig(unit,MPIU_INT, &nPetscInt);CHKERRQ(ierr);
857b23bfdefSJunchao Zhang   ierr = MPIPetsc_Type_compare_contig(unit,MPIU_REAL,&nPetscReal);CHKERRQ(ierr);
85840e23c03SJunchao Zhang #if defined(PETSC_HAVE_COMPLEX)
859b23bfdefSJunchao Zhang   ierr = MPIPetsc_Type_compare_contig(unit,MPIU_COMPLEX,&nPetscComplex);CHKERRQ(ierr);
86040e23c03SJunchao Zhang #endif
86140e23c03SJunchao Zhang   ierr = MPIPetsc_Type_compare(unit,MPI_2INT,&is2Int);CHKERRQ(ierr);
86240e23c03SJunchao Zhang   ierr = MPIPetsc_Type_compare(unit,MPIU_2INT,&is2PetscInt);CHKERRQ(ierr);
863b23bfdefSJunchao Zhang   /* TODO: shaell we also handle Fortran MPI_2REAL? */
86440e23c03SJunchao Zhang   ierr = MPI_Type_get_envelope(unit,&ni,&na,&nd,&combiner);CHKERRQ(ierr);
8655ad15460SJunchao Zhang   link->isbuiltin = (combiner == MPI_COMBINER_NAMED) ? PETSC_TRUE : PETSC_FALSE; /* unit is MPI builtin */
866b23bfdefSJunchao Zhang   link->bs = 1; /* default */
86740e23c03SJunchao Zhang 
868eb02082bSJunchao Zhang   if (is2Int) {
869cd620004SJunchao Zhang     PackInit_PairType_int_int_1_1(link);
870eb02082bSJunchao Zhang     link->bs        = 1;
871eb02082bSJunchao Zhang     link->unitbytes = 2*sizeof(int);
8725ad15460SJunchao Zhang     link->isbuiltin = PETSC_TRUE; /* unit is PETSc builtin */
873eb02082bSJunchao Zhang     link->basicunit = MPI_2INT;
8745ad15460SJunchao Zhang     link->unit      = MPI_2INT;
875eb02082bSJunchao Zhang   } else if (is2PetscInt) { /* TODO: when is2PetscInt and nPetscInt=2, we don't know which path to take. The two paths support different ops. */
876cd620004SJunchao Zhang     PackInit_PairType_PetscInt_PetscInt_1_1(link);
877eb02082bSJunchao Zhang     link->bs        = 1;
878eb02082bSJunchao Zhang     link->unitbytes = 2*sizeof(PetscInt);
879eb02082bSJunchao Zhang     link->basicunit = MPIU_2INT;
8805ad15460SJunchao Zhang     link->isbuiltin = PETSC_TRUE; /* unit is PETSc builtin */
8815ad15460SJunchao Zhang     link->unit      = MPIU_2INT;
882eb02082bSJunchao Zhang   } else if (nPetscReal) {
883b23bfdefSJunchao Zhang     if      (nPetscReal == 8) PackInit_RealType_PetscReal_8_1(link); else if (nPetscReal%8 == 0) PackInit_RealType_PetscReal_8_0(link);
884b23bfdefSJunchao Zhang     else if (nPetscReal == 4) PackInit_RealType_PetscReal_4_1(link); else if (nPetscReal%4 == 0) PackInit_RealType_PetscReal_4_0(link);
885b23bfdefSJunchao Zhang     else if (nPetscReal == 2) PackInit_RealType_PetscReal_2_1(link); else if (nPetscReal%2 == 0) PackInit_RealType_PetscReal_2_0(link);
886b23bfdefSJunchao Zhang     else if (nPetscReal == 1) PackInit_RealType_PetscReal_1_1(link); else if (nPetscReal%1 == 0) PackInit_RealType_PetscReal_1_0(link);
887b23bfdefSJunchao Zhang     link->bs        = nPetscReal;
888eb02082bSJunchao Zhang     link->unitbytes = nPetscReal*sizeof(PetscReal);
88940e23c03SJunchao Zhang     link->basicunit = MPIU_REAL;
8905ad15460SJunchao Zhang     if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPIU_REAL;}
891b23bfdefSJunchao Zhang   } else if (nPetscInt) {
892b23bfdefSJunchao Zhang     if      (nPetscInt == 8) PackInit_IntegerType_PetscInt_8_1(link); else if (nPetscInt%8 == 0) PackInit_IntegerType_PetscInt_8_0(link);
893b23bfdefSJunchao Zhang     else if (nPetscInt == 4) PackInit_IntegerType_PetscInt_4_1(link); else if (nPetscInt%4 == 0) PackInit_IntegerType_PetscInt_4_0(link);
894b23bfdefSJunchao Zhang     else if (nPetscInt == 2) PackInit_IntegerType_PetscInt_2_1(link); else if (nPetscInt%2 == 0) PackInit_IntegerType_PetscInt_2_0(link);
895b23bfdefSJunchao Zhang     else if (nPetscInt == 1) PackInit_IntegerType_PetscInt_1_1(link); else if (nPetscInt%1 == 0) PackInit_IntegerType_PetscInt_1_0(link);
896b23bfdefSJunchao Zhang     link->bs        = nPetscInt;
897eb02082bSJunchao Zhang     link->unitbytes = nPetscInt*sizeof(PetscInt);
898b23bfdefSJunchao Zhang     link->basicunit = MPIU_INT;
8995ad15460SJunchao Zhang     if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPIU_INT;}
900b23bfdefSJunchao Zhang #if defined(PETSC_USE_64BIT_INDICES)
901b23bfdefSJunchao Zhang   } else if (nInt) {
902b23bfdefSJunchao Zhang     if      (nInt == 8) PackInit_IntegerType_int_8_1(link); else if (nInt%8 == 0) PackInit_IntegerType_int_8_0(link);
903b23bfdefSJunchao Zhang     else if (nInt == 4) PackInit_IntegerType_int_4_1(link); else if (nInt%4 == 0) PackInit_IntegerType_int_4_0(link);
904b23bfdefSJunchao Zhang     else if (nInt == 2) PackInit_IntegerType_int_2_1(link); else if (nInt%2 == 0) PackInit_IntegerType_int_2_0(link);
905b23bfdefSJunchao Zhang     else if (nInt == 1) PackInit_IntegerType_int_1_1(link); else if (nInt%1 == 0) PackInit_IntegerType_int_1_0(link);
906b23bfdefSJunchao Zhang     link->bs        = nInt;
907eb02082bSJunchao Zhang     link->unitbytes = nInt*sizeof(int);
908b23bfdefSJunchao Zhang     link->basicunit = MPI_INT;
9095ad15460SJunchao Zhang     if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPI_INT;}
910b23bfdefSJunchao Zhang #endif
911b23bfdefSJunchao Zhang   } else if (nSignedChar) {
912b23bfdefSJunchao Zhang     if      (nSignedChar == 8) PackInit_IntegerType_SignedChar_8_1(link); else if (nSignedChar%8 == 0) PackInit_IntegerType_SignedChar_8_0(link);
913b23bfdefSJunchao Zhang     else if (nSignedChar == 4) PackInit_IntegerType_SignedChar_4_1(link); else if (nSignedChar%4 == 0) PackInit_IntegerType_SignedChar_4_0(link);
914b23bfdefSJunchao Zhang     else if (nSignedChar == 2) PackInit_IntegerType_SignedChar_2_1(link); else if (nSignedChar%2 == 0) PackInit_IntegerType_SignedChar_2_0(link);
915b23bfdefSJunchao Zhang     else if (nSignedChar == 1) PackInit_IntegerType_SignedChar_1_1(link); else if (nSignedChar%1 == 0) PackInit_IntegerType_SignedChar_1_0(link);
916b23bfdefSJunchao Zhang     link->bs        = nSignedChar;
917eb02082bSJunchao Zhang     link->unitbytes = nSignedChar*sizeof(SignedChar);
918b23bfdefSJunchao Zhang     link->basicunit = MPI_SIGNED_CHAR;
9195ad15460SJunchao Zhang     if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPI_SIGNED_CHAR;}
920b23bfdefSJunchao Zhang   }  else if (nUnsignedChar) {
921b23bfdefSJunchao Zhang     if      (nUnsignedChar == 8) PackInit_IntegerType_UnsignedChar_8_1(link); else if (nUnsignedChar%8 == 0) PackInit_IntegerType_UnsignedChar_8_0(link);
922b23bfdefSJunchao Zhang     else if (nUnsignedChar == 4) PackInit_IntegerType_UnsignedChar_4_1(link); else if (nUnsignedChar%4 == 0) PackInit_IntegerType_UnsignedChar_4_0(link);
923b23bfdefSJunchao Zhang     else if (nUnsignedChar == 2) PackInit_IntegerType_UnsignedChar_2_1(link); else if (nUnsignedChar%2 == 0) PackInit_IntegerType_UnsignedChar_2_0(link);
924b23bfdefSJunchao Zhang     else if (nUnsignedChar == 1) PackInit_IntegerType_UnsignedChar_1_1(link); else if (nUnsignedChar%1 == 0) PackInit_IntegerType_UnsignedChar_1_0(link);
925b23bfdefSJunchao Zhang     link->bs        = nUnsignedChar;
926eb02082bSJunchao Zhang     link->unitbytes = nUnsignedChar*sizeof(UnsignedChar);
927b23bfdefSJunchao Zhang     link->basicunit = MPI_UNSIGNED_CHAR;
9285ad15460SJunchao Zhang     if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPI_UNSIGNED_CHAR;}
92940e23c03SJunchao Zhang #if defined(PETSC_HAVE_COMPLEX)
930b23bfdefSJunchao Zhang   } else if (nPetscComplex) {
931b23bfdefSJunchao Zhang     if      (nPetscComplex == 8) PackInit_ComplexType_PetscComplex_8_1(link); else if (nPetscComplex%8 == 0) PackInit_ComplexType_PetscComplex_8_0(link);
932b23bfdefSJunchao Zhang     else if (nPetscComplex == 4) PackInit_ComplexType_PetscComplex_4_1(link); else if (nPetscComplex%4 == 0) PackInit_ComplexType_PetscComplex_4_0(link);
933b23bfdefSJunchao Zhang     else if (nPetscComplex == 2) PackInit_ComplexType_PetscComplex_2_1(link); else if (nPetscComplex%2 == 0) PackInit_ComplexType_PetscComplex_2_0(link);
934b23bfdefSJunchao Zhang     else if (nPetscComplex == 1) PackInit_ComplexType_PetscComplex_1_1(link); else if (nPetscComplex%1 == 0) PackInit_ComplexType_PetscComplex_1_0(link);
935b23bfdefSJunchao Zhang     link->bs        = nPetscComplex;
936eb02082bSJunchao Zhang     link->unitbytes = nPetscComplex*sizeof(PetscComplex);
93740e23c03SJunchao Zhang     link->basicunit = MPIU_COMPLEX;
9385ad15460SJunchao Zhang     if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPIU_COMPLEX;}
93940e23c03SJunchao Zhang #endif
94040e23c03SJunchao Zhang   } else {
941b23bfdefSJunchao Zhang     MPI_Aint lb,nbyte;
942b23bfdefSJunchao Zhang     ierr = MPI_Type_get_extent(unit,&lb,&nbyte);CHKERRQ(ierr);
94340e23c03SJunchao Zhang     if (lb != 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Datatype with nonzero lower bound %ld\n",(long)lb);
944eb02082bSJunchao Zhang     if (nbyte % sizeof(int)) { /* If the type size is not multiple of int */
945eb02082bSJunchao Zhang       if      (nbyte == 4) PackInit_DumbType_char_4_1(link); else if (nbyte%4 == 0) PackInit_DumbType_char_4_0(link);
946eb02082bSJunchao Zhang       else if (nbyte == 2) PackInit_DumbType_char_2_1(link); else if (nbyte%2 == 0) PackInit_DumbType_char_2_0(link);
947eb02082bSJunchao Zhang       else if (nbyte == 1) PackInit_DumbType_char_1_1(link); else if (nbyte%1 == 0) PackInit_DumbType_char_1_0(link);
948eb02082bSJunchao Zhang       link->bs        = nbyte;
949b23bfdefSJunchao Zhang       link->unitbytes = nbyte;
950b23bfdefSJunchao Zhang       link->basicunit = MPI_BYTE;
95140e23c03SJunchao Zhang     } else {
952eb02082bSJunchao Zhang       nInt = nbyte / sizeof(int);
953eb02082bSJunchao Zhang       if      (nInt == 8) PackInit_DumbType_DumbInt_8_1(link); else if (nInt%8 == 0) PackInit_DumbType_DumbInt_8_0(link);
954eb02082bSJunchao Zhang       else if (nInt == 4) PackInit_DumbType_DumbInt_4_1(link); else if (nInt%4 == 0) PackInit_DumbType_DumbInt_4_0(link);
955eb02082bSJunchao Zhang       else if (nInt == 2) PackInit_DumbType_DumbInt_2_1(link); else if (nInt%2 == 0) PackInit_DumbType_DumbInt_2_0(link);
956eb02082bSJunchao Zhang       else if (nInt == 1) PackInit_DumbType_DumbInt_1_1(link); else if (nInt%1 == 0) PackInit_DumbType_DumbInt_1_0(link);
957eb02082bSJunchao Zhang       link->bs        = nInt;
958b23bfdefSJunchao Zhang       link->unitbytes = nbyte;
95940e23c03SJunchao Zhang       link->basicunit = MPI_INT;
96040e23c03SJunchao Zhang     }
9615ad15460SJunchao Zhang     if (link->isbuiltin) link->unit = unit;
96240e23c03SJunchao Zhang   }
963b23bfdefSJunchao Zhang 
9645ad15460SJunchao Zhang   if (!link->isbuiltin) {ierr = MPI_Type_dup(unit,&link->unit);CHKERRQ(ierr);}
96540e23c03SJunchao Zhang   PetscFunctionReturn(0);
96640e23c03SJunchao Zhang }
96740e23c03SJunchao Zhang 
968cd620004SJunchao Zhang PetscErrorCode PetscSFLinkGetUnpackAndOp(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*))
96940e23c03SJunchao Zhang {
97040e23c03SJunchao Zhang   PetscFunctionBegin;
97140e23c03SJunchao Zhang   *UnpackAndOp = NULL;
972eb02082bSJunchao Zhang   if (mtype == PETSC_MEMTYPE_HOST) {
973eb02082bSJunchao Zhang     if      (op == MPIU_REPLACE)              *UnpackAndOp = link->h_UnpackAndInsert;
974eb02082bSJunchao Zhang     else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->h_UnpackAndAdd;
975eb02082bSJunchao Zhang     else if (op == MPI_PROD)                  *UnpackAndOp = link->h_UnpackAndMult;
976eb02082bSJunchao Zhang     else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->h_UnpackAndMax;
977eb02082bSJunchao Zhang     else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->h_UnpackAndMin;
978eb02082bSJunchao Zhang     else if (op == MPI_LAND)                  *UnpackAndOp = link->h_UnpackAndLAND;
979eb02082bSJunchao Zhang     else if (op == MPI_BAND)                  *UnpackAndOp = link->h_UnpackAndBAND;
980eb02082bSJunchao Zhang     else if (op == MPI_LOR)                   *UnpackAndOp = link->h_UnpackAndLOR;
981eb02082bSJunchao Zhang     else if (op == MPI_BOR)                   *UnpackAndOp = link->h_UnpackAndBOR;
982eb02082bSJunchao Zhang     else if (op == MPI_LXOR)                  *UnpackAndOp = link->h_UnpackAndLXOR;
983eb02082bSJunchao Zhang     else if (op == MPI_BXOR)                  *UnpackAndOp = link->h_UnpackAndBXOR;
984eb02082bSJunchao Zhang     else if (op == MPI_MAXLOC)                *UnpackAndOp = link->h_UnpackAndMaxloc;
985eb02082bSJunchao Zhang     else if (op == MPI_MINLOC)                *UnpackAndOp = link->h_UnpackAndMinloc;
986eb02082bSJunchao Zhang   }
987eb02082bSJunchao Zhang #if defined(PETSC_HAVE_CUDA)
988eb02082bSJunchao Zhang   else if (mtype == PETSC_MEMTYPE_DEVICE && !atomic) {
989eb02082bSJunchao Zhang     if      (op == MPIU_REPLACE)              *UnpackAndOp = link->d_UnpackAndInsert;
990eb02082bSJunchao Zhang     else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->d_UnpackAndAdd;
991eb02082bSJunchao Zhang     else if (op == MPI_PROD)                  *UnpackAndOp = link->d_UnpackAndMult;
992eb02082bSJunchao Zhang     else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->d_UnpackAndMax;
993eb02082bSJunchao Zhang     else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->d_UnpackAndMin;
994eb02082bSJunchao Zhang     else if (op == MPI_LAND)                  *UnpackAndOp = link->d_UnpackAndLAND;
995eb02082bSJunchao Zhang     else if (op == MPI_BAND)                  *UnpackAndOp = link->d_UnpackAndBAND;
996eb02082bSJunchao Zhang     else if (op == MPI_LOR)                   *UnpackAndOp = link->d_UnpackAndLOR;
997eb02082bSJunchao Zhang     else if (op == MPI_BOR)                   *UnpackAndOp = link->d_UnpackAndBOR;
998eb02082bSJunchao Zhang     else if (op == MPI_LXOR)                  *UnpackAndOp = link->d_UnpackAndLXOR;
999eb02082bSJunchao Zhang     else if (op == MPI_BXOR)                  *UnpackAndOp = link->d_UnpackAndBXOR;
1000eb02082bSJunchao Zhang     else if (op == MPI_MAXLOC)                *UnpackAndOp = link->d_UnpackAndMaxloc;
1001eb02082bSJunchao Zhang     else if (op == MPI_MINLOC)                *UnpackAndOp = link->d_UnpackAndMinloc;
1002eb02082bSJunchao Zhang   } else if (mtype == PETSC_MEMTYPE_DEVICE && atomic) {
1003eb02082bSJunchao Zhang     if      (op == MPIU_REPLACE)              *UnpackAndOp = link->da_UnpackAndInsert;
1004eb02082bSJunchao Zhang     else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->da_UnpackAndAdd;
1005eb02082bSJunchao Zhang     else if (op == MPI_PROD)                  *UnpackAndOp = link->da_UnpackAndMult;
1006eb02082bSJunchao Zhang     else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->da_UnpackAndMax;
1007eb02082bSJunchao Zhang     else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->da_UnpackAndMin;
1008eb02082bSJunchao Zhang     else if (op == MPI_LAND)                  *UnpackAndOp = link->da_UnpackAndLAND;
1009eb02082bSJunchao Zhang     else if (op == MPI_BAND)                  *UnpackAndOp = link->da_UnpackAndBAND;
1010eb02082bSJunchao Zhang     else if (op == MPI_LOR)                   *UnpackAndOp = link->da_UnpackAndLOR;
1011eb02082bSJunchao Zhang     else if (op == MPI_BOR)                   *UnpackAndOp = link->da_UnpackAndBOR;
1012eb02082bSJunchao Zhang     else if (op == MPI_LXOR)                  *UnpackAndOp = link->da_UnpackAndLXOR;
1013eb02082bSJunchao Zhang     else if (op == MPI_BXOR)                  *UnpackAndOp = link->da_UnpackAndBXOR;
1014eb02082bSJunchao Zhang     else if (op == MPI_MAXLOC)                *UnpackAndOp = link->da_UnpackAndMaxloc;
1015eb02082bSJunchao Zhang     else if (op == MPI_MINLOC)                *UnpackAndOp = link->da_UnpackAndMinloc;
1016eb02082bSJunchao Zhang   }
1017eb02082bSJunchao Zhang #endif
101840e23c03SJunchao Zhang   PetscFunctionReturn(0);
101940e23c03SJunchao Zhang }
102040e23c03SJunchao Zhang 
1021cd620004SJunchao Zhang PetscErrorCode PetscSFLinkGetScatterAndOp(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**ScatterAndOp)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*))
102240e23c03SJunchao Zhang {
102340e23c03SJunchao Zhang   PetscFunctionBegin;
1024cd620004SJunchao Zhang   *ScatterAndOp = NULL;
1025eb02082bSJunchao Zhang   if (mtype == PETSC_MEMTYPE_HOST) {
1026cd620004SJunchao Zhang     if      (op == MPIU_REPLACE)              *ScatterAndOp = link->h_ScatterAndInsert;
1027cd620004SJunchao Zhang     else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->h_ScatterAndAdd;
1028cd620004SJunchao Zhang     else if (op == MPI_PROD)                  *ScatterAndOp = link->h_ScatterAndMult;
1029cd620004SJunchao Zhang     else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->h_ScatterAndMax;
1030cd620004SJunchao Zhang     else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->h_ScatterAndMin;
1031cd620004SJunchao Zhang     else if (op == MPI_LAND)                  *ScatterAndOp = link->h_ScatterAndLAND;
1032cd620004SJunchao Zhang     else if (op == MPI_BAND)                  *ScatterAndOp = link->h_ScatterAndBAND;
1033cd620004SJunchao Zhang     else if (op == MPI_LOR)                   *ScatterAndOp = link->h_ScatterAndLOR;
1034cd620004SJunchao Zhang     else if (op == MPI_BOR)                   *ScatterAndOp = link->h_ScatterAndBOR;
1035cd620004SJunchao Zhang     else if (op == MPI_LXOR)                  *ScatterAndOp = link->h_ScatterAndLXOR;
1036cd620004SJunchao Zhang     else if (op == MPI_BXOR)                  *ScatterAndOp = link->h_ScatterAndBXOR;
1037cd620004SJunchao Zhang     else if (op == MPI_MAXLOC)                *ScatterAndOp = link->h_ScatterAndMaxloc;
1038cd620004SJunchao Zhang     else if (op == MPI_MINLOC)                *ScatterAndOp = link->h_ScatterAndMinloc;
1039eb02082bSJunchao Zhang   }
1040eb02082bSJunchao Zhang #if defined(PETSC_HAVE_CUDA)
1041eb02082bSJunchao Zhang   else if (mtype == PETSC_MEMTYPE_DEVICE && !atomic) {
1042cd620004SJunchao Zhang     if      (op == MPIU_REPLACE)              *ScatterAndOp = link->d_ScatterAndInsert;
1043cd620004SJunchao Zhang     else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->d_ScatterAndAdd;
1044cd620004SJunchao Zhang     else if (op == MPI_PROD)                  *ScatterAndOp = link->d_ScatterAndMult;
1045cd620004SJunchao Zhang     else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->d_ScatterAndMax;
1046cd620004SJunchao Zhang     else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->d_ScatterAndMin;
1047cd620004SJunchao Zhang     else if (op == MPI_LAND)                  *ScatterAndOp = link->d_ScatterAndLAND;
1048cd620004SJunchao Zhang     else if (op == MPI_BAND)                  *ScatterAndOp = link->d_ScatterAndBAND;
1049cd620004SJunchao Zhang     else if (op == MPI_LOR)                   *ScatterAndOp = link->d_ScatterAndLOR;
1050cd620004SJunchao Zhang     else if (op == MPI_BOR)                   *ScatterAndOp = link->d_ScatterAndBOR;
1051cd620004SJunchao Zhang     else if (op == MPI_LXOR)                  *ScatterAndOp = link->d_ScatterAndLXOR;
1052cd620004SJunchao Zhang     else if (op == MPI_BXOR)                  *ScatterAndOp = link->d_ScatterAndBXOR;
1053cd620004SJunchao Zhang     else if (op == MPI_MAXLOC)                *ScatterAndOp = link->d_ScatterAndMaxloc;
1054cd620004SJunchao Zhang     else if (op == MPI_MINLOC)                *ScatterAndOp = link->d_ScatterAndMinloc;
1055eb02082bSJunchao Zhang   } else if (mtype == PETSC_MEMTYPE_DEVICE && atomic) {
1056cd620004SJunchao Zhang     if      (op == MPIU_REPLACE)              *ScatterAndOp = link->da_ScatterAndInsert;
1057cd620004SJunchao Zhang     else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->da_ScatterAndAdd;
1058cd620004SJunchao Zhang     else if (op == MPI_PROD)                  *ScatterAndOp = link->da_ScatterAndMult;
1059cd620004SJunchao Zhang     else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->da_ScatterAndMax;
1060cd620004SJunchao Zhang     else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->da_ScatterAndMin;
1061cd620004SJunchao Zhang     else if (op == MPI_LAND)                  *ScatterAndOp = link->da_ScatterAndLAND;
1062cd620004SJunchao Zhang     else if (op == MPI_BAND)                  *ScatterAndOp = link->da_ScatterAndBAND;
1063cd620004SJunchao Zhang     else if (op == MPI_LOR)                   *ScatterAndOp = link->da_ScatterAndLOR;
1064cd620004SJunchao Zhang     else if (op == MPI_BOR)                   *ScatterAndOp = link->da_ScatterAndBOR;
1065cd620004SJunchao Zhang     else if (op == MPI_LXOR)                  *ScatterAndOp = link->da_ScatterAndLXOR;
1066cd620004SJunchao Zhang     else if (op == MPI_BXOR)                  *ScatterAndOp = link->da_ScatterAndBXOR;
1067cd620004SJunchao Zhang     else if (op == MPI_MAXLOC)                *ScatterAndOp = link->da_ScatterAndMaxloc;
1068cd620004SJunchao Zhang     else if (op == MPI_MINLOC)                *ScatterAndOp = link->da_ScatterAndMinloc;
1069eb02082bSJunchao Zhang   }
1070eb02082bSJunchao Zhang #endif
1071cd620004SJunchao Zhang   PetscFunctionReturn(0);
1072cd620004SJunchao Zhang }
1073cd620004SJunchao Zhang 
1074cd620004SJunchao Zhang PetscErrorCode PetscSFLinkGetFetchAndOp(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**FetchAndOp)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,void*))
1075cd620004SJunchao Zhang {
1076cd620004SJunchao Zhang   PetscFunctionBegin;
1077cd620004SJunchao Zhang   *FetchAndOp = NULL;
1078cd620004SJunchao Zhang   if (op != MPI_SUM && op != MPIU_SUM) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for MPI_Op in FetchAndOp");
1079cd620004SJunchao Zhang   if (mtype == PETSC_MEMTYPE_HOST) *FetchAndOp = link->h_FetchAndAdd;
1080cd620004SJunchao Zhang #if defined(PETSC_HAVE_CUDA)
1081cd620004SJunchao Zhang   else if (mtype == PETSC_MEMTYPE_DEVICE && !atomic) *FetchAndOp = link->d_FetchAndAdd;
1082cd620004SJunchao Zhang   else if (mtype == PETSC_MEMTYPE_DEVICE && atomic)  *FetchAndOp = link->da_FetchAndAdd;
1083cd620004SJunchao Zhang #endif
1084cd620004SJunchao Zhang   PetscFunctionReturn(0);
1085cd620004SJunchao Zhang }
1086cd620004SJunchao Zhang 
1087cd620004SJunchao Zhang PetscErrorCode PetscSFLinkGetFetchAndOpLocal(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**FetchAndOpLocal)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,void*,PetscInt,const PetscInt*,const void*,void*))
1088cd620004SJunchao Zhang {
1089cd620004SJunchao Zhang   PetscFunctionBegin;
1090cd620004SJunchao Zhang   *FetchAndOpLocal = NULL;
1091cd620004SJunchao Zhang   if (op != MPI_SUM && op != MPIU_SUM) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for MPI_Op in FetchAndOp");
1092cd620004SJunchao Zhang   if (mtype == PETSC_MEMTYPE_HOST) *FetchAndOpLocal = link->h_FetchAndAddLocal;
1093cd620004SJunchao Zhang #if defined(PETSC_HAVE_CUDA)
1094cd620004SJunchao Zhang   else if (mtype == PETSC_MEMTYPE_DEVICE && !atomic) *FetchAndOpLocal = link->d_FetchAndAddLocal;
1095cd620004SJunchao Zhang   else if (mtype == PETSC_MEMTYPE_DEVICE && atomic)  *FetchAndOpLocal = link->da_FetchAndAddLocal;
1096cd620004SJunchao Zhang #endif
1097cd620004SJunchao Zhang   PetscFunctionReturn(0);
1098cd620004SJunchao Zhang }
1099cd620004SJunchao Zhang 
1100cd620004SJunchao Zhang /*=============================================================================
1101cd620004SJunchao Zhang               A set of helper routines for Pack/Unpack/Scatter on GPUs
1102cd620004SJunchao Zhang  ============================================================================*/
1103cd620004SJunchao Zhang #if defined(PETSC_HAVE_CUDA)
1104cd620004SJunchao Zhang /* If SF does not know which stream root/leafdata is being computed on, it has to sync the device to
1105cd620004SJunchao Zhang    make sure the data is ready for packing.
1106cd620004SJunchao Zhang  */
1107cd620004SJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkSyncDeviceBeforePackData(PetscSF sf,PetscSFLink link)
1108cd620004SJunchao Zhang {
1109cd620004SJunchao Zhang   PetscFunctionBegin;
1110cd620004SJunchao Zhang   if (sf->use_default_stream) PetscFunctionReturn(0);
1111cd620004SJunchao Zhang   if (link->rootmtype == PETSC_MEMTYPE_DEVICE || link->leafmtype == PETSC_MEMTYPE_DEVICE) {
1112cd620004SJunchao Zhang     cudaError_t cerr = cudaDeviceSynchronize();CHKERRCUDA(cerr);
1113cd620004SJunchao Zhang   }
1114cd620004SJunchao Zhang   PetscFunctionReturn(0);
1115cd620004SJunchao Zhang }
1116cd620004SJunchao Zhang 
1117cd620004SJunchao Zhang /* PetscSFLinkSyncStreamAfterPackXxxData routines make sure root/leafbuf for the remote is ready for MPI */
1118cd620004SJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkSyncStreamAfterPackRootData(PetscSF sf,PetscSFLink link)
1119cd620004SJunchao Zhang {
1120cd620004SJunchao Zhang   PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
1121cd620004SJunchao Zhang 
1122cd620004SJunchao Zhang   PetscFunctionBegin;
1123b85e67b7SJunchao Zhang   /* Do nothing if we use stream aware mpi || has nothing for remote */
1124b85e67b7SJunchao Zhang   if (sf->use_stream_aware_mpi || link->rootmtype != PETSC_MEMTYPE_DEVICE || !bas->rootbuflen[PETSCSF_REMOTE]) PetscFunctionReturn(0);
1125b85e67b7SJunchao Zhang   /* If we called a packing kernel || we async-copied rootdata from device to host || No cudaDeviceSynchronize was called (since default stream is assumed) */
1126b85e67b7SJunchao Zhang   if (!link->rootdirect[PETSCSF_REMOTE] || !use_gpu_aware_mpi || sf->use_default_stream) {
1127cd620004SJunchao Zhang     cudaError_t cerr = cudaStreamSynchronize(link->stream);CHKERRCUDA(cerr);
1128cd620004SJunchao Zhang   }
1129cd620004SJunchao Zhang   PetscFunctionReturn(0);
1130cd620004SJunchao Zhang }
1131cd620004SJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkSyncStreamAfterPackLeafData(PetscSF sf,PetscSFLink link)
1132cd620004SJunchao Zhang {
1133cd620004SJunchao Zhang   PetscFunctionBegin;
1134b85e67b7SJunchao Zhang   /* See comments above */
1135b85e67b7SJunchao Zhang   if (sf->use_stream_aware_mpi || link->leafmtype != PETSC_MEMTYPE_DEVICE || !sf->leafbuflen[PETSCSF_REMOTE]) PetscFunctionReturn(0);
1136b85e67b7SJunchao Zhang   if (!link->leafdirect[PETSCSF_REMOTE] || !use_gpu_aware_mpi || sf->use_default_stream) {
1137cd620004SJunchao Zhang     cudaError_t cerr = cudaStreamSynchronize(link->stream);CHKERRCUDA(cerr);
1138cd620004SJunchao Zhang   }
1139cd620004SJunchao Zhang   PetscFunctionReturn(0);
1140cd620004SJunchao Zhang }
1141cd620004SJunchao Zhang 
1142cd620004SJunchao Zhang /* PetscSFLinkSyncStreamAfterUnpackXxx routines make sure root/leafdata (local & remote) is ready to use for SF callers, when SF
1143cd620004SJunchao Zhang    does not know which stream the callers will use.
1144cd620004SJunchao Zhang */
1145cd620004SJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkSyncStreamAfterUnpackRootData(PetscSF sf,PetscSFLink link)
1146cd620004SJunchao Zhang {
1147cd620004SJunchao Zhang   PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
1148cd620004SJunchao Zhang 
1149cd620004SJunchao Zhang   PetscFunctionBegin;
1150b85e67b7SJunchao Zhang   /* Do nothing if we are expected to put rootdata on default stream */
1151b85e67b7SJunchao Zhang   if (sf->use_default_stream || link->rootmtype != PETSC_MEMTYPE_DEVICE) PetscFunctionReturn(0);
1152b85e67b7SJunchao Zhang   /* If we have something from local, then we called a scatter kernel (on link->stream), then we must sync it;
1153b85e67b7SJunchao Zhang      If we have something from remote and we called unpack kernel, then we must also sycn it.
1154b85e67b7SJunchao Zhang    */
1155b85e67b7SJunchao Zhang   if (bas->rootbuflen[PETSCSF_LOCAL] || (bas->rootbuflen[PETSCSF_REMOTE] && !link->rootdirect[PETSCSF_REMOTE])) {
1156cd620004SJunchao Zhang     cudaError_t cerr = cudaStreamSynchronize(link->stream);CHKERRCUDA(cerr);
1157cd620004SJunchao Zhang   }
1158cd620004SJunchao Zhang   PetscFunctionReturn(0);
1159cd620004SJunchao Zhang }
1160cd620004SJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkSyncStreamAfterUnpackLeafData(PetscSF sf,PetscSFLink link)
1161cd620004SJunchao Zhang {
1162cd620004SJunchao Zhang   PetscFunctionBegin;
1163b85e67b7SJunchao Zhang   /* See comments above */
1164b85e67b7SJunchao Zhang   if (sf->use_default_stream || link->leafmtype != PETSC_MEMTYPE_DEVICE) PetscFunctionReturn(0);
1165b85e67b7SJunchao Zhang   if (sf->leafbuflen[PETSCSF_LOCAL] || (sf->leafbuflen[PETSCSF_REMOTE] && !link->leafdirect[PETSCSF_REMOTE])) {
1166cd620004SJunchao Zhang     cudaError_t cerr = cudaStreamSynchronize(link->stream);CHKERRCUDA(cerr);
1167cd620004SJunchao Zhang   }
1168cd620004SJunchao Zhang   PetscFunctionReturn(0);
1169cd620004SJunchao Zhang }
1170cd620004SJunchao Zhang 
1171cd620004SJunchao Zhang /* PetscSFLinkCopyXxxxBufferInCaseNotUseGpuAwareMPI routines are simple: if not use_gpu_aware_mpi, we need
1172cd620004SJunchao Zhang    to copy the buffer from GPU to CPU before MPI calls, and from CPU to GPU after MPI calls.
1173cd620004SJunchao Zhang */
1174cd620004SJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(PetscSF sf,PetscSFLink link,PetscBool device2host)
1175cd620004SJunchao Zhang {
1176cd620004SJunchao Zhang   PetscErrorCode ierr;
1177cd620004SJunchao Zhang   cudaError_t    cerr;
1178cd620004SJunchao Zhang   PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
1179cd620004SJunchao Zhang 
1180cd620004SJunchao Zhang   PetscFunctionBegin;
1181cd620004SJunchao Zhang   if (link->rootmtype == PETSC_MEMTYPE_DEVICE && (link->rootmtype_mpi != link->rootmtype) && bas->rootbuflen[PETSCSF_REMOTE]) {
1182cd620004SJunchao Zhang     void  *h_buf = link->rootbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST];
1183cd620004SJunchao Zhang     void  *d_buf = link->rootbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_DEVICE];
1184cd620004SJunchao Zhang     size_t count = bas->rootbuflen[PETSCSF_REMOTE]*link->unitbytes;
1185cd620004SJunchao Zhang     if (device2host) {
1186cd620004SJunchao Zhang       cerr = cudaMemcpyAsync(h_buf,d_buf,count,cudaMemcpyDeviceToHost,link->stream);CHKERRCUDA(cerr);
1187cd620004SJunchao Zhang       ierr = PetscLogGpuToCpu(count);CHKERRQ(ierr);
1188cd620004SJunchao Zhang     } else {
1189cd620004SJunchao Zhang       cerr = cudaMemcpyAsync(d_buf,h_buf,count,cudaMemcpyHostToDevice,link->stream);CHKERRCUDA(cerr);
1190cd620004SJunchao Zhang       ierr = PetscLogCpuToGpu(count);CHKERRQ(ierr);
1191cd620004SJunchao Zhang     }
1192cd620004SJunchao Zhang   }
1193cd620004SJunchao Zhang   PetscFunctionReturn(0);
1194cd620004SJunchao Zhang }
1195cd620004SJunchao Zhang 
1196cd620004SJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(PetscSF sf,PetscSFLink link,PetscBool device2host)
1197cd620004SJunchao Zhang {
1198cd620004SJunchao Zhang   PetscErrorCode ierr;
1199cd620004SJunchao Zhang   cudaError_t    cerr;
1200cd620004SJunchao Zhang 
1201cd620004SJunchao Zhang   PetscFunctionBegin;
1202cd620004SJunchao Zhang   if (link->leafmtype == PETSC_MEMTYPE_DEVICE && (link->leafmtype_mpi != link->leafmtype) && sf->leafbuflen[PETSCSF_REMOTE]) {
1203cd620004SJunchao Zhang     void  *h_buf = link->leafbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST];
1204cd620004SJunchao Zhang     void  *d_buf = link->leafbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_DEVICE];
1205cd620004SJunchao Zhang     size_t count = sf->leafbuflen[PETSCSF_REMOTE]*link->unitbytes;
1206cd620004SJunchao Zhang     if (device2host) {
1207cd620004SJunchao Zhang       cerr = cudaMemcpyAsync(h_buf,d_buf,count,cudaMemcpyDeviceToHost,link->stream);CHKERRCUDA(cerr);
1208cd620004SJunchao Zhang       ierr = PetscLogGpuToCpu(count);CHKERRQ(ierr);
1209cd620004SJunchao Zhang     } else {
1210cd620004SJunchao Zhang       cerr = cudaMemcpyAsync(d_buf,h_buf,count,cudaMemcpyHostToDevice,link->stream);CHKERRCUDA(cerr);
1211cd620004SJunchao Zhang       ierr = PetscLogCpuToGpu(count);CHKERRQ(ierr);
1212cd620004SJunchao Zhang     }
1213cd620004SJunchao Zhang   }
1214cd620004SJunchao Zhang   PetscFunctionReturn(0);
1215cd620004SJunchao Zhang }
1216cd620004SJunchao Zhang #else
1217cd620004SJunchao Zhang 
1218cd620004SJunchao Zhang #define PetscSFLinkSyncDeviceBeforePackData(a,b)                0
1219cd620004SJunchao Zhang #define PetscSFLinkSyncStreamAfterPackRootData(a,b)             0
1220cd620004SJunchao Zhang #define PetscSFLinkSyncStreamAfterPackLeafData(a,b)             0
1221cd620004SJunchao Zhang #define PetscSFLinkSyncStreamAfterUnpackRootData(a,b)           0
1222cd620004SJunchao Zhang #define PetscSFLinkSyncStreamAfterUnpackLeafData(a,b)           0
1223cd620004SJunchao Zhang #define PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(a,b,c) 0
1224cd620004SJunchao Zhang #define PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(a,b,c) 0
1225cd620004SJunchao Zhang 
1226cd620004SJunchao Zhang #endif
1227cd620004SJunchao Zhang 
1228cd620004SJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkLogFlopsAfterUnpackRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,MPI_Op op)
1229cd620004SJunchao Zhang {
1230cd620004SJunchao Zhang   PetscErrorCode ierr;
1231cd620004SJunchao Zhang   PetscLogDouble flops;
1232cd620004SJunchao Zhang   PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
1233cd620004SJunchao Zhang 
1234cd620004SJunchao Zhang 
1235cd620004SJunchao Zhang   PetscFunctionBegin;
1236cd620004SJunchao Zhang   if (op != MPIU_REPLACE && link->basicunit == MPIU_SCALAR) { /* op is a reduction on PetscScalars */
1237cd620004SJunchao Zhang     flops = bas->rootbuflen[scope]*link->bs; /* # of roots in buffer x # of scalars in unit */
1238cd620004SJunchao Zhang #if defined(PETSC_HAVE_CUDA)
1239cd620004SJunchao Zhang     if (link->rootmtype == PETSC_MEMTYPE_DEVICE) {ierr = PetscLogGpuFlops(flops);CHKERRQ(ierr);} else
1240cd620004SJunchao Zhang #endif
1241cd620004SJunchao Zhang     {ierr = PetscLogFlops(flops);CHKERRQ(ierr);}
1242cd620004SJunchao Zhang   }
1243cd620004SJunchao Zhang   PetscFunctionReturn(0);
1244cd620004SJunchao Zhang }
1245cd620004SJunchao Zhang 
1246cd620004SJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkLogFlopsAfterUnpackLeafData(PetscSF sf,PetscSFLink link,PetscSFScope scope,MPI_Op op)
1247cd620004SJunchao Zhang {
1248cd620004SJunchao Zhang   PetscLogDouble flops;
1249cd620004SJunchao Zhang   PetscErrorCode ierr;
1250cd620004SJunchao Zhang 
1251cd620004SJunchao Zhang   PetscFunctionBegin;
1252cd620004SJunchao Zhang   if (op != MPIU_REPLACE && link->basicunit == MPIU_SCALAR) { /* op is a reduction on PetscScalars */
1253cd620004SJunchao Zhang     flops = sf->leafbuflen[scope]*link->bs; /* # of roots in buffer x # of scalars in unit */
1254cd620004SJunchao Zhang #if defined(PETSC_HAVE_CUDA)
1255cd620004SJunchao Zhang     if (link->leafmtype == PETSC_MEMTYPE_DEVICE) {ierr = PetscLogGpuFlops(flops);CHKERRQ(ierr);} else
1256cd620004SJunchao Zhang #endif
1257cd620004SJunchao Zhang     {ierr = PetscLogFlops(flops);CHKERRQ(ierr);}
1258cd620004SJunchao Zhang   }
1259cd620004SJunchao Zhang   PetscFunctionReturn(0);
1260cd620004SJunchao Zhang }
1261cd620004SJunchao Zhang 
1262cd620004SJunchao Zhang /* When SF could not find a proper UnpackAndOp() from link, it falls back to MPI_Reduce_local.
1263cd620004SJunchao Zhang   Input Arguments:
1264cd620004SJunchao Zhang   +sf      - The StarForest
1265cd620004SJunchao Zhang   .link    - The link
1266cd620004SJunchao Zhang   .count   - Number of entries to unpack
1267cd620004SJunchao Zhang   .start   - The first index, significent when indices=NULL
1268cd620004SJunchao Zhang   .indices - Indices of entries in <data>. If NULL, it means indices are contiguous and the first is given in <start>
1269cd620004SJunchao Zhang   .buf     - A contiguous buffer to unpack from
1270cd620004SJunchao Zhang   -op      - Operation after unpack
1271cd620004SJunchao Zhang 
1272cd620004SJunchao Zhang   Output Arguments:
1273cd620004SJunchao Zhang   .data    - The data to unpack to
1274cd620004SJunchao Zhang */
1275cd620004SJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkUnpackDataWithMPIReduceLocal(PetscSF sf,PetscSFLink link,PetscInt count,PetscInt start,const PetscInt *indices,void *data,const void *buf,MPI_Op op)
1276cd620004SJunchao Zhang {
1277cd620004SJunchao Zhang   PetscFunctionBegin;
1278cd620004SJunchao Zhang #if defined(PETSC_HAVE_MPI_REDUCE_LOCAL)
1279cd620004SJunchao Zhang   {
1280cd620004SJunchao Zhang     PetscErrorCode ierr;
1281cd620004SJunchao Zhang     PetscInt       i;
1282cd620004SJunchao Zhang     PetscMPIInt    n;
1283cd620004SJunchao Zhang     if (indices) {
1284cd620004SJunchao Zhang       /* Note we use link->unit instead of link->basicunit. When op can be mapped to MPI_SUM etc, it operates on
1285cd620004SJunchao Zhang          basic units of a root/leaf element-wisely. Otherwise, it is meant to operate on a whole root/leaf.
1286cd620004SJunchao Zhang       */
1287cd620004SJunchao Zhang       for (i=0; i<count; i++) {ierr = MPI_Reduce_local((const char*)buf+i*link->unitbytes,(char*)data+indices[i]*link->unitbytes,1,link->unit,op);CHKERRQ(ierr);}
1288cd620004SJunchao Zhang     } else {
1289cd620004SJunchao Zhang       ierr = PetscMPIIntCast(count,&n);CHKERRQ(ierr);
1290cd620004SJunchao Zhang       ierr = MPI_Reduce_local(buf,(char*)data+start*link->unitbytes,n,link->unit,op);CHKERRQ(ierr);
1291cd620004SJunchao Zhang     }
1292cd620004SJunchao Zhang   }
1293cd620004SJunchao Zhang #else
1294cd620004SJunchao Zhang   SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No unpacking reduction operation for this MPI_Op");
1295cd620004SJunchao Zhang #endif
1296cd620004SJunchao Zhang   PetscFunctionReturn(0);
1297cd620004SJunchao Zhang }
1298cd620004SJunchao Zhang 
1299cd620004SJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkScatterDataWithMPIReduceLocal(PetscSF sf,PetscSFLink link,PetscInt count,const PetscInt *idx,const void *xdata,PetscInt starty,const PetscInt *idy,void *ydata,MPI_Op op)
1300cd620004SJunchao Zhang {
1301cd620004SJunchao Zhang   PetscFunctionBegin;
1302cd620004SJunchao Zhang #if defined(PETSC_HAVE_MPI_REDUCE_LOCAL)
1303cd620004SJunchao Zhang   {
1304cd620004SJunchao Zhang     PetscErrorCode ierr;
1305cd620004SJunchao Zhang     PetscInt       i,disp;
1306cd620004SJunchao Zhang     for (i=0; i<count; i++) {
1307cd620004SJunchao Zhang       disp = idy? idy[i] : starty + i;
1308cd620004SJunchao Zhang       ierr = MPI_Reduce_local((const char*)xdata+idx[i]*link->unitbytes,(char*)ydata+disp*link->unitbytes,1,link->unit,op);CHKERRQ(ierr);
1309cd620004SJunchao Zhang     }
1310cd620004SJunchao Zhang   }
1311cd620004SJunchao Zhang #else
1312cd620004SJunchao Zhang   SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No unpacking reduction operation for this MPI_Op");
1313cd620004SJunchao Zhang #endif
1314cd620004SJunchao Zhang   PetscFunctionReturn(0);
1315cd620004SJunchao Zhang }
1316cd620004SJunchao Zhang 
1317cd620004SJunchao Zhang /*=============================================================================
1318cd620004SJunchao Zhang               Pack/Unpack/Fetch/Scatter routines
1319cd620004SJunchao Zhang  ============================================================================*/
1320cd620004SJunchao Zhang 
1321cd620004SJunchao Zhang /* Pack rootdata to rootbuf
1322cd620004SJunchao Zhang   Input Arguments:
1323cd620004SJunchao Zhang   + sf       - The SF this packing works on.
1324cd620004SJunchao Zhang   . link     - It gives the memtype of the roots and also provides root buffer.
1325cd620004SJunchao Zhang   . scope    - PETSCSF_LOCAL or PETSCSF_REMOTE. Note SF has the ability to do local and remote communications separately.
1326cd620004SJunchao Zhang   - rootdata - Where to read the roots.
1327cd620004SJunchao Zhang 
1328cd620004SJunchao Zhang   Notes:
1329cd620004SJunchao Zhang   When rootdata can be directly used as root buffer, the routine is almost a no-op. After the call, root data is
1330cd620004SJunchao Zhang   in a place where the underlying MPI is ready can access (use_gpu_aware_mpi or not)
1331cd620004SJunchao Zhang  */
1332cd620004SJunchao Zhang PetscErrorCode PetscSFLinkPackRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,const void *rootdata)
1333cd620004SJunchao Zhang {
1334cd620004SJunchao Zhang   PetscErrorCode   ierr;
1335cd620004SJunchao Zhang   PetscSF_Basic    *bas = (PetscSF_Basic*)sf->data;
1336cd620004SJunchao Zhang   const PetscInt   *rootindices = NULL;
1337cd620004SJunchao Zhang   PetscInt         count,start;
1338cd620004SJunchao Zhang   PetscErrorCode   (*Pack)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,const void*,void*) = NULL;
1339cd620004SJunchao Zhang   PetscMemType     rootmtype = link->rootmtype;
1340cd620004SJunchao Zhang 
1341cd620004SJunchao Zhang   PetscFunctionBegin;
1342cd620004SJunchao Zhang   ierr = PetscLogEventBegin(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr);
1343cd620004SJunchao Zhang   if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkSyncDeviceBeforePackData(sf,link);CHKERRQ(ierr);}
1344cd620004SJunchao Zhang   if (!link->rootdirect[scope] && bas->rootbuflen[scope]) { /* If rootdata works directly as rootbuf, skip packing */
1345cd620004SJunchao Zhang     ierr = PetscSFLinkGetRootIndices(sf,link,rootmtype,scope,&count,&start,&rootindices);CHKERRQ(ierr);
1346cd620004SJunchao Zhang     ierr = PetscSFLinkGetPack(link,rootmtype,&Pack);CHKERRQ(ierr);
1347cd620004SJunchao Zhang     ierr = (*Pack)(link,count,start,rootindices,bas->rootpackopt[scope],rootdata,link->rootbuf[scope][rootmtype]);CHKERRQ(ierr);
1348cd620004SJunchao Zhang   }
1349cd620004SJunchao Zhang   if (scope == PETSCSF_REMOTE) {
1350cd620004SJunchao Zhang     ierr = PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_TRUE/*device2host*/);CHKERRQ(ierr);
1351cd620004SJunchao Zhang     ierr = PetscSFLinkSyncStreamAfterPackRootData(sf,link);CHKERRQ(ierr);
1352cd620004SJunchao Zhang   }
1353cd620004SJunchao Zhang   ierr = PetscLogEventEnd(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr);
1354cd620004SJunchao Zhang   PetscFunctionReturn(0);
1355cd620004SJunchao Zhang }
1356cd620004SJunchao Zhang 
1357cd620004SJunchao Zhang /* Pack leafdata to leafbuf */
1358cd620004SJunchao Zhang PetscErrorCode PetscSFLinkPackLeafData(PetscSF sf,PetscSFLink link,PetscSFScope scope,const void *leafdata)
1359cd620004SJunchao Zhang {
1360cd620004SJunchao Zhang   PetscErrorCode   ierr;
1361cd620004SJunchao Zhang   const PetscInt   *leafindices = NULL;
1362cd620004SJunchao Zhang   PetscInt         count,start;
1363cd620004SJunchao Zhang   PetscErrorCode   (*Pack)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,const void*,void*) = NULL;
1364cd620004SJunchao Zhang   PetscMemType     leafmtype = link->leafmtype;
1365cd620004SJunchao Zhang 
1366cd620004SJunchao Zhang   PetscFunctionBegin;
1367cd620004SJunchao Zhang   ierr = PetscLogEventBegin(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr);
1368cd620004SJunchao Zhang   if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkSyncDeviceBeforePackData(sf,link);CHKERRQ(ierr);}
1369cd620004SJunchao Zhang   if (!link->leafdirect[scope] && sf->leafbuflen[scope]) { /* If leafdata works directly as rootbuf, skip packing */
1370cd620004SJunchao Zhang     ierr = PetscSFLinkGetLeafIndices(sf,link,leafmtype,scope,&count,&start,&leafindices);CHKERRQ(ierr);
1371cd620004SJunchao Zhang     ierr = PetscSFLinkGetPack(link,leafmtype,&Pack);CHKERRQ(ierr);
1372cd620004SJunchao Zhang     ierr = (*Pack)(link,count,start,leafindices,sf->leafpackopt[scope],leafdata,link->leafbuf[scope][leafmtype]);CHKERRQ(ierr);
1373cd620004SJunchao Zhang   }
1374cd620004SJunchao Zhang   if (scope == PETSCSF_REMOTE) {
1375cd620004SJunchao Zhang     ierr = PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_TRUE/*device2host*/);CHKERRQ(ierr);
1376cd620004SJunchao Zhang     ierr = PetscSFLinkSyncStreamAfterPackLeafData(sf,link);CHKERRQ(ierr);
1377cd620004SJunchao Zhang   }
1378cd620004SJunchao Zhang   ierr = PetscLogEventEnd(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr);
1379cd620004SJunchao Zhang   PetscFunctionReturn(0);
1380cd620004SJunchao Zhang }
1381cd620004SJunchao Zhang 
1382cd620004SJunchao Zhang /* Unpack rootbuf to rootdata */
1383cd620004SJunchao Zhang PetscErrorCode PetscSFLinkUnpackRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,void *rootdata,MPI_Op op)
1384cd620004SJunchao Zhang {
1385cd620004SJunchao Zhang   PetscErrorCode   ierr;
1386cd620004SJunchao Zhang   const PetscInt   *rootindices = NULL;
1387cd620004SJunchao Zhang   PetscInt         count,start;
1388cd620004SJunchao Zhang   PetscSF_Basic    *bas = (PetscSF_Basic*)sf->data;
1389cd620004SJunchao Zhang   PetscErrorCode   (*UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*) = NULL;
1390cd620004SJunchao Zhang   PetscMemType     rootmtype = link->rootmtype;
1391cd620004SJunchao Zhang 
1392cd620004SJunchao Zhang   PetscFunctionBegin;
1393cd620004SJunchao Zhang   ierr = PetscLogEventBegin(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr);
1394cd620004SJunchao Zhang   if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_FALSE);CHKERRQ(ierr);}
1395cd620004SJunchao Zhang   if (!link->rootdirect[scope] && bas->rootbuflen[scope]) { /* If rootdata works directly as rootbuf, skip unpacking */
1396cd620004SJunchao Zhang     ierr = PetscSFLinkGetUnpackAndOp(link,rootmtype,op,bas->rootdups[scope],&UnpackAndOp);CHKERRQ(ierr);
1397cd620004SJunchao Zhang     if (UnpackAndOp) {
1398cd620004SJunchao Zhang       ierr = PetscSFLinkGetRootIndices(sf,link,rootmtype,scope,&count,&start,&rootindices);CHKERRQ(ierr);
1399cd620004SJunchao Zhang       ierr = (*UnpackAndOp)(link,count,start,rootindices,bas->rootpackopt[scope],rootdata,link->rootbuf[scope][rootmtype]);CHKERRQ(ierr);
1400cd620004SJunchao Zhang     } else {
1401cd620004SJunchao Zhang       ierr = PetscSFLinkGetRootIndices(sf,link,PETSC_MEMTYPE_HOST,scope,&count,&start,&rootindices);CHKERRQ(ierr);
1402cd620004SJunchao Zhang       ierr = PetscSFLinkUnpackDataWithMPIReduceLocal(sf,link,count,start,rootindices,rootdata,link->rootbuf[scope][rootmtype],op);CHKERRQ(ierr);
1403cd620004SJunchao Zhang     }
1404cd620004SJunchao Zhang   }
1405cd620004SJunchao Zhang   if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkSyncStreamAfterUnpackRootData(sf,link);CHKERRQ(ierr);}
1406cd620004SJunchao Zhang   ierr = PetscSFLinkLogFlopsAfterUnpackRootData(sf,link,scope,op);CHKERRQ(ierr);
1407cd620004SJunchao Zhang   ierr = PetscLogEventEnd(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr);
1408cd620004SJunchao Zhang   PetscFunctionReturn(0);
1409cd620004SJunchao Zhang }
1410cd620004SJunchao Zhang 
1411cd620004SJunchao Zhang /* Unpack leafbuf to leafdata */
1412cd620004SJunchao Zhang PetscErrorCode PetscSFLinkUnpackLeafData(PetscSF sf,PetscSFLink link,PetscSFScope scope,void *leafdata,MPI_Op op)
1413cd620004SJunchao Zhang {
1414cd620004SJunchao Zhang   PetscErrorCode   ierr;
1415cd620004SJunchao Zhang   const PetscInt   *leafindices = NULL;
1416cd620004SJunchao Zhang   PetscInt         count,start;
1417cd620004SJunchao Zhang   PetscErrorCode   (*UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*) = NULL;
1418cd620004SJunchao Zhang   PetscMemType     leafmtype = link->leafmtype;
1419cd620004SJunchao Zhang 
1420cd620004SJunchao Zhang   PetscFunctionBegin;
1421cd620004SJunchao Zhang   ierr = PetscLogEventBegin(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr);
1422cd620004SJunchao Zhang   if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_FALSE);CHKERRQ(ierr);}
1423cd620004SJunchao Zhang   if (!link->leafdirect[scope] && sf->leafbuflen[scope]) { /* If leafdata works directly as rootbuf, skip unpacking */
1424cd620004SJunchao Zhang     ierr = PetscSFLinkGetUnpackAndOp(link,leafmtype,op,sf->leafdups[scope],&UnpackAndOp);CHKERRQ(ierr);
1425cd620004SJunchao Zhang     if (UnpackAndOp) {
1426cd620004SJunchao Zhang       ierr = PetscSFLinkGetLeafIndices(sf,link,leafmtype,scope,&count,&start,&leafindices);CHKERRQ(ierr);
1427cd620004SJunchao Zhang       ierr = (*UnpackAndOp)(link,count,start,leafindices,sf->leafpackopt[scope],leafdata,link->leafbuf[scope][leafmtype]);CHKERRQ(ierr);
1428cd620004SJunchao Zhang     } else {
1429cd620004SJunchao Zhang       ierr = PetscSFLinkGetLeafIndices(sf,link,PETSC_MEMTYPE_HOST,scope,&count,&start,&leafindices);CHKERRQ(ierr);
1430cd620004SJunchao Zhang       ierr = PetscSFLinkUnpackDataWithMPIReduceLocal(sf,link,count,start,leafindices,leafdata,link->leafbuf[scope][leafmtype],op);CHKERRQ(ierr);
1431cd620004SJunchao Zhang     }
1432cd620004SJunchao Zhang   }
1433cd620004SJunchao Zhang   if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkSyncStreamAfterUnpackLeafData(sf,link);CHKERRQ(ierr);}
1434cd620004SJunchao Zhang   ierr = PetscSFLinkLogFlopsAfterUnpackLeafData(sf,link,scope,op);CHKERRQ(ierr);
1435cd620004SJunchao Zhang   ierr = PetscLogEventEnd(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr);
1436cd620004SJunchao Zhang   PetscFunctionReturn(0);
1437cd620004SJunchao Zhang }
1438cd620004SJunchao Zhang 
1439cd620004SJunchao Zhang /* FetchAndOp rootdata with rootbuf */
1440cd620004SJunchao Zhang PetscErrorCode PetscSFLinkFetchRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,void *rootdata,MPI_Op op)
1441cd620004SJunchao Zhang {
1442cd620004SJunchao Zhang   PetscErrorCode     ierr;
1443cd620004SJunchao Zhang   const PetscInt     *rootindices = NULL;
1444cd620004SJunchao Zhang   PetscInt           count,start;
1445cd620004SJunchao Zhang   PetscSF_Basic      *bas = (PetscSF_Basic*)sf->data;
1446cd620004SJunchao Zhang   PetscErrorCode     (*FetchAndOp)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,void*) = NULL;
1447cd620004SJunchao Zhang   PetscMemType       rootmtype = link->rootmtype;
1448cd620004SJunchao Zhang 
1449cd620004SJunchao Zhang   PetscFunctionBegin;
1450cd620004SJunchao Zhang   ierr = PetscLogEventBegin(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr);
1451cd620004SJunchao Zhang   if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_FALSE);CHKERRQ(ierr);}
1452cd620004SJunchao Zhang   if (bas->rootbuflen[scope]) {
1453cd620004SJunchao Zhang     /* Do FetchAndOp on rootdata with rootbuf */
1454cd620004SJunchao Zhang     ierr = PetscSFLinkGetFetchAndOp(link,rootmtype,op,bas->rootdups[scope],&FetchAndOp);CHKERRQ(ierr);
1455cd620004SJunchao Zhang     ierr = PetscSFLinkGetRootIndices(sf,link,rootmtype,scope,&count,&start,&rootindices);CHKERRQ(ierr);
1456cd620004SJunchao Zhang     ierr = (*FetchAndOp)(link,count,start,rootindices,bas->rootpackopt[scope],rootdata,link->rootbuf[scope][rootmtype]);CHKERRQ(ierr);
1457cd620004SJunchao Zhang   }
1458cd620004SJunchao Zhang   if (scope == PETSCSF_REMOTE) {
1459cd620004SJunchao Zhang     ierr = PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_TRUE);CHKERRQ(ierr);
1460cd620004SJunchao Zhang     ierr = PetscSFLinkSyncStreamAfterUnpackRootData(sf,link);CHKERRQ(ierr);
1461cd620004SJunchao Zhang   }
1462cd620004SJunchao Zhang   ierr = PetscSFLinkLogFlopsAfterUnpackRootData(sf,link,scope,op);CHKERRQ(ierr);
1463cd620004SJunchao Zhang   ierr = PetscLogEventEnd(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr);
1464cd620004SJunchao Zhang   PetscFunctionReturn(0);
1465cd620004SJunchao Zhang }
1466cd620004SJunchao Zhang 
1467cd620004SJunchao Zhang /* Bcast rootdata to leafdata locally (i.e., only for local communication - PETSCSF_LOCAL) */
1468cd620004SJunchao Zhang PetscErrorCode PetscSFLinkBcastAndOpLocal(PetscSF sf,PetscSFLink link,const void *rootdata,void *leafdata,MPI_Op op)
1469cd620004SJunchao Zhang {
1470cd620004SJunchao Zhang   PetscErrorCode       ierr;
1471cd620004SJunchao Zhang   const PetscInt       *rootindices = NULL,*leafindices = NULL;
1472cd620004SJunchao Zhang   PetscInt             count,rootstart,leafstart;
1473cd620004SJunchao Zhang   PetscSF_Basic        *bas = (PetscSF_Basic*)sf->data;
1474cd620004SJunchao Zhang   PetscErrorCode       (*UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*) = NULL;
1475cd620004SJunchao Zhang   PetscErrorCode       (*ScatterAndOp)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*) = NULL;
1476cd620004SJunchao Zhang   const PetscMemType   rootmtype = link->rootmtype,leafmtype = link->leafmtype;
1477cd620004SJunchao Zhang 
1478cd620004SJunchao Zhang   PetscFunctionBegin;
1479cd620004SJunchao Zhang   if (!bas->rootbuflen[PETSCSF_LOCAL]) PetscFunctionReturn(0);
1480cd620004SJunchao Zhang   if (rootmtype != leafmtype) { /* Uncommon case */
1481cd620004SJunchao Zhang      /* The local communication has to go through pack and unpack */
1482cd620004SJunchao Zhang     ierr = PetscSFLinkPackRootData(sf,link,PETSCSF_LOCAL,rootdata);CHKERRQ(ierr);
1483*f01131f0SJunchao Zhang     ierr = PetscSFLinkMemcpy(sf,link,leafmtype,link->leafbuf[PETSCSF_LOCAL][leafmtype],rootmtype,link->rootbuf[PETSCSF_LOCAL][rootmtype],sf->leafbuflen[PETSCSF_LOCAL]*link->unitbytes);CHKERRQ(ierr);
1484cd620004SJunchao Zhang     ierr = PetscSFLinkUnpackLeafData(sf,link,PETSCSF_LOCAL,leafdata,op);CHKERRQ(ierr);
1485cd620004SJunchao Zhang   } else {
1486cd620004SJunchao Zhang     if (bas->rootcontig[PETSCSF_LOCAL]) { /* If root indices are contiguous, Scatter becomes Unpack */
1487cd620004SJunchao Zhang       ierr = PetscSFLinkGetUnpackAndOp(link,leafmtype,op,sf->leafdups[PETSCSF_LOCAL],&UnpackAndOp);CHKERRQ(ierr);
1488cd620004SJunchao Zhang       rootdata = (const char*)rootdata + bas->rootstart[PETSCSF_LOCAL]*link->unitbytes; /* Make rootdata point to start of the buffer */
1489cd620004SJunchao Zhang       if (UnpackAndOp) {
1490cd620004SJunchao Zhang         ierr = PetscSFLinkGetLeafIndices(sf,link,leafmtype,PETSCSF_LOCAL,&count,&leafstart,&leafindices);CHKERRQ(ierr);
1491cd620004SJunchao Zhang         ierr = (*UnpackAndOp)(link,count,leafstart,leafindices,sf->leafpackopt[PETSCSF_LOCAL],leafdata,rootdata);CHKERRQ(ierr);
1492cd620004SJunchao Zhang       } else {
1493cd620004SJunchao Zhang         ierr = PetscSFLinkGetLeafIndices(sf,link,PETSC_MEMTYPE_HOST,PETSCSF_LOCAL,&count,&leafstart,&leafindices);CHKERRQ(ierr);
1494cd620004SJunchao Zhang         ierr = PetscSFLinkUnpackDataWithMPIReduceLocal(sf,link,count,leafstart,leafindices,leafdata,rootdata,op);CHKERRQ(ierr);
1495cd620004SJunchao Zhang       }
1496cd620004SJunchao Zhang     } else { /* ScatterAndOp */
1497cd620004SJunchao Zhang       ierr = PetscSFLinkGetScatterAndOp(link,leafmtype,op,sf->leafdups[PETSCSF_LOCAL],&ScatterAndOp);CHKERRQ(ierr);
1498cd620004SJunchao Zhang       if (ScatterAndOp) {
1499cd620004SJunchao Zhang         ierr = PetscSFLinkGetRootIndices(sf,link,rootmtype,PETSCSF_LOCAL,&count,&rootstart,&rootindices);CHKERRQ(ierr);
1500cd620004SJunchao Zhang         ierr = PetscSFLinkGetLeafIndices(sf,link,leafmtype,PETSCSF_LOCAL,&count,&leafstart,&leafindices);CHKERRQ(ierr);
1501cd620004SJunchao Zhang         ierr = (*ScatterAndOp)(link,count,rootstart,rootindices,rootdata,leafstart,leafindices,leafdata);CHKERRQ(ierr);
1502cd620004SJunchao Zhang       } else {
1503cd620004SJunchao Zhang         ierr = PetscSFLinkGetRootIndices(sf,link,PETSC_MEMTYPE_HOST,PETSCSF_LOCAL,&count,NULL,&rootindices);CHKERRQ(ierr);
1504cd620004SJunchao Zhang         ierr = PetscSFLinkGetLeafIndices(sf,link,PETSC_MEMTYPE_HOST,PETSCSF_LOCAL,NULL,&leafstart,&leafindices);CHKERRQ(ierr);
1505cd620004SJunchao Zhang         ierr = PetscSFLinkScatterDataWithMPIReduceLocal(sf,link,count,rootindices,rootdata,leafstart,leafindices,leafdata,op);CHKERRQ(ierr);
1506cd620004SJunchao Zhang       }
1507cd620004SJunchao Zhang     }
1508cd620004SJunchao Zhang   }
1509cd620004SJunchao Zhang   PetscFunctionReturn(0);
1510cd620004SJunchao Zhang }
1511cd620004SJunchao Zhang 
1512cd620004SJunchao Zhang /* Reduce leafdata to rootdata locally */
1513cd620004SJunchao Zhang PetscErrorCode PetscSFLinkReduceLocal(PetscSF sf,PetscSFLink link,const void *leafdata,void *rootdata,MPI_Op op)
1514cd620004SJunchao Zhang {
1515cd620004SJunchao Zhang   PetscErrorCode       ierr;
1516cd620004SJunchao Zhang   const PetscInt       *rootindices = NULL,*leafindices = NULL;
1517cd620004SJunchao Zhang   PetscInt             count,rootstart,leafstart;
1518cd620004SJunchao Zhang   PetscSF_Basic        *bas = (PetscSF_Basic*)sf->data;
1519cd620004SJunchao Zhang   PetscErrorCode       (*UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*) = NULL;
1520cd620004SJunchao Zhang   PetscErrorCode       (*ScatterAndOp)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*) = NULL;
1521cd620004SJunchao Zhang   const PetscMemType   rootmtype = link->rootmtype,leafmtype = link->leafmtype;
1522cd620004SJunchao Zhang 
1523cd620004SJunchao Zhang   PetscFunctionBegin;
1524cd620004SJunchao Zhang   if (!sf->leafbuflen[PETSCSF_LOCAL]) PetscFunctionReturn(0);
1525cd620004SJunchao Zhang   if (rootmtype != leafmtype) {
1526cd620004SJunchao Zhang     /* The local communication has to go through pack and unpack */
1527cd620004SJunchao Zhang     ierr = PetscSFLinkPackLeafData(sf,link,PETSCSF_LOCAL,leafdata);CHKERRQ(ierr);
1528*f01131f0SJunchao Zhang     ierr = PetscSFLinkMemcpy(sf,link,rootmtype,link->rootbuf[PETSCSF_LOCAL][rootmtype],leafmtype,link->leafbuf[PETSCSF_LOCAL][leafmtype],bas->rootbuflen[PETSCSF_LOCAL]*link->unitbytes);CHKERRQ(ierr);
1529cd620004SJunchao Zhang     ierr = PetscSFLinkUnpackRootData(sf,link,PETSCSF_LOCAL,rootdata,op);CHKERRQ(ierr);
1530cd620004SJunchao Zhang   } else {
1531cd620004SJunchao Zhang     if (sf->leafcontig[PETSCSF_LOCAL]) {
1532cd620004SJunchao Zhang       /* If leaf indices are contiguous, Scatter becomes Unpack */
1533cd620004SJunchao Zhang       ierr = PetscSFLinkGetUnpackAndOp(link,rootmtype,op,bas->rootdups[PETSCSF_LOCAL],&UnpackAndOp);CHKERRQ(ierr);
1534cd620004SJunchao Zhang       leafdata = (const char*)leafdata + sf->leafstart[PETSCSF_LOCAL]*link->unitbytes; /* Make leafdata point to start of the buffer */
1535cd620004SJunchao Zhang       if (UnpackAndOp) {
1536cd620004SJunchao Zhang         ierr = PetscSFLinkGetRootIndices(sf,link,rootmtype,PETSCSF_LOCAL,&count,&rootstart,&rootindices);CHKERRQ(ierr);
1537cd620004SJunchao Zhang         ierr = (*UnpackAndOp)(link,count,rootstart,rootindices,bas->rootpackopt[PETSCSF_LOCAL],rootdata,leafdata);CHKERRQ(ierr);
1538cd620004SJunchao Zhang       } else {
1539cd620004SJunchao Zhang         ierr = PetscSFLinkGetRootIndices(sf,link,PETSC_MEMTYPE_HOST,PETSCSF_LOCAL,&count,&rootstart,&rootindices);CHKERRQ(ierr);
1540cd620004SJunchao Zhang         ierr = PetscSFLinkUnpackDataWithMPIReduceLocal(sf,link,count,rootstart,rootindices,rootdata,leafdata,op);CHKERRQ(ierr);
1541cd620004SJunchao Zhang       }
1542cd620004SJunchao Zhang     } else {
1543cd620004SJunchao Zhang       ierr = PetscSFLinkGetScatterAndOp(link,rootmtype,op,bas->rootdups[PETSCSF_LOCAL],&ScatterAndOp);CHKERRQ(ierr);
1544cd620004SJunchao Zhang       if (ScatterAndOp) {
1545cd620004SJunchao Zhang         ierr = PetscSFLinkGetRootIndices(sf,link,rootmtype,PETSCSF_LOCAL,&count,&rootstart,&rootindices);CHKERRQ(ierr);
1546cd620004SJunchao Zhang         ierr = PetscSFLinkGetLeafIndices(sf,link,leafmtype,PETSCSF_LOCAL,NULL,&leafstart,&leafindices);CHKERRQ(ierr);
1547cd620004SJunchao Zhang         ierr = (*ScatterAndOp)(link,count,leafstart,leafindices,leafdata,rootstart,rootindices,rootdata);CHKERRQ(ierr);
1548cd620004SJunchao Zhang       } else {
1549cd620004SJunchao Zhang         ierr = PetscSFLinkGetRootIndices(sf,link,PETSC_MEMTYPE_HOST,PETSCSF_LOCAL,&count,&rootstart,&rootindices);CHKERRQ(ierr);
1550cd620004SJunchao Zhang         ierr = PetscSFLinkGetLeafIndices(sf,link,PETSC_MEMTYPE_HOST,PETSCSF_LOCAL,NULL,NULL,&leafindices);CHKERRQ(ierr);
1551cd620004SJunchao Zhang         ierr = PetscSFLinkScatterDataWithMPIReduceLocal(sf,link,count,leafindices,leafdata,rootstart,rootindices,rootdata,op);CHKERRQ(ierr);
1552cd620004SJunchao Zhang       }
1553cd620004SJunchao Zhang     }
1554cd620004SJunchao Zhang   }
1555cd620004SJunchao Zhang   PetscFunctionReturn(0);
1556cd620004SJunchao Zhang }
1557cd620004SJunchao Zhang 
1558cd620004SJunchao Zhang /* Fetch rootdata to leafdata and leafupdate locally */
1559cd620004SJunchao Zhang PetscErrorCode PetscSFLinkFetchAndOpLocal(PetscSF sf,PetscSFLink link,void *rootdata,const void *leafdata,void *leafupdate,MPI_Op op)
1560cd620004SJunchao Zhang {
1561cd620004SJunchao Zhang   PetscErrorCode       ierr;
1562cd620004SJunchao Zhang   const PetscInt       *rootindices = NULL,*leafindices = NULL;
1563cd620004SJunchao Zhang   PetscInt             count,rootstart,leafstart;
1564cd620004SJunchao Zhang   PetscSF_Basic        *bas = (PetscSF_Basic*)sf->data;
1565cd620004SJunchao Zhang   PetscErrorCode       (*FetchAndOpLocal)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,void*,PetscInt,const PetscInt*,const void*,void*) = NULL;
1566cd620004SJunchao Zhang   const PetscMemType   rootmtype = link->rootmtype,leafmtype = link->leafmtype;
1567cd620004SJunchao Zhang 
1568cd620004SJunchao Zhang   PetscFunctionBegin;
1569cd620004SJunchao Zhang   if (!bas->rootbuflen[PETSCSF_LOCAL]) PetscFunctionReturn(0);
1570cd620004SJunchao Zhang   if (rootmtype != leafmtype) {
1571cd620004SJunchao Zhang    /* The local communication has to go through pack and unpack */
1572cd620004SJunchao Zhang     SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Doing PetscSFFetchAndOp with rootdata and leafdata on opposite side of CPU and GPU");
1573cd620004SJunchao Zhang   } else {
1574cd620004SJunchao Zhang     ierr = PetscSFLinkGetRootIndices(sf,link,rootmtype,PETSCSF_LOCAL,&count,&rootstart,&rootindices);CHKERRQ(ierr);
1575cd620004SJunchao Zhang     ierr = PetscSFLinkGetLeafIndices(sf,link,leafmtype,PETSCSF_LOCAL,NULL,&leafstart,&leafindices);CHKERRQ(ierr);
1576cd620004SJunchao Zhang     ierr = PetscSFLinkGetFetchAndOpLocal(link,rootmtype,op,bas->rootdups[PETSCSF_LOCAL],&FetchAndOpLocal);CHKERRQ(ierr);
1577cd620004SJunchao Zhang     ierr = (*FetchAndOpLocal)(link,count,rootstart,rootindices,rootdata,leafstart,leafindices,leafdata,leafupdate);CHKERRQ(ierr);
1578cd620004SJunchao Zhang   }
157940e23c03SJunchao Zhang   PetscFunctionReturn(0);
158040e23c03SJunchao Zhang }
158140e23c03SJunchao Zhang 
158240e23c03SJunchao Zhang /*
1583cd620004SJunchao Zhang   Create per-rank pack/unpack optimizations based on indice patterns
158440e23c03SJunchao Zhang 
158540e23c03SJunchao Zhang    Input Parameters:
1586b23bfdefSJunchao Zhang   +  n       - Number of target ranks
1587eb02082bSJunchao Zhang   .  offset  - [n+1] For the i-th rank, its associated indices are idx[offset[i], offset[i+1]). offset[0] needs not to be 0.
1588b23bfdefSJunchao Zhang   -  idx     - [*]   Array storing indices
158940e23c03SJunchao Zhang 
159040e23c03SJunchao Zhang    Output Parameters:
1591cd620004SJunchao Zhang   +  opt     - Pack optimizations. NULL if no optimizations.
159240e23c03SJunchao Zhang */
1593cd620004SJunchao Zhang PetscErrorCode PetscSFCreatePackOpt(PetscInt n,const PetscInt *offset,const PetscInt *idx,PetscSFPackOpt *out)
159440e23c03SJunchao Zhang {
159540e23c03SJunchao Zhang   PetscErrorCode ierr;
159640e23c03SJunchao Zhang   PetscInt       i,j,k,n_copies,tot_copies = 0,step;
1597b23bfdefSJunchao Zhang   PetscBool      strided,optimized = PETSC_FALSE;
159840e23c03SJunchao Zhang   PetscSFPackOpt opt;
159940e23c03SJunchao Zhang 
160040e23c03SJunchao Zhang   PetscFunctionBegin;
1601b23bfdefSJunchao Zhang   if (!n) {
1602b23bfdefSJunchao Zhang     *out = NULL;
1603b23bfdefSJunchao Zhang     PetscFunctionReturn(0);
1604b23bfdefSJunchao Zhang   }
1605b23bfdefSJunchao Zhang 
160640e23c03SJunchao Zhang   ierr = PetscCalloc1(1,&opt);CHKERRQ(ierr);
1607b23bfdefSJunchao Zhang   ierr = PetscCalloc3(n,&opt->type,n+1,&opt->offset,n+1,&opt->copy_offset);CHKERRQ(ierr);
1608b23bfdefSJunchao Zhang   ierr = PetscArraycpy(opt->offset,offset,n+1);CHKERRQ(ierr);
1609cd620004SJunchao Zhang   /* Make opt->offset[] zero-based. If one calls this routine with non-zero offset[0], one should use packing routine in this way, Pack(count,idx+offset[0],packopt,...) */
1610cd620004SJunchao Zhang   if (offset[0]) {for (i=0; i<n+1; i++) opt->offset[i] -= offset[0];}
1611b23bfdefSJunchao Zhang 
1612b23bfdefSJunchao Zhang   opt->n = n;
161340e23c03SJunchao Zhang 
1614fb61b9e4SStefano Zampini   /* Check if the indices are piece-wise contiguous (if yes, we can optimize a packing with multiple memcpy's ) */
161540e23c03SJunchao Zhang   for (i=0; i<n; i++) { /* for each target processor */
161640e23c03SJunchao Zhang     /* Scan indices to count n_copies -- the number of contiguous pieces for i-th target */
161740e23c03SJunchao Zhang     n_copies = 1;
161840e23c03SJunchao Zhang     for (j=offset[i]; j<offset[i+1]-1; j++) {
161940e23c03SJunchao Zhang       if (idx[j]+1 != idx[j+1]) n_copies++;
162040e23c03SJunchao Zhang     }
162140e23c03SJunchao Zhang     /* If the average length (in no. of indices) of contiguous pieces is long enough, say >=32,
162240e23c03SJunchao Zhang        then it is worth using memcpy for this target. 32 is an arbitrarily chosen number.
162340e23c03SJunchao Zhang      */
162440e23c03SJunchao Zhang     if ((offset[i+1]-offset[i])/n_copies >= 32) {
1625b23bfdefSJunchao Zhang       opt->type[i] = PETSCSF_PACKOPT_MULTICOPY;
1626b23bfdefSJunchao Zhang       optimized    = PETSC_TRUE;
162740e23c03SJunchao Zhang       tot_copies  += n_copies;
162840e23c03SJunchao Zhang     }
162940e23c03SJunchao Zhang   }
163040e23c03SJunchao Zhang 
163140e23c03SJunchao Zhang   /* Setup memcpy plan for each contiguous piece */
163240e23c03SJunchao Zhang   k    = 0; /* k-th copy */
1633b23bfdefSJunchao Zhang   ierr = PetscMalloc4(tot_copies,&opt->copy_start,tot_copies,&opt->copy_length,n,&opt->stride_step,n,&opt->stride_n);CHKERRQ(ierr);
1634b23bfdefSJunchao Zhang   for (i=0; i<n; i++) { /* for each target processor */
1635b23bfdefSJunchao Zhang     if (opt->type[i] == PETSCSF_PACKOPT_MULTICOPY) {
163640e23c03SJunchao Zhang       n_copies           = 1;
1637b23bfdefSJunchao Zhang       opt->copy_start[k] = offset[i] - offset[0];
163840e23c03SJunchao Zhang       for (j=offset[i]; j<offset[i+1]-1; j++) {
163940e23c03SJunchao Zhang         if (idx[j]+1 != idx[j+1]) { /* meet end of a copy (and next copy must exist) */
164040e23c03SJunchao Zhang           n_copies++;
1641b23bfdefSJunchao Zhang           opt->copy_start[k+1] = j-offset[0]+1;
1642b23bfdefSJunchao Zhang           opt->copy_length[k]  = opt->copy_start[k+1] - opt->copy_start[k];
164340e23c03SJunchao Zhang           k++;
164440e23c03SJunchao Zhang         }
164540e23c03SJunchao Zhang       }
164640e23c03SJunchao Zhang       /* Set copy length of the last copy for this target */
1647b23bfdefSJunchao Zhang       opt->copy_length[k] = j-offset[0]+1 - opt->copy_start[k];
164840e23c03SJunchao Zhang       k++;
164940e23c03SJunchao Zhang     }
1650b23bfdefSJunchao Zhang     /* Set offset for next target. When opt->type[i]=PETSCSF_PACKOPT_NONE, copy_offsets[i]=copy_offsets[i+1] */
165140e23c03SJunchao Zhang     opt->copy_offset[i+1] = k;
165240e23c03SJunchao Zhang   }
165340e23c03SJunchao Zhang 
165440e23c03SJunchao Zhang   /* Last chance! If the indices do not have long contiguous pieces, are they strided? */
165540e23c03SJunchao Zhang   for (i=0; i<n; i++) { /* for each remote */
1656b23bfdefSJunchao Zhang     if (opt->type[i]==PETSCSF_PACKOPT_NONE && (offset[i+1] - offset[i]) >= 16) { /* few indices (<16) are not worth striding */
165740e23c03SJunchao Zhang       strided = PETSC_TRUE;
165840e23c03SJunchao Zhang       step    = idx[offset[i]+1] - idx[offset[i]];
165940e23c03SJunchao Zhang       for (j=offset[i]; j<offset[i+1]-1; j++) {
166040e23c03SJunchao Zhang         if (idx[j]+step != idx[j+1]) { strided = PETSC_FALSE; break; }
166140e23c03SJunchao Zhang       }
166240e23c03SJunchao Zhang       if (strided) {
1663b23bfdefSJunchao Zhang         opt->type[i]         = PETSCSF_PACKOPT_STRIDE;
166440e23c03SJunchao Zhang         opt->stride_step[i]  = step;
166540e23c03SJunchao Zhang         opt->stride_n[i]     = offset[i+1] - offset[i];
1666b23bfdefSJunchao Zhang         optimized            = PETSC_TRUE;
166740e23c03SJunchao Zhang       }
166840e23c03SJunchao Zhang     }
166940e23c03SJunchao Zhang   }
1670b23bfdefSJunchao Zhang   /* If no rank gets optimized, free arrays to save memory */
1671b23bfdefSJunchao Zhang   if (!optimized) {
1672b23bfdefSJunchao Zhang     ierr = PetscFree3(opt->type,opt->offset,opt->copy_offset);CHKERRQ(ierr);
1673b23bfdefSJunchao Zhang     ierr = PetscFree4(opt->copy_start,opt->copy_length,opt->stride_step,opt->stride_n);CHKERRQ(ierr);
167440e23c03SJunchao Zhang     ierr = PetscFree(opt);CHKERRQ(ierr);
167540e23c03SJunchao Zhang     *out = NULL;
167640e23c03SJunchao Zhang   } else *out = opt;
167740e23c03SJunchao Zhang   PetscFunctionReturn(0);
167840e23c03SJunchao Zhang }
167940e23c03SJunchao Zhang 
1680cd620004SJunchao Zhang PetscErrorCode PetscSFDestroyPackOpt(PetscSFPackOpt *out)
168140e23c03SJunchao Zhang {
168240e23c03SJunchao Zhang   PetscErrorCode ierr;
168340e23c03SJunchao Zhang   PetscSFPackOpt opt = *out;
168440e23c03SJunchao Zhang 
168540e23c03SJunchao Zhang   PetscFunctionBegin;
168640e23c03SJunchao Zhang   if (opt) {
1687b23bfdefSJunchao Zhang     ierr = PetscFree3(opt->type,opt->offset,opt->copy_offset);CHKERRQ(ierr);
1688b23bfdefSJunchao Zhang     ierr = PetscFree4(opt->copy_start,opt->copy_length,opt->stride_step,opt->stride_n);CHKERRQ(ierr);
168940e23c03SJunchao Zhang     ierr = PetscFree(opt);CHKERRQ(ierr);
169040e23c03SJunchao Zhang     *out = NULL;
169140e23c03SJunchao Zhang   }
169240e23c03SJunchao Zhang   PetscFunctionReturn(0);
169340e23c03SJunchao Zhang }
1694cd620004SJunchao Zhang 
1695cd620004SJunchao Zhang PetscErrorCode PetscSFSetUpPackFields(PetscSF sf)
1696cd620004SJunchao Zhang {
1697cd620004SJunchao Zhang   PetscErrorCode ierr;
1698cd620004SJunchao Zhang   PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
1699cd620004SJunchao Zhang   PetscInt       i,j;
1700cd620004SJunchao Zhang 
1701cd620004SJunchao Zhang   PetscFunctionBegin;
1702cd620004SJunchao Zhang   /* [0] for PETSCSF_LOCAL and [1] for PETSCSF_REMOTE in the following */
1703cd620004SJunchao Zhang   for (i=0; i<2; i++) { /* Set defaults */
1704cd620004SJunchao Zhang     sf->leafstart[i]   = 0;
1705cd620004SJunchao Zhang     sf->leafcontig[i]  = PETSC_TRUE;
1706cd620004SJunchao Zhang     sf->leafdups[i]    = PETSC_FALSE;
1707cd620004SJunchao Zhang     bas->rootstart[i]  = 0;
1708cd620004SJunchao Zhang     bas->rootcontig[i] = PETSC_TRUE;
1709cd620004SJunchao Zhang     bas->rootdups[i]   = PETSC_FALSE;
1710cd620004SJunchao Zhang   }
1711cd620004SJunchao Zhang 
1712cd620004SJunchao Zhang   sf->leafbuflen[0] = sf->roffset[sf->ndranks];
1713cd620004SJunchao Zhang   sf->leafbuflen[1] = sf->roffset[sf->nranks] - sf->roffset[sf->ndranks];
1714cd620004SJunchao Zhang 
1715cd620004SJunchao Zhang   if (sf->leafbuflen[0]) sf->leafstart[0] = sf->rmine[0];
1716cd620004SJunchao Zhang   if (sf->leafbuflen[1]) sf->leafstart[1] = sf->rmine[sf->roffset[sf->ndranks]];
1717cd620004SJunchao Zhang 
1718cd620004SJunchao Zhang   /* Are leaf indices for self and remote contiguous? If yes, it is best for pack/unpack */
1719cd620004SJunchao Zhang   for (i=0; i<sf->roffset[sf->ndranks]; i++) { /* self */
1720cd620004SJunchao Zhang     if (sf->rmine[i] != sf->leafstart[0]+i) {sf->leafcontig[0] = PETSC_FALSE; break;}
1721cd620004SJunchao Zhang   }
1722cd620004SJunchao Zhang   for (i=sf->roffset[sf->ndranks],j=0; i<sf->roffset[sf->nranks]; i++,j++) { /* remote */
1723cd620004SJunchao Zhang     if (sf->rmine[i] != sf->leafstart[1]+j) {sf->leafcontig[1] = PETSC_FALSE; break;}
1724cd620004SJunchao Zhang   }
1725cd620004SJunchao Zhang 
1726cd620004SJunchao Zhang   /* If not, see if we can have per-rank optimizations by doing index analysis */
1727cd620004SJunchao Zhang   if (!sf->leafcontig[0]) {ierr = PetscSFCreatePackOpt(sf->ndranks,            sf->roffset,             sf->rmine, &sf->leafpackopt[0]);CHKERRQ(ierr);}
1728cd620004SJunchao Zhang   if (!sf->leafcontig[1]) {ierr = PetscSFCreatePackOpt(sf->nranks-sf->ndranks, sf->roffset+sf->ndranks, sf->rmine, &sf->leafpackopt[1]);CHKERRQ(ierr);}
1729cd620004SJunchao Zhang 
1730cd620004SJunchao Zhang   /* Are root indices for self and remote contiguous? */
1731cd620004SJunchao Zhang   bas->rootbuflen[0] = bas->ioffset[bas->ndiranks];
1732cd620004SJunchao Zhang   bas->rootbuflen[1] = bas->ioffset[bas->niranks] - bas->ioffset[bas->ndiranks];
1733cd620004SJunchao Zhang 
1734cd620004SJunchao Zhang   if (bas->rootbuflen[0]) bas->rootstart[0] = bas->irootloc[0];
1735cd620004SJunchao Zhang   if (bas->rootbuflen[1]) bas->rootstart[1] = bas->irootloc[bas->ioffset[bas->ndiranks]];
1736cd620004SJunchao Zhang 
1737cd620004SJunchao Zhang   for (i=0; i<bas->ioffset[bas->ndiranks]; i++) {
1738cd620004SJunchao Zhang     if (bas->irootloc[i] != bas->rootstart[0]+i) {bas->rootcontig[0] = PETSC_FALSE; break;}
1739cd620004SJunchao Zhang   }
1740cd620004SJunchao Zhang   for (i=bas->ioffset[bas->ndiranks],j=0; i<bas->ioffset[bas->niranks]; i++,j++) {
1741cd620004SJunchao Zhang     if (bas->irootloc[i] != bas->rootstart[1]+j) {bas->rootcontig[1] = PETSC_FALSE; break;}
1742cd620004SJunchao Zhang   }
1743cd620004SJunchao Zhang 
1744cd620004SJunchao Zhang   if (!bas->rootcontig[0]) {ierr = PetscSFCreatePackOpt(bas->ndiranks,              bas->ioffset,               bas->irootloc, &bas->rootpackopt[0]);CHKERRQ(ierr);}
1745cd620004SJunchao Zhang   if (!bas->rootcontig[1]) {ierr = PetscSFCreatePackOpt(bas->niranks-bas->ndiranks, bas->ioffset+bas->ndiranks, bas->irootloc, &bas->rootpackopt[1]);CHKERRQ(ierr);}
1746cd620004SJunchao Zhang 
1747cd620004SJunchao Zhang #if defined(PETSC_HAVE_CUDA)
1748cd620004SJunchao Zhang     /* Check dups in indices so that CUDA unpacking kernels can use cheaper regular instructions instead of atomics when they know there are no data race chances */
1749cd620004SJunchao Zhang   if (!sf->leafcontig[0])  {ierr = PetscCheckDupsInt(sf->leafbuflen[0],  sf->rmine,                                 &sf->leafdups[0]);CHKERRQ(ierr);}
1750cd620004SJunchao Zhang   if (!sf->leafcontig[1])  {ierr = PetscCheckDupsInt(sf->leafbuflen[1],  sf->rmine+sf->roffset[sf->ndranks],        &sf->leafdups[1]);CHKERRQ(ierr);}
1751cd620004SJunchao Zhang   if (!bas->rootcontig[0]) {ierr = PetscCheckDupsInt(bas->rootbuflen[0], bas->irootloc,                             &bas->rootdups[0]);CHKERRQ(ierr);}
1752cd620004SJunchao Zhang   if (!bas->rootcontig[1]) {ierr = PetscCheckDupsInt(bas->rootbuflen[1], bas->irootloc+bas->ioffset[bas->ndiranks], &bas->rootdups[1]);CHKERRQ(ierr);}
1753cd620004SJunchao Zhang #endif
1754cd620004SJunchao Zhang 
1755cd620004SJunchao Zhang   PetscFunctionReturn(0);
1756cd620004SJunchao Zhang }
1757cd620004SJunchao Zhang 
1758cd620004SJunchao Zhang PetscErrorCode PetscSFResetPackFields(PetscSF sf)
1759cd620004SJunchao Zhang {
1760cd620004SJunchao Zhang   PetscErrorCode ierr;
1761cd620004SJunchao Zhang   PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
1762cd620004SJunchao Zhang   PetscInt       i;
1763cd620004SJunchao Zhang 
1764cd620004SJunchao Zhang   PetscFunctionBegin;
1765cd620004SJunchao Zhang   for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) {
1766cd620004SJunchao Zhang     ierr = PetscSFDestroyPackOpt(&sf->leafpackopt[i]);CHKERRQ(ierr);
1767cd620004SJunchao Zhang     ierr = PetscSFDestroyPackOpt(&bas->rootpackopt[i]);CHKERRQ(ierr);
1768cd620004SJunchao Zhang   }
1769cd620004SJunchao Zhang   PetscFunctionReturn(0);
1770cd620004SJunchao Zhang }
1771