xref: /petsc/src/vec/is/sf/impls/basic/sfpack.c (revision cd62000487352d7c447823794a883667dfb859d7)
140e23c03SJunchao Zhang 
240e23c03SJunchao Zhang #include <../src/vec/is/sf/impls/basic/sfpack.h>
340e23c03SJunchao Zhang #include <../src/vec/is/sf/impls/basic/sfbasic.h>
440e23c03SJunchao Zhang 
5eb02082bSJunchao Zhang #if defined(PETSC_HAVE_CUDA)
6eb02082bSJunchao Zhang #include <cuda_runtime.h>
7eb02082bSJunchao Zhang #endif
840e23c03SJunchao Zhang /*
940e23c03SJunchao Zhang  * MPI_Reduce_local is not really useful because it can't handle sparse data and it vectorizes "in the wrong direction",
1040e23c03SJunchao Zhang  * therefore we pack data types manually. This file defines packing routines for the standard data types.
1140e23c03SJunchao Zhang  */
1240e23c03SJunchao Zhang 
13*cd620004SJunchao Zhang #define CPPJoin4(a,b,c,d)  a##_##b##_##c##_##d
1440e23c03SJunchao Zhang 
15*cd620004SJunchao Zhang /* Operations working like s += t */
16*cd620004SJunchao Zhang #define OP_BINARY(op,s,t)   do {(s) = (s) op (t);  } while(0)      /* binary ops in the middle such as +, *, && etc. */
17*cd620004SJunchao Zhang #define OP_FUNCTION(op,s,t) do {(s) = op((s),(t)); } while(0)      /* ops like a function, such as PetscMax, PetscMin */
18*cd620004SJunchao Zhang #define OP_LXOR(op,s,t)     do {(s) = (!(s)) != (!(t));} while(0)  /* logical exclusive OR */
19*cd620004SJunchao Zhang #define OP_ASSIGN(op,s,t)   do {(s) = (t);} while(0)
20*cd620004SJunchao Zhang /* Ref MPI MAXLOC */
21*cd620004SJunchao Zhang #define OP_XLOC(op,s,t) \
22*cd620004SJunchao Zhang   do {                                       \
23*cd620004SJunchao Zhang     if ((s).u == (t).u) (s).i = PetscMin((s).i,(t).i); \
24*cd620004SJunchao Zhang     else if (!((s).u op (t).u)) s = t;           \
25*cd620004SJunchao Zhang   } while(0)
2640e23c03SJunchao Zhang 
2740e23c03SJunchao Zhang /* DEF_PackFunc - macro defining a Pack routine
2840e23c03SJunchao Zhang 
2940e23c03SJunchao Zhang    Arguments of the macro:
30b23bfdefSJunchao Zhang    +Type      Type of the basic data in an entry, i.e., int, PetscInt, PetscReal etc. It is not the type of an entry.
31b23bfdefSJunchao Zhang    .BS        Block size for vectorization. It is a factor of bs.
32b23bfdefSJunchao Zhang    -EQ        (bs == BS) ? 1 : 0. EQ is a compile-time const to help compiler optimizations. See below.
3340e23c03SJunchao Zhang 
3440e23c03SJunchao Zhang    Arguments of the Pack routine:
35*cd620004SJunchao Zhang    +count     Number of indices in idx[].
36*cd620004SJunchao Zhang    .start     If indices are contiguous, it is the first index; otherwise, not used.
37*cd620004SJunchao Zhang    .idx       Indices of entries to packed. NULL means contiguous indices, that is [start,start+count).
38eb02082bSJunchao Zhang    .link      Provide a context for the current call, such as link->bs, number of basic types in an entry. Ex. if unit is MPI_2INT, then bs=2 and the basic type is int.
39*cd620004SJunchao Zhang    .opt       Per-remote pack optimization plan. NULL means no such plan.
40*cd620004SJunchao Zhang    .unpacked  Address of the unpacked data. The entries will be packed are unpacked[idx[i]],for i in [0,count).
41*cd620004SJunchao Zhang    -packed    Address of the packed data.
4240e23c03SJunchao Zhang  */
43b23bfdefSJunchao Zhang #define DEF_PackFunc(Type,BS,EQ) \
44*cd620004SJunchao Zhang   static PetscErrorCode CPPJoin4(Pack,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,const PetscInt *idx,PetscSFPackOpt opt,const void *unpacked,void *packed) \
45b23bfdefSJunchao Zhang   {                                                                                                          \
4640e23c03SJunchao Zhang     PetscErrorCode ierr;                                                                                     \
47b23bfdefSJunchao Zhang     const Type     *u = (const Type*)unpacked,*u2;                                                           \
48b23bfdefSJunchao Zhang     Type           *p = (Type*)packed,*p2;                                                                   \
49eb02082bSJunchao Zhang     PetscInt       i,j,k,l,r,step,bs=link->bs;                                                               \
50b23bfdefSJunchao Zhang     const PetscInt *idx2,M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */    \
51b23bfdefSJunchao Zhang     const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */                \
5240e23c03SJunchao Zhang     PetscFunctionBegin;                                                                                      \
53*cd620004SJunchao Zhang     if (!idx) {ierr = PetscArraycpy(p,u+start*bs,MBS*count);CHKERRQ(ierr);}  /* Indices are contiguous */    \
54b23bfdefSJunchao Zhang     else if (!opt) { /* No optimizations available */                                                        \
55b23bfdefSJunchao Zhang       for (i=0; i<count; i++)                                                                                \
56eb02082bSJunchao Zhang         for (j=0; j<M; j++)     /* Decent compilers should eliminate this loop when M = const 1 */           \
57eb02082bSJunchao Zhang           for (k=0; k<BS; k++)  /* Compiler either unrolls (BS=1) or vectorizes (BS=2,4,8,etc) this loop */  \
58b23bfdefSJunchao Zhang             p[i*MBS+j*BS+k] = u[idx[i]*MBS+j*BS+k];                                                          \
59b23bfdefSJunchao Zhang     } else {                                                                                                 \
60b23bfdefSJunchao Zhang       for (r=0; r<opt->n; r++) {                                                                             \
61b23bfdefSJunchao Zhang         p2  = p + opt->offset[r]*MBS;                                                                        \
62b23bfdefSJunchao Zhang         if (opt->type[r] == PETSCSF_PACKOPT_NONE) {                                                          \
63b23bfdefSJunchao Zhang           idx2 = idx + opt->offset[r];                                                                       \
64b23bfdefSJunchao Zhang           for (i=0; i<opt->offset[r+1]-opt->offset[r]; i++)                                                  \
65b23bfdefSJunchao Zhang             for (j=0; j<M; j++)                                                                              \
66b23bfdefSJunchao Zhang               for (k=0; k<BS; k++)                                                                           \
67b23bfdefSJunchao Zhang                 p2[i*MBS+j*BS+k] = u[idx2[i]*MBS+j*BS+k];                                                    \
68b23bfdefSJunchao Zhang         } else if (opt->type[r] == PETSCSF_PACKOPT_MULTICOPY) {                                              \
6940e23c03SJunchao Zhang           for (i=opt->copy_offset[r]; i<opt->copy_offset[r+1]; i++) {                                        \
70b23bfdefSJunchao Zhang             u2   = u + idx[opt->copy_start[i]]*MBS;                                                          \
71b23bfdefSJunchao Zhang             l    = opt->copy_length[i]*MBS; /* length in basic type such as MPI_INT */                       \
72b23bfdefSJunchao Zhang             ierr = PetscArraycpy(p2,u2,l);CHKERRQ(ierr);                                                     \
73b23bfdefSJunchao Zhang             p2  += l;                                                                                        \
7440e23c03SJunchao Zhang           }                                                                                                  \
75b23bfdefSJunchao Zhang         } else if (opt->type[r] == PETSCSF_PACKOPT_STRIDE) {                                                 \
76b23bfdefSJunchao Zhang           u2   = u + idx[opt->offset[r]]*MBS;                                                                \
7740e23c03SJunchao Zhang           step = opt->stride_step[r];                                                                        \
7840e23c03SJunchao Zhang           for (i=0; i<opt->stride_n[r]; i++)                                                                 \
79b23bfdefSJunchao Zhang             for (j=0; j<MBS; j++) p2[i*MBS+j] = u2[i*step*MBS+j];                                            \
80b23bfdefSJunchao Zhang         } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Unknown SFPack optimzation type %D",opt->type[r]);   \
8140e23c03SJunchao Zhang       }                                                                                                      \
8240e23c03SJunchao Zhang     }                                                                                                        \
8340e23c03SJunchao Zhang     PetscFunctionReturn(0);                                                                                  \
8440e23c03SJunchao Zhang   }
8540e23c03SJunchao Zhang 
86*cd620004SJunchao Zhang /* DEF_Action - macro defining a UnpackAndInsert routine that unpacks data from a contiguous buffer
87*cd620004SJunchao Zhang                 and inserts into a sparse array.
8840e23c03SJunchao Zhang 
8940e23c03SJunchao Zhang    Arguments:
90b23bfdefSJunchao Zhang   .Type       Type of the data
9140e23c03SJunchao Zhang   .BS         Block size for vectorization
92b23bfdefSJunchao Zhang   .EQ        (bs == BS) ? 1 : 0. EQ is a compile-time const.
9340e23c03SJunchao Zhang 
9440e23c03SJunchao Zhang   Notes:
9540e23c03SJunchao Zhang    This macro is not combined with DEF_ActionAndOp because we want to use memcpy in this macro.
9640e23c03SJunchao Zhang  */
97*cd620004SJunchao Zhang #define DEF_UnpackFunc(Type,BS,EQ)               \
98*cd620004SJunchao Zhang   static PetscErrorCode CPPJoin4(UnpackAndInsert,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,const PetscInt *idx,PetscSFPackOpt opt,void *unpacked,const void *packed) \
99b23bfdefSJunchao Zhang   {                                                                                                          \
10040e23c03SJunchao Zhang     PetscErrorCode ierr;                                                                                     \
101b23bfdefSJunchao Zhang     Type           *u = (Type*)unpacked,*u2;                                                                 \
102*cd620004SJunchao Zhang     const Type     *p = (const Type*)packed,*p2;                                                             \
103eb02082bSJunchao Zhang     PetscInt       i,j,k,l,r,step,bs=link->bs;                                                               \
104b23bfdefSJunchao Zhang     const PetscInt *idx2,M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */    \
105b23bfdefSJunchao Zhang     const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */                \
10640e23c03SJunchao Zhang     PetscFunctionBegin;                                                                                      \
107b23bfdefSJunchao Zhang     if (!idx) {                                                                                              \
108*cd620004SJunchao Zhang       u2 = u + start*bs;                                                                                     \
109*cd620004SJunchao Zhang       if (u2 != p) {ierr = PetscArraycpy(u2,p,count*MBS);CHKERRQ(ierr);}                                     \
110b23bfdefSJunchao Zhang     } else if (!opt) { /* No optimizations available */                                                      \
111b23bfdefSJunchao Zhang       for (i=0; i<count; i++)                                                                                \
112b23bfdefSJunchao Zhang         for (j=0; j<M; j++)                                                                                  \
113*cd620004SJunchao Zhang           for (k=0; k<BS; k++) u[idx[i]*MBS+j*BS+k] = p[i*MBS+j*BS+k];                                       \
114b23bfdefSJunchao Zhang     } else {                                                                                                 \
115b23bfdefSJunchao Zhang       for (r=0; r<opt->n; r++) {                                                                             \
116b23bfdefSJunchao Zhang         p2 = p + opt->offset[r]*MBS;                                                                         \
117b23bfdefSJunchao Zhang         if (opt->type[r] == PETSCSF_PACKOPT_NONE) {                                                          \
118b23bfdefSJunchao Zhang           idx2 = idx + opt->offset[r];                                                                       \
119b23bfdefSJunchao Zhang           for (i=0; i<opt->offset[r+1]-opt->offset[r]; i++)                                                  \
120b23bfdefSJunchao Zhang             for (j=0; j<M; j++)                                                                              \
121*cd620004SJunchao Zhang               for (k=0; k<BS; k++) u[idx2[i]*MBS+j*BS+k] = p2[i*MBS+j*BS+k];                                 \
122b23bfdefSJunchao Zhang         } else if (opt->type[r] == PETSCSF_PACKOPT_MULTICOPY) {                                              \
12340e23c03SJunchao Zhang           for (i=opt->copy_offset[r]; i<opt->copy_offset[r+1]; i++) { /* i-th piece */                       \
124b23bfdefSJunchao Zhang             u2 = u + idx[opt->copy_start[i]]*MBS;                                                            \
125b23bfdefSJunchao Zhang             l  = opt->copy_length[i]*MBS;                                                                    \
126b23bfdefSJunchao Zhang             ierr = PetscArraycpy(u2,p2,l);CHKERRQ(ierr);                                                     \
127b23bfdefSJunchao Zhang             p2 += l;                                                                                         \
12840e23c03SJunchao Zhang           }                                                                                                  \
129b23bfdefSJunchao Zhang         } else if (opt->type[r] == PETSCSF_PACKOPT_STRIDE) {                                                 \
130b23bfdefSJunchao Zhang           u2   = u + idx[opt->offset[r]]*MBS;                                                                \
13140e23c03SJunchao Zhang           step = opt->stride_step[r];                                                                        \
13240e23c03SJunchao Zhang           for (i=0; i<opt->stride_n[r]; i++)                                                                 \
133b23bfdefSJunchao Zhang             for (j=0; j<M; j++)                                                                              \
134*cd620004SJunchao Zhang               for (k=0; k<BS; k++) u2[i*step*MBS+j*BS+k] = p2[i*MBS+j*BS+k];                                 \
135b23bfdefSJunchao Zhang         } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Unknown SFPack optimzation type %D",opt->type[r]);   \
13640e23c03SJunchao Zhang       }                                                                                                      \
13740e23c03SJunchao Zhang     }                                                                                                        \
13840e23c03SJunchao Zhang     PetscFunctionReturn(0);                                                                                  \
13940e23c03SJunchao Zhang   }
14040e23c03SJunchao Zhang 
141*cd620004SJunchao Zhang /* DEF_UnpackAndOp - macro defining a UnpackAndOp routine where Op should not be Insert
14240e23c03SJunchao Zhang 
14340e23c03SJunchao Zhang    Arguments:
144*cd620004SJunchao Zhang   +Opname     Name of the Op, such as Add, Mult, LAND, etc.
145b23bfdefSJunchao Zhang   .Type       Type of the data
14640e23c03SJunchao Zhang   .BS         Block size for vectorization
147b23bfdefSJunchao Zhang   .EQ         (bs == BS) ? 1 : 0. EQ is a compile-time const.
148*cd620004SJunchao Zhang   .Op         Operator for the op, such as +, *, &&, ||, PetscMax, PetscMin, etc.
149*cd620004SJunchao Zhang   .OpApply    Macro defining application of the op. Could be OP_BINARY, OP_FUNCTION, OP_LXOR
15040e23c03SJunchao Zhang  */
151*cd620004SJunchao Zhang #define DEF_UnpackAndOp(Type,BS,EQ,Opname,Op,OpApply) \
152*cd620004SJunchao Zhang   static PetscErrorCode CPPJoin4(UnpackAnd##Opname,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,const PetscInt *idx,PetscSFPackOpt opt,void *unpacked,const void *packed) \
153b23bfdefSJunchao Zhang   {                                                                                                          \
154*cd620004SJunchao Zhang     Type           *u = (Type*)unpacked,*u2;                                                                 \
155*cd620004SJunchao Zhang     const Type     *p = (const Type*)packed,*p2;                                                             \
156eb02082bSJunchao Zhang     PetscInt       i,j,k,l,r,step,bs=link->bs;                                                               \
157b23bfdefSJunchao Zhang     const PetscInt *idx2,M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */    \
158b23bfdefSJunchao Zhang     const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */                \
15940e23c03SJunchao Zhang     PetscFunctionBegin;                                                                                      \
160b23bfdefSJunchao Zhang     if (!idx) {                                                                                              \
161*cd620004SJunchao Zhang       u += start*bs;                                                                                         \
162*cd620004SJunchao Zhang       for (i=0; i<count; i++)                                                                                \
163*cd620004SJunchao Zhang         for (j=0; j<M; j++)                                                                                  \
164*cd620004SJunchao Zhang           for (k=0; k<BS; k++)                                                                               \
165*cd620004SJunchao Zhang             OpApply(Op,u[i*MBS+j*BS+k],p[i*MBS+j*BS+k]);                                                     \
166*cd620004SJunchao Zhang     } else if (!opt) { /* No optimizations available */                                                      \
167*cd620004SJunchao Zhang       for (i=0; i<count; i++)                                                                                \
168*cd620004SJunchao Zhang         for (j=0; j<M; j++)                                                                                  \
169*cd620004SJunchao Zhang           for (k=0; k<BS; k++)                                                                               \
170*cd620004SJunchao Zhang               OpApply(Op,u[idx[i]*MBS+j*BS+k],p[i*MBS+j*BS+k]);                                              \
171*cd620004SJunchao Zhang     } else {                                                                                                 \
172*cd620004SJunchao Zhang       for (r=0; r<opt->n; r++) {                                                                             \
173*cd620004SJunchao Zhang         p2 = p + opt->offset[r]*MBS;                                                                         \
174*cd620004SJunchao Zhang         if (opt->type[r] == PETSCSF_PACKOPT_NONE) {                                                          \
175*cd620004SJunchao Zhang           idx2 = idx + opt->offset[r];                                                                       \
176*cd620004SJunchao Zhang           for (i=0; i<opt->offset[r+1]-opt->offset[r]; i++)                                                  \
177*cd620004SJunchao Zhang             for (j=0; j<M; j++)                                                                              \
178*cd620004SJunchao Zhang               for (k=0; k<BS; k++)                                                                           \
179*cd620004SJunchao Zhang                 OpApply(Op,u[idx2[i]*MBS+j*BS+k],p2[i*MBS+j*BS+k]);                                          \
180*cd620004SJunchao Zhang         } else if (opt->type[r] == PETSCSF_PACKOPT_MULTICOPY) {                                              \
181*cd620004SJunchao Zhang           for (i=opt->copy_offset[r]; i<opt->copy_offset[r+1]; i++) { /* i-th piece */                       \
182*cd620004SJunchao Zhang             u2 = u + idx[opt->copy_start[i]]*MBS;                                                            \
183*cd620004SJunchao Zhang             l  = opt->copy_length[i]*MBS;                                                                    \
184*cd620004SJunchao Zhang             for (j=0; j<l; j++) OpApply(Op,u2[j],p2[j]);                                                     \
185*cd620004SJunchao Zhang             p2 += l;                                                                                         \
186*cd620004SJunchao Zhang           }                                                                                                  \
187*cd620004SJunchao Zhang         } else if (opt->type[r] == PETSCSF_PACKOPT_STRIDE) {                                                 \
188*cd620004SJunchao Zhang           u2   = u + idx[opt->offset[r]]*MBS;                                                                \
189*cd620004SJunchao Zhang           step = opt->stride_step[r];                                                                        \
190*cd620004SJunchao Zhang           for (i=0; i<opt->stride_n[r]; i++)                                                                 \
191*cd620004SJunchao Zhang             for (j=0; j<M; j++)                                                                              \
192*cd620004SJunchao Zhang               for (k=0; k<BS; k++)                                                                           \
193*cd620004SJunchao Zhang                 OpApply(Op,u2[i*step*MBS+j*BS+k],p2[i*MBS+j*BS+k]);                                          \
194*cd620004SJunchao Zhang         } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Unknown SFPack optimzation type %D",opt->type[r]);   \
195*cd620004SJunchao Zhang       }                                                                                                      \
196*cd620004SJunchao Zhang     }                                                                                                        \
197*cd620004SJunchao Zhang     PetscFunctionReturn(0);                                                                                  \
198*cd620004SJunchao Zhang   }
199*cd620004SJunchao Zhang 
200*cd620004SJunchao Zhang #define DEF_FetchAndOp(Type,BS,EQ,Opname,Op,OpApply) \
201*cd620004SJunchao Zhang   static PetscErrorCode CPPJoin4(FetchAnd##Opname,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,const PetscInt *idx,PetscSFPackOpt opt,void *unpacked,void *packed) \
202*cd620004SJunchao Zhang   {                                                                                                          \
203*cd620004SJunchao Zhang     Type           *u = (Type*)unpacked,*p = (Type*)packed,t;                                                \
204*cd620004SJunchao Zhang     PetscInt       i,j,k,bs=link->bs;                                                                        \
205*cd620004SJunchao Zhang     const PetscInt M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */          \
206*cd620004SJunchao Zhang     const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */                \
207*cd620004SJunchao Zhang     PetscFunctionBegin;                                                                                      \
208*cd620004SJunchao Zhang     if (!idx) {                                                                                              \
209*cd620004SJunchao Zhang       u += start*bs;                                                                                         \
210b23bfdefSJunchao Zhang       for (i=0; i<count; i++)                                                                                \
211b23bfdefSJunchao Zhang         for (j=0; j<M; j++)                                                                                  \
212b23bfdefSJunchao Zhang           for (k=0; k<BS; k++) {                                                                             \
213b23bfdefSJunchao Zhang             t = u[i*MBS+j*BS+k];                                                                             \
214*cd620004SJunchao Zhang             OpApply(Op,u[i*MBS+j*BS+k],p[i*MBS+j*BS+k]);                                                     \
215*cd620004SJunchao Zhang             p[i*MBS+j*BS+k] = t;                                                                             \
21640e23c03SJunchao Zhang           }                                                                                                  \
217*cd620004SJunchao Zhang     } else {                                                                                                 \
218b23bfdefSJunchao Zhang       for (i=0; i<count; i++)                                                                                \
219b23bfdefSJunchao Zhang         for (j=0; j<M; j++)                                                                                  \
220b23bfdefSJunchao Zhang           for (k=0; k<BS; k++) {                                                                             \
221b23bfdefSJunchao Zhang               t = u[idx[i]*MBS+j*BS+k];                                                                      \
222*cd620004SJunchao Zhang               OpApply(Op,u[idx[i]*MBS+j*BS+k],p[i*MBS+j*BS+k]);                                              \
223*cd620004SJunchao Zhang               p[i*MBS+j*BS+k] = t;                                                                           \
224*cd620004SJunchao Zhang           }                                                                                                  \
225*cd620004SJunchao Zhang     }                                                                                                        \
226*cd620004SJunchao Zhang     PetscFunctionReturn(0);                                                                                  \
227*cd620004SJunchao Zhang   }
228*cd620004SJunchao Zhang 
229*cd620004SJunchao Zhang #define DEF_ScatterAndOp(Type,BS,EQ,Opname,Op,OpApply) \
230*cd620004SJunchao Zhang   static PetscErrorCode CPPJoin4(ScatterAnd##Opname,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt startx,const PetscInt *idx,const void *xdata,PetscInt starty,const PetscInt *idy,void *ydata) \
231*cd620004SJunchao Zhang   {                                                                                                          \
232*cd620004SJunchao Zhang     const Type     *x = (const Type*)xdata;                                                                  \
233*cd620004SJunchao Zhang     Type           *y = (Type*)ydata;                                                                        \
234*cd620004SJunchao Zhang     PetscInt       i,j,k,bs = link->bs;                                                                      \
235*cd620004SJunchao Zhang     const PetscInt M = (EQ) ? 1 : bs/BS;                                                                     \
236*cd620004SJunchao Zhang     const PetscInt MBS = M*BS;                                                                               \
237*cd620004SJunchao Zhang     PetscFunctionBegin;                                                                                      \
238*cd620004SJunchao Zhang     if (!idx) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Petsc should use UnpackAndOp instead of ScatterAndOp");\
239*cd620004SJunchao Zhang     if (!idy) {                                                                                              \
240*cd620004SJunchao Zhang       y += starty*bs;                                                                                        \
241*cd620004SJunchao Zhang       for (i=0; i<count; i++)                                                                                \
242*cd620004SJunchao Zhang         for (j=0; j<M; j++)                                                                                  \
243*cd620004SJunchao Zhang           for (k=0; k<BS; k++)                                                                               \
244*cd620004SJunchao Zhang             OpApply(Op,y[i*MBS+j*BS+k],x[idx[i]*MBS+j*BS+k]);                                                \
245*cd620004SJunchao Zhang     } else {                                                                                                 \
246*cd620004SJunchao Zhang       for (i=0; i<count; i++)                                                                                \
247*cd620004SJunchao Zhang         for (j=0; j<M; j++)                                                                                  \
248*cd620004SJunchao Zhang           for (k=0; k<BS; k++)                                                                               \
249*cd620004SJunchao Zhang             OpApply(Op,y[idy[i]*MBS+j*BS+k],x[idx[i]*MBS+j*BS+k]);                                           \
250*cd620004SJunchao Zhang     }                                                                                                        \
251*cd620004SJunchao Zhang     PetscFunctionReturn(0);                                                                                  \
252*cd620004SJunchao Zhang   }
253*cd620004SJunchao Zhang 
254*cd620004SJunchao Zhang #define DEF_FetchAndOpLocal(Type,BS,EQ,Opname,Op,OpApply) \
255*cd620004SJunchao Zhang   static PetscErrorCode CPPJoin4(FetchAnd##Opname##Local,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt rootstart,const PetscInt *rootindices,void *rootdata,PetscInt leafstart,const PetscInt *leafindices,const void *leafdata,void *leafupdate) \
256*cd620004SJunchao Zhang   {                                                                                                          \
257*cd620004SJunchao Zhang     Type           *x = (Type*)rootdata,*y2 = (Type*)leafupdate;                                             \
258*cd620004SJunchao Zhang     const Type     *y = (const Type*)leafdata;                                                               \
259*cd620004SJunchao Zhang     PetscInt       i,j,k,bs = link->bs;                                                                      \
260*cd620004SJunchao Zhang     const PetscInt M = (EQ) ? 1 : bs/BS;                                                                     \
261*cd620004SJunchao Zhang     const PetscInt MBS = M*BS;                                                                               \
262*cd620004SJunchao Zhang     PetscFunctionBegin;                                                                                      \
263*cd620004SJunchao Zhang     if (!rootindices) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Petsc should use FetchAndOp instead of FetchAndOpLocal");\
264*cd620004SJunchao Zhang     if (!leafindices) {                                                                                      \
265*cd620004SJunchao Zhang       y += leafstart*bs; y2 += leafstart*bs;                                                                 \
266*cd620004SJunchao Zhang       for (i=0; i<count; i++)                                                                                \
267*cd620004SJunchao Zhang         for (j=0; j<M; j++)                                                                                  \
268*cd620004SJunchao Zhang           for (k=0; k<BS; k++) {                                                                             \
269*cd620004SJunchao Zhang             y2[i*MBS+j*BS+k] = x[rootindices[i]*MBS+j*BS+k];                                                 \
270*cd620004SJunchao Zhang             OpApply(Op,x[rootindices[i]*MBS+j*BS+k],y[i*MBS+j*BS+k]);                                        \
27140e23c03SJunchao Zhang           }                                                                                                  \
272b23bfdefSJunchao Zhang     } else {                                                                                                 \
273*cd620004SJunchao Zhang       for (i=0; i<count; i++)                                                                                \
274b23bfdefSJunchao Zhang         for (j=0; j<M; j++)                                                                                  \
275b23bfdefSJunchao Zhang           for (k=0; k<BS; k++) {                                                                             \
276*cd620004SJunchao Zhang             y2[leafindices[i]*MBS+j*BS+k] = x[rootindices[i]*MBS+j*BS+k];                                    \
277*cd620004SJunchao Zhang             OpApply(Op,x[rootindices[i]*MBS+j*BS+k],y[leafindices[i]*MBS+j*BS+k]);                           \
27840e23c03SJunchao Zhang           }                                                                                                  \
27940e23c03SJunchao Zhang     }                                                                                                        \
28040e23c03SJunchao Zhang     PetscFunctionReturn(0);                                                                                  \
28140e23c03SJunchao Zhang   }
28240e23c03SJunchao Zhang 
28340e23c03SJunchao Zhang 
284b23bfdefSJunchao Zhang /* Pack, Unpack/Fetch ops */
285b23bfdefSJunchao Zhang #define DEF_Pack(Type,BS,EQ)                                                      \
286b23bfdefSJunchao Zhang   DEF_PackFunc(Type,BS,EQ)                                                        \
287*cd620004SJunchao Zhang   DEF_UnpackFunc(Type,BS,EQ)                                                      \
288*cd620004SJunchao Zhang   DEF_ScatterAndOp(Type,BS,EQ,Insert,=,OP_ASSIGN)                                 \
289*cd620004SJunchao Zhang   static void CPPJoin4(PackInit_Pack,Type,BS,EQ)(PetscSFLink link) {              \
290eb02082bSJunchao Zhang     link->h_Pack            = CPPJoin4(Pack,           Type,BS,EQ);               \
291eb02082bSJunchao Zhang     link->h_UnpackAndInsert = CPPJoin4(UnpackAndInsert,Type,BS,EQ);               \
292*cd620004SJunchao Zhang     link->h_ScatterAndInsert= CPPJoin4(ScatterAndInsert,Type,BS,EQ);              \
29340e23c03SJunchao Zhang   }
29440e23c03SJunchao Zhang 
295b23bfdefSJunchao Zhang /* Add, Mult ops */
296b23bfdefSJunchao Zhang #define DEF_Add(Type,BS,EQ)                                                       \
297*cd620004SJunchao Zhang   DEF_UnpackAndOp    (Type,BS,EQ,Add, +,OP_BINARY)                                \
298*cd620004SJunchao Zhang   DEF_UnpackAndOp    (Type,BS,EQ,Mult,*,OP_BINARY)                                \
299*cd620004SJunchao Zhang   DEF_FetchAndOp     (Type,BS,EQ,Add, +,OP_BINARY)                                \
300*cd620004SJunchao Zhang   DEF_ScatterAndOp   (Type,BS,EQ,Add, +,OP_BINARY)                                \
301*cd620004SJunchao Zhang   DEF_ScatterAndOp   (Type,BS,EQ,Mult,*,OP_BINARY)                                \
302*cd620004SJunchao Zhang   DEF_FetchAndOpLocal(Type,BS,EQ,Add, +,OP_BINARY)                                \
303*cd620004SJunchao Zhang   static void CPPJoin4(PackInit_Add,Type,BS,EQ)(PetscSFLink link) {               \
304eb02082bSJunchao Zhang     link->h_UnpackAndAdd     = CPPJoin4(UnpackAndAdd,    Type,BS,EQ);             \
305eb02082bSJunchao Zhang     link->h_UnpackAndMult    = CPPJoin4(UnpackAndMult,   Type,BS,EQ);             \
306eb02082bSJunchao Zhang     link->h_FetchAndAdd      = CPPJoin4(FetchAndAdd,     Type,BS,EQ);             \
307*cd620004SJunchao Zhang     link->h_ScatterAndAdd    = CPPJoin4(ScatterAndAdd,   Type,BS,EQ);             \
308*cd620004SJunchao Zhang     link->h_ScatterAndMult   = CPPJoin4(ScatterAndMult,  Type,BS,EQ);             \
309*cd620004SJunchao Zhang     link->h_FetchAndAddLocal = CPPJoin4(FetchAndAddLocal,Type,BS,EQ);             \
31040e23c03SJunchao Zhang   }
31140e23c03SJunchao Zhang 
312b23bfdefSJunchao Zhang /* Max, Min ops */
313b23bfdefSJunchao Zhang #define DEF_Cmp(Type,BS,EQ)                                                       \
314*cd620004SJunchao Zhang   DEF_UnpackAndOp (Type,BS,EQ,Max,PetscMax,OP_FUNCTION)                           \
315*cd620004SJunchao Zhang   DEF_UnpackAndOp (Type,BS,EQ,Min,PetscMin,OP_FUNCTION)                           \
316*cd620004SJunchao Zhang   DEF_ScatterAndOp(Type,BS,EQ,Max,PetscMax,OP_FUNCTION)                           \
317*cd620004SJunchao Zhang   DEF_ScatterAndOp(Type,BS,EQ,Min,PetscMin,OP_FUNCTION)                           \
318*cd620004SJunchao Zhang   static void CPPJoin4(PackInit_Compare,Type,BS,EQ)(PetscSFLink link) {           \
319eb02082bSJunchao Zhang     link->h_UnpackAndMax    = CPPJoin4(UnpackAndMax,   Type,BS,EQ);               \
320eb02082bSJunchao Zhang     link->h_UnpackAndMin    = CPPJoin4(UnpackAndMin,   Type,BS,EQ);               \
321*cd620004SJunchao Zhang     link->h_ScatterAndMax   = CPPJoin4(ScatterAndMax,  Type,BS,EQ);               \
322*cd620004SJunchao Zhang     link->h_ScatterAndMin   = CPPJoin4(ScatterAndMin,  Type,BS,EQ);               \
323b23bfdefSJunchao Zhang   }
324b23bfdefSJunchao Zhang 
325b23bfdefSJunchao Zhang /* Logical ops.
326*cd620004SJunchao Zhang   The operator in OP_LXOR should be empty but is ||. It is not used. Put here to avoid
32740e23c03SJunchao Zhang   the compilation warning "empty macro arguments are undefined in ISO C90"
32840e23c03SJunchao Zhang  */
329b23bfdefSJunchao Zhang #define DEF_Log(Type,BS,EQ)                                                       \
330*cd620004SJunchao Zhang   DEF_UnpackAndOp (Type,BS,EQ,LAND,&&,OP_BINARY)                                  \
331*cd620004SJunchao Zhang   DEF_UnpackAndOp (Type,BS,EQ,LOR, ||,OP_BINARY)                                  \
332*cd620004SJunchao Zhang   DEF_UnpackAndOp (Type,BS,EQ,LXOR,||, OP_LXOR)                                   \
333*cd620004SJunchao Zhang   DEF_ScatterAndOp(Type,BS,EQ,LAND,&&,OP_BINARY)                                  \
334*cd620004SJunchao Zhang   DEF_ScatterAndOp(Type,BS,EQ,LOR, ||,OP_BINARY)                                  \
335*cd620004SJunchao Zhang   DEF_ScatterAndOp(Type,BS,EQ,LXOR,||, OP_LXOR)                                   \
336*cd620004SJunchao Zhang   static void CPPJoin4(PackInit_Logical,Type,BS,EQ)(PetscSFLink link) {           \
337eb02082bSJunchao Zhang     link->h_UnpackAndLAND   = CPPJoin4(UnpackAndLAND, Type,BS,EQ);                \
338eb02082bSJunchao Zhang     link->h_UnpackAndLOR    = CPPJoin4(UnpackAndLOR,  Type,BS,EQ);                \
339eb02082bSJunchao Zhang     link->h_UnpackAndLXOR   = CPPJoin4(UnpackAndLXOR, Type,BS,EQ);                \
340*cd620004SJunchao Zhang     link->h_ScatterAndLAND  = CPPJoin4(ScatterAndLAND,Type,BS,EQ);                \
341*cd620004SJunchao Zhang     link->h_ScatterAndLOR   = CPPJoin4(ScatterAndLOR, Type,BS,EQ);                \
342*cd620004SJunchao Zhang     link->h_ScatterAndLXOR  = CPPJoin4(ScatterAndLXOR,Type,BS,EQ);                \
34340e23c03SJunchao Zhang   }
34440e23c03SJunchao Zhang 
345b23bfdefSJunchao Zhang /* Bitwise ops */
346b23bfdefSJunchao Zhang #define DEF_Bit(Type,BS,EQ)                                                       \
347*cd620004SJunchao Zhang   DEF_UnpackAndOp (Type,BS,EQ,BAND,&,OP_BINARY)                                   \
348*cd620004SJunchao Zhang   DEF_UnpackAndOp (Type,BS,EQ,BOR, |,OP_BINARY)                                   \
349*cd620004SJunchao Zhang   DEF_UnpackAndOp (Type,BS,EQ,BXOR,^,OP_BINARY)                                   \
350*cd620004SJunchao Zhang   DEF_ScatterAndOp(Type,BS,EQ,BAND,&,OP_BINARY)                                   \
351*cd620004SJunchao Zhang   DEF_ScatterAndOp(Type,BS,EQ,BOR, |,OP_BINARY)                                   \
352*cd620004SJunchao Zhang   DEF_ScatterAndOp(Type,BS,EQ,BXOR,^,OP_BINARY)                                   \
353*cd620004SJunchao Zhang   static void CPPJoin4(PackInit_Bitwise,Type,BS,EQ)(PetscSFLink link) {           \
354eb02082bSJunchao Zhang     link->h_UnpackAndBAND   = CPPJoin4(UnpackAndBAND, Type,BS,EQ);                \
355eb02082bSJunchao Zhang     link->h_UnpackAndBOR    = CPPJoin4(UnpackAndBOR,  Type,BS,EQ);                \
356eb02082bSJunchao Zhang     link->h_UnpackAndBXOR   = CPPJoin4(UnpackAndBXOR, Type,BS,EQ);                \
357*cd620004SJunchao Zhang     link->h_ScatterAndBAND  = CPPJoin4(ScatterAndBAND,Type,BS,EQ);                \
358*cd620004SJunchao Zhang     link->h_ScatterAndBOR   = CPPJoin4(ScatterAndBOR, Type,BS,EQ);                \
359*cd620004SJunchao Zhang     link->h_ScatterAndBXOR  = CPPJoin4(ScatterAndBXOR,Type,BS,EQ);                \
36040e23c03SJunchao Zhang   }
36140e23c03SJunchao Zhang 
362*cd620004SJunchao Zhang /* Maxloc, Minloc ops */
363*cd620004SJunchao Zhang #define DEF_Xloc(Type,BS,EQ)                                                      \
364*cd620004SJunchao Zhang   DEF_UnpackAndOp (Type,BS,EQ,Max,>,OP_XLOC)                                      \
365*cd620004SJunchao Zhang   DEF_UnpackAndOp (Type,BS,EQ,Min,<,OP_XLOC)                                      \
366*cd620004SJunchao Zhang   DEF_ScatterAndOp(Type,BS,EQ,Max,>,OP_XLOC)                                      \
367*cd620004SJunchao Zhang   DEF_ScatterAndOp(Type,BS,EQ,Min,<,OP_XLOC)                                      \
368*cd620004SJunchao Zhang   static void CPPJoin4(PackInit_Xloc,Type,BS,EQ)(PetscSFLink link) {              \
369*cd620004SJunchao Zhang     link->h_UnpackAndMaxloc  = CPPJoin4(UnpackAndMax, Type,BS,EQ);                \
370*cd620004SJunchao Zhang     link->h_UnpackAndMinloc  = CPPJoin4(UnpackAndMin, Type,BS,EQ);                \
371*cd620004SJunchao Zhang     link->h_ScatterAndMaxloc = CPPJoin4(ScatterAndMax,Type,BS,EQ);                \
372*cd620004SJunchao Zhang     link->h_ScatterAndMinloc = CPPJoin4(ScatterAndMin,Type,BS,EQ);                \
37340e23c03SJunchao Zhang   }
37440e23c03SJunchao Zhang 
375b23bfdefSJunchao Zhang #define DEF_IntegerType(Type,BS,EQ)                                               \
376b23bfdefSJunchao Zhang   DEF_Pack(Type,BS,EQ)                                                            \
377b23bfdefSJunchao Zhang   DEF_Add(Type,BS,EQ)                                                             \
378b23bfdefSJunchao Zhang   DEF_Cmp(Type,BS,EQ)                                                             \
379b23bfdefSJunchao Zhang   DEF_Log(Type,BS,EQ)                                                             \
380b23bfdefSJunchao Zhang   DEF_Bit(Type,BS,EQ)                                                             \
381*cd620004SJunchao Zhang   static void CPPJoin4(PackInit_IntegerType,Type,BS,EQ)(PetscSFLink link) {       \
382b23bfdefSJunchao Zhang     CPPJoin4(PackInit_Pack,Type,BS,EQ)(link);                                     \
383b23bfdefSJunchao Zhang     CPPJoin4(PackInit_Add,Type,BS,EQ)(link);                                      \
384b23bfdefSJunchao Zhang     CPPJoin4(PackInit_Compare,Type,BS,EQ)(link);                                  \
385b23bfdefSJunchao Zhang     CPPJoin4(PackInit_Logical,Type,BS,EQ)(link);                                  \
386b23bfdefSJunchao Zhang     CPPJoin4(PackInit_Bitwise,Type,BS,EQ)(link);                                  \
38740e23c03SJunchao Zhang   }
38840e23c03SJunchao Zhang 
389b23bfdefSJunchao Zhang #define DEF_RealType(Type,BS,EQ)                                                  \
390b23bfdefSJunchao Zhang   DEF_Pack(Type,BS,EQ)                                                            \
391b23bfdefSJunchao Zhang   DEF_Add(Type,BS,EQ)                                                             \
392b23bfdefSJunchao Zhang   DEF_Cmp(Type,BS,EQ)                                                             \
393*cd620004SJunchao Zhang   static void CPPJoin4(PackInit_RealType,Type,BS,EQ)(PetscSFLink link) {          \
394b23bfdefSJunchao Zhang     CPPJoin4(PackInit_Pack,Type,BS,EQ)(link);                                     \
395b23bfdefSJunchao Zhang     CPPJoin4(PackInit_Add,Type,BS,EQ)(link);                                      \
396b23bfdefSJunchao Zhang     CPPJoin4(PackInit_Compare,Type,BS,EQ)(link);                                  \
397b23bfdefSJunchao Zhang   }
39840e23c03SJunchao Zhang 
39940e23c03SJunchao Zhang #if defined(PETSC_HAVE_COMPLEX)
400b23bfdefSJunchao Zhang #define DEF_ComplexType(Type,BS,EQ)                                               \
401b23bfdefSJunchao Zhang   DEF_Pack(Type,BS,EQ)                                                            \
402b23bfdefSJunchao Zhang   DEF_Add(Type,BS,EQ)                                                             \
403*cd620004SJunchao Zhang   static void CPPJoin4(PackInit_ComplexType,Type,BS,EQ)(PetscSFLink link) {       \
404b23bfdefSJunchao Zhang     CPPJoin4(PackInit_Pack,Type,BS,EQ)(link);                                     \
405b23bfdefSJunchao Zhang     CPPJoin4(PackInit_Add,Type,BS,EQ)(link);                                      \
406b23bfdefSJunchao Zhang   }
40740e23c03SJunchao Zhang #endif
408b23bfdefSJunchao Zhang 
409b23bfdefSJunchao Zhang #define DEF_DumbType(Type,BS,EQ)                                                  \
410b23bfdefSJunchao Zhang   DEF_Pack(Type,BS,EQ)                                                            \
411*cd620004SJunchao Zhang   static void CPPJoin4(PackInit_DumbType,Type,BS,EQ)(PetscSFLink link) {          \
412b23bfdefSJunchao Zhang     CPPJoin4(PackInit_Pack,Type,BS,EQ)(link);                                     \
413b23bfdefSJunchao Zhang   }
414b23bfdefSJunchao Zhang 
415b23bfdefSJunchao Zhang /* Maxloc, Minloc */
416*cd620004SJunchao Zhang #define DEF_PairType(Type,BS,EQ)                                                  \
417*cd620004SJunchao Zhang   DEF_Pack(Type,BS,EQ)                                                            \
418*cd620004SJunchao Zhang   DEF_Xloc(Type,BS,EQ)                                                            \
419*cd620004SJunchao Zhang   static void CPPJoin4(PackInit_PairType,Type,BS,EQ)(PetscSFLink link) {          \
420*cd620004SJunchao Zhang     CPPJoin4(PackInit_Pack,Type,BS,EQ)(link);                                     \
421*cd620004SJunchao Zhang     CPPJoin4(PackInit_Xloc,Type,BS,EQ)(link);                                     \
422b23bfdefSJunchao Zhang   }
423b23bfdefSJunchao Zhang 
424b23bfdefSJunchao Zhang DEF_IntegerType(PetscInt,1,1) /* unit = 1 MPIU_INT  */
425b23bfdefSJunchao Zhang DEF_IntegerType(PetscInt,2,1) /* unit = 2 MPIU_INTs */
426b23bfdefSJunchao Zhang DEF_IntegerType(PetscInt,4,1) /* unit = 4 MPIU_INTs */
427b23bfdefSJunchao Zhang DEF_IntegerType(PetscInt,8,1) /* unit = 8 MPIU_INTs */
428b23bfdefSJunchao Zhang DEF_IntegerType(PetscInt,1,0) /* unit = 1*n MPIU_INTs, n>1 */
429b23bfdefSJunchao Zhang DEF_IntegerType(PetscInt,2,0) /* unit = 2*n MPIU_INTs, n>1 */
430b23bfdefSJunchao Zhang DEF_IntegerType(PetscInt,4,0) /* unit = 4*n MPIU_INTs, n>1 */
431b23bfdefSJunchao Zhang DEF_IntegerType(PetscInt,8,0) /* unit = 8*n MPIU_INTs, n>1. Routines with bigger BS are tried first. */
432b23bfdefSJunchao Zhang 
433b23bfdefSJunchao Zhang #if defined(PETSC_USE_64BIT_INDICES) /* Do not need (though it is OK) to generate redundant functions if PetscInt is int */
434b23bfdefSJunchao Zhang DEF_IntegerType(int,1,1)
435b23bfdefSJunchao Zhang DEF_IntegerType(int,2,1)
436b23bfdefSJunchao Zhang DEF_IntegerType(int,4,1)
437b23bfdefSJunchao Zhang DEF_IntegerType(int,8,1)
438b23bfdefSJunchao Zhang DEF_IntegerType(int,1,0)
439b23bfdefSJunchao Zhang DEF_IntegerType(int,2,0)
440b23bfdefSJunchao Zhang DEF_IntegerType(int,4,0)
441b23bfdefSJunchao Zhang DEF_IntegerType(int,8,0)
442b23bfdefSJunchao Zhang #endif
443b23bfdefSJunchao Zhang 
444b23bfdefSJunchao Zhang /* The typedefs are used to get a typename without space that CPPJoin can handle */
445b23bfdefSJunchao Zhang typedef signed char SignedChar;
446b23bfdefSJunchao Zhang DEF_IntegerType(SignedChar,1,1)
447b23bfdefSJunchao Zhang DEF_IntegerType(SignedChar,2,1)
448b23bfdefSJunchao Zhang DEF_IntegerType(SignedChar,4,1)
449b23bfdefSJunchao Zhang DEF_IntegerType(SignedChar,8,1)
450b23bfdefSJunchao Zhang DEF_IntegerType(SignedChar,1,0)
451b23bfdefSJunchao Zhang DEF_IntegerType(SignedChar,2,0)
452b23bfdefSJunchao Zhang DEF_IntegerType(SignedChar,4,0)
453b23bfdefSJunchao Zhang DEF_IntegerType(SignedChar,8,0)
454b23bfdefSJunchao Zhang 
455b23bfdefSJunchao Zhang typedef unsigned char UnsignedChar;
456b23bfdefSJunchao Zhang DEF_IntegerType(UnsignedChar,1,1)
457b23bfdefSJunchao Zhang DEF_IntegerType(UnsignedChar,2,1)
458b23bfdefSJunchao Zhang DEF_IntegerType(UnsignedChar,4,1)
459b23bfdefSJunchao Zhang DEF_IntegerType(UnsignedChar,8,1)
460b23bfdefSJunchao Zhang DEF_IntegerType(UnsignedChar,1,0)
461b23bfdefSJunchao Zhang DEF_IntegerType(UnsignedChar,2,0)
462b23bfdefSJunchao Zhang DEF_IntegerType(UnsignedChar,4,0)
463b23bfdefSJunchao Zhang DEF_IntegerType(UnsignedChar,8,0)
464b23bfdefSJunchao Zhang 
465b23bfdefSJunchao Zhang DEF_RealType(PetscReal,1,1)
466b23bfdefSJunchao Zhang DEF_RealType(PetscReal,2,1)
467b23bfdefSJunchao Zhang DEF_RealType(PetscReal,4,1)
468b23bfdefSJunchao Zhang DEF_RealType(PetscReal,8,1)
469b23bfdefSJunchao Zhang DEF_RealType(PetscReal,1,0)
470b23bfdefSJunchao Zhang DEF_RealType(PetscReal,2,0)
471b23bfdefSJunchao Zhang DEF_RealType(PetscReal,4,0)
472b23bfdefSJunchao Zhang DEF_RealType(PetscReal,8,0)
473b23bfdefSJunchao Zhang 
474b23bfdefSJunchao Zhang #if defined(PETSC_HAVE_COMPLEX)
475b23bfdefSJunchao Zhang DEF_ComplexType(PetscComplex,1,1)
476b23bfdefSJunchao Zhang DEF_ComplexType(PetscComplex,2,1)
477b23bfdefSJunchao Zhang DEF_ComplexType(PetscComplex,4,1)
478b23bfdefSJunchao Zhang DEF_ComplexType(PetscComplex,8,1)
479b23bfdefSJunchao Zhang DEF_ComplexType(PetscComplex,1,0)
480b23bfdefSJunchao Zhang DEF_ComplexType(PetscComplex,2,0)
481b23bfdefSJunchao Zhang DEF_ComplexType(PetscComplex,4,0)
482b23bfdefSJunchao Zhang DEF_ComplexType(PetscComplex,8,0)
483b23bfdefSJunchao Zhang #endif
484b23bfdefSJunchao Zhang 
485*cd620004SJunchao Zhang #define PairType(Type1,Type2) Type1##_##Type2
486*cd620004SJunchao Zhang typedef struct {int u; int i;}           PairType(int,int);
487*cd620004SJunchao Zhang typedef struct {PetscInt u; PetscInt i;} PairType(PetscInt,PetscInt);
488*cd620004SJunchao Zhang DEF_PairType(PairType(int,int),1,1)
489*cd620004SJunchao Zhang DEF_PairType(PairType(PetscInt,PetscInt),1,1)
490b23bfdefSJunchao Zhang 
491b23bfdefSJunchao Zhang /* If we don't know the basic type, we treat it as a stream of chars or ints */
492b23bfdefSJunchao Zhang DEF_DumbType(char,1,1)
493b23bfdefSJunchao Zhang DEF_DumbType(char,2,1)
494b23bfdefSJunchao Zhang DEF_DumbType(char,4,1)
495b23bfdefSJunchao Zhang DEF_DumbType(char,1,0)
496b23bfdefSJunchao Zhang DEF_DumbType(char,2,0)
497b23bfdefSJunchao Zhang DEF_DumbType(char,4,0)
498b23bfdefSJunchao Zhang 
499eb02082bSJunchao Zhang typedef int DumbInt; /* To have a different name than 'int' used above. The name is used to make routine names. */
500b23bfdefSJunchao Zhang DEF_DumbType(DumbInt,1,1)
501b23bfdefSJunchao Zhang DEF_DumbType(DumbInt,2,1)
502b23bfdefSJunchao Zhang DEF_DumbType(DumbInt,4,1)
503b23bfdefSJunchao Zhang DEF_DumbType(DumbInt,8,1)
504b23bfdefSJunchao Zhang DEF_DumbType(DumbInt,1,0)
505b23bfdefSJunchao Zhang DEF_DumbType(DumbInt,2,0)
506b23bfdefSJunchao Zhang DEF_DumbType(DumbInt,4,0)
507b23bfdefSJunchao Zhang DEF_DumbType(DumbInt,8,0)
50840e23c03SJunchao Zhang 
50940e23c03SJunchao Zhang #if !defined(PETSC_HAVE_MPI_TYPE_DUP)
51040e23c03SJunchao Zhang PETSC_STATIC_INLINE int MPI_Type_dup(MPI_Datatype datatype,MPI_Datatype *newtype)
51140e23c03SJunchao Zhang {
51240e23c03SJunchao Zhang   int ierr;
51340e23c03SJunchao Zhang   ierr = MPI_Type_contiguous(1,datatype,newtype); if (ierr) return ierr;
51440e23c03SJunchao Zhang   ierr = MPI_Type_commit(newtype); if (ierr) return ierr;
51540e23c03SJunchao Zhang   return MPI_SUCCESS;
51640e23c03SJunchao Zhang }
51740e23c03SJunchao Zhang #endif
51840e23c03SJunchao Zhang 
519*cd620004SJunchao Zhang /*
520*cd620004SJunchao Zhang    The routine Creates a communication link for the given operation. It first looks up its link cache. If
521*cd620004SJunchao Zhang    there is a free & suitable one, it uses it. Otherwise it creates a new one.
522*cd620004SJunchao Zhang 
523*cd620004SJunchao Zhang    A link contains buffers and MPI requests for send/recv. It also contains pack/unpack routines to pack/unpack
524*cd620004SJunchao Zhang    root/leafdata to/from these buffers. Buffers are allocated at our discretion. When we find root/leafata
525*cd620004SJunchao Zhang    can be directly passed to MPI, we won't allocate them. Even we allocate buffers, we only allocate
526*cd620004SJunchao Zhang    those that are needed by the given `sfop` and `op`, in other words, we do lazy memory-allocation.
527*cd620004SJunchao Zhang 
528*cd620004SJunchao Zhang    The routine also allocates buffers on CPU when one does not use gpu-aware MPI but data is on GPU.
529*cd620004SJunchao Zhang 
530*cd620004SJunchao Zhang    In SFBasic, MPI requests are persistent. They are init'ed until we try to get requests from a link.
531*cd620004SJunchao Zhang 
532*cd620004SJunchao Zhang    The routine is shared by SFBasic and SFNeighbor based on the fact they all deal with sparse graphs and
533*cd620004SJunchao Zhang    need pack/unpack data.
534*cd620004SJunchao Zhang */
535*cd620004SJunchao Zhang PetscErrorCode PetscSFLinkCreate(PetscSF sf,MPI_Datatype unit,PetscMemType rootmtype,const void *rootdata,PetscMemType leafmtype,const void *leafdata,MPI_Op op,PetscSFOperation sfop,PetscSFLink *mylink)
53640e23c03SJunchao Zhang {
53740e23c03SJunchao Zhang   PetscErrorCode    ierr;
538*cd620004SJunchao Zhang   PetscSF_Basic     *bas = (PetscSF_Basic*)sf->data;
539*cd620004SJunchao Zhang   PetscInt          i,j,k,nrootreqs,nleafreqs,nreqs;
540*cd620004SJunchao Zhang   PetscSFLink       *p,link;
541*cd620004SJunchao Zhang   PetscSFDirection  direction;
542*cd620004SJunchao Zhang   MPI_Request       *reqs = NULL;
543*cd620004SJunchao Zhang   PetscBool         match,rootdirect[2],leafdirect[2];
544*cd620004SJunchao Zhang   PetscMemType      rootmtype_mpi,leafmtype_mpi;   /* mtypes seen by MPI */
545*cd620004SJunchao Zhang   PetscInt          rootdirect_mpi,leafdirect_mpi; /* root/leafdirect seen by MPI*/
546*cd620004SJunchao Zhang 
547*cd620004SJunchao Zhang   PetscFunctionBegin;
548*cd620004SJunchao Zhang   ierr = PetscSFSetErrorOnUnsupportedOverlap(sf,unit,rootdata,leafdata);CHKERRQ(ierr);
549*cd620004SJunchao Zhang 
550*cd620004SJunchao Zhang   /* Can we directly use root/leafdirect with the given sf, sfop and op? */
551*cd620004SJunchao Zhang   for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) {
552*cd620004SJunchao Zhang     if (sfop == PETSCSF_BCAST) {
553*cd620004SJunchao Zhang       rootdirect[i] = bas->rootcontig[i]; /* Pack roots */
554*cd620004SJunchao Zhang       leafdirect[i] = (sf->leafcontig[i] && op == MPIU_REPLACE) ? PETSC_TRUE : PETSC_FALSE;  /* Unpack leaves */
555*cd620004SJunchao Zhang     } else if (sfop == PETSCSF_REDUCE) {
556*cd620004SJunchao Zhang       leafdirect[i] = sf->leafcontig[i];  /* Pack leaves */
557*cd620004SJunchao Zhang       rootdirect[i] = (bas->rootcontig[i] && op == MPIU_REPLACE) ? PETSC_TRUE : PETSC_FALSE; /* Unpack roots */
558*cd620004SJunchao Zhang     } else { /* PETSCSF_FETCH */
559*cd620004SJunchao Zhang       rootdirect[i] = PETSC_FALSE; /* FETCH always need a separate rootbuf */
560*cd620004SJunchao Zhang       leafdirect[i] = PETSC_FALSE; /* We also force allocating a separate leafbuf so that leafdata and leafupdate can share mpi requests */
561*cd620004SJunchao Zhang     }
562*cd620004SJunchao Zhang   }
563*cd620004SJunchao Zhang 
564*cd620004SJunchao Zhang   if (use_gpu_aware_mpi) {
565*cd620004SJunchao Zhang     rootmtype_mpi = rootmtype;
566*cd620004SJunchao Zhang     leafmtype_mpi = leafmtype;
567*cd620004SJunchao Zhang   } else {
568*cd620004SJunchao Zhang     rootmtype_mpi = leafmtype_mpi = PETSC_MEMTYPE_HOST;
569*cd620004SJunchao Zhang   }
570*cd620004SJunchao Zhang   /* Will root/leafdata be directly accessed by MPI?  Without use_gpu_aware_mpi, device data is bufferred on host and then passed to MPI */
571*cd620004SJunchao Zhang   rootdirect_mpi = rootdirect[PETSCSF_REMOTE] && (rootmtype_mpi == rootmtype)? 1 : 0;
572*cd620004SJunchao Zhang   leafdirect_mpi = leafdirect[PETSCSF_REMOTE] && (leafmtype_mpi == leafmtype)? 1 : 0;
573*cd620004SJunchao Zhang 
574*cd620004SJunchao Zhang   direction = (sfop == PETSCSF_BCAST)? PETSCSF_ROOT2LEAF : PETSCSF_LEAF2ROOT;
575*cd620004SJunchao Zhang   nrootreqs = bas->nrootreqs;
576*cd620004SJunchao Zhang   nleafreqs = sf->nleafreqs;
577*cd620004SJunchao Zhang 
578*cd620004SJunchao Zhang   /* Look for free links in cache */
579*cd620004SJunchao Zhang   for (p=&bas->avail; (link=*p); p=&link->next) {
580*cd620004SJunchao Zhang     ierr = MPIPetsc_Type_compare(unit,link->unit,&match);CHKERRQ(ierr);
581*cd620004SJunchao Zhang     if (match) {
582*cd620004SJunchao Zhang       /* If root/leafdata will be directly passed to MPI, test if the data used to initialized the MPI requests matches with current.
583*cd620004SJunchao Zhang          If not, free old requests. New requests will be lazily init'ed until one calls PetscSFLinkGetMPIBuffersAndRequests().
584*cd620004SJunchao Zhang       */
585*cd620004SJunchao Zhang       if (rootdirect_mpi && sf->persistent && link->rootreqsinited[direction][rootmtype][1] && link->rootdatadirect[direction][rootmtype] != rootdata) {
586*cd620004SJunchao Zhang         reqs = link->rootreqs[direction][rootmtype][1]; /* Here, rootmtype = rootmtype_mpi */
587*cd620004SJunchao Zhang         for (i=0; i<nrootreqs; i++) {if (reqs[i] != MPI_REQUEST_NULL) {ierr = MPI_Request_free(&reqs[i]);CHKERRQ(ierr);}}
588*cd620004SJunchao Zhang         link->rootreqsinited[direction][rootmtype][1] = PETSC_FALSE;
589*cd620004SJunchao Zhang       }
590*cd620004SJunchao Zhang       if (leafdirect_mpi && sf->persistent && link->leafreqsinited[direction][leafmtype][1] && link->leafdatadirect[direction][leafmtype] != leafdata) {
591*cd620004SJunchao Zhang         reqs = link->leafreqs[direction][leafmtype][1];
592*cd620004SJunchao Zhang         for (i=0; i<nleafreqs; i++) {if (reqs[i] != MPI_REQUEST_NULL) {ierr = MPI_Request_free(&reqs[i]);CHKERRQ(ierr);}}
593*cd620004SJunchao Zhang         link->leafreqsinited[direction][leafmtype][1] = PETSC_FALSE;
594*cd620004SJunchao Zhang       }
595*cd620004SJunchao Zhang       *p = link->next; /* Remove from available list */
596*cd620004SJunchao Zhang       goto found;
597*cd620004SJunchao Zhang     }
598*cd620004SJunchao Zhang   }
599*cd620004SJunchao Zhang 
600*cd620004SJunchao Zhang   ierr = PetscNew(&link);CHKERRQ(ierr);
601*cd620004SJunchao Zhang   ierr = PetscSFLinkSetUp_Host(sf,link,unit);CHKERRQ(ierr);
602*cd620004SJunchao Zhang   ierr = PetscCommGetNewTag(PetscObjectComm((PetscObject)sf),&link->tag);CHKERRQ(ierr); /* One tag per link */
603*cd620004SJunchao Zhang 
604*cd620004SJunchao Zhang   nreqs = (nrootreqs+nleafreqs)*8;
605*cd620004SJunchao Zhang   ierr  = PetscMalloc1(nreqs,&link->reqs);CHKERRQ(ierr);
606*cd620004SJunchao Zhang   for (i=0; i<nreqs; i++) link->reqs[i] = MPI_REQUEST_NULL; /* Initialized to NULL so that we know which need to be freed in Destroy */
607*cd620004SJunchao Zhang 
608*cd620004SJunchao Zhang   for (i=0; i<2; i++) { /* Two communication directions */
609*cd620004SJunchao Zhang     for (j=0; j<2; j++) { /* Two memory types */
610*cd620004SJunchao Zhang       for (k=0; k<2; k++) { /* root/leafdirect 0 or 1 */
611*cd620004SJunchao Zhang         link->rootreqs[i][j][k] = link->reqs + nrootreqs*(4*i+2*j+k);
612*cd620004SJunchao Zhang         link->leafreqs[i][j][k] = link->reqs + nrootreqs*8 + nleafreqs*(4*i+2*j+k);
613*cd620004SJunchao Zhang       }
614*cd620004SJunchao Zhang     }
615*cd620004SJunchao Zhang   }
616*cd620004SJunchao Zhang 
617*cd620004SJunchao Zhang found:
618*cd620004SJunchao Zhang   if ((rootmtype == PETSC_MEMTYPE_DEVICE || leafmtype == PETSC_MEMTYPE_DEVICE) && !link->deviceinited) {ierr = PetscSFLinkSetUp_Device(sf,link,unit);CHKERRQ(ierr);}
619*cd620004SJunchao Zhang 
620*cd620004SJunchao Zhang   /* Allocate buffers along root/leafdata */
621*cd620004SJunchao Zhang   for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) {
622*cd620004SJunchao Zhang     /* For local communication, buffers are only needed when roots and leaves have different mtypes */
623*cd620004SJunchao Zhang     if (i == PETSCSF_LOCAL && rootmtype == leafmtype) continue;
624*cd620004SJunchao Zhang     if (bas->rootbuflen[i]) {
625*cd620004SJunchao Zhang       if (rootdirect[i]) { /* Aha, we disguise rootdata as rootbuf */
626*cd620004SJunchao Zhang         link->rootbuf[i][rootmtype] = (char*)rootdata + bas->rootstart[i]*link->unitbytes;
627*cd620004SJunchao Zhang       } else { /* Have to have a separate rootbuf */
628*cd620004SJunchao Zhang         if (!link->rootbuf_alloc[i][rootmtype]) {
629*cd620004SJunchao Zhang           ierr = PetscMallocWithMemType(rootmtype,bas->rootbuflen[i]*link->unitbytes,(void**)&link->rootbuf_alloc[i][rootmtype]);CHKERRQ(ierr);
630*cd620004SJunchao Zhang         }
631*cd620004SJunchao Zhang         link->rootbuf[i][rootmtype] = link->rootbuf_alloc[i][rootmtype];
632*cd620004SJunchao Zhang       }
633*cd620004SJunchao Zhang     }
634*cd620004SJunchao Zhang 
635*cd620004SJunchao Zhang     if (sf->leafbuflen[i]) {
636*cd620004SJunchao Zhang       if (leafdirect[i]) {
637*cd620004SJunchao Zhang         link->leafbuf[i][leafmtype] = (char*)leafdata + sf->leafstart[i]*link->unitbytes;
638*cd620004SJunchao Zhang       } else {
639*cd620004SJunchao Zhang         if (!link->leafbuf_alloc[i][leafmtype]) {
640*cd620004SJunchao Zhang           ierr = PetscMallocWithMemType(leafmtype,sf->leafbuflen[i]*link->unitbytes,(void**)&link->leafbuf_alloc[i][leafmtype]);CHKERRQ(ierr);
641*cd620004SJunchao Zhang         }
642*cd620004SJunchao Zhang         link->leafbuf[i][leafmtype] = link->leafbuf_alloc[i][leafmtype];
643*cd620004SJunchao Zhang       }
644*cd620004SJunchao Zhang     }
645*cd620004SJunchao Zhang   }
646*cd620004SJunchao Zhang 
647*cd620004SJunchao Zhang   /* Allocate buffers on host for buffering data on device in cast not use_gpu_aware_mpi */
648*cd620004SJunchao Zhang   if (rootmtype == PETSC_MEMTYPE_DEVICE && rootmtype_mpi == PETSC_MEMTYPE_HOST) {
649*cd620004SJunchao Zhang     if(!link->rootbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]) {
650*cd620004SJunchao Zhang       ierr = PetscMalloc(bas->rootbuflen[PETSCSF_REMOTE]*link->unitbytes,&link->rootbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]);CHKERRQ(ierr);
651*cd620004SJunchao Zhang     }
652*cd620004SJunchao Zhang     link->rootbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST] = link->rootbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST];
653*cd620004SJunchao Zhang   }
654*cd620004SJunchao Zhang   if (leafmtype == PETSC_MEMTYPE_DEVICE && leafmtype_mpi == PETSC_MEMTYPE_HOST) {
655*cd620004SJunchao Zhang     if (!link->leafbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]) {
656*cd620004SJunchao Zhang       ierr = PetscMalloc(sf->leafbuflen[PETSCSF_REMOTE]*link->unitbytes,&link->leafbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]);CHKERRQ(ierr);
657*cd620004SJunchao Zhang     }
658*cd620004SJunchao Zhang     link->leafbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST] = link->leafbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST];
659*cd620004SJunchao Zhang   }
660*cd620004SJunchao Zhang 
661*cd620004SJunchao Zhang   /* Set `current` state of the link. They may change between different SF invocations with the same link */
662*cd620004SJunchao Zhang   if (sf->persistent) { /* If data is directly passed to MPI and inits MPI requests, record the data for comparison on future invocations */
663*cd620004SJunchao Zhang     if (rootdirect_mpi) link->rootdatadirect[direction][rootmtype] = rootdata;
664*cd620004SJunchao Zhang     if (leafdirect_mpi) link->leafdatadirect[direction][leafmtype] = leafdata;
665*cd620004SJunchao Zhang   }
666*cd620004SJunchao Zhang 
667*cd620004SJunchao Zhang   link->rootdata = rootdata; /* root/leafdata are keys to look up links in PetscSFXxxEnd */
668*cd620004SJunchao Zhang   link->leafdata = leafdata;
669*cd620004SJunchao Zhang   for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) {
670*cd620004SJunchao Zhang     link->rootdirect[i] = rootdirect[i];
671*cd620004SJunchao Zhang     link->leafdirect[i] = leafdirect[i];
672*cd620004SJunchao Zhang   }
673*cd620004SJunchao Zhang   link->rootdirect_mpi  = rootdirect_mpi;
674*cd620004SJunchao Zhang   link->leafdirect_mpi  = leafdirect_mpi;
675*cd620004SJunchao Zhang   link->rootmtype       = rootmtype;
676*cd620004SJunchao Zhang   link->leafmtype       = leafmtype;
677*cd620004SJunchao Zhang   link->rootmtype_mpi   = rootmtype_mpi;
678*cd620004SJunchao Zhang   link->leafmtype_mpi   = leafmtype_mpi;
679*cd620004SJunchao Zhang 
680*cd620004SJunchao Zhang   link->next            = bas->inuse;
681*cd620004SJunchao Zhang   bas->inuse            = link;
682*cd620004SJunchao Zhang   *mylink               = link;
683*cd620004SJunchao Zhang   PetscFunctionReturn(0);
684*cd620004SJunchao Zhang }
685*cd620004SJunchao Zhang 
686*cd620004SJunchao Zhang /* Return root/leaf buffers and MPI requests attached to the link for MPI communication in the given direction.
687*cd620004SJunchao Zhang    If the sf uses persistent requests and the requests have not been initialized, then initialize them.
688*cd620004SJunchao Zhang */
689*cd620004SJunchao Zhang PetscErrorCode PetscSFLinkGetMPIBuffersAndRequests(PetscSF sf,PetscSFLink link,PetscSFDirection direction,void **rootbuf, void **leafbuf,MPI_Request **rootreqs,MPI_Request **leafreqs)
690*cd620004SJunchao Zhang {
691*cd620004SJunchao Zhang   PetscErrorCode       ierr;
692*cd620004SJunchao Zhang   PetscSF_Basic        *bas = (PetscSF_Basic*)sf->data;
693*cd620004SJunchao Zhang   PetscInt             i,j,nrootranks,ndrootranks,nleafranks,ndleafranks;
694*cd620004SJunchao Zhang   const PetscInt       *rootoffset,*leafoffset;
695*cd620004SJunchao Zhang   PetscMPIInt          n;
696*cd620004SJunchao Zhang   MPI_Aint             disp;
697*cd620004SJunchao Zhang   MPI_Comm             comm = PetscObjectComm((PetscObject)sf);
698*cd620004SJunchao Zhang   MPI_Datatype         unit = link->unit;
699*cd620004SJunchao Zhang   const PetscMemType   rootmtype_mpi = link->rootmtype_mpi,leafmtype_mpi = link->leafmtype_mpi; /* Used to select buffers passed to MPI */
700*cd620004SJunchao Zhang   const PetscInt       rootdirect_mpi = link->rootdirect_mpi,leafdirect_mpi = link->leafdirect_mpi;
701*cd620004SJunchao Zhang 
702*cd620004SJunchao Zhang   PetscFunctionBegin;
703*cd620004SJunchao Zhang   /* Init persistent MPI requests if not yet. Currently only SFBasic uses persistent MPI */
704*cd620004SJunchao Zhang   if (sf->persistent) {
705*cd620004SJunchao Zhang     if (rootreqs && bas->rootbuflen[PETSCSF_REMOTE] && !link->rootreqsinited[direction][rootmtype_mpi][rootdirect_mpi]) {
706*cd620004SJunchao Zhang       ierr = PetscSFGetRootInfo_Basic(sf,&nrootranks,&ndrootranks,NULL,&rootoffset,NULL);CHKERRQ(ierr);
707*cd620004SJunchao Zhang       if (direction == PETSCSF_LEAF2ROOT) {
708*cd620004SJunchao Zhang         for (i=ndrootranks,j=0; i<nrootranks; i++,j++) {
709*cd620004SJunchao Zhang           disp = (rootoffset[i] - rootoffset[ndrootranks])*link->unitbytes;
710*cd620004SJunchao Zhang           ierr = PetscMPIIntCast(rootoffset[i+1]-rootoffset[i],&n);CHKERRQ(ierr);
711*cd620004SJunchao Zhang           ierr = MPI_Recv_init(link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi]+disp,n,unit,bas->iranks[i],link->tag,comm,link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi]+j);CHKERRQ(ierr);
712*cd620004SJunchao Zhang         }
713*cd620004SJunchao Zhang       } else { /* PETSCSF_ROOT2LEAF */
714*cd620004SJunchao Zhang         for (i=ndrootranks,j=0; i<nrootranks; i++,j++) {
715*cd620004SJunchao Zhang           disp = (rootoffset[i] - rootoffset[ndrootranks])*link->unitbytes;
716*cd620004SJunchao Zhang           ierr = PetscMPIIntCast(rootoffset[i+1]-rootoffset[i],&n);CHKERRQ(ierr);
717*cd620004SJunchao Zhang           ierr = MPI_Send_init(link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi]+disp,n,unit,bas->iranks[i],link->tag,comm,link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi]+j);CHKERRQ(ierr);
718*cd620004SJunchao Zhang         }
719*cd620004SJunchao Zhang       }
720*cd620004SJunchao Zhang       link->rootreqsinited[direction][rootmtype_mpi][rootdirect_mpi] = PETSC_TRUE;
721*cd620004SJunchao Zhang     }
722*cd620004SJunchao Zhang 
723*cd620004SJunchao Zhang     if (leafreqs && sf->leafbuflen[PETSCSF_REMOTE] && !link->leafreqsinited[direction][leafmtype_mpi][leafdirect_mpi]) {
724*cd620004SJunchao Zhang       ierr = PetscSFGetLeafInfo_Basic(sf,&nleafranks,&ndleafranks,NULL,&leafoffset,NULL,NULL);CHKERRQ(ierr);
725*cd620004SJunchao Zhang       if (direction == PETSCSF_LEAF2ROOT) {
726*cd620004SJunchao Zhang         for (i=ndleafranks,j=0; i<nleafranks; i++,j++) {
727*cd620004SJunchao Zhang           disp = (leafoffset[i] - leafoffset[ndleafranks])*link->unitbytes;
728*cd620004SJunchao Zhang           ierr = PetscMPIIntCast(leafoffset[i+1]-leafoffset[i],&n);CHKERRQ(ierr);
729*cd620004SJunchao Zhang           ierr = MPI_Send_init(link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi]+disp,n,unit,sf->ranks[i],link->tag,comm,link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi]+j);CHKERRQ(ierr);
730*cd620004SJunchao Zhang         }
731*cd620004SJunchao Zhang       } else { /* PETSCSF_ROOT2LEAF */
732*cd620004SJunchao Zhang         for (i=ndleafranks,j=0; i<nleafranks; i++,j++) {
733*cd620004SJunchao Zhang           disp = (leafoffset[i] - leafoffset[ndleafranks])*link->unitbytes;
734*cd620004SJunchao Zhang           ierr = PetscMPIIntCast(leafoffset[i+1]-leafoffset[i],&n);CHKERRQ(ierr);
735*cd620004SJunchao Zhang           ierr = MPI_Recv_init(link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi]+disp,n,unit,sf->ranks[i],link->tag,comm,link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi]+j);CHKERRQ(ierr);
736*cd620004SJunchao Zhang         }
737*cd620004SJunchao Zhang       }
738*cd620004SJunchao Zhang       link->leafreqsinited[direction][leafmtype_mpi][leafdirect_mpi] = PETSC_TRUE;
739*cd620004SJunchao Zhang     }
740*cd620004SJunchao Zhang   }
741*cd620004SJunchao Zhang   if (rootbuf)  *rootbuf  = link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi];
742*cd620004SJunchao Zhang   if (leafbuf)  *leafbuf  = link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi];
743*cd620004SJunchao Zhang   if (rootreqs) *rootreqs = link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi];
744*cd620004SJunchao Zhang   if (leafreqs) *leafreqs = link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi];
745*cd620004SJunchao Zhang   PetscFunctionReturn(0);
746*cd620004SJunchao Zhang }
747*cd620004SJunchao Zhang 
748*cd620004SJunchao Zhang 
749*cd620004SJunchao Zhang PetscErrorCode PetscSFLinkGetInUse(PetscSF sf,MPI_Datatype unit,const void *rootdata,const void *leafdata,PetscCopyMode cmode,PetscSFLink *mylink)
750*cd620004SJunchao Zhang {
751*cd620004SJunchao Zhang   PetscErrorCode    ierr;
752*cd620004SJunchao Zhang   PetscSFLink       link,*p;
75340e23c03SJunchao Zhang   PetscSF_Basic     *bas=(PetscSF_Basic*)sf->data;
75440e23c03SJunchao Zhang 
75540e23c03SJunchao Zhang   PetscFunctionBegin;
75640e23c03SJunchao Zhang   /* Look for types in cache */
75740e23c03SJunchao Zhang   for (p=&bas->inuse; (link=*p); p=&link->next) {
75840e23c03SJunchao Zhang     PetscBool match;
75940e23c03SJunchao Zhang     ierr = MPIPetsc_Type_compare(unit,link->unit,&match);CHKERRQ(ierr);
760637e6665SJunchao Zhang     if (match && (rootdata == link->rootdata) && (leafdata == link->leafdata)) {
76140e23c03SJunchao Zhang       switch (cmode) {
76240e23c03SJunchao Zhang       case PETSC_OWN_POINTER: *p = link->next; break; /* Remove from inuse list */
76340e23c03SJunchao Zhang       case PETSC_USE_POINTER: break;
76440e23c03SJunchao Zhang       default: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"invalid cmode");
76540e23c03SJunchao Zhang       }
76640e23c03SJunchao Zhang       *mylink = link;
76740e23c03SJunchao Zhang       PetscFunctionReturn(0);
76840e23c03SJunchao Zhang     }
76940e23c03SJunchao Zhang   }
77040e23c03SJunchao Zhang   SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Could not find pack");
77140e23c03SJunchao Zhang   PetscFunctionReturn(0);
77240e23c03SJunchao Zhang }
77340e23c03SJunchao Zhang 
774*cd620004SJunchao Zhang PetscErrorCode PetscSFLinkReclaim(PetscSF sf,PetscSFLink *link)
77540e23c03SJunchao Zhang {
77640e23c03SJunchao Zhang   PetscSF_Basic     *bas = (PetscSF_Basic*)sf->data;
77740e23c03SJunchao Zhang 
77840e23c03SJunchao Zhang   PetscFunctionBegin;
779637e6665SJunchao Zhang   (*link)->rootdata = NULL;
780637e6665SJunchao Zhang   (*link)->leafdata = NULL;
78140e23c03SJunchao Zhang   (*link)->next     = bas->avail;
78240e23c03SJunchao Zhang   bas->avail        = *link;
78340e23c03SJunchao Zhang   *link             = NULL;
78440e23c03SJunchao Zhang   PetscFunctionReturn(0);
78540e23c03SJunchao Zhang }
78640e23c03SJunchao Zhang 
787*cd620004SJunchao Zhang /* Destroy all links chained in 'avail' */
788*cd620004SJunchao Zhang PetscErrorCode PetscSFLinkDestroy(PetscSF sf,PetscSFLink *avail)
789eb02082bSJunchao Zhang {
790eb02082bSJunchao Zhang   PetscErrorCode    ierr;
791*cd620004SJunchao Zhang   PetscSFLink       link = *avail,next;
792*cd620004SJunchao Zhang   PetscSF_Basic     *bas = (PetscSF_Basic*)sf->data;
793*cd620004SJunchao Zhang   PetscInt          i,nreqs = (bas->nrootreqs+sf->nleafreqs)*8;
794eb02082bSJunchao Zhang 
795eb02082bSJunchao Zhang   PetscFunctionBegin;
796eb02082bSJunchao Zhang   for (; link; link=next) {
797eb02082bSJunchao Zhang     next = link->next;
798eb02082bSJunchao Zhang     if (!link->isbuiltin) {ierr = MPI_Type_free(&link->unit);CHKERRQ(ierr);}
799*cd620004SJunchao Zhang     for (i=0; i<nreqs; i++) { /* Persistent reqs must be freed. */
800eb02082bSJunchao Zhang       if (link->reqs[i] != MPI_REQUEST_NULL) {ierr = MPI_Request_free(&link->reqs[i]);CHKERRQ(ierr);}
801eb02082bSJunchao Zhang     }
802eb02082bSJunchao Zhang     ierr = PetscFree(link->reqs);CHKERRQ(ierr);
803*cd620004SJunchao Zhang     for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) {
80451ccb202SJunchao Zhang #if defined(PETSC_HAVE_CUDA)
805*cd620004SJunchao Zhang       ierr = PetscFreeWithMemType(PETSC_MEMTYPE_DEVICE,link->rootbuf_alloc[i][PETSC_MEMTYPE_DEVICE]);CHKERRQ(ierr);
806*cd620004SJunchao Zhang       ierr = PetscFreeWithMemType(PETSC_MEMTYPE_DEVICE,link->leafbuf_alloc[i][PETSC_MEMTYPE_DEVICE]);CHKERRQ(ierr);
807eb02082bSJunchao Zhang       if (link->stream) {cudaError_t err =  cudaStreamDestroy(link->stream);CHKERRCUDA(err); link->stream = NULL;}
808eb02082bSJunchao Zhang #endif
809*cd620004SJunchao Zhang       ierr = PetscFree(link->rootbuf_alloc[i][PETSC_MEMTYPE_HOST]);CHKERRQ(ierr);
810*cd620004SJunchao Zhang       ierr = PetscFree(link->leafbuf_alloc[i][PETSC_MEMTYPE_HOST]);CHKERRQ(ierr);
811*cd620004SJunchao Zhang     }
812eb02082bSJunchao Zhang     ierr = PetscFree(link);CHKERRQ(ierr);
813eb02082bSJunchao Zhang   }
814eb02082bSJunchao Zhang   *avail = NULL;
815eb02082bSJunchao Zhang   PetscFunctionReturn(0);
816eb02082bSJunchao Zhang }
817eb02082bSJunchao Zhang 
818*cd620004SJunchao Zhang #if defined(PETSC_USE_DEBUG)
8199d1c8addSJunchao Zhang /* Error out on unsupported overlapped communications */
820*cd620004SJunchao Zhang PetscErrorCode PetscSFSetErrorOnUnsupportedOverlap(PetscSF sf,MPI_Datatype unit,const void *rootdata,const void *leafdata)
8219d1c8addSJunchao Zhang {
8229d1c8addSJunchao Zhang   PetscErrorCode    ierr;
823*cd620004SJunchao Zhang   PetscSFLink       link,*p;
8249d1c8addSJunchao Zhang   PetscSF_Basic     *bas = (PetscSF_Basic*)sf->data;
8259d1c8addSJunchao Zhang   PetscBool         match;
8269d1c8addSJunchao Zhang 
8279d1c8addSJunchao Zhang   PetscFunctionBegin;
82818fb5014SJunchao Zhang   /* Look up links in use and error out if there is a match. When both rootdata and leafdata are NULL, ignore
82918fb5014SJunchao Zhang      the potential overlapping since this process does not participate in communication. Overlapping is harmless.
83018fb5014SJunchao Zhang   */
831637e6665SJunchao Zhang   if (rootdata || leafdata) {
8329d1c8addSJunchao Zhang     for (p=&bas->inuse; (link=*p); p=&link->next) {
8339d1c8addSJunchao Zhang       ierr = MPIPetsc_Type_compare(unit,link->unit,&match);CHKERRQ(ierr);
834*cd620004SJunchao Zhang       if (match && (rootdata == link->rootdata) && (leafdata == link->leafdata)) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_SUP,"Overlapped PetscSF with the same rootdata(%p), leafdata(%p) and data type. Undo the overlapping to avoid the error.",rootdata,leafdata);
8359d1c8addSJunchao Zhang     }
83618fb5014SJunchao Zhang   }
8379d1c8addSJunchao Zhang   PetscFunctionReturn(0);
8389d1c8addSJunchao Zhang }
839*cd620004SJunchao Zhang #endif
8409d1c8addSJunchao Zhang 
841*cd620004SJunchao Zhang PetscErrorCode PetscSFLinkSetUp_Host(PetscSF sf,PetscSFLink link,MPI_Datatype unit)
84240e23c03SJunchao Zhang {
84340e23c03SJunchao Zhang   PetscErrorCode ierr;
844b23bfdefSJunchao Zhang   PetscInt       nSignedChar=0,nUnsignedChar=0,nInt=0,nPetscInt=0,nPetscReal=0;
845b23bfdefSJunchao Zhang   PetscBool      is2Int,is2PetscInt;
84640e23c03SJunchao Zhang   PetscMPIInt    ni,na,nd,combiner;
84740e23c03SJunchao Zhang #if defined(PETSC_HAVE_COMPLEX)
848b23bfdefSJunchao Zhang   PetscInt       nPetscComplex=0;
84940e23c03SJunchao Zhang #endif
85040e23c03SJunchao Zhang 
85140e23c03SJunchao Zhang   PetscFunctionBegin;
852b23bfdefSJunchao Zhang   ierr = MPIPetsc_Type_compare_contig(unit,MPI_SIGNED_CHAR,  &nSignedChar);CHKERRQ(ierr);
853b23bfdefSJunchao Zhang   ierr = MPIPetsc_Type_compare_contig(unit,MPI_UNSIGNED_CHAR,&nUnsignedChar);CHKERRQ(ierr);
854b23bfdefSJunchao Zhang   /* MPI_CHAR is treated below as a dumb type that does not support reduction according to MPI standard */
855b23bfdefSJunchao Zhang   ierr = MPIPetsc_Type_compare_contig(unit,MPI_INT,  &nInt);CHKERRQ(ierr);
856b23bfdefSJunchao Zhang   ierr = MPIPetsc_Type_compare_contig(unit,MPIU_INT, &nPetscInt);CHKERRQ(ierr);
857b23bfdefSJunchao Zhang   ierr = MPIPetsc_Type_compare_contig(unit,MPIU_REAL,&nPetscReal);CHKERRQ(ierr);
85840e23c03SJunchao Zhang #if defined(PETSC_HAVE_COMPLEX)
859b23bfdefSJunchao Zhang   ierr = MPIPetsc_Type_compare_contig(unit,MPIU_COMPLEX,&nPetscComplex);CHKERRQ(ierr);
86040e23c03SJunchao Zhang #endif
86140e23c03SJunchao Zhang   ierr = MPIPetsc_Type_compare(unit,MPI_2INT,&is2Int);CHKERRQ(ierr);
86240e23c03SJunchao Zhang   ierr = MPIPetsc_Type_compare(unit,MPIU_2INT,&is2PetscInt);CHKERRQ(ierr);
863b23bfdefSJunchao Zhang   /* TODO: shaell we also handle Fortran MPI_2REAL? */
86440e23c03SJunchao Zhang   ierr = MPI_Type_get_envelope(unit,&ni,&na,&nd,&combiner);CHKERRQ(ierr);
8655ad15460SJunchao Zhang   link->isbuiltin = (combiner == MPI_COMBINER_NAMED) ? PETSC_TRUE : PETSC_FALSE; /* unit is MPI builtin */
866b23bfdefSJunchao Zhang   link->bs = 1; /* default */
86740e23c03SJunchao Zhang 
868eb02082bSJunchao Zhang   if (is2Int) {
869*cd620004SJunchao Zhang     PackInit_PairType_int_int_1_1(link);
870eb02082bSJunchao Zhang     link->bs        = 1;
871eb02082bSJunchao Zhang     link->unitbytes = 2*sizeof(int);
8725ad15460SJunchao Zhang     link->isbuiltin = PETSC_TRUE; /* unit is PETSc builtin */
873eb02082bSJunchao Zhang     link->basicunit = MPI_2INT;
8745ad15460SJunchao Zhang     link->unit      = MPI_2INT;
875eb02082bSJunchao Zhang   } else if (is2PetscInt) { /* TODO: when is2PetscInt and nPetscInt=2, we don't know which path to take. The two paths support different ops. */
876*cd620004SJunchao Zhang     PackInit_PairType_PetscInt_PetscInt_1_1(link);
877eb02082bSJunchao Zhang     link->bs        = 1;
878eb02082bSJunchao Zhang     link->unitbytes = 2*sizeof(PetscInt);
879eb02082bSJunchao Zhang     link->basicunit = MPIU_2INT;
8805ad15460SJunchao Zhang     link->isbuiltin = PETSC_TRUE; /* unit is PETSc builtin */
8815ad15460SJunchao Zhang     link->unit      = MPIU_2INT;
882eb02082bSJunchao Zhang   } else if (nPetscReal) {
883b23bfdefSJunchao Zhang     if      (nPetscReal == 8) PackInit_RealType_PetscReal_8_1(link); else if (nPetscReal%8 == 0) PackInit_RealType_PetscReal_8_0(link);
884b23bfdefSJunchao Zhang     else if (nPetscReal == 4) PackInit_RealType_PetscReal_4_1(link); else if (nPetscReal%4 == 0) PackInit_RealType_PetscReal_4_0(link);
885b23bfdefSJunchao Zhang     else if (nPetscReal == 2) PackInit_RealType_PetscReal_2_1(link); else if (nPetscReal%2 == 0) PackInit_RealType_PetscReal_2_0(link);
886b23bfdefSJunchao Zhang     else if (nPetscReal == 1) PackInit_RealType_PetscReal_1_1(link); else if (nPetscReal%1 == 0) PackInit_RealType_PetscReal_1_0(link);
887b23bfdefSJunchao Zhang     link->bs        = nPetscReal;
888eb02082bSJunchao Zhang     link->unitbytes = nPetscReal*sizeof(PetscReal);
88940e23c03SJunchao Zhang     link->basicunit = MPIU_REAL;
8905ad15460SJunchao Zhang     if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPIU_REAL;}
891b23bfdefSJunchao Zhang   } else if (nPetscInt) {
892b23bfdefSJunchao Zhang     if      (nPetscInt == 8) PackInit_IntegerType_PetscInt_8_1(link); else if (nPetscInt%8 == 0) PackInit_IntegerType_PetscInt_8_0(link);
893b23bfdefSJunchao Zhang     else if (nPetscInt == 4) PackInit_IntegerType_PetscInt_4_1(link); else if (nPetscInt%4 == 0) PackInit_IntegerType_PetscInt_4_0(link);
894b23bfdefSJunchao Zhang     else if (nPetscInt == 2) PackInit_IntegerType_PetscInt_2_1(link); else if (nPetscInt%2 == 0) PackInit_IntegerType_PetscInt_2_0(link);
895b23bfdefSJunchao Zhang     else if (nPetscInt == 1) PackInit_IntegerType_PetscInt_1_1(link); else if (nPetscInt%1 == 0) PackInit_IntegerType_PetscInt_1_0(link);
896b23bfdefSJunchao Zhang     link->bs        = nPetscInt;
897eb02082bSJunchao Zhang     link->unitbytes = nPetscInt*sizeof(PetscInt);
898b23bfdefSJunchao Zhang     link->basicunit = MPIU_INT;
8995ad15460SJunchao Zhang     if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPIU_INT;}
900b23bfdefSJunchao Zhang #if defined(PETSC_USE_64BIT_INDICES)
901b23bfdefSJunchao Zhang   } else if (nInt) {
902b23bfdefSJunchao Zhang     if      (nInt == 8) PackInit_IntegerType_int_8_1(link); else if (nInt%8 == 0) PackInit_IntegerType_int_8_0(link);
903b23bfdefSJunchao Zhang     else if (nInt == 4) PackInit_IntegerType_int_4_1(link); else if (nInt%4 == 0) PackInit_IntegerType_int_4_0(link);
904b23bfdefSJunchao Zhang     else if (nInt == 2) PackInit_IntegerType_int_2_1(link); else if (nInt%2 == 0) PackInit_IntegerType_int_2_0(link);
905b23bfdefSJunchao Zhang     else if (nInt == 1) PackInit_IntegerType_int_1_1(link); else if (nInt%1 == 0) PackInit_IntegerType_int_1_0(link);
906b23bfdefSJunchao Zhang     link->bs        = nInt;
907eb02082bSJunchao Zhang     link->unitbytes = nInt*sizeof(int);
908b23bfdefSJunchao Zhang     link->basicunit = MPI_INT;
9095ad15460SJunchao Zhang     if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPI_INT;}
910b23bfdefSJunchao Zhang #endif
911b23bfdefSJunchao Zhang   } else if (nSignedChar) {
912b23bfdefSJunchao Zhang     if      (nSignedChar == 8) PackInit_IntegerType_SignedChar_8_1(link); else if (nSignedChar%8 == 0) PackInit_IntegerType_SignedChar_8_0(link);
913b23bfdefSJunchao Zhang     else if (nSignedChar == 4) PackInit_IntegerType_SignedChar_4_1(link); else if (nSignedChar%4 == 0) PackInit_IntegerType_SignedChar_4_0(link);
914b23bfdefSJunchao Zhang     else if (nSignedChar == 2) PackInit_IntegerType_SignedChar_2_1(link); else if (nSignedChar%2 == 0) PackInit_IntegerType_SignedChar_2_0(link);
915b23bfdefSJunchao Zhang     else if (nSignedChar == 1) PackInit_IntegerType_SignedChar_1_1(link); else if (nSignedChar%1 == 0) PackInit_IntegerType_SignedChar_1_0(link);
916b23bfdefSJunchao Zhang     link->bs        = nSignedChar;
917eb02082bSJunchao Zhang     link->unitbytes = nSignedChar*sizeof(SignedChar);
918b23bfdefSJunchao Zhang     link->basicunit = MPI_SIGNED_CHAR;
9195ad15460SJunchao Zhang     if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPI_SIGNED_CHAR;}
920b23bfdefSJunchao Zhang   }  else if (nUnsignedChar) {
921b23bfdefSJunchao Zhang     if      (nUnsignedChar == 8) PackInit_IntegerType_UnsignedChar_8_1(link); else if (nUnsignedChar%8 == 0) PackInit_IntegerType_UnsignedChar_8_0(link);
922b23bfdefSJunchao Zhang     else if (nUnsignedChar == 4) PackInit_IntegerType_UnsignedChar_4_1(link); else if (nUnsignedChar%4 == 0) PackInit_IntegerType_UnsignedChar_4_0(link);
923b23bfdefSJunchao Zhang     else if (nUnsignedChar == 2) PackInit_IntegerType_UnsignedChar_2_1(link); else if (nUnsignedChar%2 == 0) PackInit_IntegerType_UnsignedChar_2_0(link);
924b23bfdefSJunchao Zhang     else if (nUnsignedChar == 1) PackInit_IntegerType_UnsignedChar_1_1(link); else if (nUnsignedChar%1 == 0) PackInit_IntegerType_UnsignedChar_1_0(link);
925b23bfdefSJunchao Zhang     link->bs        = nUnsignedChar;
926eb02082bSJunchao Zhang     link->unitbytes = nUnsignedChar*sizeof(UnsignedChar);
927b23bfdefSJunchao Zhang     link->basicunit = MPI_UNSIGNED_CHAR;
9285ad15460SJunchao Zhang     if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPI_UNSIGNED_CHAR;}
92940e23c03SJunchao Zhang #if defined(PETSC_HAVE_COMPLEX)
930b23bfdefSJunchao Zhang   } else if (nPetscComplex) {
931b23bfdefSJunchao Zhang     if      (nPetscComplex == 8) PackInit_ComplexType_PetscComplex_8_1(link); else if (nPetscComplex%8 == 0) PackInit_ComplexType_PetscComplex_8_0(link);
932b23bfdefSJunchao Zhang     else if (nPetscComplex == 4) PackInit_ComplexType_PetscComplex_4_1(link); else if (nPetscComplex%4 == 0) PackInit_ComplexType_PetscComplex_4_0(link);
933b23bfdefSJunchao Zhang     else if (nPetscComplex == 2) PackInit_ComplexType_PetscComplex_2_1(link); else if (nPetscComplex%2 == 0) PackInit_ComplexType_PetscComplex_2_0(link);
934b23bfdefSJunchao Zhang     else if (nPetscComplex == 1) PackInit_ComplexType_PetscComplex_1_1(link); else if (nPetscComplex%1 == 0) PackInit_ComplexType_PetscComplex_1_0(link);
935b23bfdefSJunchao Zhang     link->bs        = nPetscComplex;
936eb02082bSJunchao Zhang     link->unitbytes = nPetscComplex*sizeof(PetscComplex);
93740e23c03SJunchao Zhang     link->basicunit = MPIU_COMPLEX;
9385ad15460SJunchao Zhang     if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPIU_COMPLEX;}
93940e23c03SJunchao Zhang #endif
94040e23c03SJunchao Zhang   } else {
941b23bfdefSJunchao Zhang     MPI_Aint lb,nbyte;
942b23bfdefSJunchao Zhang     ierr = MPI_Type_get_extent(unit,&lb,&nbyte);CHKERRQ(ierr);
94340e23c03SJunchao Zhang     if (lb != 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Datatype with nonzero lower bound %ld\n",(long)lb);
944eb02082bSJunchao Zhang     if (nbyte % sizeof(int)) { /* If the type size is not multiple of int */
945eb02082bSJunchao Zhang       if      (nbyte == 4) PackInit_DumbType_char_4_1(link); else if (nbyte%4 == 0) PackInit_DumbType_char_4_0(link);
946eb02082bSJunchao Zhang       else if (nbyte == 2) PackInit_DumbType_char_2_1(link); else if (nbyte%2 == 0) PackInit_DumbType_char_2_0(link);
947eb02082bSJunchao Zhang       else if (nbyte == 1) PackInit_DumbType_char_1_1(link); else if (nbyte%1 == 0) PackInit_DumbType_char_1_0(link);
948eb02082bSJunchao Zhang       link->bs        = nbyte;
949b23bfdefSJunchao Zhang       link->unitbytes = nbyte;
950b23bfdefSJunchao Zhang       link->basicunit = MPI_BYTE;
95140e23c03SJunchao Zhang     } else {
952eb02082bSJunchao Zhang       nInt = nbyte / sizeof(int);
953eb02082bSJunchao Zhang       if      (nInt == 8) PackInit_DumbType_DumbInt_8_1(link); else if (nInt%8 == 0) PackInit_DumbType_DumbInt_8_0(link);
954eb02082bSJunchao Zhang       else if (nInt == 4) PackInit_DumbType_DumbInt_4_1(link); else if (nInt%4 == 0) PackInit_DumbType_DumbInt_4_0(link);
955eb02082bSJunchao Zhang       else if (nInt == 2) PackInit_DumbType_DumbInt_2_1(link); else if (nInt%2 == 0) PackInit_DumbType_DumbInt_2_0(link);
956eb02082bSJunchao Zhang       else if (nInt == 1) PackInit_DumbType_DumbInt_1_1(link); else if (nInt%1 == 0) PackInit_DumbType_DumbInt_1_0(link);
957eb02082bSJunchao Zhang       link->bs        = nInt;
958b23bfdefSJunchao Zhang       link->unitbytes = nbyte;
95940e23c03SJunchao Zhang       link->basicunit = MPI_INT;
96040e23c03SJunchao Zhang     }
9615ad15460SJunchao Zhang     if (link->isbuiltin) link->unit = unit;
96240e23c03SJunchao Zhang   }
963b23bfdefSJunchao Zhang 
9645ad15460SJunchao Zhang   if (!link->isbuiltin) {ierr = MPI_Type_dup(unit,&link->unit);CHKERRQ(ierr);}
96540e23c03SJunchao Zhang   PetscFunctionReturn(0);
96640e23c03SJunchao Zhang }
96740e23c03SJunchao Zhang 
968*cd620004SJunchao Zhang PetscErrorCode PetscSFLinkGetUnpackAndOp(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*))
96940e23c03SJunchao Zhang {
97040e23c03SJunchao Zhang   PetscFunctionBegin;
97140e23c03SJunchao Zhang   *UnpackAndOp = NULL;
972eb02082bSJunchao Zhang   if (mtype == PETSC_MEMTYPE_HOST) {
973eb02082bSJunchao Zhang     if      (op == MPIU_REPLACE)              *UnpackAndOp = link->h_UnpackAndInsert;
974eb02082bSJunchao Zhang     else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->h_UnpackAndAdd;
975eb02082bSJunchao Zhang     else if (op == MPI_PROD)                  *UnpackAndOp = link->h_UnpackAndMult;
976eb02082bSJunchao Zhang     else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->h_UnpackAndMax;
977eb02082bSJunchao Zhang     else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->h_UnpackAndMin;
978eb02082bSJunchao Zhang     else if (op == MPI_LAND)                  *UnpackAndOp = link->h_UnpackAndLAND;
979eb02082bSJunchao Zhang     else if (op == MPI_BAND)                  *UnpackAndOp = link->h_UnpackAndBAND;
980eb02082bSJunchao Zhang     else if (op == MPI_LOR)                   *UnpackAndOp = link->h_UnpackAndLOR;
981eb02082bSJunchao Zhang     else if (op == MPI_BOR)                   *UnpackAndOp = link->h_UnpackAndBOR;
982eb02082bSJunchao Zhang     else if (op == MPI_LXOR)                  *UnpackAndOp = link->h_UnpackAndLXOR;
983eb02082bSJunchao Zhang     else if (op == MPI_BXOR)                  *UnpackAndOp = link->h_UnpackAndBXOR;
984eb02082bSJunchao Zhang     else if (op == MPI_MAXLOC)                *UnpackAndOp = link->h_UnpackAndMaxloc;
985eb02082bSJunchao Zhang     else if (op == MPI_MINLOC)                *UnpackAndOp = link->h_UnpackAndMinloc;
986eb02082bSJunchao Zhang   }
987eb02082bSJunchao Zhang #if defined(PETSC_HAVE_CUDA)
988eb02082bSJunchao Zhang   else if (mtype == PETSC_MEMTYPE_DEVICE && !atomic) {
989eb02082bSJunchao Zhang     if      (op == MPIU_REPLACE)              *UnpackAndOp = link->d_UnpackAndInsert;
990eb02082bSJunchao Zhang     else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->d_UnpackAndAdd;
991eb02082bSJunchao Zhang     else if (op == MPI_PROD)                  *UnpackAndOp = link->d_UnpackAndMult;
992eb02082bSJunchao Zhang     else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->d_UnpackAndMax;
993eb02082bSJunchao Zhang     else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->d_UnpackAndMin;
994eb02082bSJunchao Zhang     else if (op == MPI_LAND)                  *UnpackAndOp = link->d_UnpackAndLAND;
995eb02082bSJunchao Zhang     else if (op == MPI_BAND)                  *UnpackAndOp = link->d_UnpackAndBAND;
996eb02082bSJunchao Zhang     else if (op == MPI_LOR)                   *UnpackAndOp = link->d_UnpackAndLOR;
997eb02082bSJunchao Zhang     else if (op == MPI_BOR)                   *UnpackAndOp = link->d_UnpackAndBOR;
998eb02082bSJunchao Zhang     else if (op == MPI_LXOR)                  *UnpackAndOp = link->d_UnpackAndLXOR;
999eb02082bSJunchao Zhang     else if (op == MPI_BXOR)                  *UnpackAndOp = link->d_UnpackAndBXOR;
1000eb02082bSJunchao Zhang     else if (op == MPI_MAXLOC)                *UnpackAndOp = link->d_UnpackAndMaxloc;
1001eb02082bSJunchao Zhang     else if (op == MPI_MINLOC)                *UnpackAndOp = link->d_UnpackAndMinloc;
1002eb02082bSJunchao Zhang   } else if (mtype == PETSC_MEMTYPE_DEVICE && atomic) {
1003eb02082bSJunchao Zhang     if      (op == MPIU_REPLACE)              *UnpackAndOp = link->da_UnpackAndInsert;
1004eb02082bSJunchao Zhang     else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->da_UnpackAndAdd;
1005eb02082bSJunchao Zhang     else if (op == MPI_PROD)                  *UnpackAndOp = link->da_UnpackAndMult;
1006eb02082bSJunchao Zhang     else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->da_UnpackAndMax;
1007eb02082bSJunchao Zhang     else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->da_UnpackAndMin;
1008eb02082bSJunchao Zhang     else if (op == MPI_LAND)                  *UnpackAndOp = link->da_UnpackAndLAND;
1009eb02082bSJunchao Zhang     else if (op == MPI_BAND)                  *UnpackAndOp = link->da_UnpackAndBAND;
1010eb02082bSJunchao Zhang     else if (op == MPI_LOR)                   *UnpackAndOp = link->da_UnpackAndLOR;
1011eb02082bSJunchao Zhang     else if (op == MPI_BOR)                   *UnpackAndOp = link->da_UnpackAndBOR;
1012eb02082bSJunchao Zhang     else if (op == MPI_LXOR)                  *UnpackAndOp = link->da_UnpackAndLXOR;
1013eb02082bSJunchao Zhang     else if (op == MPI_BXOR)                  *UnpackAndOp = link->da_UnpackAndBXOR;
1014eb02082bSJunchao Zhang     else if (op == MPI_MAXLOC)                *UnpackAndOp = link->da_UnpackAndMaxloc;
1015eb02082bSJunchao Zhang     else if (op == MPI_MINLOC)                *UnpackAndOp = link->da_UnpackAndMinloc;
1016eb02082bSJunchao Zhang   }
1017eb02082bSJunchao Zhang #endif
101840e23c03SJunchao Zhang   PetscFunctionReturn(0);
101940e23c03SJunchao Zhang }
102040e23c03SJunchao Zhang 
1021*cd620004SJunchao Zhang PetscErrorCode PetscSFLinkGetScatterAndOp(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**ScatterAndOp)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*))
102240e23c03SJunchao Zhang {
102340e23c03SJunchao Zhang   PetscFunctionBegin;
1024*cd620004SJunchao Zhang   *ScatterAndOp = NULL;
1025eb02082bSJunchao Zhang   if (mtype == PETSC_MEMTYPE_HOST) {
1026*cd620004SJunchao Zhang     if      (op == MPIU_REPLACE)              *ScatterAndOp = link->h_ScatterAndInsert;
1027*cd620004SJunchao Zhang     else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->h_ScatterAndAdd;
1028*cd620004SJunchao Zhang     else if (op == MPI_PROD)                  *ScatterAndOp = link->h_ScatterAndMult;
1029*cd620004SJunchao Zhang     else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->h_ScatterAndMax;
1030*cd620004SJunchao Zhang     else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->h_ScatterAndMin;
1031*cd620004SJunchao Zhang     else if (op == MPI_LAND)                  *ScatterAndOp = link->h_ScatterAndLAND;
1032*cd620004SJunchao Zhang     else if (op == MPI_BAND)                  *ScatterAndOp = link->h_ScatterAndBAND;
1033*cd620004SJunchao Zhang     else if (op == MPI_LOR)                   *ScatterAndOp = link->h_ScatterAndLOR;
1034*cd620004SJunchao Zhang     else if (op == MPI_BOR)                   *ScatterAndOp = link->h_ScatterAndBOR;
1035*cd620004SJunchao Zhang     else if (op == MPI_LXOR)                  *ScatterAndOp = link->h_ScatterAndLXOR;
1036*cd620004SJunchao Zhang     else if (op == MPI_BXOR)                  *ScatterAndOp = link->h_ScatterAndBXOR;
1037*cd620004SJunchao Zhang     else if (op == MPI_MAXLOC)                *ScatterAndOp = link->h_ScatterAndMaxloc;
1038*cd620004SJunchao Zhang     else if (op == MPI_MINLOC)                *ScatterAndOp = link->h_ScatterAndMinloc;
1039eb02082bSJunchao Zhang   }
1040eb02082bSJunchao Zhang #if defined(PETSC_HAVE_CUDA)
1041eb02082bSJunchao Zhang   else if (mtype == PETSC_MEMTYPE_DEVICE && !atomic) {
1042*cd620004SJunchao Zhang     if      (op == MPIU_REPLACE)              *ScatterAndOp = link->d_ScatterAndInsert;
1043*cd620004SJunchao Zhang     else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->d_ScatterAndAdd;
1044*cd620004SJunchao Zhang     else if (op == MPI_PROD)                  *ScatterAndOp = link->d_ScatterAndMult;
1045*cd620004SJunchao Zhang     else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->d_ScatterAndMax;
1046*cd620004SJunchao Zhang     else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->d_ScatterAndMin;
1047*cd620004SJunchao Zhang     else if (op == MPI_LAND)                  *ScatterAndOp = link->d_ScatterAndLAND;
1048*cd620004SJunchao Zhang     else if (op == MPI_BAND)                  *ScatterAndOp = link->d_ScatterAndBAND;
1049*cd620004SJunchao Zhang     else if (op == MPI_LOR)                   *ScatterAndOp = link->d_ScatterAndLOR;
1050*cd620004SJunchao Zhang     else if (op == MPI_BOR)                   *ScatterAndOp = link->d_ScatterAndBOR;
1051*cd620004SJunchao Zhang     else if (op == MPI_LXOR)                  *ScatterAndOp = link->d_ScatterAndLXOR;
1052*cd620004SJunchao Zhang     else if (op == MPI_BXOR)                  *ScatterAndOp = link->d_ScatterAndBXOR;
1053*cd620004SJunchao Zhang     else if (op == MPI_MAXLOC)                *ScatterAndOp = link->d_ScatterAndMaxloc;
1054*cd620004SJunchao Zhang     else if (op == MPI_MINLOC)                *ScatterAndOp = link->d_ScatterAndMinloc;
1055eb02082bSJunchao Zhang   } else if (mtype == PETSC_MEMTYPE_DEVICE && atomic) {
1056*cd620004SJunchao Zhang     if      (op == MPIU_REPLACE)              *ScatterAndOp = link->da_ScatterAndInsert;
1057*cd620004SJunchao Zhang     else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->da_ScatterAndAdd;
1058*cd620004SJunchao Zhang     else if (op == MPI_PROD)                  *ScatterAndOp = link->da_ScatterAndMult;
1059*cd620004SJunchao Zhang     else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->da_ScatterAndMax;
1060*cd620004SJunchao Zhang     else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->da_ScatterAndMin;
1061*cd620004SJunchao Zhang     else if (op == MPI_LAND)                  *ScatterAndOp = link->da_ScatterAndLAND;
1062*cd620004SJunchao Zhang     else if (op == MPI_BAND)                  *ScatterAndOp = link->da_ScatterAndBAND;
1063*cd620004SJunchao Zhang     else if (op == MPI_LOR)                   *ScatterAndOp = link->da_ScatterAndLOR;
1064*cd620004SJunchao Zhang     else if (op == MPI_BOR)                   *ScatterAndOp = link->da_ScatterAndBOR;
1065*cd620004SJunchao Zhang     else if (op == MPI_LXOR)                  *ScatterAndOp = link->da_ScatterAndLXOR;
1066*cd620004SJunchao Zhang     else if (op == MPI_BXOR)                  *ScatterAndOp = link->da_ScatterAndBXOR;
1067*cd620004SJunchao Zhang     else if (op == MPI_MAXLOC)                *ScatterAndOp = link->da_ScatterAndMaxloc;
1068*cd620004SJunchao Zhang     else if (op == MPI_MINLOC)                *ScatterAndOp = link->da_ScatterAndMinloc;
1069eb02082bSJunchao Zhang   }
1070eb02082bSJunchao Zhang #endif
1071*cd620004SJunchao Zhang   PetscFunctionReturn(0);
1072*cd620004SJunchao Zhang }
1073*cd620004SJunchao Zhang 
1074*cd620004SJunchao Zhang PetscErrorCode PetscSFLinkGetFetchAndOp(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**FetchAndOp)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,void*))
1075*cd620004SJunchao Zhang {
1076*cd620004SJunchao Zhang   PetscFunctionBegin;
1077*cd620004SJunchao Zhang   *FetchAndOp = NULL;
1078*cd620004SJunchao Zhang   if (op != MPI_SUM && op != MPIU_SUM) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for MPI_Op in FetchAndOp");
1079*cd620004SJunchao Zhang   if (mtype == PETSC_MEMTYPE_HOST) *FetchAndOp = link->h_FetchAndAdd;
1080*cd620004SJunchao Zhang #if defined(PETSC_HAVE_CUDA)
1081*cd620004SJunchao Zhang   else if (mtype == PETSC_MEMTYPE_DEVICE && !atomic) *FetchAndOp = link->d_FetchAndAdd;
1082*cd620004SJunchao Zhang   else if (mtype == PETSC_MEMTYPE_DEVICE && atomic)  *FetchAndOp = link->da_FetchAndAdd;
1083*cd620004SJunchao Zhang #endif
1084*cd620004SJunchao Zhang   PetscFunctionReturn(0);
1085*cd620004SJunchao Zhang }
1086*cd620004SJunchao Zhang 
1087*cd620004SJunchao Zhang PetscErrorCode PetscSFLinkGetFetchAndOpLocal(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**FetchAndOpLocal)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,void*,PetscInt,const PetscInt*,const void*,void*))
1088*cd620004SJunchao Zhang {
1089*cd620004SJunchao Zhang   PetscFunctionBegin;
1090*cd620004SJunchao Zhang   *FetchAndOpLocal = NULL;
1091*cd620004SJunchao Zhang   if (op != MPI_SUM && op != MPIU_SUM) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for MPI_Op in FetchAndOp");
1092*cd620004SJunchao Zhang   if (mtype == PETSC_MEMTYPE_HOST) *FetchAndOpLocal = link->h_FetchAndAddLocal;
1093*cd620004SJunchao Zhang #if defined(PETSC_HAVE_CUDA)
1094*cd620004SJunchao Zhang   else if (mtype == PETSC_MEMTYPE_DEVICE && !atomic) *FetchAndOpLocal = link->d_FetchAndAddLocal;
1095*cd620004SJunchao Zhang   else if (mtype == PETSC_MEMTYPE_DEVICE && atomic)  *FetchAndOpLocal = link->da_FetchAndAddLocal;
1096*cd620004SJunchao Zhang #endif
1097*cd620004SJunchao Zhang   PetscFunctionReturn(0);
1098*cd620004SJunchao Zhang }
1099*cd620004SJunchao Zhang 
1100*cd620004SJunchao Zhang /*=============================================================================
1101*cd620004SJunchao Zhang               A set of helper routines for Pack/Unpack/Scatter on GPUs
1102*cd620004SJunchao Zhang  ============================================================================*/
1103*cd620004SJunchao Zhang #if defined(PETSC_HAVE_CUDA)
1104*cd620004SJunchao Zhang /* If SF does not know which stream root/leafdata is being computed on, it has to sync the device to
1105*cd620004SJunchao Zhang    make sure the data is ready for packing.
1106*cd620004SJunchao Zhang  */
1107*cd620004SJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkSyncDeviceBeforePackData(PetscSF sf,PetscSFLink link)
1108*cd620004SJunchao Zhang {
1109*cd620004SJunchao Zhang   PetscFunctionBegin;
1110*cd620004SJunchao Zhang   if (sf->use_default_stream) PetscFunctionReturn(0);
1111*cd620004SJunchao Zhang   if (link->rootmtype == PETSC_MEMTYPE_DEVICE || link->leafmtype == PETSC_MEMTYPE_DEVICE) {
1112*cd620004SJunchao Zhang     cudaError_t cerr = cudaDeviceSynchronize();CHKERRCUDA(cerr);
1113*cd620004SJunchao Zhang   }
1114*cd620004SJunchao Zhang   PetscFunctionReturn(0);
1115*cd620004SJunchao Zhang }
1116*cd620004SJunchao Zhang 
1117*cd620004SJunchao Zhang /* PetscSFLinkSyncStreamAfterPackXxxData routines make sure root/leafbuf for the remote is ready for MPI */
1118*cd620004SJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkSyncStreamAfterPackRootData(PetscSF sf,PetscSFLink link)
1119*cd620004SJunchao Zhang {
1120*cd620004SJunchao Zhang   PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
1121*cd620004SJunchao Zhang 
1122*cd620004SJunchao Zhang   PetscFunctionBegin;
1123*cd620004SJunchao Zhang   /* rootdata is on device && send non-empty to remote && (we called a packing kernel || we async-copied rootdata from device to host) */
1124*cd620004SJunchao Zhang   if (link->rootmtype == PETSC_MEMTYPE_DEVICE && bas->rootbuflen[PETSCSF_REMOTE] && (!link->rootdirect[PETSCSF_REMOTE] || !use_gpu_aware_mpi)) {
1125*cd620004SJunchao Zhang     cudaError_t cerr = cudaStreamSynchronize(link->stream);CHKERRCUDA(cerr);
1126*cd620004SJunchao Zhang   }
1127*cd620004SJunchao Zhang   PetscFunctionReturn(0);
1128*cd620004SJunchao Zhang }
1129*cd620004SJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkSyncStreamAfterPackLeafData(PetscSF sf,PetscSFLink link)
1130*cd620004SJunchao Zhang {
1131*cd620004SJunchao Zhang   PetscFunctionBegin;
1132*cd620004SJunchao Zhang   if (link->leafmtype == PETSC_MEMTYPE_DEVICE && sf->leafbuflen[PETSCSF_REMOTE] && (!link->leafdirect[PETSCSF_REMOTE] || !use_gpu_aware_mpi)) {
1133*cd620004SJunchao Zhang     cudaError_t cerr = cudaStreamSynchronize(link->stream);CHKERRCUDA(cerr);
1134*cd620004SJunchao Zhang   }
1135*cd620004SJunchao Zhang   PetscFunctionReturn(0);
1136*cd620004SJunchao Zhang }
1137*cd620004SJunchao Zhang 
1138*cd620004SJunchao Zhang /* PetscSFLinkSyncStreamAfterUnpackXxx routines make sure root/leafdata (local & remote) is ready to use for SF callers, when SF
1139*cd620004SJunchao Zhang    does not know which stream the callers will use.
1140*cd620004SJunchao Zhang */
1141*cd620004SJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkSyncStreamAfterUnpackRootData(PetscSF sf,PetscSFLink link)
1142*cd620004SJunchao Zhang {
1143*cd620004SJunchao Zhang   PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
1144*cd620004SJunchao Zhang 
1145*cd620004SJunchao Zhang   PetscFunctionBegin;
1146*cd620004SJunchao Zhang   if (sf->use_default_stream) PetscFunctionReturn(0);
1147*cd620004SJunchao Zhang   if (link->rootmtype == PETSC_MEMTYPE_DEVICE && (bas->rootbuflen[PETSCSF_LOCAL] || bas->rootbuflen[PETSCSF_REMOTE])) {
1148*cd620004SJunchao Zhang     cudaError_t cerr = cudaStreamSynchronize(link->stream);CHKERRCUDA(cerr);
1149*cd620004SJunchao Zhang   }
1150*cd620004SJunchao Zhang   PetscFunctionReturn(0);
1151*cd620004SJunchao Zhang }
1152*cd620004SJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkSyncStreamAfterUnpackLeafData(PetscSF sf,PetscSFLink link)
1153*cd620004SJunchao Zhang {
1154*cd620004SJunchao Zhang   PetscFunctionBegin;
1155*cd620004SJunchao Zhang   if (sf->use_default_stream) PetscFunctionReturn(0);
1156*cd620004SJunchao Zhang   if (link->leafmtype == PETSC_MEMTYPE_DEVICE && (sf->leafbuflen[PETSCSF_LOCAL] || sf->leafbuflen[PETSCSF_REMOTE])) {
1157*cd620004SJunchao Zhang     cudaError_t cerr = cudaStreamSynchronize(link->stream);CHKERRCUDA(cerr);
1158*cd620004SJunchao Zhang   }
1159*cd620004SJunchao Zhang   PetscFunctionReturn(0);
1160*cd620004SJunchao Zhang }
1161*cd620004SJunchao Zhang 
1162*cd620004SJunchao Zhang /* PetscSFLinkCopyXxxxBufferInCaseNotUseGpuAwareMPI routines are simple: if not use_gpu_aware_mpi, we need
1163*cd620004SJunchao Zhang    to copy the buffer from GPU to CPU before MPI calls, and from CPU to GPU after MPI calls.
1164*cd620004SJunchao Zhang */
1165*cd620004SJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(PetscSF sf,PetscSFLink link,PetscBool device2host)
1166*cd620004SJunchao Zhang {
1167*cd620004SJunchao Zhang   PetscErrorCode ierr;
1168*cd620004SJunchao Zhang   cudaError_t    cerr;
1169*cd620004SJunchao Zhang   PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
1170*cd620004SJunchao Zhang 
1171*cd620004SJunchao Zhang   PetscFunctionBegin;
1172*cd620004SJunchao Zhang   if (link->rootmtype == PETSC_MEMTYPE_DEVICE && (link->rootmtype_mpi != link->rootmtype) && bas->rootbuflen[PETSCSF_REMOTE]) {
1173*cd620004SJunchao Zhang     void  *h_buf = link->rootbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST];
1174*cd620004SJunchao Zhang     void  *d_buf = link->rootbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_DEVICE];
1175*cd620004SJunchao Zhang     size_t count = bas->rootbuflen[PETSCSF_REMOTE]*link->unitbytes;
1176*cd620004SJunchao Zhang     if (device2host) {
1177*cd620004SJunchao Zhang       cerr = cudaMemcpyAsync(h_buf,d_buf,count,cudaMemcpyDeviceToHost,link->stream);CHKERRCUDA(cerr);
1178*cd620004SJunchao Zhang       ierr = PetscLogGpuToCpu(count);CHKERRQ(ierr);
1179*cd620004SJunchao Zhang     } else {
1180*cd620004SJunchao Zhang       cerr = cudaMemcpyAsync(d_buf,h_buf,count,cudaMemcpyHostToDevice,link->stream);CHKERRCUDA(cerr);
1181*cd620004SJunchao Zhang       ierr = PetscLogCpuToGpu(count);CHKERRQ(ierr);
1182*cd620004SJunchao Zhang     }
1183*cd620004SJunchao Zhang   }
1184*cd620004SJunchao Zhang   PetscFunctionReturn(0);
1185*cd620004SJunchao Zhang }
1186*cd620004SJunchao Zhang 
1187*cd620004SJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(PetscSF sf,PetscSFLink link,PetscBool device2host)
1188*cd620004SJunchao Zhang {
1189*cd620004SJunchao Zhang   PetscErrorCode ierr;
1190*cd620004SJunchao Zhang   cudaError_t    cerr;
1191*cd620004SJunchao Zhang 
1192*cd620004SJunchao Zhang   PetscFunctionBegin;
1193*cd620004SJunchao Zhang   if (link->leafmtype == PETSC_MEMTYPE_DEVICE && (link->leafmtype_mpi != link->leafmtype) && sf->leafbuflen[PETSCSF_REMOTE]) {
1194*cd620004SJunchao Zhang     void  *h_buf = link->leafbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST];
1195*cd620004SJunchao Zhang     void  *d_buf = link->leafbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_DEVICE];
1196*cd620004SJunchao Zhang     size_t count = sf->leafbuflen[PETSCSF_REMOTE]*link->unitbytes;
1197*cd620004SJunchao Zhang     if (device2host) {
1198*cd620004SJunchao Zhang       cerr = cudaMemcpyAsync(h_buf,d_buf,count,cudaMemcpyDeviceToHost,link->stream);CHKERRCUDA(cerr);
1199*cd620004SJunchao Zhang       ierr = PetscLogGpuToCpu(count);CHKERRQ(ierr);
1200*cd620004SJunchao Zhang     } else {
1201*cd620004SJunchao Zhang       cerr = cudaMemcpyAsync(d_buf,h_buf,count,cudaMemcpyHostToDevice,link->stream);CHKERRCUDA(cerr);
1202*cd620004SJunchao Zhang       ierr = PetscLogCpuToGpu(count);CHKERRQ(ierr);
1203*cd620004SJunchao Zhang     }
1204*cd620004SJunchao Zhang   }
1205*cd620004SJunchao Zhang   PetscFunctionReturn(0);
1206*cd620004SJunchao Zhang }
1207*cd620004SJunchao Zhang #else
1208*cd620004SJunchao Zhang 
1209*cd620004SJunchao Zhang #define PetscSFLinkSyncDeviceBeforePackData(a,b)                0
1210*cd620004SJunchao Zhang #define PetscSFLinkSyncStreamAfterPackRootData(a,b)             0
1211*cd620004SJunchao Zhang #define PetscSFLinkSyncStreamAfterPackLeafData(a,b)             0
1212*cd620004SJunchao Zhang #define PetscSFLinkSyncStreamAfterUnpackRootData(a,b)           0
1213*cd620004SJunchao Zhang #define PetscSFLinkSyncStreamAfterUnpackLeafData(a,b)           0
1214*cd620004SJunchao Zhang #define PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(a,b,c) 0
1215*cd620004SJunchao Zhang #define PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(a,b,c) 0
1216*cd620004SJunchao Zhang 
1217*cd620004SJunchao Zhang #endif
1218*cd620004SJunchao Zhang 
1219*cd620004SJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkLogFlopsAfterUnpackRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,MPI_Op op)
1220*cd620004SJunchao Zhang {
1221*cd620004SJunchao Zhang   PetscErrorCode ierr;
1222*cd620004SJunchao Zhang   PetscLogDouble flops;
1223*cd620004SJunchao Zhang   PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
1224*cd620004SJunchao Zhang 
1225*cd620004SJunchao Zhang 
1226*cd620004SJunchao Zhang   PetscFunctionBegin;
1227*cd620004SJunchao Zhang   if (op != MPIU_REPLACE && link->basicunit == MPIU_SCALAR) { /* op is a reduction on PetscScalars */
1228*cd620004SJunchao Zhang     flops = bas->rootbuflen[scope]*link->bs; /* # of roots in buffer x # of scalars in unit */
1229*cd620004SJunchao Zhang #if defined(PETSC_HAVE_CUDA)
1230*cd620004SJunchao Zhang     if (link->rootmtype == PETSC_MEMTYPE_DEVICE) {ierr = PetscLogGpuFlops(flops);CHKERRQ(ierr);} else
1231*cd620004SJunchao Zhang #endif
1232*cd620004SJunchao Zhang     {ierr = PetscLogFlops(flops);CHKERRQ(ierr);}
1233*cd620004SJunchao Zhang   }
1234*cd620004SJunchao Zhang   PetscFunctionReturn(0);
1235*cd620004SJunchao Zhang }
1236*cd620004SJunchao Zhang 
1237*cd620004SJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkLogFlopsAfterUnpackLeafData(PetscSF sf,PetscSFLink link,PetscSFScope scope,MPI_Op op)
1238*cd620004SJunchao Zhang {
1239*cd620004SJunchao Zhang   PetscLogDouble flops;
1240*cd620004SJunchao Zhang   PetscErrorCode ierr;
1241*cd620004SJunchao Zhang 
1242*cd620004SJunchao Zhang   PetscFunctionBegin;
1243*cd620004SJunchao Zhang   if (op != MPIU_REPLACE && link->basicunit == MPIU_SCALAR) { /* op is a reduction on PetscScalars */
1244*cd620004SJunchao Zhang     flops = sf->leafbuflen[scope]*link->bs; /* # of roots in buffer x # of scalars in unit */
1245*cd620004SJunchao Zhang #if defined(PETSC_HAVE_CUDA)
1246*cd620004SJunchao Zhang     if (link->leafmtype == PETSC_MEMTYPE_DEVICE) {ierr = PetscLogGpuFlops(flops);CHKERRQ(ierr);} else
1247*cd620004SJunchao Zhang #endif
1248*cd620004SJunchao Zhang     {ierr = PetscLogFlops(flops);CHKERRQ(ierr);}
1249*cd620004SJunchao Zhang   }
1250*cd620004SJunchao Zhang   PetscFunctionReturn(0);
1251*cd620004SJunchao Zhang }
1252*cd620004SJunchao Zhang 
1253*cd620004SJunchao Zhang /* When SF could not find a proper UnpackAndOp() from link, it falls back to MPI_Reduce_local.
1254*cd620004SJunchao Zhang   Input Arguments:
1255*cd620004SJunchao Zhang   +sf      - The StarForest
1256*cd620004SJunchao Zhang   .link    - The link
1257*cd620004SJunchao Zhang   .count   - Number of entries to unpack
1258*cd620004SJunchao Zhang   .start   - The first index, significent when indices=NULL
1259*cd620004SJunchao Zhang   .indices - Indices of entries in <data>. If NULL, it means indices are contiguous and the first is given in <start>
1260*cd620004SJunchao Zhang   .buf     - A contiguous buffer to unpack from
1261*cd620004SJunchao Zhang   -op      - Operation after unpack
1262*cd620004SJunchao Zhang 
1263*cd620004SJunchao Zhang   Output Arguments:
1264*cd620004SJunchao Zhang   .data    - The data to unpack to
1265*cd620004SJunchao Zhang */
1266*cd620004SJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkUnpackDataWithMPIReduceLocal(PetscSF sf,PetscSFLink link,PetscInt count,PetscInt start,const PetscInt *indices,void *data,const void *buf,MPI_Op op)
1267*cd620004SJunchao Zhang {
1268*cd620004SJunchao Zhang   PetscFunctionBegin;
1269*cd620004SJunchao Zhang #if defined(PETSC_HAVE_MPI_REDUCE_LOCAL)
1270*cd620004SJunchao Zhang   {
1271*cd620004SJunchao Zhang     PetscErrorCode ierr;
1272*cd620004SJunchao Zhang     PetscInt       i;
1273*cd620004SJunchao Zhang     PetscMPIInt    n;
1274*cd620004SJunchao Zhang     if (indices) {
1275*cd620004SJunchao Zhang       /* Note we use link->unit instead of link->basicunit. When op can be mapped to MPI_SUM etc, it operates on
1276*cd620004SJunchao Zhang          basic units of a root/leaf element-wisely. Otherwise, it is meant to operate on a whole root/leaf.
1277*cd620004SJunchao Zhang       */
1278*cd620004SJunchao Zhang       for (i=0; i<count; i++) {ierr = MPI_Reduce_local((const char*)buf+i*link->unitbytes,(char*)data+indices[i]*link->unitbytes,1,link->unit,op);CHKERRQ(ierr);}
1279*cd620004SJunchao Zhang     } else {
1280*cd620004SJunchao Zhang       ierr = PetscMPIIntCast(count,&n);CHKERRQ(ierr);
1281*cd620004SJunchao Zhang       ierr = MPI_Reduce_local(buf,(char*)data+start*link->unitbytes,n,link->unit,op);CHKERRQ(ierr);
1282*cd620004SJunchao Zhang     }
1283*cd620004SJunchao Zhang   }
1284*cd620004SJunchao Zhang #else
1285*cd620004SJunchao Zhang   SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No unpacking reduction operation for this MPI_Op");
1286*cd620004SJunchao Zhang #endif
1287*cd620004SJunchao Zhang   PetscFunctionReturn(0);
1288*cd620004SJunchao Zhang }
1289*cd620004SJunchao Zhang 
1290*cd620004SJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkScatterDataWithMPIReduceLocal(PetscSF sf,PetscSFLink link,PetscInt count,const PetscInt *idx,const void *xdata,PetscInt starty,const PetscInt *idy,void *ydata,MPI_Op op)
1291*cd620004SJunchao Zhang {
1292*cd620004SJunchao Zhang   PetscFunctionBegin;
1293*cd620004SJunchao Zhang #if defined(PETSC_HAVE_MPI_REDUCE_LOCAL)
1294*cd620004SJunchao Zhang   {
1295*cd620004SJunchao Zhang     PetscErrorCode ierr;
1296*cd620004SJunchao Zhang     PetscInt       i,disp;
1297*cd620004SJunchao Zhang     for (i=0; i<count; i++) {
1298*cd620004SJunchao Zhang       disp = idy? idy[i] : starty + i;
1299*cd620004SJunchao Zhang       ierr = MPI_Reduce_local((const char*)xdata+idx[i]*link->unitbytes,(char*)ydata+disp*link->unitbytes,1,link->unit,op);CHKERRQ(ierr);
1300*cd620004SJunchao Zhang     }
1301*cd620004SJunchao Zhang   }
1302*cd620004SJunchao Zhang #else
1303*cd620004SJunchao Zhang   SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No unpacking reduction operation for this MPI_Op");
1304*cd620004SJunchao Zhang #endif
1305*cd620004SJunchao Zhang   PetscFunctionReturn(0);
1306*cd620004SJunchao Zhang }
1307*cd620004SJunchao Zhang 
1308*cd620004SJunchao Zhang /*=============================================================================
1309*cd620004SJunchao Zhang               Pack/Unpack/Fetch/Scatter routines
1310*cd620004SJunchao Zhang  ============================================================================*/
1311*cd620004SJunchao Zhang 
1312*cd620004SJunchao Zhang /* Pack rootdata to rootbuf
1313*cd620004SJunchao Zhang   Input Arguments:
1314*cd620004SJunchao Zhang   + sf       - The SF this packing works on.
1315*cd620004SJunchao Zhang   . link     - It gives the memtype of the roots and also provides root buffer.
1316*cd620004SJunchao Zhang   . scope    - PETSCSF_LOCAL or PETSCSF_REMOTE. Note SF has the ability to do local and remote communications separately.
1317*cd620004SJunchao Zhang   - rootdata - Where to read the roots.
1318*cd620004SJunchao Zhang 
1319*cd620004SJunchao Zhang   Notes:
1320*cd620004SJunchao Zhang   When rootdata can be directly used as root buffer, the routine is almost a no-op. After the call, root data is
1321*cd620004SJunchao Zhang   in a place where the underlying MPI is ready can access (use_gpu_aware_mpi or not)
1322*cd620004SJunchao Zhang  */
1323*cd620004SJunchao Zhang PetscErrorCode PetscSFLinkPackRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,const void *rootdata)
1324*cd620004SJunchao Zhang {
1325*cd620004SJunchao Zhang   PetscErrorCode   ierr;
1326*cd620004SJunchao Zhang   PetscSF_Basic    *bas = (PetscSF_Basic*)sf->data;
1327*cd620004SJunchao Zhang   const PetscInt   *rootindices = NULL;
1328*cd620004SJunchao Zhang   PetscInt         count,start;
1329*cd620004SJunchao Zhang   PetscErrorCode   (*Pack)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,const void*,void*) = NULL;
1330*cd620004SJunchao Zhang   PetscMemType     rootmtype = link->rootmtype;
1331*cd620004SJunchao Zhang 
1332*cd620004SJunchao Zhang   PetscFunctionBegin;
1333*cd620004SJunchao Zhang   ierr = PetscLogEventBegin(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr);
1334*cd620004SJunchao Zhang   if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkSyncDeviceBeforePackData(sf,link);CHKERRQ(ierr);}
1335*cd620004SJunchao Zhang   if (!link->rootdirect[scope] && bas->rootbuflen[scope]) { /* If rootdata works directly as rootbuf, skip packing */
1336*cd620004SJunchao Zhang     ierr = PetscSFLinkGetRootIndices(sf,link,rootmtype,scope,&count,&start,&rootindices);CHKERRQ(ierr);
1337*cd620004SJunchao Zhang     ierr = PetscSFLinkGetPack(link,rootmtype,&Pack);CHKERRQ(ierr);
1338*cd620004SJunchao Zhang     ierr = (*Pack)(link,count,start,rootindices,bas->rootpackopt[scope],rootdata,link->rootbuf[scope][rootmtype]);CHKERRQ(ierr);
1339*cd620004SJunchao Zhang   }
1340*cd620004SJunchao Zhang   if (scope == PETSCSF_REMOTE) {
1341*cd620004SJunchao Zhang     ierr = PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_TRUE/*device2host*/);CHKERRQ(ierr);
1342*cd620004SJunchao Zhang     ierr = PetscSFLinkSyncStreamAfterPackRootData(sf,link);CHKERRQ(ierr);
1343*cd620004SJunchao Zhang   }
1344*cd620004SJunchao Zhang   ierr = PetscLogEventEnd(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr);
1345*cd620004SJunchao Zhang   PetscFunctionReturn(0);
1346*cd620004SJunchao Zhang }
1347*cd620004SJunchao Zhang 
1348*cd620004SJunchao Zhang /* Pack leafdata to leafbuf */
1349*cd620004SJunchao Zhang PetscErrorCode PetscSFLinkPackLeafData(PetscSF sf,PetscSFLink link,PetscSFScope scope,const void *leafdata)
1350*cd620004SJunchao Zhang {
1351*cd620004SJunchao Zhang   PetscErrorCode   ierr;
1352*cd620004SJunchao Zhang   const PetscInt   *leafindices = NULL;
1353*cd620004SJunchao Zhang   PetscInt         count,start;
1354*cd620004SJunchao Zhang   PetscErrorCode   (*Pack)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,const void*,void*) = NULL;
1355*cd620004SJunchao Zhang   PetscMemType     leafmtype = link->leafmtype;
1356*cd620004SJunchao Zhang 
1357*cd620004SJunchao Zhang   PetscFunctionBegin;
1358*cd620004SJunchao Zhang   ierr = PetscLogEventBegin(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr);
1359*cd620004SJunchao Zhang   if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkSyncDeviceBeforePackData(sf,link);CHKERRQ(ierr);}
1360*cd620004SJunchao Zhang   if (!link->leafdirect[scope] && sf->leafbuflen[scope]) { /* If leafdata works directly as rootbuf, skip packing */
1361*cd620004SJunchao Zhang     ierr = PetscSFLinkGetLeafIndices(sf,link,leafmtype,scope,&count,&start,&leafindices);CHKERRQ(ierr);
1362*cd620004SJunchao Zhang     ierr = PetscSFLinkGetPack(link,leafmtype,&Pack);CHKERRQ(ierr);
1363*cd620004SJunchao Zhang     ierr = (*Pack)(link,count,start,leafindices,sf->leafpackopt[scope],leafdata,link->leafbuf[scope][leafmtype]);CHKERRQ(ierr);
1364*cd620004SJunchao Zhang   }
1365*cd620004SJunchao Zhang   if (scope == PETSCSF_REMOTE) {
1366*cd620004SJunchao Zhang     ierr = PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_TRUE/*device2host*/);CHKERRQ(ierr);
1367*cd620004SJunchao Zhang     ierr = PetscSFLinkSyncStreamAfterPackLeafData(sf,link);CHKERRQ(ierr);
1368*cd620004SJunchao Zhang   }
1369*cd620004SJunchao Zhang   ierr = PetscLogEventEnd(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr);
1370*cd620004SJunchao Zhang   PetscFunctionReturn(0);
1371*cd620004SJunchao Zhang }
1372*cd620004SJunchao Zhang 
1373*cd620004SJunchao Zhang /* Unpack rootbuf to rootdata */
1374*cd620004SJunchao Zhang PetscErrorCode PetscSFLinkUnpackRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,void *rootdata,MPI_Op op)
1375*cd620004SJunchao Zhang {
1376*cd620004SJunchao Zhang   PetscErrorCode   ierr;
1377*cd620004SJunchao Zhang   const PetscInt   *rootindices = NULL;
1378*cd620004SJunchao Zhang   PetscInt         count,start;
1379*cd620004SJunchao Zhang   PetscSF_Basic    *bas = (PetscSF_Basic*)sf->data;
1380*cd620004SJunchao Zhang   PetscErrorCode   (*UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*) = NULL;
1381*cd620004SJunchao Zhang   PetscMemType     rootmtype = link->rootmtype;
1382*cd620004SJunchao Zhang 
1383*cd620004SJunchao Zhang   PetscFunctionBegin;
1384*cd620004SJunchao Zhang   ierr = PetscLogEventBegin(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr);
1385*cd620004SJunchao Zhang   if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_FALSE);CHKERRQ(ierr);}
1386*cd620004SJunchao Zhang   if (!link->rootdirect[scope] && bas->rootbuflen[scope]) { /* If rootdata works directly as rootbuf, skip unpacking */
1387*cd620004SJunchao Zhang     ierr = PetscSFLinkGetUnpackAndOp(link,rootmtype,op,bas->rootdups[scope],&UnpackAndOp);CHKERRQ(ierr);
1388*cd620004SJunchao Zhang     if (UnpackAndOp) {
1389*cd620004SJunchao Zhang       ierr = PetscSFLinkGetRootIndices(sf,link,rootmtype,scope,&count,&start,&rootindices);CHKERRQ(ierr);
1390*cd620004SJunchao Zhang       ierr = (*UnpackAndOp)(link,count,start,rootindices,bas->rootpackopt[scope],rootdata,link->rootbuf[scope][rootmtype]);CHKERRQ(ierr);
1391*cd620004SJunchao Zhang     } else {
1392*cd620004SJunchao Zhang       ierr = PetscSFLinkGetRootIndices(sf,link,PETSC_MEMTYPE_HOST,scope,&count,&start,&rootindices);CHKERRQ(ierr);
1393*cd620004SJunchao Zhang       ierr = PetscSFLinkUnpackDataWithMPIReduceLocal(sf,link,count,start,rootindices,rootdata,link->rootbuf[scope][rootmtype],op);CHKERRQ(ierr);
1394*cd620004SJunchao Zhang     }
1395*cd620004SJunchao Zhang   }
1396*cd620004SJunchao Zhang   if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkSyncStreamAfterUnpackRootData(sf,link);CHKERRQ(ierr);}
1397*cd620004SJunchao Zhang   ierr = PetscSFLinkLogFlopsAfterUnpackRootData(sf,link,scope,op);CHKERRQ(ierr);
1398*cd620004SJunchao Zhang   ierr = PetscLogEventEnd(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr);
1399*cd620004SJunchao Zhang   PetscFunctionReturn(0);
1400*cd620004SJunchao Zhang }
1401*cd620004SJunchao Zhang 
1402*cd620004SJunchao Zhang /* Unpack leafbuf to leafdata */
1403*cd620004SJunchao Zhang PetscErrorCode PetscSFLinkUnpackLeafData(PetscSF sf,PetscSFLink link,PetscSFScope scope,void *leafdata,MPI_Op op)
1404*cd620004SJunchao Zhang {
1405*cd620004SJunchao Zhang   PetscErrorCode   ierr;
1406*cd620004SJunchao Zhang   const PetscInt   *leafindices = NULL;
1407*cd620004SJunchao Zhang   PetscInt         count,start;
1408*cd620004SJunchao Zhang   PetscErrorCode   (*UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*) = NULL;
1409*cd620004SJunchao Zhang   PetscMemType     leafmtype = link->leafmtype;
1410*cd620004SJunchao Zhang 
1411*cd620004SJunchao Zhang   PetscFunctionBegin;
1412*cd620004SJunchao Zhang   ierr = PetscLogEventBegin(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr);
1413*cd620004SJunchao Zhang   if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_FALSE);CHKERRQ(ierr);}
1414*cd620004SJunchao Zhang   if (!link->leafdirect[scope] && sf->leafbuflen[scope]) { /* If leafdata works directly as rootbuf, skip unpacking */
1415*cd620004SJunchao Zhang     ierr = PetscSFLinkGetUnpackAndOp(link,leafmtype,op,sf->leafdups[scope],&UnpackAndOp);CHKERRQ(ierr);
1416*cd620004SJunchao Zhang     if (UnpackAndOp) {
1417*cd620004SJunchao Zhang       ierr = PetscSFLinkGetLeafIndices(sf,link,leafmtype,scope,&count,&start,&leafindices);CHKERRQ(ierr);
1418*cd620004SJunchao Zhang       ierr = (*UnpackAndOp)(link,count,start,leafindices,sf->leafpackopt[scope],leafdata,link->leafbuf[scope][leafmtype]);CHKERRQ(ierr);
1419*cd620004SJunchao Zhang     } else {
1420*cd620004SJunchao Zhang       ierr = PetscSFLinkGetLeafIndices(sf,link,PETSC_MEMTYPE_HOST,scope,&count,&start,&leafindices);CHKERRQ(ierr);
1421*cd620004SJunchao Zhang       ierr = PetscSFLinkUnpackDataWithMPIReduceLocal(sf,link,count,start,leafindices,leafdata,link->leafbuf[scope][leafmtype],op);CHKERRQ(ierr);
1422*cd620004SJunchao Zhang     }
1423*cd620004SJunchao Zhang   }
1424*cd620004SJunchao Zhang   if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkSyncStreamAfterUnpackLeafData(sf,link);CHKERRQ(ierr);}
1425*cd620004SJunchao Zhang   ierr = PetscSFLinkLogFlopsAfterUnpackLeafData(sf,link,scope,op);CHKERRQ(ierr);
1426*cd620004SJunchao Zhang   ierr = PetscLogEventEnd(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr);
1427*cd620004SJunchao Zhang   PetscFunctionReturn(0);
1428*cd620004SJunchao Zhang }
1429*cd620004SJunchao Zhang 
1430*cd620004SJunchao Zhang /* FetchAndOp rootdata with rootbuf */
1431*cd620004SJunchao Zhang PetscErrorCode PetscSFLinkFetchRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,void *rootdata,MPI_Op op)
1432*cd620004SJunchao Zhang {
1433*cd620004SJunchao Zhang   PetscErrorCode     ierr;
1434*cd620004SJunchao Zhang   const PetscInt     *rootindices = NULL;
1435*cd620004SJunchao Zhang   PetscInt           count,start;
1436*cd620004SJunchao Zhang   PetscSF_Basic      *bas = (PetscSF_Basic*)sf->data;
1437*cd620004SJunchao Zhang   PetscErrorCode     (*FetchAndOp)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,void*) = NULL;
1438*cd620004SJunchao Zhang   PetscMemType       rootmtype = link->rootmtype;
1439*cd620004SJunchao Zhang 
1440*cd620004SJunchao Zhang   PetscFunctionBegin;
1441*cd620004SJunchao Zhang   ierr = PetscLogEventBegin(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr);
1442*cd620004SJunchao Zhang   if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_FALSE);CHKERRQ(ierr);}
1443*cd620004SJunchao Zhang   if (bas->rootbuflen[scope]) {
1444*cd620004SJunchao Zhang     /* Do FetchAndOp on rootdata with rootbuf */
1445*cd620004SJunchao Zhang     ierr = PetscSFLinkGetFetchAndOp(link,rootmtype,op,bas->rootdups[scope],&FetchAndOp);CHKERRQ(ierr);
1446*cd620004SJunchao Zhang     ierr = PetscSFLinkGetRootIndices(sf,link,rootmtype,scope,&count,&start,&rootindices);CHKERRQ(ierr);
1447*cd620004SJunchao Zhang     ierr = (*FetchAndOp)(link,count,start,rootindices,bas->rootpackopt[scope],rootdata,link->rootbuf[scope][rootmtype]);CHKERRQ(ierr);
1448*cd620004SJunchao Zhang   }
1449*cd620004SJunchao Zhang   if (scope == PETSCSF_REMOTE) {
1450*cd620004SJunchao Zhang     ierr = PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_TRUE);CHKERRQ(ierr);
1451*cd620004SJunchao Zhang     ierr = PetscSFLinkSyncStreamAfterUnpackRootData(sf,link);CHKERRQ(ierr);
1452*cd620004SJunchao Zhang   }
1453*cd620004SJunchao Zhang   ierr = PetscSFLinkLogFlopsAfterUnpackRootData(sf,link,scope,op);CHKERRQ(ierr);
1454*cd620004SJunchao Zhang   ierr = PetscLogEventEnd(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr);
1455*cd620004SJunchao Zhang   PetscFunctionReturn(0);
1456*cd620004SJunchao Zhang }
1457*cd620004SJunchao Zhang 
1458*cd620004SJunchao Zhang /* Bcast rootdata to leafdata locally (i.e., only for local communication - PETSCSF_LOCAL) */
1459*cd620004SJunchao Zhang PetscErrorCode PetscSFLinkBcastAndOpLocal(PetscSF sf,PetscSFLink link,const void *rootdata,void *leafdata,MPI_Op op)
1460*cd620004SJunchao Zhang {
1461*cd620004SJunchao Zhang   PetscErrorCode       ierr;
1462*cd620004SJunchao Zhang   const PetscInt       *rootindices = NULL,*leafindices = NULL;
1463*cd620004SJunchao Zhang   PetscInt             count,rootstart,leafstart;
1464*cd620004SJunchao Zhang   PetscSF_Basic        *bas = (PetscSF_Basic*)sf->data;
1465*cd620004SJunchao Zhang   PetscErrorCode       (*UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*) = NULL;
1466*cd620004SJunchao Zhang   PetscErrorCode       (*ScatterAndOp)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*) = NULL;
1467*cd620004SJunchao Zhang   const PetscMemType   rootmtype = link->rootmtype,leafmtype = link->leafmtype;
1468*cd620004SJunchao Zhang 
1469*cd620004SJunchao Zhang   PetscFunctionBegin;
1470*cd620004SJunchao Zhang   if (!bas->rootbuflen[PETSCSF_LOCAL]) PetscFunctionReturn(0);
1471*cd620004SJunchao Zhang   if (rootmtype != leafmtype) { /* Uncommon case */
1472*cd620004SJunchao Zhang      /* The local communication has to go through pack and unpack */
1473*cd620004SJunchao Zhang     ierr = PetscSFLinkPackRootData(sf,link,PETSCSF_LOCAL,rootdata);CHKERRQ(ierr);
1474*cd620004SJunchao Zhang     ierr = PetscMemcpyWithMemType(leafmtype,rootmtype,link->leafbuf[PETSCSF_LOCAL][leafmtype],link->rootbuf[PETSCSF_LOCAL][rootmtype],sf->leafbuflen[PETSCSF_LOCAL]*link->unitbytes);CHKERRQ(ierr);
1475*cd620004SJunchao Zhang     ierr = PetscSFLinkUnpackLeafData(sf,link,PETSCSF_LOCAL,leafdata,op);CHKERRQ(ierr);
1476*cd620004SJunchao Zhang   } else {
1477*cd620004SJunchao Zhang     if (bas->rootcontig[PETSCSF_LOCAL]) { /* If root indices are contiguous, Scatter becomes Unpack */
1478*cd620004SJunchao Zhang       ierr = PetscSFLinkGetUnpackAndOp(link,leafmtype,op,sf->leafdups[PETSCSF_LOCAL],&UnpackAndOp);CHKERRQ(ierr);
1479*cd620004SJunchao Zhang       rootdata = (const char*)rootdata + bas->rootstart[PETSCSF_LOCAL]*link->unitbytes; /* Make rootdata point to start of the buffer */
1480*cd620004SJunchao Zhang       if (UnpackAndOp) {
1481*cd620004SJunchao Zhang         ierr = PetscSFLinkGetLeafIndices(sf,link,leafmtype,PETSCSF_LOCAL,&count,&leafstart,&leafindices);CHKERRQ(ierr);
1482*cd620004SJunchao Zhang         ierr = (*UnpackAndOp)(link,count,leafstart,leafindices,sf->leafpackopt[PETSCSF_LOCAL],leafdata,rootdata);CHKERRQ(ierr);
1483*cd620004SJunchao Zhang       } else {
1484*cd620004SJunchao Zhang         ierr = PetscSFLinkGetLeafIndices(sf,link,PETSC_MEMTYPE_HOST,PETSCSF_LOCAL,&count,&leafstart,&leafindices);CHKERRQ(ierr);
1485*cd620004SJunchao Zhang         ierr = PetscSFLinkUnpackDataWithMPIReduceLocal(sf,link,count,leafstart,leafindices,leafdata,rootdata,op);CHKERRQ(ierr);
1486*cd620004SJunchao Zhang       }
1487*cd620004SJunchao Zhang     } else { /* ScatterAndOp */
1488*cd620004SJunchao Zhang       ierr = PetscSFLinkGetScatterAndOp(link,leafmtype,op,sf->leafdups[PETSCSF_LOCAL],&ScatterAndOp);CHKERRQ(ierr);
1489*cd620004SJunchao Zhang       if (ScatterAndOp) {
1490*cd620004SJunchao Zhang         ierr = PetscSFLinkGetRootIndices(sf,link,rootmtype,PETSCSF_LOCAL,&count,&rootstart,&rootindices);CHKERRQ(ierr);
1491*cd620004SJunchao Zhang         ierr = PetscSFLinkGetLeafIndices(sf,link,leafmtype,PETSCSF_LOCAL,&count,&leafstart,&leafindices);CHKERRQ(ierr);
1492*cd620004SJunchao Zhang         ierr = (*ScatterAndOp)(link,count,rootstart,rootindices,rootdata,leafstart,leafindices,leafdata);CHKERRQ(ierr);
1493*cd620004SJunchao Zhang       } else {
1494*cd620004SJunchao Zhang         ierr = PetscSFLinkGetRootIndices(sf,link,PETSC_MEMTYPE_HOST,PETSCSF_LOCAL,&count,NULL,&rootindices);CHKERRQ(ierr);
1495*cd620004SJunchao Zhang         ierr = PetscSFLinkGetLeafIndices(sf,link,PETSC_MEMTYPE_HOST,PETSCSF_LOCAL,NULL,&leafstart,&leafindices);CHKERRQ(ierr);
1496*cd620004SJunchao Zhang         ierr = PetscSFLinkScatterDataWithMPIReduceLocal(sf,link,count,rootindices,rootdata,leafstart,leafindices,leafdata,op);CHKERRQ(ierr);
1497*cd620004SJunchao Zhang       }
1498*cd620004SJunchao Zhang     }
1499*cd620004SJunchao Zhang   }
1500*cd620004SJunchao Zhang   PetscFunctionReturn(0);
1501*cd620004SJunchao Zhang }
1502*cd620004SJunchao Zhang 
1503*cd620004SJunchao Zhang /* Reduce leafdata to rootdata locally */
1504*cd620004SJunchao Zhang PetscErrorCode PetscSFLinkReduceLocal(PetscSF sf,PetscSFLink link,const void *leafdata,void *rootdata,MPI_Op op)
1505*cd620004SJunchao Zhang {
1506*cd620004SJunchao Zhang   PetscErrorCode       ierr;
1507*cd620004SJunchao Zhang   const PetscInt       *rootindices = NULL,*leafindices = NULL;
1508*cd620004SJunchao Zhang   PetscInt             count,rootstart,leafstart;
1509*cd620004SJunchao Zhang   PetscSF_Basic        *bas = (PetscSF_Basic*)sf->data;
1510*cd620004SJunchao Zhang   PetscErrorCode       (*UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*) = NULL;
1511*cd620004SJunchao Zhang   PetscErrorCode       (*ScatterAndOp)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*) = NULL;
1512*cd620004SJunchao Zhang   const PetscMemType   rootmtype = link->rootmtype,leafmtype = link->leafmtype;
1513*cd620004SJunchao Zhang 
1514*cd620004SJunchao Zhang   PetscFunctionBegin;
1515*cd620004SJunchao Zhang   if (!sf->leafbuflen[PETSCSF_LOCAL]) PetscFunctionReturn(0);
1516*cd620004SJunchao Zhang   if (rootmtype != leafmtype) {
1517*cd620004SJunchao Zhang     /* The local communication has to go through pack and unpack */
1518*cd620004SJunchao Zhang     ierr = PetscSFLinkPackLeafData(sf,link,PETSCSF_LOCAL,leafdata);CHKERRQ(ierr);
1519*cd620004SJunchao Zhang     ierr = PetscMemcpyWithMemType(rootmtype,leafmtype,link->rootbuf[PETSCSF_LOCAL][rootmtype],link->leafbuf[PETSCSF_LOCAL][leafmtype],bas->rootbuflen[PETSCSF_LOCAL]*link->unitbytes);CHKERRQ(ierr);
1520*cd620004SJunchao Zhang     ierr = PetscSFLinkUnpackRootData(sf,link,PETSCSF_LOCAL,rootdata,op);CHKERRQ(ierr);
1521*cd620004SJunchao Zhang   } else {
1522*cd620004SJunchao Zhang     if (sf->leafcontig[PETSCSF_LOCAL]) {
1523*cd620004SJunchao Zhang       /* If leaf indices are contiguous, Scatter becomes Unpack */
1524*cd620004SJunchao Zhang       ierr = PetscSFLinkGetUnpackAndOp(link,rootmtype,op,bas->rootdups[PETSCSF_LOCAL],&UnpackAndOp);CHKERRQ(ierr);
1525*cd620004SJunchao Zhang       leafdata = (const char*)leafdata + sf->leafstart[PETSCSF_LOCAL]*link->unitbytes; /* Make leafdata point to start of the buffer */
1526*cd620004SJunchao Zhang       if (UnpackAndOp) {
1527*cd620004SJunchao Zhang         ierr = PetscSFLinkGetRootIndices(sf,link,rootmtype,PETSCSF_LOCAL,&count,&rootstart,&rootindices);CHKERRQ(ierr);
1528*cd620004SJunchao Zhang         ierr = (*UnpackAndOp)(link,count,rootstart,rootindices,bas->rootpackopt[PETSCSF_LOCAL],rootdata,leafdata);CHKERRQ(ierr);
1529*cd620004SJunchao Zhang       } else {
1530*cd620004SJunchao Zhang         ierr = PetscSFLinkGetRootIndices(sf,link,PETSC_MEMTYPE_HOST,PETSCSF_LOCAL,&count,&rootstart,&rootindices);CHKERRQ(ierr);
1531*cd620004SJunchao Zhang         ierr = PetscSFLinkUnpackDataWithMPIReduceLocal(sf,link,count,rootstart,rootindices,rootdata,leafdata,op);CHKERRQ(ierr);
1532*cd620004SJunchao Zhang       }
1533*cd620004SJunchao Zhang     } else {
1534*cd620004SJunchao Zhang       ierr = PetscSFLinkGetScatterAndOp(link,rootmtype,op,bas->rootdups[PETSCSF_LOCAL],&ScatterAndOp);CHKERRQ(ierr);
1535*cd620004SJunchao Zhang       if (ScatterAndOp) {
1536*cd620004SJunchao Zhang         ierr = PetscSFLinkGetRootIndices(sf,link,rootmtype,PETSCSF_LOCAL,&count,&rootstart,&rootindices);CHKERRQ(ierr);
1537*cd620004SJunchao Zhang         ierr = PetscSFLinkGetLeafIndices(sf,link,leafmtype,PETSCSF_LOCAL,NULL,&leafstart,&leafindices);CHKERRQ(ierr);
1538*cd620004SJunchao Zhang         ierr = (*ScatterAndOp)(link,count,leafstart,leafindices,leafdata,rootstart,rootindices,rootdata);CHKERRQ(ierr);
1539*cd620004SJunchao Zhang       } else {
1540*cd620004SJunchao Zhang         ierr = PetscSFLinkGetRootIndices(sf,link,PETSC_MEMTYPE_HOST,PETSCSF_LOCAL,&count,&rootstart,&rootindices);CHKERRQ(ierr);
1541*cd620004SJunchao Zhang         ierr = PetscSFLinkGetLeafIndices(sf,link,PETSC_MEMTYPE_HOST,PETSCSF_LOCAL,NULL,NULL,&leafindices);CHKERRQ(ierr);
1542*cd620004SJunchao Zhang         ierr = PetscSFLinkScatterDataWithMPIReduceLocal(sf,link,count,leafindices,leafdata,rootstart,rootindices,rootdata,op);CHKERRQ(ierr);
1543*cd620004SJunchao Zhang       }
1544*cd620004SJunchao Zhang     }
1545*cd620004SJunchao Zhang   }
1546*cd620004SJunchao Zhang   PetscFunctionReturn(0);
1547*cd620004SJunchao Zhang }
1548*cd620004SJunchao Zhang 
1549*cd620004SJunchao Zhang /* Fetch rootdata to leafdata and leafupdate locally */
1550*cd620004SJunchao Zhang PetscErrorCode PetscSFLinkFetchAndOpLocal(PetscSF sf,PetscSFLink link,void *rootdata,const void *leafdata,void *leafupdate,MPI_Op op)
1551*cd620004SJunchao Zhang {
1552*cd620004SJunchao Zhang   PetscErrorCode       ierr;
1553*cd620004SJunchao Zhang   const PetscInt       *rootindices = NULL,*leafindices = NULL;
1554*cd620004SJunchao Zhang   PetscInt             count,rootstart,leafstart;
1555*cd620004SJunchao Zhang   PetscSF_Basic        *bas = (PetscSF_Basic*)sf->data;
1556*cd620004SJunchao Zhang   PetscErrorCode       (*FetchAndOpLocal)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,void*,PetscInt,const PetscInt*,const void*,void*) = NULL;
1557*cd620004SJunchao Zhang   const PetscMemType   rootmtype = link->rootmtype,leafmtype = link->leafmtype;
1558*cd620004SJunchao Zhang 
1559*cd620004SJunchao Zhang   PetscFunctionBegin;
1560*cd620004SJunchao Zhang   if (!bas->rootbuflen[PETSCSF_LOCAL]) PetscFunctionReturn(0);
1561*cd620004SJunchao Zhang   if (rootmtype != leafmtype) {
1562*cd620004SJunchao Zhang    /* The local communication has to go through pack and unpack */
1563*cd620004SJunchao Zhang     SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Doing PetscSFFetchAndOp with rootdata and leafdata on opposite side of CPU and GPU");
1564*cd620004SJunchao Zhang   } else {
1565*cd620004SJunchao Zhang     ierr = PetscSFLinkGetRootIndices(sf,link,rootmtype,PETSCSF_LOCAL,&count,&rootstart,&rootindices);CHKERRQ(ierr);
1566*cd620004SJunchao Zhang     ierr = PetscSFLinkGetLeafIndices(sf,link,leafmtype,PETSCSF_LOCAL,NULL,&leafstart,&leafindices);CHKERRQ(ierr);
1567*cd620004SJunchao Zhang     ierr = PetscSFLinkGetFetchAndOpLocal(link,rootmtype,op,bas->rootdups[PETSCSF_LOCAL],&FetchAndOpLocal);CHKERRQ(ierr);
1568*cd620004SJunchao Zhang     ierr = (*FetchAndOpLocal)(link,count,rootstart,rootindices,rootdata,leafstart,leafindices,leafdata,leafupdate);CHKERRQ(ierr);
1569*cd620004SJunchao Zhang   }
157040e23c03SJunchao Zhang   PetscFunctionReturn(0);
157140e23c03SJunchao Zhang }
157240e23c03SJunchao Zhang 
157340e23c03SJunchao Zhang /*
1574*cd620004SJunchao Zhang   Create per-rank pack/unpack optimizations based on indice patterns
157540e23c03SJunchao Zhang 
157640e23c03SJunchao Zhang    Input Parameters:
1577b23bfdefSJunchao Zhang   +  n       - Number of target ranks
1578eb02082bSJunchao Zhang   .  offset  - [n+1] For the i-th rank, its associated indices are idx[offset[i], offset[i+1]). offset[0] needs not to be 0.
1579b23bfdefSJunchao Zhang   -  idx     - [*]   Array storing indices
158040e23c03SJunchao Zhang 
158140e23c03SJunchao Zhang    Output Parameters:
1582*cd620004SJunchao Zhang   +  opt     - Pack optimizations. NULL if no optimizations.
158340e23c03SJunchao Zhang */
1584*cd620004SJunchao Zhang PetscErrorCode PetscSFCreatePackOpt(PetscInt n,const PetscInt *offset,const PetscInt *idx,PetscSFPackOpt *out)
158540e23c03SJunchao Zhang {
158640e23c03SJunchao Zhang   PetscErrorCode ierr;
158740e23c03SJunchao Zhang   PetscInt       i,j,k,n_copies,tot_copies = 0,step;
1588b23bfdefSJunchao Zhang   PetscBool      strided,optimized = PETSC_FALSE;
158940e23c03SJunchao Zhang   PetscSFPackOpt opt;
159040e23c03SJunchao Zhang 
159140e23c03SJunchao Zhang   PetscFunctionBegin;
1592b23bfdefSJunchao Zhang   if (!n) {
1593b23bfdefSJunchao Zhang     *out = NULL;
1594b23bfdefSJunchao Zhang     PetscFunctionReturn(0);
1595b23bfdefSJunchao Zhang   }
1596b23bfdefSJunchao Zhang 
159740e23c03SJunchao Zhang   ierr = PetscCalloc1(1,&opt);CHKERRQ(ierr);
1598b23bfdefSJunchao Zhang   ierr = PetscCalloc3(n,&opt->type,n+1,&opt->offset,n+1,&opt->copy_offset);CHKERRQ(ierr);
1599b23bfdefSJunchao Zhang   ierr = PetscArraycpy(opt->offset,offset,n+1);CHKERRQ(ierr);
1600*cd620004SJunchao Zhang   /* Make opt->offset[] zero-based. If one calls this routine with non-zero offset[0], one should use packing routine in this way, Pack(count,idx+offset[0],packopt,...) */
1601*cd620004SJunchao Zhang   if (offset[0]) {for (i=0; i<n+1; i++) opt->offset[i] -= offset[0];}
1602b23bfdefSJunchao Zhang 
1603b23bfdefSJunchao Zhang   opt->n = n;
160440e23c03SJunchao Zhang 
1605fb61b9e4SStefano Zampini   /* Check if the indices are piece-wise contiguous (if yes, we can optimize a packing with multiple memcpy's ) */
160640e23c03SJunchao Zhang   for (i=0; i<n; i++) { /* for each target processor */
160740e23c03SJunchao Zhang     /* Scan indices to count n_copies -- the number of contiguous pieces for i-th target */
160840e23c03SJunchao Zhang     n_copies = 1;
160940e23c03SJunchao Zhang     for (j=offset[i]; j<offset[i+1]-1; j++) {
161040e23c03SJunchao Zhang       if (idx[j]+1 != idx[j+1]) n_copies++;
161140e23c03SJunchao Zhang     }
161240e23c03SJunchao Zhang     /* If the average length (in no. of indices) of contiguous pieces is long enough, say >=32,
161340e23c03SJunchao Zhang        then it is worth using memcpy for this target. 32 is an arbitrarily chosen number.
161440e23c03SJunchao Zhang      */
161540e23c03SJunchao Zhang     if ((offset[i+1]-offset[i])/n_copies >= 32) {
1616b23bfdefSJunchao Zhang       opt->type[i] = PETSCSF_PACKOPT_MULTICOPY;
1617b23bfdefSJunchao Zhang       optimized    = PETSC_TRUE;
161840e23c03SJunchao Zhang       tot_copies  += n_copies;
161940e23c03SJunchao Zhang     }
162040e23c03SJunchao Zhang   }
162140e23c03SJunchao Zhang 
162240e23c03SJunchao Zhang   /* Setup memcpy plan for each contiguous piece */
162340e23c03SJunchao Zhang   k    = 0; /* k-th copy */
1624b23bfdefSJunchao Zhang   ierr = PetscMalloc4(tot_copies,&opt->copy_start,tot_copies,&opt->copy_length,n,&opt->stride_step,n,&opt->stride_n);CHKERRQ(ierr);
1625b23bfdefSJunchao Zhang   for (i=0; i<n; i++) { /* for each target processor */
1626b23bfdefSJunchao Zhang     if (opt->type[i] == PETSCSF_PACKOPT_MULTICOPY) {
162740e23c03SJunchao Zhang       n_copies           = 1;
1628b23bfdefSJunchao Zhang       opt->copy_start[k] = offset[i] - offset[0];
162940e23c03SJunchao Zhang       for (j=offset[i]; j<offset[i+1]-1; j++) {
163040e23c03SJunchao Zhang         if (idx[j]+1 != idx[j+1]) { /* meet end of a copy (and next copy must exist) */
163140e23c03SJunchao Zhang           n_copies++;
1632b23bfdefSJunchao Zhang           opt->copy_start[k+1] = j-offset[0]+1;
1633b23bfdefSJunchao Zhang           opt->copy_length[k]  = opt->copy_start[k+1] - opt->copy_start[k];
163440e23c03SJunchao Zhang           k++;
163540e23c03SJunchao Zhang         }
163640e23c03SJunchao Zhang       }
163740e23c03SJunchao Zhang       /* Set copy length of the last copy for this target */
1638b23bfdefSJunchao Zhang       opt->copy_length[k] = j-offset[0]+1 - opt->copy_start[k];
163940e23c03SJunchao Zhang       k++;
164040e23c03SJunchao Zhang     }
1641b23bfdefSJunchao Zhang     /* Set offset for next target. When opt->type[i]=PETSCSF_PACKOPT_NONE, copy_offsets[i]=copy_offsets[i+1] */
164240e23c03SJunchao Zhang     opt->copy_offset[i+1] = k;
164340e23c03SJunchao Zhang   }
164440e23c03SJunchao Zhang 
164540e23c03SJunchao Zhang   /* Last chance! If the indices do not have long contiguous pieces, are they strided? */
164640e23c03SJunchao Zhang   for (i=0; i<n; i++) { /* for each remote */
1647b23bfdefSJunchao Zhang     if (opt->type[i]==PETSCSF_PACKOPT_NONE && (offset[i+1] - offset[i]) >= 16) { /* few indices (<16) are not worth striding */
164840e23c03SJunchao Zhang       strided = PETSC_TRUE;
164940e23c03SJunchao Zhang       step    = idx[offset[i]+1] - idx[offset[i]];
165040e23c03SJunchao Zhang       for (j=offset[i]; j<offset[i+1]-1; j++) {
165140e23c03SJunchao Zhang         if (idx[j]+step != idx[j+1]) { strided = PETSC_FALSE; break; }
165240e23c03SJunchao Zhang       }
165340e23c03SJunchao Zhang       if (strided) {
1654b23bfdefSJunchao Zhang         opt->type[i]         = PETSCSF_PACKOPT_STRIDE;
165540e23c03SJunchao Zhang         opt->stride_step[i]  = step;
165640e23c03SJunchao Zhang         opt->stride_n[i]     = offset[i+1] - offset[i];
1657b23bfdefSJunchao Zhang         optimized            = PETSC_TRUE;
165840e23c03SJunchao Zhang       }
165940e23c03SJunchao Zhang     }
166040e23c03SJunchao Zhang   }
1661b23bfdefSJunchao Zhang   /* If no rank gets optimized, free arrays to save memory */
1662b23bfdefSJunchao Zhang   if (!optimized) {
1663b23bfdefSJunchao Zhang     ierr = PetscFree3(opt->type,opt->offset,opt->copy_offset);CHKERRQ(ierr);
1664b23bfdefSJunchao Zhang     ierr = PetscFree4(opt->copy_start,opt->copy_length,opt->stride_step,opt->stride_n);CHKERRQ(ierr);
166540e23c03SJunchao Zhang     ierr = PetscFree(opt);CHKERRQ(ierr);
166640e23c03SJunchao Zhang     *out = NULL;
166740e23c03SJunchao Zhang   } else *out = opt;
166840e23c03SJunchao Zhang   PetscFunctionReturn(0);
166940e23c03SJunchao Zhang }
167040e23c03SJunchao Zhang 
1671*cd620004SJunchao Zhang PetscErrorCode PetscSFDestroyPackOpt(PetscSFPackOpt *out)
167240e23c03SJunchao Zhang {
167340e23c03SJunchao Zhang   PetscErrorCode ierr;
167440e23c03SJunchao Zhang   PetscSFPackOpt opt = *out;
167540e23c03SJunchao Zhang 
167640e23c03SJunchao Zhang   PetscFunctionBegin;
167740e23c03SJunchao Zhang   if (opt) {
1678b23bfdefSJunchao Zhang     ierr = PetscFree3(opt->type,opt->offset,opt->copy_offset);CHKERRQ(ierr);
1679b23bfdefSJunchao Zhang     ierr = PetscFree4(opt->copy_start,opt->copy_length,opt->stride_step,opt->stride_n);CHKERRQ(ierr);
168040e23c03SJunchao Zhang     ierr = PetscFree(opt);CHKERRQ(ierr);
168140e23c03SJunchao Zhang     *out = NULL;
168240e23c03SJunchao Zhang   }
168340e23c03SJunchao Zhang   PetscFunctionReturn(0);
168440e23c03SJunchao Zhang }
1685*cd620004SJunchao Zhang 
1686*cd620004SJunchao Zhang PetscErrorCode PetscSFSetUpPackFields(PetscSF sf)
1687*cd620004SJunchao Zhang {
1688*cd620004SJunchao Zhang   PetscErrorCode ierr;
1689*cd620004SJunchao Zhang   PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
1690*cd620004SJunchao Zhang   PetscInt       i,j;
1691*cd620004SJunchao Zhang 
1692*cd620004SJunchao Zhang   PetscFunctionBegin;
1693*cd620004SJunchao Zhang   /* [0] for PETSCSF_LOCAL and [1] for PETSCSF_REMOTE in the following */
1694*cd620004SJunchao Zhang   for (i=0; i<2; i++) { /* Set defaults */
1695*cd620004SJunchao Zhang     sf->leafstart[i]   = 0;
1696*cd620004SJunchao Zhang     sf->leafcontig[i]  = PETSC_TRUE;
1697*cd620004SJunchao Zhang     sf->leafdups[i]    = PETSC_FALSE;
1698*cd620004SJunchao Zhang     bas->rootstart[i]  = 0;
1699*cd620004SJunchao Zhang     bas->rootcontig[i] = PETSC_TRUE;
1700*cd620004SJunchao Zhang     bas->rootdups[i]   = PETSC_FALSE;
1701*cd620004SJunchao Zhang   }
1702*cd620004SJunchao Zhang 
1703*cd620004SJunchao Zhang   sf->leafbuflen[0] = sf->roffset[sf->ndranks];
1704*cd620004SJunchao Zhang   sf->leafbuflen[1] = sf->roffset[sf->nranks] - sf->roffset[sf->ndranks];
1705*cd620004SJunchao Zhang 
1706*cd620004SJunchao Zhang   if (sf->leafbuflen[0]) sf->leafstart[0] = sf->rmine[0];
1707*cd620004SJunchao Zhang   if (sf->leafbuflen[1]) sf->leafstart[1] = sf->rmine[sf->roffset[sf->ndranks]];
1708*cd620004SJunchao Zhang 
1709*cd620004SJunchao Zhang   /* Are leaf indices for self and remote contiguous? If yes, it is best for pack/unpack */
1710*cd620004SJunchao Zhang   for (i=0; i<sf->roffset[sf->ndranks]; i++) { /* self */
1711*cd620004SJunchao Zhang     if (sf->rmine[i] != sf->leafstart[0]+i) {sf->leafcontig[0] = PETSC_FALSE; break;}
1712*cd620004SJunchao Zhang   }
1713*cd620004SJunchao Zhang   for (i=sf->roffset[sf->ndranks],j=0; i<sf->roffset[sf->nranks]; i++,j++) { /* remote */
1714*cd620004SJunchao Zhang     if (sf->rmine[i] != sf->leafstart[1]+j) {sf->leafcontig[1] = PETSC_FALSE; break;}
1715*cd620004SJunchao Zhang   }
1716*cd620004SJunchao Zhang 
1717*cd620004SJunchao Zhang   /* If not, see if we can have per-rank optimizations by doing index analysis */
1718*cd620004SJunchao Zhang   if (!sf->leafcontig[0]) {ierr = PetscSFCreatePackOpt(sf->ndranks,            sf->roffset,             sf->rmine, &sf->leafpackopt[0]);CHKERRQ(ierr);}
1719*cd620004SJunchao Zhang   if (!sf->leafcontig[1]) {ierr = PetscSFCreatePackOpt(sf->nranks-sf->ndranks, sf->roffset+sf->ndranks, sf->rmine, &sf->leafpackopt[1]);CHKERRQ(ierr);}
1720*cd620004SJunchao Zhang 
1721*cd620004SJunchao Zhang   /* Are root indices for self and remote contiguous? */
1722*cd620004SJunchao Zhang   bas->rootbuflen[0] = bas->ioffset[bas->ndiranks];
1723*cd620004SJunchao Zhang   bas->rootbuflen[1] = bas->ioffset[bas->niranks] - bas->ioffset[bas->ndiranks];
1724*cd620004SJunchao Zhang 
1725*cd620004SJunchao Zhang   if (bas->rootbuflen[0]) bas->rootstart[0] = bas->irootloc[0];
1726*cd620004SJunchao Zhang   if (bas->rootbuflen[1]) bas->rootstart[1] = bas->irootloc[bas->ioffset[bas->ndiranks]];
1727*cd620004SJunchao Zhang 
1728*cd620004SJunchao Zhang   for (i=0; i<bas->ioffset[bas->ndiranks]; i++) {
1729*cd620004SJunchao Zhang     if (bas->irootloc[i] != bas->rootstart[0]+i) {bas->rootcontig[0] = PETSC_FALSE; break;}
1730*cd620004SJunchao Zhang   }
1731*cd620004SJunchao Zhang   for (i=bas->ioffset[bas->ndiranks],j=0; i<bas->ioffset[bas->niranks]; i++,j++) {
1732*cd620004SJunchao Zhang     if (bas->irootloc[i] != bas->rootstart[1]+j) {bas->rootcontig[1] = PETSC_FALSE; break;}
1733*cd620004SJunchao Zhang   }
1734*cd620004SJunchao Zhang 
1735*cd620004SJunchao Zhang   if (!bas->rootcontig[0]) {ierr = PetscSFCreatePackOpt(bas->ndiranks,              bas->ioffset,               bas->irootloc, &bas->rootpackopt[0]);CHKERRQ(ierr);}
1736*cd620004SJunchao Zhang   if (!bas->rootcontig[1]) {ierr = PetscSFCreatePackOpt(bas->niranks-bas->ndiranks, bas->ioffset+bas->ndiranks, bas->irootloc, &bas->rootpackopt[1]);CHKERRQ(ierr);}
1737*cd620004SJunchao Zhang 
1738*cd620004SJunchao Zhang #if defined(PETSC_HAVE_CUDA)
1739*cd620004SJunchao Zhang     /* Check dups in indices so that CUDA unpacking kernels can use cheaper regular instructions instead of atomics when they know there are no data race chances */
1740*cd620004SJunchao Zhang   if (!sf->leafcontig[0])  {ierr = PetscCheckDupsInt(sf->leafbuflen[0],  sf->rmine,                                 &sf->leafdups[0]);CHKERRQ(ierr);}
1741*cd620004SJunchao Zhang   if (!sf->leafcontig[1])  {ierr = PetscCheckDupsInt(sf->leafbuflen[1],  sf->rmine+sf->roffset[sf->ndranks],        &sf->leafdups[1]);CHKERRQ(ierr);}
1742*cd620004SJunchao Zhang   if (!bas->rootcontig[0]) {ierr = PetscCheckDupsInt(bas->rootbuflen[0], bas->irootloc,                             &bas->rootdups[0]);CHKERRQ(ierr);}
1743*cd620004SJunchao Zhang   if (!bas->rootcontig[1]) {ierr = PetscCheckDupsInt(bas->rootbuflen[1], bas->irootloc+bas->ioffset[bas->ndiranks], &bas->rootdups[1]);CHKERRQ(ierr);}
1744*cd620004SJunchao Zhang #endif
1745*cd620004SJunchao Zhang 
1746*cd620004SJunchao Zhang   PetscFunctionReturn(0);
1747*cd620004SJunchao Zhang }
1748*cd620004SJunchao Zhang 
1749*cd620004SJunchao Zhang PetscErrorCode PetscSFResetPackFields(PetscSF sf)
1750*cd620004SJunchao Zhang {
1751*cd620004SJunchao Zhang   PetscErrorCode ierr;
1752*cd620004SJunchao Zhang   PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
1753*cd620004SJunchao Zhang   PetscInt       i;
1754*cd620004SJunchao Zhang 
1755*cd620004SJunchao Zhang   PetscFunctionBegin;
1756*cd620004SJunchao Zhang   for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) {
1757*cd620004SJunchao Zhang     ierr = PetscSFDestroyPackOpt(&sf->leafpackopt[i]);CHKERRQ(ierr);
1758*cd620004SJunchao Zhang     ierr = PetscSFDestroyPackOpt(&bas->rootpackopt[i]);CHKERRQ(ierr);
1759*cd620004SJunchao Zhang   }
1760*cd620004SJunchao Zhang   PetscFunctionReturn(0);
1761*cd620004SJunchao Zhang }
1762