xref: /petsc/src/vec/is/sf/impls/basic/sfpack.c (revision a247e58cd33788cb93bf79dd07c27b63b5a63dbf)
120c24465SJunchao Zhang #include "petsc/private/sfimpl.h"
240e23c03SJunchao Zhang #include <../src/vec/is/sf/impls/basic/sfpack.h>
340e23c03SJunchao Zhang #include <../src/vec/is/sf/impls/basic/sfbasic.h>
440e23c03SJunchao Zhang 
57fd2d3dbSJunchao Zhang /* This is a C file that contains packing facilities, with dispatches to device if enabled. */
67fd2d3dbSJunchao Zhang 
7eb02082bSJunchao Zhang #if defined(PETSC_HAVE_CUDA)
8eb02082bSJunchao Zhang #include <cuda_runtime.h>
97fd2d3dbSJunchao Zhang #include <petsccublas.h>
10eb02082bSJunchao Zhang #endif
1140e23c03SJunchao Zhang /*
1240e23c03SJunchao Zhang  * MPI_Reduce_local is not really useful because it can't handle sparse data and it vectorizes "in the wrong direction",
1340e23c03SJunchao Zhang  * therefore we pack data types manually. This file defines packing routines for the standard data types.
1440e23c03SJunchao Zhang  */
1540e23c03SJunchao Zhang 
16cd620004SJunchao Zhang #define CPPJoin4(a,b,c,d)  a##_##b##_##c##_##d
1740e23c03SJunchao Zhang 
18cd620004SJunchao Zhang /* Operations working like s += t */
19cd620004SJunchao Zhang #define OP_BINARY(op,s,t)   do {(s) = (s) op (t);  } while (0)      /* binary ops in the middle such as +, *, && etc. */
20cd620004SJunchao Zhang #define OP_FUNCTION(op,s,t) do {(s) = op((s),(t)); } while (0)      /* ops like a function, such as PetscMax, PetscMin */
21cd620004SJunchao Zhang #define OP_LXOR(op,s,t)     do {(s) = (!(s)) != (!(t));} while (0)  /* logical exclusive OR */
22cd620004SJunchao Zhang #define OP_ASSIGN(op,s,t)   do {(s) = (t);} while (0)
23cd620004SJunchao Zhang /* Ref MPI MAXLOC */
24cd620004SJunchao Zhang #define OP_XLOC(op,s,t) \
25cd620004SJunchao Zhang   do {                                       \
26cd620004SJunchao Zhang     if ((s).u == (t).u) (s).i = PetscMin((s).i,(t).i); \
27cd620004SJunchao Zhang     else if (!((s).u op (t).u)) s = t;           \
28cd620004SJunchao Zhang   } while (0)
2940e23c03SJunchao Zhang 
3040e23c03SJunchao Zhang /* DEF_PackFunc - macro defining a Pack routine
3140e23c03SJunchao Zhang 
3240e23c03SJunchao Zhang    Arguments of the macro:
33b23bfdefSJunchao Zhang    +Type      Type of the basic data in an entry, i.e., int, PetscInt, PetscReal etc. It is not the type of an entry.
34fcc7397dSJunchao Zhang    .BS        Block size for vectorization. It is a factor of bsz.
35b23bfdefSJunchao Zhang    -EQ        (bs == BS) ? 1 : 0. EQ is a compile-time const to help compiler optimizations. See below.
3640e23c03SJunchao Zhang 
3740e23c03SJunchao Zhang    Arguments of the Pack routine:
38cd620004SJunchao Zhang    +count     Number of indices in idx[].
39fcc7397dSJunchao Zhang    .start     When opt and idx are NULL, it means indices are contiguous & start is the first index; otherwise, not used.
40fcc7397dSJunchao Zhang    .opt       Per-pack optimization plan. NULL means no such plan.
41fcc7397dSJunchao Zhang    .idx       Indices of entries to packed.
42eb02082bSJunchao Zhang    .link      Provide a context for the current call, such as link->bs, number of basic types in an entry. Ex. if unit is MPI_2INT, then bs=2 and the basic type is int.
43cd620004SJunchao Zhang    .unpacked  Address of the unpacked data. The entries will be packed are unpacked[idx[i]],for i in [0,count).
44cd620004SJunchao Zhang    -packed    Address of the packed data.
4540e23c03SJunchao Zhang  */
46b23bfdefSJunchao Zhang #define DEF_PackFunc(Type,BS,EQ) \
47fcc7397dSJunchao Zhang   static PetscErrorCode CPPJoin4(Pack,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,const void *unpacked,void *packed) \
48b23bfdefSJunchao Zhang   {                                                                                                          \
4940e23c03SJunchao Zhang     PetscErrorCode ierr;                                                                                     \
50b23bfdefSJunchao Zhang     const Type     *u = (const Type*)unpacked,*u2;                                                           \
51b23bfdefSJunchao Zhang     Type           *p = (Type*)packed,*p2;                                                                   \
52fcc7397dSJunchao Zhang     PetscInt       i,j,k,X,Y,r,bs=link->bs;                                                                  \
53fcc7397dSJunchao Zhang     const PetscInt M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */          \
54b23bfdefSJunchao Zhang     const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */                \
5540e23c03SJunchao Zhang     PetscFunctionBegin;                                                                                      \
56fcc7397dSJunchao Zhang     if (!idx) {ierr = PetscArraycpy(p,u+start*MBS,MBS*count);CHKERRQ(ierr);}/* idx[] are contiguous */       \
57fcc7397dSJunchao Zhang     else if (opt) { /* has optimizations available */                                                        \
58fcc7397dSJunchao Zhang       p2 = p;                                                                                                \
59fcc7397dSJunchao Zhang       for (r=0; r<opt->n; r++) {                                                                             \
60fcc7397dSJunchao Zhang         u2 = u + opt->start[r]*MBS;                                                                          \
61fcc7397dSJunchao Zhang         X  = opt->X[r];                                                                                      \
62fcc7397dSJunchao Zhang         Y  = opt->Y[r];                                                                                      \
63fcc7397dSJunchao Zhang         for (k=0; k<opt->dz[r]; k++)                                                                         \
64fcc7397dSJunchao Zhang           for (j=0; j<opt->dy[r]; j++) {                                                                     \
65fcc7397dSJunchao Zhang             ierr = PetscArraycpy(p2,u2+(X*Y*k+X*j)*MBS,opt->dx[r]*MBS);CHKERRQ(ierr);                        \
66fcc7397dSJunchao Zhang             p2  += opt->dx[r]*MBS;                                                                           \
67fcc7397dSJunchao Zhang           }                                                                                                  \
68fcc7397dSJunchao Zhang       }                                                                                                      \
69fcc7397dSJunchao Zhang     } else {                                                                                                 \
70b23bfdefSJunchao Zhang       for (i=0; i<count; i++)                                                                                \
71eb02082bSJunchao Zhang         for (j=0; j<M; j++)     /* Decent compilers should eliminate this loop when M = const 1 */           \
72eb02082bSJunchao Zhang           for (k=0; k<BS; k++)  /* Compiler either unrolls (BS=1) or vectorizes (BS=2,4,8,etc) this loop */  \
73b23bfdefSJunchao Zhang             p[i*MBS+j*BS+k] = u[idx[i]*MBS+j*BS+k];                                                          \
7440e23c03SJunchao Zhang     }                                                                                                        \
7540e23c03SJunchao Zhang     PetscFunctionReturn(0);                                                                                  \
7640e23c03SJunchao Zhang   }
7740e23c03SJunchao Zhang 
78cd620004SJunchao Zhang /* DEF_Action - macro defining a UnpackAndInsert routine that unpacks data from a contiguous buffer
79cd620004SJunchao Zhang                 and inserts into a sparse array.
8040e23c03SJunchao Zhang 
8140e23c03SJunchao Zhang    Arguments:
82b23bfdefSJunchao Zhang   .Type       Type of the data
8340e23c03SJunchao Zhang   .BS         Block size for vectorization
84b23bfdefSJunchao Zhang   .EQ        (bs == BS) ? 1 : 0. EQ is a compile-time const.
8540e23c03SJunchao Zhang 
8640e23c03SJunchao Zhang   Notes:
8740e23c03SJunchao Zhang    This macro is not combined with DEF_ActionAndOp because we want to use memcpy in this macro.
8840e23c03SJunchao Zhang  */
89cd620004SJunchao Zhang #define DEF_UnpackFunc(Type,BS,EQ)               \
90fcc7397dSJunchao Zhang   static PetscErrorCode CPPJoin4(UnpackAndInsert,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,void *unpacked,const void *packed) \
91b23bfdefSJunchao Zhang   {                                                                                                          \
9240e23c03SJunchao Zhang     PetscErrorCode ierr;                                                                                     \
93b23bfdefSJunchao Zhang     Type           *u = (Type*)unpacked,*u2;                                                                 \
94fcc7397dSJunchao Zhang     const Type     *p = (const Type*)packed;                                                                 \
95fcc7397dSJunchao Zhang     PetscInt       i,j,k,X,Y,r,bs=link->bs;                                                                  \
96fcc7397dSJunchao Zhang     const PetscInt M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */          \
97b23bfdefSJunchao Zhang     const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */                \
9840e23c03SJunchao Zhang     PetscFunctionBegin;                                                                                      \
99b23bfdefSJunchao Zhang     if (!idx) {                                                                                              \
100fcc7397dSJunchao Zhang       u += start*MBS;                                                                                        \
101fcc7397dSJunchao Zhang       if (u != p) {ierr = PetscArraycpy(u,p,count*MBS);CHKERRQ(ierr);}                                       \
102fcc7397dSJunchao Zhang     } else if (opt) { /* has optimizations available */                                                      \
103fcc7397dSJunchao Zhang       for (r=0; r<opt->n; r++) {                                                                             \
104fcc7397dSJunchao Zhang         u2 = u + opt->start[r]*MBS;                                                                          \
105fcc7397dSJunchao Zhang         X  = opt->X[r];                                                                                      \
106fcc7397dSJunchao Zhang         Y  = opt->Y[r];                                                                                      \
107fcc7397dSJunchao Zhang         for (k=0; k<opt->dz[r]; k++)                                                                         \
108fcc7397dSJunchao Zhang           for (j=0; j<opt->dy[r]; j++) {                                                                     \
109fcc7397dSJunchao Zhang             ierr = PetscArraycpy(u2+(X*Y*k+X*j)*MBS,p,opt->dx[r]*MBS);CHKERRQ(ierr);                         \
110fcc7397dSJunchao Zhang             p   += opt->dx[r]*MBS;                                                                           \
111fcc7397dSJunchao Zhang           }                                                                                                  \
112fcc7397dSJunchao Zhang       }                                                                                                      \
113fcc7397dSJunchao Zhang     } else {                                                                                                 \
114b23bfdefSJunchao Zhang       for (i=0; i<count; i++)                                                                                \
115b23bfdefSJunchao Zhang         for (j=0; j<M; j++)                                                                                  \
116cd620004SJunchao Zhang           for (k=0; k<BS; k++) u[idx[i]*MBS+j*BS+k] = p[i*MBS+j*BS+k];                                       \
11740e23c03SJunchao Zhang     }                                                                                                        \
11840e23c03SJunchao Zhang     PetscFunctionReturn(0);                                                                                  \
11940e23c03SJunchao Zhang   }
12040e23c03SJunchao Zhang 
121cd620004SJunchao Zhang /* DEF_UnpackAndOp - macro defining a UnpackAndOp routine where Op should not be Insert
12240e23c03SJunchao Zhang 
12340e23c03SJunchao Zhang    Arguments:
124cd620004SJunchao Zhang   +Opname     Name of the Op, such as Add, Mult, LAND, etc.
125b23bfdefSJunchao Zhang   .Type       Type of the data
12640e23c03SJunchao Zhang   .BS         Block size for vectorization
127b23bfdefSJunchao Zhang   .EQ         (bs == BS) ? 1 : 0. EQ is a compile-time const.
128cd620004SJunchao Zhang   .Op         Operator for the op, such as +, *, &&, ||, PetscMax, PetscMin, etc.
129cd620004SJunchao Zhang   .OpApply    Macro defining application of the op. Could be OP_BINARY, OP_FUNCTION, OP_LXOR
13040e23c03SJunchao Zhang  */
131cd620004SJunchao Zhang #define DEF_UnpackAndOp(Type,BS,EQ,Opname,Op,OpApply) \
132fcc7397dSJunchao Zhang   static PetscErrorCode CPPJoin4(UnpackAnd##Opname,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,void *unpacked,const void *packed) \
133b23bfdefSJunchao Zhang   {                                                                                                          \
134cd620004SJunchao Zhang     Type           *u = (Type*)unpacked,*u2;                                                                 \
135fcc7397dSJunchao Zhang     const Type     *p = (const Type*)packed;                                                                 \
136fcc7397dSJunchao Zhang     PetscInt       i,j,k,X,Y,r,bs=link->bs;                                                                  \
137fcc7397dSJunchao Zhang     const PetscInt M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */          \
138b23bfdefSJunchao Zhang     const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */                \
13940e23c03SJunchao Zhang     PetscFunctionBegin;                                                                                      \
140b23bfdefSJunchao Zhang     if (!idx) {                                                                                              \
141fcc7397dSJunchao Zhang       u += start*MBS;                                                                                        \
142cd620004SJunchao Zhang       for (i=0; i<count; i++)                                                                                \
143cd620004SJunchao Zhang         for (j=0; j<M; j++)                                                                                  \
144cd620004SJunchao Zhang           for (k=0; k<BS; k++)                                                                               \
145cd620004SJunchao Zhang             OpApply(Op,u[i*MBS+j*BS+k],p[i*MBS+j*BS+k]);                                                     \
146fcc7397dSJunchao Zhang     } else if (opt) { /* idx[] has patterns */                                                               \
147fcc7397dSJunchao Zhang       for (r=0; r<opt->n; r++) {                                                                             \
148fcc7397dSJunchao Zhang         u2 = u + opt->start[r]*MBS;                                                                          \
149fcc7397dSJunchao Zhang         X  = opt->X[r];                                                                                      \
150fcc7397dSJunchao Zhang         Y  = opt->Y[r];                                                                                      \
151fcc7397dSJunchao Zhang         for (k=0; k<opt->dz[r]; k++)                                                                         \
152fcc7397dSJunchao Zhang           for (j=0; j<opt->dy[r]; j++) {                                                                     \
153fcc7397dSJunchao Zhang             for (i=0; i<opt->dx[r]*MBS; i++) OpApply(Op,u2[(X*Y*k+X*j)*MBS+i],p[i]);                         \
154fcc7397dSJunchao Zhang             p += opt->dx[r]*MBS;                                                                             \
155fcc7397dSJunchao Zhang           }                                                                                                  \
156fcc7397dSJunchao Zhang       }                                                                                                      \
157fcc7397dSJunchao Zhang     } else {                                                                                                 \
158cd620004SJunchao Zhang       for (i=0; i<count; i++)                                                                                \
159cd620004SJunchao Zhang         for (j=0; j<M; j++)                                                                                  \
160cd620004SJunchao Zhang           for (k=0; k<BS; k++)                                                                               \
161cd620004SJunchao Zhang             OpApply(Op,u[idx[i]*MBS+j*BS+k],p[i*MBS+j*BS+k]);                                                \
162cd620004SJunchao Zhang     }                                                                                                        \
163cd620004SJunchao Zhang     PetscFunctionReturn(0);                                                                                  \
164cd620004SJunchao Zhang   }
165cd620004SJunchao Zhang 
166cd620004SJunchao Zhang #define DEF_FetchAndOp(Type,BS,EQ,Opname,Op,OpApply) \
167fcc7397dSJunchao Zhang   static PetscErrorCode CPPJoin4(FetchAnd##Opname,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,void *unpacked,void *packed) \
168cd620004SJunchao Zhang   {                                                                                                          \
169fcc7397dSJunchao Zhang     Type           *u = (Type*)unpacked,*p = (Type*)packed,tmp;                                              \
170fcc7397dSJunchao Zhang     PetscInt       i,j,k,r,l,bs=link->bs;                                                                    \
171fcc7397dSJunchao Zhang     const PetscInt M = (EQ) ? 1 : bs/BS;                                                                     \
172fcc7397dSJunchao Zhang     const PetscInt MBS = M*BS;                                                                               \
173cd620004SJunchao Zhang     PetscFunctionBegin;                                                                                      \
174fcc7397dSJunchao Zhang     for (i=0; i<count; i++) {                                                                                \
175fcc7397dSJunchao Zhang       r = (!idx ? start+i : idx[i])*MBS;                                                                     \
176fcc7397dSJunchao Zhang       l = i*MBS;                                                                                             \
177b23bfdefSJunchao Zhang       for (j=0; j<M; j++)                                                                                    \
178b23bfdefSJunchao Zhang         for (k=0; k<BS; k++) {                                                                               \
179fcc7397dSJunchao Zhang           tmp = u[r+j*BS+k];                                                                                 \
180fcc7397dSJunchao Zhang           OpApply(Op,u[r+j*BS+k],p[l+j*BS+k]);                                                               \
181fcc7397dSJunchao Zhang           p[l+j*BS+k] = tmp;                                                                                 \
182cd620004SJunchao Zhang         }                                                                                                    \
183cd620004SJunchao Zhang     }                                                                                                        \
184cd620004SJunchao Zhang     PetscFunctionReturn(0);                                                                                  \
185cd620004SJunchao Zhang   }
186cd620004SJunchao Zhang 
187cd620004SJunchao Zhang #define DEF_ScatterAndOp(Type,BS,EQ,Opname,Op,OpApply) \
188fcc7397dSJunchao Zhang   static PetscErrorCode CPPJoin4(ScatterAnd##Opname,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt srcStart,PetscSFPackOpt srcOpt,const PetscInt *srcIdx,const void *src,PetscInt dstStart,PetscSFPackOpt dstOpt,const PetscInt *dstIdx,void *dst) \
189cd620004SJunchao Zhang   {                                                                                                          \
190fcc7397dSJunchao Zhang     PetscErrorCode ierr;                                                                                     \
191fcc7397dSJunchao Zhang     const Type     *u = (const Type*)src;                                                                    \
192fcc7397dSJunchao Zhang     Type           *v = (Type*)dst;                                                                          \
193fcc7397dSJunchao Zhang     PetscInt       i,j,k,s,t,X,Y,bs = link->bs;                                                              \
194cd620004SJunchao Zhang     const PetscInt M = (EQ) ? 1 : bs/BS;                                                                     \
195cd620004SJunchao Zhang     const PetscInt MBS = M*BS;                                                                               \
196cd620004SJunchao Zhang     PetscFunctionBegin;                                                                                      \
197fcc7397dSJunchao Zhang     if (!srcIdx) { /* src is contiguous */                                                                   \
198fcc7397dSJunchao Zhang       u += srcStart*MBS;                                                                                     \
199fcc7397dSJunchao Zhang       ierr = CPPJoin4(UnpackAnd##Opname,Type,BS,EQ)(link,count,dstStart,dstOpt,dstIdx,dst,u);CHKERRQ(ierr);  \
200fcc7397dSJunchao Zhang     } else if (srcOpt && !dstIdx) { /* src is 3D, dst is contiguous */                                       \
201fcc7397dSJunchao Zhang       u += srcOpt->start[0]*MBS;                                                                             \
202fcc7397dSJunchao Zhang       v += dstStart*MBS;                                                                                     \
203fcc7397dSJunchao Zhang       X  = srcOpt->X[0]; Y = srcOpt->Y[0];                                                                   \
204fcc7397dSJunchao Zhang       for (k=0; k<srcOpt->dz[0]; k++)                                                                        \
205fcc7397dSJunchao Zhang         for (j=0; j<srcOpt->dy[0]; j++) {                                                                    \
206fcc7397dSJunchao Zhang           for (i=0; i<srcOpt->dx[0]*MBS; i++) OpApply(Op,v[i],u[(X*Y*k+X*j)*MBS+i]);                         \
207fcc7397dSJunchao Zhang           v += srcOpt->dx[0]*MBS;                                                                            \
208fcc7397dSJunchao Zhang         }                                                                                                    \
209fcc7397dSJunchao Zhang     } else { /* all other cases */                                                                           \
210fcc7397dSJunchao Zhang       for (i=0; i<count; i++) {                                                                              \
211fcc7397dSJunchao Zhang         s = (!srcIdx ? srcStart+i : srcIdx[i])*MBS;                                                          \
212fcc7397dSJunchao Zhang         t = (!dstIdx ? dstStart+i : dstIdx[i])*MBS;                                                          \
213cd620004SJunchao Zhang         for (j=0; j<M; j++)                                                                                  \
214fcc7397dSJunchao Zhang           for (k=0; k<BS; k++) OpApply(Op,v[t+j*BS+k],u[s+j*BS+k]);                                          \
215fcc7397dSJunchao Zhang       }                                                                                                      \
216cd620004SJunchao Zhang     }                                                                                                        \
217cd620004SJunchao Zhang     PetscFunctionReturn(0);                                                                                  \
218cd620004SJunchao Zhang   }
219cd620004SJunchao Zhang 
220cd620004SJunchao Zhang #define DEF_FetchAndOpLocal(Type,BS,EQ,Opname,Op,OpApply) \
221fcc7397dSJunchao Zhang   static PetscErrorCode CPPJoin4(FetchAnd##Opname##Local,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt rootstart,PetscSFPackOpt rootopt,const PetscInt *rootidx,void *rootdata,PetscInt leafstart,PetscSFPackOpt leafopt,const PetscInt *leafidx,const void *leafdata,void *leafupdate) \
222cd620004SJunchao Zhang   {                                                                                                          \
223fcc7397dSJunchao Zhang     Type           *rdata = (Type*)rootdata,*lupdate = (Type*)leafupdate;                                    \
224fcc7397dSJunchao Zhang     const Type     *ldata = (const Type*)leafdata;                                                           \
225fcc7397dSJunchao Zhang     PetscInt       i,j,k,r,l,bs = link->bs;                                                                  \
226cd620004SJunchao Zhang     const PetscInt M = (EQ) ? 1 : bs/BS;                                                                     \
227cd620004SJunchao Zhang     const PetscInt MBS = M*BS;                                                                               \
228cd620004SJunchao Zhang     PetscFunctionBegin;                                                                                      \
229fcc7397dSJunchao Zhang     for (i=0; i<count; i++) {                                                                                \
230fcc7397dSJunchao Zhang       r = (rootidx ? rootidx[i] : rootstart+i)*MBS;                                                          \
231fcc7397dSJunchao Zhang       l = (leafidx ? leafidx[i] : leafstart+i)*MBS;                                                          \
232cd620004SJunchao Zhang       for (j=0; j<M; j++)                                                                                    \
233cd620004SJunchao Zhang         for (k=0; k<BS; k++) {                                                                               \
234fcc7397dSJunchao Zhang           lupdate[l+j*BS+k] = rdata[r+j*BS+k];                                                               \
235fcc7397dSJunchao Zhang           OpApply(Op,rdata[r+j*BS+k],ldata[l+j*BS+k]);                                                       \
23640e23c03SJunchao Zhang         }                                                                                                    \
23740e23c03SJunchao Zhang     }                                                                                                        \
23840e23c03SJunchao Zhang     PetscFunctionReturn(0);                                                                                  \
23940e23c03SJunchao Zhang   }
24040e23c03SJunchao Zhang 
241b23bfdefSJunchao Zhang /* Pack, Unpack/Fetch ops */
242b23bfdefSJunchao Zhang #define DEF_Pack(Type,BS,EQ)                                                      \
243b23bfdefSJunchao Zhang   DEF_PackFunc(Type,BS,EQ)                                                        \
244cd620004SJunchao Zhang   DEF_UnpackFunc(Type,BS,EQ)                                                      \
245cd620004SJunchao Zhang   DEF_ScatterAndOp(Type,BS,EQ,Insert,=,OP_ASSIGN)                                 \
246cd620004SJunchao Zhang   static void CPPJoin4(PackInit_Pack,Type,BS,EQ)(PetscSFLink link) {              \
247eb02082bSJunchao Zhang     link->h_Pack            = CPPJoin4(Pack,           Type,BS,EQ);               \
248eb02082bSJunchao Zhang     link->h_UnpackAndInsert = CPPJoin4(UnpackAndInsert,Type,BS,EQ);               \
249cd620004SJunchao Zhang     link->h_ScatterAndInsert= CPPJoin4(ScatterAndInsert,Type,BS,EQ);              \
25040e23c03SJunchao Zhang   }
25140e23c03SJunchao Zhang 
252b23bfdefSJunchao Zhang /* Add, Mult ops */
253b23bfdefSJunchao Zhang #define DEF_Add(Type,BS,EQ)                                                       \
254cd620004SJunchao Zhang   DEF_UnpackAndOp    (Type,BS,EQ,Add, +,OP_BINARY)                                \
255cd620004SJunchao Zhang   DEF_UnpackAndOp    (Type,BS,EQ,Mult,*,OP_BINARY)                                \
256cd620004SJunchao Zhang   DEF_FetchAndOp     (Type,BS,EQ,Add, +,OP_BINARY)                                \
257cd620004SJunchao Zhang   DEF_ScatterAndOp   (Type,BS,EQ,Add, +,OP_BINARY)                                \
258cd620004SJunchao Zhang   DEF_ScatterAndOp   (Type,BS,EQ,Mult,*,OP_BINARY)                                \
259cd620004SJunchao Zhang   DEF_FetchAndOpLocal(Type,BS,EQ,Add, +,OP_BINARY)                                \
260cd620004SJunchao Zhang   static void CPPJoin4(PackInit_Add,Type,BS,EQ)(PetscSFLink link) {               \
261eb02082bSJunchao Zhang     link->h_UnpackAndAdd     = CPPJoin4(UnpackAndAdd,    Type,BS,EQ);             \
262eb02082bSJunchao Zhang     link->h_UnpackAndMult    = CPPJoin4(UnpackAndMult,   Type,BS,EQ);             \
263eb02082bSJunchao Zhang     link->h_FetchAndAdd      = CPPJoin4(FetchAndAdd,     Type,BS,EQ);             \
264cd620004SJunchao Zhang     link->h_ScatterAndAdd    = CPPJoin4(ScatterAndAdd,   Type,BS,EQ);             \
265cd620004SJunchao Zhang     link->h_ScatterAndMult   = CPPJoin4(ScatterAndMult,  Type,BS,EQ);             \
266cd620004SJunchao Zhang     link->h_FetchAndAddLocal = CPPJoin4(FetchAndAddLocal,Type,BS,EQ);             \
26740e23c03SJunchao Zhang   }
26840e23c03SJunchao Zhang 
269b23bfdefSJunchao Zhang /* Max, Min ops */
270b23bfdefSJunchao Zhang #define DEF_Cmp(Type,BS,EQ)                                                       \
271cd620004SJunchao Zhang   DEF_UnpackAndOp (Type,BS,EQ,Max,PetscMax,OP_FUNCTION)                           \
272cd620004SJunchao Zhang   DEF_UnpackAndOp (Type,BS,EQ,Min,PetscMin,OP_FUNCTION)                           \
273cd620004SJunchao Zhang   DEF_ScatterAndOp(Type,BS,EQ,Max,PetscMax,OP_FUNCTION)                           \
274cd620004SJunchao Zhang   DEF_ScatterAndOp(Type,BS,EQ,Min,PetscMin,OP_FUNCTION)                           \
275cd620004SJunchao Zhang   static void CPPJoin4(PackInit_Compare,Type,BS,EQ)(PetscSFLink link) {           \
276eb02082bSJunchao Zhang     link->h_UnpackAndMax    = CPPJoin4(UnpackAndMax,   Type,BS,EQ);               \
277eb02082bSJunchao Zhang     link->h_UnpackAndMin    = CPPJoin4(UnpackAndMin,   Type,BS,EQ);               \
278cd620004SJunchao Zhang     link->h_ScatterAndMax   = CPPJoin4(ScatterAndMax,  Type,BS,EQ);               \
279cd620004SJunchao Zhang     link->h_ScatterAndMin   = CPPJoin4(ScatterAndMin,  Type,BS,EQ);               \
280b23bfdefSJunchao Zhang   }
281b23bfdefSJunchao Zhang 
282b23bfdefSJunchao Zhang /* Logical ops.
283cd620004SJunchao Zhang   The operator in OP_LXOR should be empty but is ||. It is not used. Put here to avoid
28440e23c03SJunchao Zhang   the compilation warning "empty macro arguments are undefined in ISO C90"
28540e23c03SJunchao Zhang  */
286b23bfdefSJunchao Zhang #define DEF_Log(Type,BS,EQ)                                                       \
287cd620004SJunchao Zhang   DEF_UnpackAndOp (Type,BS,EQ,LAND,&&,OP_BINARY)                                  \
288cd620004SJunchao Zhang   DEF_UnpackAndOp (Type,BS,EQ,LOR, ||,OP_BINARY)                                  \
289cd620004SJunchao Zhang   DEF_UnpackAndOp (Type,BS,EQ,LXOR,||, OP_LXOR)                                   \
290cd620004SJunchao Zhang   DEF_ScatterAndOp(Type,BS,EQ,LAND,&&,OP_BINARY)                                  \
291cd620004SJunchao Zhang   DEF_ScatterAndOp(Type,BS,EQ,LOR, ||,OP_BINARY)                                  \
292cd620004SJunchao Zhang   DEF_ScatterAndOp(Type,BS,EQ,LXOR,||, OP_LXOR)                                   \
293cd620004SJunchao Zhang   static void CPPJoin4(PackInit_Logical,Type,BS,EQ)(PetscSFLink link) {           \
294eb02082bSJunchao Zhang     link->h_UnpackAndLAND   = CPPJoin4(UnpackAndLAND, Type,BS,EQ);                \
295eb02082bSJunchao Zhang     link->h_UnpackAndLOR    = CPPJoin4(UnpackAndLOR,  Type,BS,EQ);                \
296eb02082bSJunchao Zhang     link->h_UnpackAndLXOR   = CPPJoin4(UnpackAndLXOR, Type,BS,EQ);                \
297cd620004SJunchao Zhang     link->h_ScatterAndLAND  = CPPJoin4(ScatterAndLAND,Type,BS,EQ);                \
298cd620004SJunchao Zhang     link->h_ScatterAndLOR   = CPPJoin4(ScatterAndLOR, Type,BS,EQ);                \
299cd620004SJunchao Zhang     link->h_ScatterAndLXOR  = CPPJoin4(ScatterAndLXOR,Type,BS,EQ);                \
30040e23c03SJunchao Zhang   }
30140e23c03SJunchao Zhang 
302b23bfdefSJunchao Zhang /* Bitwise ops */
303b23bfdefSJunchao Zhang #define DEF_Bit(Type,BS,EQ)                                                       \
304cd620004SJunchao Zhang   DEF_UnpackAndOp (Type,BS,EQ,BAND,&,OP_BINARY)                                   \
305cd620004SJunchao Zhang   DEF_UnpackAndOp (Type,BS,EQ,BOR, |,OP_BINARY)                                   \
306cd620004SJunchao Zhang   DEF_UnpackAndOp (Type,BS,EQ,BXOR,^,OP_BINARY)                                   \
307cd620004SJunchao Zhang   DEF_ScatterAndOp(Type,BS,EQ,BAND,&,OP_BINARY)                                   \
308cd620004SJunchao Zhang   DEF_ScatterAndOp(Type,BS,EQ,BOR, |,OP_BINARY)                                   \
309cd620004SJunchao Zhang   DEF_ScatterAndOp(Type,BS,EQ,BXOR,^,OP_BINARY)                                   \
310cd620004SJunchao Zhang   static void CPPJoin4(PackInit_Bitwise,Type,BS,EQ)(PetscSFLink link) {           \
311eb02082bSJunchao Zhang     link->h_UnpackAndBAND   = CPPJoin4(UnpackAndBAND, Type,BS,EQ);                \
312eb02082bSJunchao Zhang     link->h_UnpackAndBOR    = CPPJoin4(UnpackAndBOR,  Type,BS,EQ);                \
313eb02082bSJunchao Zhang     link->h_UnpackAndBXOR   = CPPJoin4(UnpackAndBXOR, Type,BS,EQ);                \
314cd620004SJunchao Zhang     link->h_ScatterAndBAND  = CPPJoin4(ScatterAndBAND,Type,BS,EQ);                \
315cd620004SJunchao Zhang     link->h_ScatterAndBOR   = CPPJoin4(ScatterAndBOR, Type,BS,EQ);                \
316cd620004SJunchao Zhang     link->h_ScatterAndBXOR  = CPPJoin4(ScatterAndBXOR,Type,BS,EQ);                \
31740e23c03SJunchao Zhang   }
31840e23c03SJunchao Zhang 
319cd620004SJunchao Zhang /* Maxloc, Minloc ops */
320cd620004SJunchao Zhang #define DEF_Xloc(Type,BS,EQ)                                                      \
321cd620004SJunchao Zhang   DEF_UnpackAndOp (Type,BS,EQ,Max,>,OP_XLOC)                                      \
322cd620004SJunchao Zhang   DEF_UnpackAndOp (Type,BS,EQ,Min,<,OP_XLOC)                                      \
323cd620004SJunchao Zhang   DEF_ScatterAndOp(Type,BS,EQ,Max,>,OP_XLOC)                                      \
324cd620004SJunchao Zhang   DEF_ScatterAndOp(Type,BS,EQ,Min,<,OP_XLOC)                                      \
325cd620004SJunchao Zhang   static void CPPJoin4(PackInit_Xloc,Type,BS,EQ)(PetscSFLink link) {              \
326cd620004SJunchao Zhang     link->h_UnpackAndMaxloc  = CPPJoin4(UnpackAndMax, Type,BS,EQ);                \
327cd620004SJunchao Zhang     link->h_UnpackAndMinloc  = CPPJoin4(UnpackAndMin, Type,BS,EQ);                \
328cd620004SJunchao Zhang     link->h_ScatterAndMaxloc = CPPJoin4(ScatterAndMax,Type,BS,EQ);                \
329cd620004SJunchao Zhang     link->h_ScatterAndMinloc = CPPJoin4(ScatterAndMin,Type,BS,EQ);                \
33040e23c03SJunchao Zhang   }
33140e23c03SJunchao Zhang 
332b23bfdefSJunchao Zhang #define DEF_IntegerType(Type,BS,EQ)                                               \
333b23bfdefSJunchao Zhang   DEF_Pack(Type,BS,EQ)                                                            \
334b23bfdefSJunchao Zhang   DEF_Add(Type,BS,EQ)                                                             \
335b23bfdefSJunchao Zhang   DEF_Cmp(Type,BS,EQ)                                                             \
336b23bfdefSJunchao Zhang   DEF_Log(Type,BS,EQ)                                                             \
337b23bfdefSJunchao Zhang   DEF_Bit(Type,BS,EQ)                                                             \
338cd620004SJunchao Zhang   static void CPPJoin4(PackInit_IntegerType,Type,BS,EQ)(PetscSFLink link) {       \
339b23bfdefSJunchao Zhang     CPPJoin4(PackInit_Pack,Type,BS,EQ)(link);                                     \
340b23bfdefSJunchao Zhang     CPPJoin4(PackInit_Add,Type,BS,EQ)(link);                                      \
341b23bfdefSJunchao Zhang     CPPJoin4(PackInit_Compare,Type,BS,EQ)(link);                                  \
342b23bfdefSJunchao Zhang     CPPJoin4(PackInit_Logical,Type,BS,EQ)(link);                                  \
343b23bfdefSJunchao Zhang     CPPJoin4(PackInit_Bitwise,Type,BS,EQ)(link);                                  \
34440e23c03SJunchao Zhang   }
34540e23c03SJunchao Zhang 
346b23bfdefSJunchao Zhang #define DEF_RealType(Type,BS,EQ)                                                  \
347b23bfdefSJunchao Zhang   DEF_Pack(Type,BS,EQ)                                                            \
348b23bfdefSJunchao Zhang   DEF_Add(Type,BS,EQ)                                                             \
349b23bfdefSJunchao Zhang   DEF_Cmp(Type,BS,EQ)                                                             \
350cd620004SJunchao Zhang   static void CPPJoin4(PackInit_RealType,Type,BS,EQ)(PetscSFLink link) {          \
351b23bfdefSJunchao Zhang     CPPJoin4(PackInit_Pack,Type,BS,EQ)(link);                                     \
352b23bfdefSJunchao Zhang     CPPJoin4(PackInit_Add,Type,BS,EQ)(link);                                      \
353b23bfdefSJunchao Zhang     CPPJoin4(PackInit_Compare,Type,BS,EQ)(link);                                  \
354b23bfdefSJunchao Zhang   }
35540e23c03SJunchao Zhang 
35640e23c03SJunchao Zhang #if defined(PETSC_HAVE_COMPLEX)
357b23bfdefSJunchao Zhang #define DEF_ComplexType(Type,BS,EQ)                                               \
358b23bfdefSJunchao Zhang   DEF_Pack(Type,BS,EQ)                                                            \
359b23bfdefSJunchao Zhang   DEF_Add(Type,BS,EQ)                                                             \
360cd620004SJunchao Zhang   static void CPPJoin4(PackInit_ComplexType,Type,BS,EQ)(PetscSFLink link) {       \
361b23bfdefSJunchao Zhang     CPPJoin4(PackInit_Pack,Type,BS,EQ)(link);                                     \
362b23bfdefSJunchao Zhang     CPPJoin4(PackInit_Add,Type,BS,EQ)(link);                                      \
363b23bfdefSJunchao Zhang   }
36440e23c03SJunchao Zhang #endif
365b23bfdefSJunchao Zhang 
366b23bfdefSJunchao Zhang #define DEF_DumbType(Type,BS,EQ)                                                  \
367b23bfdefSJunchao Zhang   DEF_Pack(Type,BS,EQ)                                                            \
368cd620004SJunchao Zhang   static void CPPJoin4(PackInit_DumbType,Type,BS,EQ)(PetscSFLink link) {          \
369b23bfdefSJunchao Zhang     CPPJoin4(PackInit_Pack,Type,BS,EQ)(link);                                     \
370b23bfdefSJunchao Zhang   }
371b23bfdefSJunchao Zhang 
372b23bfdefSJunchao Zhang /* Maxloc, Minloc */
373cd620004SJunchao Zhang #define DEF_PairType(Type,BS,EQ)                                                  \
374cd620004SJunchao Zhang   DEF_Pack(Type,BS,EQ)                                                            \
375cd620004SJunchao Zhang   DEF_Xloc(Type,BS,EQ)                                                            \
376cd620004SJunchao Zhang   static void CPPJoin4(PackInit_PairType,Type,BS,EQ)(PetscSFLink link) {          \
377cd620004SJunchao Zhang     CPPJoin4(PackInit_Pack,Type,BS,EQ)(link);                                     \
378cd620004SJunchao Zhang     CPPJoin4(PackInit_Xloc,Type,BS,EQ)(link);                                     \
379b23bfdefSJunchao Zhang   }
380b23bfdefSJunchao Zhang 
381b23bfdefSJunchao Zhang DEF_IntegerType(PetscInt,1,1) /* unit = 1 MPIU_INT  */
382b23bfdefSJunchao Zhang DEF_IntegerType(PetscInt,2,1) /* unit = 2 MPIU_INTs */
383b23bfdefSJunchao Zhang DEF_IntegerType(PetscInt,4,1) /* unit = 4 MPIU_INTs */
384b23bfdefSJunchao Zhang DEF_IntegerType(PetscInt,8,1) /* unit = 8 MPIU_INTs */
385b23bfdefSJunchao Zhang DEF_IntegerType(PetscInt,1,0) /* unit = 1*n MPIU_INTs, n>1 */
386b23bfdefSJunchao Zhang DEF_IntegerType(PetscInt,2,0) /* unit = 2*n MPIU_INTs, n>1 */
387b23bfdefSJunchao Zhang DEF_IntegerType(PetscInt,4,0) /* unit = 4*n MPIU_INTs, n>1 */
388b23bfdefSJunchao Zhang DEF_IntegerType(PetscInt,8,0) /* unit = 8*n MPIU_INTs, n>1. Routines with bigger BS are tried first. */
389b23bfdefSJunchao Zhang 
390b23bfdefSJunchao Zhang #if defined(PETSC_USE_64BIT_INDICES) /* Do not need (though it is OK) to generate redundant functions if PetscInt is int */
391b23bfdefSJunchao Zhang DEF_IntegerType(int,1,1)
392b23bfdefSJunchao Zhang DEF_IntegerType(int,2,1)
393b23bfdefSJunchao Zhang DEF_IntegerType(int,4,1)
394b23bfdefSJunchao Zhang DEF_IntegerType(int,8,1)
395b23bfdefSJunchao Zhang DEF_IntegerType(int,1,0)
396b23bfdefSJunchao Zhang DEF_IntegerType(int,2,0)
397b23bfdefSJunchao Zhang DEF_IntegerType(int,4,0)
398b23bfdefSJunchao Zhang DEF_IntegerType(int,8,0)
399b23bfdefSJunchao Zhang #endif
400b23bfdefSJunchao Zhang 
401b23bfdefSJunchao Zhang /* The typedefs are used to get a typename without space that CPPJoin can handle */
402b23bfdefSJunchao Zhang typedef signed char SignedChar;
403b23bfdefSJunchao Zhang DEF_IntegerType(SignedChar,1,1)
404b23bfdefSJunchao Zhang DEF_IntegerType(SignedChar,2,1)
405b23bfdefSJunchao Zhang DEF_IntegerType(SignedChar,4,1)
406b23bfdefSJunchao Zhang DEF_IntegerType(SignedChar,8,1)
407b23bfdefSJunchao Zhang DEF_IntegerType(SignedChar,1,0)
408b23bfdefSJunchao Zhang DEF_IntegerType(SignedChar,2,0)
409b23bfdefSJunchao Zhang DEF_IntegerType(SignedChar,4,0)
410b23bfdefSJunchao Zhang DEF_IntegerType(SignedChar,8,0)
411b23bfdefSJunchao Zhang 
412b23bfdefSJunchao Zhang typedef unsigned char UnsignedChar;
413b23bfdefSJunchao Zhang DEF_IntegerType(UnsignedChar,1,1)
414b23bfdefSJunchao Zhang DEF_IntegerType(UnsignedChar,2,1)
415b23bfdefSJunchao Zhang DEF_IntegerType(UnsignedChar,4,1)
416b23bfdefSJunchao Zhang DEF_IntegerType(UnsignedChar,8,1)
417b23bfdefSJunchao Zhang DEF_IntegerType(UnsignedChar,1,0)
418b23bfdefSJunchao Zhang DEF_IntegerType(UnsignedChar,2,0)
419b23bfdefSJunchao Zhang DEF_IntegerType(UnsignedChar,4,0)
420b23bfdefSJunchao Zhang DEF_IntegerType(UnsignedChar,8,0)
421b23bfdefSJunchao Zhang 
422b23bfdefSJunchao Zhang DEF_RealType(PetscReal,1,1)
423b23bfdefSJunchao Zhang DEF_RealType(PetscReal,2,1)
424b23bfdefSJunchao Zhang DEF_RealType(PetscReal,4,1)
425b23bfdefSJunchao Zhang DEF_RealType(PetscReal,8,1)
426b23bfdefSJunchao Zhang DEF_RealType(PetscReal,1,0)
427b23bfdefSJunchao Zhang DEF_RealType(PetscReal,2,0)
428b23bfdefSJunchao Zhang DEF_RealType(PetscReal,4,0)
429b23bfdefSJunchao Zhang DEF_RealType(PetscReal,8,0)
430b23bfdefSJunchao Zhang 
431b23bfdefSJunchao Zhang #if defined(PETSC_HAVE_COMPLEX)
432b23bfdefSJunchao Zhang DEF_ComplexType(PetscComplex,1,1)
433b23bfdefSJunchao Zhang DEF_ComplexType(PetscComplex,2,1)
434b23bfdefSJunchao Zhang DEF_ComplexType(PetscComplex,4,1)
435b23bfdefSJunchao Zhang DEF_ComplexType(PetscComplex,8,1)
436b23bfdefSJunchao Zhang DEF_ComplexType(PetscComplex,1,0)
437b23bfdefSJunchao Zhang DEF_ComplexType(PetscComplex,2,0)
438b23bfdefSJunchao Zhang DEF_ComplexType(PetscComplex,4,0)
439b23bfdefSJunchao Zhang DEF_ComplexType(PetscComplex,8,0)
440b23bfdefSJunchao Zhang #endif
441b23bfdefSJunchao Zhang 
442cd620004SJunchao Zhang #define PairType(Type1,Type2) Type1##_##Type2
443cd620004SJunchao Zhang typedef struct {int u; int i;}           PairType(int,int);
444cd620004SJunchao Zhang typedef struct {PetscInt u; PetscInt i;} PairType(PetscInt,PetscInt);
445cd620004SJunchao Zhang DEF_PairType(PairType(int,int),1,1)
446cd620004SJunchao Zhang DEF_PairType(PairType(PetscInt,PetscInt),1,1)
447b23bfdefSJunchao Zhang 
448b23bfdefSJunchao Zhang /* If we don't know the basic type, we treat it as a stream of chars or ints */
449b23bfdefSJunchao Zhang DEF_DumbType(char,1,1)
450b23bfdefSJunchao Zhang DEF_DumbType(char,2,1)
451b23bfdefSJunchao Zhang DEF_DumbType(char,4,1)
452b23bfdefSJunchao Zhang DEF_DumbType(char,1,0)
453b23bfdefSJunchao Zhang DEF_DumbType(char,2,0)
454b23bfdefSJunchao Zhang DEF_DumbType(char,4,0)
455b23bfdefSJunchao Zhang 
456eb02082bSJunchao Zhang typedef int DumbInt; /* To have a different name than 'int' used above. The name is used to make routine names. */
457b23bfdefSJunchao Zhang DEF_DumbType(DumbInt,1,1)
458b23bfdefSJunchao Zhang DEF_DumbType(DumbInt,2,1)
459b23bfdefSJunchao Zhang DEF_DumbType(DumbInt,4,1)
460b23bfdefSJunchao Zhang DEF_DumbType(DumbInt,8,1)
461b23bfdefSJunchao Zhang DEF_DumbType(DumbInt,1,0)
462b23bfdefSJunchao Zhang DEF_DumbType(DumbInt,2,0)
463b23bfdefSJunchao Zhang DEF_DumbType(DumbInt,4,0)
464b23bfdefSJunchao Zhang DEF_DumbType(DumbInt,8,0)
46540e23c03SJunchao Zhang 
46640e23c03SJunchao Zhang #if !defined(PETSC_HAVE_MPI_TYPE_DUP)
46740e23c03SJunchao Zhang PETSC_STATIC_INLINE int MPI_Type_dup(MPI_Datatype datatype,MPI_Datatype *newtype)
46840e23c03SJunchao Zhang {
46940e23c03SJunchao Zhang   int ierr;
47040e23c03SJunchao Zhang   ierr = MPI_Type_contiguous(1,datatype,newtype); if (ierr) return ierr;
47140e23c03SJunchao Zhang   ierr = MPI_Type_commit(newtype); if (ierr) return ierr;
47240e23c03SJunchao Zhang   return MPI_SUCCESS;
47340e23c03SJunchao Zhang }
47440e23c03SJunchao Zhang #endif
47540e23c03SJunchao Zhang 
476cd620004SJunchao Zhang /*
477cd620004SJunchao Zhang    The routine Creates a communication link for the given operation. It first looks up its link cache. If
478cd620004SJunchao Zhang    there is a free & suitable one, it uses it. Otherwise it creates a new one.
479cd620004SJunchao Zhang 
480cd620004SJunchao Zhang    A link contains buffers and MPI requests for send/recv. It also contains pack/unpack routines to pack/unpack
481cd620004SJunchao Zhang    root/leafdata to/from these buffers. Buffers are allocated at our discretion. When we find root/leafata
482cd620004SJunchao Zhang    can be directly passed to MPI, we won't allocate them. Even we allocate buffers, we only allocate
483cd620004SJunchao Zhang    those that are needed by the given `sfop` and `op`, in other words, we do lazy memory-allocation.
484cd620004SJunchao Zhang 
485cd620004SJunchao Zhang    The routine also allocates buffers on CPU when one does not use gpu-aware MPI but data is on GPU.
486cd620004SJunchao Zhang 
487cd620004SJunchao Zhang    In SFBasic, MPI requests are persistent. They are init'ed until we try to get requests from a link.
488cd620004SJunchao Zhang 
489cd620004SJunchao Zhang    The routine is shared by SFBasic and SFNeighbor based on the fact they all deal with sparse graphs and
490cd620004SJunchao Zhang    need pack/unpack data.
491cd620004SJunchao Zhang */
492*a247e58cSJunchao Zhang PetscErrorCode PetscSFLinkCreate(PetscSF sf,MPI_Datatype unit,PetscMemType xrootmtype,const void *rootdata,PetscMemType xleafmtype,const void *leafdata,MPI_Op op,PetscSFOperation sfop,PetscSFLink *mylink)
49340e23c03SJunchao Zhang {
49440e23c03SJunchao Zhang   PetscErrorCode    ierr;
495cd620004SJunchao Zhang   PetscSF_Basic     *bas = (PetscSF_Basic*)sf->data;
496cd620004SJunchao Zhang   PetscInt          i,j,k,nrootreqs,nleafreqs,nreqs;
497cd620004SJunchao Zhang   PetscSFLink       *p,link;
498cd620004SJunchao Zhang   PetscSFDirection  direction;
499cd620004SJunchao Zhang   MPI_Request       *reqs = NULL;
500cd620004SJunchao Zhang   PetscBool         match,rootdirect[2],leafdirect[2];
501*a247e58cSJunchao Zhang   PetscMemType      rootmtype = PetscMemTypeHost(xrootmtype) ? PETSC_MEMTYPE_HOST : PETSC_MEMTYPE_DEVICE; /* Convert to 0/1*/
502*a247e58cSJunchao Zhang   PetscMemType      leafmtype = PetscMemTypeHost(xleafmtype) ? PETSC_MEMTYPE_HOST : PETSC_MEMTYPE_DEVICE;
503cd620004SJunchao Zhang   PetscMemType      rootmtype_mpi,leafmtype_mpi;   /* mtypes seen by MPI */
504cd620004SJunchao Zhang   PetscInt          rootdirect_mpi,leafdirect_mpi; /* root/leafdirect seen by MPI*/
505cd620004SJunchao Zhang 
506cd620004SJunchao Zhang   PetscFunctionBegin;
507cd620004SJunchao Zhang   ierr = PetscSFSetErrorOnUnsupportedOverlap(sf,unit,rootdata,leafdata);CHKERRQ(ierr);
508cd620004SJunchao Zhang 
509cd620004SJunchao Zhang   /* Can we directly use root/leafdirect with the given sf, sfop and op? */
510cd620004SJunchao Zhang   for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) {
511cd620004SJunchao Zhang     if (sfop == PETSCSF_BCAST) {
512cd620004SJunchao Zhang       rootdirect[i] = bas->rootcontig[i]; /* Pack roots */
513cd620004SJunchao Zhang       leafdirect[i] = (sf->leafcontig[i] && op == MPIU_REPLACE) ? PETSC_TRUE : PETSC_FALSE;  /* Unpack leaves */
514cd620004SJunchao Zhang     } else if (sfop == PETSCSF_REDUCE) {
515cd620004SJunchao Zhang       leafdirect[i] = sf->leafcontig[i];  /* Pack leaves */
516cd620004SJunchao Zhang       rootdirect[i] = (bas->rootcontig[i] && op == MPIU_REPLACE) ? PETSC_TRUE : PETSC_FALSE; /* Unpack roots */
517cd620004SJunchao Zhang     } else { /* PETSCSF_FETCH */
518cd620004SJunchao Zhang       rootdirect[i] = PETSC_FALSE; /* FETCH always need a separate rootbuf */
519cd620004SJunchao Zhang       leafdirect[i] = PETSC_FALSE; /* We also force allocating a separate leafbuf so that leafdata and leafupdate can share mpi requests */
520cd620004SJunchao Zhang     }
521cd620004SJunchao Zhang   }
522cd620004SJunchao Zhang 
523c2a741eeSJunchao Zhang   if (sf->use_gpu_aware_mpi) {
524cd620004SJunchao Zhang     rootmtype_mpi = rootmtype;
525cd620004SJunchao Zhang     leafmtype_mpi = leafmtype;
526cd620004SJunchao Zhang   } else {
527cd620004SJunchao Zhang     rootmtype_mpi = leafmtype_mpi = PETSC_MEMTYPE_HOST;
528cd620004SJunchao Zhang   }
529cd620004SJunchao Zhang   /* Will root/leafdata be directly accessed by MPI?  Without use_gpu_aware_mpi, device data is bufferred on host and then passed to MPI */
530cd620004SJunchao Zhang   rootdirect_mpi = rootdirect[PETSCSF_REMOTE] && (rootmtype_mpi == rootmtype)? 1 : 0;
531cd620004SJunchao Zhang   leafdirect_mpi = leafdirect[PETSCSF_REMOTE] && (leafmtype_mpi == leafmtype)? 1 : 0;
532cd620004SJunchao Zhang 
533cd620004SJunchao Zhang   direction = (sfop == PETSCSF_BCAST)? PETSCSF_ROOT2LEAF : PETSCSF_LEAF2ROOT;
534cd620004SJunchao Zhang   nrootreqs = bas->nrootreqs;
535cd620004SJunchao Zhang   nleafreqs = sf->nleafreqs;
536cd620004SJunchao Zhang 
537cd620004SJunchao Zhang   /* Look for free links in cache */
538cd620004SJunchao Zhang   for (p=&bas->avail; (link=*p); p=&link->next) {
539cd620004SJunchao Zhang     ierr = MPIPetsc_Type_compare(unit,link->unit,&match);CHKERRQ(ierr);
540cd620004SJunchao Zhang     if (match) {
541cd620004SJunchao Zhang       /* If root/leafdata will be directly passed to MPI, test if the data used to initialized the MPI requests matches with current.
542cd620004SJunchao Zhang          If not, free old requests. New requests will be lazily init'ed until one calls PetscSFLinkGetMPIBuffersAndRequests().
543cd620004SJunchao Zhang       */
544cd620004SJunchao Zhang       if (rootdirect_mpi && sf->persistent && link->rootreqsinited[direction][rootmtype][1] && link->rootdatadirect[direction][rootmtype] != rootdata) {
545cd620004SJunchao Zhang         reqs = link->rootreqs[direction][rootmtype][1]; /* Here, rootmtype = rootmtype_mpi */
546cd620004SJunchao Zhang         for (i=0; i<nrootreqs; i++) {if (reqs[i] != MPI_REQUEST_NULL) {ierr = MPI_Request_free(&reqs[i]);CHKERRQ(ierr);}}
547cd620004SJunchao Zhang         link->rootreqsinited[direction][rootmtype][1] = PETSC_FALSE;
548cd620004SJunchao Zhang       }
549cd620004SJunchao Zhang       if (leafdirect_mpi && sf->persistent && link->leafreqsinited[direction][leafmtype][1] && link->leafdatadirect[direction][leafmtype] != leafdata) {
550cd620004SJunchao Zhang         reqs = link->leafreqs[direction][leafmtype][1];
551cd620004SJunchao Zhang         for (i=0; i<nleafreqs; i++) {if (reqs[i] != MPI_REQUEST_NULL) {ierr = MPI_Request_free(&reqs[i]);CHKERRQ(ierr);}}
552cd620004SJunchao Zhang         link->leafreqsinited[direction][leafmtype][1] = PETSC_FALSE;
553cd620004SJunchao Zhang       }
554cd620004SJunchao Zhang       *p = link->next; /* Remove from available list */
555cd620004SJunchao Zhang       goto found;
556cd620004SJunchao Zhang     }
557cd620004SJunchao Zhang   }
558cd620004SJunchao Zhang 
559cd620004SJunchao Zhang   ierr = PetscNew(&link);CHKERRQ(ierr);
560cd620004SJunchao Zhang   ierr = PetscSFLinkSetUp_Host(sf,link,unit);CHKERRQ(ierr);
561cd620004SJunchao Zhang   ierr = PetscCommGetNewTag(PetscObjectComm((PetscObject)sf),&link->tag);CHKERRQ(ierr); /* One tag per link */
562cd620004SJunchao Zhang 
563cd620004SJunchao Zhang   nreqs = (nrootreqs+nleafreqs)*8;
564cd620004SJunchao Zhang   ierr  = PetscMalloc1(nreqs,&link->reqs);CHKERRQ(ierr);
565cd620004SJunchao Zhang   for (i=0; i<nreqs; i++) link->reqs[i] = MPI_REQUEST_NULL; /* Initialized to NULL so that we know which need to be freed in Destroy */
566cd620004SJunchao Zhang 
567cd620004SJunchao Zhang   for (i=0; i<2; i++) { /* Two communication directions */
568cd620004SJunchao Zhang     for (j=0; j<2; j++) { /* Two memory types */
569cd620004SJunchao Zhang       for (k=0; k<2; k++) { /* root/leafdirect 0 or 1 */
570cd620004SJunchao Zhang         link->rootreqs[i][j][k] = link->reqs + nrootreqs*(4*i+2*j+k);
571cd620004SJunchao Zhang         link->leafreqs[i][j][k] = link->reqs + nrootreqs*8 + nleafreqs*(4*i+2*j+k);
572cd620004SJunchao Zhang       }
573cd620004SJunchao Zhang     }
574cd620004SJunchao Zhang   }
575cd620004SJunchao Zhang 
576cd620004SJunchao Zhang found:
57720c24465SJunchao Zhang 
5787fd2d3dbSJunchao Zhang #if defined(PETSC_HAVE_DEVICE)
579*a247e58cSJunchao Zhang   if ((PetscMemTypeDevice(xrootmtype) || PetscMemTypeDevice(xleafmtype)) && !link->deviceinited) {
58020c24465SJunchao Zhang     #if defined(PETSC_HAVE_CUDA)
58120c24465SJunchao Zhang       if (sf->backend == PETSCSF_BACKEND_CUDA)   {ierr = PetscSFLinkSetUp_Cuda(sf,link,unit);CHKERRQ(ierr);}
58220c24465SJunchao Zhang     #endif
58320c24465SJunchao Zhang     #if defined(PETSC_HAVE_KOKKOS)
58420c24465SJunchao Zhang       if (sf->backend == PETSCSF_BACKEND_KOKKOS) {ierr = PetscSFLinkSetUp_Kokkos(sf,link,unit);CHKERRQ(ierr);}
58520c24465SJunchao Zhang     #endif
58620c24465SJunchao Zhang   }
5877fd2d3dbSJunchao Zhang #endif
588cd620004SJunchao Zhang 
589cd620004SJunchao Zhang   /* Allocate buffers along root/leafdata */
590cd620004SJunchao Zhang   for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) {
591cd620004SJunchao Zhang     /* For local communication, buffers are only needed when roots and leaves have different mtypes */
592cd620004SJunchao Zhang     if (i == PETSCSF_LOCAL && rootmtype == leafmtype) continue;
593cd620004SJunchao Zhang     if (bas->rootbuflen[i]) {
594cd620004SJunchao Zhang       if (rootdirect[i]) { /* Aha, we disguise rootdata as rootbuf */
595cd620004SJunchao Zhang         link->rootbuf[i][rootmtype] = (char*)rootdata + bas->rootstart[i]*link->unitbytes;
596cd620004SJunchao Zhang       } else { /* Have to have a separate rootbuf */
597cd620004SJunchao Zhang         if (!link->rootbuf_alloc[i][rootmtype]) {
59820c24465SJunchao Zhang           ierr = PetscSFMalloc(sf,rootmtype,bas->rootbuflen[i]*link->unitbytes,(void**)&link->rootbuf_alloc[i][rootmtype]);CHKERRQ(ierr);
599cd620004SJunchao Zhang         }
600cd620004SJunchao Zhang         link->rootbuf[i][rootmtype] = link->rootbuf_alloc[i][rootmtype];
601cd620004SJunchao Zhang       }
602cd620004SJunchao Zhang     }
603cd620004SJunchao Zhang 
604cd620004SJunchao Zhang     if (sf->leafbuflen[i]) {
605cd620004SJunchao Zhang       if (leafdirect[i]) {
606cd620004SJunchao Zhang         link->leafbuf[i][leafmtype] = (char*)leafdata + sf->leafstart[i]*link->unitbytes;
607cd620004SJunchao Zhang       } else {
608cd620004SJunchao Zhang         if (!link->leafbuf_alloc[i][leafmtype]) {
60920c24465SJunchao Zhang           ierr = PetscSFMalloc(sf,leafmtype,sf->leafbuflen[i]*link->unitbytes,(void**)&link->leafbuf_alloc[i][leafmtype]);CHKERRQ(ierr);
610cd620004SJunchao Zhang         }
611cd620004SJunchao Zhang         link->leafbuf[i][leafmtype] = link->leafbuf_alloc[i][leafmtype];
612cd620004SJunchao Zhang       }
613cd620004SJunchao Zhang     }
614cd620004SJunchao Zhang   }
615cd620004SJunchao Zhang 
6167fd2d3dbSJunchao Zhang #if defined(PETSC_HAVE_DEVICE)
617cd620004SJunchao Zhang   /* Allocate buffers on host for buffering data on device in cast not use_gpu_aware_mpi */
618cd620004SJunchao Zhang   if (rootmtype == PETSC_MEMTYPE_DEVICE && rootmtype_mpi == PETSC_MEMTYPE_HOST) {
619cd620004SJunchao Zhang     if (!link->rootbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]) {
620cd620004SJunchao Zhang       ierr = PetscMalloc(bas->rootbuflen[PETSCSF_REMOTE]*link->unitbytes,&link->rootbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]);CHKERRQ(ierr);
621cd620004SJunchao Zhang     }
622cd620004SJunchao Zhang     link->rootbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST] = link->rootbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST];
623cd620004SJunchao Zhang   }
624cd620004SJunchao Zhang   if (leafmtype == PETSC_MEMTYPE_DEVICE && leafmtype_mpi == PETSC_MEMTYPE_HOST) {
625cd620004SJunchao Zhang     if (!link->leafbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]) {
626cd620004SJunchao Zhang       ierr = PetscMalloc(sf->leafbuflen[PETSCSF_REMOTE]*link->unitbytes,&link->leafbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]);CHKERRQ(ierr);
627cd620004SJunchao Zhang     }
628cd620004SJunchao Zhang     link->leafbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST] = link->leafbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST];
629cd620004SJunchao Zhang   }
6307fd2d3dbSJunchao Zhang #endif
631cd620004SJunchao Zhang 
632cd620004SJunchao Zhang   /* Set `current` state of the link. They may change between different SF invocations with the same link */
633cd620004SJunchao Zhang   if (sf->persistent) { /* If data is directly passed to MPI and inits MPI requests, record the data for comparison on future invocations */
634cd620004SJunchao Zhang     if (rootdirect_mpi) link->rootdatadirect[direction][rootmtype] = rootdata;
635cd620004SJunchao Zhang     if (leafdirect_mpi) link->leafdatadirect[direction][leafmtype] = leafdata;
636cd620004SJunchao Zhang   }
637cd620004SJunchao Zhang 
638cd620004SJunchao Zhang   link->rootdata = rootdata; /* root/leafdata are keys to look up links in PetscSFXxxEnd */
639cd620004SJunchao Zhang   link->leafdata = leafdata;
640cd620004SJunchao Zhang   for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) {
641cd620004SJunchao Zhang     link->rootdirect[i] = rootdirect[i];
642cd620004SJunchao Zhang     link->leafdirect[i] = leafdirect[i];
643cd620004SJunchao Zhang   }
644cd620004SJunchao Zhang   link->rootdirect_mpi  = rootdirect_mpi;
645cd620004SJunchao Zhang   link->leafdirect_mpi  = leafdirect_mpi;
646cd620004SJunchao Zhang   link->rootmtype       = rootmtype;
647cd620004SJunchao Zhang   link->leafmtype       = leafmtype;
648cd620004SJunchao Zhang   link->rootmtype_mpi   = rootmtype_mpi;
649cd620004SJunchao Zhang   link->leafmtype_mpi   = leafmtype_mpi;
650cd620004SJunchao Zhang 
651cd620004SJunchao Zhang   link->next            = bas->inuse;
652cd620004SJunchao Zhang   bas->inuse            = link;
653cd620004SJunchao Zhang   *mylink               = link;
654cd620004SJunchao Zhang   PetscFunctionReturn(0);
655cd620004SJunchao Zhang }
656cd620004SJunchao Zhang 
657cd620004SJunchao Zhang /* Return root/leaf buffers and MPI requests attached to the link for MPI communication in the given direction.
658cd620004SJunchao Zhang    If the sf uses persistent requests and the requests have not been initialized, then initialize them.
659cd620004SJunchao Zhang */
660cd620004SJunchao Zhang PetscErrorCode PetscSFLinkGetMPIBuffersAndRequests(PetscSF sf,PetscSFLink link,PetscSFDirection direction,void **rootbuf, void **leafbuf,MPI_Request **rootreqs,MPI_Request **leafreqs)
661cd620004SJunchao Zhang {
662cd620004SJunchao Zhang   PetscErrorCode       ierr;
663cd620004SJunchao Zhang   PetscSF_Basic        *bas = (PetscSF_Basic*)sf->data;
664cd620004SJunchao Zhang   PetscInt             i,j,nrootranks,ndrootranks,nleafranks,ndleafranks;
665cd620004SJunchao Zhang   const PetscInt       *rootoffset,*leafoffset;
666cd620004SJunchao Zhang   PetscMPIInt          n;
667cd620004SJunchao Zhang   MPI_Aint             disp;
668cd620004SJunchao Zhang   MPI_Comm             comm = PetscObjectComm((PetscObject)sf);
669cd620004SJunchao Zhang   MPI_Datatype         unit = link->unit;
670cd620004SJunchao Zhang   const PetscMemType   rootmtype_mpi = link->rootmtype_mpi,leafmtype_mpi = link->leafmtype_mpi; /* Used to select buffers passed to MPI */
671cd620004SJunchao Zhang   const PetscInt       rootdirect_mpi = link->rootdirect_mpi,leafdirect_mpi = link->leafdirect_mpi;
672cd620004SJunchao Zhang 
673cd620004SJunchao Zhang   PetscFunctionBegin;
674cd620004SJunchao Zhang   /* Init persistent MPI requests if not yet. Currently only SFBasic uses persistent MPI */
675cd620004SJunchao Zhang   if (sf->persistent) {
676cd620004SJunchao Zhang     if (rootreqs && bas->rootbuflen[PETSCSF_REMOTE] && !link->rootreqsinited[direction][rootmtype_mpi][rootdirect_mpi]) {
677cd620004SJunchao Zhang       ierr = PetscSFGetRootInfo_Basic(sf,&nrootranks,&ndrootranks,NULL,&rootoffset,NULL);CHKERRQ(ierr);
678cd620004SJunchao Zhang       if (direction == PETSCSF_LEAF2ROOT) {
679cd620004SJunchao Zhang         for (i=ndrootranks,j=0; i<nrootranks; i++,j++) {
680cd620004SJunchao Zhang           disp = (rootoffset[i] - rootoffset[ndrootranks])*link->unitbytes;
681cd620004SJunchao Zhang           ierr = PetscMPIIntCast(rootoffset[i+1]-rootoffset[i],&n);CHKERRQ(ierr);
682cd620004SJunchao Zhang           ierr = MPI_Recv_init(link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi]+disp,n,unit,bas->iranks[i],link->tag,comm,link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi]+j);CHKERRQ(ierr);
683cd620004SJunchao Zhang         }
684cd620004SJunchao Zhang       } else { /* PETSCSF_ROOT2LEAF */
685cd620004SJunchao Zhang         for (i=ndrootranks,j=0; i<nrootranks; i++,j++) {
686cd620004SJunchao Zhang           disp = (rootoffset[i] - rootoffset[ndrootranks])*link->unitbytes;
687cd620004SJunchao Zhang           ierr = PetscMPIIntCast(rootoffset[i+1]-rootoffset[i],&n);CHKERRQ(ierr);
688cd620004SJunchao Zhang           ierr = MPI_Send_init(link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi]+disp,n,unit,bas->iranks[i],link->tag,comm,link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi]+j);CHKERRQ(ierr);
689cd620004SJunchao Zhang         }
690cd620004SJunchao Zhang       }
691cd620004SJunchao Zhang       link->rootreqsinited[direction][rootmtype_mpi][rootdirect_mpi] = PETSC_TRUE;
692cd620004SJunchao Zhang     }
693cd620004SJunchao Zhang 
694cd620004SJunchao Zhang     if (leafreqs && sf->leafbuflen[PETSCSF_REMOTE] && !link->leafreqsinited[direction][leafmtype_mpi][leafdirect_mpi]) {
695cd620004SJunchao Zhang       ierr = PetscSFGetLeafInfo_Basic(sf,&nleafranks,&ndleafranks,NULL,&leafoffset,NULL,NULL);CHKERRQ(ierr);
696cd620004SJunchao Zhang       if (direction == PETSCSF_LEAF2ROOT) {
697cd620004SJunchao Zhang         for (i=ndleafranks,j=0; i<nleafranks; i++,j++) {
698cd620004SJunchao Zhang           disp = (leafoffset[i] - leafoffset[ndleafranks])*link->unitbytes;
699cd620004SJunchao Zhang           ierr = PetscMPIIntCast(leafoffset[i+1]-leafoffset[i],&n);CHKERRQ(ierr);
700cd620004SJunchao Zhang           ierr = MPI_Send_init(link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi]+disp,n,unit,sf->ranks[i],link->tag,comm,link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi]+j);CHKERRQ(ierr);
701cd620004SJunchao Zhang         }
702cd620004SJunchao Zhang       } else { /* PETSCSF_ROOT2LEAF */
703cd620004SJunchao Zhang         for (i=ndleafranks,j=0; i<nleafranks; i++,j++) {
704cd620004SJunchao Zhang           disp = (leafoffset[i] - leafoffset[ndleafranks])*link->unitbytes;
705cd620004SJunchao Zhang           ierr = PetscMPIIntCast(leafoffset[i+1]-leafoffset[i],&n);CHKERRQ(ierr);
706cd620004SJunchao Zhang           ierr = MPI_Recv_init(link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi]+disp,n,unit,sf->ranks[i],link->tag,comm,link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi]+j);CHKERRQ(ierr);
707cd620004SJunchao Zhang         }
708cd620004SJunchao Zhang       }
709cd620004SJunchao Zhang       link->leafreqsinited[direction][leafmtype_mpi][leafdirect_mpi] = PETSC_TRUE;
710cd620004SJunchao Zhang     }
711cd620004SJunchao Zhang   }
712cd620004SJunchao Zhang   if (rootbuf)  *rootbuf  = link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi];
713cd620004SJunchao Zhang   if (leafbuf)  *leafbuf  = link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi];
714cd620004SJunchao Zhang   if (rootreqs) *rootreqs = link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi];
715cd620004SJunchao Zhang   if (leafreqs) *leafreqs = link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi];
716cd620004SJunchao Zhang   PetscFunctionReturn(0);
717cd620004SJunchao Zhang }
718cd620004SJunchao Zhang 
719cd620004SJunchao Zhang 
720cd620004SJunchao Zhang PetscErrorCode PetscSFLinkGetInUse(PetscSF sf,MPI_Datatype unit,const void *rootdata,const void *leafdata,PetscCopyMode cmode,PetscSFLink *mylink)
721cd620004SJunchao Zhang {
722cd620004SJunchao Zhang   PetscErrorCode    ierr;
723cd620004SJunchao Zhang   PetscSFLink       link,*p;
72440e23c03SJunchao Zhang   PetscSF_Basic     *bas=(PetscSF_Basic*)sf->data;
72540e23c03SJunchao Zhang 
72640e23c03SJunchao Zhang   PetscFunctionBegin;
72740e23c03SJunchao Zhang   /* Look for types in cache */
72840e23c03SJunchao Zhang   for (p=&bas->inuse; (link=*p); p=&link->next) {
72940e23c03SJunchao Zhang     PetscBool match;
73040e23c03SJunchao Zhang     ierr = MPIPetsc_Type_compare(unit,link->unit,&match);CHKERRQ(ierr);
731637e6665SJunchao Zhang     if (match && (rootdata == link->rootdata) && (leafdata == link->leafdata)) {
73240e23c03SJunchao Zhang       switch (cmode) {
73340e23c03SJunchao Zhang       case PETSC_OWN_POINTER: *p = link->next; break; /* Remove from inuse list */
73440e23c03SJunchao Zhang       case PETSC_USE_POINTER: break;
73540e23c03SJunchao Zhang       default: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"invalid cmode");
73640e23c03SJunchao Zhang       }
73740e23c03SJunchao Zhang       *mylink = link;
73840e23c03SJunchao Zhang       PetscFunctionReturn(0);
73940e23c03SJunchao Zhang     }
74040e23c03SJunchao Zhang   }
74140e23c03SJunchao Zhang   SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Could not find pack");
74240e23c03SJunchao Zhang   PetscFunctionReturn(0);
74340e23c03SJunchao Zhang }
74440e23c03SJunchao Zhang 
745cd620004SJunchao Zhang PetscErrorCode PetscSFLinkReclaim(PetscSF sf,PetscSFLink *link)
74640e23c03SJunchao Zhang {
74740e23c03SJunchao Zhang   PetscSF_Basic     *bas = (PetscSF_Basic*)sf->data;
74840e23c03SJunchao Zhang 
74940e23c03SJunchao Zhang   PetscFunctionBegin;
750637e6665SJunchao Zhang   (*link)->rootdata = NULL;
751637e6665SJunchao Zhang   (*link)->leafdata = NULL;
75240e23c03SJunchao Zhang   (*link)->next     = bas->avail;
75340e23c03SJunchao Zhang   bas->avail        = *link;
75440e23c03SJunchao Zhang   *link             = NULL;
75540e23c03SJunchao Zhang   PetscFunctionReturn(0);
75640e23c03SJunchao Zhang }
75740e23c03SJunchao Zhang 
758cd620004SJunchao Zhang /* Destroy all links chained in 'avail' */
759cd620004SJunchao Zhang PetscErrorCode PetscSFLinkDestroy(PetscSF sf,PetscSFLink *avail)
760eb02082bSJunchao Zhang {
761eb02082bSJunchao Zhang   PetscErrorCode    ierr;
762cd620004SJunchao Zhang   PetscSFLink       link = *avail,next;
763cd620004SJunchao Zhang   PetscSF_Basic     *bas = (PetscSF_Basic*)sf->data;
764cd620004SJunchao Zhang   PetscInt          i,nreqs = (bas->nrootreqs+sf->nleafreqs)*8;
765eb02082bSJunchao Zhang 
766eb02082bSJunchao Zhang   PetscFunctionBegin;
767eb02082bSJunchao Zhang   for (; link; link=next) {
768eb02082bSJunchao Zhang     next = link->next;
769eb02082bSJunchao Zhang     if (!link->isbuiltin) {ierr = MPI_Type_free(&link->unit);CHKERRQ(ierr);}
770cd620004SJunchao Zhang     for (i=0; i<nreqs; i++) { /* Persistent reqs must be freed. */
771eb02082bSJunchao Zhang       if (link->reqs[i] != MPI_REQUEST_NULL) {ierr = MPI_Request_free(&link->reqs[i]);CHKERRQ(ierr);}
772eb02082bSJunchao Zhang     }
773eb02082bSJunchao Zhang     ierr = PetscFree(link->reqs);CHKERRQ(ierr);
774cd620004SJunchao Zhang     for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) {
775cd620004SJunchao Zhang       ierr = PetscFree(link->rootbuf_alloc[i][PETSC_MEMTYPE_HOST]);CHKERRQ(ierr);
776cd620004SJunchao Zhang       ierr = PetscFree(link->leafbuf_alloc[i][PETSC_MEMTYPE_HOST]);CHKERRQ(ierr);
7777fd2d3dbSJunchao Zhang       #if defined(PETSC_HAVE_DEVICE)
77820c24465SJunchao Zhang       ierr = PetscSFFree(sf,PETSC_MEMTYPE_DEVICE,link->rootbuf_alloc[i][PETSC_MEMTYPE_DEVICE]);CHKERRQ(ierr);
77920c24465SJunchao Zhang       ierr = PetscSFFree(sf,PETSC_MEMTYPE_DEVICE,link->leafbuf_alloc[i][PETSC_MEMTYPE_DEVICE]);CHKERRQ(ierr);
7807fd2d3dbSJunchao Zhang       #endif
781cd620004SJunchao Zhang     }
7827fd2d3dbSJunchao Zhang   #if defined(PETSC_HAVE_DEVICE)
7837fd2d3dbSJunchao Zhang     if (link->Destroy) {ierr = (*link->Destroy)(link);CHKERRQ(ierr);}
7847fd2d3dbSJunchao Zhang    #if defined(PETSC_HAVE_CUDA)
7857fd2d3dbSJunchao Zhang     if (link->stream) {cudaError_t cerr = cudaStreamDestroy(link->stream);CHKERRCUDA(cerr); link->stream = NULL;}
7867fd2d3dbSJunchao Zhang    #elif defined(PETSC_HAVE_HIP)
7877fd2d3dbSJunchao Zhang     if (link->stream) {hipError_t  cerr = hipStreamDestroy(link->stream);CHKERRQ(cerr); link->stream = NULL;} /* TODO: CHKERRHIP? */
7887fd2d3dbSJunchao Zhang    #endif
7897fd2d3dbSJunchao Zhang   #endif
790eb02082bSJunchao Zhang     ierr = PetscFree(link);CHKERRQ(ierr);
791eb02082bSJunchao Zhang   }
792eb02082bSJunchao Zhang   *avail = NULL;
793eb02082bSJunchao Zhang   PetscFunctionReturn(0);
794eb02082bSJunchao Zhang }
795eb02082bSJunchao Zhang 
7969d1c8addSJunchao Zhang /* Error out on unsupported overlapped communications */
797cd620004SJunchao Zhang PetscErrorCode PetscSFSetErrorOnUnsupportedOverlap(PetscSF sf,MPI_Datatype unit,const void *rootdata,const void *leafdata)
7989d1c8addSJunchao Zhang {
7999d1c8addSJunchao Zhang   PetscErrorCode    ierr;
800cd620004SJunchao Zhang   PetscSFLink       link,*p;
8019d1c8addSJunchao Zhang   PetscSF_Basic     *bas = (PetscSF_Basic*)sf->data;
8029d1c8addSJunchao Zhang   PetscBool         match;
8039d1c8addSJunchao Zhang 
8049d1c8addSJunchao Zhang   PetscFunctionBegin;
8054e1ad211SJed Brown   if (!PetscDefined(USE_DEBUG)) PetscFunctionReturn(0);
80618fb5014SJunchao Zhang   /* Look up links in use and error out if there is a match. When both rootdata and leafdata are NULL, ignore
80718fb5014SJunchao Zhang      the potential overlapping since this process does not participate in communication. Overlapping is harmless.
80818fb5014SJunchao Zhang   */
809637e6665SJunchao Zhang   if (rootdata || leafdata) {
8109d1c8addSJunchao Zhang     for (p=&bas->inuse; (link=*p); p=&link->next) {
8119d1c8addSJunchao Zhang       ierr = MPIPetsc_Type_compare(unit,link->unit,&match);CHKERRQ(ierr);
812cd620004SJunchao Zhang       if (match && (rootdata == link->rootdata) && (leafdata == link->leafdata)) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_SUP,"Overlapped PetscSF with the same rootdata(%p), leafdata(%p) and data type. Undo the overlapping to avoid the error.",rootdata,leafdata);
8139d1c8addSJunchao Zhang     }
81418fb5014SJunchao Zhang   }
8159d1c8addSJunchao Zhang   PetscFunctionReturn(0);
8169d1c8addSJunchao Zhang }
8179d1c8addSJunchao Zhang 
81820c24465SJunchao Zhang static PetscErrorCode PetscSFLinkMemcpy_Host(PetscSFLink link,PetscMemType dstmtype,void* dst,PetscMemType srcmtype,const void*src,size_t n)
81920c24465SJunchao Zhang {
82020c24465SJunchao Zhang   PetscFunctionBegin;
82120c24465SJunchao Zhang   if (n) {PetscErrorCode ierr = PetscMemcpy(dst,src,n);CHKERRQ(ierr);}
82220c24465SJunchao Zhang   PetscFunctionReturn(0);
82320c24465SJunchao Zhang }
82420c24465SJunchao Zhang 
82520c24465SJunchao Zhang 
826cd620004SJunchao Zhang PetscErrorCode PetscSFLinkSetUp_Host(PetscSF sf,PetscSFLink link,MPI_Datatype unit)
82740e23c03SJunchao Zhang {
82840e23c03SJunchao Zhang   PetscErrorCode ierr;
829b23bfdefSJunchao Zhang   PetscInt       nSignedChar=0,nUnsignedChar=0,nInt=0,nPetscInt=0,nPetscReal=0;
830b23bfdefSJunchao Zhang   PetscBool      is2Int,is2PetscInt;
83140e23c03SJunchao Zhang   PetscMPIInt    ni,na,nd,combiner;
83240e23c03SJunchao Zhang #if defined(PETSC_HAVE_COMPLEX)
833b23bfdefSJunchao Zhang   PetscInt       nPetscComplex=0;
83440e23c03SJunchao Zhang #endif
83540e23c03SJunchao Zhang 
83640e23c03SJunchao Zhang   PetscFunctionBegin;
837b23bfdefSJunchao Zhang   ierr = MPIPetsc_Type_compare_contig(unit,MPI_SIGNED_CHAR,  &nSignedChar);CHKERRQ(ierr);
838b23bfdefSJunchao Zhang   ierr = MPIPetsc_Type_compare_contig(unit,MPI_UNSIGNED_CHAR,&nUnsignedChar);CHKERRQ(ierr);
839b23bfdefSJunchao Zhang   /* MPI_CHAR is treated below as a dumb type that does not support reduction according to MPI standard */
840b23bfdefSJunchao Zhang   ierr = MPIPetsc_Type_compare_contig(unit,MPI_INT,  &nInt);CHKERRQ(ierr);
841b23bfdefSJunchao Zhang   ierr = MPIPetsc_Type_compare_contig(unit,MPIU_INT, &nPetscInt);CHKERRQ(ierr);
842b23bfdefSJunchao Zhang   ierr = MPIPetsc_Type_compare_contig(unit,MPIU_REAL,&nPetscReal);CHKERRQ(ierr);
84340e23c03SJunchao Zhang #if defined(PETSC_HAVE_COMPLEX)
844b23bfdefSJunchao Zhang   ierr = MPIPetsc_Type_compare_contig(unit,MPIU_COMPLEX,&nPetscComplex);CHKERRQ(ierr);
84540e23c03SJunchao Zhang #endif
84640e23c03SJunchao Zhang   ierr = MPIPetsc_Type_compare(unit,MPI_2INT,&is2Int);CHKERRQ(ierr);
84740e23c03SJunchao Zhang   ierr = MPIPetsc_Type_compare(unit,MPIU_2INT,&is2PetscInt);CHKERRQ(ierr);
848b23bfdefSJunchao Zhang   /* TODO: shaell we also handle Fortran MPI_2REAL? */
84940e23c03SJunchao Zhang   ierr = MPI_Type_get_envelope(unit,&ni,&na,&nd,&combiner);CHKERRQ(ierr);
8505ad15460SJunchao Zhang   link->isbuiltin = (combiner == MPI_COMBINER_NAMED) ? PETSC_TRUE : PETSC_FALSE; /* unit is MPI builtin */
851b23bfdefSJunchao Zhang   link->bs = 1; /* default */
85240e23c03SJunchao Zhang 
853eb02082bSJunchao Zhang   if (is2Int) {
854cd620004SJunchao Zhang     PackInit_PairType_int_int_1_1(link);
855eb02082bSJunchao Zhang     link->bs        = 1;
856eb02082bSJunchao Zhang     link->unitbytes = 2*sizeof(int);
8575ad15460SJunchao Zhang     link->isbuiltin = PETSC_TRUE; /* unit is PETSc builtin */
858eb02082bSJunchao Zhang     link->basicunit = MPI_2INT;
8595ad15460SJunchao Zhang     link->unit      = MPI_2INT;
860eb02082bSJunchao Zhang   } else if (is2PetscInt) { /* TODO: when is2PetscInt and nPetscInt=2, we don't know which path to take. The two paths support different ops. */
861cd620004SJunchao Zhang     PackInit_PairType_PetscInt_PetscInt_1_1(link);
862eb02082bSJunchao Zhang     link->bs        = 1;
863eb02082bSJunchao Zhang     link->unitbytes = 2*sizeof(PetscInt);
864eb02082bSJunchao Zhang     link->basicunit = MPIU_2INT;
8655ad15460SJunchao Zhang     link->isbuiltin = PETSC_TRUE; /* unit is PETSc builtin */
8665ad15460SJunchao Zhang     link->unit      = MPIU_2INT;
867eb02082bSJunchao Zhang   } else if (nPetscReal) {
868b23bfdefSJunchao Zhang     if      (nPetscReal == 8) PackInit_RealType_PetscReal_8_1(link); else if (nPetscReal%8 == 0) PackInit_RealType_PetscReal_8_0(link);
869b23bfdefSJunchao Zhang     else if (nPetscReal == 4) PackInit_RealType_PetscReal_4_1(link); else if (nPetscReal%4 == 0) PackInit_RealType_PetscReal_4_0(link);
870b23bfdefSJunchao Zhang     else if (nPetscReal == 2) PackInit_RealType_PetscReal_2_1(link); else if (nPetscReal%2 == 0) PackInit_RealType_PetscReal_2_0(link);
871b23bfdefSJunchao Zhang     else if (nPetscReal == 1) PackInit_RealType_PetscReal_1_1(link); else if (nPetscReal%1 == 0) PackInit_RealType_PetscReal_1_0(link);
872b23bfdefSJunchao Zhang     link->bs        = nPetscReal;
873eb02082bSJunchao Zhang     link->unitbytes = nPetscReal*sizeof(PetscReal);
87440e23c03SJunchao Zhang     link->basicunit = MPIU_REAL;
8755ad15460SJunchao Zhang     if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPIU_REAL;}
876b23bfdefSJunchao Zhang   } else if (nPetscInt) {
877b23bfdefSJunchao Zhang     if      (nPetscInt == 8) PackInit_IntegerType_PetscInt_8_1(link); else if (nPetscInt%8 == 0) PackInit_IntegerType_PetscInt_8_0(link);
878b23bfdefSJunchao Zhang     else if (nPetscInt == 4) PackInit_IntegerType_PetscInt_4_1(link); else if (nPetscInt%4 == 0) PackInit_IntegerType_PetscInt_4_0(link);
879b23bfdefSJunchao Zhang     else if (nPetscInt == 2) PackInit_IntegerType_PetscInt_2_1(link); else if (nPetscInt%2 == 0) PackInit_IntegerType_PetscInt_2_0(link);
880b23bfdefSJunchao Zhang     else if (nPetscInt == 1) PackInit_IntegerType_PetscInt_1_1(link); else if (nPetscInt%1 == 0) PackInit_IntegerType_PetscInt_1_0(link);
881b23bfdefSJunchao Zhang     link->bs        = nPetscInt;
882eb02082bSJunchao Zhang     link->unitbytes = nPetscInt*sizeof(PetscInt);
883b23bfdefSJunchao Zhang     link->basicunit = MPIU_INT;
8845ad15460SJunchao Zhang     if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPIU_INT;}
885b23bfdefSJunchao Zhang #if defined(PETSC_USE_64BIT_INDICES)
886b23bfdefSJunchao Zhang   } else if (nInt) {
887b23bfdefSJunchao Zhang     if      (nInt == 8) PackInit_IntegerType_int_8_1(link); else if (nInt%8 == 0) PackInit_IntegerType_int_8_0(link);
888b23bfdefSJunchao Zhang     else if (nInt == 4) PackInit_IntegerType_int_4_1(link); else if (nInt%4 == 0) PackInit_IntegerType_int_4_0(link);
889b23bfdefSJunchao Zhang     else if (nInt == 2) PackInit_IntegerType_int_2_1(link); else if (nInt%2 == 0) PackInit_IntegerType_int_2_0(link);
890b23bfdefSJunchao Zhang     else if (nInt == 1) PackInit_IntegerType_int_1_1(link); else if (nInt%1 == 0) PackInit_IntegerType_int_1_0(link);
891b23bfdefSJunchao Zhang     link->bs        = nInt;
892eb02082bSJunchao Zhang     link->unitbytes = nInt*sizeof(int);
893b23bfdefSJunchao Zhang     link->basicunit = MPI_INT;
8945ad15460SJunchao Zhang     if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPI_INT;}
895b23bfdefSJunchao Zhang #endif
896b23bfdefSJunchao Zhang   } else if (nSignedChar) {
897b23bfdefSJunchao Zhang     if      (nSignedChar == 8) PackInit_IntegerType_SignedChar_8_1(link); else if (nSignedChar%8 == 0) PackInit_IntegerType_SignedChar_8_0(link);
898b23bfdefSJunchao Zhang     else if (nSignedChar == 4) PackInit_IntegerType_SignedChar_4_1(link); else if (nSignedChar%4 == 0) PackInit_IntegerType_SignedChar_4_0(link);
899b23bfdefSJunchao Zhang     else if (nSignedChar == 2) PackInit_IntegerType_SignedChar_2_1(link); else if (nSignedChar%2 == 0) PackInit_IntegerType_SignedChar_2_0(link);
900b23bfdefSJunchao Zhang     else if (nSignedChar == 1) PackInit_IntegerType_SignedChar_1_1(link); else if (nSignedChar%1 == 0) PackInit_IntegerType_SignedChar_1_0(link);
901b23bfdefSJunchao Zhang     link->bs        = nSignedChar;
902eb02082bSJunchao Zhang     link->unitbytes = nSignedChar*sizeof(SignedChar);
903b23bfdefSJunchao Zhang     link->basicunit = MPI_SIGNED_CHAR;
9045ad15460SJunchao Zhang     if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPI_SIGNED_CHAR;}
905b23bfdefSJunchao Zhang   }  else if (nUnsignedChar) {
906b23bfdefSJunchao Zhang     if      (nUnsignedChar == 8) PackInit_IntegerType_UnsignedChar_8_1(link); else if (nUnsignedChar%8 == 0) PackInit_IntegerType_UnsignedChar_8_0(link);
907b23bfdefSJunchao Zhang     else if (nUnsignedChar == 4) PackInit_IntegerType_UnsignedChar_4_1(link); else if (nUnsignedChar%4 == 0) PackInit_IntegerType_UnsignedChar_4_0(link);
908b23bfdefSJunchao Zhang     else if (nUnsignedChar == 2) PackInit_IntegerType_UnsignedChar_2_1(link); else if (nUnsignedChar%2 == 0) PackInit_IntegerType_UnsignedChar_2_0(link);
909b23bfdefSJunchao Zhang     else if (nUnsignedChar == 1) PackInit_IntegerType_UnsignedChar_1_1(link); else if (nUnsignedChar%1 == 0) PackInit_IntegerType_UnsignedChar_1_0(link);
910b23bfdefSJunchao Zhang     link->bs        = nUnsignedChar;
911eb02082bSJunchao Zhang     link->unitbytes = nUnsignedChar*sizeof(UnsignedChar);
912b23bfdefSJunchao Zhang     link->basicunit = MPI_UNSIGNED_CHAR;
9135ad15460SJunchao Zhang     if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPI_UNSIGNED_CHAR;}
91440e23c03SJunchao Zhang #if defined(PETSC_HAVE_COMPLEX)
915b23bfdefSJunchao Zhang   } else if (nPetscComplex) {
916b23bfdefSJunchao Zhang     if      (nPetscComplex == 8) PackInit_ComplexType_PetscComplex_8_1(link); else if (nPetscComplex%8 == 0) PackInit_ComplexType_PetscComplex_8_0(link);
917b23bfdefSJunchao Zhang     else if (nPetscComplex == 4) PackInit_ComplexType_PetscComplex_4_1(link); else if (nPetscComplex%4 == 0) PackInit_ComplexType_PetscComplex_4_0(link);
918b23bfdefSJunchao Zhang     else if (nPetscComplex == 2) PackInit_ComplexType_PetscComplex_2_1(link); else if (nPetscComplex%2 == 0) PackInit_ComplexType_PetscComplex_2_0(link);
919b23bfdefSJunchao Zhang     else if (nPetscComplex == 1) PackInit_ComplexType_PetscComplex_1_1(link); else if (nPetscComplex%1 == 0) PackInit_ComplexType_PetscComplex_1_0(link);
920b23bfdefSJunchao Zhang     link->bs        = nPetscComplex;
921eb02082bSJunchao Zhang     link->unitbytes = nPetscComplex*sizeof(PetscComplex);
92240e23c03SJunchao Zhang     link->basicunit = MPIU_COMPLEX;
9235ad15460SJunchao Zhang     if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPIU_COMPLEX;}
92440e23c03SJunchao Zhang #endif
92540e23c03SJunchao Zhang   } else {
926b23bfdefSJunchao Zhang     MPI_Aint lb,nbyte;
927b23bfdefSJunchao Zhang     ierr = MPI_Type_get_extent(unit,&lb,&nbyte);CHKERRQ(ierr);
92840e23c03SJunchao Zhang     if (lb != 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Datatype with nonzero lower bound %ld\n",(long)lb);
929eb02082bSJunchao Zhang     if (nbyte % sizeof(int)) { /* If the type size is not multiple of int */
930eb02082bSJunchao Zhang       if      (nbyte == 4) PackInit_DumbType_char_4_1(link); else if (nbyte%4 == 0) PackInit_DumbType_char_4_0(link);
931eb02082bSJunchao Zhang       else if (nbyte == 2) PackInit_DumbType_char_2_1(link); else if (nbyte%2 == 0) PackInit_DumbType_char_2_0(link);
932eb02082bSJunchao Zhang       else if (nbyte == 1) PackInit_DumbType_char_1_1(link); else if (nbyte%1 == 0) PackInit_DumbType_char_1_0(link);
933eb02082bSJunchao Zhang       link->bs        = nbyte;
934b23bfdefSJunchao Zhang       link->unitbytes = nbyte;
935b23bfdefSJunchao Zhang       link->basicunit = MPI_BYTE;
93640e23c03SJunchao Zhang     } else {
937eb02082bSJunchao Zhang       nInt = nbyte / sizeof(int);
938eb02082bSJunchao Zhang       if      (nInt == 8) PackInit_DumbType_DumbInt_8_1(link); else if (nInt%8 == 0) PackInit_DumbType_DumbInt_8_0(link);
939eb02082bSJunchao Zhang       else if (nInt == 4) PackInit_DumbType_DumbInt_4_1(link); else if (nInt%4 == 0) PackInit_DumbType_DumbInt_4_0(link);
940eb02082bSJunchao Zhang       else if (nInt == 2) PackInit_DumbType_DumbInt_2_1(link); else if (nInt%2 == 0) PackInit_DumbType_DumbInt_2_0(link);
941eb02082bSJunchao Zhang       else if (nInt == 1) PackInit_DumbType_DumbInt_1_1(link); else if (nInt%1 == 0) PackInit_DumbType_DumbInt_1_0(link);
942eb02082bSJunchao Zhang       link->bs        = nInt;
943b23bfdefSJunchao Zhang       link->unitbytes = nbyte;
94440e23c03SJunchao Zhang       link->basicunit = MPI_INT;
94540e23c03SJunchao Zhang     }
9465ad15460SJunchao Zhang     if (link->isbuiltin) link->unit = unit;
94740e23c03SJunchao Zhang   }
948b23bfdefSJunchao Zhang 
9495ad15460SJunchao Zhang   if (!link->isbuiltin) {ierr = MPI_Type_dup(unit,&link->unit);CHKERRQ(ierr);}
95020c24465SJunchao Zhang 
95120c24465SJunchao Zhang   link->Memcpy = PetscSFLinkMemcpy_Host;
95240e23c03SJunchao Zhang   PetscFunctionReturn(0);
95340e23c03SJunchao Zhang }
95440e23c03SJunchao Zhang 
955fcc7397dSJunchao Zhang PetscErrorCode PetscSFLinkGetUnpackAndOp(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*))
95640e23c03SJunchao Zhang {
95740e23c03SJunchao Zhang   PetscFunctionBegin;
95840e23c03SJunchao Zhang   *UnpackAndOp = NULL;
959eb02082bSJunchao Zhang   if (mtype == PETSC_MEMTYPE_HOST) {
960eb02082bSJunchao Zhang     if      (op == MPIU_REPLACE)              *UnpackAndOp = link->h_UnpackAndInsert;
961eb02082bSJunchao Zhang     else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->h_UnpackAndAdd;
962eb02082bSJunchao Zhang     else if (op == MPI_PROD)                  *UnpackAndOp = link->h_UnpackAndMult;
963eb02082bSJunchao Zhang     else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->h_UnpackAndMax;
964eb02082bSJunchao Zhang     else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->h_UnpackAndMin;
965eb02082bSJunchao Zhang     else if (op == MPI_LAND)                  *UnpackAndOp = link->h_UnpackAndLAND;
966eb02082bSJunchao Zhang     else if (op == MPI_BAND)                  *UnpackAndOp = link->h_UnpackAndBAND;
967eb02082bSJunchao Zhang     else if (op == MPI_LOR)                   *UnpackAndOp = link->h_UnpackAndLOR;
968eb02082bSJunchao Zhang     else if (op == MPI_BOR)                   *UnpackAndOp = link->h_UnpackAndBOR;
969eb02082bSJunchao Zhang     else if (op == MPI_LXOR)                  *UnpackAndOp = link->h_UnpackAndLXOR;
970eb02082bSJunchao Zhang     else if (op == MPI_BXOR)                  *UnpackAndOp = link->h_UnpackAndBXOR;
971eb02082bSJunchao Zhang     else if (op == MPI_MAXLOC)                *UnpackAndOp = link->h_UnpackAndMaxloc;
972eb02082bSJunchao Zhang     else if (op == MPI_MINLOC)                *UnpackAndOp = link->h_UnpackAndMinloc;
973eb02082bSJunchao Zhang   }
9747fd2d3dbSJunchao Zhang #if defined(PETSC_HAVE_DEVICE)
975eb02082bSJunchao Zhang   else if (mtype == PETSC_MEMTYPE_DEVICE && !atomic) {
976eb02082bSJunchao Zhang     if      (op == MPIU_REPLACE)              *UnpackAndOp = link->d_UnpackAndInsert;
977eb02082bSJunchao Zhang     else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->d_UnpackAndAdd;
978eb02082bSJunchao Zhang     else if (op == MPI_PROD)                  *UnpackAndOp = link->d_UnpackAndMult;
979eb02082bSJunchao Zhang     else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->d_UnpackAndMax;
980eb02082bSJunchao Zhang     else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->d_UnpackAndMin;
981eb02082bSJunchao Zhang     else if (op == MPI_LAND)                  *UnpackAndOp = link->d_UnpackAndLAND;
982eb02082bSJunchao Zhang     else if (op == MPI_BAND)                  *UnpackAndOp = link->d_UnpackAndBAND;
983eb02082bSJunchao Zhang     else if (op == MPI_LOR)                   *UnpackAndOp = link->d_UnpackAndLOR;
984eb02082bSJunchao Zhang     else if (op == MPI_BOR)                   *UnpackAndOp = link->d_UnpackAndBOR;
985eb02082bSJunchao Zhang     else if (op == MPI_LXOR)                  *UnpackAndOp = link->d_UnpackAndLXOR;
986eb02082bSJunchao Zhang     else if (op == MPI_BXOR)                  *UnpackAndOp = link->d_UnpackAndBXOR;
987eb02082bSJunchao Zhang     else if (op == MPI_MAXLOC)                *UnpackAndOp = link->d_UnpackAndMaxloc;
988eb02082bSJunchao Zhang     else if (op == MPI_MINLOC)                *UnpackAndOp = link->d_UnpackAndMinloc;
989eb02082bSJunchao Zhang   } else if (mtype == PETSC_MEMTYPE_DEVICE && atomic) {
990eb02082bSJunchao Zhang     if      (op == MPIU_REPLACE)              *UnpackAndOp = link->da_UnpackAndInsert;
991eb02082bSJunchao Zhang     else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->da_UnpackAndAdd;
992eb02082bSJunchao Zhang     else if (op == MPI_PROD)                  *UnpackAndOp = link->da_UnpackAndMult;
993eb02082bSJunchao Zhang     else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->da_UnpackAndMax;
994eb02082bSJunchao Zhang     else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->da_UnpackAndMin;
995eb02082bSJunchao Zhang     else if (op == MPI_LAND)                  *UnpackAndOp = link->da_UnpackAndLAND;
996eb02082bSJunchao Zhang     else if (op == MPI_BAND)                  *UnpackAndOp = link->da_UnpackAndBAND;
997eb02082bSJunchao Zhang     else if (op == MPI_LOR)                   *UnpackAndOp = link->da_UnpackAndLOR;
998eb02082bSJunchao Zhang     else if (op == MPI_BOR)                   *UnpackAndOp = link->da_UnpackAndBOR;
999eb02082bSJunchao Zhang     else if (op == MPI_LXOR)                  *UnpackAndOp = link->da_UnpackAndLXOR;
1000eb02082bSJunchao Zhang     else if (op == MPI_BXOR)                  *UnpackAndOp = link->da_UnpackAndBXOR;
1001eb02082bSJunchao Zhang     else if (op == MPI_MAXLOC)                *UnpackAndOp = link->da_UnpackAndMaxloc;
1002eb02082bSJunchao Zhang     else if (op == MPI_MINLOC)                *UnpackAndOp = link->da_UnpackAndMinloc;
1003eb02082bSJunchao Zhang   }
1004eb02082bSJunchao Zhang #endif
100540e23c03SJunchao Zhang   PetscFunctionReturn(0);
100640e23c03SJunchao Zhang }
100740e23c03SJunchao Zhang 
1008fcc7397dSJunchao Zhang PetscErrorCode PetscSFLinkGetScatterAndOp(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**ScatterAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*))
100940e23c03SJunchao Zhang {
101040e23c03SJunchao Zhang   PetscFunctionBegin;
1011cd620004SJunchao Zhang   *ScatterAndOp = NULL;
1012eb02082bSJunchao Zhang   if (mtype == PETSC_MEMTYPE_HOST) {
1013cd620004SJunchao Zhang     if      (op == MPIU_REPLACE)              *ScatterAndOp = link->h_ScatterAndInsert;
1014cd620004SJunchao Zhang     else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->h_ScatterAndAdd;
1015cd620004SJunchao Zhang     else if (op == MPI_PROD)                  *ScatterAndOp = link->h_ScatterAndMult;
1016cd620004SJunchao Zhang     else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->h_ScatterAndMax;
1017cd620004SJunchao Zhang     else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->h_ScatterAndMin;
1018cd620004SJunchao Zhang     else if (op == MPI_LAND)                  *ScatterAndOp = link->h_ScatterAndLAND;
1019cd620004SJunchao Zhang     else if (op == MPI_BAND)                  *ScatterAndOp = link->h_ScatterAndBAND;
1020cd620004SJunchao Zhang     else if (op == MPI_LOR)                   *ScatterAndOp = link->h_ScatterAndLOR;
1021cd620004SJunchao Zhang     else if (op == MPI_BOR)                   *ScatterAndOp = link->h_ScatterAndBOR;
1022cd620004SJunchao Zhang     else if (op == MPI_LXOR)                  *ScatterAndOp = link->h_ScatterAndLXOR;
1023cd620004SJunchao Zhang     else if (op == MPI_BXOR)                  *ScatterAndOp = link->h_ScatterAndBXOR;
1024cd620004SJunchao Zhang     else if (op == MPI_MAXLOC)                *ScatterAndOp = link->h_ScatterAndMaxloc;
1025cd620004SJunchao Zhang     else if (op == MPI_MINLOC)                *ScatterAndOp = link->h_ScatterAndMinloc;
1026eb02082bSJunchao Zhang   }
10277fd2d3dbSJunchao Zhang #if defined(PETSC_HAVE_DEVICE)
1028eb02082bSJunchao Zhang   else if (mtype == PETSC_MEMTYPE_DEVICE && !atomic) {
1029cd620004SJunchao Zhang     if      (op == MPIU_REPLACE)              *ScatterAndOp = link->d_ScatterAndInsert;
1030cd620004SJunchao Zhang     else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->d_ScatterAndAdd;
1031cd620004SJunchao Zhang     else if (op == MPI_PROD)                  *ScatterAndOp = link->d_ScatterAndMult;
1032cd620004SJunchao Zhang     else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->d_ScatterAndMax;
1033cd620004SJunchao Zhang     else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->d_ScatterAndMin;
1034cd620004SJunchao Zhang     else if (op == MPI_LAND)                  *ScatterAndOp = link->d_ScatterAndLAND;
1035cd620004SJunchao Zhang     else if (op == MPI_BAND)                  *ScatterAndOp = link->d_ScatterAndBAND;
1036cd620004SJunchao Zhang     else if (op == MPI_LOR)                   *ScatterAndOp = link->d_ScatterAndLOR;
1037cd620004SJunchao Zhang     else if (op == MPI_BOR)                   *ScatterAndOp = link->d_ScatterAndBOR;
1038cd620004SJunchao Zhang     else if (op == MPI_LXOR)                  *ScatterAndOp = link->d_ScatterAndLXOR;
1039cd620004SJunchao Zhang     else if (op == MPI_BXOR)                  *ScatterAndOp = link->d_ScatterAndBXOR;
1040cd620004SJunchao Zhang     else if (op == MPI_MAXLOC)                *ScatterAndOp = link->d_ScatterAndMaxloc;
1041cd620004SJunchao Zhang     else if (op == MPI_MINLOC)                *ScatterAndOp = link->d_ScatterAndMinloc;
1042eb02082bSJunchao Zhang   } else if (mtype == PETSC_MEMTYPE_DEVICE && atomic) {
1043cd620004SJunchao Zhang     if      (op == MPIU_REPLACE)              *ScatterAndOp = link->da_ScatterAndInsert;
1044cd620004SJunchao Zhang     else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->da_ScatterAndAdd;
1045cd620004SJunchao Zhang     else if (op == MPI_PROD)                  *ScatterAndOp = link->da_ScatterAndMult;
1046cd620004SJunchao Zhang     else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->da_ScatterAndMax;
1047cd620004SJunchao Zhang     else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->da_ScatterAndMin;
1048cd620004SJunchao Zhang     else if (op == MPI_LAND)                  *ScatterAndOp = link->da_ScatterAndLAND;
1049cd620004SJunchao Zhang     else if (op == MPI_BAND)                  *ScatterAndOp = link->da_ScatterAndBAND;
1050cd620004SJunchao Zhang     else if (op == MPI_LOR)                   *ScatterAndOp = link->da_ScatterAndLOR;
1051cd620004SJunchao Zhang     else if (op == MPI_BOR)                   *ScatterAndOp = link->da_ScatterAndBOR;
1052cd620004SJunchao Zhang     else if (op == MPI_LXOR)                  *ScatterAndOp = link->da_ScatterAndLXOR;
1053cd620004SJunchao Zhang     else if (op == MPI_BXOR)                  *ScatterAndOp = link->da_ScatterAndBXOR;
1054cd620004SJunchao Zhang     else if (op == MPI_MAXLOC)                *ScatterAndOp = link->da_ScatterAndMaxloc;
1055cd620004SJunchao Zhang     else if (op == MPI_MINLOC)                *ScatterAndOp = link->da_ScatterAndMinloc;
1056eb02082bSJunchao Zhang   }
1057eb02082bSJunchao Zhang #endif
1058cd620004SJunchao Zhang   PetscFunctionReturn(0);
1059cd620004SJunchao Zhang }
1060cd620004SJunchao Zhang 
1061fcc7397dSJunchao Zhang PetscErrorCode PetscSFLinkGetFetchAndOp(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**FetchAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,void*))
1062cd620004SJunchao Zhang {
1063cd620004SJunchao Zhang   PetscFunctionBegin;
1064cd620004SJunchao Zhang   *FetchAndOp = NULL;
1065cd620004SJunchao Zhang   if (op != MPI_SUM && op != MPIU_SUM) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for MPI_Op in FetchAndOp");
1066cd620004SJunchao Zhang   if (mtype == PETSC_MEMTYPE_HOST) *FetchAndOp = link->h_FetchAndAdd;
10677fd2d3dbSJunchao Zhang #if defined(PETSC_HAVE_DEVICE)
1068cd620004SJunchao Zhang   else if (mtype == PETSC_MEMTYPE_DEVICE && !atomic) *FetchAndOp = link->d_FetchAndAdd;
1069cd620004SJunchao Zhang   else if (mtype == PETSC_MEMTYPE_DEVICE && atomic)  *FetchAndOp = link->da_FetchAndAdd;
1070cd620004SJunchao Zhang #endif
1071cd620004SJunchao Zhang   PetscFunctionReturn(0);
1072cd620004SJunchao Zhang }
1073cd620004SJunchao Zhang 
1074fcc7397dSJunchao Zhang PetscErrorCode PetscSFLinkGetFetchAndOpLocal(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**FetchAndOpLocal)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*))
1075cd620004SJunchao Zhang {
1076cd620004SJunchao Zhang   PetscFunctionBegin;
1077cd620004SJunchao Zhang   *FetchAndOpLocal = NULL;
1078cd620004SJunchao Zhang   if (op != MPI_SUM && op != MPIU_SUM) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for MPI_Op in FetchAndOp");
1079cd620004SJunchao Zhang   if (mtype == PETSC_MEMTYPE_HOST) *FetchAndOpLocal = link->h_FetchAndAddLocal;
10807fd2d3dbSJunchao Zhang #if defined(PETSC_HAVE_DEVICE)
1081cd620004SJunchao Zhang   else if (mtype == PETSC_MEMTYPE_DEVICE && !atomic) *FetchAndOpLocal = link->d_FetchAndAddLocal;
1082cd620004SJunchao Zhang   else if (mtype == PETSC_MEMTYPE_DEVICE && atomic)  *FetchAndOpLocal = link->da_FetchAndAddLocal;
1083cd620004SJunchao Zhang #endif
1084cd620004SJunchao Zhang   PetscFunctionReturn(0);
1085cd620004SJunchao Zhang }
1086cd620004SJunchao Zhang 
1087cd620004SJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkLogFlopsAfterUnpackRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,MPI_Op op)
1088cd620004SJunchao Zhang {
1089cd620004SJunchao Zhang   PetscErrorCode ierr;
1090cd620004SJunchao Zhang   PetscLogDouble flops;
1091cd620004SJunchao Zhang   PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
1092cd620004SJunchao Zhang 
1093cd620004SJunchao Zhang 
1094cd620004SJunchao Zhang   PetscFunctionBegin;
1095cd620004SJunchao Zhang   if (op != MPIU_REPLACE && link->basicunit == MPIU_SCALAR) { /* op is a reduction on PetscScalars */
1096cd620004SJunchao Zhang     flops = bas->rootbuflen[scope]*link->bs; /* # of roots in buffer x # of scalars in unit */
10977fd2d3dbSJunchao Zhang #if defined(PETSC_HAVE_DEVICE)
1098cd620004SJunchao Zhang     if (link->rootmtype == PETSC_MEMTYPE_DEVICE) {ierr = PetscLogGpuFlops(flops);CHKERRQ(ierr);} else
1099cd620004SJunchao Zhang #endif
1100cd620004SJunchao Zhang     {ierr = PetscLogFlops(flops);CHKERRQ(ierr);}
1101cd620004SJunchao Zhang   }
1102cd620004SJunchao Zhang   PetscFunctionReturn(0);
1103cd620004SJunchao Zhang }
1104cd620004SJunchao Zhang 
1105cd620004SJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkLogFlopsAfterUnpackLeafData(PetscSF sf,PetscSFLink link,PetscSFScope scope,MPI_Op op)
1106cd620004SJunchao Zhang {
1107cd620004SJunchao Zhang   PetscLogDouble flops;
1108cd620004SJunchao Zhang   PetscErrorCode ierr;
1109cd620004SJunchao Zhang 
1110cd620004SJunchao Zhang   PetscFunctionBegin;
1111cd620004SJunchao Zhang   if (op != MPIU_REPLACE && link->basicunit == MPIU_SCALAR) { /* op is a reduction on PetscScalars */
1112cd620004SJunchao Zhang     flops = sf->leafbuflen[scope]*link->bs; /* # of roots in buffer x # of scalars in unit */
11137fd2d3dbSJunchao Zhang #if defined(PETSC_HAVE_DEVICE)
1114cd620004SJunchao Zhang     if (link->leafmtype == PETSC_MEMTYPE_DEVICE) {ierr = PetscLogGpuFlops(flops);CHKERRQ(ierr);} else
1115cd620004SJunchao Zhang #endif
1116cd620004SJunchao Zhang     {ierr = PetscLogFlops(flops);CHKERRQ(ierr);}
1117cd620004SJunchao Zhang   }
1118cd620004SJunchao Zhang   PetscFunctionReturn(0);
1119cd620004SJunchao Zhang }
1120cd620004SJunchao Zhang 
1121cd620004SJunchao Zhang /* When SF could not find a proper UnpackAndOp() from link, it falls back to MPI_Reduce_local.
1122cd620004SJunchao Zhang   Input Arguments:
1123cd620004SJunchao Zhang   +sf      - The StarForest
1124cd620004SJunchao Zhang   .link    - The link
1125cd620004SJunchao Zhang   .count   - Number of entries to unpack
1126cd620004SJunchao Zhang   .start   - The first index, significent when indices=NULL
1127cd620004SJunchao Zhang   .indices - Indices of entries in <data>. If NULL, it means indices are contiguous and the first is given in <start>
1128cd620004SJunchao Zhang   .buf     - A contiguous buffer to unpack from
1129cd620004SJunchao Zhang   -op      - Operation after unpack
1130cd620004SJunchao Zhang 
1131cd620004SJunchao Zhang   Output Arguments:
1132cd620004SJunchao Zhang   .data    - The data to unpack to
1133cd620004SJunchao Zhang */
1134cd620004SJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkUnpackDataWithMPIReduceLocal(PetscSF sf,PetscSFLink link,PetscInt count,PetscInt start,const PetscInt *indices,void *data,const void *buf,MPI_Op op)
1135cd620004SJunchao Zhang {
1136cd620004SJunchao Zhang   PetscFunctionBegin;
1137cd620004SJunchao Zhang #if defined(PETSC_HAVE_MPI_REDUCE_LOCAL)
1138cd620004SJunchao Zhang   {
1139cd620004SJunchao Zhang     PetscErrorCode ierr;
1140cd620004SJunchao Zhang     PetscInt       i;
1141cd620004SJunchao Zhang     PetscMPIInt    n;
1142cd620004SJunchao Zhang     if (indices) {
1143cd620004SJunchao Zhang       /* Note we use link->unit instead of link->basicunit. When op can be mapped to MPI_SUM etc, it operates on
1144cd620004SJunchao Zhang          basic units of a root/leaf element-wisely. Otherwise, it is meant to operate on a whole root/leaf.
1145cd620004SJunchao Zhang       */
1146cd620004SJunchao Zhang       for (i=0; i<count; i++) {ierr = MPI_Reduce_local((const char*)buf+i*link->unitbytes,(char*)data+indices[i]*link->unitbytes,1,link->unit,op);CHKERRQ(ierr);}
1147cd620004SJunchao Zhang     } else {
1148cd620004SJunchao Zhang       ierr = PetscMPIIntCast(count,&n);CHKERRQ(ierr);
1149cd620004SJunchao Zhang       ierr = MPI_Reduce_local(buf,(char*)data+start*link->unitbytes,n,link->unit,op);CHKERRQ(ierr);
1150cd620004SJunchao Zhang     }
1151cd620004SJunchao Zhang   }
1152cd620004SJunchao Zhang #else
1153cd620004SJunchao Zhang   SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No unpacking reduction operation for this MPI_Op");
1154cd620004SJunchao Zhang #endif
1155cd620004SJunchao Zhang   PetscFunctionReturn(0);
1156cd620004SJunchao Zhang }
1157cd620004SJunchao Zhang 
1158fcc7397dSJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkScatterDataWithMPIReduceLocal(PetscSF sf,PetscSFLink link,PetscInt count,PetscInt srcStart,const PetscInt *srcIdx,const void *src,PetscInt dstStart,const PetscInt *dstIdx,void *dst,MPI_Op op)
1159cd620004SJunchao Zhang {
1160cd620004SJunchao Zhang   PetscFunctionBegin;
1161cd620004SJunchao Zhang #if defined(PETSC_HAVE_MPI_REDUCE_LOCAL)
1162cd620004SJunchao Zhang   {
1163cd620004SJunchao Zhang     PetscErrorCode ierr;
1164cd620004SJunchao Zhang     PetscInt       i,disp;
1165fcc7397dSJunchao Zhang     if (!srcIdx) {
1166fcc7397dSJunchao Zhang       ierr = PetscSFLinkUnpackDataWithMPIReduceLocal(sf,link,count,dstStart,dstIdx,dst,(const char*)src+srcStart*link->unitbytes,op);CHKERRQ(ierr);
1167fcc7397dSJunchao Zhang     } else {
1168cd620004SJunchao Zhang       for (i=0; i<count; i++) {
1169fcc7397dSJunchao Zhang         disp = dstIdx? dstIdx[i] : dstStart + i;
1170fcc7397dSJunchao Zhang         ierr = MPI_Reduce_local((const char*)src+srcIdx[i]*link->unitbytes,(char*)dst+disp*link->unitbytes,1,link->unit,op);CHKERRQ(ierr);
1171fcc7397dSJunchao Zhang       }
1172cd620004SJunchao Zhang     }
1173cd620004SJunchao Zhang   }
1174cd620004SJunchao Zhang #else
1175cd620004SJunchao Zhang   SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No unpacking reduction operation for this MPI_Op");
1176cd620004SJunchao Zhang #endif
1177cd620004SJunchao Zhang   PetscFunctionReturn(0);
1178cd620004SJunchao Zhang }
1179cd620004SJunchao Zhang 
1180cd620004SJunchao Zhang /*=============================================================================
1181cd620004SJunchao Zhang               Pack/Unpack/Fetch/Scatter routines
1182cd620004SJunchao Zhang  ============================================================================*/
1183cd620004SJunchao Zhang 
1184cd620004SJunchao Zhang /* Pack rootdata to rootbuf
1185cd620004SJunchao Zhang   Input Arguments:
1186cd620004SJunchao Zhang   + sf       - The SF this packing works on.
1187cd620004SJunchao Zhang   . link     - It gives the memtype of the roots and also provides root buffer.
1188cd620004SJunchao Zhang   . scope    - PETSCSF_LOCAL or PETSCSF_REMOTE. Note SF has the ability to do local and remote communications separately.
1189cd620004SJunchao Zhang   - rootdata - Where to read the roots.
1190cd620004SJunchao Zhang 
1191cd620004SJunchao Zhang   Notes:
1192cd620004SJunchao Zhang   When rootdata can be directly used as root buffer, the routine is almost a no-op. After the call, root data is
1193cd620004SJunchao Zhang   in a place where the underlying MPI is ready can access (use_gpu_aware_mpi or not)
1194cd620004SJunchao Zhang  */
1195cd620004SJunchao Zhang PetscErrorCode PetscSFLinkPackRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,const void *rootdata)
1196cd620004SJunchao Zhang {
1197cd620004SJunchao Zhang   PetscErrorCode   ierr;
1198cd620004SJunchao Zhang   PetscSF_Basic    *bas = (PetscSF_Basic*)sf->data;
1199cd620004SJunchao Zhang   const PetscInt   *rootindices = NULL;
1200cd620004SJunchao Zhang   PetscInt         count,start;
1201fcc7397dSJunchao Zhang   PetscErrorCode   (*Pack)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*) = NULL;
1202cd620004SJunchao Zhang   PetscMemType     rootmtype = link->rootmtype;
1203fcc7397dSJunchao Zhang   PetscSFPackOpt   opt = NULL;
1204fcc7397dSJunchao Zhang 
1205cd620004SJunchao Zhang   PetscFunctionBegin;
1206cd620004SJunchao Zhang   ierr = PetscLogEventBegin(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr);
1207cd620004SJunchao Zhang   if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkSyncDeviceBeforePackData(sf,link);CHKERRQ(ierr);}
1208cd620004SJunchao Zhang   if (!link->rootdirect[scope] && bas->rootbuflen[scope]) { /* If rootdata works directly as rootbuf, skip packing */
1209fcc7397dSJunchao Zhang     ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,scope,&count,&start,&opt,&rootindices);CHKERRQ(ierr);
1210cd620004SJunchao Zhang     ierr = PetscSFLinkGetPack(link,rootmtype,&Pack);CHKERRQ(ierr);
1211fcc7397dSJunchao Zhang     ierr = (*Pack)(link,count,start,opt,rootindices,rootdata,link->rootbuf[scope][rootmtype]);CHKERRQ(ierr);
1212cd620004SJunchao Zhang   }
1213cd620004SJunchao Zhang   if (scope == PETSCSF_REMOTE) {
1214cd620004SJunchao Zhang     ierr = PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_TRUE/*device2host*/);CHKERRQ(ierr);
1215cd620004SJunchao Zhang     ierr = PetscSFLinkSyncStreamAfterPackRootData(sf,link);CHKERRQ(ierr);
1216cd620004SJunchao Zhang   }
1217cd620004SJunchao Zhang   ierr = PetscLogEventEnd(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr);
1218cd620004SJunchao Zhang   PetscFunctionReturn(0);
1219cd620004SJunchao Zhang }
1220cd620004SJunchao Zhang 
1221cd620004SJunchao Zhang /* Pack leafdata to leafbuf */
1222cd620004SJunchao Zhang PetscErrorCode PetscSFLinkPackLeafData(PetscSF sf,PetscSFLink link,PetscSFScope scope,const void *leafdata)
1223cd620004SJunchao Zhang {
1224cd620004SJunchao Zhang   PetscErrorCode   ierr;
1225cd620004SJunchao Zhang   const PetscInt   *leafindices = NULL;
1226cd620004SJunchao Zhang   PetscInt         count,start;
1227fcc7397dSJunchao Zhang   PetscErrorCode   (*Pack)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*) = NULL;
1228cd620004SJunchao Zhang   PetscMemType     leafmtype = link->leafmtype;
1229fcc7397dSJunchao Zhang   PetscSFPackOpt   opt = NULL;
1230cd620004SJunchao Zhang 
1231cd620004SJunchao Zhang   PetscFunctionBegin;
1232cd620004SJunchao Zhang   ierr = PetscLogEventBegin(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr);
1233cd620004SJunchao Zhang   if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkSyncDeviceBeforePackData(sf,link);CHKERRQ(ierr);}
1234cd620004SJunchao Zhang   if (!link->leafdirect[scope] && sf->leafbuflen[scope]) { /* If leafdata works directly as rootbuf, skip packing */
1235fcc7397dSJunchao Zhang     ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,scope,&count,&start,&opt,&leafindices);CHKERRQ(ierr);
1236cd620004SJunchao Zhang     ierr = PetscSFLinkGetPack(link,leafmtype,&Pack);CHKERRQ(ierr);
1237fcc7397dSJunchao Zhang     ierr = (*Pack)(link,count,start,opt,leafindices,leafdata,link->leafbuf[scope][leafmtype]);CHKERRQ(ierr);
1238cd620004SJunchao Zhang   }
1239cd620004SJunchao Zhang   if (scope == PETSCSF_REMOTE) {
1240cd620004SJunchao Zhang     ierr = PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_TRUE/*device2host*/);CHKERRQ(ierr);
1241cd620004SJunchao Zhang     ierr = PetscSFLinkSyncStreamAfterPackLeafData(sf,link);CHKERRQ(ierr);
1242cd620004SJunchao Zhang   }
1243cd620004SJunchao Zhang   ierr = PetscLogEventEnd(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr);
1244cd620004SJunchao Zhang   PetscFunctionReturn(0);
1245cd620004SJunchao Zhang }
1246cd620004SJunchao Zhang 
1247cd620004SJunchao Zhang /* Unpack rootbuf to rootdata */
1248cd620004SJunchao Zhang PetscErrorCode PetscSFLinkUnpackRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,void *rootdata,MPI_Op op)
1249cd620004SJunchao Zhang {
1250cd620004SJunchao Zhang   PetscErrorCode   ierr;
1251cd620004SJunchao Zhang   const PetscInt   *rootindices = NULL;
1252cd620004SJunchao Zhang   PetscInt         count,start;
1253cd620004SJunchao Zhang   PetscSF_Basic    *bas = (PetscSF_Basic*)sf->data;
1254fcc7397dSJunchao Zhang   PetscErrorCode   (*UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*) = NULL;
1255cd620004SJunchao Zhang   PetscMemType     rootmtype = link->rootmtype;
1256fcc7397dSJunchao Zhang   PetscSFPackOpt   opt = NULL;
1257cd620004SJunchao Zhang 
1258cd620004SJunchao Zhang   PetscFunctionBegin;
1259cd620004SJunchao Zhang   ierr = PetscLogEventBegin(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr);
1260cd620004SJunchao Zhang   if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_FALSE);CHKERRQ(ierr);}
1261cd620004SJunchao Zhang   if (!link->rootdirect[scope] && bas->rootbuflen[scope]) { /* If rootdata works directly as rootbuf, skip unpacking */
1262cd620004SJunchao Zhang     ierr = PetscSFLinkGetUnpackAndOp(link,rootmtype,op,bas->rootdups[scope],&UnpackAndOp);CHKERRQ(ierr);
1263cd620004SJunchao Zhang     if (UnpackAndOp) {
1264fcc7397dSJunchao Zhang       ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,scope,&count,&start,&opt,&rootindices);CHKERRQ(ierr);
1265fcc7397dSJunchao Zhang       ierr = (*UnpackAndOp)(link,count,start,opt,rootindices,rootdata,link->rootbuf[scope][rootmtype]);CHKERRQ(ierr);
1266cd620004SJunchao Zhang     } else {
1267fcc7397dSJunchao Zhang       ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,scope,&count,&start,&opt,&rootindices);CHKERRQ(ierr);
1268cd620004SJunchao Zhang       ierr = PetscSFLinkUnpackDataWithMPIReduceLocal(sf,link,count,start,rootindices,rootdata,link->rootbuf[scope][rootmtype],op);CHKERRQ(ierr);
1269cd620004SJunchao Zhang     }
1270cd620004SJunchao Zhang   }
1271cd620004SJunchao Zhang   if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkSyncStreamAfterUnpackRootData(sf,link);CHKERRQ(ierr);}
1272cd620004SJunchao Zhang   ierr = PetscSFLinkLogFlopsAfterUnpackRootData(sf,link,scope,op);CHKERRQ(ierr);
1273cd620004SJunchao Zhang   ierr = PetscLogEventEnd(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr);
1274cd620004SJunchao Zhang   PetscFunctionReturn(0);
1275cd620004SJunchao Zhang }
1276cd620004SJunchao Zhang 
1277cd620004SJunchao Zhang /* Unpack leafbuf to leafdata */
1278cd620004SJunchao Zhang PetscErrorCode PetscSFLinkUnpackLeafData(PetscSF sf,PetscSFLink link,PetscSFScope scope,void *leafdata,MPI_Op op)
1279cd620004SJunchao Zhang {
1280cd620004SJunchao Zhang   PetscErrorCode   ierr;
1281cd620004SJunchao Zhang   const PetscInt   *leafindices = NULL;
1282cd620004SJunchao Zhang   PetscInt         count,start;
1283fcc7397dSJunchao Zhang   PetscErrorCode   (*UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*) = NULL;
1284cd620004SJunchao Zhang   PetscMemType     leafmtype = link->leafmtype;
1285fcc7397dSJunchao Zhang   PetscSFPackOpt   opt = NULL;
1286cd620004SJunchao Zhang 
1287cd620004SJunchao Zhang   PetscFunctionBegin;
1288cd620004SJunchao Zhang   ierr = PetscLogEventBegin(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr);
1289cd620004SJunchao Zhang   if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_FALSE);CHKERRQ(ierr);}
1290cd620004SJunchao Zhang   if (!link->leafdirect[scope] && sf->leafbuflen[scope]) { /* If leafdata works directly as rootbuf, skip unpacking */
1291cd620004SJunchao Zhang     ierr = PetscSFLinkGetUnpackAndOp(link,leafmtype,op,sf->leafdups[scope],&UnpackAndOp);CHKERRQ(ierr);
1292cd620004SJunchao Zhang     if (UnpackAndOp) {
1293fcc7397dSJunchao Zhang       ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,scope,&count,&start,&opt,&leafindices);CHKERRQ(ierr);
1294fcc7397dSJunchao Zhang       ierr = (*UnpackAndOp)(link,count,start,opt,leafindices,leafdata,link->leafbuf[scope][leafmtype]);CHKERRQ(ierr);
1295cd620004SJunchao Zhang     } else {
1296fcc7397dSJunchao Zhang       ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,scope,&count,&start,&opt,&leafindices);CHKERRQ(ierr);
1297cd620004SJunchao Zhang       ierr = PetscSFLinkUnpackDataWithMPIReduceLocal(sf,link,count,start,leafindices,leafdata,link->leafbuf[scope][leafmtype],op);CHKERRQ(ierr);
1298cd620004SJunchao Zhang     }
1299cd620004SJunchao Zhang   }
1300cd620004SJunchao Zhang   if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkSyncStreamAfterUnpackLeafData(sf,link);CHKERRQ(ierr);}
1301cd620004SJunchao Zhang   ierr = PetscSFLinkLogFlopsAfterUnpackLeafData(sf,link,scope,op);CHKERRQ(ierr);
1302cd620004SJunchao Zhang   ierr = PetscLogEventEnd(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr);
1303cd620004SJunchao Zhang   PetscFunctionReturn(0);
1304cd620004SJunchao Zhang }
1305cd620004SJunchao Zhang 
1306cd620004SJunchao Zhang /* FetchAndOp rootdata with rootbuf */
1307cd620004SJunchao Zhang PetscErrorCode PetscSFLinkFetchRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,void *rootdata,MPI_Op op)
1308cd620004SJunchao Zhang {
1309cd620004SJunchao Zhang   PetscErrorCode     ierr;
1310cd620004SJunchao Zhang   const PetscInt     *rootindices = NULL;
1311cd620004SJunchao Zhang   PetscInt           count,start;
1312cd620004SJunchao Zhang   PetscSF_Basic      *bas = (PetscSF_Basic*)sf->data;
1313fcc7397dSJunchao Zhang   PetscErrorCode     (*FetchAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,void*) = NULL;
1314cd620004SJunchao Zhang   PetscMemType       rootmtype = link->rootmtype;
1315fcc7397dSJunchao Zhang   PetscSFPackOpt     opt = NULL;
1316cd620004SJunchao Zhang 
1317cd620004SJunchao Zhang   PetscFunctionBegin;
1318cd620004SJunchao Zhang   ierr = PetscLogEventBegin(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr);
1319cd620004SJunchao Zhang   if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_FALSE);CHKERRQ(ierr);}
1320cd620004SJunchao Zhang   if (bas->rootbuflen[scope]) {
1321cd620004SJunchao Zhang     /* Do FetchAndOp on rootdata with rootbuf */
1322cd620004SJunchao Zhang     ierr = PetscSFLinkGetFetchAndOp(link,rootmtype,op,bas->rootdups[scope],&FetchAndOp);CHKERRQ(ierr);
1323fcc7397dSJunchao Zhang     ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,scope,&count,&start,&opt,&rootindices);CHKERRQ(ierr);
1324fcc7397dSJunchao Zhang     ierr = (*FetchAndOp)(link,count,start,opt,rootindices,rootdata,link->rootbuf[scope][rootmtype]);CHKERRQ(ierr);
1325cd620004SJunchao Zhang   }
1326cd620004SJunchao Zhang   if (scope == PETSCSF_REMOTE) {
1327cd620004SJunchao Zhang     ierr = PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_TRUE);CHKERRQ(ierr);
1328cd620004SJunchao Zhang     ierr = PetscSFLinkSyncStreamAfterUnpackRootData(sf,link);CHKERRQ(ierr);
1329cd620004SJunchao Zhang   }
1330cd620004SJunchao Zhang   ierr = PetscSFLinkLogFlopsAfterUnpackRootData(sf,link,scope,op);CHKERRQ(ierr);
1331cd620004SJunchao Zhang   ierr = PetscLogEventEnd(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr);
1332cd620004SJunchao Zhang   PetscFunctionReturn(0);
1333cd620004SJunchao Zhang }
1334cd620004SJunchao Zhang 
1335cd620004SJunchao Zhang /* Bcast rootdata to leafdata locally (i.e., only for local communication - PETSCSF_LOCAL) */
1336cd620004SJunchao Zhang PetscErrorCode PetscSFLinkBcastAndOpLocal(PetscSF sf,PetscSFLink link,const void *rootdata,void *leafdata,MPI_Op op)
1337cd620004SJunchao Zhang {
1338cd620004SJunchao Zhang   PetscErrorCode       ierr;
1339cd620004SJunchao Zhang   const PetscInt       *rootindices = NULL,*leafindices = NULL;
1340cd620004SJunchao Zhang   PetscInt             count,rootstart,leafstart;
1341cd620004SJunchao Zhang   PetscSF_Basic        *bas = (PetscSF_Basic*)sf->data;
1342fcc7397dSJunchao Zhang   PetscErrorCode       (*ScatterAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*) = NULL;
1343cd620004SJunchao Zhang   const PetscMemType   rootmtype = link->rootmtype,leafmtype = link->leafmtype;
1344fcc7397dSJunchao Zhang   PetscSFPackOpt       leafopt = NULL,rootopt = NULL;
1345cd620004SJunchao Zhang 
1346cd620004SJunchao Zhang   PetscFunctionBegin;
1347cd620004SJunchao Zhang   if (!bas->rootbuflen[PETSCSF_LOCAL]) PetscFunctionReturn(0);
1348cd620004SJunchao Zhang   if (rootmtype != leafmtype) { /* Uncommon case */
1349cd620004SJunchao Zhang      /* The local communication has to go through pack and unpack */
1350cd620004SJunchao Zhang     ierr = PetscSFLinkPackRootData(sf,link,PETSCSF_LOCAL,rootdata);CHKERRQ(ierr);
135120c24465SJunchao Zhang     ierr = (*link->Memcpy)(link,leafmtype,link->leafbuf[PETSCSF_LOCAL][leafmtype],rootmtype,link->rootbuf[PETSCSF_LOCAL][rootmtype],sf->leafbuflen[PETSCSF_LOCAL]*link->unitbytes);CHKERRQ(ierr);
1352cd620004SJunchao Zhang     ierr = PetscSFLinkUnpackLeafData(sf,link,PETSCSF_LOCAL,leafdata,op);CHKERRQ(ierr);
1353cd620004SJunchao Zhang   } else {
1354cd620004SJunchao Zhang     ierr = PetscSFLinkGetScatterAndOp(link,leafmtype,op,sf->leafdups[PETSCSF_LOCAL],&ScatterAndOp);CHKERRQ(ierr);
1355cd620004SJunchao Zhang     if (ScatterAndOp) {
1356fcc7397dSJunchao Zhang       ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,PETSCSF_LOCAL,&count,&rootstart,&rootopt,&rootindices);CHKERRQ(ierr);
1357fcc7397dSJunchao Zhang       ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,PETSCSF_LOCAL,&count,&leafstart,&leafopt,&leafindices);CHKERRQ(ierr);
1358fcc7397dSJunchao Zhang       ierr = (*ScatterAndOp)(link,count,rootstart,rootopt,rootindices,rootdata,leafstart,leafopt,leafindices,leafdata);CHKERRQ(ierr);
1359cd620004SJunchao Zhang     } else {
1360fcc7397dSJunchao Zhang       ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,PETSCSF_LOCAL,&count,&rootstart,&rootopt,&rootindices);CHKERRQ(ierr);
1361fcc7397dSJunchao Zhang       ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,PETSCSF_LOCAL,&count,&leafstart,&leafopt,&leafindices);CHKERRQ(ierr);
1362fcc7397dSJunchao Zhang       ierr = PetscSFLinkScatterDataWithMPIReduceLocal(sf,link,count,rootstart,rootindices,rootdata,leafstart,leafindices,leafdata,op);CHKERRQ(ierr);
1363cd620004SJunchao Zhang     }
1364cd620004SJunchao Zhang   }
1365cd620004SJunchao Zhang   PetscFunctionReturn(0);
1366cd620004SJunchao Zhang }
1367cd620004SJunchao Zhang 
1368cd620004SJunchao Zhang /* Reduce leafdata to rootdata locally */
1369cd620004SJunchao Zhang PetscErrorCode PetscSFLinkReduceLocal(PetscSF sf,PetscSFLink link,const void *leafdata,void *rootdata,MPI_Op op)
1370cd620004SJunchao Zhang {
1371cd620004SJunchao Zhang   PetscErrorCode       ierr;
1372cd620004SJunchao Zhang   const PetscInt       *rootindices = NULL,*leafindices = NULL;
1373cd620004SJunchao Zhang   PetscInt             count,rootstart,leafstart;
1374cd620004SJunchao Zhang   PetscSF_Basic        *bas = (PetscSF_Basic*)sf->data;
1375fcc7397dSJunchao Zhang   PetscErrorCode       (*ScatterAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*) = NULL;
1376cd620004SJunchao Zhang   const PetscMemType   rootmtype = link->rootmtype,leafmtype = link->leafmtype;
1377fcc7397dSJunchao Zhang   PetscSFPackOpt       leafopt = NULL,rootopt = NULL;
1378cd620004SJunchao Zhang 
1379cd620004SJunchao Zhang   PetscFunctionBegin;
1380cd620004SJunchao Zhang   if (!sf->leafbuflen[PETSCSF_LOCAL]) PetscFunctionReturn(0);
1381cd620004SJunchao Zhang   if (rootmtype != leafmtype) {
1382cd620004SJunchao Zhang     /* The local communication has to go through pack and unpack */
1383cd620004SJunchao Zhang     ierr = PetscSFLinkPackLeafData(sf,link,PETSCSF_LOCAL,leafdata);CHKERRQ(ierr);
138420c24465SJunchao Zhang     ierr = (*link->Memcpy)(link,rootmtype,link->rootbuf[PETSCSF_LOCAL][rootmtype],leafmtype,link->leafbuf[PETSCSF_LOCAL][leafmtype],bas->rootbuflen[PETSCSF_LOCAL]*link->unitbytes);CHKERRQ(ierr);
1385cd620004SJunchao Zhang     ierr = PetscSFLinkUnpackRootData(sf,link,PETSCSF_LOCAL,rootdata,op);CHKERRQ(ierr);
1386cd620004SJunchao Zhang   } else {
1387cd620004SJunchao Zhang     ierr = PetscSFLinkGetScatterAndOp(link,rootmtype,op,bas->rootdups[PETSCSF_LOCAL],&ScatterAndOp);CHKERRQ(ierr);
1388cd620004SJunchao Zhang     if (ScatterAndOp) {
1389fcc7397dSJunchao Zhang       ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,PETSCSF_LOCAL,&count,&rootstart,&rootopt,&rootindices);CHKERRQ(ierr);
1390fcc7397dSJunchao Zhang       ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,PETSCSF_LOCAL,&count,&leafstart,&leafopt,&leafindices);CHKERRQ(ierr);
1391fcc7397dSJunchao Zhang       ierr = (*ScatterAndOp)(link,count,leafstart,leafopt,leafindices,leafdata,rootstart,rootopt,rootindices,rootdata);CHKERRQ(ierr);
1392cd620004SJunchao Zhang     } else {
1393fcc7397dSJunchao Zhang       ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,PETSCSF_LOCAL,&count,&rootstart,&rootopt,&rootindices);CHKERRQ(ierr);
1394fcc7397dSJunchao Zhang       ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,PETSCSF_LOCAL,&count,&leafstart,&leafopt,&leafindices);CHKERRQ(ierr);
1395fcc7397dSJunchao Zhang       ierr = PetscSFLinkScatterDataWithMPIReduceLocal(sf,link,count,leafstart,leafindices,leafdata,rootstart,rootindices,rootdata,op);CHKERRQ(ierr);
1396cd620004SJunchao Zhang     }
1397cd620004SJunchao Zhang   }
1398cd620004SJunchao Zhang   PetscFunctionReturn(0);
1399cd620004SJunchao Zhang }
1400cd620004SJunchao Zhang 
1401cd620004SJunchao Zhang /* Fetch rootdata to leafdata and leafupdate locally */
1402cd620004SJunchao Zhang PetscErrorCode PetscSFLinkFetchAndOpLocal(PetscSF sf,PetscSFLink link,void *rootdata,const void *leafdata,void *leafupdate,MPI_Op op)
1403cd620004SJunchao Zhang {
1404cd620004SJunchao Zhang   PetscErrorCode       ierr;
1405cd620004SJunchao Zhang   const PetscInt       *rootindices = NULL,*leafindices = NULL;
1406cd620004SJunchao Zhang   PetscInt             count,rootstart,leafstart;
1407cd620004SJunchao Zhang   PetscSF_Basic        *bas = (PetscSF_Basic*)sf->data;
1408fcc7397dSJunchao Zhang   PetscErrorCode       (*FetchAndOpLocal)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*) = NULL;
1409cd620004SJunchao Zhang   const PetscMemType   rootmtype = link->rootmtype,leafmtype = link->leafmtype;
1410fcc7397dSJunchao Zhang   PetscSFPackOpt       leafopt = NULL,rootopt = NULL;
1411cd620004SJunchao Zhang 
1412cd620004SJunchao Zhang   PetscFunctionBegin;
1413cd620004SJunchao Zhang   if (!bas->rootbuflen[PETSCSF_LOCAL]) PetscFunctionReturn(0);
1414cd620004SJunchao Zhang   if (rootmtype != leafmtype) {
1415cd620004SJunchao Zhang    /* The local communication has to go through pack and unpack */
1416cd620004SJunchao Zhang     SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Doing PetscSFFetchAndOp with rootdata and leafdata on opposite side of CPU and GPU");
1417cd620004SJunchao Zhang   } else {
1418fcc7397dSJunchao Zhang     ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,PETSCSF_LOCAL,&count,&rootstart,&rootopt,&rootindices);CHKERRQ(ierr);
1419fcc7397dSJunchao Zhang     ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,PETSCSF_LOCAL,&count,&leafstart,&leafopt,&leafindices);CHKERRQ(ierr);
1420cd620004SJunchao Zhang     ierr = PetscSFLinkGetFetchAndOpLocal(link,rootmtype,op,bas->rootdups[PETSCSF_LOCAL],&FetchAndOpLocal);CHKERRQ(ierr);
1421fcc7397dSJunchao Zhang     ierr = (*FetchAndOpLocal)(link,count,rootstart,rootopt,rootindices,rootdata,leafstart,leafopt,leafindices,leafdata,leafupdate);CHKERRQ(ierr);
1422cd620004SJunchao Zhang   }
142340e23c03SJunchao Zhang   PetscFunctionReturn(0);
142440e23c03SJunchao Zhang }
142540e23c03SJunchao Zhang 
142640e23c03SJunchao Zhang /*
1427cd620004SJunchao Zhang   Create per-rank pack/unpack optimizations based on indice patterns
142840e23c03SJunchao Zhang 
142940e23c03SJunchao Zhang    Input Parameters:
1430fcc7397dSJunchao Zhang   +  n       - Number of destination ranks
1431eb02082bSJunchao Zhang   .  offset  - [n+1] For the i-th rank, its associated indices are idx[offset[i], offset[i+1]). offset[0] needs not to be 0.
1432b23bfdefSJunchao Zhang   -  idx     - [*]   Array storing indices
143340e23c03SJunchao Zhang 
143440e23c03SJunchao Zhang    Output Parameters:
1435cd620004SJunchao Zhang   +  opt     - Pack optimizations. NULL if no optimizations.
143640e23c03SJunchao Zhang */
1437cd620004SJunchao Zhang PetscErrorCode PetscSFCreatePackOpt(PetscInt n,const PetscInt *offset,const PetscInt *idx,PetscSFPackOpt *out)
143840e23c03SJunchao Zhang {
143940e23c03SJunchao Zhang   PetscErrorCode ierr;
1440fcc7397dSJunchao Zhang   PetscInt       r,p,start,i,j,k,dx,dy,dz,dydz,m,X,Y;
1441fcc7397dSJunchao Zhang   PetscBool      optimizable = PETSC_TRUE;
144240e23c03SJunchao Zhang   PetscSFPackOpt opt;
144340e23c03SJunchao Zhang 
144440e23c03SJunchao Zhang   PetscFunctionBegin;
1445fcc7397dSJunchao Zhang   ierr   = PetscMalloc1(1,&opt);CHKERRQ(ierr);
1446fcc7397dSJunchao Zhang   ierr   = PetscMalloc1(7*n+2,&opt->array);CHKERRQ(ierr);
1447fcc7397dSJunchao Zhang   opt->n      = opt->array[0] = n;
1448fcc7397dSJunchao Zhang   opt->offset = opt->array + 1;
1449fcc7397dSJunchao Zhang   opt->start  = opt->array + n   + 2;
1450fcc7397dSJunchao Zhang   opt->dx     = opt->array + 2*n + 2;
1451fcc7397dSJunchao Zhang   opt->dy     = opt->array + 3*n + 2;
1452fcc7397dSJunchao Zhang   opt->dz     = opt->array + 4*n + 2;
1453fcc7397dSJunchao Zhang   opt->X      = opt->array + 5*n + 2;
1454fcc7397dSJunchao Zhang   opt->Y      = opt->array + 6*n + 2;
1455fcc7397dSJunchao Zhang 
1456fcc7397dSJunchao Zhang   for (r=0; r<n; r++) { /* For each destination rank */
1457fcc7397dSJunchao Zhang     m     = offset[r+1] - offset[r]; /* Total number of indices for this rank. We want to see if m can be factored into dx*dy*dz */
1458fcc7397dSJunchao Zhang     p     = offset[r];
1459fcc7397dSJunchao Zhang     start = idx[p]; /* First index for this rank */
1460fcc7397dSJunchao Zhang     p++;
1461fcc7397dSJunchao Zhang 
1462fcc7397dSJunchao Zhang     /* Search in X dimension */
1463fcc7397dSJunchao Zhang     for (dx=1; dx<m; dx++,p++) {
1464fcc7397dSJunchao Zhang       if (start+dx != idx[p]) break;
1465b23bfdefSJunchao Zhang     }
1466b23bfdefSJunchao Zhang 
1467fcc7397dSJunchao Zhang     dydz = m/dx;
1468fcc7397dSJunchao Zhang     X    = dydz > 1 ? (idx[p]-start) : dx;
1469fcc7397dSJunchao Zhang     /* Not optimizable if m is not a multiple of dx, or some unrecognized pattern is found */
1470fcc7397dSJunchao Zhang     if (m%dx || X <= 0) {optimizable = PETSC_FALSE; goto finish;}
1471fcc7397dSJunchao Zhang     for (dy=1; dy<dydz; dy++) { /* Search in Y dimension */
1472fcc7397dSJunchao Zhang       for (i=0; i<dx; i++,p++) {
1473fcc7397dSJunchao Zhang         if (start+X*dy+i != idx[p]) {
1474fcc7397dSJunchao Zhang           if (i) {optimizable = PETSC_FALSE; goto finish;} /* The pattern is violated in the middle of an x-walk */
1475fcc7397dSJunchao Zhang           else goto Z_dimension;
147640e23c03SJunchao Zhang         }
147740e23c03SJunchao Zhang       }
147840e23c03SJunchao Zhang     }
147940e23c03SJunchao Zhang 
1480fcc7397dSJunchao Zhang Z_dimension:
1481fcc7397dSJunchao Zhang     dz = m/(dx*dy);
1482fcc7397dSJunchao Zhang     Y  = dz > 1 ? (idx[p]-start)/X : dy;
1483fcc7397dSJunchao Zhang     /* Not optimizable if m is not a multiple of dx*dy, or some unrecognized pattern is found */
1484fcc7397dSJunchao Zhang     if (m%(dx*dy) || Y <= 0) {optimizable = PETSC_FALSE; goto finish;}
1485fcc7397dSJunchao Zhang     for (k=1; k<dz; k++) { /* Go through Z dimension to see if remaining indices follow the pattern */
1486fcc7397dSJunchao Zhang       for (j=0; j<dy; j++) {
1487fcc7397dSJunchao Zhang         for (i=0; i<dx; i++,p++) {
1488fcc7397dSJunchao Zhang           if (start+X*Y*k+X*j+i != idx[p]) {optimizable = PETSC_FALSE; goto finish;}
148940e23c03SJunchao Zhang         }
149040e23c03SJunchao Zhang       }
149140e23c03SJunchao Zhang     }
1492fcc7397dSJunchao Zhang     opt->start[r] = start;
1493fcc7397dSJunchao Zhang     opt->dx[r]    = dx;
1494fcc7397dSJunchao Zhang     opt->dy[r]    = dy;
1495fcc7397dSJunchao Zhang     opt->dz[r]    = dz;
1496fcc7397dSJunchao Zhang     opt->X[r]     = X;
1497fcc7397dSJunchao Zhang     opt->Y[r]     = Y;
149840e23c03SJunchao Zhang   }
149940e23c03SJunchao Zhang 
1500fcc7397dSJunchao Zhang finish:
1501fcc7397dSJunchao Zhang   /* If not optimizable, free arrays to save memory */
1502fcc7397dSJunchao Zhang   if (!n || !optimizable) {
1503fcc7397dSJunchao Zhang     ierr = PetscFree(opt->array);CHKERRQ(ierr);
150440e23c03SJunchao Zhang     ierr = PetscFree(opt);CHKERRQ(ierr);
150540e23c03SJunchao Zhang     *out = NULL;
1506fcc7397dSJunchao Zhang   } else {
1507fcc7397dSJunchao Zhang     opt->offset[0] = 0;
1508fcc7397dSJunchao Zhang     for (r=0; r<n; r++) opt->offset[r+1] = opt->offset[r] + opt->dx[r]*opt->dy[r]*opt->dz[r];
1509fcc7397dSJunchao Zhang     *out = opt;
1510fcc7397dSJunchao Zhang   }
151140e23c03SJunchao Zhang   PetscFunctionReturn(0);
151240e23c03SJunchao Zhang }
151340e23c03SJunchao Zhang 
151420c24465SJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFDestroyPackOpt(PetscSF sf,PetscMemType mtype,PetscSFPackOpt *out)
151540e23c03SJunchao Zhang {
151640e23c03SJunchao Zhang   PetscErrorCode ierr;
151740e23c03SJunchao Zhang   PetscSFPackOpt opt = *out;
151840e23c03SJunchao Zhang 
151940e23c03SJunchao Zhang   PetscFunctionBegin;
152040e23c03SJunchao Zhang   if (opt) {
152120c24465SJunchao Zhang     ierr = PetscSFFree(sf,mtype,opt->array);CHKERRQ(ierr);
152240e23c03SJunchao Zhang     ierr = PetscFree(opt);CHKERRQ(ierr);
152340e23c03SJunchao Zhang     *out = NULL;
152440e23c03SJunchao Zhang   }
152540e23c03SJunchao Zhang   PetscFunctionReturn(0);
152640e23c03SJunchao Zhang }
1527cd620004SJunchao Zhang 
1528cd620004SJunchao Zhang PetscErrorCode PetscSFSetUpPackFields(PetscSF sf)
1529cd620004SJunchao Zhang {
1530cd620004SJunchao Zhang   PetscErrorCode ierr;
1531cd620004SJunchao Zhang   PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
1532cd620004SJunchao Zhang   PetscInt       i,j;
1533cd620004SJunchao Zhang 
1534cd620004SJunchao Zhang   PetscFunctionBegin;
1535cd620004SJunchao Zhang   /* [0] for PETSCSF_LOCAL and [1] for PETSCSF_REMOTE in the following */
1536cd620004SJunchao Zhang   for (i=0; i<2; i++) { /* Set defaults */
1537cd620004SJunchao Zhang     sf->leafstart[i]   = 0;
1538cd620004SJunchao Zhang     sf->leafcontig[i]  = PETSC_TRUE;
1539cd620004SJunchao Zhang     sf->leafdups[i]    = PETSC_FALSE;
1540cd620004SJunchao Zhang     bas->rootstart[i]  = 0;
1541cd620004SJunchao Zhang     bas->rootcontig[i] = PETSC_TRUE;
1542cd620004SJunchao Zhang     bas->rootdups[i]   = PETSC_FALSE;
1543cd620004SJunchao Zhang   }
1544cd620004SJunchao Zhang 
1545cd620004SJunchao Zhang   sf->leafbuflen[0] = sf->roffset[sf->ndranks];
1546cd620004SJunchao Zhang   sf->leafbuflen[1] = sf->roffset[sf->nranks] - sf->roffset[sf->ndranks];
1547cd620004SJunchao Zhang 
1548cd620004SJunchao Zhang   if (sf->leafbuflen[0]) sf->leafstart[0] = sf->rmine[0];
1549cd620004SJunchao Zhang   if (sf->leafbuflen[1]) sf->leafstart[1] = sf->rmine[sf->roffset[sf->ndranks]];
1550cd620004SJunchao Zhang 
1551cd620004SJunchao Zhang   /* Are leaf indices for self and remote contiguous? If yes, it is best for pack/unpack */
1552cd620004SJunchao Zhang   for (i=0; i<sf->roffset[sf->ndranks]; i++) { /* self */
1553cd620004SJunchao Zhang     if (sf->rmine[i] != sf->leafstart[0]+i) {sf->leafcontig[0] = PETSC_FALSE; break;}
1554cd620004SJunchao Zhang   }
1555cd620004SJunchao Zhang   for (i=sf->roffset[sf->ndranks],j=0; i<sf->roffset[sf->nranks]; i++,j++) { /* remote */
1556cd620004SJunchao Zhang     if (sf->rmine[i] != sf->leafstart[1]+j) {sf->leafcontig[1] = PETSC_FALSE; break;}
1557cd620004SJunchao Zhang   }
1558cd620004SJunchao Zhang 
1559cd620004SJunchao Zhang   /* If not, see if we can have per-rank optimizations by doing index analysis */
1560cd620004SJunchao Zhang   if (!sf->leafcontig[0]) {ierr = PetscSFCreatePackOpt(sf->ndranks,            sf->roffset,             sf->rmine, &sf->leafpackopt[0]);CHKERRQ(ierr);}
1561cd620004SJunchao Zhang   if (!sf->leafcontig[1]) {ierr = PetscSFCreatePackOpt(sf->nranks-sf->ndranks, sf->roffset+sf->ndranks, sf->rmine, &sf->leafpackopt[1]);CHKERRQ(ierr);}
1562cd620004SJunchao Zhang 
1563cd620004SJunchao Zhang   /* Are root indices for self and remote contiguous? */
1564cd620004SJunchao Zhang   bas->rootbuflen[0] = bas->ioffset[bas->ndiranks];
1565cd620004SJunchao Zhang   bas->rootbuflen[1] = bas->ioffset[bas->niranks] - bas->ioffset[bas->ndiranks];
1566cd620004SJunchao Zhang 
1567cd620004SJunchao Zhang   if (bas->rootbuflen[0]) bas->rootstart[0] = bas->irootloc[0];
1568cd620004SJunchao Zhang   if (bas->rootbuflen[1]) bas->rootstart[1] = bas->irootloc[bas->ioffset[bas->ndiranks]];
1569cd620004SJunchao Zhang 
1570cd620004SJunchao Zhang   for (i=0; i<bas->ioffset[bas->ndiranks]; i++) {
1571cd620004SJunchao Zhang     if (bas->irootloc[i] != bas->rootstart[0]+i) {bas->rootcontig[0] = PETSC_FALSE; break;}
1572cd620004SJunchao Zhang   }
1573cd620004SJunchao Zhang   for (i=bas->ioffset[bas->ndiranks],j=0; i<bas->ioffset[bas->niranks]; i++,j++) {
1574cd620004SJunchao Zhang     if (bas->irootloc[i] != bas->rootstart[1]+j) {bas->rootcontig[1] = PETSC_FALSE; break;}
1575cd620004SJunchao Zhang   }
1576cd620004SJunchao Zhang 
1577cd620004SJunchao Zhang   if (!bas->rootcontig[0]) {ierr = PetscSFCreatePackOpt(bas->ndiranks,              bas->ioffset,               bas->irootloc, &bas->rootpackopt[0]);CHKERRQ(ierr);}
1578cd620004SJunchao Zhang   if (!bas->rootcontig[1]) {ierr = PetscSFCreatePackOpt(bas->niranks-bas->ndiranks, bas->ioffset+bas->ndiranks, bas->irootloc, &bas->rootpackopt[1]);CHKERRQ(ierr);}
1579cd620004SJunchao Zhang 
15807fd2d3dbSJunchao Zhang #if defined(PETSC_HAVE_DEVICE)
1581cd620004SJunchao Zhang     /* Check dups in indices so that CUDA unpacking kernels can use cheaper regular instructions instead of atomics when they know there are no data race chances */
1582cd620004SJunchao Zhang   if (!sf->leafcontig[0])  {ierr = PetscCheckDupsInt(sf->leafbuflen[0],  sf->rmine,                                 &sf->leafdups[0]);CHKERRQ(ierr);}
1583cd620004SJunchao Zhang   if (!sf->leafcontig[1])  {ierr = PetscCheckDupsInt(sf->leafbuflen[1],  sf->rmine+sf->roffset[sf->ndranks],        &sf->leafdups[1]);CHKERRQ(ierr);}
1584cd620004SJunchao Zhang   if (!bas->rootcontig[0]) {ierr = PetscCheckDupsInt(bas->rootbuflen[0], bas->irootloc,                             &bas->rootdups[0]);CHKERRQ(ierr);}
1585cd620004SJunchao Zhang   if (!bas->rootcontig[1]) {ierr = PetscCheckDupsInt(bas->rootbuflen[1], bas->irootloc+bas->ioffset[bas->ndiranks], &bas->rootdups[1]);CHKERRQ(ierr);}
1586cd620004SJunchao Zhang #endif
1587cd620004SJunchao Zhang 
1588cd620004SJunchao Zhang   PetscFunctionReturn(0);
1589cd620004SJunchao Zhang }
1590cd620004SJunchao Zhang 
1591cd620004SJunchao Zhang PetscErrorCode PetscSFResetPackFields(PetscSF sf)
1592cd620004SJunchao Zhang {
1593cd620004SJunchao Zhang   PetscErrorCode ierr;
1594cd620004SJunchao Zhang   PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
1595cd620004SJunchao Zhang   PetscInt       i;
1596cd620004SJunchao Zhang 
1597cd620004SJunchao Zhang   PetscFunctionBegin;
1598cd620004SJunchao Zhang   for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) {
159920c24465SJunchao Zhang     ierr = PetscSFDestroyPackOpt(sf,PETSC_MEMTYPE_HOST,&sf->leafpackopt[i]);CHKERRQ(ierr);
160020c24465SJunchao Zhang     ierr = PetscSFDestroyPackOpt(sf,PETSC_MEMTYPE_HOST,&bas->rootpackopt[i]);CHKERRQ(ierr);
16017fd2d3dbSJunchao Zhang #if defined(PETSC_HAVE_DEVICE)
160220c24465SJunchao Zhang     ierr = PetscSFDestroyPackOpt(sf,PETSC_MEMTYPE_DEVICE,&sf->leafpackopt_d[i]);CHKERRQ(ierr);
160320c24465SJunchao Zhang     ierr = PetscSFDestroyPackOpt(sf,PETSC_MEMTYPE_DEVICE,&bas->rootpackopt_d[i]);CHKERRQ(ierr);
16047fd2d3dbSJunchao Zhang #endif
1605cd620004SJunchao Zhang   }
1606cd620004SJunchao Zhang   PetscFunctionReturn(0);
1607cd620004SJunchao Zhang }
1608