xref: /petsc/src/vec/is/sf/impls/basic/sfpack.c (revision fb61b9e4c10773d1e08e7a826aed0c51ec8ec9ed)
140e23c03SJunchao Zhang 
240e23c03SJunchao Zhang #include <../src/vec/is/sf/impls/basic/sfpack.h>
340e23c03SJunchao Zhang #include <../src/vec/is/sf/impls/basic/sfbasic.h>
440e23c03SJunchao Zhang 
5eb02082bSJunchao Zhang #if defined(PETSC_HAVE_CUDA)
6eb02082bSJunchao Zhang #include <cuda_runtime.h>
7eb02082bSJunchao Zhang #endif
/*
 * MPI_Reduce_local is not really useful because it can't handle sparse data and it vectorizes "in the wrong direction",
 * therefore we pack data types manually. This file defines packing routines for the standard data types.
 */
1240e23c03SJunchao Zhang 
/* Token-pasting helpers used to build the names of the generated routines, e.g. CPPJoin4(Pack,int,1,1)
   yields Pack_int_1_1. CPPJoin4 forwards to CPPJoin4_ so that b, c and d are macro-expanded before they
   are pasted (needed when an argument is itself a macro call such as PairType(int,int)); a is an operand
   of ## in CPPJoin4 and is therefore pasted exactly as written. */
#define CPPJoin2(a,b)         a ##_## b
#define CPPJoin3(a,b,c)       a ##_## b ##_## c
#define CPPJoin4_(a,b,c,d)    a##b##_##c##_##d
#define CPPJoin4(a,b,c,d)     CPPJoin4_(a##_,b,c,d)

/* Values for the FILTER macro argument: EXECUTE keeps the wrapped statement, IGNORE replaces it with a no-op */
#define EXECUTE(statement)    statement /* no braces since the statement might declare a variable; braces impose an unwanted scope */
#define IGNORE(statement)     do {} while(0)

/* Values for the APPLY macro argument. All of them take (r,s,op,t) and store the result of combining s and t in r */
#define BINARY_OP(r,s,op,t)   do {(r) = (s) op (t);  } while(0)      /* binary ops in the middle such as +, *, && etc. */
#define FUNCTION_OP(r,s,op,t) do {(r) = op((s),(t)); } while(0)      /* ops like a function, such as PetscMax, PetscMin */
#define LXOR_OP(r,s,op,t)     do {(r) = (!(s)) != (!(t));} while(0)  /* logical exclusive OR; the op argument is not used */
#define PAIRTYPE_OP(r,s,op,t) do {(r).a = (s).a op (t).a; (r).b = (s).b op (t).b;} while(0) /* op applied to members a and b separately */

#define PairType(Type1,Type2) Type1##_##Type2 /* typename for struct {Type1 a; Type2 b;} */

/* DEF_PackFunc - macro defining a Pack routine

   Arguments of the macro:
   +Type      Type of the basic data in an entry, i.e., int, PetscInt, PetscReal etc. It is not the type of an entry.
   .BS        Block size for vectorization. It is a factor of bs.
   -EQ        (bs == BS) ? 1 : 0. EQ is a compile-time const to help compiler optimizations. See below.

   Arguments of the Pack routine:
   +count     Number of indices in idx[]
   .idx       Indices of entries to be packed. NULL means contiguous indices, that is [0,count)
   .link      Provide a context for the current call, such as link->bs, number of basic types in an entry. Ex. if unit is MPI_2INT, then bs=2 and the basic type is int.
   .opt       Pack optimization plans. NULL means no plan at all.
   .unpacked  Address of the unpacked data. The entries to be packed are unpacked[idx[i]], for i in [0,count)
   -packed    Address of the packed data for each rank

   Notes:
   The generated routine is named Pack_<Type>_<BS>_<EQ>.

   When a plan is given, the routine walks r in [0,opt->n). The packed data of r starts at entry opt->offset[r], and
   opt->type[r] selects how it is gathered:
   PETSCSF_PACKOPT_NONE       entry by entry through idx[opt->offset[r]] .. idx[opt->offset[r+1]-1]
   PETSCSF_PACKOPT_MULTICOPY  as pieces i in [opt->copy_offset[r],opt->copy_offset[r+1]); piece i is opt->copy_length[i]
                              contiguous entries starting at entry idx[opt->copy_start[i]]
   PETSCSF_PACKOPT_STRIDE     as opt->stride_n[r] entries starting at entry idx[opt->offset[r]], consecutive ones being
                              opt->stride_step[r] entries apart
   Any other type raises PETSC_ERR_PLIB.
 */
#define DEF_PackFunc(Type,BS,EQ) \
  static PetscErrorCode CPPJoin4(Pack,Type,BS,EQ)(PetscInt count,const PetscInt *idx,PetscSFPack link,PetscSFPackOpt opt,const void *unpacked,void *packed) \
  {                                                                                                          \
    PetscErrorCode ierr;                                                                                     \
    const Type     *u = (const Type*)unpacked,*u2;                                                           \
    Type           *p = (Type*)packed,*p2;                                                                   \
    PetscInt       i,j,k,l,r,step,bs=link->bs;                                                               \
    const PetscInt *idx2,M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */    \
    const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */                \
    PetscFunctionBegin;                                                                                      \
    if (!idx) {ierr = PetscArraycpy(p,u,MBS*count);CHKERRQ(ierr);}  /* Indices are contiguous */             \
    else if (!opt) { /* No optimizations available */                                                        \
      for (i=0; i<count; i++)                                                                                \
        for (j=0; j<M; j++)     /* Decent compilers should eliminate this loop when M = const 1 */           \
          for (k=0; k<BS; k++)  /* Compiler either unrolls (BS=1) or vectorizes (BS=2,4,8,etc) this loop */  \
            p[i*MBS+j*BS+k] = u[idx[i]*MBS+j*BS+k];                                                          \
    } else {                                                                                                 \
      for (r=0; r<opt->n; r++) {                                                                             \
        p2  = p + opt->offset[r]*MBS; /* where the packed data of r begins */                                \
        if (opt->type[r] == PETSCSF_PACKOPT_NONE) {                                                          \
          idx2 = idx + opt->offset[r];                                                                       \
          for (i=0; i<opt->offset[r+1]-opt->offset[r]; i++)                                                  \
            for (j=0; j<M; j++)                                                                              \
              for (k=0; k<BS; k++)                                                                           \
                p2[i*MBS+j*BS+k] = u[idx2[i]*MBS+j*BS+k];                                                    \
        } else if (opt->type[r] == PETSCSF_PACKOPT_MULTICOPY) {                                              \
          for (i=opt->copy_offset[r]; i<opt->copy_offset[r+1]; i++) { /* i-th piece */                       \
            u2   = u + idx[opt->copy_start[i]]*MBS;                                                          \
            l    = opt->copy_length[i]*MBS; /* length in basic type such as MPI_INT */                       \
            ierr = PetscArraycpy(p2,u2,l);CHKERRQ(ierr);                                                     \
            p2  += l; /* pieces are stored back to back in the packed buffer */                              \
          }                                                                                                  \
        } else if (opt->type[r] == PETSCSF_PACKOPT_STRIDE) {                                                 \
          u2   = u + idx[opt->offset[r]]*MBS;                                                                \
          step = opt->stride_step[r];                                                                        \
          for (i=0; i<opt->stride_n[r]; i++)                                                                 \
            for (j=0; j<MBS; j++) p2[i*MBS+j] = u2[i*step*MBS+j];                                            \
        } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Unknown SFPack optimzation type %D",(PetscInt)opt->type[r]); /* cast: %D takes a PetscInt */ \
      }                                                                                                      \
    }                                                                                                        \
    PetscFunctionReturn(0);                                                                                  \
  }

/* DEF_Action - macro defining a Unpack(Fetch)AndInsert routine

   Arguments:
  +action     Unpack or Fetch
  .Type       Type of the data
  .BS         Block size for vectorization
  .EQ         (bs == BS) ? 1 : 0. EQ is a compile-time const.
  .FILTER     Macro defining what to do with a statement, either EXECUTE or IGNORE
  .CType      Type with or without the const qualifier, i.e., const Type or Type
  -Cvoid      void with or without the const qualifier, i.e., const void or void

  Notes:
   The generated routine is named <action>AndInsert_<Type>_<BS>_<EQ>. It takes the same count, idx, link and opt
   arguments as the Pack routine and copies the packed entries into unpacked. The statements wrapped in FILTER form
   the fetch part: with FILTER=EXECUTE the old values of unpacked are saved and written back into packed, with
   FILTER=IGNORE they are compiled out and packed is only read.

   This macro is not combined with DEF_ActionAndOp because we want to use memcpy in this macro.
   The two arguments CType and Cvoid are used (instead of one constness argument), because we want to
   get rid of compilation warning "empty macro arguments are undefined in ISO C90". With one constness argument,
   sometimes we input 'const', sometimes we have to input empty.

   If action is Fetch, we may do Malloc/Free in the routine. It is costly but the expectation is that this case is really rare.
 */
#define DEF_Action(action,Type,BS,EQ,FILTER,CType,Cvoid)               \
  static PetscErrorCode CPPJoin4(action##AndInsert,Type,BS,EQ)(PetscInt count,const PetscInt *idx,PetscSFPack link,PetscSFPackOpt opt,void *unpacked,Cvoid *packed) \
  {                                                                                                          \
    PetscErrorCode ierr;                                                                                     \
    Type           *u = (Type*)unpacked,*u2;                                                                 \
    CType          *p = (CType*)packed,*p2;                                                                  \
    PetscInt       i,j,k,l,r,step,bs=link->bs;                                                               \
    const PetscInt *idx2,M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */    \
    const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */                \
    PetscFunctionBegin;                                                                                      \
    if (!idx) { /* Indices are contiguous: whole-array copies, swapping through the temporary v when fetching */ \
      FILTER(Type *v);                                                                                       \
      FILTER(ierr = PetscMalloc1(count*MBS,&v);CHKERRQ(ierr));                                               \
      FILTER(ierr = PetscArraycpy(v,u,count*MBS);CHKERRQ(ierr));                                             \
             ierr = PetscArraycpy(u,p,count*MBS);CHKERRQ(ierr);                                              \
      FILTER(ierr = PetscArraycpy(p,v,count*MBS);CHKERRQ(ierr));                                             \
      FILTER(ierr = PetscFree(v);CHKERRQ(ierr));                                                             \
    } else if (!opt) { /* No optimizations available */                                                      \
      for (i=0; i<count; i++)                                                                                \
        for (j=0; j<M; j++)                                                                                  \
          for (k=0; k<BS; k++) {                                                                             \
            FILTER(Type t                = u[idx[i]*MBS+j*BS+k]);                                            \
                   u[idx[i]*MBS+j*BS+k]  = p[i*MBS+j*BS+k];                                                  \
            FILTER(p[i*MBS+j*BS+k]       = t);                                                               \
          }                                                                                                  \
    } else {                                                                                                 \
      for (r=0; r<opt->n; r++) {                                                                             \
        p2 = p + opt->offset[r]*MBS; /* where the packed data of r begins */                                 \
        if (opt->type[r] == PETSCSF_PACKOPT_NONE) {                                                          \
          idx2 = idx + opt->offset[r];                                                                       \
          for (i=0; i<opt->offset[r+1]-opt->offset[r]; i++)                                                  \
            for (j=0; j<M; j++)                                                                              \
              for (k=0; k<BS; k++) {                                                                         \
                FILTER(Type t                = u[idx2[i]*MBS+j*BS+k]);                                       \
                       u[idx2[i]*MBS+j*BS+k] = p2[i*MBS+j*BS+k];                                             \
                FILTER(p2[i*MBS+j*BS+k]      = t);                                                           \
              }                                                                                              \
        } else if (opt->type[r] == PETSCSF_PACKOPT_MULTICOPY) {                                              \
          FILTER(Type *v);                                                                                   \
          FILTER(ierr = PetscMalloc1((opt->offset[r+1]-opt->offset[r])*MBS,&v);CHKERRQ(ierr)); /* max buf */ \
          for (i=opt->copy_offset[r]; i<opt->copy_offset[r+1]; i++) { /* i-th piece */                       \
            u2 = u + idx[opt->copy_start[i]]*MBS;                                                            \
            l  = opt->copy_length[i]*MBS; /* length in basic type */                                         \
            FILTER(ierr = PetscArraycpy(v,u2,l);CHKERRQ(ierr));                                              \
                   ierr = PetscArraycpy(u2,p2,l);CHKERRQ(ierr);                                              \
            FILTER(ierr = PetscArraycpy(p2,v,l);CHKERRQ(ierr));                                              \
            p2 += l;                                                                                         \
          }                                                                                                  \
          FILTER(ierr = PetscFree(v);CHKERRQ(ierr));                                                         \
        } else if (opt->type[r] == PETSCSF_PACKOPT_STRIDE) {                                                 \
          u2   = u + idx[opt->offset[r]]*MBS;                                                                \
          step = opt->stride_step[r];                                                                        \
          for (i=0; i<opt->stride_n[r]; i++)                                                                 \
            for (j=0; j<M; j++)                                                                              \
              for (k=0; k<BS; k++) {                                                                         \
                FILTER(Type t                = u2[i*step*MBS+j*BS+k]);                                       \
                       u2[i*step*MBS+j*BS+k] = p2[i*MBS+j*BS+k];                                             \
                FILTER(p2[i*MBS+j*BS+k]      = t);                                                           \
              }                                                                                              \
        } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Unknown SFPack optimzation type %D",(PetscInt)opt->type[r]); /* cast: %D takes a PetscInt */ \
      }                                                                                                      \
    }                                                                                                        \
    PetscFunctionReturn(0);                                                                                  \
  }

/* DEF_ActionAndOp - macro defining a Unpack(Fetch)AndOp routine. Op can not be Insert, Maxloc or Minloc

   Arguments:
  +action     Unpack or Fetch
  .opname     Name of the Op, such as Add, Mult, LAND, etc.
  .Type       Type of the data
  .BS         Block size for vectorization
  .EQ         (bs == BS) ? 1 : 0. EQ is a compile-time const.
  .op         Operator for the op, such as +, *, &&, ||, PetscMax, PetscMin, etc.
  .APPLY      Macro defining application of the op. Could be BINARY_OP, FUNCTION_OP, LXOR_OP or PAIRTYPE_OP
  .FILTER     Macro defining what to do with a statement, either EXECUTE or IGNORE
  .CType      Type with or without the const qualifier, i.e., const Type or Type
  -Cvoid      void with or without the const qualifier, i.e., const void or void

  Notes:
   The generated routine is named <action>And<opname>_<Type>_<BS>_<EQ>. For every selected entry it reads the old
   unpacked value into t, stores (t op packed) back into unpacked through APPLY and, when FILTER=EXECUTE, writes the
   old value t into packed. The entries are selected by idx and opt in the same way as in the Pack routine.
 */
#define DEF_ActionAndOp(action,opname,Type,BS,EQ,op,APPLY,FILTER,CType,Cvoid) \
  static PetscErrorCode CPPJoin4(action##And##opname,Type,BS,EQ)(PetscInt count,const PetscInt *idx,PetscSFPack link,PetscSFPackOpt opt,void *unpacked,Cvoid *packed) \
  {                                                                                                          \
    Type           *u = (Type*)unpacked,*u2,t;                                                               \
    CType          *p = (CType*)packed,*p2;                                                                  \
    PetscInt       i,j,k,l,r,step,bs=link->bs;                                                               \
    const PetscInt *idx2,M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */    \
    const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */                \
    PetscFunctionBegin;                                                                                      \
    if (!idx) { /* Indices are contiguous */                                                                 \
      for (i=0; i<count; i++)                                                                                \
        for (j=0; j<M; j++)                                                                                  \
          for (k=0; k<BS; k++) {                                                                             \
            t    = u[i*MBS+j*BS+k];                                                                          \
            APPLY (u[i*MBS+j*BS+k],t,op,p[i*MBS+j*BS+k]);                                                    \
            FILTER(p[i*MBS+j*BS+k] = t);                                                                     \
          }                                                                                                  \
    } else if (!opt) { /* No optimizations available */                                                      \
      for (i=0; i<count; i++)                                                                                \
        for (j=0; j<M; j++)                                                                                  \
          for (k=0; k<BS; k++) {                                                                             \
              t    = u[idx[i]*MBS+j*BS+k];                                                                   \
              APPLY (u[idx[i]*MBS+j*BS+k],t,op,p[i*MBS+j*BS+k]);                                             \
              FILTER(p[i*MBS+j*BS+k] = t);                                                                   \
          }                                                                                                  \
    } else {                                                                                                 \
      for (r=0; r<opt->n; r++) {                                                                             \
        p2 = p + opt->offset[r]*MBS; /* where the packed data of r begins */                                 \
        if (opt->type[r] == PETSCSF_PACKOPT_NONE) {                                                          \
          idx2 = idx + opt->offset[r];                                                                       \
          for (i=0; i<opt->offset[r+1]-opt->offset[r]; i++)                                                  \
            for (j=0; j<M; j++)                                                                              \
              for (k=0; k<BS; k++) {                                                                         \
                t    = u[idx2[i]*MBS+j*BS+k];                                                                \
                APPLY (u[idx2[i]*MBS+j*BS+k],t,op,p2[i*MBS+j*BS+k]);                                         \
                FILTER(p2[i*MBS+j*BS+k] = t);                                                                \
              }                                                                                              \
        } else if (opt->type[r] == PETSCSF_PACKOPT_MULTICOPY) {                                              \
          for (i=opt->copy_offset[r]; i<opt->copy_offset[r+1]; i++) { /* i-th piece */                       \
            u2 = u + idx[opt->copy_start[i]]*MBS;                                                            \
            l  = opt->copy_length[i]*MBS; /* length in basic type */                                         \
            for (j=0; j<l; j++) {                                                                            \
              t    = u2[j];                                                                                  \
              APPLY (u2[j],t,op,p2[j]);                                                                      \
              FILTER(p2[j] = t);                                                                             \
            }                                                                                                \
            p2 += l;                                                                                         \
          }                                                                                                  \
        } else if (opt->type[r] == PETSCSF_PACKOPT_STRIDE) {                                                 \
          u2   = u + idx[opt->offset[r]]*MBS;                                                                \
          step = opt->stride_step[r];                                                                        \
          for (i=0; i<opt->stride_n[r]; i++)                                                                 \
            for (j=0; j<M; j++)                                                                              \
              for (k=0; k<BS; k++) {                                                                         \
                t    = u2[i*step*MBS+j*BS+k];                                                                \
                APPLY (u2[i*step*MBS+j*BS+k],t,op,p2[i*MBS+j*BS+k]);                                         \
                FILTER(p2[i*MBS+j*BS+k] = t);                                                                \
              }                                                                                              \
        } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Unknown SFPack optimzation type %D",(PetscInt)opt->type[r]); /* cast: %D takes a PetscInt */ \
      }                                                                                                      \
    }                                                                                                        \
    PetscFunctionReturn(0);                                                                                  \
  }

/* DEF_ActionAndXloc - macro defining a Unpack(Fetch)AndMaxloc(Minloc) routine

   Arguments:
  +action     Unpack or Fetch
  .locname    Max or Min
  .Type1      Type of the first data in a pair type
  .Type2      Type of the second data in a pair type, usually PetscMPIInt for MPI ranks.
  .op         > or <
  .FILTER     Macro defining what to do with a statement, either EXECUTE or IGNORE
  .CType      Type with or without the const qualifier, i.e., const PairType(Type1,Type2) or PairType(Type1,Type2)
  -Cvoid      void with or without the const qualifier, i.e., const void or void

  Notes:
   The generated routine is named <action>And<locname>loc_<PairType>_1_1. For each i in [0,count) it updates the pair
   unpacked[idx[i]] (unpacked[i] when idx is NULL) from packed[i]: the packed pair replaces the unpacked one when its
   member a compares "op" against the unpacked a; when both a are equal only b is updated, to the smaller of the two.
   With FILTER=EXECUTE the old unpacked pair is written back into packed[i]. The link and opt arguments are not used.
 */
#define DEF_ActionAndXloc(action,locname,Type1,Type2,op,FILTER,CType,Cvoid) \
  static PetscErrorCode CPPJoin4(action##And##locname##loc,PairType(Type1,Type2),1,1)(PetscInt count,const PetscInt *idx,PetscSFPack link,PetscSFPackOpt opt,void *unpacked,Cvoid *packed) { \
    PairType(Type1,Type2) *u = (PairType(Type1,Type2)*)unpacked;                                             \
    CType                 *p = (CType*)packed;                                                               \
    PetscInt              i,j;                                                                               \
    PetscFunctionBegin; /* must pair with the PetscFunctionReturn() below */                                 \
    for (i=0; i<count; i++) {                                                                                \
      FILTER(PairType(Type1,Type2) v);                                                                       \
      j = idx? idx[i] : i;                                                                                   \
      FILTER(v = u[j]); /* remember the old pair before it is overwritten */                                 \
      if (p[i].a op u[j].a) {                                                                                \
        u[j] = p[i];                                                                                         \
      } else if (p[i].a == u[j].a) {                                                                         \
        u[j].b = PetscMin(u[j].b,p[i].b); /* Minimal rank. Ref MPI MAXLOC */                                 \
      }                                                                                                      \
      FILTER(p[i] = v);                                                                                      \
    }                                                                                                        \
    PetscFunctionReturn(0);                                                                                  \
  }

/*
  DEF_Pack - instantiate the plain data-movement routines for one <Type,BS,EQ> combination.

  Expands DEF_PackFunc once and DEF_Action twice (an Unpack flavor with IGNORE and const-qualified
  buffer types, and a Fetch flavor with EXECUTE and writable buffer types), then defines

     static void PackInit_Pack_<Type>_<BS>_<EQ>(PetscSFPack)

  which stores the routines named Pack_<Type>_<BS>_<EQ>, UnpackAndInsert_<Type>_<BS>_<EQ> and
  FetchAndInsert_<Type>_<BS>_<EQ> (presumably produced by the two helper macros above) into the
  h_Pack, h_UnpackAndInsert and h_FetchAndInsert slots of the link.
*/
#define DEF_Pack(Type,BS,EQ) \
  DEF_PackFunc(Type,BS,EQ) \
  DEF_Action(Unpack,Type,BS,EQ,IGNORE, const Type,const void) \
  DEF_Action(Fetch, Type,BS,EQ,EXECUTE,Type,void) \
  static void CPPJoin4(PackInit_Pack,Type,BS,EQ)(PetscSFPack pk) { \
    pk->h_Pack            = CPPJoin4(Pack,Type,BS,EQ); \
    pk->h_UnpackAndInsert = CPPJoin4(UnpackAndInsert,Type,BS,EQ); \
    pk->h_FetchAndInsert  = CPPJoin4(FetchAndInsert,Type,BS,EQ); \
  }
28940e23c03SJunchao Zhang 
/*
  DEF_Add - instantiate the arithmetic reductions (Add uses '+', Mult uses '*') for <Type,BS,EQ>.

  Each op is expanded through DEF_ActionAndOp in an Unpack flavor (IGNORE, const buffers) and a
  Fetch flavor (EXECUTE, writable buffers); both use BINARY_OP, i.e. r = s op t.
  PackInit_Add_<Type>_<BS>_<EQ>() then records the four routines in the link's
  h_UnpackAndAdd, h_UnpackAndMult, h_FetchAndAdd and h_FetchAndMult slots.
*/
#define DEF_Add(Type,BS,EQ) \
  DEF_ActionAndOp(Unpack,Add, Type,BS,EQ,+,BINARY_OP,IGNORE, const Type,const void) \
  DEF_ActionAndOp(Fetch, Add, Type,BS,EQ,+,BINARY_OP,EXECUTE,Type,void) \
  DEF_ActionAndOp(Unpack,Mult,Type,BS,EQ,*,BINARY_OP,IGNORE, const Type,const void) \
  DEF_ActionAndOp(Fetch, Mult,Type,BS,EQ,*,BINARY_OP,EXECUTE,Type,void) \
  static void CPPJoin4(PackInit_Add,Type,BS,EQ)(PetscSFPack pk) { \
    pk->h_UnpackAndAdd  = CPPJoin4(UnpackAndAdd,Type,BS,EQ); \
    pk->h_FetchAndAdd   = CPPJoin4(FetchAndAdd,Type,BS,EQ); \
    pk->h_UnpackAndMult = CPPJoin4(UnpackAndMult,Type,BS,EQ); \
    pk->h_FetchAndMult  = CPPJoin4(FetchAndMult,Type,BS,EQ); \
  }
30240e23c03SJunchao Zhang 
/*
  DEF_Cmp - instantiate the comparison reductions (Max, Min) for <Type,BS,EQ>.

  The ops are PetscMax and PetscMin applied through FUNCTION_OP, i.e. r = op(s,t). As with the
  other generators there is an Unpack flavor (IGNORE, const buffers) and a Fetch flavor (EXECUTE,
  writable buffers). PackInit_Compare_<Type>_<BS>_<EQ>() stores the four resulting routines in
  the link's h_UnpackAndMax, h_UnpackAndMin, h_FetchAndMax and h_FetchAndMin slots.
*/
#define DEF_Cmp(Type,BS,EQ) \
  DEF_ActionAndOp(Unpack,Max,Type,BS,EQ,PetscMax,FUNCTION_OP,IGNORE, const Type,const void) \
  DEF_ActionAndOp(Fetch, Max,Type,BS,EQ,PetscMax,FUNCTION_OP,EXECUTE,Type,void) \
  DEF_ActionAndOp(Unpack,Min,Type,BS,EQ,PetscMin,FUNCTION_OP,IGNORE, const Type,const void) \
  DEF_ActionAndOp(Fetch, Min,Type,BS,EQ,PetscMin,FUNCTION_OP,EXECUTE,Type,void) \
  static void CPPJoin4(PackInit_Compare,Type,BS,EQ)(PetscSFPack pk) { \
    pk->h_UnpackAndMax = CPPJoin4(UnpackAndMax,Type,BS,EQ); \
    pk->h_FetchAndMax  = CPPJoin4(FetchAndMax,Type,BS,EQ); \
    pk->h_UnpackAndMin = CPPJoin4(UnpackAndMin,Type,BS,EQ); \
    pk->h_FetchAndMin  = CPPJoin4(FetchAndMin,Type,BS,EQ); \
  }
315b23bfdefSJunchao Zhang 
/*
  DEF_Log - instantiate the logical reductions (LAND, LOR, LXOR) for <Type,BS,EQ>.

  LAND and LOR go through BINARY_OP with && and ||. LXOR goes through LXOR_OP, which never looks
  at its operator argument; the '&' passed for it is only a placeholder, because leaving the
  argument empty triggers the warning "empty macro arguments are undefined in ISO C90".

  PackInit_Logical_<Type>_<BS>_<EQ>() stores the six generated routines (Unpack and Fetch flavor
  of each op) in the corresponding h_UnpackAndLxxx / h_FetchAndLxxx slots of the link.
*/
#define DEF_Log(Type,BS,EQ) \
  DEF_ActionAndOp(Unpack,LAND,Type,BS,EQ,&&,BINARY_OP,IGNORE, const Type,const void) \
  DEF_ActionAndOp(Fetch, LAND,Type,BS,EQ,&&,BINARY_OP,EXECUTE,Type,void) \
  DEF_ActionAndOp(Unpack,LOR, Type,BS,EQ,||,BINARY_OP,IGNORE, const Type,const void) \
  DEF_ActionAndOp(Fetch, LOR, Type,BS,EQ,||,BINARY_OP,EXECUTE,Type,void) \
  DEF_ActionAndOp(Unpack,LXOR,Type,BS,EQ,&, LXOR_OP,  IGNORE, const Type,const void) \
  DEF_ActionAndOp(Fetch, LXOR,Type,BS,EQ,&, LXOR_OP,  EXECUTE,Type,void) \
  static void CPPJoin4(PackInit_Logical,Type,BS,EQ)(PetscSFPack pk) { \
    pk->h_UnpackAndLAND = CPPJoin4(UnpackAndLAND,Type,BS,EQ); \
    pk->h_FetchAndLAND  = CPPJoin4(FetchAndLAND,Type,BS,EQ); \
    pk->h_UnpackAndLOR  = CPPJoin4(UnpackAndLOR,Type,BS,EQ); \
    pk->h_FetchAndLOR   = CPPJoin4(FetchAndLOR,Type,BS,EQ); \
    pk->h_UnpackAndLXOR = CPPJoin4(UnpackAndLXOR,Type,BS,EQ); \
    pk->h_FetchAndLXOR  = CPPJoin4(FetchAndLXOR,Type,BS,EQ); \
  }
33540e23c03SJunchao Zhang 
/*
  DEF_Bit - instantiate the bitwise reductions (BAND '&', BOR '|', BXOR '^') for <Type,BS,EQ>.

  All three ops use BINARY_OP and are generated in an Unpack flavor (IGNORE, const buffers) and a
  Fetch flavor (EXECUTE, writable buffers). PackInit_Bitwise_<Type>_<BS>_<EQ>() stores the six
  routines in the link's h_UnpackAndBxxx / h_FetchAndBxxx slots.
*/
#define DEF_Bit(Type,BS,EQ) \
  DEF_ActionAndOp(Unpack,BAND,Type,BS,EQ,&,BINARY_OP,IGNORE, const Type,const void) \
  DEF_ActionAndOp(Fetch, BAND,Type,BS,EQ,&,BINARY_OP,EXECUTE,Type,void) \
  DEF_ActionAndOp(Unpack,BOR, Type,BS,EQ,|,BINARY_OP,IGNORE, const Type,const void) \
  DEF_ActionAndOp(Fetch, BOR, Type,BS,EQ,|,BINARY_OP,EXECUTE,Type,void) \
  DEF_ActionAndOp(Unpack,BXOR,Type,BS,EQ,^,BINARY_OP,IGNORE, const Type,const void) \
  DEF_ActionAndOp(Fetch, BXOR,Type,BS,EQ,^,BINARY_OP,EXECUTE,Type,void) \
  static void CPPJoin4(PackInit_Bitwise,Type,BS,EQ)(PetscSFPack pk) { \
    pk->h_UnpackAndBAND = CPPJoin4(UnpackAndBAND,Type,BS,EQ); \
    pk->h_FetchAndBAND  = CPPJoin4(FetchAndBAND,Type,BS,EQ); \
    pk->h_UnpackAndBOR  = CPPJoin4(UnpackAndBOR,Type,BS,EQ); \
    pk->h_FetchAndBOR   = CPPJoin4(FetchAndBOR,Type,BS,EQ); \
    pk->h_UnpackAndBXOR = CPPJoin4(UnpackAndBXOR,Type,BS,EQ); \
    pk->h_FetchAndBXOR  = CPPJoin4(FetchAndBXOR,Type,BS,EQ); \
  }
35240e23c03SJunchao Zhang 
/*
  DEF_Xloc - instantiate the Maxloc/Minloc reductions on the pair type PairType(Type1,Type2).

  DEF_ActionAndXloc is expanded with '>' for Max and '<' for Min, each in an Unpack flavor
  (IGNORE, const buffers) and a Fetch flavor (EXECUTE, writable buffers). The generated routines
  always carry the suffix <PairType>_1_1. PackInit_Xloc_<Type1>_<Type2>() stores them in the
  link's h_UnpackAndMaxloc, h_UnpackAndMinloc, h_FetchAndMaxloc and h_FetchAndMinloc slots.
*/
#define DEF_Xloc(Type1,Type2) \
  DEF_ActionAndXloc(Unpack,Max,Type1,Type2,>,IGNORE, const PairType(Type1,Type2),const void) \
  DEF_ActionAndXloc(Fetch, Max,Type1,Type2,>,EXECUTE,PairType(Type1,Type2),void) \
  DEF_ActionAndXloc(Unpack,Min,Type1,Type2,<,IGNORE, const PairType(Type1,Type2),const void) \
  DEF_ActionAndXloc(Fetch, Min,Type1,Type2,<,EXECUTE,PairType(Type1,Type2),void) \
  static void CPPJoin3(PackInit_Xloc,Type1,Type2)(PetscSFPack pk) { \
    pk->h_UnpackAndMaxloc = CPPJoin4(UnpackAndMaxloc,PairType(Type1,Type2),1,1); \
    pk->h_FetchAndMaxloc  = CPPJoin4(FetchAndMaxloc,PairType(Type1,Type2),1,1); \
    pk->h_UnpackAndMinloc = CPPJoin4(UnpackAndMinloc,PairType(Type1,Type2),1,1); \
    pk->h_FetchAndMinloc  = CPPJoin4(FetchAndMinloc,PairType(Type1,Type2),1,1); \
  }
36540e23c03SJunchao Zhang 
/*
  DEF_IntegerType - generate every routine family available for an integer basic type:
  pack/unpack/fetch, add/mult, max/min, logical and bitwise ops.

  Also defines PackInit_IntegerType_<Type>_<BS>_<EQ>(), which calls the five per-family
  PackInit_xxx_<Type>_<BS>_<EQ>() routines on the given link.
*/
#define DEF_IntegerType(Type,BS,EQ) \
  DEF_Pack(Type,BS,EQ) \
  DEF_Add(Type,BS,EQ) \
  DEF_Cmp(Type,BS,EQ) \
  DEF_Log(Type,BS,EQ) \
  DEF_Bit(Type,BS,EQ) \
  static void CPPJoin4(PackInit_IntegerType,Type,BS,EQ)(PetscSFPack pk) { \
    CPPJoin4(PackInit_Pack,Type,BS,EQ)(pk); \
    CPPJoin4(PackInit_Add,Type,BS,EQ)(pk); \
    CPPJoin4(PackInit_Compare,Type,BS,EQ)(pk); \
    CPPJoin4(PackInit_Logical,Type,BS,EQ)(pk); \
    CPPJoin4(PackInit_Bitwise,Type,BS,EQ)(pk); \
  }
37940e23c03SJunchao Zhang 
/*
  DEF_RealType - generate the routine families available for a real basic type:
  pack/unpack/fetch, add/mult and max/min (no logical or bitwise ops).

  Also defines PackInit_RealType_<Type>_<BS>_<EQ>(), which calls the three per-family
  PackInit_xxx_<Type>_<BS>_<EQ>() routines on the given link.
*/
#define DEF_RealType(Type,BS,EQ) \
  DEF_Pack(Type,BS,EQ) \
  DEF_Add(Type,BS,EQ) \
  DEF_Cmp(Type,BS,EQ) \
  static void CPPJoin4(PackInit_RealType,Type,BS,EQ)(PetscSFPack pk) { \
    CPPJoin4(PackInit_Pack,Type,BS,EQ)(pk); \
    CPPJoin4(PackInit_Add,Type,BS,EQ)(pk); \
    CPPJoin4(PackInit_Compare,Type,BS,EQ)(pk); \
  }
38940e23c03SJunchao Zhang 
#if defined(PETSC_HAVE_COMPLEX)
/*
  DEF_ComplexType - generate the routine families available for a complex basic type:
  pack/unpack/fetch and add/mult only (no comparison, logical or bitwise ops).

  Also defines PackInit_ComplexType_<Type>_<BS>_<EQ>(), which calls the two per-family
  PackInit_xxx_<Type>_<BS>_<EQ>() routines on the given link.
*/
#define DEF_ComplexType(Type,BS,EQ) \
  DEF_Pack(Type,BS,EQ) \
  DEF_Add(Type,BS,EQ) \
  static void CPPJoin4(PackInit_ComplexType,Type,BS,EQ)(PetscSFPack pk) { \
    CPPJoin4(PackInit_Pack,Type,BS,EQ)(pk); \
    CPPJoin4(PackInit_Add,Type,BS,EQ)(pk); \
  }
#endif
399b23bfdefSJunchao Zhang 
/*
  DEF_DumbType - generate only the pack/unpack/fetch routines for a type that supports no
  reduction ops, plus PackInit_DumbType_<Type>_<BS>_<EQ>(), which forwards to
  PackInit_Pack_<Type>_<BS>_<EQ>().
*/
#define DEF_DumbType(Type,BS,EQ) \
  DEF_Pack(Type,BS,EQ) \
  static void CPPJoin4(PackInit_DumbType,Type,BS,EQ)(PetscSFPack pk) { \
    CPPJoin4(PackInit_Pack,Type,BS,EQ)(pk); \
  }
405b23bfdefSJunchao Zhang 
/*
  DEF_PairType - support for {value,location} pairs used by Maxloc/Minloc.

  Declares the struct type <Type1>_<Type2> with members a (Type1) and b (Type2), generates its
  pack/unpack/fetch routines (always with BS=1, EQ=1) and its Maxloc/Minloc routines, and defines
  PackInit_PairType_<Type1>_<Type2>(), which calls both initializers on the given link.
  The typedef has to come first since the generated routines refer to the pair type by name.
*/
#define DEF_PairType(Type1,Type2) \
  typedef struct {Type1 a; Type2 b;} PairType(Type1,Type2); \
  DEF_Pack(PairType(Type1,Type2),1,1) \
  DEF_Xloc(Type1,Type2) \
  static void CPPJoin3(PackInit_PairType,Type1,Type2)(PetscSFPack pk) { \
    CPPJoin4(PackInit_Pack,PairType(Type1,Type2),1,1)(pk); \
    CPPJoin3(PackInit_Xloc,Type1,Type2)(pk); \
  }
415b23bfdefSJunchao Zhang 
416b23bfdefSJunchao Zhang DEF_IntegerType(PetscInt,1,1) /* unit = 1 MPIU_INT  */
417b23bfdefSJunchao Zhang DEF_IntegerType(PetscInt,2,1) /* unit = 2 MPIU_INTs */
418b23bfdefSJunchao Zhang DEF_IntegerType(PetscInt,4,1) /* unit = 4 MPIU_INTs */
419b23bfdefSJunchao Zhang DEF_IntegerType(PetscInt,8,1) /* unit = 8 MPIU_INTs */
420b23bfdefSJunchao Zhang DEF_IntegerType(PetscInt,1,0) /* unit = 1*n MPIU_INTs, n>1 */
421b23bfdefSJunchao Zhang DEF_IntegerType(PetscInt,2,0) /* unit = 2*n MPIU_INTs, n>1 */
422b23bfdefSJunchao Zhang DEF_IntegerType(PetscInt,4,0) /* unit = 4*n MPIU_INTs, n>1 */
423b23bfdefSJunchao Zhang DEF_IntegerType(PetscInt,8,0) /* unit = 8*n MPIU_INTs, n>1. Routines with bigger BS are tried first. */
424b23bfdefSJunchao Zhang 
425b23bfdefSJunchao Zhang #if defined(PETSC_USE_64BIT_INDICES) /* Do not need (though it is OK) to generate redundant functions if PetscInt is int */
426b23bfdefSJunchao Zhang DEF_IntegerType(int,1,1)
427b23bfdefSJunchao Zhang DEF_IntegerType(int,2,1)
428b23bfdefSJunchao Zhang DEF_IntegerType(int,4,1)
429b23bfdefSJunchao Zhang DEF_IntegerType(int,8,1)
430b23bfdefSJunchao Zhang DEF_IntegerType(int,1,0)
431b23bfdefSJunchao Zhang DEF_IntegerType(int,2,0)
432b23bfdefSJunchao Zhang DEF_IntegerType(int,4,0)
433b23bfdefSJunchao Zhang DEF_IntegerType(int,8,0)
434b23bfdefSJunchao Zhang #endif
435b23bfdefSJunchao Zhang 
436b23bfdefSJunchao Zhang /* The typedefs are used to get a typename without space that CPPJoin can handle */
437b23bfdefSJunchao Zhang typedef signed char SignedChar;
438b23bfdefSJunchao Zhang DEF_IntegerType(SignedChar,1,1)
439b23bfdefSJunchao Zhang DEF_IntegerType(SignedChar,2,1)
440b23bfdefSJunchao Zhang DEF_IntegerType(SignedChar,4,1)
441b23bfdefSJunchao Zhang DEF_IntegerType(SignedChar,8,1)
442b23bfdefSJunchao Zhang DEF_IntegerType(SignedChar,1,0)
443b23bfdefSJunchao Zhang DEF_IntegerType(SignedChar,2,0)
444b23bfdefSJunchao Zhang DEF_IntegerType(SignedChar,4,0)
445b23bfdefSJunchao Zhang DEF_IntegerType(SignedChar,8,0)
446b23bfdefSJunchao Zhang 
447b23bfdefSJunchao Zhang typedef unsigned char UnsignedChar;
448b23bfdefSJunchao Zhang DEF_IntegerType(UnsignedChar,1,1)
449b23bfdefSJunchao Zhang DEF_IntegerType(UnsignedChar,2,1)
450b23bfdefSJunchao Zhang DEF_IntegerType(UnsignedChar,4,1)
451b23bfdefSJunchao Zhang DEF_IntegerType(UnsignedChar,8,1)
452b23bfdefSJunchao Zhang DEF_IntegerType(UnsignedChar,1,0)
453b23bfdefSJunchao Zhang DEF_IntegerType(UnsignedChar,2,0)
454b23bfdefSJunchao Zhang DEF_IntegerType(UnsignedChar,4,0)
455b23bfdefSJunchao Zhang DEF_IntegerType(UnsignedChar,8,0)
456b23bfdefSJunchao Zhang 
457b23bfdefSJunchao Zhang DEF_RealType(PetscReal,1,1)
458b23bfdefSJunchao Zhang DEF_RealType(PetscReal,2,1)
459b23bfdefSJunchao Zhang DEF_RealType(PetscReal,4,1)
460b23bfdefSJunchao Zhang DEF_RealType(PetscReal,8,1)
461b23bfdefSJunchao Zhang DEF_RealType(PetscReal,1,0)
462b23bfdefSJunchao Zhang DEF_RealType(PetscReal,2,0)
463b23bfdefSJunchao Zhang DEF_RealType(PetscReal,4,0)
464b23bfdefSJunchao Zhang DEF_RealType(PetscReal,8,0)
465b23bfdefSJunchao Zhang 
466b23bfdefSJunchao Zhang #if defined(PETSC_HAVE_COMPLEX)
467b23bfdefSJunchao Zhang DEF_ComplexType(PetscComplex,1,1)
468b23bfdefSJunchao Zhang DEF_ComplexType(PetscComplex,2,1)
469b23bfdefSJunchao Zhang DEF_ComplexType(PetscComplex,4,1)
470b23bfdefSJunchao Zhang DEF_ComplexType(PetscComplex,8,1)
471b23bfdefSJunchao Zhang DEF_ComplexType(PetscComplex,1,0)
472b23bfdefSJunchao Zhang DEF_ComplexType(PetscComplex,2,0)
473b23bfdefSJunchao Zhang DEF_ComplexType(PetscComplex,4,0)
474b23bfdefSJunchao Zhang DEF_ComplexType(PetscComplex,8,0)
475b23bfdefSJunchao Zhang #endif
476b23bfdefSJunchao Zhang 
477b23bfdefSJunchao Zhang DEF_PairType(int,int)
478b23bfdefSJunchao Zhang DEF_PairType(PetscInt,PetscInt)
479b23bfdefSJunchao Zhang 
480b23bfdefSJunchao Zhang /* If we don't know the basic type, we treat it as a stream of chars or ints */
481b23bfdefSJunchao Zhang DEF_DumbType(char,1,1)
482b23bfdefSJunchao Zhang DEF_DumbType(char,2,1)
483b23bfdefSJunchao Zhang DEF_DumbType(char,4,1)
484b23bfdefSJunchao Zhang DEF_DumbType(char,1,0)
485b23bfdefSJunchao Zhang DEF_DumbType(char,2,0)
486b23bfdefSJunchao Zhang DEF_DumbType(char,4,0)
487b23bfdefSJunchao Zhang 
488eb02082bSJunchao Zhang typedef int DumbInt; /* To have a different name than 'int' used above. The name is used to make routine names. */
489b23bfdefSJunchao Zhang DEF_DumbType(DumbInt,1,1)
490b23bfdefSJunchao Zhang DEF_DumbType(DumbInt,2,1)
491b23bfdefSJunchao Zhang DEF_DumbType(DumbInt,4,1)
492b23bfdefSJunchao Zhang DEF_DumbType(DumbInt,8,1)
493b23bfdefSJunchao Zhang DEF_DumbType(DumbInt,1,0)
494b23bfdefSJunchao Zhang DEF_DumbType(DumbInt,2,0)
495b23bfdefSJunchao Zhang DEF_DumbType(DumbInt,4,0)
496b23bfdefSJunchao Zhang DEF_DumbType(DumbInt,8,0)
49740e23c03SJunchao Zhang 
49840e23c03SJunchao Zhang #if !defined(PETSC_HAVE_MPI_TYPE_DUP)
49940e23c03SJunchao Zhang PETSC_STATIC_INLINE int MPI_Type_dup(MPI_Datatype datatype,MPI_Datatype *newtype)
50040e23c03SJunchao Zhang {
50140e23c03SJunchao Zhang   int ierr;
50240e23c03SJunchao Zhang   ierr = MPI_Type_contiguous(1,datatype,newtype); if (ierr) return ierr;
50340e23c03SJunchao Zhang   ierr = MPI_Type_commit(newtype); if (ierr) return ierr;
50440e23c03SJunchao Zhang   return MPI_SUCCESS;
50540e23c03SJunchao Zhang }
50640e23c03SJunchao Zhang #endif
50740e23c03SJunchao Zhang 
5089d1c8addSJunchao Zhang PetscErrorCode PetscSFPackGetInUse(PetscSF sf,MPI_Datatype unit,const void *rkey,const void *lkey,PetscCopyMode cmode,PetscSFPack *mylink)
50940e23c03SJunchao Zhang {
51040e23c03SJunchao Zhang   PetscErrorCode    ierr;
51140e23c03SJunchao Zhang   PetscSFPack       link,*p;
51240e23c03SJunchao Zhang   PetscSF_Basic     *bas=(PetscSF_Basic*)sf->data;
51340e23c03SJunchao Zhang 
51440e23c03SJunchao Zhang   PetscFunctionBegin;
51540e23c03SJunchao Zhang   /* Look for types in cache */
51640e23c03SJunchao Zhang   for (p=&bas->inuse; (link=*p); p=&link->next) {
51740e23c03SJunchao Zhang     PetscBool match;
51840e23c03SJunchao Zhang     ierr = MPIPetsc_Type_compare(unit,link->unit,&match);CHKERRQ(ierr);
5199d1c8addSJunchao Zhang     if (match && (rkey == link->rkey) && (lkey == link->lkey)) {
52040e23c03SJunchao Zhang       switch (cmode) {
52140e23c03SJunchao Zhang       case PETSC_OWN_POINTER: *p = link->next; break; /* Remove from inuse list */
52240e23c03SJunchao Zhang       case PETSC_USE_POINTER: break;
52340e23c03SJunchao Zhang       default: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"invalid cmode");
52440e23c03SJunchao Zhang       }
52540e23c03SJunchao Zhang       *mylink = link;
52640e23c03SJunchao Zhang       PetscFunctionReturn(0);
52740e23c03SJunchao Zhang     }
52840e23c03SJunchao Zhang   }
52940e23c03SJunchao Zhang   SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Could not find pack");
53040e23c03SJunchao Zhang   PetscFunctionReturn(0);
53140e23c03SJunchao Zhang }
53240e23c03SJunchao Zhang 
53340e23c03SJunchao Zhang PetscErrorCode PetscSFPackReclaim(PetscSF sf,PetscSFPack *link)
53440e23c03SJunchao Zhang {
53540e23c03SJunchao Zhang   PetscSF_Basic     *bas=(PetscSF_Basic*)sf->data;
53640e23c03SJunchao Zhang 
53740e23c03SJunchao Zhang   PetscFunctionBegin;
5389d1c8addSJunchao Zhang   (*link)->rkey = NULL;
5399d1c8addSJunchao Zhang   (*link)->lkey = NULL;
54040e23c03SJunchao Zhang   (*link)->next = bas->avail;
54140e23c03SJunchao Zhang   bas->avail    = *link;
54240e23c03SJunchao Zhang   *link         = NULL;
54340e23c03SJunchao Zhang   PetscFunctionReturn(0);
54440e23c03SJunchao Zhang }
54540e23c03SJunchao Zhang 
546eb02082bSJunchao Zhang /* Destroy all links, i.e., PetscSFPacks in the linked list, usually named 'avail' */
54764f49babSJed Brown PetscErrorCode PetscSFPackDestroyAvailable(PetscSFPack *avail)
548eb02082bSJunchao Zhang {
549eb02082bSJunchao Zhang   PetscErrorCode    ierr;
550eb02082bSJunchao Zhang   PetscSFPack       link=*avail,next;
551eb02082bSJunchao Zhang   PetscInt          i;
552eb02082bSJunchao Zhang 
553eb02082bSJunchao Zhang   PetscFunctionBegin;
554eb02082bSJunchao Zhang   for (; link; link=next) {
555eb02082bSJunchao Zhang     next = link->next;
556eb02082bSJunchao Zhang     if (!link->isbuiltin) {ierr = MPI_Type_free(&link->unit);CHKERRQ(ierr);}
557eb02082bSJunchao Zhang     for (i=0; i<(link->nrootreqs+link->nleafreqs)*4; i++) { /* Persistent reqs must be freed. */
558eb02082bSJunchao Zhang       if (link->reqs[i] != MPI_REQUEST_NULL) {ierr = MPI_Request_free(&link->reqs[i]);CHKERRQ(ierr);}
559eb02082bSJunchao Zhang     }
560eb02082bSJunchao Zhang     ierr = PetscFree(link->reqs);CHKERRQ(ierr);
561eb02082bSJunchao Zhang     ierr = PetscFreeWithMemType(PETSC_MEMTYPE_HOST,link->rootbuf[PETSC_MEMTYPE_HOST]);CHKERRQ(ierr);
562eb02082bSJunchao Zhang     ierr = PetscFreeWithMemType(PETSC_MEMTYPE_HOST,link->leafbuf[PETSC_MEMTYPE_HOST]);CHKERRQ(ierr);
563eb02082bSJunchao Zhang     ierr = PetscFreeWithMemType(PETSC_MEMTYPE_HOST,link->selfbuf[PETSC_MEMTYPE_HOST]);CHKERRQ(ierr);
564eb02082bSJunchao Zhang 
565eb02082bSJunchao Zhang #if defined(PETSC_HAVE_CUDA)
566eb02082bSJunchao Zhang     ierr = PetscFreeWithMemType(PETSC_MEMTYPE_DEVICE,link->rootbuf[PETSC_MEMTYPE_DEVICE]);CHKERRQ(ierr);
567eb02082bSJunchao Zhang     ierr = PetscFreeWithMemType(PETSC_MEMTYPE_DEVICE,link->leafbuf[PETSC_MEMTYPE_DEVICE]);CHKERRQ(ierr);
568eb02082bSJunchao Zhang     ierr = PetscFreeWithMemType(PETSC_MEMTYPE_DEVICE,link->selfbuf[PETSC_MEMTYPE_DEVICE]);CHKERRQ(ierr);
569eb02082bSJunchao Zhang     if (link->stream) {cudaError_t err =  cudaStreamDestroy(link->stream);CHKERRCUDA(err); link->stream = NULL;}
570eb02082bSJunchao Zhang #endif
571eb02082bSJunchao Zhang     ierr = PetscFree(link);CHKERRQ(ierr);
572eb02082bSJunchao Zhang   }
573eb02082bSJunchao Zhang   *avail = NULL;
574eb02082bSJunchao Zhang   PetscFunctionReturn(0);
575eb02082bSJunchao Zhang }
576eb02082bSJunchao Zhang 
5779d1c8addSJunchao Zhang /* Error out on unsupported overlapped communications */
5789d1c8addSJunchao Zhang PetscErrorCode PetscSFPackSetErrorOnUnsupportedOverlap(PetscSF sf,MPI_Datatype unit,const void *rkey,const void *lkey)
5799d1c8addSJunchao Zhang {
5809d1c8addSJunchao Zhang   PetscErrorCode    ierr;
5819d1c8addSJunchao Zhang   PetscSFPack       link,*p;
5829d1c8addSJunchao Zhang   PetscSF_Basic     *bas=(PetscSF_Basic*)sf->data;
5839d1c8addSJunchao Zhang   PetscBool         match;
5849d1c8addSJunchao Zhang 
5859d1c8addSJunchao Zhang   PetscFunctionBegin;
58618fb5014SJunchao Zhang   /* Look up links in use and error out if there is a match. When both rootdata and leafdata are NULL, ignore
58718fb5014SJunchao Zhang      the potential overlapping since this process does not participate in communication. Overlapping is harmless.
58818fb5014SJunchao Zhang   */
58918fb5014SJunchao Zhang   if (rkey || lkey) {
5909d1c8addSJunchao Zhang     for (p=&bas->inuse; (link=*p); p=&link->next) {
5919d1c8addSJunchao Zhang       ierr = MPIPetsc_Type_compare(unit,link->unit,&match);CHKERRQ(ierr);
59233c49614SJunchao Zhang       if (match && (rkey == link->rkey) && (lkey == link->lkey)) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for overlapped PetscSF communications with the same SF, rootdata(%p), leafdata(%p) and data type. You can undo the overlap to avoid the error.",rkey,lkey);
5939d1c8addSJunchao Zhang     }
59418fb5014SJunchao Zhang   }
5959d1c8addSJunchao Zhang   PetscFunctionReturn(0);
5969d1c8addSJunchao Zhang }
5979d1c8addSJunchao Zhang 
598eb02082bSJunchao Zhang PetscErrorCode PetscSFPackSetUp_Host(PetscSF sf,PetscSFPack link,MPI_Datatype unit)
59940e23c03SJunchao Zhang {
60040e23c03SJunchao Zhang   PetscErrorCode ierr;
601b23bfdefSJunchao Zhang   PetscInt       nSignedChar=0,nUnsignedChar=0,nInt=0,nPetscInt=0,nPetscReal=0;
602b23bfdefSJunchao Zhang   PetscBool      is2Int,is2PetscInt;
60340e23c03SJunchao Zhang   PetscMPIInt    ni,na,nd,combiner;
60440e23c03SJunchao Zhang #if defined(PETSC_HAVE_COMPLEX)
605b23bfdefSJunchao Zhang   PetscInt       nPetscComplex=0;
60640e23c03SJunchao Zhang #endif
60740e23c03SJunchao Zhang 
60840e23c03SJunchao Zhang   PetscFunctionBegin;
609b23bfdefSJunchao Zhang   ierr = MPIPetsc_Type_compare_contig(unit,MPI_SIGNED_CHAR,  &nSignedChar);CHKERRQ(ierr);
610b23bfdefSJunchao Zhang   ierr = MPIPetsc_Type_compare_contig(unit,MPI_UNSIGNED_CHAR,&nUnsignedChar);CHKERRQ(ierr);
611b23bfdefSJunchao Zhang   /* MPI_CHAR is treated below as a dumb type that does not support reduction according to MPI standard */
612b23bfdefSJunchao Zhang   ierr = MPIPetsc_Type_compare_contig(unit,MPI_INT,  &nInt);CHKERRQ(ierr);
613b23bfdefSJunchao Zhang   ierr = MPIPetsc_Type_compare_contig(unit,MPIU_INT, &nPetscInt);CHKERRQ(ierr);
614b23bfdefSJunchao Zhang   ierr = MPIPetsc_Type_compare_contig(unit,MPIU_REAL,&nPetscReal);CHKERRQ(ierr);
61540e23c03SJunchao Zhang #if defined(PETSC_HAVE_COMPLEX)
616b23bfdefSJunchao Zhang   ierr = MPIPetsc_Type_compare_contig(unit,MPIU_COMPLEX,&nPetscComplex);CHKERRQ(ierr);
61740e23c03SJunchao Zhang #endif
61840e23c03SJunchao Zhang   ierr = MPIPetsc_Type_compare(unit,MPI_2INT,&is2Int);CHKERRQ(ierr);
61940e23c03SJunchao Zhang   ierr = MPIPetsc_Type_compare(unit,MPIU_2INT,&is2PetscInt);CHKERRQ(ierr);
620b23bfdefSJunchao Zhang   /* TODO: shall we also handle Fortran MPI_2REAL? */
62140e23c03SJunchao Zhang   ierr = MPI_Type_get_envelope(unit,&ni,&na,&nd,&combiner);CHKERRQ(ierr);
6225ad15460SJunchao Zhang   link->isbuiltin = (combiner == MPI_COMBINER_NAMED) ? PETSC_TRUE : PETSC_FALSE; /* unit is MPI builtin */
623b23bfdefSJunchao Zhang   link->bs = 1; /* default */
62440e23c03SJunchao Zhang 
625eb02082bSJunchao Zhang   if (is2Int) {
626eb02082bSJunchao Zhang     PackInit_PairType_int_int(link);
627eb02082bSJunchao Zhang     link->bs        = 1;
628eb02082bSJunchao Zhang     link->unitbytes = 2*sizeof(int);
6295ad15460SJunchao Zhang     link->isbuiltin = PETSC_TRUE; /* unit is PETSc builtin */
630eb02082bSJunchao Zhang     link->basicunit = MPI_2INT;
6315ad15460SJunchao Zhang     link->unit      = MPI_2INT;
632eb02082bSJunchao Zhang   } else if (is2PetscInt) { /* TODO: when is2PetscInt and nPetscInt=2, we don't know which path to take. The two paths support different ops. */
633eb02082bSJunchao Zhang     PackInit_PairType_PetscInt_PetscInt(link);
634eb02082bSJunchao Zhang     link->bs        = 1;
635eb02082bSJunchao Zhang     link->unitbytes = 2*sizeof(PetscInt);
636eb02082bSJunchao Zhang     link->basicunit = MPIU_2INT;
6375ad15460SJunchao Zhang     link->isbuiltin = PETSC_TRUE; /* unit is PETSc builtin */
6385ad15460SJunchao Zhang     link->unit      = MPIU_2INT;
639eb02082bSJunchao Zhang   } else if (nPetscReal) {
640b23bfdefSJunchao Zhang     if      (nPetscReal == 8) PackInit_RealType_PetscReal_8_1(link); else if (nPetscReal%8 == 0) PackInit_RealType_PetscReal_8_0(link);
641b23bfdefSJunchao Zhang     else if (nPetscReal == 4) PackInit_RealType_PetscReal_4_1(link); else if (nPetscReal%4 == 0) PackInit_RealType_PetscReal_4_0(link);
642b23bfdefSJunchao Zhang     else if (nPetscReal == 2) PackInit_RealType_PetscReal_2_1(link); else if (nPetscReal%2 == 0) PackInit_RealType_PetscReal_2_0(link);
643b23bfdefSJunchao Zhang     else if (nPetscReal == 1) PackInit_RealType_PetscReal_1_1(link); else if (nPetscReal%1 == 0) PackInit_RealType_PetscReal_1_0(link);
644b23bfdefSJunchao Zhang     link->bs        = nPetscReal;
645eb02082bSJunchao Zhang     link->unitbytes = nPetscReal*sizeof(PetscReal);
64640e23c03SJunchao Zhang     link->basicunit = MPIU_REAL;
6475ad15460SJunchao Zhang     if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPIU_REAL;}
648b23bfdefSJunchao Zhang   } else if (nPetscInt) {
649b23bfdefSJunchao Zhang     if      (nPetscInt == 8) PackInit_IntegerType_PetscInt_8_1(link); else if (nPetscInt%8 == 0) PackInit_IntegerType_PetscInt_8_0(link);
650b23bfdefSJunchao Zhang     else if (nPetscInt == 4) PackInit_IntegerType_PetscInt_4_1(link); else if (nPetscInt%4 == 0) PackInit_IntegerType_PetscInt_4_0(link);
651b23bfdefSJunchao Zhang     else if (nPetscInt == 2) PackInit_IntegerType_PetscInt_2_1(link); else if (nPetscInt%2 == 0) PackInit_IntegerType_PetscInt_2_0(link);
652b23bfdefSJunchao Zhang     else if (nPetscInt == 1) PackInit_IntegerType_PetscInt_1_1(link); else if (nPetscInt%1 == 0) PackInit_IntegerType_PetscInt_1_0(link);
653b23bfdefSJunchao Zhang     link->bs        = nPetscInt;
654eb02082bSJunchao Zhang     link->unitbytes = nPetscInt*sizeof(PetscInt);
655b23bfdefSJunchao Zhang     link->basicunit = MPIU_INT;
6565ad15460SJunchao Zhang     if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPIU_INT;}
657b23bfdefSJunchao Zhang #if defined(PETSC_USE_64BIT_INDICES)
658b23bfdefSJunchao Zhang   } else if (nInt) {
659b23bfdefSJunchao Zhang     if      (nInt == 8) PackInit_IntegerType_int_8_1(link); else if (nInt%8 == 0) PackInit_IntegerType_int_8_0(link);
660b23bfdefSJunchao Zhang     else if (nInt == 4) PackInit_IntegerType_int_4_1(link); else if (nInt%4 == 0) PackInit_IntegerType_int_4_0(link);
661b23bfdefSJunchao Zhang     else if (nInt == 2) PackInit_IntegerType_int_2_1(link); else if (nInt%2 == 0) PackInit_IntegerType_int_2_0(link);
662b23bfdefSJunchao Zhang     else if (nInt == 1) PackInit_IntegerType_int_1_1(link); else if (nInt%1 == 0) PackInit_IntegerType_int_1_0(link);
663b23bfdefSJunchao Zhang     link->bs        = nInt;
664eb02082bSJunchao Zhang     link->unitbytes = nInt*sizeof(int);
665b23bfdefSJunchao Zhang     link->basicunit = MPI_INT;
6665ad15460SJunchao Zhang     if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPI_INT;}
667b23bfdefSJunchao Zhang #endif
668b23bfdefSJunchao Zhang   } else if (nSignedChar) {
669b23bfdefSJunchao Zhang     if      (nSignedChar == 8) PackInit_IntegerType_SignedChar_8_1(link); else if (nSignedChar%8 == 0) PackInit_IntegerType_SignedChar_8_0(link);
670b23bfdefSJunchao Zhang     else if (nSignedChar == 4) PackInit_IntegerType_SignedChar_4_1(link); else if (nSignedChar%4 == 0) PackInit_IntegerType_SignedChar_4_0(link);
671b23bfdefSJunchao Zhang     else if (nSignedChar == 2) PackInit_IntegerType_SignedChar_2_1(link); else if (nSignedChar%2 == 0) PackInit_IntegerType_SignedChar_2_0(link);
672b23bfdefSJunchao Zhang     else if (nSignedChar == 1) PackInit_IntegerType_SignedChar_1_1(link); else if (nSignedChar%1 == 0) PackInit_IntegerType_SignedChar_1_0(link);
673b23bfdefSJunchao Zhang     link->bs        = nSignedChar;
674eb02082bSJunchao Zhang     link->unitbytes = nSignedChar*sizeof(SignedChar);
675b23bfdefSJunchao Zhang     link->basicunit = MPI_SIGNED_CHAR;
6765ad15460SJunchao Zhang     if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPI_SIGNED_CHAR;}
677b23bfdefSJunchao Zhang   }  else if (nUnsignedChar) {
678b23bfdefSJunchao Zhang     if      (nUnsignedChar == 8) PackInit_IntegerType_UnsignedChar_8_1(link); else if (nUnsignedChar%8 == 0) PackInit_IntegerType_UnsignedChar_8_0(link);
679b23bfdefSJunchao Zhang     else if (nUnsignedChar == 4) PackInit_IntegerType_UnsignedChar_4_1(link); else if (nUnsignedChar%4 == 0) PackInit_IntegerType_UnsignedChar_4_0(link);
680b23bfdefSJunchao Zhang     else if (nUnsignedChar == 2) PackInit_IntegerType_UnsignedChar_2_1(link); else if (nUnsignedChar%2 == 0) PackInit_IntegerType_UnsignedChar_2_0(link);
681b23bfdefSJunchao Zhang     else if (nUnsignedChar == 1) PackInit_IntegerType_UnsignedChar_1_1(link); else if (nUnsignedChar%1 == 0) PackInit_IntegerType_UnsignedChar_1_0(link);
682b23bfdefSJunchao Zhang     link->bs        = nUnsignedChar;
683eb02082bSJunchao Zhang     link->unitbytes = nUnsignedChar*sizeof(UnsignedChar);
684b23bfdefSJunchao Zhang     link->basicunit = MPI_UNSIGNED_CHAR;
6855ad15460SJunchao Zhang     if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPI_UNSIGNED_CHAR;}
68640e23c03SJunchao Zhang #if defined(PETSC_HAVE_COMPLEX)
687b23bfdefSJunchao Zhang   } else if (nPetscComplex) {
688b23bfdefSJunchao Zhang     if      (nPetscComplex == 8) PackInit_ComplexType_PetscComplex_8_1(link); else if (nPetscComplex%8 == 0) PackInit_ComplexType_PetscComplex_8_0(link);
689b23bfdefSJunchao Zhang     else if (nPetscComplex == 4) PackInit_ComplexType_PetscComplex_4_1(link); else if (nPetscComplex%4 == 0) PackInit_ComplexType_PetscComplex_4_0(link);
690b23bfdefSJunchao Zhang     else if (nPetscComplex == 2) PackInit_ComplexType_PetscComplex_2_1(link); else if (nPetscComplex%2 == 0) PackInit_ComplexType_PetscComplex_2_0(link);
691b23bfdefSJunchao Zhang     else if (nPetscComplex == 1) PackInit_ComplexType_PetscComplex_1_1(link); else if (nPetscComplex%1 == 0) PackInit_ComplexType_PetscComplex_1_0(link);
692b23bfdefSJunchao Zhang     link->bs        = nPetscComplex;
693eb02082bSJunchao Zhang     link->unitbytes = nPetscComplex*sizeof(PetscComplex);
69440e23c03SJunchao Zhang     link->basicunit = MPIU_COMPLEX;
6955ad15460SJunchao Zhang     if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPIU_COMPLEX;}
69640e23c03SJunchao Zhang #endif
69740e23c03SJunchao Zhang   } else {
698b23bfdefSJunchao Zhang     MPI_Aint lb,nbyte;
699b23bfdefSJunchao Zhang     ierr = MPI_Type_get_extent(unit,&lb,&nbyte);CHKERRQ(ierr);
70040e23c03SJunchao Zhang     if (lb != 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Datatype with nonzero lower bound %ld\n",(long)lb);
701eb02082bSJunchao Zhang     if (nbyte % sizeof(int)) { /* If the type size is not multiple of int */
702eb02082bSJunchao Zhang       if      (nbyte == 4) PackInit_DumbType_char_4_1(link); else if (nbyte%4 == 0) PackInit_DumbType_char_4_0(link);
703eb02082bSJunchao Zhang       else if (nbyte == 2) PackInit_DumbType_char_2_1(link); else if (nbyte%2 == 0) PackInit_DumbType_char_2_0(link);
704eb02082bSJunchao Zhang       else if (nbyte == 1) PackInit_DumbType_char_1_1(link); else if (nbyte%1 == 0) PackInit_DumbType_char_1_0(link);
705eb02082bSJunchao Zhang       link->bs        = nbyte;
706b23bfdefSJunchao Zhang       link->unitbytes = nbyte;
707b23bfdefSJunchao Zhang       link->basicunit = MPI_BYTE;
70840e23c03SJunchao Zhang     } else {
709eb02082bSJunchao Zhang       nInt = nbyte / sizeof(int);
710eb02082bSJunchao Zhang       if      (nInt == 8) PackInit_DumbType_DumbInt_8_1(link); else if (nInt%8 == 0) PackInit_DumbType_DumbInt_8_0(link);
711eb02082bSJunchao Zhang       else if (nInt == 4) PackInit_DumbType_DumbInt_4_1(link); else if (nInt%4 == 0) PackInit_DumbType_DumbInt_4_0(link);
712eb02082bSJunchao Zhang       else if (nInt == 2) PackInit_DumbType_DumbInt_2_1(link); else if (nInt%2 == 0) PackInit_DumbType_DumbInt_2_0(link);
713eb02082bSJunchao Zhang       else if (nInt == 1) PackInit_DumbType_DumbInt_1_1(link); else if (nInt%1 == 0) PackInit_DumbType_DumbInt_1_0(link);
714eb02082bSJunchao Zhang       link->bs        = nInt;
715b23bfdefSJunchao Zhang       link->unitbytes = nbyte;
71640e23c03SJunchao Zhang       link->basicunit = MPI_INT;
71740e23c03SJunchao Zhang     }
7185ad15460SJunchao Zhang     if (link->isbuiltin) link->unit = unit;
71940e23c03SJunchao Zhang   }
720b23bfdefSJunchao Zhang 
7215ad15460SJunchao Zhang   if (!link->isbuiltin) {ierr = MPI_Type_dup(unit,&link->unit);CHKERRQ(ierr);}
72240e23c03SJunchao Zhang   PetscFunctionReturn(0);
72340e23c03SJunchao Zhang }
72440e23c03SJunchao Zhang 
725eb02082bSJunchao Zhang PetscErrorCode PetscSFPackGetUnpackAndOp(PetscSFPack link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**UnpackAndOp)(PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,void*,const void*))
72640e23c03SJunchao Zhang {
72740e23c03SJunchao Zhang   PetscFunctionBegin;
72840e23c03SJunchao Zhang   *UnpackAndOp = NULL;
729eb02082bSJunchao Zhang   if (mtype == PETSC_MEMTYPE_HOST) {
730eb02082bSJunchao Zhang     if      (op == MPIU_REPLACE)              *UnpackAndOp = link->h_UnpackAndInsert;
731eb02082bSJunchao Zhang     else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->h_UnpackAndAdd;
732eb02082bSJunchao Zhang     else if (op == MPI_PROD)                  *UnpackAndOp = link->h_UnpackAndMult;
733eb02082bSJunchao Zhang     else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->h_UnpackAndMax;
734eb02082bSJunchao Zhang     else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->h_UnpackAndMin;
735eb02082bSJunchao Zhang     else if (op == MPI_LAND)                  *UnpackAndOp = link->h_UnpackAndLAND;
736eb02082bSJunchao Zhang     else if (op == MPI_BAND)                  *UnpackAndOp = link->h_UnpackAndBAND;
737eb02082bSJunchao Zhang     else if (op == MPI_LOR)                   *UnpackAndOp = link->h_UnpackAndLOR;
738eb02082bSJunchao Zhang     else if (op == MPI_BOR)                   *UnpackAndOp = link->h_UnpackAndBOR;
739eb02082bSJunchao Zhang     else if (op == MPI_LXOR)                  *UnpackAndOp = link->h_UnpackAndLXOR;
740eb02082bSJunchao Zhang     else if (op == MPI_BXOR)                  *UnpackAndOp = link->h_UnpackAndBXOR;
741eb02082bSJunchao Zhang     else if (op == MPI_MAXLOC)                *UnpackAndOp = link->h_UnpackAndMaxloc;
742eb02082bSJunchao Zhang     else if (op == MPI_MINLOC)                *UnpackAndOp = link->h_UnpackAndMinloc;
743eb02082bSJunchao Zhang   }
744eb02082bSJunchao Zhang #if defined(PETSC_HAVE_CUDA)
745eb02082bSJunchao Zhang   else if (mtype == PETSC_MEMTYPE_DEVICE && !atomic) {
746eb02082bSJunchao Zhang     if      (op == MPIU_REPLACE)              *UnpackAndOp = link->d_UnpackAndInsert;
747eb02082bSJunchao Zhang     else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->d_UnpackAndAdd;
748eb02082bSJunchao Zhang     else if (op == MPI_PROD)                  *UnpackAndOp = link->d_UnpackAndMult;
749eb02082bSJunchao Zhang     else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->d_UnpackAndMax;
750eb02082bSJunchao Zhang     else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->d_UnpackAndMin;
751eb02082bSJunchao Zhang     else if (op == MPI_LAND)                  *UnpackAndOp = link->d_UnpackAndLAND;
752eb02082bSJunchao Zhang     else if (op == MPI_BAND)                  *UnpackAndOp = link->d_UnpackAndBAND;
753eb02082bSJunchao Zhang     else if (op == MPI_LOR)                   *UnpackAndOp = link->d_UnpackAndLOR;
754eb02082bSJunchao Zhang     else if (op == MPI_BOR)                   *UnpackAndOp = link->d_UnpackAndBOR;
755eb02082bSJunchao Zhang     else if (op == MPI_LXOR)                  *UnpackAndOp = link->d_UnpackAndLXOR;
756eb02082bSJunchao Zhang     else if (op == MPI_BXOR)                  *UnpackAndOp = link->d_UnpackAndBXOR;
757eb02082bSJunchao Zhang     else if (op == MPI_MAXLOC)                *UnpackAndOp = link->d_UnpackAndMaxloc;
758eb02082bSJunchao Zhang     else if (op == MPI_MINLOC)                *UnpackAndOp = link->d_UnpackAndMinloc;
759eb02082bSJunchao Zhang   } else if (mtype == PETSC_MEMTYPE_DEVICE && atomic) {
760eb02082bSJunchao Zhang     if      (op == MPIU_REPLACE)              *UnpackAndOp = link->da_UnpackAndInsert;
761eb02082bSJunchao Zhang     else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->da_UnpackAndAdd;
762eb02082bSJunchao Zhang     else if (op == MPI_PROD)                  *UnpackAndOp = link->da_UnpackAndMult;
763eb02082bSJunchao Zhang     else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->da_UnpackAndMax;
764eb02082bSJunchao Zhang     else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->da_UnpackAndMin;
765eb02082bSJunchao Zhang     else if (op == MPI_LAND)                  *UnpackAndOp = link->da_UnpackAndLAND;
766eb02082bSJunchao Zhang     else if (op == MPI_BAND)                  *UnpackAndOp = link->da_UnpackAndBAND;
767eb02082bSJunchao Zhang     else if (op == MPI_LOR)                   *UnpackAndOp = link->da_UnpackAndLOR;
768eb02082bSJunchao Zhang     else if (op == MPI_BOR)                   *UnpackAndOp = link->da_UnpackAndBOR;
769eb02082bSJunchao Zhang     else if (op == MPI_LXOR)                  *UnpackAndOp = link->da_UnpackAndLXOR;
770eb02082bSJunchao Zhang     else if (op == MPI_BXOR)                  *UnpackAndOp = link->da_UnpackAndBXOR;
771eb02082bSJunchao Zhang     else if (op == MPI_MAXLOC)                *UnpackAndOp = link->da_UnpackAndMaxloc;
772eb02082bSJunchao Zhang     else if (op == MPI_MINLOC)                *UnpackAndOp = link->da_UnpackAndMinloc;
773eb02082bSJunchao Zhang   }
774eb02082bSJunchao Zhang #endif
775eb02082bSJunchao Zhang   else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Wrong PetscMemType %D",mtype);
776eb02082bSJunchao Zhang 
77740e23c03SJunchao Zhang   PetscFunctionReturn(0);
77840e23c03SJunchao Zhang }
77940e23c03SJunchao Zhang 
780eb02082bSJunchao Zhang PetscErrorCode PetscSFPackGetFetchAndOp(PetscSFPack link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**FetchAndOp)(PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,void*,void*))
78140e23c03SJunchao Zhang {
78240e23c03SJunchao Zhang   PetscFunctionBegin;
78340e23c03SJunchao Zhang   *FetchAndOp = NULL;
784eb02082bSJunchao Zhang   if (mtype == PETSC_MEMTYPE_HOST) {
785eb02082bSJunchao Zhang     if (op == MPIU_REPLACE)                   *FetchAndOp = link->h_FetchAndInsert;
786eb02082bSJunchao Zhang     else if (op == MPI_SUM || op == MPIU_SUM) *FetchAndOp = link->h_FetchAndAdd;
787eb02082bSJunchao Zhang     else if (op == MPI_MAX || op == MPIU_MAX) *FetchAndOp = link->h_FetchAndMax;
788eb02082bSJunchao Zhang     else if (op == MPI_MIN || op == MPIU_MIN) *FetchAndOp = link->h_FetchAndMin;
789eb02082bSJunchao Zhang     else if (op == MPI_MAXLOC)                *FetchAndOp = link->h_FetchAndMaxloc;
790eb02082bSJunchao Zhang     else if (op == MPI_MINLOC)                *FetchAndOp = link->h_FetchAndMinloc;
791eb02082bSJunchao Zhang     else if (op == MPI_PROD)                  *FetchAndOp = link->h_FetchAndMult;
792eb02082bSJunchao Zhang     else if (op == MPI_LAND)                  *FetchAndOp = link->h_FetchAndLAND;
793eb02082bSJunchao Zhang     else if (op == MPI_BAND)                  *FetchAndOp = link->h_FetchAndBAND;
794eb02082bSJunchao Zhang     else if (op == MPI_LOR)                   *FetchAndOp = link->h_FetchAndLOR;
795eb02082bSJunchao Zhang     else if (op == MPI_BOR)                   *FetchAndOp = link->h_FetchAndBOR;
796eb02082bSJunchao Zhang     else if (op == MPI_LXOR)                  *FetchAndOp = link->h_FetchAndLXOR;
797eb02082bSJunchao Zhang     else if (op == MPI_BXOR)                  *FetchAndOp = link->h_FetchAndBXOR;
798eb02082bSJunchao Zhang     else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for MPI_Op");
799eb02082bSJunchao Zhang   }
800eb02082bSJunchao Zhang #if defined(PETSC_HAVE_CUDA)
801eb02082bSJunchao Zhang   else if (mtype == PETSC_MEMTYPE_DEVICE && !atomic) {
802eb02082bSJunchao Zhang     if (op == MPIU_REPLACE)                   *FetchAndOp = link->d_FetchAndInsert;
803eb02082bSJunchao Zhang     else if (op == MPI_SUM || op == MPIU_SUM) *FetchAndOp = link->d_FetchAndAdd;
804eb02082bSJunchao Zhang     else if (op == MPI_MAX || op == MPIU_MAX) *FetchAndOp = link->d_FetchAndMax;
805eb02082bSJunchao Zhang     else if (op == MPI_MIN || op == MPIU_MIN) *FetchAndOp = link->d_FetchAndMin;
806eb02082bSJunchao Zhang     else if (op == MPI_MAXLOC)                *FetchAndOp = link->d_FetchAndMaxloc;
807eb02082bSJunchao Zhang     else if (op == MPI_MINLOC)                *FetchAndOp = link->d_FetchAndMinloc;
808eb02082bSJunchao Zhang     else if (op == MPI_PROD)                  *FetchAndOp = link->d_FetchAndMult;
809eb02082bSJunchao Zhang     else if (op == MPI_LAND)                  *FetchAndOp = link->d_FetchAndLAND;
810eb02082bSJunchao Zhang     else if (op == MPI_BAND)                  *FetchAndOp = link->d_FetchAndBAND;
811eb02082bSJunchao Zhang     else if (op == MPI_LOR)                   *FetchAndOp = link->d_FetchAndLOR;
812eb02082bSJunchao Zhang     else if (op == MPI_BOR)                   *FetchAndOp = link->d_FetchAndBOR;
813eb02082bSJunchao Zhang     else if (op == MPI_LXOR)                  *FetchAndOp = link->d_FetchAndLXOR;
814eb02082bSJunchao Zhang     else if (op == MPI_BXOR)                  *FetchAndOp = link->d_FetchAndBXOR;
815eb02082bSJunchao Zhang     else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for MPI_Op");
816eb02082bSJunchao Zhang   } else if (mtype == PETSC_MEMTYPE_DEVICE && atomic) {
817eb02082bSJunchao Zhang     if (op == MPIU_REPLACE)                   *FetchAndOp = link->da_FetchAndInsert;
818eb02082bSJunchao Zhang     else if (op == MPI_SUM || op == MPIU_SUM) *FetchAndOp = link->da_FetchAndAdd;
819eb02082bSJunchao Zhang     else if (op == MPI_MAX || op == MPIU_MAX) *FetchAndOp = link->da_FetchAndMax;
820eb02082bSJunchao Zhang     else if (op == MPI_MIN || op == MPIU_MIN) *FetchAndOp = link->da_FetchAndMin;
821eb02082bSJunchao Zhang     else if (op == MPI_MAXLOC)                *FetchAndOp = link->da_FetchAndMaxloc;
822eb02082bSJunchao Zhang     else if (op == MPI_MINLOC)                *FetchAndOp = link->da_FetchAndMinloc;
823eb02082bSJunchao Zhang     else if (op == MPI_PROD)                  *FetchAndOp = link->da_FetchAndMult;
824eb02082bSJunchao Zhang     else if (op == MPI_LAND)                  *FetchAndOp = link->da_FetchAndLAND;
825eb02082bSJunchao Zhang     else if (op == MPI_BAND)                  *FetchAndOp = link->da_FetchAndBAND;
826eb02082bSJunchao Zhang     else if (op == MPI_LOR)                   *FetchAndOp = link->da_FetchAndLOR;
827eb02082bSJunchao Zhang     else if (op == MPI_BOR)                   *FetchAndOp = link->da_FetchAndBOR;
828eb02082bSJunchao Zhang     else if (op == MPI_LXOR)                  *FetchAndOp = link->da_FetchAndLXOR;
829eb02082bSJunchao Zhang     else if (op == MPI_BXOR)                  *FetchAndOp = link->da_FetchAndBXOR;
830eb02082bSJunchao Zhang     else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for MPI_Op");
831eb02082bSJunchao Zhang   }
832eb02082bSJunchao Zhang #endif
833eb02082bSJunchao Zhang   else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Wrong PetscMemType %D",mtype);
83440e23c03SJunchao Zhang   PetscFunctionReturn(0);
83540e23c03SJunchao Zhang }
83640e23c03SJunchao Zhang 
83740e23c03SJunchao Zhang /*
838eb02082bSJunchao Zhang   Create pack/unpack optimization plans based on indice patterns available
83940e23c03SJunchao Zhang 
84040e23c03SJunchao Zhang    Input Parameters:
841b23bfdefSJunchao Zhang   +  n       - Number of target ranks
842eb02082bSJunchao Zhang   .  offset  - [n+1] For the i-th rank, its associated indices are idx[offset[i], offset[i+1]). offset[0] needs not to be 0.
843b23bfdefSJunchao Zhang   -  idx     - [*]   Array storing indices
84440e23c03SJunchao Zhang 
84540e23c03SJunchao Zhang    Output Parameters:
846eb02082bSJunchao Zhang   +  opt    - Optimization plans. Maybe NULL if no optimization can be built.
84740e23c03SJunchao Zhang */
848eb02082bSJunchao Zhang PetscErrorCode PetscSFPackOptCreate(PetscInt n,const PetscInt *offset,const PetscInt *idx,PetscSFPackOpt *out)
84940e23c03SJunchao Zhang {
85040e23c03SJunchao Zhang   PetscErrorCode ierr;
85140e23c03SJunchao Zhang   PetscInt       i,j,k,n_copies,tot_copies=0,step;
852b23bfdefSJunchao Zhang   PetscBool      strided,optimized=PETSC_FALSE;
85340e23c03SJunchao Zhang   PetscSFPackOpt opt;
85440e23c03SJunchao Zhang 
85540e23c03SJunchao Zhang   PetscFunctionBegin;
856b23bfdefSJunchao Zhang   if (!n) {
857b23bfdefSJunchao Zhang     *out = NULL;
858b23bfdefSJunchao Zhang     PetscFunctionReturn(0);
859b23bfdefSJunchao Zhang   }
860b23bfdefSJunchao Zhang 
86140e23c03SJunchao Zhang   ierr = PetscCalloc1(1,&opt);CHKERRQ(ierr);
862b23bfdefSJunchao Zhang   ierr = PetscCalloc3(n,&opt->type,n+1,&opt->offset,n+1,&opt->copy_offset);CHKERRQ(ierr);
863b23bfdefSJunchao Zhang   ierr = PetscArraycpy(opt->offset,offset,n+1);CHKERRQ(ierr);
864b23bfdefSJunchao Zhang   if (offset[0]) {for (i=0; i<n+1; i++) opt->offset[i] -= offset[0];} /* Zero-base offset[]. Note the packing routine is Pack(count, idx[], ...*/
865b23bfdefSJunchao Zhang 
866b23bfdefSJunchao Zhang   opt->n = n;
86740e23c03SJunchao Zhang 
868*fb61b9e4SStefano Zampini   /* Check if the indices are piece-wise contiguous (if yes, we can optimize a packing with multiple memcpy's ) */
86940e23c03SJunchao Zhang   for (i=0; i<n; i++) { /* for each target processor */
87040e23c03SJunchao Zhang     /* Scan indices to count n_copies -- the number of contiguous pieces for i-th target */
87140e23c03SJunchao Zhang     n_copies = 1;
87240e23c03SJunchao Zhang     for (j=offset[i]; j<offset[i+1]-1; j++) {
87340e23c03SJunchao Zhang       if (idx[j]+1 != idx[j+1]) n_copies++;
87440e23c03SJunchao Zhang     }
87540e23c03SJunchao Zhang     /* If the average length (in no. of indices) of contiguous pieces is long enough, say >=32,
87640e23c03SJunchao Zhang        then it is worth using memcpy for this target. 32 is an arbitrarily chosen number.
87740e23c03SJunchao Zhang      */
87840e23c03SJunchao Zhang     if ((offset[i+1]-offset[i])/n_copies >= 32) {
879b23bfdefSJunchao Zhang       opt->type[i] = PETSCSF_PACKOPT_MULTICOPY;
880b23bfdefSJunchao Zhang       optimized    = PETSC_TRUE;
88140e23c03SJunchao Zhang       tot_copies  += n_copies;
88240e23c03SJunchao Zhang     }
88340e23c03SJunchao Zhang   }
88440e23c03SJunchao Zhang 
88540e23c03SJunchao Zhang   /* Setup memcpy plan for each contiguous piece */
88640e23c03SJunchao Zhang   k    = 0; /* k-th copy */
887b23bfdefSJunchao Zhang   ierr = PetscMalloc4(tot_copies,&opt->copy_start,tot_copies,&opt->copy_length,n,&opt->stride_step,n,&opt->stride_n);CHKERRQ(ierr);
888b23bfdefSJunchao Zhang   for (i=0; i<n; i++) { /* for each target processor */
889b23bfdefSJunchao Zhang     if (opt->type[i] == PETSCSF_PACKOPT_MULTICOPY) {
89040e23c03SJunchao Zhang       n_copies           = 1;
891b23bfdefSJunchao Zhang       opt->copy_start[k] = offset[i] - offset[0];
89240e23c03SJunchao Zhang       for (j=offset[i]; j<offset[i+1]-1; j++) {
89340e23c03SJunchao Zhang         if (idx[j]+1 != idx[j+1]) { /* meet end of a copy (and next copy must exist) */
89440e23c03SJunchao Zhang           n_copies++;
895b23bfdefSJunchao Zhang           opt->copy_start[k+1] = j-offset[0]+1;
896b23bfdefSJunchao Zhang           opt->copy_length[k]  = opt->copy_start[k+1] - opt->copy_start[k];
89740e23c03SJunchao Zhang           k++;
89840e23c03SJunchao Zhang         }
89940e23c03SJunchao Zhang       }
90040e23c03SJunchao Zhang       /* Set copy length of the last copy for this target */
901b23bfdefSJunchao Zhang       opt->copy_length[k] = j-offset[0]+1 - opt->copy_start[k];
90240e23c03SJunchao Zhang       k++;
90340e23c03SJunchao Zhang     }
904b23bfdefSJunchao Zhang     /* Set offset for next target. When opt->type[i]=PETSCSF_PACKOPT_NONE, copy_offsets[i]=copy_offsets[i+1] */
90540e23c03SJunchao Zhang     opt->copy_offset[i+1] = k;
90640e23c03SJunchao Zhang   }
90740e23c03SJunchao Zhang 
90840e23c03SJunchao Zhang   /* Last chance! If the indices do not have long contiguous pieces, are they strided? */
90940e23c03SJunchao Zhang   for (i=0; i<n; i++) { /* for each remote */
910b23bfdefSJunchao Zhang     if (opt->type[i]==PETSCSF_PACKOPT_NONE && (offset[i+1] - offset[i]) >= 16) { /* few indices (<16) are not worth striding */
91140e23c03SJunchao Zhang       strided = PETSC_TRUE;
91240e23c03SJunchao Zhang       step    = idx[offset[i]+1] - idx[offset[i]];
91340e23c03SJunchao Zhang       for (j=offset[i]; j<offset[i+1]-1; j++) {
91440e23c03SJunchao Zhang         if (idx[j]+step != idx[j+1]) { strided = PETSC_FALSE; break; }
91540e23c03SJunchao Zhang       }
91640e23c03SJunchao Zhang       if (strided) {
917b23bfdefSJunchao Zhang         opt->type[i]         = PETSCSF_PACKOPT_STRIDE;
91840e23c03SJunchao Zhang         opt->stride_step[i]  = step;
91940e23c03SJunchao Zhang         opt->stride_n[i]     = offset[i+1] - offset[i];
920b23bfdefSJunchao Zhang         optimized            = PETSC_TRUE;
92140e23c03SJunchao Zhang       }
92240e23c03SJunchao Zhang     }
92340e23c03SJunchao Zhang   }
924b23bfdefSJunchao Zhang   /* If no rank gets optimized, free arrays to save memory */
925b23bfdefSJunchao Zhang   if (!optimized) {
926b23bfdefSJunchao Zhang     ierr = PetscFree3(opt->type,opt->offset,opt->copy_offset);CHKERRQ(ierr);
927b23bfdefSJunchao Zhang     ierr = PetscFree4(opt->copy_start,opt->copy_length,opt->stride_step,opt->stride_n);CHKERRQ(ierr);
92840e23c03SJunchao Zhang     ierr = PetscFree(opt);CHKERRQ(ierr);
92940e23c03SJunchao Zhang     *out = NULL;
93040e23c03SJunchao Zhang   } else *out = opt;
93140e23c03SJunchao Zhang   PetscFunctionReturn(0);
93240e23c03SJunchao Zhang }
93340e23c03SJunchao Zhang 
93464f49babSJed Brown PetscErrorCode PetscSFPackOptDestroy(PetscSFPackOpt *out)
93540e23c03SJunchao Zhang {
93640e23c03SJunchao Zhang   PetscErrorCode ierr;
93740e23c03SJunchao Zhang   PetscSFPackOpt opt = *out;
93840e23c03SJunchao Zhang 
93940e23c03SJunchao Zhang   PetscFunctionBegin;
94040e23c03SJunchao Zhang   if (opt) {
941b23bfdefSJunchao Zhang     ierr = PetscFree3(opt->type,opt->offset,opt->copy_offset);CHKERRQ(ierr);
942b23bfdefSJunchao Zhang     ierr = PetscFree4(opt->copy_start,opt->copy_length,opt->stride_step,opt->stride_n);CHKERRQ(ierr);
94340e23c03SJunchao Zhang     ierr = PetscFree(opt);CHKERRQ(ierr);
94440e23c03SJunchao Zhang     *out = NULL;
94540e23c03SJunchao Zhang   }
94640e23c03SJunchao Zhang   PetscFunctionReturn(0);
94740e23c03SJunchao Zhang }
948