xref: /petsc/src/vec/is/sf/impls/basic/sfpack.c (revision 51ccb202724f7096f70b1178150053ad1fe5b38d)
140e23c03SJunchao Zhang 
240e23c03SJunchao Zhang #include <../src/vec/is/sf/impls/basic/sfpack.h>
340e23c03SJunchao Zhang #include <../src/vec/is/sf/impls/basic/sfbasic.h>
440e23c03SJunchao Zhang 
5eb02082bSJunchao Zhang #if defined(PETSC_HAVE_CUDA)
6eb02082bSJunchao Zhang #include <cuda_runtime.h>
7eb02082bSJunchao Zhang #endif
840e23c03SJunchao Zhang /*
940e23c03SJunchao Zhang  * MPI_Reduce_local is not really useful because it can't handle sparse data and it vectorizes "in the wrong direction",
1040e23c03SJunchao Zhang  * therefore we pack data types manually. This file defines packing routines for the standard data types.
1140e23c03SJunchao Zhang  */
1240e23c03SJunchao Zhang 
13b23bfdefSJunchao Zhang #define CPPJoin2(a,b)         a ##_## b
14b23bfdefSJunchao Zhang #define CPPJoin3(a,b,c)       a ##_## b ##_## c
15b23bfdefSJunchao Zhang #define CPPJoin4_(a,b,c,d)    a##b##_##c##_##d
16b23bfdefSJunchao Zhang #define CPPJoin4(a,b,c,d)     CPPJoin4_(a##_,b,c,d)
1740e23c03SJunchao Zhang 
1840e23c03SJunchao Zhang #define EXECUTE(statement)    statement /* no braces since the statement might declare a variable; braces impose an unwanted scope */
1940e23c03SJunchao Zhang #define IGNORE(statement)     do {} while(0)
2040e23c03SJunchao Zhang 
2140e23c03SJunchao Zhang #define BINARY_OP(r,s,op,t)   do {(r) = (s) op (t);  } while(0)      /* binary ops in the middle such as +, *, && etc. */
2240e23c03SJunchao Zhang #define FUNCTION_OP(r,s,op,t) do {(r) = op((s),(t)); } while(0)      /* ops like a function, such as PetscMax, PetscMin */
23eb02082bSJunchao Zhang #define LXOR_OP(r,s,op,t)     do {(r) = (!(s)) != (!(t));} while(0)  /* logical exclusive OR */
2440e23c03SJunchao Zhang #define PAIRTYPE_OP(r,s,op,t) do {(r).a = (s).a op (t).a; (r).b = (s).b op (t).b;} while(0)
2540e23c03SJunchao Zhang 
26b23bfdefSJunchao Zhang #define PairType(Type1,Type2) Type1##_##Type2 /* typename for struct {Type1 a; Type2 b;} */
2740e23c03SJunchao Zhang 
2840e23c03SJunchao Zhang /* DEF_PackFunc - macro defining a Pack routine
2940e23c03SJunchao Zhang 
3040e23c03SJunchao Zhang    Arguments of the macro:
31b23bfdefSJunchao Zhang    +Type      Type of the basic data in an entry, i.e., int, PetscInt, PetscReal etc. It is not the type of an entry.
32b23bfdefSJunchao Zhang    .BS        Block size for vectorization. It is a factor of bs.
33b23bfdefSJunchao Zhang    -EQ        (bs == BS) ? 1 : 0. EQ is a compile-time const to help compiler optimizations. See below.
3440e23c03SJunchao Zhang 
3540e23c03SJunchao Zhang    Arguments of the Pack routine:
36b23bfdefSJunchao Zhang    +count     Number of indices in idx[]
37b23bfdefSJunchao Zhang    .idx       Indices of entries to packed. NULL means contiguous indices, that is [0,count)
38eb02082bSJunchao Zhang    .link      Provide a context for the current call, such as link->bs, number of basic types in an entry. Ex. if unit is MPI_2INT, then bs=2 and the basic type is int.
39b23bfdefSJunchao Zhang    .opt       Pack optimization plans. NULL means no plan at all.
40b23bfdefSJunchao Zhang    .unpacked  Address of the unpacked data. The entries will be packed are unpacked[idx[i]],for i in [0,count)
41b23bfdefSJunchao Zhang    -packed    Address of the packed data for each rank
4240e23c03SJunchao Zhang  */
43b23bfdefSJunchao Zhang #define DEF_PackFunc(Type,BS,EQ) \
44eb02082bSJunchao Zhang   static PetscErrorCode CPPJoin4(Pack,Type,BS,EQ)(PetscInt count,const PetscInt *idx,PetscSFPack link,PetscSFPackOpt opt,const void *unpacked,void *packed) \
45b23bfdefSJunchao Zhang   {                                                                                                          \
4640e23c03SJunchao Zhang     PetscErrorCode ierr;                                                                                     \
47b23bfdefSJunchao Zhang     const Type     *u = (const Type*)unpacked,*u2;                                                           \
48b23bfdefSJunchao Zhang     Type           *p = (Type*)packed,*p2;                                                                   \
49eb02082bSJunchao Zhang     PetscInt       i,j,k,l,r,step,bs=link->bs;                                                               \
50b23bfdefSJunchao Zhang     const PetscInt *idx2,M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */    \
51b23bfdefSJunchao Zhang     const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */                \
5240e23c03SJunchao Zhang     PetscFunctionBegin;                                                                                      \
53b23bfdefSJunchao Zhang     if (!idx) {ierr = PetscArraycpy(p,u,MBS*count);CHKERRQ(ierr);}  /* Indices are contiguous */             \
54b23bfdefSJunchao Zhang     else if (!opt) { /* No optimizations available */                                                        \
55b23bfdefSJunchao Zhang       for (i=0; i<count; i++)                                                                                \
56eb02082bSJunchao Zhang         for (j=0; j<M; j++)     /* Decent compilers should eliminate this loop when M = const 1 */           \
57eb02082bSJunchao Zhang           for (k=0; k<BS; k++)  /* Compiler either unrolls (BS=1) or vectorizes (BS=2,4,8,etc) this loop */  \
58b23bfdefSJunchao Zhang             p[i*MBS+j*BS+k] = u[idx[i]*MBS+j*BS+k];                                                          \
59b23bfdefSJunchao Zhang     } else {                                                                                                 \
60b23bfdefSJunchao Zhang       for (r=0; r<opt->n; r++) {                                                                             \
61b23bfdefSJunchao Zhang         p2  = p + opt->offset[r]*MBS;                                                                        \
62b23bfdefSJunchao Zhang         if (opt->type[r] == PETSCSF_PACKOPT_NONE) {                                                          \
63b23bfdefSJunchao Zhang           idx2 = idx + opt->offset[r];                                                                       \
64b23bfdefSJunchao Zhang           for (i=0; i<opt->offset[r+1]-opt->offset[r]; i++)                                                  \
65b23bfdefSJunchao Zhang             for (j=0; j<M; j++)                                                                              \
66b23bfdefSJunchao Zhang               for (k=0; k<BS; k++)                                                                           \
67b23bfdefSJunchao Zhang                 p2[i*MBS+j*BS+k] = u[idx2[i]*MBS+j*BS+k];                                                    \
68b23bfdefSJunchao Zhang         } else if (opt->type[r] == PETSCSF_PACKOPT_MULTICOPY) {                                              \
6940e23c03SJunchao Zhang           for (i=opt->copy_offset[r]; i<opt->copy_offset[r+1]; i++) {                                        \
70b23bfdefSJunchao Zhang             u2   = u + idx[opt->copy_start[i]]*MBS;                                                          \
71b23bfdefSJunchao Zhang             l    = opt->copy_length[i]*MBS; /* length in basic type such as MPI_INT */                       \
72b23bfdefSJunchao Zhang             ierr = PetscArraycpy(p2,u2,l);CHKERRQ(ierr);                                                     \
73b23bfdefSJunchao Zhang             p2  += l;                                                                                        \
7440e23c03SJunchao Zhang           }                                                                                                  \
75b23bfdefSJunchao Zhang         } else if (opt->type[r] == PETSCSF_PACKOPT_STRIDE) {                                                 \
76b23bfdefSJunchao Zhang           u2   = u + idx[opt->offset[r]]*MBS;                                                                \
7740e23c03SJunchao Zhang           step = opt->stride_step[r];                                                                        \
7840e23c03SJunchao Zhang           for (i=0; i<opt->stride_n[r]; i++)                                                                 \
79b23bfdefSJunchao Zhang             for (j=0; j<MBS; j++) p2[i*MBS+j] = u2[i*step*MBS+j];                                            \
80b23bfdefSJunchao Zhang         } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Unknown SFPack optimzation type %D",opt->type[r]);   \
8140e23c03SJunchao Zhang       }                                                                                                      \
8240e23c03SJunchao Zhang     }                                                                                                        \
8340e23c03SJunchao Zhang     PetscFunctionReturn(0);                                                                                  \
8440e23c03SJunchao Zhang   }
8540e23c03SJunchao Zhang 
8640e23c03SJunchao Zhang /* DEF_Action - macro defining a Unpack(Fetch)AndInsert routine
8740e23c03SJunchao Zhang 
8840e23c03SJunchao Zhang    Arguments:
8940e23c03SJunchao Zhang   +action     Unpack or Fetch
90b23bfdefSJunchao Zhang   .Type       Type of the data
9140e23c03SJunchao Zhang   .BS         Block size for vectorization
92b23bfdefSJunchao Zhang   .EQ        (bs == BS) ? 1 : 0. EQ is a compile-time const.
9340e23c03SJunchao Zhang   .FILTER     Macro defining what to do with a statement, either EXECUTE or IGNORE
94b23bfdefSJunchao Zhang   .CType      Type with or without the const qualifier, i.e., const Type or Type
95b23bfdefSJunchao Zhang   .Cvoid      void with or without the const qualifier, i.e., const void or void
9640e23c03SJunchao Zhang 
9740e23c03SJunchao Zhang   Notes:
9840e23c03SJunchao Zhang    This macro is not combined with DEF_ActionAndOp because we want to use memcpy in this macro.
99b23bfdefSJunchao Zhang    The two arguments CType and Cvoid are used (instead of one constness argument), because we want to
10040e23c03SJunchao Zhang    get rid of compilation warning "empty macro arguments are undefined in ISO C90". With one constness argument,
10140e23c03SJunchao Zhang    sometimes we input 'const', sometimes we have to input empty.
102b23bfdefSJunchao Zhang 
103b23bfdefSJunchao Zhang    If action is Fetch, we may do Malloc/Free in the routine. It is costly but the expectation is that this case is really rare.
10440e23c03SJunchao Zhang  */
105b23bfdefSJunchao Zhang #define DEF_Action(action,Type,BS,EQ,FILTER,CType,Cvoid)               \
106eb02082bSJunchao Zhang   static PetscErrorCode CPPJoin4(action##AndInsert,Type,BS,EQ)(PetscInt count,const PetscInt *idx,PetscSFPack link,PetscSFPackOpt opt,void *unpacked,Cvoid *packed) \
107b23bfdefSJunchao Zhang   {                                                                                                          \
10840e23c03SJunchao Zhang     PetscErrorCode ierr;                                                                                     \
109b23bfdefSJunchao Zhang     Type           *u = (Type*)unpacked,*u2;                                                                 \
110b23bfdefSJunchao Zhang     CType          *p = (CType*)packed,*p2;                                                                  \
111eb02082bSJunchao Zhang     PetscInt       i,j,k,l,r,step,bs=link->bs;                                                               \
112b23bfdefSJunchao Zhang     const PetscInt *idx2,M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */    \
113b23bfdefSJunchao Zhang     const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */                \
11440e23c03SJunchao Zhang     PetscFunctionBegin;                                                                                      \
115b23bfdefSJunchao Zhang     if (!idx) {                                                                                              \
116b23bfdefSJunchao Zhang       FILTER(Type *v);                                                                                       \
117b23bfdefSJunchao Zhang       FILTER(ierr = PetscMalloc1(count*MBS,&v);CHKERRQ(ierr));                                               \
118b23bfdefSJunchao Zhang       FILTER(ierr = PetscArraycpy(v,u,count*MBS);CHKERRQ(ierr));                                             \
119b23bfdefSJunchao Zhang              ierr = PetscArraycpy(u,p,count*MBS);CHKERRQ(ierr);                                              \
120b23bfdefSJunchao Zhang       FILTER(ierr = PetscArraycpy(p,v,count*MBS);CHKERRQ(ierr));                                             \
12140e23c03SJunchao Zhang       FILTER(ierr = PetscFree(v);CHKERRQ(ierr));                                                             \
122b23bfdefSJunchao Zhang     } else if (!opt) { /* No optimizations available */                                                      \
123b23bfdefSJunchao Zhang       for (i=0; i<count; i++)                                                                                \
124b23bfdefSJunchao Zhang         for (j=0; j<M; j++)                                                                                  \
125b23bfdefSJunchao Zhang           for (k=0; k<BS; k++) {                                                                             \
126b23bfdefSJunchao Zhang             FILTER(Type t                = u[idx[i]*MBS+j*BS+k]);                                            \
127b23bfdefSJunchao Zhang                    u[idx[i]*MBS+j*BS+k]  = p[i*MBS+j*BS+k];                                                  \
128b23bfdefSJunchao Zhang             FILTER(p[i*MBS+j*BS+k]       = t);                                                               \
12940e23c03SJunchao Zhang           }                                                                                                  \
130b23bfdefSJunchao Zhang     } else {                                                                                                 \
131b23bfdefSJunchao Zhang       for (r=0; r<opt->n; r++) {                                                                             \
132b23bfdefSJunchao Zhang         p2 = p + opt->offset[r]*MBS;                                                                         \
133b23bfdefSJunchao Zhang         if (opt->type[r] == PETSCSF_PACKOPT_NONE) {                                                          \
134b23bfdefSJunchao Zhang           idx2 = idx + opt->offset[r];                                                                       \
135b23bfdefSJunchao Zhang           for (i=0; i<opt->offset[r+1]-opt->offset[r]; i++)                                                  \
136b23bfdefSJunchao Zhang             for (j=0; j<M; j++)                                                                              \
137b23bfdefSJunchao Zhang               for (k=0; k<BS; k++) {                                                                         \
138b23bfdefSJunchao Zhang                 FILTER(Type t                = u[idx2[i]*MBS+j*BS+k]);                                       \
139b23bfdefSJunchao Zhang                        u[idx2[i]*MBS+j*BS+k] = p2[i*MBS+j*BS+k];                                             \
140b23bfdefSJunchao Zhang                 FILTER(p2[i*MBS+j*BS+k]      = t);                                                           \
14140e23c03SJunchao Zhang               }                                                                                              \
142b23bfdefSJunchao Zhang         } else if (opt->type[r] == PETSCSF_PACKOPT_MULTICOPY) {                                              \
143b23bfdefSJunchao Zhang           FILTER(Type *v);                                                                                   \
144b23bfdefSJunchao Zhang           FILTER(ierr = PetscMalloc1((opt->offset[r+1]-opt->offset[r])*MBS,&v);CHKERRQ(ierr)); /* max buf */ \
14540e23c03SJunchao Zhang           for (i=opt->copy_offset[r]; i<opt->copy_offset[r+1]; i++) { /* i-th piece */                       \
146b23bfdefSJunchao Zhang             u2 = u + idx[opt->copy_start[i]]*MBS;                                                            \
147b23bfdefSJunchao Zhang             l  = opt->copy_length[i]*MBS;                                                                    \
148da2e4c71SJunchao Zhang             FILTER(ierr = PetscArraycpy(v,u2,l);CHKERRQ(ierr));                                              \
149b23bfdefSJunchao Zhang                    ierr = PetscArraycpy(u2,p2,l);CHKERRQ(ierr);                                              \
150b23bfdefSJunchao Zhang             FILTER(ierr = PetscArraycpy(p2,v,l);CHKERRQ(ierr));                                              \
151b23bfdefSJunchao Zhang             p2 += l;                                                                                         \
15240e23c03SJunchao Zhang           }                                                                                                  \
15340e23c03SJunchao Zhang           FILTER(ierr = PetscFree(v);CHKERRQ(ierr));                                                         \
154b23bfdefSJunchao Zhang         } else if (opt->type[r] == PETSCSF_PACKOPT_STRIDE) {                                                 \
155b23bfdefSJunchao Zhang           u2   = u + idx[opt->offset[r]]*MBS;                                                                \
15640e23c03SJunchao Zhang           step = opt->stride_step[r];                                                                        \
15740e23c03SJunchao Zhang           for (i=0; i<opt->stride_n[r]; i++)                                                                 \
158b23bfdefSJunchao Zhang             for (j=0; j<M; j++)                                                                              \
159b23bfdefSJunchao Zhang               for (k=0; k<BS; k++) {                                                                         \
160b23bfdefSJunchao Zhang                 FILTER(Type t                = u2[i*step*MBS+j*BS+k]);                                       \
161b23bfdefSJunchao Zhang                        u2[i*step*MBS+j*BS+k] = p2[i*MBS+j*BS+k];                                             \
162b23bfdefSJunchao Zhang                 FILTER(p2[i*MBS+j*BS+k]      = t);                                                           \
16340e23c03SJunchao Zhang               }                                                                                              \
164b23bfdefSJunchao Zhang         } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Unknown SFPack optimzation type %D",opt->type[r]);   \
16540e23c03SJunchao Zhang       }                                                                                                      \
16640e23c03SJunchao Zhang     }                                                                                                        \
16740e23c03SJunchao Zhang     PetscFunctionReturn(0);                                                                                  \
16840e23c03SJunchao Zhang   }
16940e23c03SJunchao Zhang 
17040e23c03SJunchao Zhang /* DEF_ActionAndOp - macro defining a Unpack(Fetch)AndOp routine. Op can not be Insert, Maxloc or Minloc
17140e23c03SJunchao Zhang 
17240e23c03SJunchao Zhang    Arguments:
17340e23c03SJunchao Zhang   +action     Unpack or Fetch
17440e23c03SJunchao Zhang   .opname     Name of the Op, such as Add, Mult, LAND, etc.
175b23bfdefSJunchao Zhang   .Type       Type of the data
17640e23c03SJunchao Zhang   .BS         Block size for vectorization
177b23bfdefSJunchao Zhang   .EQ         (bs == BS) ? 1 : 0. EQ is a compile-time const.
17840e23c03SJunchao Zhang   .op         Operator for the op, such as +, *, &&, ||, PetscMax, PetscMin, etc.
17940e23c03SJunchao Zhang   .APPLY      Macro defining application of the op. Could be BINARY_OP, FUNCTION_OP, LXOR_OP or PAIRTYPE_OP
18040e23c03SJunchao Zhang   .FILTER     Macro defining what to do with a statement, either EXECUTE or IGNORE
181b23bfdefSJunchao Zhang   .CType      Type with or without the const qualifier, i.e., const Type or Type
182b23bfdefSJunchao Zhang   -Cvoid      void with or without the const qualifier, i.e., const void or void
18340e23c03SJunchao Zhang  */
184b23bfdefSJunchao Zhang #define DEF_ActionAndOp(action,opname,Type,BS,EQ,op,APPLY,FILTER,CType,Cvoid) \
185eb02082bSJunchao Zhang   static PetscErrorCode CPPJoin4(action##And##opname,Type,BS,EQ)(PetscInt count,const PetscInt *idx,PetscSFPack link,PetscSFPackOpt opt,void *unpacked,Cvoid *packed) \
186b23bfdefSJunchao Zhang   {                                                                                                          \
187b23bfdefSJunchao Zhang     Type           *u = (Type*)unpacked,*u2,t;                                                               \
188b23bfdefSJunchao Zhang     CType          *p = (CType*)packed,*p2;                                                                  \
189eb02082bSJunchao Zhang     PetscInt       i,j,k,l,r,step,bs=link->bs;                                                               \
190b23bfdefSJunchao Zhang     const PetscInt *idx2,M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */    \
191b23bfdefSJunchao Zhang     const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */                \
19240e23c03SJunchao Zhang     PetscFunctionBegin;                                                                                      \
193b23bfdefSJunchao Zhang     if (!idx) {                                                                                              \
194b23bfdefSJunchao Zhang       for (i=0; i<count; i++)                                                                                \
195b23bfdefSJunchao Zhang         for (j=0; j<M; j++)                                                                                  \
196b23bfdefSJunchao Zhang           for (k=0; k<BS; k++) {                                                                             \
197b23bfdefSJunchao Zhang             t    = u[i*MBS+j*BS+k];                                                                          \
198b23bfdefSJunchao Zhang             APPLY (u[i*MBS+j*BS+k],t,op,p[i*MBS+j*BS+k]);                                                    \
199b23bfdefSJunchao Zhang             FILTER(p[i*MBS+j*BS+k] = t);                                                                     \
20040e23c03SJunchao Zhang           }                                                                                                  \
201b23bfdefSJunchao Zhang     } else if (!opt) { /* No optimizations available */                                                      \
202b23bfdefSJunchao Zhang       for (i=0; i<count; i++)                                                                                \
203b23bfdefSJunchao Zhang         for (j=0; j<M; j++)                                                                                  \
204b23bfdefSJunchao Zhang           for (k=0; k<BS; k++) {                                                                             \
205b23bfdefSJunchao Zhang               t    = u[idx[i]*MBS+j*BS+k];                                                                   \
206b23bfdefSJunchao Zhang               APPLY (u[idx[i]*MBS+j*BS+k],t,op,p[i*MBS+j*BS+k]);                                             \
207b23bfdefSJunchao Zhang               FILTER(p[i*MBS+j*BS+k] = t);                                                                   \
20840e23c03SJunchao Zhang           }                                                                                                  \
209b23bfdefSJunchao Zhang     } else {                                                                                                 \
210b23bfdefSJunchao Zhang       for (r=0; r<opt->n; r++) {                                                                             \
211b23bfdefSJunchao Zhang         p2 = p + opt->offset[r]*MBS;                                                                         \
212b23bfdefSJunchao Zhang         if (opt->type[r] == PETSCSF_PACKOPT_NONE) {                                                          \
213b23bfdefSJunchao Zhang           idx2 = idx + opt->offset[r];                                                                       \
214b23bfdefSJunchao Zhang           for (i=0; i<opt->offset[r+1]-opt->offset[r]; i++)                                                  \
215b23bfdefSJunchao Zhang             for (j=0; j<M; j++)                                                                              \
216b23bfdefSJunchao Zhang               for (k=0; k<BS; k++) {                                                                         \
217b23bfdefSJunchao Zhang                 t    = u[idx2[i]*MBS+j*BS+k];                                                                \
218b23bfdefSJunchao Zhang                 APPLY (u[idx2[i]*MBS+j*BS+k],t,op,p2[i*MBS+j*BS+k]);                                         \
219b23bfdefSJunchao Zhang                 FILTER(p2[i*MBS+j*BS+k] = t);                                                                \
22040e23c03SJunchao Zhang               }                                                                                              \
221b23bfdefSJunchao Zhang         } else if (opt->type[r] == PETSCSF_PACKOPT_MULTICOPY) {                                              \
22240e23c03SJunchao Zhang           for (i=opt->copy_offset[r]; i<opt->copy_offset[r+1]; i++) { /* i-th piece */                       \
223b23bfdefSJunchao Zhang             u2 = u + idx[opt->copy_start[i]]*MBS;                                                            \
224b23bfdefSJunchao Zhang             l  = opt->copy_length[i]*MBS;                                                                    \
22540e23c03SJunchao Zhang             for (j=0; j<l; j++) {                                                                            \
22640e23c03SJunchao Zhang               t    = u2[j];                                                                                  \
227b23bfdefSJunchao Zhang               APPLY (u2[j],t,op,p2[j]);                                                                      \
228b23bfdefSJunchao Zhang               FILTER(p2[j] = t);                                                                             \
22940e23c03SJunchao Zhang             }                                                                                                \
230b23bfdefSJunchao Zhang             p2 += l;                                                                                         \
23140e23c03SJunchao Zhang           }                                                                                                  \
232b23bfdefSJunchao Zhang         } else if (opt->type[r] == PETSCSF_PACKOPT_STRIDE) {                                                 \
233b23bfdefSJunchao Zhang           u2   = u + idx[opt->offset[r]]*MBS;                                                                \
23440e23c03SJunchao Zhang           step = opt->stride_step[r];                                                                        \
23540e23c03SJunchao Zhang           for (i=0; i<opt->stride_n[r]; i++)                                                                 \
236b23bfdefSJunchao Zhang             for (j=0; j<M; j++)                                                                              \
237b23bfdefSJunchao Zhang               for (k=0; k<BS; k++) {                                                                         \
238b23bfdefSJunchao Zhang                 t    = u2[i*step*MBS+j*BS+k];                                                                \
239b23bfdefSJunchao Zhang                 APPLY (u2[i*step*MBS+j*BS+k],t,op,p2[i*MBS+j*BS+k]);                                         \
240b23bfdefSJunchao Zhang                 FILTER(p2[i*MBS+j*BS+k] = t);                                                                \
24140e23c03SJunchao Zhang               }                                                                                              \
242b23bfdefSJunchao Zhang         } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Unknown SFPack optimzation type %D",opt->type[r]);   \
24340e23c03SJunchao Zhang       }                                                                                                      \
24440e23c03SJunchao Zhang     }                                                                                                        \
24540e23c03SJunchao Zhang     PetscFunctionReturn(0);                                                                                  \
24640e23c03SJunchao Zhang   }
24740e23c03SJunchao Zhang 
24840e23c03SJunchao Zhang /* DEF_ActionAndXloc - macro defining a Unpack(Fetch)AndMaxloc(Minloc) routine
24940e23c03SJunchao Zhang 
25040e23c03SJunchao Zhang    Arguments:
25140e23c03SJunchao Zhang   +Action     Unpack or Fetch
25240e23c03SJunchao Zhang   .locname    Max or Min
25340e23c03SJunchao Zhang   .type1      Type of the first data in a pair type
25440e23c03SJunchao Zhang   .type2      Type of the second data in a pair type, usually PetscMPIInt for MPI ranks.
25540e23c03SJunchao Zhang   .op         > or <
25640e23c03SJunchao Zhang   .FILTER     Macro defining what to do with a statement, either EXECUTE or IGNORE
257b23bfdefSJunchao Zhang   .CType      Type with or without the const qualifier, i.e., const PairType(Type1,Type2) or PairType(Type1,Type2)
258b23bfdefSJunchao Zhang   -Cvoid      void with or without the const qualifier, i.e., const void or void
25940e23c03SJunchao Zhang  */
260b23bfdefSJunchao Zhang #define DEF_ActionAndXloc(action,locname,Type1,Type2,op,FILTER,CType,Cvoid) \
261eb02082bSJunchao Zhang   static PetscErrorCode CPPJoin4(action##And##locname##loc,PairType(Type1,Type2),1,1)(PetscInt count,const PetscInt *idx,PetscSFPack link,PetscSFPackOpt opt,void *unpacked,Cvoid *packed) { \
262b23bfdefSJunchao Zhang     PairType(Type1,Type2) *u = (PairType(Type1,Type2)*)unpacked;                                             \
263b23bfdefSJunchao Zhang     CType                 *p = (CType*)packed;                                                               \
264b23bfdefSJunchao Zhang     PetscInt              i,j;                                                                               \
265b23bfdefSJunchao Zhang     for (i=0; i<count; i++) {                                                                                \
266eb02082bSJunchao Zhang       FILTER(PairType(Type1,Type2) v);                                                                       \
267b23bfdefSJunchao Zhang       j = idx? idx[i] : i;                                                                                   \
268eb02082bSJunchao Zhang       FILTER(v = u[j]);                                                                                      \
26940e23c03SJunchao Zhang       if (p[i].a op u[j].a) {                                                                                \
27040e23c03SJunchao Zhang         u[j] = p[i];                                                                                         \
27140e23c03SJunchao Zhang       } else if (p[i].a == u[j].a) {                                                                         \
27240e23c03SJunchao Zhang         u[j].b = PetscMin(u[j].b,p[i].b); /* Minimal rank. Ref MPI MAXLOC */                                 \
27340e23c03SJunchao Zhang       }                                                                                                      \
27440e23c03SJunchao Zhang       FILTER(p[i] = v);                                                                                      \
27540e23c03SJunchao Zhang     }                                                                                                        \
27640e23c03SJunchao Zhang     PetscFunctionReturn(0);                                                                                  \
27740e23c03SJunchao Zhang   }
27840e23c03SJunchao Zhang 
279b23bfdefSJunchao Zhang /* Pack, Unpack/Fetch ops */
280b23bfdefSJunchao Zhang #define DEF_Pack(Type,BS,EQ)                                                                   \
281b23bfdefSJunchao Zhang   DEF_PackFunc(Type,BS,EQ)                                                                     \
282b23bfdefSJunchao Zhang   DEF_Action(Unpack,Type,BS,EQ,IGNORE,const Type,const void)                                   \
283b23bfdefSJunchao Zhang   DEF_Action(Fetch, Type,BS,EQ,EXECUTE,Type,void)                                              \
284b23bfdefSJunchao Zhang   static void CPPJoin4(PackInit_Pack,Type,BS,EQ)(PetscSFPack link) {                           \
285eb02082bSJunchao Zhang     link->h_Pack            = CPPJoin4(Pack,           Type,BS,EQ);                            \
286eb02082bSJunchao Zhang     link->h_UnpackAndInsert = CPPJoin4(UnpackAndInsert,Type,BS,EQ);                            \
287eb02082bSJunchao Zhang     link->h_FetchAndInsert  = CPPJoin4(FetchAndInsert, Type,BS,EQ);                            \
28840e23c03SJunchao Zhang   }
28940e23c03SJunchao Zhang 
290b23bfdefSJunchao Zhang /* Add, Mult ops */
291b23bfdefSJunchao Zhang #define DEF_Add(Type,BS,EQ)                                                                    \
292b23bfdefSJunchao Zhang   DEF_ActionAndOp(Unpack,Add, Type,BS,EQ,+,BINARY_OP,IGNORE,const Type,const void)             \
293b23bfdefSJunchao Zhang   DEF_ActionAndOp(Unpack,Mult,Type,BS,EQ,*,BINARY_OP,IGNORE,const Type,const void)             \
294b23bfdefSJunchao Zhang   DEF_ActionAndOp(Fetch, Add, Type,BS,EQ,+,BINARY_OP,EXECUTE,Type,void)                        \
295b23bfdefSJunchao Zhang   DEF_ActionAndOp(Fetch, Mult,Type,BS,EQ,*,BINARY_OP,EXECUTE,Type,void)                        \
296b23bfdefSJunchao Zhang   static void CPPJoin4(PackInit_Add,Type,BS,EQ)(PetscSFPack link) {                            \
297eb02082bSJunchao Zhang     link->h_UnpackAndAdd    = CPPJoin4(UnpackAndAdd,   Type,BS,EQ);                            \
298eb02082bSJunchao Zhang     link->h_UnpackAndMult   = CPPJoin4(UnpackAndMult,  Type,BS,EQ);                            \
299eb02082bSJunchao Zhang     link->h_FetchAndAdd     = CPPJoin4(FetchAndAdd,    Type,BS,EQ);                            \
300eb02082bSJunchao Zhang     link->h_FetchAndMult    = CPPJoin4(FetchAndMult,   Type,BS,EQ);                            \
30140e23c03SJunchao Zhang   }
30240e23c03SJunchao Zhang 
303b23bfdefSJunchao Zhang /* Max, Min ops */
304b23bfdefSJunchao Zhang #define DEF_Cmp(Type,BS,EQ)                                                                    \
305b23bfdefSJunchao Zhang   DEF_ActionAndOp(Unpack,Max,Type,BS,EQ,PetscMax,FUNCTION_OP,IGNORE,const Type,const void)     \
306b23bfdefSJunchao Zhang   DEF_ActionAndOp(Unpack,Min,Type,BS,EQ,PetscMin,FUNCTION_OP,IGNORE,const Type,const void)     \
307b23bfdefSJunchao Zhang   DEF_ActionAndOp(Fetch, Max,Type,BS,EQ,PetscMax,FUNCTION_OP,EXECUTE,Type,void)                \
308b23bfdefSJunchao Zhang   DEF_ActionAndOp(Fetch, Min,Type,BS,EQ,PetscMin,FUNCTION_OP,EXECUTE,Type,void)                \
309b23bfdefSJunchao Zhang   static void CPPJoin4(PackInit_Compare,Type,BS,EQ)(PetscSFPack link) {                        \
310eb02082bSJunchao Zhang     link->h_UnpackAndMax    = CPPJoin4(UnpackAndMax,   Type,BS,EQ);                            \
311eb02082bSJunchao Zhang     link->h_UnpackAndMin    = CPPJoin4(UnpackAndMin,   Type,BS,EQ);                            \
312eb02082bSJunchao Zhang     link->h_FetchAndMax     = CPPJoin4(FetchAndMax ,   Type,BS,EQ);                            \
313eb02082bSJunchao Zhang     link->h_FetchAndMin     = CPPJoin4(FetchAndMin ,   Type,BS,EQ);                            \
314b23bfdefSJunchao Zhang   }
315b23bfdefSJunchao Zhang 
316b23bfdefSJunchao Zhang /* Logical ops.
317b23bfdefSJunchao Zhang   The operator in LXOR_OP should be empty but is &. It is not used. Put here to avoid
31840e23c03SJunchao Zhang   the compilation warning "empty macro arguments are undefined in ISO C90"
31940e23c03SJunchao Zhang  */
320b23bfdefSJunchao Zhang #define DEF_Log(Type,BS,EQ)                                                                    \
321b23bfdefSJunchao Zhang   DEF_ActionAndOp(Unpack,LAND,Type,BS,EQ,&&,BINARY_OP,IGNORE,const Type,const void)            \
322b23bfdefSJunchao Zhang   DEF_ActionAndOp(Unpack,LOR, Type,BS,EQ,||,BINARY_OP,IGNORE,const Type,const void)            \
323b23bfdefSJunchao Zhang   DEF_ActionAndOp(Unpack,LXOR,Type,BS,EQ,&, LXOR_OP,  IGNORE,const Type,const void)            \
324b23bfdefSJunchao Zhang   DEF_ActionAndOp(Fetch, LAND,Type,BS,EQ,&&,BINARY_OP,EXECUTE,Type,void)                       \
325b23bfdefSJunchao Zhang   DEF_ActionAndOp(Fetch, LOR, Type,BS,EQ,||,BINARY_OP,EXECUTE,Type,void)                       \
326b23bfdefSJunchao Zhang   DEF_ActionAndOp(Fetch, LXOR,Type,BS,EQ,&, LXOR_OP,  EXECUTE,Type,void)                       \
327b23bfdefSJunchao Zhang   static void CPPJoin4(PackInit_Logical,Type,BS,EQ)(PetscSFPack link) {                        \
328eb02082bSJunchao Zhang     link->h_UnpackAndLAND   = CPPJoin4(UnpackAndLAND,Type,BS,EQ);                              \
329eb02082bSJunchao Zhang     link->h_UnpackAndLOR    = CPPJoin4(UnpackAndLOR, Type,BS,EQ);                              \
330eb02082bSJunchao Zhang     link->h_UnpackAndLXOR   = CPPJoin4(UnpackAndLXOR,Type,BS,EQ);                              \
331eb02082bSJunchao Zhang     link->h_FetchAndLAND    = CPPJoin4(FetchAndLAND, Type,BS,EQ);                              \
332eb02082bSJunchao Zhang     link->h_FetchAndLOR     = CPPJoin4(FetchAndLOR,  Type,BS,EQ);                              \
333eb02082bSJunchao Zhang     link->h_FetchAndLXOR    = CPPJoin4(FetchAndLXOR, Type,BS,EQ);                              \
33440e23c03SJunchao Zhang   }
33540e23c03SJunchao Zhang 
336b23bfdefSJunchao Zhang /* Bitwise ops */
337b23bfdefSJunchao Zhang #define DEF_Bit(Type,BS,EQ)                                                                    \
338b23bfdefSJunchao Zhang   DEF_ActionAndOp(Unpack,BAND,Type,BS,EQ,&,BINARY_OP,IGNORE,const Type,const void)             \
339b23bfdefSJunchao Zhang   DEF_ActionAndOp(Unpack,BOR, Type,BS,EQ,|,BINARY_OP,IGNORE,const Type,const void)             \
340b23bfdefSJunchao Zhang   DEF_ActionAndOp(Unpack,BXOR,Type,BS,EQ,^,BINARY_OP,IGNORE,const Type,const void)             \
341b23bfdefSJunchao Zhang   DEF_ActionAndOp(Fetch, BAND,Type,BS,EQ,&,BINARY_OP,EXECUTE,Type,void)                        \
342b23bfdefSJunchao Zhang   DEF_ActionAndOp(Fetch, BOR, Type,BS,EQ,|,BINARY_OP,EXECUTE,Type,void)                        \
343b23bfdefSJunchao Zhang   DEF_ActionAndOp(Fetch, BXOR,Type,BS,EQ,^,BINARY_OP,EXECUTE,Type,void)                        \
344b23bfdefSJunchao Zhang   static void CPPJoin4(PackInit_Bitwise,Type,BS,EQ)(PetscSFPack link) {                        \
345eb02082bSJunchao Zhang     link->h_UnpackAndBAND   = CPPJoin4(UnpackAndBAND,Type,BS,EQ);                              \
346eb02082bSJunchao Zhang     link->h_UnpackAndBOR    = CPPJoin4(UnpackAndBOR, Type,BS,EQ);                              \
347eb02082bSJunchao Zhang     link->h_UnpackAndBXOR   = CPPJoin4(UnpackAndBXOR,Type,BS,EQ);                              \
348eb02082bSJunchao Zhang     link->h_FetchAndBAND    = CPPJoin4(FetchAndBAND, Type,BS,EQ);                              \
349eb02082bSJunchao Zhang     link->h_FetchAndBOR     = CPPJoin4(FetchAndBOR,  Type,BS,EQ);                              \
350eb02082bSJunchao Zhang     link->h_FetchAndBXOR    = CPPJoin4(FetchAndBXOR, Type,BS,EQ);                              \
35140e23c03SJunchao Zhang   }
35240e23c03SJunchao Zhang 
353b23bfdefSJunchao Zhang /* Maxloc, Minloc */
354b23bfdefSJunchao Zhang #define DEF_Xloc(Type1,Type2)                                                                  \
355b23bfdefSJunchao Zhang   DEF_ActionAndXloc(Unpack,Max,Type1,Type2,>,IGNORE,const PairType(Type1,Type2),const void)    \
356b23bfdefSJunchao Zhang   DEF_ActionAndXloc(Unpack,Min,Type1,Type2,<,IGNORE,const PairType(Type1,Type2),const void)    \
357b23bfdefSJunchao Zhang   DEF_ActionAndXloc(Fetch, Max,Type1,Type2,>,EXECUTE,PairType(Type1,Type2),void)               \
358b23bfdefSJunchao Zhang   DEF_ActionAndXloc(Fetch, Min,Type1,Type2,<,EXECUTE,PairType(Type1,Type2),void)               \
359b23bfdefSJunchao Zhang   static void CPPJoin3(PackInit_Xloc,Type1,Type2)(PetscSFPack link) {                          \
360eb02082bSJunchao Zhang     link->h_UnpackAndMaxloc = CPPJoin4(UnpackAndMaxloc,PairType(Type1,Type2),1,1);             \
361eb02082bSJunchao Zhang     link->h_UnpackAndMinloc = CPPJoin4(UnpackAndMinloc,PairType(Type1,Type2),1,1);             \
362eb02082bSJunchao Zhang     link->h_FetchAndMaxloc  = CPPJoin4(FetchAndMaxloc, PairType(Type1,Type2),1,1);             \
363eb02082bSJunchao Zhang     link->h_FetchAndMinloc  = CPPJoin4(FetchAndMinloc, PairType(Type1,Type2),1,1);             \
36440e23c03SJunchao Zhang   }
36540e23c03SJunchao Zhang 
366b23bfdefSJunchao Zhang #define DEF_IntegerType(Type,BS,EQ)                                                            \
367b23bfdefSJunchao Zhang   DEF_Pack(Type,BS,EQ)                                                                         \
368b23bfdefSJunchao Zhang   DEF_Add(Type,BS,EQ)                                                                          \
369b23bfdefSJunchao Zhang   DEF_Cmp(Type,BS,EQ)                                                                          \
370b23bfdefSJunchao Zhang   DEF_Log(Type,BS,EQ)                                                                          \
371b23bfdefSJunchao Zhang   DEF_Bit(Type,BS,EQ)                                                                          \
372b23bfdefSJunchao Zhang   static void CPPJoin4(PackInit_IntegerType,Type,BS,EQ)(PetscSFPack link) {                    \
373b23bfdefSJunchao Zhang     CPPJoin4(PackInit_Pack,Type,BS,EQ)(link);                                                  \
374b23bfdefSJunchao Zhang     CPPJoin4(PackInit_Add,Type,BS,EQ)(link);                                                   \
375b23bfdefSJunchao Zhang     CPPJoin4(PackInit_Compare,Type,BS,EQ)(link);                                               \
376b23bfdefSJunchao Zhang     CPPJoin4(PackInit_Logical,Type,BS,EQ)(link);                                               \
377b23bfdefSJunchao Zhang     CPPJoin4(PackInit_Bitwise,Type,BS,EQ)(link);                                               \
37840e23c03SJunchao Zhang   }
37940e23c03SJunchao Zhang 
380b23bfdefSJunchao Zhang #define DEF_RealType(Type,BS,EQ)                                                               \
381b23bfdefSJunchao Zhang   DEF_Pack(Type,BS,EQ)                                                                         \
382b23bfdefSJunchao Zhang   DEF_Add(Type,BS,EQ)                                                                          \
383b23bfdefSJunchao Zhang   DEF_Cmp(Type,BS,EQ)                                                                          \
384b23bfdefSJunchao Zhang   static void CPPJoin4(PackInit_RealType,Type,BS,EQ)(PetscSFPack link) {                       \
385b23bfdefSJunchao Zhang     CPPJoin4(PackInit_Pack,Type,BS,EQ)(link);                                                  \
386b23bfdefSJunchao Zhang     CPPJoin4(PackInit_Add,Type,BS,EQ)(link);                                                   \
387b23bfdefSJunchao Zhang     CPPJoin4(PackInit_Compare,Type,BS,EQ)(link);                                               \
388b23bfdefSJunchao Zhang   }
38940e23c03SJunchao Zhang 
39040e23c03SJunchao Zhang #if defined(PETSC_HAVE_COMPLEX)
391b23bfdefSJunchao Zhang #define DEF_ComplexType(Type,BS,EQ)                                                            \
392b23bfdefSJunchao Zhang   DEF_Pack(Type,BS,EQ)                                                                         \
393b23bfdefSJunchao Zhang   DEF_Add(Type,BS,EQ)                                                                          \
394b23bfdefSJunchao Zhang   static void CPPJoin4(PackInit_ComplexType,Type,BS,EQ)(PetscSFPack link) {                    \
395b23bfdefSJunchao Zhang     CPPJoin4(PackInit_Pack,Type,BS,EQ)(link);                                                  \
396b23bfdefSJunchao Zhang     CPPJoin4(PackInit_Add,Type,BS,EQ)(link);                                                   \
397b23bfdefSJunchao Zhang   }
39840e23c03SJunchao Zhang #endif
399b23bfdefSJunchao Zhang 
400b23bfdefSJunchao Zhang #define DEF_DumbType(Type,BS,EQ)                                                               \
401b23bfdefSJunchao Zhang   DEF_Pack(Type,BS,EQ)                                                                         \
402b23bfdefSJunchao Zhang   static void CPPJoin4(PackInit_DumbType,Type,BS,EQ)(PetscSFPack link) {                       \
403b23bfdefSJunchao Zhang     CPPJoin4(PackInit_Pack,Type,BS,EQ)(link);                                                  \
404b23bfdefSJunchao Zhang   }
405b23bfdefSJunchao Zhang 
406b23bfdefSJunchao Zhang /* Maxloc, Minloc */
407b23bfdefSJunchao Zhang #define DEF_PairType(Type1,Type2)                                                              \
408b23bfdefSJunchao Zhang   typedef struct {Type1 a; Type2 b;} PairType(Type1,Type2);                                    \
409b23bfdefSJunchao Zhang   DEF_Pack(PairType(Type1,Type2),1,1)                                                          \
410b23bfdefSJunchao Zhang   DEF_Xloc(Type1,Type2)                                                                        \
411b23bfdefSJunchao Zhang   static void CPPJoin3(PackInit_PairType,Type1,Type2)(PetscSFPack link) {                      \
412b23bfdefSJunchao Zhang     CPPJoin4(PackInit_Pack,PairType(Type1,Type2),1,1)(link);                                   \
413b23bfdefSJunchao Zhang     CPPJoin3(PackInit_Xloc,Type1,Type2)(link);                                                 \
414b23bfdefSJunchao Zhang   }
415b23bfdefSJunchao Zhang 
416b23bfdefSJunchao Zhang DEF_IntegerType(PetscInt,1,1) /* unit = 1 MPIU_INT  */
417b23bfdefSJunchao Zhang DEF_IntegerType(PetscInt,2,1) /* unit = 2 MPIU_INTs */
418b23bfdefSJunchao Zhang DEF_IntegerType(PetscInt,4,1) /* unit = 4 MPIU_INTs */
419b23bfdefSJunchao Zhang DEF_IntegerType(PetscInt,8,1) /* unit = 8 MPIU_INTs */
420b23bfdefSJunchao Zhang DEF_IntegerType(PetscInt,1,0) /* unit = 1*n MPIU_INTs, n>1 */
421b23bfdefSJunchao Zhang DEF_IntegerType(PetscInt,2,0) /* unit = 2*n MPIU_INTs, n>1 */
422b23bfdefSJunchao Zhang DEF_IntegerType(PetscInt,4,0) /* unit = 4*n MPIU_INTs, n>1 */
423b23bfdefSJunchao Zhang DEF_IntegerType(PetscInt,8,0) /* unit = 8*n MPIU_INTs, n>1. Routines with bigger BS are tried first. */
424b23bfdefSJunchao Zhang 
425b23bfdefSJunchao Zhang #if defined(PETSC_USE_64BIT_INDICES) /* Do not need (though it is OK) to generate redundant functions if PetscInt is int */
426b23bfdefSJunchao Zhang DEF_IntegerType(int,1,1)
427b23bfdefSJunchao Zhang DEF_IntegerType(int,2,1)
428b23bfdefSJunchao Zhang DEF_IntegerType(int,4,1)
429b23bfdefSJunchao Zhang DEF_IntegerType(int,8,1)
430b23bfdefSJunchao Zhang DEF_IntegerType(int,1,0)
431b23bfdefSJunchao Zhang DEF_IntegerType(int,2,0)
432b23bfdefSJunchao Zhang DEF_IntegerType(int,4,0)
433b23bfdefSJunchao Zhang DEF_IntegerType(int,8,0)
434b23bfdefSJunchao Zhang #endif
435b23bfdefSJunchao Zhang 
436b23bfdefSJunchao Zhang /* The typedefs are used to get a typename without space that CPPJoin can handle */
437b23bfdefSJunchao Zhang typedef signed char SignedChar;
438b23bfdefSJunchao Zhang DEF_IntegerType(SignedChar,1,1)
439b23bfdefSJunchao Zhang DEF_IntegerType(SignedChar,2,1)
440b23bfdefSJunchao Zhang DEF_IntegerType(SignedChar,4,1)
441b23bfdefSJunchao Zhang DEF_IntegerType(SignedChar,8,1)
442b23bfdefSJunchao Zhang DEF_IntegerType(SignedChar,1,0)
443b23bfdefSJunchao Zhang DEF_IntegerType(SignedChar,2,0)
444b23bfdefSJunchao Zhang DEF_IntegerType(SignedChar,4,0)
445b23bfdefSJunchao Zhang DEF_IntegerType(SignedChar,8,0)
446b23bfdefSJunchao Zhang 
447b23bfdefSJunchao Zhang typedef unsigned char UnsignedChar;
448b23bfdefSJunchao Zhang DEF_IntegerType(UnsignedChar,1,1)
449b23bfdefSJunchao Zhang DEF_IntegerType(UnsignedChar,2,1)
450b23bfdefSJunchao Zhang DEF_IntegerType(UnsignedChar,4,1)
451b23bfdefSJunchao Zhang DEF_IntegerType(UnsignedChar,8,1)
452b23bfdefSJunchao Zhang DEF_IntegerType(UnsignedChar,1,0)
453b23bfdefSJunchao Zhang DEF_IntegerType(UnsignedChar,2,0)
454b23bfdefSJunchao Zhang DEF_IntegerType(UnsignedChar,4,0)
455b23bfdefSJunchao Zhang DEF_IntegerType(UnsignedChar,8,0)
456b23bfdefSJunchao Zhang 
457b23bfdefSJunchao Zhang DEF_RealType(PetscReal,1,1)
458b23bfdefSJunchao Zhang DEF_RealType(PetscReal,2,1)
459b23bfdefSJunchao Zhang DEF_RealType(PetscReal,4,1)
460b23bfdefSJunchao Zhang DEF_RealType(PetscReal,8,1)
461b23bfdefSJunchao Zhang DEF_RealType(PetscReal,1,0)
462b23bfdefSJunchao Zhang DEF_RealType(PetscReal,2,0)
463b23bfdefSJunchao Zhang DEF_RealType(PetscReal,4,0)
464b23bfdefSJunchao Zhang DEF_RealType(PetscReal,8,0)
465b23bfdefSJunchao Zhang 
466b23bfdefSJunchao Zhang #if defined(PETSC_HAVE_COMPLEX)
467b23bfdefSJunchao Zhang DEF_ComplexType(PetscComplex,1,1)
468b23bfdefSJunchao Zhang DEF_ComplexType(PetscComplex,2,1)
469b23bfdefSJunchao Zhang DEF_ComplexType(PetscComplex,4,1)
470b23bfdefSJunchao Zhang DEF_ComplexType(PetscComplex,8,1)
471b23bfdefSJunchao Zhang DEF_ComplexType(PetscComplex,1,0)
472b23bfdefSJunchao Zhang DEF_ComplexType(PetscComplex,2,0)
473b23bfdefSJunchao Zhang DEF_ComplexType(PetscComplex,4,0)
474b23bfdefSJunchao Zhang DEF_ComplexType(PetscComplex,8,0)
475b23bfdefSJunchao Zhang #endif
476b23bfdefSJunchao Zhang 
477b23bfdefSJunchao Zhang DEF_PairType(int,int)
478b23bfdefSJunchao Zhang DEF_PairType(PetscInt,PetscInt)
479b23bfdefSJunchao Zhang 
480b23bfdefSJunchao Zhang /* If we don't know the basic type, we treat it as a stream of chars or ints */
481b23bfdefSJunchao Zhang DEF_DumbType(char,1,1)
482b23bfdefSJunchao Zhang DEF_DumbType(char,2,1)
483b23bfdefSJunchao Zhang DEF_DumbType(char,4,1)
484b23bfdefSJunchao Zhang DEF_DumbType(char,1,0)
485b23bfdefSJunchao Zhang DEF_DumbType(char,2,0)
486b23bfdefSJunchao Zhang DEF_DumbType(char,4,0)
487b23bfdefSJunchao Zhang 
488eb02082bSJunchao Zhang typedef int DumbInt; /* To have a different name than 'int' used above. The name is used to make routine names. */
489b23bfdefSJunchao Zhang DEF_DumbType(DumbInt,1,1)
490b23bfdefSJunchao Zhang DEF_DumbType(DumbInt,2,1)
491b23bfdefSJunchao Zhang DEF_DumbType(DumbInt,4,1)
492b23bfdefSJunchao Zhang DEF_DumbType(DumbInt,8,1)
493b23bfdefSJunchao Zhang DEF_DumbType(DumbInt,1,0)
494b23bfdefSJunchao Zhang DEF_DumbType(DumbInt,2,0)
495b23bfdefSJunchao Zhang DEF_DumbType(DumbInt,4,0)
496b23bfdefSJunchao Zhang DEF_DumbType(DumbInt,8,0)
49740e23c03SJunchao Zhang 
49840e23c03SJunchao Zhang #if !defined(PETSC_HAVE_MPI_TYPE_DUP)
49940e23c03SJunchao Zhang PETSC_STATIC_INLINE int MPI_Type_dup(MPI_Datatype datatype,MPI_Datatype *newtype)
50040e23c03SJunchao Zhang {
50140e23c03SJunchao Zhang   int ierr;
50240e23c03SJunchao Zhang   ierr = MPI_Type_contiguous(1,datatype,newtype); if (ierr) return ierr;
50340e23c03SJunchao Zhang   ierr = MPI_Type_commit(newtype); if (ierr) return ierr;
50440e23c03SJunchao Zhang   return MPI_SUCCESS;
50540e23c03SJunchao Zhang }
50640e23c03SJunchao Zhang #endif
50740e23c03SJunchao Zhang 
508637e6665SJunchao Zhang PetscErrorCode PetscSFPackGetInUse(PetscSF sf,MPI_Datatype unit,const void *rootdata,const void *leafdata,PetscCopyMode cmode,PetscSFPack *mylink)
50940e23c03SJunchao Zhang {
51040e23c03SJunchao Zhang   PetscErrorCode    ierr;
51140e23c03SJunchao Zhang   PetscSFPack       link,*p;
51240e23c03SJunchao Zhang   PetscSF_Basic     *bas=(PetscSF_Basic*)sf->data;
51340e23c03SJunchao Zhang 
51440e23c03SJunchao Zhang   PetscFunctionBegin;
51540e23c03SJunchao Zhang   /* Look for types in cache */
51640e23c03SJunchao Zhang   for (p=&bas->inuse; (link=*p); p=&link->next) {
51740e23c03SJunchao Zhang     PetscBool match;
51840e23c03SJunchao Zhang     ierr = MPIPetsc_Type_compare(unit,link->unit,&match);CHKERRQ(ierr);
519637e6665SJunchao Zhang     if (match && (rootdata == link->rootdata) && (leafdata == link->leafdata)) {
52040e23c03SJunchao Zhang       switch (cmode) {
52140e23c03SJunchao Zhang       case PETSC_OWN_POINTER: *p = link->next; break; /* Remove from inuse list */
52240e23c03SJunchao Zhang       case PETSC_USE_POINTER: break;
52340e23c03SJunchao Zhang       default: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"invalid cmode");
52440e23c03SJunchao Zhang       }
52540e23c03SJunchao Zhang       *mylink = link;
52640e23c03SJunchao Zhang       PetscFunctionReturn(0);
52740e23c03SJunchao Zhang     }
52840e23c03SJunchao Zhang   }
52940e23c03SJunchao Zhang   SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Could not find pack");
53040e23c03SJunchao Zhang   PetscFunctionReturn(0);
53140e23c03SJunchao Zhang }
53240e23c03SJunchao Zhang 
53340e23c03SJunchao Zhang PetscErrorCode PetscSFPackReclaim(PetscSF sf,PetscSFPack *link)
53440e23c03SJunchao Zhang {
53540e23c03SJunchao Zhang   PetscSF_Basic     *bas=(PetscSF_Basic*)sf->data;
53640e23c03SJunchao Zhang 
53740e23c03SJunchao Zhang   PetscFunctionBegin;
538637e6665SJunchao Zhang   (*link)->rootdata = NULL;
539637e6665SJunchao Zhang   (*link)->leafdata = NULL;
54040e23c03SJunchao Zhang   (*link)->next     = bas->avail;
54140e23c03SJunchao Zhang   bas->avail        = *link;
54240e23c03SJunchao Zhang   *link             = NULL;
54340e23c03SJunchao Zhang   PetscFunctionReturn(0);
54440e23c03SJunchao Zhang }
54540e23c03SJunchao Zhang 
546eb02082bSJunchao Zhang /* Destroy all links, i.e., PetscSFPacks in the linked list, usually named 'avail' */
547*51ccb202SJunchao Zhang PetscErrorCode PetscSFPackDestroyAvailable(PetscSF sf,PetscSFPack *avail)
548eb02082bSJunchao Zhang {
549eb02082bSJunchao Zhang   PetscErrorCode    ierr;
550eb02082bSJunchao Zhang   PetscSFPack       link=*avail,next;
551eb02082bSJunchao Zhang   PetscInt          i;
552eb02082bSJunchao Zhang 
553eb02082bSJunchao Zhang   PetscFunctionBegin;
554eb02082bSJunchao Zhang   for (; link; link=next) {
555eb02082bSJunchao Zhang     next = link->next;
556eb02082bSJunchao Zhang     if (!link->isbuiltin) {ierr = MPI_Type_free(&link->unit);CHKERRQ(ierr);}
557eb02082bSJunchao Zhang     for (i=0; i<(link->nrootreqs+link->nleafreqs)*4; i++) { /* Persistent reqs must be freed. */
558eb02082bSJunchao Zhang       if (link->reqs[i] != MPI_REQUEST_NULL) {ierr = MPI_Request_free(&link->reqs[i]);CHKERRQ(ierr);}
559eb02082bSJunchao Zhang     }
560eb02082bSJunchao Zhang     ierr = PetscFree(link->reqs);CHKERRQ(ierr);
561*51ccb202SJunchao Zhang 
562*51ccb202SJunchao Zhang #if defined(PETSC_HAVE_CUDA)
563*51ccb202SJunchao Zhang     if (!use_gpu_aware_mpi && sf->use_pinned_buf) { /* In case the buffers are allocated specially */
564*51ccb202SJunchao Zhang       if (link->rootmtype == PETSC_MEMTYPE_DEVICE) {ierr = PetscFreePinnedMemory(link->rootbuf[PETSC_MEMTYPE_HOST]);CHKERRQ(ierr);}
565*51ccb202SJunchao Zhang       if (link->leafmtype == PETSC_MEMTYPE_DEVICE) {ierr = PetscFreePinnedMemory(link->leafbuf[PETSC_MEMTYPE_HOST]);CHKERRQ(ierr);}
566*51ccb202SJunchao Zhang     }
567*51ccb202SJunchao Zhang #endif
568*51ccb202SJunchao Zhang 
569eb02082bSJunchao Zhang     ierr = PetscFreeWithMemType(PETSC_MEMTYPE_HOST,link->rootbuf[PETSC_MEMTYPE_HOST]);CHKERRQ(ierr);
570eb02082bSJunchao Zhang     ierr = PetscFreeWithMemType(PETSC_MEMTYPE_HOST,link->leafbuf[PETSC_MEMTYPE_HOST]);CHKERRQ(ierr);
571eb02082bSJunchao Zhang     ierr = PetscFreeWithMemType(PETSC_MEMTYPE_HOST,link->selfbuf[PETSC_MEMTYPE_HOST]);CHKERRQ(ierr);
572eb02082bSJunchao Zhang 
573eb02082bSJunchao Zhang #if defined(PETSC_HAVE_CUDA)
574eb02082bSJunchao Zhang     ierr = PetscFreeWithMemType(PETSC_MEMTYPE_DEVICE,link->rootbuf[PETSC_MEMTYPE_DEVICE]);CHKERRQ(ierr);
575eb02082bSJunchao Zhang     ierr = PetscFreeWithMemType(PETSC_MEMTYPE_DEVICE,link->leafbuf[PETSC_MEMTYPE_DEVICE]);CHKERRQ(ierr);
576eb02082bSJunchao Zhang     ierr = PetscFreeWithMemType(PETSC_MEMTYPE_DEVICE,link->selfbuf[PETSC_MEMTYPE_DEVICE]);CHKERRQ(ierr);
577eb02082bSJunchao Zhang     if (link->stream) {cudaError_t err =  cudaStreamDestroy(link->stream);CHKERRCUDA(err); link->stream = NULL;}
578eb02082bSJunchao Zhang #endif
579eb02082bSJunchao Zhang     ierr = PetscFree(link);CHKERRQ(ierr);
580eb02082bSJunchao Zhang   }
581eb02082bSJunchao Zhang   *avail = NULL;
582eb02082bSJunchao Zhang   PetscFunctionReturn(0);
583eb02082bSJunchao Zhang }
584eb02082bSJunchao Zhang 
5859d1c8addSJunchao Zhang /* Error out on unsupported overlapped communications */
586637e6665SJunchao Zhang PetscErrorCode PetscSFPackSetErrorOnUnsupportedOverlap(PetscSF sf,MPI_Datatype unit,const void *rootdata,const void *leafdata)
5879d1c8addSJunchao Zhang {
5889d1c8addSJunchao Zhang   PetscErrorCode    ierr;
5899d1c8addSJunchao Zhang   PetscSFPack       link,*p;
5909d1c8addSJunchao Zhang   PetscSF_Basic     *bas=(PetscSF_Basic*)sf->data;
5919d1c8addSJunchao Zhang   PetscBool         match;
5929d1c8addSJunchao Zhang 
5939d1c8addSJunchao Zhang   PetscFunctionBegin;
59418fb5014SJunchao Zhang   /* Look up links in use and error out if there is a match. When both rootdata and leafdata are NULL, ignore
59518fb5014SJunchao Zhang      the potential overlapping since this process does not participate in communication. Overlapping is harmless.
59618fb5014SJunchao Zhang   */
597637e6665SJunchao Zhang   if (rootdata || leafdata) {
5989d1c8addSJunchao Zhang     for (p=&bas->inuse; (link=*p); p=&link->next) {
5999d1c8addSJunchao Zhang       ierr = MPIPetsc_Type_compare(unit,link->unit,&match);CHKERRQ(ierr);
600637e6665SJunchao Zhang       if (match && (rootdata == link->rootdata) && (leafdata == link->leafdata)) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for overlapped PetscSF communications with the same SF, rootdata(%p), leafdata(%p) and data type. You can undo the overlap to avoid the error.",rootdata,leafdata);
6019d1c8addSJunchao Zhang     }
60218fb5014SJunchao Zhang   }
6039d1c8addSJunchao Zhang   PetscFunctionReturn(0);
6049d1c8addSJunchao Zhang }
6059d1c8addSJunchao Zhang 
606eb02082bSJunchao Zhang PetscErrorCode PetscSFPackSetUp_Host(PetscSF sf,PetscSFPack link,MPI_Datatype unit)
60740e23c03SJunchao Zhang {
60840e23c03SJunchao Zhang   PetscErrorCode ierr;
609b23bfdefSJunchao Zhang   PetscInt       nSignedChar=0,nUnsignedChar=0,nInt=0,nPetscInt=0,nPetscReal=0;
610b23bfdefSJunchao Zhang   PetscBool      is2Int,is2PetscInt;
61140e23c03SJunchao Zhang   PetscMPIInt    ni,na,nd,combiner;
61240e23c03SJunchao Zhang #if defined(PETSC_HAVE_COMPLEX)
613b23bfdefSJunchao Zhang   PetscInt       nPetscComplex=0;
61440e23c03SJunchao Zhang #endif
61540e23c03SJunchao Zhang 
61640e23c03SJunchao Zhang   PetscFunctionBegin;
617b23bfdefSJunchao Zhang   ierr = MPIPetsc_Type_compare_contig(unit,MPI_SIGNED_CHAR,  &nSignedChar);CHKERRQ(ierr);
618b23bfdefSJunchao Zhang   ierr = MPIPetsc_Type_compare_contig(unit,MPI_UNSIGNED_CHAR,&nUnsignedChar);CHKERRQ(ierr);
619b23bfdefSJunchao Zhang   /* MPI_CHAR is treated below as a dumb type that does not support reduction according to MPI standard */
620b23bfdefSJunchao Zhang   ierr = MPIPetsc_Type_compare_contig(unit,MPI_INT,  &nInt);CHKERRQ(ierr);
621b23bfdefSJunchao Zhang   ierr = MPIPetsc_Type_compare_contig(unit,MPIU_INT, &nPetscInt);CHKERRQ(ierr);
622b23bfdefSJunchao Zhang   ierr = MPIPetsc_Type_compare_contig(unit,MPIU_REAL,&nPetscReal);CHKERRQ(ierr);
62340e23c03SJunchao Zhang #if defined(PETSC_HAVE_COMPLEX)
624b23bfdefSJunchao Zhang   ierr = MPIPetsc_Type_compare_contig(unit,MPIU_COMPLEX,&nPetscComplex);CHKERRQ(ierr);
62540e23c03SJunchao Zhang #endif
62640e23c03SJunchao Zhang   ierr = MPIPetsc_Type_compare(unit,MPI_2INT,&is2Int);CHKERRQ(ierr);
62740e23c03SJunchao Zhang   ierr = MPIPetsc_Type_compare(unit,MPIU_2INT,&is2PetscInt);CHKERRQ(ierr);
628b23bfdefSJunchao Zhang   /* TODO: shaell we also handle Fortran MPI_2REAL? */
62940e23c03SJunchao Zhang   ierr = MPI_Type_get_envelope(unit,&ni,&na,&nd,&combiner);CHKERRQ(ierr);
6305ad15460SJunchao Zhang   link->isbuiltin = (combiner == MPI_COMBINER_NAMED) ? PETSC_TRUE : PETSC_FALSE; /* unit is MPI builtin */
631b23bfdefSJunchao Zhang   link->bs = 1; /* default */
63240e23c03SJunchao Zhang 
633eb02082bSJunchao Zhang   if (is2Int) {
634eb02082bSJunchao Zhang     PackInit_PairType_int_int(link);
635eb02082bSJunchao Zhang     link->bs        = 1;
636eb02082bSJunchao Zhang     link->unitbytes = 2*sizeof(int);
6375ad15460SJunchao Zhang     link->isbuiltin = PETSC_TRUE; /* unit is PETSc builtin */
638eb02082bSJunchao Zhang     link->basicunit = MPI_2INT;
6395ad15460SJunchao Zhang     link->unit      = MPI_2INT;
640eb02082bSJunchao Zhang   } else if (is2PetscInt) { /* TODO: when is2PetscInt and nPetscInt=2, we don't know which path to take. The two paths support different ops. */
641eb02082bSJunchao Zhang     PackInit_PairType_PetscInt_PetscInt(link);
642eb02082bSJunchao Zhang     link->bs        = 1;
643eb02082bSJunchao Zhang     link->unitbytes = 2*sizeof(PetscInt);
644eb02082bSJunchao Zhang     link->basicunit = MPIU_2INT;
6455ad15460SJunchao Zhang     link->isbuiltin = PETSC_TRUE; /* unit is PETSc builtin */
6465ad15460SJunchao Zhang     link->unit      = MPIU_2INT;
647eb02082bSJunchao Zhang   } else if (nPetscReal) {
648b23bfdefSJunchao Zhang     if      (nPetscReal == 8) PackInit_RealType_PetscReal_8_1(link); else if (nPetscReal%8 == 0) PackInit_RealType_PetscReal_8_0(link);
649b23bfdefSJunchao Zhang     else if (nPetscReal == 4) PackInit_RealType_PetscReal_4_1(link); else if (nPetscReal%4 == 0) PackInit_RealType_PetscReal_4_0(link);
650b23bfdefSJunchao Zhang     else if (nPetscReal == 2) PackInit_RealType_PetscReal_2_1(link); else if (nPetscReal%2 == 0) PackInit_RealType_PetscReal_2_0(link);
651b23bfdefSJunchao Zhang     else if (nPetscReal == 1) PackInit_RealType_PetscReal_1_1(link); else if (nPetscReal%1 == 0) PackInit_RealType_PetscReal_1_0(link);
652b23bfdefSJunchao Zhang     link->bs        = nPetscReal;
653eb02082bSJunchao Zhang     link->unitbytes = nPetscReal*sizeof(PetscReal);
65440e23c03SJunchao Zhang     link->basicunit = MPIU_REAL;
6555ad15460SJunchao Zhang     if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPIU_REAL;}
656b23bfdefSJunchao Zhang   } else if (nPetscInt) {
657b23bfdefSJunchao Zhang     if      (nPetscInt == 8) PackInit_IntegerType_PetscInt_8_1(link); else if (nPetscInt%8 == 0) PackInit_IntegerType_PetscInt_8_0(link);
658b23bfdefSJunchao Zhang     else if (nPetscInt == 4) PackInit_IntegerType_PetscInt_4_1(link); else if (nPetscInt%4 == 0) PackInit_IntegerType_PetscInt_4_0(link);
659b23bfdefSJunchao Zhang     else if (nPetscInt == 2) PackInit_IntegerType_PetscInt_2_1(link); else if (nPetscInt%2 == 0) PackInit_IntegerType_PetscInt_2_0(link);
660b23bfdefSJunchao Zhang     else if (nPetscInt == 1) PackInit_IntegerType_PetscInt_1_1(link); else if (nPetscInt%1 == 0) PackInit_IntegerType_PetscInt_1_0(link);
661b23bfdefSJunchao Zhang     link->bs        = nPetscInt;
662eb02082bSJunchao Zhang     link->unitbytes = nPetscInt*sizeof(PetscInt);
663b23bfdefSJunchao Zhang     link->basicunit = MPIU_INT;
6645ad15460SJunchao Zhang     if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPIU_INT;}
665b23bfdefSJunchao Zhang #if defined(PETSC_USE_64BIT_INDICES)
666b23bfdefSJunchao Zhang   } else if (nInt) {
667b23bfdefSJunchao Zhang     if      (nInt == 8) PackInit_IntegerType_int_8_1(link); else if (nInt%8 == 0) PackInit_IntegerType_int_8_0(link);
668b23bfdefSJunchao Zhang     else if (nInt == 4) PackInit_IntegerType_int_4_1(link); else if (nInt%4 == 0) PackInit_IntegerType_int_4_0(link);
669b23bfdefSJunchao Zhang     else if (nInt == 2) PackInit_IntegerType_int_2_1(link); else if (nInt%2 == 0) PackInit_IntegerType_int_2_0(link);
670b23bfdefSJunchao Zhang     else if (nInt == 1) PackInit_IntegerType_int_1_1(link); else if (nInt%1 == 0) PackInit_IntegerType_int_1_0(link);
671b23bfdefSJunchao Zhang     link->bs        = nInt;
672eb02082bSJunchao Zhang     link->unitbytes = nInt*sizeof(int);
673b23bfdefSJunchao Zhang     link->basicunit = MPI_INT;
6745ad15460SJunchao Zhang     if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPI_INT;}
675b23bfdefSJunchao Zhang #endif
676b23bfdefSJunchao Zhang   } else if (nSignedChar) {
677b23bfdefSJunchao Zhang     if      (nSignedChar == 8) PackInit_IntegerType_SignedChar_8_1(link); else if (nSignedChar%8 == 0) PackInit_IntegerType_SignedChar_8_0(link);
678b23bfdefSJunchao Zhang     else if (nSignedChar == 4) PackInit_IntegerType_SignedChar_4_1(link); else if (nSignedChar%4 == 0) PackInit_IntegerType_SignedChar_4_0(link);
679b23bfdefSJunchao Zhang     else if (nSignedChar == 2) PackInit_IntegerType_SignedChar_2_1(link); else if (nSignedChar%2 == 0) PackInit_IntegerType_SignedChar_2_0(link);
680b23bfdefSJunchao Zhang     else if (nSignedChar == 1) PackInit_IntegerType_SignedChar_1_1(link); else if (nSignedChar%1 == 0) PackInit_IntegerType_SignedChar_1_0(link);
681b23bfdefSJunchao Zhang     link->bs        = nSignedChar;
682eb02082bSJunchao Zhang     link->unitbytes = nSignedChar*sizeof(SignedChar);
683b23bfdefSJunchao Zhang     link->basicunit = MPI_SIGNED_CHAR;
6845ad15460SJunchao Zhang     if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPI_SIGNED_CHAR;}
685b23bfdefSJunchao Zhang   }  else if (nUnsignedChar) {
686b23bfdefSJunchao Zhang     if      (nUnsignedChar == 8) PackInit_IntegerType_UnsignedChar_8_1(link); else if (nUnsignedChar%8 == 0) PackInit_IntegerType_UnsignedChar_8_0(link);
687b23bfdefSJunchao Zhang     else if (nUnsignedChar == 4) PackInit_IntegerType_UnsignedChar_4_1(link); else if (nUnsignedChar%4 == 0) PackInit_IntegerType_UnsignedChar_4_0(link);
688b23bfdefSJunchao Zhang     else if (nUnsignedChar == 2) PackInit_IntegerType_UnsignedChar_2_1(link); else if (nUnsignedChar%2 == 0) PackInit_IntegerType_UnsignedChar_2_0(link);
689b23bfdefSJunchao Zhang     else if (nUnsignedChar == 1) PackInit_IntegerType_UnsignedChar_1_1(link); else if (nUnsignedChar%1 == 0) PackInit_IntegerType_UnsignedChar_1_0(link);
690b23bfdefSJunchao Zhang     link->bs        = nUnsignedChar;
691eb02082bSJunchao Zhang     link->unitbytes = nUnsignedChar*sizeof(UnsignedChar);
692b23bfdefSJunchao Zhang     link->basicunit = MPI_UNSIGNED_CHAR;
6935ad15460SJunchao Zhang     if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPI_UNSIGNED_CHAR;}
69440e23c03SJunchao Zhang #if defined(PETSC_HAVE_COMPLEX)
695b23bfdefSJunchao Zhang   } else if (nPetscComplex) {
696b23bfdefSJunchao Zhang     if      (nPetscComplex == 8) PackInit_ComplexType_PetscComplex_8_1(link); else if (nPetscComplex%8 == 0) PackInit_ComplexType_PetscComplex_8_0(link);
697b23bfdefSJunchao Zhang     else if (nPetscComplex == 4) PackInit_ComplexType_PetscComplex_4_1(link); else if (nPetscComplex%4 == 0) PackInit_ComplexType_PetscComplex_4_0(link);
698b23bfdefSJunchao Zhang     else if (nPetscComplex == 2) PackInit_ComplexType_PetscComplex_2_1(link); else if (nPetscComplex%2 == 0) PackInit_ComplexType_PetscComplex_2_0(link);
699b23bfdefSJunchao Zhang     else if (nPetscComplex == 1) PackInit_ComplexType_PetscComplex_1_1(link); else if (nPetscComplex%1 == 0) PackInit_ComplexType_PetscComplex_1_0(link);
700b23bfdefSJunchao Zhang     link->bs        = nPetscComplex;
701eb02082bSJunchao Zhang     link->unitbytes = nPetscComplex*sizeof(PetscComplex);
70240e23c03SJunchao Zhang     link->basicunit = MPIU_COMPLEX;
7035ad15460SJunchao Zhang     if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPIU_COMPLEX;}
70440e23c03SJunchao Zhang #endif
70540e23c03SJunchao Zhang   } else {
706b23bfdefSJunchao Zhang     MPI_Aint lb,nbyte;
707b23bfdefSJunchao Zhang     ierr = MPI_Type_get_extent(unit,&lb,&nbyte);CHKERRQ(ierr);
70840e23c03SJunchao Zhang     if (lb != 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Datatype with nonzero lower bound %ld\n",(long)lb);
709eb02082bSJunchao Zhang     if (nbyte % sizeof(int)) { /* If the type size is not multiple of int */
710eb02082bSJunchao Zhang       if      (nbyte == 4) PackInit_DumbType_char_4_1(link); else if (nbyte%4 == 0) PackInit_DumbType_char_4_0(link);
711eb02082bSJunchao Zhang       else if (nbyte == 2) PackInit_DumbType_char_2_1(link); else if (nbyte%2 == 0) PackInit_DumbType_char_2_0(link);
712eb02082bSJunchao Zhang       else if (nbyte == 1) PackInit_DumbType_char_1_1(link); else if (nbyte%1 == 0) PackInit_DumbType_char_1_0(link);
713eb02082bSJunchao Zhang       link->bs        = nbyte;
714b23bfdefSJunchao Zhang       link->unitbytes = nbyte;
715b23bfdefSJunchao Zhang       link->basicunit = MPI_BYTE;
71640e23c03SJunchao Zhang     } else {
717eb02082bSJunchao Zhang       nInt = nbyte / sizeof(int);
718eb02082bSJunchao Zhang       if      (nInt == 8) PackInit_DumbType_DumbInt_8_1(link); else if (nInt%8 == 0) PackInit_DumbType_DumbInt_8_0(link);
719eb02082bSJunchao Zhang       else if (nInt == 4) PackInit_DumbType_DumbInt_4_1(link); else if (nInt%4 == 0) PackInit_DumbType_DumbInt_4_0(link);
720eb02082bSJunchao Zhang       else if (nInt == 2) PackInit_DumbType_DumbInt_2_1(link); else if (nInt%2 == 0) PackInit_DumbType_DumbInt_2_0(link);
721eb02082bSJunchao Zhang       else if (nInt == 1) PackInit_DumbType_DumbInt_1_1(link); else if (nInt%1 == 0) PackInit_DumbType_DumbInt_1_0(link);
722eb02082bSJunchao Zhang       link->bs        = nInt;
723b23bfdefSJunchao Zhang       link->unitbytes = nbyte;
72440e23c03SJunchao Zhang       link->basicunit = MPI_INT;
72540e23c03SJunchao Zhang     }
7265ad15460SJunchao Zhang     if (link->isbuiltin) link->unit = unit;
72740e23c03SJunchao Zhang   }
728b23bfdefSJunchao Zhang 
7295ad15460SJunchao Zhang   if (!link->isbuiltin) {ierr = MPI_Type_dup(unit,&link->unit);CHKERRQ(ierr);}
73040e23c03SJunchao Zhang   PetscFunctionReturn(0);
73140e23c03SJunchao Zhang }
73240e23c03SJunchao Zhang 
733eb02082bSJunchao Zhang PetscErrorCode PetscSFPackGetUnpackAndOp(PetscSFPack link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**UnpackAndOp)(PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,void*,const void*))
73440e23c03SJunchao Zhang {
73540e23c03SJunchao Zhang   PetscFunctionBegin;
73640e23c03SJunchao Zhang   *UnpackAndOp = NULL;
737eb02082bSJunchao Zhang   if (mtype == PETSC_MEMTYPE_HOST) {
738eb02082bSJunchao Zhang     if      (op == MPIU_REPLACE)              *UnpackAndOp = link->h_UnpackAndInsert;
739eb02082bSJunchao Zhang     else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->h_UnpackAndAdd;
740eb02082bSJunchao Zhang     else if (op == MPI_PROD)                  *UnpackAndOp = link->h_UnpackAndMult;
741eb02082bSJunchao Zhang     else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->h_UnpackAndMax;
742eb02082bSJunchao Zhang     else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->h_UnpackAndMin;
743eb02082bSJunchao Zhang     else if (op == MPI_LAND)                  *UnpackAndOp = link->h_UnpackAndLAND;
744eb02082bSJunchao Zhang     else if (op == MPI_BAND)                  *UnpackAndOp = link->h_UnpackAndBAND;
745eb02082bSJunchao Zhang     else if (op == MPI_LOR)                   *UnpackAndOp = link->h_UnpackAndLOR;
746eb02082bSJunchao Zhang     else if (op == MPI_BOR)                   *UnpackAndOp = link->h_UnpackAndBOR;
747eb02082bSJunchao Zhang     else if (op == MPI_LXOR)                  *UnpackAndOp = link->h_UnpackAndLXOR;
748eb02082bSJunchao Zhang     else if (op == MPI_BXOR)                  *UnpackAndOp = link->h_UnpackAndBXOR;
749eb02082bSJunchao Zhang     else if (op == MPI_MAXLOC)                *UnpackAndOp = link->h_UnpackAndMaxloc;
750eb02082bSJunchao Zhang     else if (op == MPI_MINLOC)                *UnpackAndOp = link->h_UnpackAndMinloc;
751eb02082bSJunchao Zhang   }
752eb02082bSJunchao Zhang #if defined(PETSC_HAVE_CUDA)
753eb02082bSJunchao Zhang   else if (mtype == PETSC_MEMTYPE_DEVICE && !atomic) {
754eb02082bSJunchao Zhang     if      (op == MPIU_REPLACE)              *UnpackAndOp = link->d_UnpackAndInsert;
755eb02082bSJunchao Zhang     else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->d_UnpackAndAdd;
756eb02082bSJunchao Zhang     else if (op == MPI_PROD)                  *UnpackAndOp = link->d_UnpackAndMult;
757eb02082bSJunchao Zhang     else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->d_UnpackAndMax;
758eb02082bSJunchao Zhang     else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->d_UnpackAndMin;
759eb02082bSJunchao Zhang     else if (op == MPI_LAND)                  *UnpackAndOp = link->d_UnpackAndLAND;
760eb02082bSJunchao Zhang     else if (op == MPI_BAND)                  *UnpackAndOp = link->d_UnpackAndBAND;
761eb02082bSJunchao Zhang     else if (op == MPI_LOR)                   *UnpackAndOp = link->d_UnpackAndLOR;
762eb02082bSJunchao Zhang     else if (op == MPI_BOR)                   *UnpackAndOp = link->d_UnpackAndBOR;
763eb02082bSJunchao Zhang     else if (op == MPI_LXOR)                  *UnpackAndOp = link->d_UnpackAndLXOR;
764eb02082bSJunchao Zhang     else if (op == MPI_BXOR)                  *UnpackAndOp = link->d_UnpackAndBXOR;
765eb02082bSJunchao Zhang     else if (op == MPI_MAXLOC)                *UnpackAndOp = link->d_UnpackAndMaxloc;
766eb02082bSJunchao Zhang     else if (op == MPI_MINLOC)                *UnpackAndOp = link->d_UnpackAndMinloc;
767eb02082bSJunchao Zhang   } else if (mtype == PETSC_MEMTYPE_DEVICE && atomic) {
768eb02082bSJunchao Zhang     if      (op == MPIU_REPLACE)              *UnpackAndOp = link->da_UnpackAndInsert;
769eb02082bSJunchao Zhang     else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->da_UnpackAndAdd;
770eb02082bSJunchao Zhang     else if (op == MPI_PROD)                  *UnpackAndOp = link->da_UnpackAndMult;
771eb02082bSJunchao Zhang     else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->da_UnpackAndMax;
772eb02082bSJunchao Zhang     else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->da_UnpackAndMin;
773eb02082bSJunchao Zhang     else if (op == MPI_LAND)                  *UnpackAndOp = link->da_UnpackAndLAND;
774eb02082bSJunchao Zhang     else if (op == MPI_BAND)                  *UnpackAndOp = link->da_UnpackAndBAND;
775eb02082bSJunchao Zhang     else if (op == MPI_LOR)                   *UnpackAndOp = link->da_UnpackAndLOR;
776eb02082bSJunchao Zhang     else if (op == MPI_BOR)                   *UnpackAndOp = link->da_UnpackAndBOR;
777eb02082bSJunchao Zhang     else if (op == MPI_LXOR)                  *UnpackAndOp = link->da_UnpackAndLXOR;
778eb02082bSJunchao Zhang     else if (op == MPI_BXOR)                  *UnpackAndOp = link->da_UnpackAndBXOR;
779eb02082bSJunchao Zhang     else if (op == MPI_MAXLOC)                *UnpackAndOp = link->da_UnpackAndMaxloc;
780eb02082bSJunchao Zhang     else if (op == MPI_MINLOC)                *UnpackAndOp = link->da_UnpackAndMinloc;
781eb02082bSJunchao Zhang   }
782eb02082bSJunchao Zhang #endif
783eb02082bSJunchao Zhang   else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Wrong PetscMemType %D",mtype);
784eb02082bSJunchao Zhang 
78540e23c03SJunchao Zhang   PetscFunctionReturn(0);
78640e23c03SJunchao Zhang }
78740e23c03SJunchao Zhang 
788eb02082bSJunchao Zhang PetscErrorCode PetscSFPackGetFetchAndOp(PetscSFPack link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**FetchAndOp)(PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,void*,void*))
78940e23c03SJunchao Zhang {
79040e23c03SJunchao Zhang   PetscFunctionBegin;
79140e23c03SJunchao Zhang   *FetchAndOp = NULL;
792eb02082bSJunchao Zhang   if (mtype == PETSC_MEMTYPE_HOST) {
793eb02082bSJunchao Zhang     if (op == MPIU_REPLACE)                   *FetchAndOp = link->h_FetchAndInsert;
794eb02082bSJunchao Zhang     else if (op == MPI_SUM || op == MPIU_SUM) *FetchAndOp = link->h_FetchAndAdd;
795eb02082bSJunchao Zhang     else if (op == MPI_MAX || op == MPIU_MAX) *FetchAndOp = link->h_FetchAndMax;
796eb02082bSJunchao Zhang     else if (op == MPI_MIN || op == MPIU_MIN) *FetchAndOp = link->h_FetchAndMin;
797eb02082bSJunchao Zhang     else if (op == MPI_MAXLOC)                *FetchAndOp = link->h_FetchAndMaxloc;
798eb02082bSJunchao Zhang     else if (op == MPI_MINLOC)                *FetchAndOp = link->h_FetchAndMinloc;
799eb02082bSJunchao Zhang     else if (op == MPI_PROD)                  *FetchAndOp = link->h_FetchAndMult;
800eb02082bSJunchao Zhang     else if (op == MPI_LAND)                  *FetchAndOp = link->h_FetchAndLAND;
801eb02082bSJunchao Zhang     else if (op == MPI_BAND)                  *FetchAndOp = link->h_FetchAndBAND;
802eb02082bSJunchao Zhang     else if (op == MPI_LOR)                   *FetchAndOp = link->h_FetchAndLOR;
803eb02082bSJunchao Zhang     else if (op == MPI_BOR)                   *FetchAndOp = link->h_FetchAndBOR;
804eb02082bSJunchao Zhang     else if (op == MPI_LXOR)                  *FetchAndOp = link->h_FetchAndLXOR;
805eb02082bSJunchao Zhang     else if (op == MPI_BXOR)                  *FetchAndOp = link->h_FetchAndBXOR;
806eb02082bSJunchao Zhang     else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for MPI_Op");
807eb02082bSJunchao Zhang   }
808eb02082bSJunchao Zhang #if defined(PETSC_HAVE_CUDA)
809eb02082bSJunchao Zhang   else if (mtype == PETSC_MEMTYPE_DEVICE && !atomic) {
810eb02082bSJunchao Zhang     if (op == MPIU_REPLACE)                   *FetchAndOp = link->d_FetchAndInsert;
811eb02082bSJunchao Zhang     else if (op == MPI_SUM || op == MPIU_SUM) *FetchAndOp = link->d_FetchAndAdd;
812eb02082bSJunchao Zhang     else if (op == MPI_MAX || op == MPIU_MAX) *FetchAndOp = link->d_FetchAndMax;
813eb02082bSJunchao Zhang     else if (op == MPI_MIN || op == MPIU_MIN) *FetchAndOp = link->d_FetchAndMin;
814eb02082bSJunchao Zhang     else if (op == MPI_MAXLOC)                *FetchAndOp = link->d_FetchAndMaxloc;
815eb02082bSJunchao Zhang     else if (op == MPI_MINLOC)                *FetchAndOp = link->d_FetchAndMinloc;
816eb02082bSJunchao Zhang     else if (op == MPI_PROD)                  *FetchAndOp = link->d_FetchAndMult;
817eb02082bSJunchao Zhang     else if (op == MPI_LAND)                  *FetchAndOp = link->d_FetchAndLAND;
818eb02082bSJunchao Zhang     else if (op == MPI_BAND)                  *FetchAndOp = link->d_FetchAndBAND;
819eb02082bSJunchao Zhang     else if (op == MPI_LOR)                   *FetchAndOp = link->d_FetchAndLOR;
820eb02082bSJunchao Zhang     else if (op == MPI_BOR)                   *FetchAndOp = link->d_FetchAndBOR;
821eb02082bSJunchao Zhang     else if (op == MPI_LXOR)                  *FetchAndOp = link->d_FetchAndLXOR;
822eb02082bSJunchao Zhang     else if (op == MPI_BXOR)                  *FetchAndOp = link->d_FetchAndBXOR;
823eb02082bSJunchao Zhang     else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for MPI_Op");
824eb02082bSJunchao Zhang   } else if (mtype == PETSC_MEMTYPE_DEVICE && atomic) {
825eb02082bSJunchao Zhang     if (op == MPIU_REPLACE)                   *FetchAndOp = link->da_FetchAndInsert;
826eb02082bSJunchao Zhang     else if (op == MPI_SUM || op == MPIU_SUM) *FetchAndOp = link->da_FetchAndAdd;
827eb02082bSJunchao Zhang     else if (op == MPI_MAX || op == MPIU_MAX) *FetchAndOp = link->da_FetchAndMax;
828eb02082bSJunchao Zhang     else if (op == MPI_MIN || op == MPIU_MIN) *FetchAndOp = link->da_FetchAndMin;
829eb02082bSJunchao Zhang     else if (op == MPI_MAXLOC)                *FetchAndOp = link->da_FetchAndMaxloc;
830eb02082bSJunchao Zhang     else if (op == MPI_MINLOC)                *FetchAndOp = link->da_FetchAndMinloc;
831eb02082bSJunchao Zhang     else if (op == MPI_PROD)                  *FetchAndOp = link->da_FetchAndMult;
832eb02082bSJunchao Zhang     else if (op == MPI_LAND)                  *FetchAndOp = link->da_FetchAndLAND;
833eb02082bSJunchao Zhang     else if (op == MPI_BAND)                  *FetchAndOp = link->da_FetchAndBAND;
834eb02082bSJunchao Zhang     else if (op == MPI_LOR)                   *FetchAndOp = link->da_FetchAndLOR;
835eb02082bSJunchao Zhang     else if (op == MPI_BOR)                   *FetchAndOp = link->da_FetchAndBOR;
836eb02082bSJunchao Zhang     else if (op == MPI_LXOR)                  *FetchAndOp = link->da_FetchAndLXOR;
837eb02082bSJunchao Zhang     else if (op == MPI_BXOR)                  *FetchAndOp = link->da_FetchAndBXOR;
838eb02082bSJunchao Zhang     else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for MPI_Op");
839eb02082bSJunchao Zhang   }
840eb02082bSJunchao Zhang #endif
841eb02082bSJunchao Zhang   else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Wrong PetscMemType %D",mtype);
84240e23c03SJunchao Zhang   PetscFunctionReturn(0);
84340e23c03SJunchao Zhang }
84440e23c03SJunchao Zhang 
84540e23c03SJunchao Zhang /*
846eb02082bSJunchao Zhang   Create pack/unpack optimization plans based on indice patterns available
84740e23c03SJunchao Zhang 
84840e23c03SJunchao Zhang    Input Parameters:
849b23bfdefSJunchao Zhang   +  n       - Number of target ranks
850eb02082bSJunchao Zhang   .  offset  - [n+1] For the i-th rank, its associated indices are idx[offset[i], offset[i+1]). offset[0] needs not to be 0.
851b23bfdefSJunchao Zhang   -  idx     - [*]   Array storing indices
85240e23c03SJunchao Zhang 
85340e23c03SJunchao Zhang    Output Parameters:
854eb02082bSJunchao Zhang   +  opt    - Optimization plans. Maybe NULL if no optimization can be built.
85540e23c03SJunchao Zhang */
856eb02082bSJunchao Zhang PetscErrorCode PetscSFPackOptCreate(PetscInt n,const PetscInt *offset,const PetscInt *idx,PetscSFPackOpt *out)
85740e23c03SJunchao Zhang {
85840e23c03SJunchao Zhang   PetscErrorCode ierr;
85940e23c03SJunchao Zhang   PetscInt       i,j,k,n_copies,tot_copies=0,step;
860b23bfdefSJunchao Zhang   PetscBool      strided,optimized=PETSC_FALSE;
86140e23c03SJunchao Zhang   PetscSFPackOpt opt;
86240e23c03SJunchao Zhang 
86340e23c03SJunchao Zhang   PetscFunctionBegin;
864b23bfdefSJunchao Zhang   if (!n) {
865b23bfdefSJunchao Zhang     *out = NULL;
866b23bfdefSJunchao Zhang     PetscFunctionReturn(0);
867b23bfdefSJunchao Zhang   }
868b23bfdefSJunchao Zhang 
86940e23c03SJunchao Zhang   ierr = PetscCalloc1(1,&opt);CHKERRQ(ierr);
870b23bfdefSJunchao Zhang   ierr = PetscCalloc3(n,&opt->type,n+1,&opt->offset,n+1,&opt->copy_offset);CHKERRQ(ierr);
871b23bfdefSJunchao Zhang   ierr = PetscArraycpy(opt->offset,offset,n+1);CHKERRQ(ierr);
872b23bfdefSJunchao Zhang   if (offset[0]) {for (i=0; i<n+1; i++) opt->offset[i] -= offset[0];} /* Zero-base offset[]. Note the packing routine is Pack(count, idx[], ...*/
873b23bfdefSJunchao Zhang 
874b23bfdefSJunchao Zhang   opt->n = n;
87540e23c03SJunchao Zhang 
87640e23c03SJunchao Zhang   /* Check if the indices are piece-wise contiguous (if yes, we can optimize a packing with mulitple memcpy's ) */
87740e23c03SJunchao Zhang   for (i=0; i<n; i++) { /* for each target processor */
87840e23c03SJunchao Zhang     /* Scan indices to count n_copies -- the number of contiguous pieces for i-th target */
87940e23c03SJunchao Zhang     n_copies = 1;
88040e23c03SJunchao Zhang     for (j=offset[i]; j<offset[i+1]-1; j++) {
88140e23c03SJunchao Zhang       if (idx[j]+1 != idx[j+1]) n_copies++;
88240e23c03SJunchao Zhang     }
88340e23c03SJunchao Zhang     /* If the average length (in no. of indices) of contiguous pieces is long enough, say >=32,
88440e23c03SJunchao Zhang        then it is worth using memcpy for this target. 32 is an arbitrarily chosen number.
88540e23c03SJunchao Zhang      */
88640e23c03SJunchao Zhang     if ((offset[i+1]-offset[i])/n_copies >= 32) {
887b23bfdefSJunchao Zhang       opt->type[i] = PETSCSF_PACKOPT_MULTICOPY;
888b23bfdefSJunchao Zhang       optimized    = PETSC_TRUE;
88940e23c03SJunchao Zhang       tot_copies  += n_copies;
89040e23c03SJunchao Zhang     }
89140e23c03SJunchao Zhang   }
89240e23c03SJunchao Zhang 
89340e23c03SJunchao Zhang   /* Setup memcpy plan for each contiguous piece */
89440e23c03SJunchao Zhang   k    = 0; /* k-th copy */
895b23bfdefSJunchao Zhang   ierr = PetscMalloc4(tot_copies,&opt->copy_start,tot_copies,&opt->copy_length,n,&opt->stride_step,n,&opt->stride_n);CHKERRQ(ierr);
896b23bfdefSJunchao Zhang   for (i=0; i<n; i++) { /* for each target processor */
897b23bfdefSJunchao Zhang     if (opt->type[i] == PETSCSF_PACKOPT_MULTICOPY) {
89840e23c03SJunchao Zhang       n_copies           = 1;
899b23bfdefSJunchao Zhang       opt->copy_start[k] = offset[i] - offset[0];
90040e23c03SJunchao Zhang       for (j=offset[i]; j<offset[i+1]-1; j++) {
90140e23c03SJunchao Zhang         if (idx[j]+1 != idx[j+1]) { /* meet end of a copy (and next copy must exist) */
90240e23c03SJunchao Zhang           n_copies++;
903b23bfdefSJunchao Zhang           opt->copy_start[k+1] = j-offset[0]+1;
904b23bfdefSJunchao Zhang           opt->copy_length[k]  = opt->copy_start[k+1] - opt->copy_start[k];
90540e23c03SJunchao Zhang           k++;
90640e23c03SJunchao Zhang         }
90740e23c03SJunchao Zhang       }
90840e23c03SJunchao Zhang       /* Set copy length of the last copy for this target */
909b23bfdefSJunchao Zhang       opt->copy_length[k] = j-offset[0]+1 - opt->copy_start[k];
91040e23c03SJunchao Zhang       k++;
91140e23c03SJunchao Zhang     }
912b23bfdefSJunchao Zhang     /* Set offset for next target. When opt->type[i]=PETSCSF_PACKOPT_NONE, copy_offsets[i]=copy_offsets[i+1] */
91340e23c03SJunchao Zhang     opt->copy_offset[i+1] = k;
91440e23c03SJunchao Zhang   }
91540e23c03SJunchao Zhang 
91640e23c03SJunchao Zhang   /* Last chance! If the indices do not have long contiguous pieces, are they strided? */
91740e23c03SJunchao Zhang   for (i=0; i<n; i++) { /* for each remote */
918b23bfdefSJunchao Zhang     if (opt->type[i]==PETSCSF_PACKOPT_NONE && (offset[i+1] - offset[i]) >= 16) { /* few indices (<16) are not worth striding */
91940e23c03SJunchao Zhang       strided = PETSC_TRUE;
92040e23c03SJunchao Zhang       step    = idx[offset[i]+1] - idx[offset[i]];
92140e23c03SJunchao Zhang       for (j=offset[i]; j<offset[i+1]-1; j++) {
92240e23c03SJunchao Zhang         if (idx[j]+step != idx[j+1]) { strided = PETSC_FALSE; break; }
92340e23c03SJunchao Zhang       }
92440e23c03SJunchao Zhang       if (strided) {
925b23bfdefSJunchao Zhang         opt->type[i]         = PETSCSF_PACKOPT_STRIDE;
92640e23c03SJunchao Zhang         opt->stride_step[i]  = step;
92740e23c03SJunchao Zhang         opt->stride_n[i]     = offset[i+1] - offset[i];
928b23bfdefSJunchao Zhang         optimized            = PETSC_TRUE;
92940e23c03SJunchao Zhang       }
93040e23c03SJunchao Zhang     }
93140e23c03SJunchao Zhang   }
932b23bfdefSJunchao Zhang   /* If no rank gets optimized, free arrays to save memory */
933b23bfdefSJunchao Zhang   if (!optimized) {
934b23bfdefSJunchao Zhang     ierr = PetscFree3(opt->type,opt->offset,opt->copy_offset);CHKERRQ(ierr);
935b23bfdefSJunchao Zhang     ierr = PetscFree4(opt->copy_start,opt->copy_length,opt->stride_step,opt->stride_n);CHKERRQ(ierr);
93640e23c03SJunchao Zhang     ierr = PetscFree(opt);CHKERRQ(ierr);
93740e23c03SJunchao Zhang     *out = NULL;
93840e23c03SJunchao Zhang   } else *out = opt;
93940e23c03SJunchao Zhang   PetscFunctionReturn(0);
94040e23c03SJunchao Zhang }
94140e23c03SJunchao Zhang 
94264f49babSJed Brown PetscErrorCode PetscSFPackOptDestroy(PetscSFPackOpt *out)
94340e23c03SJunchao Zhang {
94440e23c03SJunchao Zhang   PetscErrorCode ierr;
94540e23c03SJunchao Zhang   PetscSFPackOpt opt = *out;
94640e23c03SJunchao Zhang 
94740e23c03SJunchao Zhang   PetscFunctionBegin;
94840e23c03SJunchao Zhang   if (opt) {
949b23bfdefSJunchao Zhang     ierr = PetscFree3(opt->type,opt->offset,opt->copy_offset);CHKERRQ(ierr);
950b23bfdefSJunchao Zhang     ierr = PetscFree4(opt->copy_start,opt->copy_length,opt->stride_step,opt->stride_n);CHKERRQ(ierr);
95140e23c03SJunchao Zhang     ierr = PetscFree(opt);CHKERRQ(ierr);
95240e23c03SJunchao Zhang     *out = NULL;
95340e23c03SJunchao Zhang   }
95440e23c03SJunchao Zhang   PetscFunctionReturn(0);
95540e23c03SJunchao Zhang }
956