xref: /petsc/src/vec/is/sf/impls/basic/sfpack.c (revision b23bfdefca792e3d9f47034521b8d4c437693123)
140e23c03SJunchao Zhang 
240e23c03SJunchao Zhang #include <../src/vec/is/sf/impls/basic/sfpack.h>
340e23c03SJunchao Zhang #include <../src/vec/is/sf/impls/basic/sfbasic.h>
440e23c03SJunchao Zhang 
540e23c03SJunchao Zhang /*
640e23c03SJunchao Zhang  * MPI_Reduce_local is not really useful because it can't handle sparse data and it vectorizes "in the wrong direction",
740e23c03SJunchao Zhang  * therefore we pack data types manually. This file defines packing routines for the standard data types.
840e23c03SJunchao Zhang  */
940e23c03SJunchao Zhang 
10*b23bfdefSJunchao Zhang #define CPPJoin2(a,b)         a ##_## b
11*b23bfdefSJunchao Zhang #define CPPJoin3(a,b,c)       a ##_## b ##_## c
12*b23bfdefSJunchao Zhang #define CPPJoin4_(a,b,c,d)    a##b##_##c##_##d
13*b23bfdefSJunchao Zhang #define CPPJoin4(a,b,c,d)     CPPJoin4_(a##_,b,c,d)
1440e23c03SJunchao Zhang 
1540e23c03SJunchao Zhang #define EXECUTE(statement)    statement /* no braces since the statement might declare a variable; braces impose an unwanted scope */
1640e23c03SJunchao Zhang #define IGNORE(statement)     do {} while(0)
1740e23c03SJunchao Zhang 
1840e23c03SJunchao Zhang #define BINARY_OP(r,s,op,t)   do {(r) = (s) op (t);  } while(0)  /* binary ops in the middle such as +, *, && etc. */
1940e23c03SJunchao Zhang #define FUNCTION_OP(r,s,op,t) do {(r) = op((s),(t)); } while(0)  /* ops like a function, such as PetscMax, PetscMin */
2040e23c03SJunchao Zhang #define LXOR_OP(r,s,op,t)     do {(r) = (!s) != (!t);} while(0)  /* logical exclusive OR */
2140e23c03SJunchao Zhang #define PAIRTYPE_OP(r,s,op,t) do {(r).a = (s).a op (t).a; (r).b = (s).b op (t).b;} while(0)
2240e23c03SJunchao Zhang 
23*b23bfdefSJunchao Zhang #define PairType(Type1,Type2) Type1##_##Type2 /* typename for struct {Type1 a; Type2 b;} */
2440e23c03SJunchao Zhang 
2540e23c03SJunchao Zhang /* DEF_PackFunc - macro defining a Pack routine
2640e23c03SJunchao Zhang 
2740e23c03SJunchao Zhang    Arguments of the macro:
28*b23bfdefSJunchao Zhang    +Type      Type of the basic data in an entry, i.e., int, PetscInt, PetscReal etc. It is not the type of an entry.
29*b23bfdefSJunchao Zhang    .BS        Block size for vectorization. It is a factor of bs.
30*b23bfdefSJunchao Zhang    -EQ        (bs == BS) ? 1 : 0. EQ is a compile-time const to help compiler optimizations. See below.
3140e23c03SJunchao Zhang 
3240e23c03SJunchao Zhang    Arguments of the Pack routine:
33*b23bfdefSJunchao Zhang    +count     Number of indices in idx[]
34*b23bfdefSJunchao Zhang    .idx       Indices of entries to packed. NULL means contiguous indices, that is [0,count)
3540e23c03SJunchao Zhang    .bs        Number of basic types in an entry. Ex. if unit is MPI_2INT, then bs=2 and the basic type is int
36*b23bfdefSJunchao Zhang    .opt       Pack optimization plans. NULL means no plan at all.
37*b23bfdefSJunchao Zhang    .unpacked  Address of the unpacked data. The entries will be packed are unpacked[idx[i]],for i in [0,count)
38*b23bfdefSJunchao Zhang    -packed    Address of the packed data for each rank
3940e23c03SJunchao Zhang  */
40*b23bfdefSJunchao Zhang #define DEF_PackFunc(Type,BS,EQ) \
41*b23bfdefSJunchao Zhang   static PetscErrorCode CPPJoin4(Pack,Type,BS,EQ)(PetscInt count,const PetscInt *idx,PetscInt bs,PetscSFPackOpt opt,const void *unpacked,void *packed) \
42*b23bfdefSJunchao Zhang   {                                                                                                          \
4340e23c03SJunchao Zhang     PetscErrorCode ierr;                                                                                     \
44*b23bfdefSJunchao Zhang     const Type     *u = (const Type*)unpacked,*u2;                                                           \
45*b23bfdefSJunchao Zhang     Type           *p = (Type*)packed,*p2;                                                                   \
46*b23bfdefSJunchao Zhang     PetscInt       i,j,k,l,r,step;                                                                           \
47*b23bfdefSJunchao Zhang     const PetscInt *idx2,M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */    \
48*b23bfdefSJunchao Zhang     const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */                \
4940e23c03SJunchao Zhang     PetscFunctionBegin;                                                                                      \
50*b23bfdefSJunchao Zhang     if (!idx) {ierr = PetscArraycpy(p,u,MBS*count);CHKERRQ(ierr);}  /* Indices are contiguous */             \
51*b23bfdefSJunchao Zhang     else if (!opt) { /* No optimizations available */                                                        \
52*b23bfdefSJunchao Zhang       for (i=0; i<count; i++)                                                                                \
53*b23bfdefSJunchao Zhang         for (j=0; j<M; j++)     /* Decent compilers should fuse this loop when M = const 1 */                \
54*b23bfdefSJunchao Zhang           for (k=0; k<BS; k++)  /* Compiler either fuses (BS=1) or vectorizes (BS=2,4,8,etc) this loop */    \
55*b23bfdefSJunchao Zhang             p[i*MBS+j*BS+k] = u[idx[i]*MBS+j*BS+k];                                                          \
56*b23bfdefSJunchao Zhang     } else {                                                                                                 \
57*b23bfdefSJunchao Zhang       for (r=0; r<opt->n; r++) {                                                                             \
58*b23bfdefSJunchao Zhang         p2  = p + opt->offset[r]*MBS;                                                                        \
59*b23bfdefSJunchao Zhang         if (opt->type[r] == PETSCSF_PACKOPT_NONE) {                                                          \
60*b23bfdefSJunchao Zhang           idx2 = idx + opt->offset[r];                                                                       \
61*b23bfdefSJunchao Zhang           for (i=0; i<opt->offset[r+1]-opt->offset[r]; i++)                                                  \
62*b23bfdefSJunchao Zhang             for (j=0; j<M; j++)                                                                              \
63*b23bfdefSJunchao Zhang               for (k=0; k<BS; k++)                                                                           \
64*b23bfdefSJunchao Zhang                 p2[i*MBS+j*BS+k] = u[idx2[i]*MBS+j*BS+k];                                                    \
65*b23bfdefSJunchao Zhang         } else if (opt->type[r] == PETSCSF_PACKOPT_MULTICOPY) {                                              \
6640e23c03SJunchao Zhang           for (i=opt->copy_offset[r]; i<opt->copy_offset[r+1]; i++) {                                        \
67*b23bfdefSJunchao Zhang             u2   = u + idx[opt->copy_start[i]]*MBS;                                                          \
68*b23bfdefSJunchao Zhang             l    = opt->copy_length[i]*MBS; /* length in basic type such as MPI_INT */                       \
69*b23bfdefSJunchao Zhang             ierr = PetscArraycpy(p2,u2,l);CHKERRQ(ierr);                                                     \
70*b23bfdefSJunchao Zhang             p2  += l;                                                                                        \
7140e23c03SJunchao Zhang           }                                                                                                  \
72*b23bfdefSJunchao Zhang         } else if (opt->type[r] == PETSCSF_PACKOPT_STRIDE) {                                                 \
73*b23bfdefSJunchao Zhang           u2   = u + idx[opt->offset[r]]*MBS;                                                                \
7440e23c03SJunchao Zhang           step = opt->stride_step[r];                                                                        \
7540e23c03SJunchao Zhang           for (i=0; i<opt->stride_n[r]; i++)                                                                 \
76*b23bfdefSJunchao Zhang             for (j=0; j<MBS; j++) p2[i*MBS+j] = u2[i*step*MBS+j];                                            \
77*b23bfdefSJunchao Zhang         } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Unknown SFPack optimzation type %D",opt->type[r]);   \
7840e23c03SJunchao Zhang       }                                                                                                      \
7940e23c03SJunchao Zhang     }                                                                                                        \
8040e23c03SJunchao Zhang     PetscFunctionReturn(0);                                                                                  \
8140e23c03SJunchao Zhang   }
8240e23c03SJunchao Zhang 
8340e23c03SJunchao Zhang /* DEF_Action - macro defining a Unpack(Fetch)AndInsert routine
8440e23c03SJunchao Zhang 
8540e23c03SJunchao Zhang    Arguments:
8640e23c03SJunchao Zhang   +action     Unpack or Fetch
87*b23bfdefSJunchao Zhang   .Type       Type of the data
8840e23c03SJunchao Zhang   .BS         Block size for vectorization
89*b23bfdefSJunchao Zhang   .EQ        (bs == BS) ? 1 : 0. EQ is a compile-time const.
9040e23c03SJunchao Zhang   .FILTER     Macro defining what to do with a statement, either EXECUTE or IGNORE
91*b23bfdefSJunchao Zhang   .CType      Type with or without the const qualifier, i.e., const Type or Type
92*b23bfdefSJunchao Zhang   .Cvoid      void with or without the const qualifier, i.e., const void or void
9340e23c03SJunchao Zhang 
9440e23c03SJunchao Zhang   Notes:
9540e23c03SJunchao Zhang    This macro is not combined with DEF_ActionAndOp because we want to use memcpy in this macro.
96*b23bfdefSJunchao Zhang    The two arguments CType and Cvoid are used (instead of one constness argument), because we want to
9740e23c03SJunchao Zhang    get rid of compilation warning "empty macro arguments are undefined in ISO C90". With one constness argument,
9840e23c03SJunchao Zhang    sometimes we input 'const', sometimes we have to input empty.
99*b23bfdefSJunchao Zhang 
100*b23bfdefSJunchao Zhang    If action is Fetch, we may do Malloc/Free in the routine. It is costly but the expectation is that this case is really rare.
10140e23c03SJunchao Zhang  */
102*b23bfdefSJunchao Zhang #define DEF_Action(action,Type,BS,EQ,FILTER,CType,Cvoid)               \
103*b23bfdefSJunchao Zhang   static PetscErrorCode CPPJoin4(action##AndInsert,Type,BS,EQ)(PetscInt count,const PetscInt *idx,PetscInt bs,PetscSFPackOpt opt,void *unpacked,Cvoid *packed) \
104*b23bfdefSJunchao Zhang   {                                                                                                          \
10540e23c03SJunchao Zhang     PetscErrorCode ierr;                                                                                     \
106*b23bfdefSJunchao Zhang     Type           *u = (Type*)unpacked,*u2;                                                                 \
107*b23bfdefSJunchao Zhang     CType          *p = (CType*)packed,*p2;                                                                  \
108*b23bfdefSJunchao Zhang     PetscInt       i,j,k,l,r,step;                                                                           \
109*b23bfdefSJunchao Zhang     const PetscInt *idx2,M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */    \
110*b23bfdefSJunchao Zhang     const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */                \
11140e23c03SJunchao Zhang     PetscFunctionBegin;                                                                                      \
112*b23bfdefSJunchao Zhang     if (!idx) {                                                                                              \
113*b23bfdefSJunchao Zhang       FILTER(Type *v);                                                                                       \
114*b23bfdefSJunchao Zhang       FILTER(ierr = PetscMalloc1(count*MBS,&v);CHKERRQ(ierr));                                               \
115*b23bfdefSJunchao Zhang       FILTER(ierr = PetscArraycpy(v,u,count*MBS);CHKERRQ(ierr));                                             \
116*b23bfdefSJunchao Zhang              ierr = PetscArraycpy(u,p,count*MBS);CHKERRQ(ierr);                                              \
117*b23bfdefSJunchao Zhang       FILTER(ierr = PetscArraycpy(p,v,count*MBS);CHKERRQ(ierr));                                             \
11840e23c03SJunchao Zhang       FILTER(ierr = PetscFree(v);CHKERRQ(ierr));                                                             \
119*b23bfdefSJunchao Zhang     } else if (!opt) { /* No optimizations available */                                                      \
120*b23bfdefSJunchao Zhang       for (i=0; i<count; i++)                                                                                \
121*b23bfdefSJunchao Zhang         for (j=0; j<M; j++)                                                                                  \
122*b23bfdefSJunchao Zhang           for (k=0; k<BS; k++) {                                                                             \
123*b23bfdefSJunchao Zhang             FILTER(Type t                = u[idx[i]*MBS+j*BS+k]);                                            \
124*b23bfdefSJunchao Zhang                    u[idx[i]*MBS+j*BS+k]  = p[i*MBS+j*BS+k];                                                  \
125*b23bfdefSJunchao Zhang             FILTER(p[i*MBS+j*BS+k]       = t);                                                               \
12640e23c03SJunchao Zhang           }                                                                                                  \
127*b23bfdefSJunchao Zhang     } else {                                                                                                 \
128*b23bfdefSJunchao Zhang       for (r=0; r<opt->n; r++) {                                                                             \
129*b23bfdefSJunchao Zhang         p2 = p + opt->offset[r]*MBS;                                                                         \
130*b23bfdefSJunchao Zhang         if (opt->type[r] == PETSCSF_PACKOPT_NONE) {                                                          \
131*b23bfdefSJunchao Zhang           idx2 = idx + opt->offset[r];                                                                       \
132*b23bfdefSJunchao Zhang           for (i=0; i<opt->offset[r+1]-opt->offset[r]; i++)                                                  \
133*b23bfdefSJunchao Zhang             for (j=0; j<M; j++)                                                                              \
134*b23bfdefSJunchao Zhang               for (k=0; k<BS; k++) {                                                                         \
135*b23bfdefSJunchao Zhang                 FILTER(Type t                = u[idx2[i]*MBS+j*BS+k]);                                       \
136*b23bfdefSJunchao Zhang                        u[idx2[i]*MBS+j*BS+k] = p2[i*MBS+j*BS+k];                                             \
137*b23bfdefSJunchao Zhang                 FILTER(p2[i*MBS+j*BS+k]      = t);                                                           \
13840e23c03SJunchao Zhang               }                                                                                              \
139*b23bfdefSJunchao Zhang         } else if (opt->type[r] == PETSCSF_PACKOPT_MULTICOPY) {                                              \
140*b23bfdefSJunchao Zhang           FILTER(Type *v);                                                                                   \
141*b23bfdefSJunchao Zhang           FILTER(ierr = PetscMalloc1((opt->offset[r+1]-opt->offset[r])*MBS,&v);CHKERRQ(ierr)); /* max buf */ \
14240e23c03SJunchao Zhang           for (i=opt->copy_offset[r]; i<opt->copy_offset[r+1]; i++) { /* i-th piece */                       \
143*b23bfdefSJunchao Zhang             u2 = u + idx[opt->copy_start[i]]*MBS;                                                            \
144*b23bfdefSJunchao Zhang             l  = opt->copy_length[i]*MBS;                                                                    \
145da2e4c71SJunchao Zhang             FILTER(ierr = PetscArraycpy(v,u2,l);CHKERRQ(ierr));                                              \
146*b23bfdefSJunchao Zhang                    ierr = PetscArraycpy(u2,p2,l);CHKERRQ(ierr);                                              \
147*b23bfdefSJunchao Zhang             FILTER(ierr = PetscArraycpy(p2,v,l);CHKERRQ(ierr));                                              \
148*b23bfdefSJunchao Zhang             p2 += l;                                                                                         \
14940e23c03SJunchao Zhang           }                                                                                                  \
15040e23c03SJunchao Zhang           FILTER(ierr = PetscFree(v);CHKERRQ(ierr));                                                         \
151*b23bfdefSJunchao Zhang         } else if (opt->type[r] == PETSCSF_PACKOPT_STRIDE) {                                                 \
152*b23bfdefSJunchao Zhang           u2   = u + idx[opt->offset[r]]*MBS;                                                                \
15340e23c03SJunchao Zhang           step = opt->stride_step[r];                                                                        \
15440e23c03SJunchao Zhang           for (i=0; i<opt->stride_n[r]; i++)                                                                 \
155*b23bfdefSJunchao Zhang             for (j=0; j<M; j++)                                                                              \
156*b23bfdefSJunchao Zhang               for (k=0; k<BS; k++) {                                                                         \
157*b23bfdefSJunchao Zhang                 FILTER(Type t                = u2[i*step*MBS+j*BS+k]);                                       \
158*b23bfdefSJunchao Zhang                        u2[i*step*MBS+j*BS+k] = p2[i*MBS+j*BS+k];                                             \
159*b23bfdefSJunchao Zhang                 FILTER(p2[i*MBS+j*BS+k]      = t);                                                           \
16040e23c03SJunchao Zhang               }                                                                                              \
161*b23bfdefSJunchao Zhang         } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Unknown SFPack optimzation type %D",opt->type[r]);   \
16240e23c03SJunchao Zhang       }                                                                                                      \
16340e23c03SJunchao Zhang     }                                                                                                        \
16440e23c03SJunchao Zhang     PetscFunctionReturn(0);                                                                                  \
16540e23c03SJunchao Zhang   }
16640e23c03SJunchao Zhang 
16740e23c03SJunchao Zhang /* DEF_ActionAndOp - macro defining a Unpack(Fetch)AndOp routine. Op can not be Insert, Maxloc or Minloc
16840e23c03SJunchao Zhang 
16940e23c03SJunchao Zhang    Arguments:
17040e23c03SJunchao Zhang   +action     Unpack or Fetch
17140e23c03SJunchao Zhang   .opname     Name of the Op, such as Add, Mult, LAND, etc.
172*b23bfdefSJunchao Zhang   .Type       Type of the data
17340e23c03SJunchao Zhang   .BS         Block size for vectorization
174*b23bfdefSJunchao Zhang   .EQ         (bs == BS) ? 1 : 0. EQ is a compile-time const.
17540e23c03SJunchao Zhang   .op         Operator for the op, such as +, *, &&, ||, PetscMax, PetscMin, etc.
17640e23c03SJunchao Zhang   .APPLY      Macro defining application of the op. Could be BINARY_OP, FUNCTION_OP, LXOR_OP or PAIRTYPE_OP
17740e23c03SJunchao Zhang   .FILTER     Macro defining what to do with a statement, either EXECUTE or IGNORE
178*b23bfdefSJunchao Zhang   .CType      Type with or without the const qualifier, i.e., const Type or Type
179*b23bfdefSJunchao Zhang   -Cvoid      void with or without the const qualifier, i.e., const void or void
18040e23c03SJunchao Zhang  */
181*b23bfdefSJunchao Zhang #define DEF_ActionAndOp(action,opname,Type,BS,EQ,op,APPLY,FILTER,CType,Cvoid) \
182*b23bfdefSJunchao Zhang   static PetscErrorCode CPPJoin4(action##And##opname,Type,BS,EQ)(PetscInt count,const PetscInt *idx,PetscInt bs,PetscSFPackOpt opt,void *unpacked,Cvoid *packed) \
183*b23bfdefSJunchao Zhang   {                                                                                                          \
184*b23bfdefSJunchao Zhang     Type           *u = (Type*)unpacked,*u2,t;                                                               \
185*b23bfdefSJunchao Zhang     CType          *p = (CType*)packed,*p2;                                                                  \
186*b23bfdefSJunchao Zhang     PetscInt       i,j,k,l,r,step;                                                                           \
187*b23bfdefSJunchao Zhang     const PetscInt *idx2,M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */    \
188*b23bfdefSJunchao Zhang     const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */                \
18940e23c03SJunchao Zhang     PetscFunctionBegin;                                                                                      \
190*b23bfdefSJunchao Zhang     if (!idx) {                                                                                              \
191*b23bfdefSJunchao Zhang       for (i=0; i<count; i++)                                                                                \
192*b23bfdefSJunchao Zhang         for (j=0; j<M; j++)                                                                                  \
193*b23bfdefSJunchao Zhang           for (k=0; k<BS; k++) {                                                                             \
194*b23bfdefSJunchao Zhang             t    = u[i*MBS+j*BS+k];                                                                          \
195*b23bfdefSJunchao Zhang             APPLY (u[i*MBS+j*BS+k],t,op,p[i*MBS+j*BS+k]);                                                    \
196*b23bfdefSJunchao Zhang             FILTER(p[i*MBS+j*BS+k] = t);                                                                     \
19740e23c03SJunchao Zhang           }                                                                                                  \
198*b23bfdefSJunchao Zhang     } else if (!opt) { /* No optimizations available */                                                      \
199*b23bfdefSJunchao Zhang       for (i=0; i<count; i++)                                                                                \
200*b23bfdefSJunchao Zhang         for (j=0; j<M; j++)                                                                                  \
201*b23bfdefSJunchao Zhang           for (k=0; k<BS; k++) {                                                                             \
202*b23bfdefSJunchao Zhang               t    = u[idx[i]*MBS+j*BS+k];                                                                   \
203*b23bfdefSJunchao Zhang               APPLY (u[idx[i]*MBS+j*BS+k],t,op,p[i*MBS+j*BS+k]);                                             \
204*b23bfdefSJunchao Zhang               FILTER(p[i*MBS+j*BS+k] = t);                                                                   \
20540e23c03SJunchao Zhang           }                                                                                                  \
206*b23bfdefSJunchao Zhang     } else {                                                                                                 \
207*b23bfdefSJunchao Zhang       for (r=0; r<opt->n; r++) {                                                                             \
208*b23bfdefSJunchao Zhang         p2 = p + opt->offset[r]*MBS;                                                                         \
209*b23bfdefSJunchao Zhang         if (opt->type[r] == PETSCSF_PACKOPT_NONE) {                                                          \
210*b23bfdefSJunchao Zhang           idx2 = idx + opt->offset[r];                                                                       \
211*b23bfdefSJunchao Zhang           for (i=0; i<opt->offset[r+1]-opt->offset[r]; i++)                                                  \
212*b23bfdefSJunchao Zhang             for (j=0; j<M; j++)                                                                              \
213*b23bfdefSJunchao Zhang               for (k=0; k<BS; k++) {                                                                         \
214*b23bfdefSJunchao Zhang                 t    = u[idx2[i]*MBS+j*BS+k];                                                                \
215*b23bfdefSJunchao Zhang                 APPLY (u[idx2[i]*MBS+j*BS+k],t,op,p2[i*MBS+j*BS+k]);                                         \
216*b23bfdefSJunchao Zhang                 FILTER(p2[i*MBS+j*BS+k] = t);                                                                \
21740e23c03SJunchao Zhang               }                                                                                              \
218*b23bfdefSJunchao Zhang         } else if (opt->type[r] == PETSCSF_PACKOPT_MULTICOPY) {                                              \
21940e23c03SJunchao Zhang           for (i=opt->copy_offset[r]; i<opt->copy_offset[r+1]; i++) { /* i-th piece */                       \
220*b23bfdefSJunchao Zhang             u2 = u + idx[opt->copy_start[i]]*MBS;                                                            \
221*b23bfdefSJunchao Zhang             l  = opt->copy_length[i]*MBS;                                                                    \
22240e23c03SJunchao Zhang             for (j=0; j<l; j++) {                                                                            \
22340e23c03SJunchao Zhang               t    = u2[j];                                                                                  \
224*b23bfdefSJunchao Zhang               APPLY (u2[j],t,op,p2[j]);                                                                      \
225*b23bfdefSJunchao Zhang               FILTER(p2[j] = t);                                                                             \
22640e23c03SJunchao Zhang             }                                                                                                \
227*b23bfdefSJunchao Zhang             p2 += l;                                                                                         \
22840e23c03SJunchao Zhang           }                                                                                                  \
229*b23bfdefSJunchao Zhang         } else if (opt->type[r] == PETSCSF_PACKOPT_STRIDE) {                                                 \
230*b23bfdefSJunchao Zhang           u2   = u + idx[opt->offset[r]]*MBS;                                                                \
23140e23c03SJunchao Zhang           step = opt->stride_step[r];                                                                        \
23240e23c03SJunchao Zhang           for (i=0; i<opt->stride_n[r]; i++)                                                                 \
233*b23bfdefSJunchao Zhang             for (j=0; j<M; j++)                                                                              \
234*b23bfdefSJunchao Zhang               for (k=0; k<BS; k++) {                                                                         \
235*b23bfdefSJunchao Zhang                 t    = u2[i*step*MBS+j*BS+k];                                                                \
236*b23bfdefSJunchao Zhang                 APPLY (u2[i*step*MBS+j*BS+k],t,op,p2[i*MBS+j*BS+k]);                                         \
237*b23bfdefSJunchao Zhang                 FILTER(p2[i*MBS+j*BS+k] = t);                                                                \
23840e23c03SJunchao Zhang               }                                                                                              \
239*b23bfdefSJunchao Zhang         } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Unknown SFPack optimzation type %D",opt->type[r]);   \
24040e23c03SJunchao Zhang       }                                                                                                      \
24140e23c03SJunchao Zhang     }                                                                                                        \
24240e23c03SJunchao Zhang     PetscFunctionReturn(0);                                                                                  \
24340e23c03SJunchao Zhang   }
24440e23c03SJunchao Zhang 
24540e23c03SJunchao Zhang /* DEF_ActionAndXloc - macro defining a Unpack(Fetch)AndMaxloc(Minloc) routine
24640e23c03SJunchao Zhang 
24740e23c03SJunchao Zhang    Arguments:
24840e23c03SJunchao Zhang   +Action     Unpack or Fetch
24940e23c03SJunchao Zhang   .locname    Max or Min
25040e23c03SJunchao Zhang   .type1      Type of the first data in a pair type
25140e23c03SJunchao Zhang   .type2      Type of the second data in a pair type, usually PetscMPIInt for MPI ranks.
25240e23c03SJunchao Zhang   .op         > or <
25340e23c03SJunchao Zhang   .FILTER     Macro defining what to do with a statement, either EXECUTE or IGNORE
254*b23bfdefSJunchao Zhang   .CType      Type with or without the const qualifier, i.e., const PairType(Type1,Type2) or PairType(Type1,Type2)
255*b23bfdefSJunchao Zhang   -Cvoid      void with or without the const qualifier, i.e., const void or void
25640e23c03SJunchao Zhang  */
257*b23bfdefSJunchao Zhang #define DEF_ActionAndXloc(action,locname,Type1,Type2,op,FILTER,CType,Cvoid) \
258*b23bfdefSJunchao Zhang   static PetscErrorCode CPPJoin4(action##And##locname##loc,PairType(Type1,Type2),1,1)(PetscInt count,const PetscInt *idx,PetscInt bs,PetscSFPackOpt opt,void *unpacked,Cvoid *packed) { \
259*b23bfdefSJunchao Zhang     PairType(Type1,Type2) *u = (PairType(Type1,Type2)*)unpacked;                                             \
260*b23bfdefSJunchao Zhang     CType                 *p = (CType*)packed;                                                               \
261*b23bfdefSJunchao Zhang     PetscInt              i,j;                                                                               \
262*b23bfdefSJunchao Zhang     for (i=0; i<count; i++) {                                                                                \
263*b23bfdefSJunchao Zhang       j = idx? idx[i] : i;                                                                                   \
264*b23bfdefSJunchao Zhang       FILTER(PairType(Type1,Type2) v = u[j]);                                                                \
26540e23c03SJunchao Zhang       if (p[i].a op u[j].a) {                                                                                \
26640e23c03SJunchao Zhang         u[j] = p[i];                                                                                         \
26740e23c03SJunchao Zhang       } else if (p[i].a == u[j].a) {                                                                         \
26840e23c03SJunchao Zhang         u[j].b = PetscMin(u[j].b,p[i].b); /* Minimal rank. Ref MPI MAXLOC */                                 \
26940e23c03SJunchao Zhang       }                                                                                                      \
27040e23c03SJunchao Zhang       FILTER(p[i] = v);                                                                                      \
27140e23c03SJunchao Zhang     }                                                                                                        \
27240e23c03SJunchao Zhang     PetscFunctionReturn(0);                                                                                  \
27340e23c03SJunchao Zhang   }
27440e23c03SJunchao Zhang 
275*b23bfdefSJunchao Zhang /* Pack, Unpack/Fetch ops */
276*b23bfdefSJunchao Zhang #define DEF_Pack(Type,BS,EQ)                                                                   \
277*b23bfdefSJunchao Zhang   DEF_PackFunc(Type,BS,EQ)                                                                     \
278*b23bfdefSJunchao Zhang   DEF_Action(Unpack,Type,BS,EQ,IGNORE,const Type,const void)                                   \
279*b23bfdefSJunchao Zhang   DEF_Action(Fetch, Type,BS,EQ,EXECUTE,Type,void)                                              \
280*b23bfdefSJunchao Zhang   static void CPPJoin4(PackInit_Pack,Type,BS,EQ)(PetscSFPack link) {                           \
281*b23bfdefSJunchao Zhang     link->Pack            = CPPJoin4(Pack,           Type,BS,EQ);                              \
282*b23bfdefSJunchao Zhang     link->UnpackAndInsert = CPPJoin4(UnpackAndInsert,Type,BS,EQ);                              \
283*b23bfdefSJunchao Zhang     link->FetchAndInsert  = CPPJoin4(FetchAndInsert, Type,BS,EQ);                              \
284*b23bfdefSJunchao Zhang     link->unitbytes       = sizeof(Type);                                                      \
28540e23c03SJunchao Zhang   }
28640e23c03SJunchao Zhang 
287*b23bfdefSJunchao Zhang /* Add, Mult ops */
288*b23bfdefSJunchao Zhang #define DEF_Add(Type,BS,EQ)                                                                    \
289*b23bfdefSJunchao Zhang   DEF_ActionAndOp(Unpack,Add, Type,BS,EQ,+,BINARY_OP,IGNORE,const Type,const void)             \
290*b23bfdefSJunchao Zhang   DEF_ActionAndOp(Unpack,Mult,Type,BS,EQ,*,BINARY_OP,IGNORE,const Type,const void)             \
291*b23bfdefSJunchao Zhang   DEF_ActionAndOp(Fetch, Add, Type,BS,EQ,+,BINARY_OP,EXECUTE,Type,void)                        \
292*b23bfdefSJunchao Zhang   DEF_ActionAndOp(Fetch, Mult,Type,BS,EQ,*,BINARY_OP,EXECUTE,Type,void)                        \
293*b23bfdefSJunchao Zhang   static void CPPJoin4(PackInit_Add,Type,BS,EQ)(PetscSFPack link) {                            \
294*b23bfdefSJunchao Zhang     link->UnpackAndAdd    = CPPJoin4(UnpackAndAdd,   Type,BS,EQ);                              \
295*b23bfdefSJunchao Zhang     link->UnpackAndMult   = CPPJoin4(UnpackAndMult,  Type,BS,EQ);                              \
296*b23bfdefSJunchao Zhang     link->FetchAndAdd     = CPPJoin4(FetchAndAdd,    Type,BS,EQ);                              \
297*b23bfdefSJunchao Zhang     link->FetchAndMult    = CPPJoin4(FetchAndMult,   Type,BS,EQ);                              \
29840e23c03SJunchao Zhang   }
29940e23c03SJunchao Zhang 
300*b23bfdefSJunchao Zhang /* Max, Min ops */
301*b23bfdefSJunchao Zhang #define DEF_Cmp(Type,BS,EQ)                                                                    \
302*b23bfdefSJunchao Zhang   DEF_ActionAndOp(Unpack,Max,Type,BS,EQ,PetscMax,FUNCTION_OP,IGNORE,const Type,const void)     \
303*b23bfdefSJunchao Zhang   DEF_ActionAndOp(Unpack,Min,Type,BS,EQ,PetscMin,FUNCTION_OP,IGNORE,const Type,const void)     \
304*b23bfdefSJunchao Zhang   DEF_ActionAndOp(Fetch, Max,Type,BS,EQ,PetscMax,FUNCTION_OP,EXECUTE,Type,void)                \
305*b23bfdefSJunchao Zhang   DEF_ActionAndOp(Fetch, Min,Type,BS,EQ,PetscMin,FUNCTION_OP,EXECUTE,Type,void)                \
306*b23bfdefSJunchao Zhang   static void CPPJoin4(PackInit_Compare,Type,BS,EQ)(PetscSFPack link) {                        \
307*b23bfdefSJunchao Zhang     link->UnpackAndMax    = CPPJoin4(UnpackAndMax,   Type,BS,EQ);                              \
308*b23bfdefSJunchao Zhang     link->UnpackAndMin    = CPPJoin4(UnpackAndMin,   Type,BS,EQ);                              \
309*b23bfdefSJunchao Zhang     link->FetchAndMax     = CPPJoin4(FetchAndMax ,   Type,BS,EQ);                              \
310*b23bfdefSJunchao Zhang     link->FetchAndMin     = CPPJoin4(FetchAndMin ,   Type,BS,EQ);                              \
311*b23bfdefSJunchao Zhang   }
312*b23bfdefSJunchao Zhang 
313*b23bfdefSJunchao Zhang /* Logical ops.
314*b23bfdefSJunchao Zhang   The operator in LXOR_OP should be empty but is &. It is not used. Put here to avoid
31540e23c03SJunchao Zhang   the compilation warning "empty macro arguments are undefined in ISO C90"
31640e23c03SJunchao Zhang  */
317*b23bfdefSJunchao Zhang #define DEF_Log(Type,BS,EQ)                                                                    \
318*b23bfdefSJunchao Zhang   DEF_ActionAndOp(Unpack,LAND,Type,BS,EQ,&&,BINARY_OP,IGNORE,const Type,const void)            \
319*b23bfdefSJunchao Zhang   DEF_ActionAndOp(Unpack,LOR, Type,BS,EQ,||,BINARY_OP,IGNORE,const Type,const void)            \
320*b23bfdefSJunchao Zhang   DEF_ActionAndOp(Unpack,LXOR,Type,BS,EQ,&, LXOR_OP,  IGNORE,const Type,const void)            \
321*b23bfdefSJunchao Zhang   DEF_ActionAndOp(Fetch, LAND,Type,BS,EQ,&&,BINARY_OP,EXECUTE,Type,void)                       \
322*b23bfdefSJunchao Zhang   DEF_ActionAndOp(Fetch, LOR, Type,BS,EQ,||,BINARY_OP,EXECUTE,Type,void)                       \
323*b23bfdefSJunchao Zhang   DEF_ActionAndOp(Fetch, LXOR,Type,BS,EQ,&, LXOR_OP,  EXECUTE,Type,void)                       \
324*b23bfdefSJunchao Zhang   static void CPPJoin4(PackInit_Logical,Type,BS,EQ)(PetscSFPack link) {                        \
325*b23bfdefSJunchao Zhang     link->UnpackAndLAND   = CPPJoin4(UnpackAndLAND,Type,BS,EQ);                                \
326*b23bfdefSJunchao Zhang     link->UnpackAndLOR    = CPPJoin4(UnpackAndLOR, Type,BS,EQ);                                \
327*b23bfdefSJunchao Zhang     link->UnpackAndLXOR   = CPPJoin4(UnpackAndLXOR,Type,BS,EQ);                                \
328*b23bfdefSJunchao Zhang     link->FetchAndLAND    = CPPJoin4(FetchAndLAND, Type,BS,EQ);                                \
329*b23bfdefSJunchao Zhang     link->FetchAndLOR     = CPPJoin4(FetchAndLOR,  Type,BS,EQ);                                \
330*b23bfdefSJunchao Zhang     link->FetchAndLXOR    = CPPJoin4(FetchAndLXOR, Type,BS,EQ);                                \
33140e23c03SJunchao Zhang   }
33240e23c03SJunchao Zhang 
333*b23bfdefSJunchao Zhang /* Bitwise ops */
334*b23bfdefSJunchao Zhang #define DEF_Bit(Type,BS,EQ)                                                                    \
335*b23bfdefSJunchao Zhang   DEF_ActionAndOp(Unpack,BAND,Type,BS,EQ,&,BINARY_OP,IGNORE,const Type,const void)             \
336*b23bfdefSJunchao Zhang   DEF_ActionAndOp(Unpack,BOR, Type,BS,EQ,|,BINARY_OP,IGNORE,const Type,const void)             \
337*b23bfdefSJunchao Zhang   DEF_ActionAndOp(Unpack,BXOR,Type,BS,EQ,^,BINARY_OP,IGNORE,const Type,const void)             \
338*b23bfdefSJunchao Zhang   DEF_ActionAndOp(Fetch, BAND,Type,BS,EQ,&,BINARY_OP,EXECUTE,Type,void)                        \
339*b23bfdefSJunchao Zhang   DEF_ActionAndOp(Fetch, BOR, Type,BS,EQ,|,BINARY_OP,EXECUTE,Type,void)                        \
340*b23bfdefSJunchao Zhang   DEF_ActionAndOp(Fetch, BXOR,Type,BS,EQ,^,BINARY_OP,EXECUTE,Type,void)                        \
341*b23bfdefSJunchao Zhang   static void CPPJoin4(PackInit_Bitwise,Type,BS,EQ)(PetscSFPack link) {                        \
342*b23bfdefSJunchao Zhang     link->UnpackAndBAND   = CPPJoin4(UnpackAndBAND,Type,BS,EQ);                                \
343*b23bfdefSJunchao Zhang     link->UnpackAndBOR    = CPPJoin4(UnpackAndBOR, Type,BS,EQ);                                \
344*b23bfdefSJunchao Zhang     link->UnpackAndBXOR   = CPPJoin4(UnpackAndBXOR,Type,BS,EQ);                                \
345*b23bfdefSJunchao Zhang     link->FetchAndBAND    = CPPJoin4(FetchAndBAND, Type,BS,EQ);                                \
346*b23bfdefSJunchao Zhang     link->FetchAndBOR     = CPPJoin4(FetchAndBOR,  Type,BS,EQ);                                \
347*b23bfdefSJunchao Zhang     link->FetchAndBXOR    = CPPJoin4(FetchAndBXOR, Type,BS,EQ);                                \
34840e23c03SJunchao Zhang   }
34940e23c03SJunchao Zhang 
350*b23bfdefSJunchao Zhang /* Maxloc, Minloc */
351*b23bfdefSJunchao Zhang #define DEF_Xloc(Type1,Type2)                                                                  \
352*b23bfdefSJunchao Zhang   DEF_ActionAndXloc(Unpack,Max,Type1,Type2,>,IGNORE,const PairType(Type1,Type2),const void)    \
353*b23bfdefSJunchao Zhang   DEF_ActionAndXloc(Unpack,Min,Type1,Type2,<,IGNORE,const PairType(Type1,Type2),const void)    \
354*b23bfdefSJunchao Zhang   DEF_ActionAndXloc(Fetch, Max,Type1,Type2,>,EXECUTE,PairType(Type1,Type2),void)               \
355*b23bfdefSJunchao Zhang   DEF_ActionAndXloc(Fetch, Min,Type1,Type2,<,EXECUTE,PairType(Type1,Type2),void)               \
356*b23bfdefSJunchao Zhang   static void CPPJoin3(PackInit_Xloc,Type1,Type2)(PetscSFPack link) {                          \
357*b23bfdefSJunchao Zhang     link->UnpackAndMaxloc = CPPJoin4(UnpackAndMaxloc,PairType(Type1,Type2),1,1);               \
358*b23bfdefSJunchao Zhang     link->UnpackAndMinloc = CPPJoin4(UnpackAndMinloc,PairType(Type1,Type2),1,1);               \
359*b23bfdefSJunchao Zhang     link->FetchAndMaxloc  = CPPJoin4(FetchAndMaxloc, PairType(Type1,Type2),1,1);               \
360*b23bfdefSJunchao Zhang     link->FetchAndMinloc  = CPPJoin4(FetchAndMinloc, PairType(Type1,Type2),1,1);               \
36140e23c03SJunchao Zhang   }
36240e23c03SJunchao Zhang 
363*b23bfdefSJunchao Zhang #define DEF_IntegerType(Type,BS,EQ)                                                            \
364*b23bfdefSJunchao Zhang   DEF_Pack(Type,BS,EQ)                                                                         \
365*b23bfdefSJunchao Zhang   DEF_Add(Type,BS,EQ)                                                                          \
366*b23bfdefSJunchao Zhang   DEF_Cmp(Type,BS,EQ)                                                                          \
367*b23bfdefSJunchao Zhang   DEF_Log(Type,BS,EQ)                                                                          \
368*b23bfdefSJunchao Zhang   DEF_Bit(Type,BS,EQ)                                                                          \
369*b23bfdefSJunchao Zhang   static void CPPJoin4(PackInit_IntegerType,Type,BS,EQ)(PetscSFPack link) {                    \
370*b23bfdefSJunchao Zhang     CPPJoin4(PackInit_Pack,Type,BS,EQ)(link);                                                  \
371*b23bfdefSJunchao Zhang     CPPJoin4(PackInit_Add,Type,BS,EQ)(link);                                                   \
372*b23bfdefSJunchao Zhang     CPPJoin4(PackInit_Compare,Type,BS,EQ)(link);                                               \
373*b23bfdefSJunchao Zhang     CPPJoin4(PackInit_Logical,Type,BS,EQ)(link);                                               \
374*b23bfdefSJunchao Zhang     CPPJoin4(PackInit_Bitwise,Type,BS,EQ)(link);                                               \
37540e23c03SJunchao Zhang   }
37640e23c03SJunchao Zhang 
377*b23bfdefSJunchao Zhang #define DEF_RealType(Type,BS,EQ)                                                               \
378*b23bfdefSJunchao Zhang   DEF_Pack(Type,BS,EQ)                                                                         \
379*b23bfdefSJunchao Zhang   DEF_Add(Type,BS,EQ)                                                                          \
380*b23bfdefSJunchao Zhang   DEF_Cmp(Type,BS,EQ)                                                                          \
381*b23bfdefSJunchao Zhang   DEF_Log(Type,BS,EQ)                                                                          \
382*b23bfdefSJunchao Zhang   static void CPPJoin4(PackInit_RealType,Type,BS,EQ)(PetscSFPack link) {                       \
383*b23bfdefSJunchao Zhang     CPPJoin4(PackInit_Pack,Type,BS,EQ)(link);                                                  \
384*b23bfdefSJunchao Zhang     CPPJoin4(PackInit_Add,Type,BS,EQ)(link);                                                   \
385*b23bfdefSJunchao Zhang     CPPJoin4(PackInit_Compare,Type,BS,EQ)(link);                                               \
386*b23bfdefSJunchao Zhang     CPPJoin4(PackInit_Logical,Type,BS,EQ)(link);                                               \
387*b23bfdefSJunchao Zhang   }
38840e23c03SJunchao Zhang 
38940e23c03SJunchao Zhang #if defined(PETSC_HAVE_COMPLEX)
390*b23bfdefSJunchao Zhang #define DEF_ComplexType(Type,BS,EQ)                                                            \
391*b23bfdefSJunchao Zhang   DEF_Pack(Type,BS,EQ)                                                                         \
392*b23bfdefSJunchao Zhang   DEF_Add(Type,BS,EQ)                                                                          \
393*b23bfdefSJunchao Zhang   static void CPPJoin4(PackInit_ComplexType,Type,BS,EQ)(PetscSFPack link) {                    \
394*b23bfdefSJunchao Zhang     CPPJoin4(PackInit_Pack,Type,BS,EQ)(link);                                                  \
395*b23bfdefSJunchao Zhang     CPPJoin4(PackInit_Add,Type,BS,EQ)(link);                                                   \
396*b23bfdefSJunchao Zhang   }
39740e23c03SJunchao Zhang #endif
398*b23bfdefSJunchao Zhang 
399*b23bfdefSJunchao Zhang #define DEF_DumbType(Type,BS,EQ)                                                               \
400*b23bfdefSJunchao Zhang   DEF_Pack(Type,BS,EQ)                                                                         \
401*b23bfdefSJunchao Zhang   static void CPPJoin4(PackInit_DumbType,Type,BS,EQ)(PetscSFPack link) {                       \
402*b23bfdefSJunchao Zhang     CPPJoin4(PackInit_Pack,Type,BS,EQ)(link);                                                  \
403*b23bfdefSJunchao Zhang   }
404*b23bfdefSJunchao Zhang 
405*b23bfdefSJunchao Zhang /* Maxloc, Minloc */
406*b23bfdefSJunchao Zhang #define DEF_PairType(Type1,Type2)                                                              \
407*b23bfdefSJunchao Zhang   typedef struct {Type1 a; Type2 b;} PairType(Type1,Type2);                                    \
408*b23bfdefSJunchao Zhang   DEF_Pack(PairType(Type1,Type2),1,1)                                                          \
409*b23bfdefSJunchao Zhang   DEF_Xloc(Type1,Type2)                                                                        \
410*b23bfdefSJunchao Zhang   static void CPPJoin3(PackInit_PairType,Type1,Type2)(PetscSFPack link) {                      \
411*b23bfdefSJunchao Zhang     CPPJoin4(PackInit_Pack,PairType(Type1,Type2),1,1)(link);                                   \
412*b23bfdefSJunchao Zhang     CPPJoin3(PackInit_Xloc,Type1,Type2)(link);                                                 \
413*b23bfdefSJunchao Zhang   }
414*b23bfdefSJunchao Zhang 
415*b23bfdefSJunchao Zhang DEF_IntegerType(PetscInt,1,1) /* unit = 1 MPIU_INT  */
416*b23bfdefSJunchao Zhang DEF_IntegerType(PetscInt,2,1) /* unit = 2 MPIU_INTs */
417*b23bfdefSJunchao Zhang DEF_IntegerType(PetscInt,4,1) /* unit = 4 MPIU_INTs */
418*b23bfdefSJunchao Zhang DEF_IntegerType(PetscInt,8,1) /* unit = 8 MPIU_INTs */
419*b23bfdefSJunchao Zhang DEF_IntegerType(PetscInt,1,0) /* unit = 1*n MPIU_INTs, n>1 */
420*b23bfdefSJunchao Zhang DEF_IntegerType(PetscInt,2,0) /* unit = 2*n MPIU_INTs, n>1 */
421*b23bfdefSJunchao Zhang DEF_IntegerType(PetscInt,4,0) /* unit = 4*n MPIU_INTs, n>1 */
422*b23bfdefSJunchao Zhang DEF_IntegerType(PetscInt,8,0) /* unit = 8*n MPIU_INTs, n>1. Routines with bigger BS are tried first. */
423*b23bfdefSJunchao Zhang 
424*b23bfdefSJunchao Zhang #if defined(PETSC_USE_64BIT_INDICES) /* Do not need (though it is OK) to generate redundant functions if PetscInt is int */
425*b23bfdefSJunchao Zhang DEF_IntegerType(int,1,1)
426*b23bfdefSJunchao Zhang DEF_IntegerType(int,2,1)
427*b23bfdefSJunchao Zhang DEF_IntegerType(int,4,1)
428*b23bfdefSJunchao Zhang DEF_IntegerType(int,8,1)
429*b23bfdefSJunchao Zhang DEF_IntegerType(int,1,0)
430*b23bfdefSJunchao Zhang DEF_IntegerType(int,2,0)
431*b23bfdefSJunchao Zhang DEF_IntegerType(int,4,0)
432*b23bfdefSJunchao Zhang DEF_IntegerType(int,8,0)
433*b23bfdefSJunchao Zhang #endif
434*b23bfdefSJunchao Zhang 
435*b23bfdefSJunchao Zhang /* The typedefs are used to get a typename without space that CPPJoin can handle */
436*b23bfdefSJunchao Zhang typedef signed char SignedChar;
437*b23bfdefSJunchao Zhang DEF_IntegerType(SignedChar,1,1)
438*b23bfdefSJunchao Zhang DEF_IntegerType(SignedChar,2,1)
439*b23bfdefSJunchao Zhang DEF_IntegerType(SignedChar,4,1)
440*b23bfdefSJunchao Zhang DEF_IntegerType(SignedChar,8,1)
441*b23bfdefSJunchao Zhang DEF_IntegerType(SignedChar,1,0)
442*b23bfdefSJunchao Zhang DEF_IntegerType(SignedChar,2,0)
443*b23bfdefSJunchao Zhang DEF_IntegerType(SignedChar,4,0)
444*b23bfdefSJunchao Zhang DEF_IntegerType(SignedChar,8,0)
445*b23bfdefSJunchao Zhang 
446*b23bfdefSJunchao Zhang typedef unsigned char UnsignedChar;
447*b23bfdefSJunchao Zhang DEF_IntegerType(UnsignedChar,1,1)
448*b23bfdefSJunchao Zhang DEF_IntegerType(UnsignedChar,2,1)
449*b23bfdefSJunchao Zhang DEF_IntegerType(UnsignedChar,4,1)
450*b23bfdefSJunchao Zhang DEF_IntegerType(UnsignedChar,8,1)
451*b23bfdefSJunchao Zhang DEF_IntegerType(UnsignedChar,1,0)
452*b23bfdefSJunchao Zhang DEF_IntegerType(UnsignedChar,2,0)
453*b23bfdefSJunchao Zhang DEF_IntegerType(UnsignedChar,4,0)
454*b23bfdefSJunchao Zhang DEF_IntegerType(UnsignedChar,8,0)
455*b23bfdefSJunchao Zhang 
456*b23bfdefSJunchao Zhang DEF_RealType(PetscReal,1,1)
457*b23bfdefSJunchao Zhang DEF_RealType(PetscReal,2,1)
458*b23bfdefSJunchao Zhang DEF_RealType(PetscReal,4,1)
459*b23bfdefSJunchao Zhang DEF_RealType(PetscReal,8,1)
460*b23bfdefSJunchao Zhang DEF_RealType(PetscReal,1,0)
461*b23bfdefSJunchao Zhang DEF_RealType(PetscReal,2,0)
462*b23bfdefSJunchao Zhang DEF_RealType(PetscReal,4,0)
463*b23bfdefSJunchao Zhang DEF_RealType(PetscReal,8,0)
464*b23bfdefSJunchao Zhang 
465*b23bfdefSJunchao Zhang #if defined(PETSC_HAVE_COMPLEX)
466*b23bfdefSJunchao Zhang DEF_ComplexType(PetscComplex,1,1)
467*b23bfdefSJunchao Zhang DEF_ComplexType(PetscComplex,2,1)
468*b23bfdefSJunchao Zhang DEF_ComplexType(PetscComplex,4,1)
469*b23bfdefSJunchao Zhang DEF_ComplexType(PetscComplex,8,1)
470*b23bfdefSJunchao Zhang DEF_ComplexType(PetscComplex,1,0)
471*b23bfdefSJunchao Zhang DEF_ComplexType(PetscComplex,2,0)
472*b23bfdefSJunchao Zhang DEF_ComplexType(PetscComplex,4,0)
473*b23bfdefSJunchao Zhang DEF_ComplexType(PetscComplex,8,0)
474*b23bfdefSJunchao Zhang #endif
475*b23bfdefSJunchao Zhang 
476*b23bfdefSJunchao Zhang DEF_PairType(int,int)
477*b23bfdefSJunchao Zhang DEF_PairType(PetscInt,PetscInt)
478*b23bfdefSJunchao Zhang 
479*b23bfdefSJunchao Zhang /* If we don't know the basic type, we treat it as a stream of chars or ints */
480*b23bfdefSJunchao Zhang DEF_DumbType(char,1,1)
481*b23bfdefSJunchao Zhang DEF_DumbType(char,2,1)
482*b23bfdefSJunchao Zhang DEF_DumbType(char,4,1)
483*b23bfdefSJunchao Zhang DEF_DumbType(char,1,0)
484*b23bfdefSJunchao Zhang DEF_DumbType(char,2,0)
485*b23bfdefSJunchao Zhang DEF_DumbType(char,4,0)
486*b23bfdefSJunchao Zhang 
487*b23bfdefSJunchao Zhang typedef int DumbInt; /* To differentiate with regular int used above */
488*b23bfdefSJunchao Zhang DEF_DumbType(DumbInt,1,1)
489*b23bfdefSJunchao Zhang DEF_DumbType(DumbInt,2,1)
490*b23bfdefSJunchao Zhang DEF_DumbType(DumbInt,4,1)
491*b23bfdefSJunchao Zhang DEF_DumbType(DumbInt,8,1)
492*b23bfdefSJunchao Zhang DEF_DumbType(DumbInt,1,0)
493*b23bfdefSJunchao Zhang DEF_DumbType(DumbInt,2,0)
494*b23bfdefSJunchao Zhang DEF_DumbType(DumbInt,4,0)
495*b23bfdefSJunchao Zhang DEF_DumbType(DumbInt,8,0)
49640e23c03SJunchao Zhang 
49740e23c03SJunchao Zhang #if !defined(PETSC_HAVE_MPI_TYPE_DUP)
49840e23c03SJunchao Zhang PETSC_STATIC_INLINE int MPI_Type_dup(MPI_Datatype datatype,MPI_Datatype *newtype)
49940e23c03SJunchao Zhang {
50040e23c03SJunchao Zhang   int ierr;
50140e23c03SJunchao Zhang   ierr = MPI_Type_contiguous(1,datatype,newtype); if (ierr) return ierr;
50240e23c03SJunchao Zhang   ierr = MPI_Type_commit(newtype); if (ierr) return ierr;
50340e23c03SJunchao Zhang   return MPI_SUCCESS;
50440e23c03SJunchao Zhang }
50540e23c03SJunchao Zhang #endif
50640e23c03SJunchao Zhang 
5079d1c8addSJunchao Zhang PetscErrorCode PetscSFPackGetInUse(PetscSF sf,MPI_Datatype unit,const void *rkey,const void *lkey,PetscCopyMode cmode,PetscSFPack *mylink)
50840e23c03SJunchao Zhang {
50940e23c03SJunchao Zhang   PetscErrorCode    ierr;
51040e23c03SJunchao Zhang   PetscSFPack       link,*p;
51140e23c03SJunchao Zhang   PetscSF_Basic     *bas=(PetscSF_Basic*)sf->data;
51240e23c03SJunchao Zhang 
51340e23c03SJunchao Zhang   PetscFunctionBegin;
51440e23c03SJunchao Zhang   /* Look for types in cache */
51540e23c03SJunchao Zhang   for (p=&bas->inuse; (link=*p); p=&link->next) {
51640e23c03SJunchao Zhang     PetscBool match;
51740e23c03SJunchao Zhang     ierr = MPIPetsc_Type_compare(unit,link->unit,&match);CHKERRQ(ierr);
5189d1c8addSJunchao Zhang     if (match && (rkey == link->rkey) && (lkey == link->lkey)) {
51940e23c03SJunchao Zhang       switch (cmode) {
52040e23c03SJunchao Zhang       case PETSC_OWN_POINTER: *p = link->next; break; /* Remove from inuse list */
52140e23c03SJunchao Zhang       case PETSC_USE_POINTER: break;
52240e23c03SJunchao Zhang       default: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"invalid cmode");
52340e23c03SJunchao Zhang       }
52440e23c03SJunchao Zhang       *mylink = link;
52540e23c03SJunchao Zhang       PetscFunctionReturn(0);
52640e23c03SJunchao Zhang     }
52740e23c03SJunchao Zhang   }
52840e23c03SJunchao Zhang   SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Could not find pack");
52940e23c03SJunchao Zhang   PetscFunctionReturn(0);
53040e23c03SJunchao Zhang }
53140e23c03SJunchao Zhang 
53240e23c03SJunchao Zhang PetscErrorCode PetscSFPackReclaim(PetscSF sf,PetscSFPack *link)
53340e23c03SJunchao Zhang {
53440e23c03SJunchao Zhang   PetscSF_Basic     *bas=(PetscSF_Basic*)sf->data;
53540e23c03SJunchao Zhang 
53640e23c03SJunchao Zhang   PetscFunctionBegin;
5379d1c8addSJunchao Zhang   (*link)->rkey = NULL;
5389d1c8addSJunchao Zhang   (*link)->lkey = NULL;
53940e23c03SJunchao Zhang   (*link)->next = bas->avail;
54040e23c03SJunchao Zhang   bas->avail    = *link;
54140e23c03SJunchao Zhang   *link         = NULL;
54240e23c03SJunchao Zhang   PetscFunctionReturn(0);
54340e23c03SJunchao Zhang }
54440e23c03SJunchao Zhang 
5459d1c8addSJunchao Zhang /* Error out on unsupported overlapped communications */
5469d1c8addSJunchao Zhang PetscErrorCode PetscSFPackSetErrorOnUnsupportedOverlap(PetscSF sf,MPI_Datatype unit,const void *rkey,const void *lkey)
5479d1c8addSJunchao Zhang {
5489d1c8addSJunchao Zhang   PetscErrorCode    ierr;
5499d1c8addSJunchao Zhang   PetscSFPack       link,*p;
5509d1c8addSJunchao Zhang   PetscSF_Basic     *bas=(PetscSF_Basic*)sf->data;
5519d1c8addSJunchao Zhang   PetscBool         match;
5529d1c8addSJunchao Zhang 
5539d1c8addSJunchao Zhang   PetscFunctionBegin;
55418fb5014SJunchao Zhang   /* Look up links in use and error out if there is a match. When both rootdata and leafdata are NULL, ignore
55518fb5014SJunchao Zhang      the potential overlapping since this process does not participate in communication. Overlapping is harmless.
55618fb5014SJunchao Zhang   */
55718fb5014SJunchao Zhang   if (rkey || lkey) {
5589d1c8addSJunchao Zhang     for (p=&bas->inuse; (link=*p); p=&link->next) {
5599d1c8addSJunchao Zhang       ierr = MPIPetsc_Type_compare(unit,link->unit,&match);CHKERRQ(ierr);
56033c49614SJunchao Zhang       if (match && (rkey == link->rkey) && (lkey == link->lkey)) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for overlapped PetscSF communications with the same SF, rootdata(%p), leafdata(%p) and data type. You can undo the overlap to avoid the error.",rkey,lkey);
5619d1c8addSJunchao Zhang     }
56218fb5014SJunchao Zhang   }
5639d1c8addSJunchao Zhang   PetscFunctionReturn(0);
5649d1c8addSJunchao Zhang }
5659d1c8addSJunchao Zhang 
56640e23c03SJunchao Zhang PetscErrorCode PetscSFPackSetupType(PetscSFPack link,MPI_Datatype unit)
56740e23c03SJunchao Zhang {
56840e23c03SJunchao Zhang   PetscErrorCode ierr;
569*b23bfdefSJunchao Zhang   PetscInt       nSignedChar=0,nUnsignedChar=0,nInt=0,nPetscInt=0,nPetscReal=0;
570*b23bfdefSJunchao Zhang   PetscBool      is2Int,is2PetscInt;
57140e23c03SJunchao Zhang   PetscMPIInt    ni,na,nd,combiner;
57240e23c03SJunchao Zhang #if defined(PETSC_HAVE_COMPLEX)
573*b23bfdefSJunchao Zhang   PetscInt       nPetscComplex=0;
57440e23c03SJunchao Zhang #endif
57540e23c03SJunchao Zhang 
57640e23c03SJunchao Zhang   PetscFunctionBegin;
577*b23bfdefSJunchao Zhang   ierr = MPIPetsc_Type_compare_contig(unit,MPI_SIGNED_CHAR,  &nSignedChar);CHKERRQ(ierr);
578*b23bfdefSJunchao Zhang   ierr = MPIPetsc_Type_compare_contig(unit,MPI_UNSIGNED_CHAR,&nUnsignedChar);CHKERRQ(ierr);
579*b23bfdefSJunchao Zhang   /* MPI_CHAR is treated below as a dumb type that does not support reduction according to MPI standard */
580*b23bfdefSJunchao Zhang   ierr = MPIPetsc_Type_compare_contig(unit,MPI_INT,  &nInt);CHKERRQ(ierr);
581*b23bfdefSJunchao Zhang   ierr = MPIPetsc_Type_compare_contig(unit,MPIU_INT, &nPetscInt);CHKERRQ(ierr);
582*b23bfdefSJunchao Zhang   ierr = MPIPetsc_Type_compare_contig(unit,MPIU_REAL,&nPetscReal);CHKERRQ(ierr);
58340e23c03SJunchao Zhang #if defined(PETSC_HAVE_COMPLEX)
584*b23bfdefSJunchao Zhang   ierr = MPIPetsc_Type_compare_contig(unit,MPIU_COMPLEX,&nPetscComplex);CHKERRQ(ierr);
58540e23c03SJunchao Zhang #endif
58640e23c03SJunchao Zhang   ierr = MPIPetsc_Type_compare(unit,MPI_2INT,&is2Int);CHKERRQ(ierr);
58740e23c03SJunchao Zhang   ierr = MPIPetsc_Type_compare(unit,MPIU_2INT,&is2PetscInt);CHKERRQ(ierr);
588*b23bfdefSJunchao Zhang   /* TODO: shaell we also handle Fortran MPI_2REAL? */
58940e23c03SJunchao Zhang   ierr = MPI_Type_get_envelope(unit,&ni,&na,&nd,&combiner);CHKERRQ(ierr);
59040e23c03SJunchao Zhang   link->isbuiltin = (combiner == MPI_COMBINER_NAMED) ? PETSC_TRUE : PETSC_FALSE;
591*b23bfdefSJunchao Zhang   link->bs = 1; /* default */
59240e23c03SJunchao Zhang 
593*b23bfdefSJunchao Zhang   if (nPetscReal) {
594*b23bfdefSJunchao Zhang     if      (nPetscReal == 8) PackInit_RealType_PetscReal_8_1(link); else if (nPetscReal%8 == 0) PackInit_RealType_PetscReal_8_0(link);
595*b23bfdefSJunchao Zhang     else if (nPetscReal == 4) PackInit_RealType_PetscReal_4_1(link); else if (nPetscReal%4 == 0) PackInit_RealType_PetscReal_4_0(link);
596*b23bfdefSJunchao Zhang     else if (nPetscReal == 2) PackInit_RealType_PetscReal_2_1(link); else if (nPetscReal%2 == 0) PackInit_RealType_PetscReal_2_0(link);
597*b23bfdefSJunchao Zhang     else if (nPetscReal == 1) PackInit_RealType_PetscReal_1_1(link); else if (nPetscReal%1 == 0) PackInit_RealType_PetscReal_1_0(link);
598*b23bfdefSJunchao Zhang     link->bs         = nPetscReal;
599*b23bfdefSJunchao Zhang     link->unitbytes *= nPetscReal;
60040e23c03SJunchao Zhang     link->basicunit  = MPIU_REAL;
601*b23bfdefSJunchao Zhang   } else if (nPetscInt) {
602*b23bfdefSJunchao Zhang     if      (nPetscInt == 8) PackInit_IntegerType_PetscInt_8_1(link); else if (nPetscInt%8 == 0) PackInit_IntegerType_PetscInt_8_0(link);
603*b23bfdefSJunchao Zhang     else if (nPetscInt == 4) PackInit_IntegerType_PetscInt_4_1(link); else if (nPetscInt%4 == 0) PackInit_IntegerType_PetscInt_4_0(link);
604*b23bfdefSJunchao Zhang     else if (nPetscInt == 2) PackInit_IntegerType_PetscInt_2_1(link); else if (nPetscInt%2 == 0) PackInit_IntegerType_PetscInt_2_0(link);
605*b23bfdefSJunchao Zhang     else if (nPetscInt == 1) PackInit_IntegerType_PetscInt_1_1(link); else if (nPetscInt%1 == 0) PackInit_IntegerType_PetscInt_1_0(link);
606*b23bfdefSJunchao Zhang     link->bs         = nPetscInt;
607*b23bfdefSJunchao Zhang     link->unitbytes *= nPetscInt;
608*b23bfdefSJunchao Zhang     link->basicunit  = MPIU_INT;
609*b23bfdefSJunchao Zhang #if defined(PETSC_USE_64BIT_INDICES)
610*b23bfdefSJunchao Zhang   } else if (nInt) {
611*b23bfdefSJunchao Zhang     if      (nInt == 8) PackInit_IntegerType_int_8_1(link); else if (nInt%8 == 0) PackInit_IntegerType_int_8_0(link);
612*b23bfdefSJunchao Zhang     else if (nInt == 4) PackInit_IntegerType_int_4_1(link); else if (nInt%4 == 0) PackInit_IntegerType_int_4_0(link);
613*b23bfdefSJunchao Zhang     else if (nInt == 2) PackInit_IntegerType_int_2_1(link); else if (nInt%2 == 0) PackInit_IntegerType_int_2_0(link);
614*b23bfdefSJunchao Zhang     else if (nInt == 1) PackInit_IntegerType_int_1_1(link); else if (nInt%1 == 0) PackInit_IntegerType_int_1_0(link);
615*b23bfdefSJunchao Zhang     link->bs         = nInt;
616*b23bfdefSJunchao Zhang     link->unitbytes *= nInt;
617*b23bfdefSJunchao Zhang     link->basicunit  = MPI_INT;
618*b23bfdefSJunchao Zhang #endif
619*b23bfdefSJunchao Zhang   } else if (nSignedChar) {
620*b23bfdefSJunchao Zhang     if      (nSignedChar == 8) PackInit_IntegerType_SignedChar_8_1(link); else if (nSignedChar%8 == 0) PackInit_IntegerType_SignedChar_8_0(link);
621*b23bfdefSJunchao Zhang     else if (nSignedChar == 4) PackInit_IntegerType_SignedChar_4_1(link); else if (nSignedChar%4 == 0) PackInit_IntegerType_SignedChar_4_0(link);
622*b23bfdefSJunchao Zhang     else if (nSignedChar == 2) PackInit_IntegerType_SignedChar_2_1(link); else if (nSignedChar%2 == 0) PackInit_IntegerType_SignedChar_2_0(link);
623*b23bfdefSJunchao Zhang     else if (nSignedChar == 1) PackInit_IntegerType_SignedChar_1_1(link); else if (nSignedChar%1 == 0) PackInit_IntegerType_SignedChar_1_0(link);
624*b23bfdefSJunchao Zhang     link->bs         = nSignedChar;
625*b23bfdefSJunchao Zhang     link->unitbytes *= nSignedChar;
626*b23bfdefSJunchao Zhang     link->basicunit  = MPI_SIGNED_CHAR;
627*b23bfdefSJunchao Zhang   }  else if (nUnsignedChar) {
628*b23bfdefSJunchao Zhang     if      (nUnsignedChar == 8) PackInit_IntegerType_UnsignedChar_8_1(link); else if (nUnsignedChar%8 == 0) PackInit_IntegerType_UnsignedChar_8_0(link);
629*b23bfdefSJunchao Zhang     else if (nUnsignedChar == 4) PackInit_IntegerType_UnsignedChar_4_1(link); else if (nUnsignedChar%4 == 0) PackInit_IntegerType_UnsignedChar_4_0(link);
630*b23bfdefSJunchao Zhang     else if (nUnsignedChar == 2) PackInit_IntegerType_UnsignedChar_2_1(link); else if (nUnsignedChar%2 == 0) PackInit_IntegerType_UnsignedChar_2_0(link);
631*b23bfdefSJunchao Zhang     else if (nUnsignedChar == 1) PackInit_IntegerType_UnsignedChar_1_1(link); else if (nUnsignedChar%1 == 0) PackInit_IntegerType_UnsignedChar_1_0(link);
632*b23bfdefSJunchao Zhang     link->bs         = nUnsignedChar;
633*b23bfdefSJunchao Zhang     link->unitbytes *= nUnsignedChar;
634*b23bfdefSJunchao Zhang     link->basicunit  = MPI_UNSIGNED_CHAR;
63540e23c03SJunchao Zhang #if defined(PETSC_HAVE_COMPLEX)
636*b23bfdefSJunchao Zhang   } else if (nPetscComplex) {
637*b23bfdefSJunchao Zhang     if      (nPetscComplex == 8) PackInit_ComplexType_PetscComplex_8_1(link); else if (nPetscComplex%8 == 0) PackInit_ComplexType_PetscComplex_8_0(link);
638*b23bfdefSJunchao Zhang     else if (nPetscComplex == 4) PackInit_ComplexType_PetscComplex_4_1(link); else if (nPetscComplex%4 == 0) PackInit_ComplexType_PetscComplex_4_0(link);
639*b23bfdefSJunchao Zhang     else if (nPetscComplex == 2) PackInit_ComplexType_PetscComplex_2_1(link); else if (nPetscComplex%2 == 0) PackInit_ComplexType_PetscComplex_2_0(link);
640*b23bfdefSJunchao Zhang     else if (nPetscComplex == 1) PackInit_ComplexType_PetscComplex_1_1(link); else if (nPetscComplex%1 == 0) PackInit_ComplexType_PetscComplex_1_0(link);
641*b23bfdefSJunchao Zhang     link->bs         = nPetscComplex;
642*b23bfdefSJunchao Zhang     link->unitbytes *= nPetscComplex;
64340e23c03SJunchao Zhang     link->basicunit  = MPIU_COMPLEX;
64440e23c03SJunchao Zhang #endif
645*b23bfdefSJunchao Zhang   } else if (is2Int) {
646*b23bfdefSJunchao Zhang     PackInit_PairType_int_int(link);
647*b23bfdefSJunchao Zhang     link->bs         = 1;
648*b23bfdefSJunchao Zhang     link->basicunit  = MPI_2INT;
649*b23bfdefSJunchao Zhang   } else if (is2PetscInt) {
650*b23bfdefSJunchao Zhang     PackInit_PairType_PetscInt_PetscInt(link);
651*b23bfdefSJunchao Zhang     link->bs         = 1;
652*b23bfdefSJunchao Zhang     link->basicunit  = MPIU_2INT;
65340e23c03SJunchao Zhang   } else {
654*b23bfdefSJunchao Zhang     MPI_Aint lb,nbyte;
655*b23bfdefSJunchao Zhang     ierr = MPI_Type_get_extent(unit,&lb,&nbyte);CHKERRQ(ierr);
65640e23c03SJunchao Zhang     if (lb != 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Datatype with nonzero lower bound %ld\n",(long)lb);
657*b23bfdefSJunchao Zhang     if (nbyte % sizeof(DumbInt)) { /* If the type size is not multiple of int */
658*b23bfdefSJunchao Zhang       if      (nbyte == 4) {PackInit_DumbType_char_4_1(link); link->bs = nbyte/4;}  else if (nbyte%4 == 0) {PackInit_DumbType_char_4_0(link); link->bs = nbyte/4;} /* Note the basic type is char[4] */
659*b23bfdefSJunchao Zhang       else if (nbyte == 2) {PackInit_DumbType_char_2_1(link); link->bs = nbyte/2;}  else if (nbyte%2 == 0) {PackInit_DumbType_char_2_0(link); link->bs = nbyte/2;}
660*b23bfdefSJunchao Zhang       else if (nbyte == 1) {PackInit_DumbType_char_1_1(link); link->bs = nbyte/1;}  else if (nbyte%1 == 0) {PackInit_DumbType_char_1_0(link); link->bs = nbyte/1;}
661*b23bfdefSJunchao Zhang       link->unitbytes = nbyte;
662*b23bfdefSJunchao Zhang       link->basicunit = MPI_BYTE;
66340e23c03SJunchao Zhang     } else {
664*b23bfdefSJunchao Zhang       nInt = nbyte / sizeof(DumbInt);
665*b23bfdefSJunchao Zhang       if      (nInt == 8)  {PackInit_DumbType_DumbInt_8_1(link); link->bs = nInt/8;} else if (nInt%8 == 0) {PackInit_DumbType_DumbInt_8_0(link); link->bs = nInt/8;}
666*b23bfdefSJunchao Zhang       else if (nInt == 4)  {PackInit_DumbType_DumbInt_4_1(link); link->bs = nInt/4;} else if (nInt%4 == 0) {PackInit_DumbType_DumbInt_4_0(link); link->bs = nInt/4;}
667*b23bfdefSJunchao Zhang       else if (nInt == 2)  {PackInit_DumbType_DumbInt_2_1(link); link->bs = nInt/2;} else if (nInt%2 == 0) {PackInit_DumbType_DumbInt_2_0(link); link->bs = nInt/2;}
668*b23bfdefSJunchao Zhang       else if (nInt == 1)  {PackInit_DumbType_DumbInt_1_1(link); link->bs = nInt/1;} else if (nInt%1 == 0) {PackInit_DumbType_DumbInt_1_0(link); link->bs = nInt/1;}
669*b23bfdefSJunchao Zhang       link->unitbytes = nbyte;
67040e23c03SJunchao Zhang       link->basicunit = MPI_INT;
67140e23c03SJunchao Zhang     }
67240e23c03SJunchao Zhang   }
673*b23bfdefSJunchao Zhang 
67440e23c03SJunchao Zhang   if (link->isbuiltin) link->unit = unit; /* builtin datatypes are common. Make it fast */
67540e23c03SJunchao Zhang   else {ierr = MPI_Type_dup(unit,&link->unit);CHKERRQ(ierr);}
67640e23c03SJunchao Zhang   PetscFunctionReturn(0);
67740e23c03SJunchao Zhang }
67840e23c03SJunchao Zhang 
679*b23bfdefSJunchao Zhang PetscErrorCode PetscSFPackGetUnpackAndOp(PetscSF sf,PetscSFPack link,MPI_Op op,PetscErrorCode (**UnpackAndOp)(PetscInt,const PetscInt*,PetscInt,PetscSFPackOpt,void*,const void*))
68040e23c03SJunchao Zhang {
68140e23c03SJunchao Zhang   PetscFunctionBegin;
68240e23c03SJunchao Zhang   *UnpackAndOp = NULL;
68340e23c03SJunchao Zhang   if (op == MPIU_REPLACE) *UnpackAndOp = link->UnpackAndInsert;
68440e23c03SJunchao Zhang   else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->UnpackAndAdd;
68540e23c03SJunchao Zhang   else if (op == MPI_PROD) *UnpackAndOp = link->UnpackAndMult;
68640e23c03SJunchao Zhang   else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->UnpackAndMax;
68740e23c03SJunchao Zhang   else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->UnpackAndMin;
68840e23c03SJunchao Zhang   else if (op == MPI_LAND)   *UnpackAndOp = link->UnpackAndLAND;
68940e23c03SJunchao Zhang   else if (op == MPI_BAND)   *UnpackAndOp = link->UnpackAndBAND;
69040e23c03SJunchao Zhang   else if (op == MPI_LOR)    *UnpackAndOp = link->UnpackAndLOR;
69140e23c03SJunchao Zhang   else if (op == MPI_BOR)    *UnpackAndOp = link->UnpackAndBOR;
69240e23c03SJunchao Zhang   else if (op == MPI_LXOR)   *UnpackAndOp = link->UnpackAndLXOR;
69340e23c03SJunchao Zhang   else if (op == MPI_BXOR)   *UnpackAndOp = link->UnpackAndBXOR;
69440e23c03SJunchao Zhang   else if (op == MPI_MAXLOC) *UnpackAndOp = link->UnpackAndMaxloc;
69540e23c03SJunchao Zhang   else if (op == MPI_MINLOC) *UnpackAndOp = link->UnpackAndMinloc;
69640e23c03SJunchao Zhang   else *UnpackAndOp = NULL;
69740e23c03SJunchao Zhang   PetscFunctionReturn(0);
69840e23c03SJunchao Zhang }
69940e23c03SJunchao Zhang 
700*b23bfdefSJunchao Zhang PetscErrorCode PetscSFPackGetFetchAndOp(PetscSF sf,PetscSFPack link,MPI_Op op,PetscErrorCode (**FetchAndOp)(PetscInt,const PetscInt*,PetscInt,PetscSFPackOpt,void*,void*))
70140e23c03SJunchao Zhang {
70240e23c03SJunchao Zhang   PetscFunctionBegin;
70340e23c03SJunchao Zhang   *FetchAndOp = NULL;
70440e23c03SJunchao Zhang   if (op == MPIU_REPLACE) *FetchAndOp = link->FetchAndInsert;
70540e23c03SJunchao Zhang   else if (op == MPI_SUM || op == MPIU_SUM) *FetchAndOp = link->FetchAndAdd;
70640e23c03SJunchao Zhang   else if (op == MPI_MAX || op == MPIU_MAX) *FetchAndOp = link->FetchAndMax;
70740e23c03SJunchao Zhang   else if (op == MPI_MIN || op == MPIU_MIN) *FetchAndOp = link->FetchAndMin;
70840e23c03SJunchao Zhang   else if (op == MPI_MAXLOC) *FetchAndOp = link->FetchAndMaxloc;
70940e23c03SJunchao Zhang   else if (op == MPI_MINLOC) *FetchAndOp = link->FetchAndMinloc;
71040e23c03SJunchao Zhang   else if (op == MPI_PROD)   *FetchAndOp = link->FetchAndMult;
71140e23c03SJunchao Zhang   else if (op == MPI_LAND)   *FetchAndOp = link->FetchAndLAND;
71240e23c03SJunchao Zhang   else if (op == MPI_BAND)   *FetchAndOp = link->FetchAndBAND;
71340e23c03SJunchao Zhang   else if (op == MPI_LOR)    *FetchAndOp = link->FetchAndLOR;
71440e23c03SJunchao Zhang   else if (op == MPI_BOR)    *FetchAndOp = link->FetchAndBOR;
71540e23c03SJunchao Zhang   else if (op == MPI_LXOR)   *FetchAndOp = link->FetchAndLXOR;
71640e23c03SJunchao Zhang   else if (op == MPI_BXOR)   *FetchAndOp = link->FetchAndBXOR;
71740e23c03SJunchao Zhang   else SETERRQ(PetscObjectComm((PetscObject)sf),PETSC_ERR_SUP,"No support for MPI_Op");
71840e23c03SJunchao Zhang   PetscFunctionReturn(0);
71940e23c03SJunchao Zhang }
72040e23c03SJunchao Zhang 
72140e23c03SJunchao Zhang /*
72240e23c03SJunchao Zhang   Setup pack/unpack optimization plans based on indice patterns available
72340e23c03SJunchao Zhang 
72440e23c03SJunchao Zhang    Input Parameters:
725*b23bfdefSJunchao Zhang   +  n       - Number of target ranks
726*b23bfdefSJunchao Zhang   .  offset  - [n+1] For the i-th rank, its associated indices are idx[offset[i], offset[i+1]). offset[0] is not necessarily 0.
727*b23bfdefSJunchao Zhang   -  idx     - [*]   Array storing indices
72840e23c03SJunchao Zhang 
72940e23c03SJunchao Zhang    Output Parameters:
73040e23c03SJunchao Zhang   +  opt    - the optimization
73140e23c03SJunchao Zhang */
73240e23c03SJunchao Zhang PetscErrorCode PetscSFPackSetupOptimization(PetscInt n,const PetscInt *offset,const PetscInt *idx,PetscSFPackOpt *out)
73340e23c03SJunchao Zhang {
73440e23c03SJunchao Zhang   PetscErrorCode ierr;
73540e23c03SJunchao Zhang   PetscInt       i,j,k,n_copies,tot_copies=0,step;
736*b23bfdefSJunchao Zhang   PetscBool      strided,optimized=PETSC_FALSE;
73740e23c03SJunchao Zhang   PetscSFPackOpt opt;
73840e23c03SJunchao Zhang 
73940e23c03SJunchao Zhang   PetscFunctionBegin;
740*b23bfdefSJunchao Zhang   if (!n) {
741*b23bfdefSJunchao Zhang     *out = NULL;
742*b23bfdefSJunchao Zhang     PetscFunctionReturn(0);
743*b23bfdefSJunchao Zhang   }
744*b23bfdefSJunchao Zhang 
74540e23c03SJunchao Zhang   ierr = PetscCalloc1(1,&opt);CHKERRQ(ierr);
746*b23bfdefSJunchao Zhang   ierr = PetscCalloc3(n,&opt->type,n+1,&opt->offset,n+1,&opt->copy_offset);CHKERRQ(ierr);
747*b23bfdefSJunchao Zhang   ierr = PetscArraycpy(opt->offset,offset,n+1);CHKERRQ(ierr);
748*b23bfdefSJunchao Zhang   if (offset[0]) {for (i=0; i<n+1; i++) opt->offset[i] -= offset[0];} /* Zero-base offset[]. Note the packing routine is Pack(count, idx[], ...*/
749*b23bfdefSJunchao Zhang 
750*b23bfdefSJunchao Zhang   opt->n = n;
75140e23c03SJunchao Zhang 
75240e23c03SJunchao Zhang   /* Check if the indices are piece-wise contiguous (if yes, we can optimize a packing with mulitple memcpy's ) */
75340e23c03SJunchao Zhang   for (i=0; i<n; i++) { /* for each target processor */
75440e23c03SJunchao Zhang     /* Scan indices to count n_copies -- the number of contiguous pieces for i-th target */
75540e23c03SJunchao Zhang     n_copies = 1;
75640e23c03SJunchao Zhang     for (j=offset[i]; j<offset[i+1]-1; j++) {
75740e23c03SJunchao Zhang       if (idx[j]+1 != idx[j+1]) n_copies++;
75840e23c03SJunchao Zhang     }
75940e23c03SJunchao Zhang     /* If the average length (in no. of indices) of contiguous pieces is long enough, say >=32,
76040e23c03SJunchao Zhang        then it is worth using memcpy for this target. 32 is an arbitrarily chosen number.
76140e23c03SJunchao Zhang      */
76240e23c03SJunchao Zhang     if ((offset[i+1]-offset[i])/n_copies >= 32) {
763*b23bfdefSJunchao Zhang       opt->type[i] = PETSCSF_PACKOPT_MULTICOPY;
764*b23bfdefSJunchao Zhang       optimized    = PETSC_TRUE;
76540e23c03SJunchao Zhang       tot_copies  += n_copies;
76640e23c03SJunchao Zhang     }
76740e23c03SJunchao Zhang   }
76840e23c03SJunchao Zhang 
76940e23c03SJunchao Zhang   /* Setup memcpy plan for each contiguous piece */
77040e23c03SJunchao Zhang   k    = 0; /* k-th copy */
771*b23bfdefSJunchao Zhang   ierr = PetscMalloc4(tot_copies,&opt->copy_start,tot_copies,&opt->copy_length,n,&opt->stride_step,n,&opt->stride_n);CHKERRQ(ierr);
772*b23bfdefSJunchao Zhang   for (i=0; i<n; i++) { /* for each target processor */
773*b23bfdefSJunchao Zhang     if (opt->type[i] == PETSCSF_PACKOPT_MULTICOPY) {
77440e23c03SJunchao Zhang       n_copies           = 1;
775*b23bfdefSJunchao Zhang       opt->copy_start[k] = offset[i] - offset[0];
77640e23c03SJunchao Zhang       for (j=offset[i]; j<offset[i+1]-1; j++) {
77740e23c03SJunchao Zhang         if (idx[j]+1 != idx[j+1]) { /* meet end of a copy (and next copy must exist) */
77840e23c03SJunchao Zhang           n_copies++;
779*b23bfdefSJunchao Zhang           opt->copy_start[k+1] = j-offset[0]+1;
780*b23bfdefSJunchao Zhang           opt->copy_length[k]  = opt->copy_start[k+1] - opt->copy_start[k];
78140e23c03SJunchao Zhang           k++;
78240e23c03SJunchao Zhang         }
78340e23c03SJunchao Zhang       }
78440e23c03SJunchao Zhang       /* Set copy length of the last copy for this target */
785*b23bfdefSJunchao Zhang       opt->copy_length[k] = j-offset[0]+1 - opt->copy_start[k];
78640e23c03SJunchao Zhang       k++;
78740e23c03SJunchao Zhang     }
788*b23bfdefSJunchao Zhang     /* Set offset for next target. When opt->type[i]=PETSCSF_PACKOPT_NONE, copy_offsets[i]=copy_offsets[i+1] */
78940e23c03SJunchao Zhang     opt->copy_offset[i+1] = k;
79040e23c03SJunchao Zhang   }
79140e23c03SJunchao Zhang 
79240e23c03SJunchao Zhang   /* Last chance! If the indices do not have long contiguous pieces, are they strided? */
79340e23c03SJunchao Zhang   for (i=0; i<n; i++) { /* for each remote */
794*b23bfdefSJunchao Zhang     if (opt->type[i]==PETSCSF_PACKOPT_NONE && (offset[i+1] - offset[i]) >= 16) { /* few indices (<16) are not worth striding */
79540e23c03SJunchao Zhang       strided = PETSC_TRUE;
79640e23c03SJunchao Zhang       step    = idx[offset[i]+1] - idx[offset[i]];
79740e23c03SJunchao Zhang       for (j=offset[i]; j<offset[i+1]-1; j++) {
79840e23c03SJunchao Zhang         if (idx[j]+step != idx[j+1]) { strided = PETSC_FALSE; break; }
79940e23c03SJunchao Zhang       }
80040e23c03SJunchao Zhang       if (strided) {
801*b23bfdefSJunchao Zhang         opt->type[i]         = PETSCSF_PACKOPT_STRIDE;
80240e23c03SJunchao Zhang         opt->stride_step[i]  = step;
80340e23c03SJunchao Zhang         opt->stride_n[i]     = offset[i+1] - offset[i];
804*b23bfdefSJunchao Zhang         optimized            = PETSC_TRUE;
80540e23c03SJunchao Zhang       }
80640e23c03SJunchao Zhang     }
80740e23c03SJunchao Zhang   }
808*b23bfdefSJunchao Zhang   /* If no rank gets optimized, free arrays to save memory */
809*b23bfdefSJunchao Zhang   if (!optimized) {
810*b23bfdefSJunchao Zhang     ierr = PetscFree3(opt->type,opt->offset,opt->copy_offset);CHKERRQ(ierr);
811*b23bfdefSJunchao Zhang     ierr = PetscFree4(opt->copy_start,opt->copy_length,opt->stride_step,opt->stride_n);CHKERRQ(ierr);
81240e23c03SJunchao Zhang     ierr = PetscFree(opt);CHKERRQ(ierr);
81340e23c03SJunchao Zhang     *out = NULL;
81440e23c03SJunchao Zhang   } else *out = opt;
81540e23c03SJunchao Zhang   PetscFunctionReturn(0);
81640e23c03SJunchao Zhang }
81740e23c03SJunchao Zhang 
81840e23c03SJunchao Zhang PetscErrorCode PetscSFPackDestoryOptimization(PetscSFPackOpt *out)
81940e23c03SJunchao Zhang {
82040e23c03SJunchao Zhang   PetscErrorCode ierr;
82140e23c03SJunchao Zhang   PetscSFPackOpt opt = *out;
82240e23c03SJunchao Zhang 
82340e23c03SJunchao Zhang   PetscFunctionBegin;
82440e23c03SJunchao Zhang   if (opt) {
825*b23bfdefSJunchao Zhang     ierr = PetscFree3(opt->type,opt->offset,opt->copy_offset);CHKERRQ(ierr);
826*b23bfdefSJunchao Zhang     ierr = PetscFree4(opt->copy_start,opt->copy_length,opt->stride_step,opt->stride_n);CHKERRQ(ierr);
82740e23c03SJunchao Zhang     ierr = PetscFree(opt);CHKERRQ(ierr);
82840e23c03SJunchao Zhang     *out = NULL;
82940e23c03SJunchao Zhang   }
83040e23c03SJunchao Zhang   PetscFunctionReturn(0);
83140e23c03SJunchao Zhang }
832