xref: /petsc/src/vec/is/sf/impls/basic/sfpack.c (revision 40e23c03978c13ac666c3a9a3d12ba4068b9b111)
1*40e23c03SJunchao Zhang 
2*40e23c03SJunchao Zhang #include <../src/vec/is/sf/impls/basic/sfpack.h>
3*40e23c03SJunchao Zhang #include <../src/vec/is/sf/impls/basic/sfbasic.h>
4*40e23c03SJunchao Zhang 
5*40e23c03SJunchao Zhang /*
6*40e23c03SJunchao Zhang  * MPI_Reduce_local is not really useful because it can't handle sparse data and it vectorizes "in the wrong direction",
7*40e23c03SJunchao Zhang  * therefore we pack data types manually. This file defines packing routines for the standard data types.
8*40e23c03SJunchao Zhang  */
9*40e23c03SJunchao Zhang 
10*40e23c03SJunchao Zhang #define CPPJoin2_exp(a,b)     a ## b
11*40e23c03SJunchao Zhang #define CPPJoin2(a,b)         CPPJoin2_exp(a,b)
12*40e23c03SJunchao Zhang #define CPPJoin3_exp_(a,b,c)  a ## b ## _ ## c
13*40e23c03SJunchao Zhang #define CPPJoin3_(a,b,c)      CPPJoin3_exp_(a,b,c)
14*40e23c03SJunchao Zhang 
15*40e23c03SJunchao Zhang #define EXECUTE(statement)    statement /* no braces since the statement might declare a variable; braces impose an unwanted scope */
16*40e23c03SJunchao Zhang #define IGNORE(statement)     do {} while(0)
17*40e23c03SJunchao Zhang 
18*40e23c03SJunchao Zhang #define BINARY_OP(r,s,op,t)   do {(r) = (s) op (t);  } while(0)  /* binary ops in the middle such as +, *, && etc. */
19*40e23c03SJunchao Zhang #define FUNCTION_OP(r,s,op,t) do {(r) = op((s),(t)); } while(0)  /* ops like a function, such as PetscMax, PetscMin */
20*40e23c03SJunchao Zhang #define LXOR_OP(r,s,op,t)     do {(r) = (!s) != (!t);} while(0)  /* logical exclusive OR */
21*40e23c03SJunchao Zhang #define PAIRTYPE_OP(r,s,op,t) do {(r).a = (s).a op (t).a; (r).b = (s).b op (t).b;} while(0)
22*40e23c03SJunchao Zhang 
23*40e23c03SJunchao Zhang #define BlockType(type,count) CPPJoin3_(_blocktype_,type,count) /* typename for struct {type v[count];} */
24*40e23c03SJunchao Zhang #define PairType(type1,type2) CPPJoin3_(_pairtype_,type1,type2) /* typename for struct {type1 a; type2 b;} */
25*40e23c03SJunchao Zhang 
26*40e23c03SJunchao Zhang /* DEF_PackFunc - macro defining a Pack routine
27*40e23c03SJunchao Zhang 
28*40e23c03SJunchao Zhang    Arguments of the macro:
29*40e23c03SJunchao Zhang    +type      Type of the basic data in an entry, i.e., int, PetscInt, PetscReal etc. It is not the type of an entry.
30*40e23c03SJunchao Zhang    -BS        Block size for vectorization. It is a factor of bs.
31*40e23c03SJunchao Zhang 
32*40e23c03SJunchao Zhang    Arguments of the Pack routine:
33*40e23c03SJunchao Zhang    +n         Number of entries to pack. Each entry is of type 'unit'. Here the unit is the arg used in calls like PetscSFBcastBegin(sf,unit,..).
34*40e23c03SJunchao Zhang               If idx in not NULL, then n also indicates the number of indices in idx[]
35*40e23c03SJunchao Zhang    .bs        Number of basic types in an entry. Ex. if unit is MPI_2INT, then bs=2 and the basic type is int
36*40e23c03SJunchao Zhang    .idx       Indices of entries. NULL means contiguous indices [0,n)
37*40e23c03SJunchao Zhang    .r         Do packing for the r-th target processor
38*40e23c03SJunchao Zhang    .opt       Pack optimization plans. NULL means no plan.
39*40e23c03SJunchao Zhang    .unpacked  Address of the unpacked data
40*40e23c03SJunchao Zhang    -packed    Address of the packed data
41*40e23c03SJunchao Zhang  */
42*40e23c03SJunchao Zhang #define DEF_PackFunc(type,BS) \
43*40e23c03SJunchao Zhang   static PetscErrorCode CPPJoin3_(Pack_,type,BS)(PetscInt n,PetscInt bs,const PetscInt *idx,PetscInt r,PetscSFPackOpt opt,const void *unpacked,void *packed) { \
44*40e23c03SJunchao Zhang     PetscErrorCode ierr;                                                                                   \
45*40e23c03SJunchao Zhang     const type     *u = (const type*)unpacked,*u2;                                                         \
46*40e23c03SJunchao Zhang     type           *p = (type*)packed;                                                                     \
47*40e23c03SJunchao Zhang     PetscInt       i,j,k,l,step;                                                                           \
48*40e23c03SJunchao Zhang     PetscFunctionBegin;                                                                                    \
49*40e23c03SJunchao Zhang     if (!idx) {  /* idx[] is contiguous */                                                                 \
50*40e23c03SJunchao Zhang       ierr = PetscMemcpy(p,u,sizeof(type)*bs*n);CHKERRQ(ierr);                                             \
51*40e23c03SJunchao Zhang     } else if (!opt || !opt->optimized[r]) { /* idx[] is not optimized*/                                   \
52*40e23c03SJunchao Zhang       for (i=0; i<n; i++)                                                                                  \
53*40e23c03SJunchao Zhang         for (j=0; j<bs; j+=BS)                                                                             \
54*40e23c03SJunchao Zhang           for (k=j; k<j+BS; k++)                                                                           \
55*40e23c03SJunchao Zhang             p[i*bs+k] = u[idx[i]*bs+k];                                                                    \
56*40e23c03SJunchao Zhang     } else { /* idx[] is optimized*/                                                                       \
57*40e23c03SJunchao Zhang       if (opt->copy_offset[r] != opt->copy_offset[r+1]) { /* idx[] is piece-wise contiguous */             \
58*40e23c03SJunchao Zhang         for (i=opt->copy_offset[r]; i<opt->copy_offset[r+1]; i++) {                                        \
59*40e23c03SJunchao Zhang           l    = opt->copy_length[i]*bs; /* length in types */                                             \
60*40e23c03SJunchao Zhang           u2   = u + opt->copy_start[i]*bs;                                                                \
61*40e23c03SJunchao Zhang           ierr = PetscMemcpy(p,u2,l*sizeof(type));CHKERRQ(ierr);                                           \
62*40e23c03SJunchao Zhang           p   += l;                                                                                        \
63*40e23c03SJunchao Zhang         }                                                                                                  \
64*40e23c03SJunchao Zhang       } else { /* idx[] is strided */                                                                      \
65*40e23c03SJunchao Zhang         u   += opt->stride_first[r]*bs;                                                                    \
66*40e23c03SJunchao Zhang         step = opt->stride_step[r];                                                                        \
67*40e23c03SJunchao Zhang         for (i=0; i<opt->stride_n[r]; i++)                                                                 \
68*40e23c03SJunchao Zhang           for (j=0; j<bs; j++)                                                                             \
69*40e23c03SJunchao Zhang             p[i*bs+j] = u[i*step*bs+j];                                                                    \
70*40e23c03SJunchao Zhang       }                                                                                                    \
71*40e23c03SJunchao Zhang     }                                                                                                      \
72*40e23c03SJunchao Zhang     PetscFunctionReturn(0);                                                                                \
73*40e23c03SJunchao Zhang   }
74*40e23c03SJunchao Zhang 
75*40e23c03SJunchao Zhang /* DEF_Action - macro defining a Unpack(Fetch)AndInsert routine
76*40e23c03SJunchao Zhang 
77*40e23c03SJunchao Zhang    Arguments:
78*40e23c03SJunchao Zhang   +action     Unpack or Fetch
79*40e23c03SJunchao Zhang   .type       Type of the data
80*40e23c03SJunchao Zhang   .BS         Block size for vectorization
81*40e23c03SJunchao Zhang   .FILTER     Macro defining what to do with a statement, either EXECUTE or IGNORE
82*40e23c03SJunchao Zhang   .ctype      Type with or without the const qualifier, i.e., const type or type
83*40e23c03SJunchao Zhang   .cvoid      void with or without the const qualifier, i.e., const void or void
84*40e23c03SJunchao Zhang 
85*40e23c03SJunchao Zhang   Notes:
86*40e23c03SJunchao Zhang    This macro is not combined with DEF_ActionAndOp because we want to use memcpy in this macro.
87*40e23c03SJunchao Zhang    The two arguments ctype and cvoid are used (instead of one constness argument), because we want to
88*40e23c03SJunchao Zhang    get rid of compilation warning "empty macro arguments are undefined in ISO C90". With one constness argument,
89*40e23c03SJunchao Zhang    sometimes we input 'const', sometimes we have to input empty.
90*40e23c03SJunchao Zhang  */
91*40e23c03SJunchao Zhang #define DEF_Action(action,type,BS,FILTER,ctype,cvoid)               \
92*40e23c03SJunchao Zhang   static PetscErrorCode CPPJoin3_(action##AndInsert_,type,BS)(PetscInt n,PetscInt bs,const PetscInt *idx,PetscInt r,PetscSFPackOpt opt,void *unpacked,cvoid *packed) { \
93*40e23c03SJunchao Zhang     PetscErrorCode ierr;                                                                                   \
94*40e23c03SJunchao Zhang     type           *u = (type*)unpacked,*u2;                                                               \
95*40e23c03SJunchao Zhang     ctype          *p = (ctype*)packed;                                                                    \
96*40e23c03SJunchao Zhang     PetscInt       i,j,k,l,step;                                                                           \
97*40e23c03SJunchao Zhang     PetscFunctionBegin;                                                                                    \
98*40e23c03SJunchao Zhang     if (!idx) {  /* idx[] is contiguous */                                                                 \
99*40e23c03SJunchao Zhang       FILTER(type *v);                                                                                     \
100*40e23c03SJunchao Zhang       FILTER(ierr = PetscMalloc1(bs*n,&v);CHKERRQ(ierr));                                                  \
101*40e23c03SJunchao Zhang       FILTER(ierr = PetscMemcpy(v,u,sizeof(type)*bs*n);CHKERRQ(ierr));                                     \
102*40e23c03SJunchao Zhang              ierr = PetscMemcpy(u,p,sizeof(type)*bs*n);CHKERRQ(ierr);                                      \
103*40e23c03SJunchao Zhang       FILTER(ierr = PetscMemcpy(p,v,sizeof(type)*bs*n);CHKERRQ(ierr));                                     \
104*40e23c03SJunchao Zhang       FILTER(ierr = PetscFree(v);CHKERRQ(ierr));                                                           \
105*40e23c03SJunchao Zhang     } else if (!opt || !opt->optimized[r]) { /* idx[] is not optimized*/                                   \
106*40e23c03SJunchao Zhang       for (i=0; i<n; i++) {                                                                                \
107*40e23c03SJunchao Zhang         for (j=0; j<bs; j+=BS) {                                                                           \
108*40e23c03SJunchao Zhang           for (k=j; k<j+BS; k++) {                                                                         \
109*40e23c03SJunchao Zhang             FILTER(type t = u[idx[i]*bs+k]);                                                               \
110*40e23c03SJunchao Zhang             u[idx[i]*bs+k] = p[i*bs+k];                                                                    \
111*40e23c03SJunchao Zhang             FILTER(p[i*bs+k] = t);                                                                         \
112*40e23c03SJunchao Zhang           }                                                                                                \
113*40e23c03SJunchao Zhang         }                                                                                                  \
114*40e23c03SJunchao Zhang       }                                                                                                    \
115*40e23c03SJunchao Zhang     } else { /* idx[] is optimized*/                                                                       \
116*40e23c03SJunchao Zhang       if (opt->copy_offset[r] != opt->copy_offset[r+1]) { /* idx[] is piece-wise contiguous */             \
117*40e23c03SJunchao Zhang         FILTER(type *v);                                                                                   \
118*40e23c03SJunchao Zhang         FILTER(ierr = PetscMalloc1(bs*n,&v);CHKERRQ(ierr)); /* maximal buffer  */                          \
119*40e23c03SJunchao Zhang         for (i=opt->copy_offset[r]; i<opt->copy_offset[r+1]; i++) { /* i-th piece */                       \
120*40e23c03SJunchao Zhang           l  = opt->copy_length[i]*bs; /* length in types */                                               \
121*40e23c03SJunchao Zhang           u2 = u + opt->copy_start[i]*bs;                                                                  \
122*40e23c03SJunchao Zhang           FILTER(ierr = PetscMemcpy(v,u2,l*sizeof(type));CHKERRQ(ierr));                                   \
123*40e23c03SJunchao Zhang                  ierr = PetscMemcpy(u2,p,l*sizeof(type));CHKERRQ(ierr);                                    \
124*40e23c03SJunchao Zhang           FILTER(ierr = PetscMemcpy(p,v,l*sizeof(type));CHKERRQ(ierr));                                    \
125*40e23c03SJunchao Zhang           p += l;                                                                                          \
126*40e23c03SJunchao Zhang         }                                                                                                  \
127*40e23c03SJunchao Zhang         FILTER(ierr = PetscFree(v);CHKERRQ(ierr));                                                         \
128*40e23c03SJunchao Zhang       } else { /* idx[] is strided */                                                                      \
129*40e23c03SJunchao Zhang         u   += opt->stride_first[r]*bs;                                                                    \
130*40e23c03SJunchao Zhang         step = opt->stride_step[r];                                                                        \
131*40e23c03SJunchao Zhang         for (i=0; i<opt->stride_n[r]; i++)                                                                 \
132*40e23c03SJunchao Zhang           for (j=0; j<bs; j++) {                                                                           \
133*40e23c03SJunchao Zhang             FILTER(type t = u[i*step*bs+j]);                                                               \
134*40e23c03SJunchao Zhang             u[i*step*bs+j] = p[i*bs+j];                                                                    \
135*40e23c03SJunchao Zhang             FILTER(p[i*bs+j] = t);                                                                         \
136*40e23c03SJunchao Zhang           }                                                                                                \
137*40e23c03SJunchao Zhang       }                                                                                                    \
138*40e23c03SJunchao Zhang     }                                                                                                      \
139*40e23c03SJunchao Zhang     PetscFunctionReturn(0);                                                                                \
140*40e23c03SJunchao Zhang   }
141*40e23c03SJunchao Zhang 
142*40e23c03SJunchao Zhang /* DEF_ActionAndOp - macro defining a Unpack(Fetch)AndOp routine. Op can not be Insert, Maxloc or Minloc
143*40e23c03SJunchao Zhang 
144*40e23c03SJunchao Zhang    Arguments:
145*40e23c03SJunchao Zhang   +action     Unpack or Fetch
146*40e23c03SJunchao Zhang   .opname     Name of the Op, such as Add, Mult, LAND, etc.
147*40e23c03SJunchao Zhang   .type       Type of the data
148*40e23c03SJunchao Zhang   .BS         Block size for vectorization
149*40e23c03SJunchao Zhang   .op         Operator for the op, such as +, *, &&, ||, PetscMax, PetscMin, etc.
150*40e23c03SJunchao Zhang   .APPLY      Macro defining application of the op. Could be BINARY_OP, FUNCTION_OP, LXOR_OP or PAIRTYPE_OP
151*40e23c03SJunchao Zhang   .FILTER     Macro defining what to do with a statement, either EXECUTE or IGNORE
152*40e23c03SJunchao Zhang   .ctype      Type with or without the const qualifier, i.e., const type or type
153*40e23c03SJunchao Zhang   -cvoid      void with or without the const qualifier, i.e., const void or void
154*40e23c03SJunchao Zhang  */
155*40e23c03SJunchao Zhang #define DEF_ActionAndOp(action,opname,type,BS,op,APPLY,FILTER,ctype,cvoid) \
156*40e23c03SJunchao Zhang   static PetscErrorCode CPPJoin3_(action##And##opname##_,type,BS)(PetscInt n,PetscInt bs,const PetscInt *idx,PetscInt r,PetscSFPackOpt opt,void *unpacked,cvoid *packed) { \
157*40e23c03SJunchao Zhang     type     *u = (type*)unpacked,*u2,t;                                                                   \
158*40e23c03SJunchao Zhang     ctype    *p = (ctype*)packed;                                                                          \
159*40e23c03SJunchao Zhang     PetscInt i,j,k,l,step;                                                                                 \
160*40e23c03SJunchao Zhang     PetscFunctionBegin;                                                                                    \
161*40e23c03SJunchao Zhang     if (!idx) {  /* idx[] is contiguous */                                                                 \
162*40e23c03SJunchao Zhang       for (i=0; i<n*bs; i++) {                                                                             \
163*40e23c03SJunchao Zhang         t = u[i];                                                                                          \
164*40e23c03SJunchao Zhang         APPLY(u[i],t,op,p[i]);                                                                             \
165*40e23c03SJunchao Zhang         FILTER(p[i] = t);                                                                                  \
166*40e23c03SJunchao Zhang       }                                                                                                    \
167*40e23c03SJunchao Zhang     } else if (!opt || !opt->optimized[r]) { /* idx[] is not optimized*/                                   \
168*40e23c03SJunchao Zhang       for (i=0; i<n; i++) {                                                                                \
169*40e23c03SJunchao Zhang         for (j=0; j<bs; j+=BS) {                                                                           \
170*40e23c03SJunchao Zhang           for (k=j; k<j+BS; k++) {                                                                         \
171*40e23c03SJunchao Zhang             t = u[idx[i]*bs+k];                                                                            \
172*40e23c03SJunchao Zhang             APPLY(u[idx[i]*bs+k],t,op,p[i*bs+k]);                                                          \
173*40e23c03SJunchao Zhang             FILTER(p[i*bs+k] = t);                                                                         \
174*40e23c03SJunchao Zhang           }                                                                                                \
175*40e23c03SJunchao Zhang         }                                                                                                  \
176*40e23c03SJunchao Zhang       }                                                                                                    \
177*40e23c03SJunchao Zhang     } else { /* idx[] is optimized*/                                                                       \
178*40e23c03SJunchao Zhang       if (opt->copy_offset[r] != opt->copy_offset[r+1]) { /* idx[] is piece-wise contiguous */             \
179*40e23c03SJunchao Zhang         for (i=opt->copy_offset[r]; i<opt->copy_offset[r+1]; i++) { /* i-th piece */                       \
180*40e23c03SJunchao Zhang           l  = opt->copy_length[i]*bs; /* length in types */                                               \
181*40e23c03SJunchao Zhang           u2 = u + opt->copy_start[i]*bs;                                                                  \
182*40e23c03SJunchao Zhang           for (j=0; j<l; j++) {                                                                            \
183*40e23c03SJunchao Zhang             t = u2[j];                                                                                     \
184*40e23c03SJunchao Zhang             APPLY(u2[j],t,op,p[j]);                                                                        \
185*40e23c03SJunchao Zhang             FILTER(p[j] = t);                                                                              \
186*40e23c03SJunchao Zhang           }                                                                                                \
187*40e23c03SJunchao Zhang           p += l;                                                                                          \
188*40e23c03SJunchao Zhang         }                                                                                                  \
189*40e23c03SJunchao Zhang       } else { /* idx[] is strided */                                                                      \
190*40e23c03SJunchao Zhang         u   += opt->stride_first[r]*bs;                                                                    \
191*40e23c03SJunchao Zhang         step = opt->stride_step[r];                                                                        \
192*40e23c03SJunchao Zhang         for (i=0; i<opt->stride_n[r]; i++)                                                                 \
193*40e23c03SJunchao Zhang           for (j=0; j<bs; j++) {                                                                           \
194*40e23c03SJunchao Zhang             t = u[i*step*bs+j];                                                                            \
195*40e23c03SJunchao Zhang             APPLY(u[i*step*bs+j],t,op,p[i*bs+j]);                                                          \
196*40e23c03SJunchao Zhang             FILTER(p[i*bs+j] = t);                                                                         \
197*40e23c03SJunchao Zhang           }                                                                                                \
198*40e23c03SJunchao Zhang       }                                                                                                    \
199*40e23c03SJunchao Zhang     }                                                                                                      \
200*40e23c03SJunchao Zhang     PetscFunctionReturn(0);                                                                                \
201*40e23c03SJunchao Zhang   }
202*40e23c03SJunchao Zhang 
203*40e23c03SJunchao Zhang /* DEF_ActionAndXloc - macro defining a Unpack(Fetch)AndMaxloc(Minloc) routine
204*40e23c03SJunchao Zhang 
205*40e23c03SJunchao Zhang    Arguments:
206*40e23c03SJunchao Zhang   +Action     Unpack or Fetch
207*40e23c03SJunchao Zhang   .locname    Max or Min
208*40e23c03SJunchao Zhang   .type1      Type of the first data in a pair type
209*40e23c03SJunchao Zhang   .type2      Type of the second data in a pair type, usually PetscMPIInt for MPI ranks.
210*40e23c03SJunchao Zhang   .op         > or <
211*40e23c03SJunchao Zhang   .FILTER     Macro defining what to do with a statement, either EXECUTE or IGNORE
212*40e23c03SJunchao Zhang   .ctype      Type with or without the const qualifier, i.e., const PairType(type1,type2) or PairType(type1,type2)
213*40e23c03SJunchao Zhang   -cvoid      void with or without the const qualifier, i.e., const void or void
214*40e23c03SJunchao Zhang  */
215*40e23c03SJunchao Zhang #define DEF_ActionAndXloc(action,locname,type1,type2,op,FILTER,ctype,cvoid) \
216*40e23c03SJunchao Zhang   static PetscErrorCode CPPJoin3_(action##And##locname##loc_,PairType(type1,type2),1)(PetscInt n,PetscInt bs,const PetscInt *idx,PetscInt r,PetscSFPackOpt opt,void *unpacked,cvoid *packed) { \
217*40e23c03SJunchao Zhang     PairType(type1,type2) *u = (PairType(type1,type2)*)unpacked;                                           \
218*40e23c03SJunchao Zhang     ctype                 *p = (ctype*)packed;                                                             \
219*40e23c03SJunchao Zhang     PetscInt              i;                                                                               \
220*40e23c03SJunchao Zhang     for (i=0; i<n; i++) {                                                                                  \
221*40e23c03SJunchao Zhang       PetscInt j = idx[i];                                                                                 \
222*40e23c03SJunchao Zhang       FILTER(PairType(type1,type2) v = u[j]);                                                              \
223*40e23c03SJunchao Zhang       if (p[i].a op u[j].a) {                                                                              \
224*40e23c03SJunchao Zhang         u[j] = p[i];                                                                                       \
225*40e23c03SJunchao Zhang       } else if (p[i].a == u[j].a) {                                                                       \
226*40e23c03SJunchao Zhang         u[j].b = PetscMin(u[j].b,p[i].b); /* Minimal rank. Ref MPI MAXLOC */                               \
227*40e23c03SJunchao Zhang       }                                                                                                    \
228*40e23c03SJunchao Zhang       FILTER(p[i] = v);                                                                                    \
229*40e23c03SJunchao Zhang     }                                                                                                      \
230*40e23c03SJunchao Zhang     PetscFunctionReturn(0);                                                                                \
231*40e23c03SJunchao Zhang   }
232*40e23c03SJunchao Zhang 
233*40e23c03SJunchao Zhang 
234*40e23c03SJunchao Zhang /* Pack/unpack/fetch ops for all types */
235*40e23c03SJunchao Zhang #define DEF_PackNoInit(type,BS)                                                         \
236*40e23c03SJunchao Zhang   DEF_PackFunc(type,BS)                                                                 \
237*40e23c03SJunchao Zhang   DEF_Action(Unpack,type,BS,IGNORE,const type,const void)                               \
238*40e23c03SJunchao Zhang   DEF_Action(Fetch, type,BS,EXECUTE,type,void)                                          \
239*40e23c03SJunchao Zhang 
240*40e23c03SJunchao Zhang 
241*40e23c03SJunchao Zhang /* Extra addition ops for types supporting them */
242*40e23c03SJunchao Zhang #define DEF_PackAddNoInit(type,BS)                                                      \
243*40e23c03SJunchao Zhang   DEF_PackNoInit(type,BS)                                                               \
244*40e23c03SJunchao Zhang   DEF_ActionAndOp(Unpack,Add, type,BS,+,BINARY_OP,IGNORE,const type,const void)         \
245*40e23c03SJunchao Zhang   DEF_ActionAndOp(Unpack,Mult,type,BS,*,BINARY_OP,IGNORE,const type,const void)         \
246*40e23c03SJunchao Zhang   DEF_ActionAndOp(Fetch, Add, type,BS,+,BINARY_OP,EXECUTE,type,void)                    \
247*40e23c03SJunchao Zhang   DEF_ActionAndOp(Fetch, Mult,type,BS,*,BINARY_OP,EXECUTE,type,void)
248*40e23c03SJunchao Zhang 
249*40e23c03SJunchao Zhang /* Basic types */
250*40e23c03SJunchao Zhang #define DEF_Pack(type,BS)                                                               \
251*40e23c03SJunchao Zhang   DEF_PackAddNoInit(type,BS)                                                            \
252*40e23c03SJunchao Zhang   static void CPPJoin3_(PackInit_,type,BS)(PetscSFPack link) {                          \
253*40e23c03SJunchao Zhang     link->Pack            = CPPJoin3_(Pack_,           type,BS);                        \
254*40e23c03SJunchao Zhang     link->UnpackAndInsert = CPPJoin3_(UnpackAndInsert_,type,BS);                        \
255*40e23c03SJunchao Zhang     link->UnpackAndAdd    = CPPJoin3_(UnpackAndAdd_,   type,BS);                        \
256*40e23c03SJunchao Zhang     link->UnpackAndMult   = CPPJoin3_(UnpackAndMult_,  type,BS);                        \
257*40e23c03SJunchao Zhang     link->FetchAndInsert  = CPPJoin3_(FetchAndInsert_, type,BS);                        \
258*40e23c03SJunchao Zhang     link->FetchAndAdd     = CPPJoin3_(FetchAndAdd_,    type,BS);                        \
259*40e23c03SJunchao Zhang     link->FetchAndMult    = CPPJoin3_(FetchAndMult_,   type,BS);                        \
260*40e23c03SJunchao Zhang     link->unitbytes       = sizeof(type);                                               \
261*40e23c03SJunchao Zhang   }
262*40e23c03SJunchao Zhang 
263*40e23c03SJunchao Zhang /* Comparable types */
264*40e23c03SJunchao Zhang #define DEF_PackCmp(type)                                                               \
265*40e23c03SJunchao Zhang   DEF_PackAddNoInit(type,1)                                                             \
266*40e23c03SJunchao Zhang   DEF_ActionAndOp(Unpack,Max,type,1,PetscMax,FUNCTION_OP,IGNORE,const type,const void)  \
267*40e23c03SJunchao Zhang   DEF_ActionAndOp(Unpack,Min,type,1,PetscMin,FUNCTION_OP,IGNORE,const type,const void)  \
268*40e23c03SJunchao Zhang   DEF_ActionAndOp(Fetch, Max,type,1,PetscMax,FUNCTION_OP,EXECUTE,type,void)             \
269*40e23c03SJunchao Zhang   DEF_ActionAndOp(Fetch, Min,type,1,PetscMin,FUNCTION_OP,EXECUTE,type,void)             \
270*40e23c03SJunchao Zhang   static void CPPJoin2(PackInit_,type)(PetscSFPack link) {                              \
271*40e23c03SJunchao Zhang     link->Pack            = CPPJoin3_(Pack_,           type,1);                         \
272*40e23c03SJunchao Zhang     link->UnpackAndInsert = CPPJoin3_(UnpackAndInsert_,type,1);                         \
273*40e23c03SJunchao Zhang     link->UnpackAndAdd    = CPPJoin3_(UnpackAndAdd_,   type,1);                         \
274*40e23c03SJunchao Zhang     link->UnpackAndMult   = CPPJoin3_(UnpackAndMult_,  type,1);                         \
275*40e23c03SJunchao Zhang     link->UnpackAndMax    = CPPJoin3_(UnpackAndMax_,   type,1);                         \
276*40e23c03SJunchao Zhang     link->UnpackAndMin    = CPPJoin3_(UnpackAndMin_,   type,1);                         \
277*40e23c03SJunchao Zhang     link->FetchAndInsert  = CPPJoin3_(FetchAndInsert_, type,1);                         \
278*40e23c03SJunchao Zhang     link->FetchAndAdd     = CPPJoin3_(FetchAndAdd_ ,   type,1);                         \
279*40e23c03SJunchao Zhang     link->FetchAndMult    = CPPJoin3_(FetchAndMult_,   type,1);                         \
280*40e23c03SJunchao Zhang     link->FetchAndMax     = CPPJoin3_(FetchAndMax_ ,   type,1);                         \
281*40e23c03SJunchao Zhang     link->FetchAndMin     = CPPJoin3_(FetchAndMin_ ,   type,1);                         \
282*40e23c03SJunchao Zhang     link->unitbytes       = sizeof(type);                                               \
283*40e23c03SJunchao Zhang   }
284*40e23c03SJunchao Zhang 
285*40e23c03SJunchao Zhang /* Logical Types */
286*40e23c03SJunchao Zhang /* The operator in LXOR_OP should be empty but is &. It is not used. Put here to avoid
287*40e23c03SJunchao Zhang    the compilation warning "empty macro arguments are undefined in ISO C90"
288*40e23c03SJunchao Zhang  */
289*40e23c03SJunchao Zhang #define DEF_PackLog(type)                                                               \
290*40e23c03SJunchao Zhang   DEF_ActionAndOp(Unpack,LAND,type,1,&&,BINARY_OP,IGNORE,const type,const void)         \
291*40e23c03SJunchao Zhang   DEF_ActionAndOp(Unpack,LOR, type,1,||,BINARY_OP,IGNORE,const type,const void)         \
292*40e23c03SJunchao Zhang   DEF_ActionAndOp(Unpack,LXOR,type,1,&, LXOR_OP,  IGNORE,const type,const void)         \
293*40e23c03SJunchao Zhang   DEF_ActionAndOp(Fetch, LAND,type,1,&&,BINARY_OP,EXECUTE,type,void)                    \
294*40e23c03SJunchao Zhang   DEF_ActionAndOp(Fetch, LOR, type,1,||,BINARY_OP,EXECUTE,type,void)                    \
295*40e23c03SJunchao Zhang   DEF_ActionAndOp(Fetch, LXOR,type,1,&, LXOR_OP,  EXECUTE,type,void)                    \
296*40e23c03SJunchao Zhang   static void CPPJoin2(PackInit_Logical_,type)(PetscSFPack link) {                      \
297*40e23c03SJunchao Zhang     link->UnpackAndLAND   = CPPJoin3_(UnpackAndLAND_,type,1);                           \
298*40e23c03SJunchao Zhang     link->UnpackAndLOR    = CPPJoin3_(UnpackAndLOR_, type,1);                           \
299*40e23c03SJunchao Zhang     link->UnpackAndLXOR   = CPPJoin3_(UnpackAndLXOR_,type,1);                           \
300*40e23c03SJunchao Zhang     link->FetchAndLAND    = CPPJoin3_(FetchAndLAND_, type,1);                           \
301*40e23c03SJunchao Zhang     link->FetchAndLOR     = CPPJoin3_(FetchAndLOR_,  type,1);                           \
302*40e23c03SJunchao Zhang     link->FetchAndLXOR    = CPPJoin3_(FetchAndLXOR_, type,1);                           \
303*40e23c03SJunchao Zhang   }
304*40e23c03SJunchao Zhang 
305*40e23c03SJunchao Zhang 
306*40e23c03SJunchao Zhang /* Bitwise Types */
307*40e23c03SJunchao Zhang #define DEF_PackBit(type)                                                               \
308*40e23c03SJunchao Zhang   DEF_ActionAndOp(Unpack,BAND,type,1,&,BINARY_OP,IGNORE,const type,const void)          \
309*40e23c03SJunchao Zhang   DEF_ActionAndOp(Unpack,BOR, type,1,|,BINARY_OP,IGNORE,const type,const void)          \
310*40e23c03SJunchao Zhang   DEF_ActionAndOp(Unpack,BXOR,type,1,^,BINARY_OP,IGNORE,const type,const void)          \
311*40e23c03SJunchao Zhang   DEF_ActionAndOp(Fetch, BAND,type,1,&,BINARY_OP,EXECUTE,type,void)                     \
312*40e23c03SJunchao Zhang   DEF_ActionAndOp(Fetch, BOR, type,1,|,BINARY_OP,EXECUTE,type,void)                     \
313*40e23c03SJunchao Zhang   DEF_ActionAndOp(Fetch, BXOR,type,1,^,BINARY_OP,EXECUTE,type,void)                     \
314*40e23c03SJunchao Zhang   static void CPPJoin2(PackInit_Bitwise_,type)(PetscSFPack link) {                      \
315*40e23c03SJunchao Zhang     link->UnpackAndBAND   = CPPJoin3_(UnpackAndBAND_,type,1);                           \
316*40e23c03SJunchao Zhang     link->UnpackAndBOR    = CPPJoin3_(UnpackAndBOR_, type,1);                           \
317*40e23c03SJunchao Zhang     link->UnpackAndBXOR   = CPPJoin3_(UnpackAndBXOR_,type,1);                           \
318*40e23c03SJunchao Zhang     link->FetchAndBAND    = CPPJoin3_(FetchAndBAND_, type,1);                           \
319*40e23c03SJunchao Zhang     link->FetchAndBOR     = CPPJoin3_(FetchAndBOR_,  type,1);                           \
320*40e23c03SJunchao Zhang     link->FetchAndBXOR    = CPPJoin3_(FetchAndBXOR_, type,1);                           \
321*40e23c03SJunchao Zhang   }
322*40e23c03SJunchao Zhang 
323*40e23c03SJunchao Zhang 
324*40e23c03SJunchao Zhang /* Pair types */
325*40e23c03SJunchao Zhang #define DEF_PackPair(type1,type2)                                                                                   \
326*40e23c03SJunchao Zhang   typedef struct {type1 a; type2 b;} PairType(type1,type2);                                                         \
327*40e23c03SJunchao Zhang   DEF_PackFunc(PairType(type1,type2),1)                                                                             \
328*40e23c03SJunchao Zhang   DEF_Action(Unpack,PairType(type1,type2),1,IGNORE,const PairType(type1,type2),const void)                          \
329*40e23c03SJunchao Zhang   DEF_Action(Fetch, PairType(type1,type2),1,EXECUTE,PairType(type1,type2),void)                                     \
330*40e23c03SJunchao Zhang   DEF_ActionAndOp(Unpack,Add,PairType(type1,type2),1,+,PAIRTYPE_OP,IGNORE,const PairType(type1,type2),const void)   \
331*40e23c03SJunchao Zhang   DEF_ActionAndOp(Fetch, Add,PairType(type1,type2),1,+,PAIRTYPE_OP,EXECUTE,PairType(type1,type2),void)              \
332*40e23c03SJunchao Zhang   DEF_ActionAndXloc(Unpack,Max,type1,type2,>,IGNORE,const PairType(type1,type2),const void)                         \
333*40e23c03SJunchao Zhang   DEF_ActionAndXloc(Unpack,Min,type1,type2,<,IGNORE,const PairType(type1,type2),const void)                         \
334*40e23c03SJunchao Zhang   DEF_ActionAndXloc(Fetch, Max,type1,type2,>,EXECUTE,PairType(type1,type2),void)                                    \
335*40e23c03SJunchao Zhang   DEF_ActionAndXloc(Fetch, Min,type1,type2,<,EXECUTE,PairType(type1,type2),void)                                    \
336*40e23c03SJunchao Zhang   static void CPPJoin3_(PackInit_,type1,type2)(PetscSFPack link) {                                                  \
337*40e23c03SJunchao Zhang     link->Pack            = CPPJoin3_(Pack_,           PairType(type1,type2),1);                                    \
338*40e23c03SJunchao Zhang     link->UnpackAndInsert = CPPJoin3_(UnpackAndInsert_,PairType(type1,type2),1);                                    \
339*40e23c03SJunchao Zhang     link->UnpackAndAdd    = CPPJoin3_(UnpackAndAdd_,   PairType(type1,type2),1);                                    \
340*40e23c03SJunchao Zhang     link->UnpackAndMaxloc = CPPJoin3_(UnpackAndMaxloc_,PairType(type1,type2),1);                                    \
341*40e23c03SJunchao Zhang     link->UnpackAndMinloc = CPPJoin3_(UnpackAndMinloc_,PairType(type1,type2),1);                                    \
342*40e23c03SJunchao Zhang     link->FetchAndInsert  = CPPJoin3_(FetchAndInsert_, PairType(type1,type2),1);                                    \
343*40e23c03SJunchao Zhang     link->FetchAndAdd     = CPPJoin3_(FetchAndAdd_,    PairType(type1,type2),1);                                    \
344*40e23c03SJunchao Zhang     link->FetchAndMaxloc  = CPPJoin3_(FetchAndMaxloc_, PairType(type1,type2),1);                                    \
345*40e23c03SJunchao Zhang     link->FetchAndMinloc  = CPPJoin3_(FetchAndMinloc_, PairType(type1,type2),1);                                    \
346*40e23c03SJunchao Zhang     link->unitbytes       = sizeof(PairType(type1,type2));                                                          \
347*40e23c03SJunchao Zhang   }
348*40e23c03SJunchao Zhang 
349*40e23c03SJunchao Zhang 
350*40e23c03SJunchao Zhang /* Currently only dumb blocks of data */
351*40e23c03SJunchao Zhang #define DEF_Block(type,count)                                                           \
352*40e23c03SJunchao Zhang   typedef struct {type v[count];} BlockType(type,count);                                \
353*40e23c03SJunchao Zhang   DEF_PackNoInit(BlockType(type,count),1)                                               \
354*40e23c03SJunchao Zhang   static void CPPJoin3_(PackInit_block_,type,count)(PetscSFPack link) {                 \
355*40e23c03SJunchao Zhang     link->Pack            = CPPJoin3_(Pack_,           BlockType(type,count),1);        \
356*40e23c03SJunchao Zhang     link->UnpackAndInsert = CPPJoin3_(UnpackAndInsert_,BlockType(type,count),1);        \
357*40e23c03SJunchao Zhang     link->FetchAndInsert  = CPPJoin3_(FetchAndInsert_, BlockType(type,count),1);        \
358*40e23c03SJunchao Zhang     link->unitbytes       = sizeof(BlockType(type,count));                              \
359*40e23c03SJunchao Zhang   }
360*40e23c03SJunchao Zhang 
361*40e23c03SJunchao Zhang /* The typedef is used to get a typename without space that CPPJoin can handle */
362*40e23c03SJunchao Zhang typedef signed char SignedChar;
363*40e23c03SJunchao Zhang typedef unsigned char UnsignedChar;
364*40e23c03SJunchao Zhang 
365*40e23c03SJunchao Zhang DEF_PackCmp(SignedChar)
366*40e23c03SJunchao Zhang DEF_PackBit(SignedChar)
367*40e23c03SJunchao Zhang DEF_PackLog(SignedChar)
368*40e23c03SJunchao Zhang DEF_PackCmp(UnsignedChar)
369*40e23c03SJunchao Zhang DEF_PackBit(UnsignedChar)
370*40e23c03SJunchao Zhang DEF_PackLog(UnsignedChar)
371*40e23c03SJunchao Zhang DEF_PackCmp(int)
372*40e23c03SJunchao Zhang DEF_PackBit(int)
373*40e23c03SJunchao Zhang DEF_PackLog(int)
374*40e23c03SJunchao Zhang DEF_PackCmp(PetscInt)
375*40e23c03SJunchao Zhang DEF_PackBit(PetscInt)
376*40e23c03SJunchao Zhang DEF_PackLog(PetscInt)
377*40e23c03SJunchao Zhang DEF_Pack(PetscInt,2)
378*40e23c03SJunchao Zhang DEF_Pack(PetscInt,3)
379*40e23c03SJunchao Zhang DEF_Pack(PetscInt,4)
380*40e23c03SJunchao Zhang DEF_Pack(PetscInt,5)
381*40e23c03SJunchao Zhang DEF_Pack(PetscInt,7)
382*40e23c03SJunchao Zhang DEF_PackCmp(PetscReal)
383*40e23c03SJunchao Zhang DEF_PackLog(PetscReal)
384*40e23c03SJunchao Zhang DEF_Pack(PetscReal,2)
385*40e23c03SJunchao Zhang DEF_Pack(PetscReal,3)
386*40e23c03SJunchao Zhang DEF_Pack(PetscReal,4)
387*40e23c03SJunchao Zhang DEF_Pack(PetscReal,5)
388*40e23c03SJunchao Zhang DEF_Pack(PetscReal,7)
389*40e23c03SJunchao Zhang #if defined(PETSC_HAVE_COMPLEX)
390*40e23c03SJunchao Zhang DEF_Pack(PetscComplex,1)
391*40e23c03SJunchao Zhang DEF_Pack(PetscComplex,2)
392*40e23c03SJunchao Zhang DEF_Pack(PetscComplex,3)
393*40e23c03SJunchao Zhang DEF_Pack(PetscComplex,4)
394*40e23c03SJunchao Zhang DEF_Pack(PetscComplex,5)
395*40e23c03SJunchao Zhang DEF_Pack(PetscComplex,7)
396*40e23c03SJunchao Zhang #endif
397*40e23c03SJunchao Zhang DEF_PackPair(int,int)
398*40e23c03SJunchao Zhang DEF_PackPair(PetscInt,PetscInt)
399*40e23c03SJunchao Zhang DEF_Block(int,1)
400*40e23c03SJunchao Zhang DEF_Block(int,2)
401*40e23c03SJunchao Zhang DEF_Block(int,4)
402*40e23c03SJunchao Zhang DEF_Block(int,8)
403*40e23c03SJunchao Zhang DEF_Block(char,1)
404*40e23c03SJunchao Zhang DEF_Block(char,2)
405*40e23c03SJunchao Zhang DEF_Block(char,4)
406*40e23c03SJunchao Zhang 
407*40e23c03SJunchao Zhang #if !defined(PETSC_HAVE_MPI_TYPE_DUP)
408*40e23c03SJunchao Zhang PETSC_STATIC_INLINE int MPI_Type_dup(MPI_Datatype datatype,MPI_Datatype *newtype)
409*40e23c03SJunchao Zhang {
410*40e23c03SJunchao Zhang   int ierr;
411*40e23c03SJunchao Zhang   ierr = MPI_Type_contiguous(1,datatype,newtype); if (ierr) return ierr;
412*40e23c03SJunchao Zhang   ierr = MPI_Type_commit(newtype); if (ierr) return ierr;
413*40e23c03SJunchao Zhang   return MPI_SUCCESS;
414*40e23c03SJunchao Zhang }
415*40e23c03SJunchao Zhang #endif
416*40e23c03SJunchao Zhang 
417*40e23c03SJunchao Zhang PetscErrorCode PetscSFPackGetInUse(PetscSF sf,MPI_Datatype unit,const void *key,PetscCopyMode cmode,PetscSFPack *mylink)
418*40e23c03SJunchao Zhang {
419*40e23c03SJunchao Zhang   PetscErrorCode    ierr;
420*40e23c03SJunchao Zhang   PetscSFPack       link,*p;
421*40e23c03SJunchao Zhang   PetscSF_Basic     *bas=(PetscSF_Basic*)sf->data;
422*40e23c03SJunchao Zhang 
423*40e23c03SJunchao Zhang   PetscFunctionBegin;
424*40e23c03SJunchao Zhang   /* Look for types in cache */
425*40e23c03SJunchao Zhang   for (p=&bas->inuse; (link=*p); p=&link->next) {
426*40e23c03SJunchao Zhang     PetscBool match;
427*40e23c03SJunchao Zhang     ierr = MPIPetsc_Type_compare(unit,link->unit,&match);CHKERRQ(ierr);
428*40e23c03SJunchao Zhang     if (match && (key == link->key)) {
429*40e23c03SJunchao Zhang       switch (cmode) {
430*40e23c03SJunchao Zhang       case PETSC_OWN_POINTER: *p = link->next; break; /* Remove from inuse list */
431*40e23c03SJunchao Zhang       case PETSC_USE_POINTER: break;
432*40e23c03SJunchao Zhang       default: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"invalid cmode");
433*40e23c03SJunchao Zhang       }
434*40e23c03SJunchao Zhang       *mylink = link;
435*40e23c03SJunchao Zhang       PetscFunctionReturn(0);
436*40e23c03SJunchao Zhang     }
437*40e23c03SJunchao Zhang   }
438*40e23c03SJunchao Zhang   SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Could not find pack");
439*40e23c03SJunchao Zhang   PetscFunctionReturn(0);
440*40e23c03SJunchao Zhang }
441*40e23c03SJunchao Zhang 
442*40e23c03SJunchao Zhang PetscErrorCode PetscSFPackReclaim(PetscSF sf,PetscSFPack *link)
443*40e23c03SJunchao Zhang {
444*40e23c03SJunchao Zhang   PetscSF_Basic     *bas=(PetscSF_Basic*)sf->data;
445*40e23c03SJunchao Zhang 
446*40e23c03SJunchao Zhang   PetscFunctionBegin;
447*40e23c03SJunchao Zhang   (*link)->key  = NULL;
448*40e23c03SJunchao Zhang   (*link)->next = bas->avail;
449*40e23c03SJunchao Zhang   bas->avail    = *link;
450*40e23c03SJunchao Zhang   *link         = NULL;
451*40e23c03SJunchao Zhang   PetscFunctionReturn(0);
452*40e23c03SJunchao Zhang }
453*40e23c03SJunchao Zhang 
454*40e23c03SJunchao Zhang PetscErrorCode PetscSFPackSetupType(PetscSFPack link,MPI_Datatype unit)
455*40e23c03SJunchao Zhang {
456*40e23c03SJunchao Zhang   PetscErrorCode ierr;
457*40e23c03SJunchao Zhang   PetscBool      isInt,isPetscInt,isPetscReal,is2Int,is2PetscInt,isSignedChar,isUnsignedChar;
458*40e23c03SJunchao Zhang   PetscInt       nPetscIntContig,nPetscRealContig;
459*40e23c03SJunchao Zhang   PetscMPIInt    ni,na,nd,combiner;
460*40e23c03SJunchao Zhang #if defined(PETSC_HAVE_COMPLEX)
461*40e23c03SJunchao Zhang   PetscBool isPetscComplex;
462*40e23c03SJunchao Zhang   PetscInt nPetscComplexContig;
463*40e23c03SJunchao Zhang #endif
464*40e23c03SJunchao Zhang 
465*40e23c03SJunchao Zhang   PetscFunctionBegin;
466*40e23c03SJunchao Zhang   ierr = MPIPetsc_Type_compare(unit,MPI_SIGNED_CHAR,&isSignedChar);CHKERRQ(ierr);
467*40e23c03SJunchao Zhang   ierr = MPIPetsc_Type_compare(unit,MPI_UNSIGNED_CHAR,&isUnsignedChar);CHKERRQ(ierr);
468*40e23c03SJunchao Zhang   /* MPI_CHAR is treated below as a dumb block type that does not support reduction according to MPI standard */
469*40e23c03SJunchao Zhang   ierr = MPIPetsc_Type_compare(unit,MPI_INT,&isInt);CHKERRQ(ierr);
470*40e23c03SJunchao Zhang   ierr = MPIPetsc_Type_compare(unit,MPIU_INT,&isPetscInt);CHKERRQ(ierr);
471*40e23c03SJunchao Zhang   ierr = MPIPetsc_Type_compare_contig(unit,MPIU_INT,&nPetscIntContig);CHKERRQ(ierr);
472*40e23c03SJunchao Zhang   ierr = MPIPetsc_Type_compare(unit,MPIU_REAL,&isPetscReal);CHKERRQ(ierr);
473*40e23c03SJunchao Zhang   ierr = MPIPetsc_Type_compare_contig(unit,MPIU_REAL,&nPetscRealContig);CHKERRQ(ierr);
474*40e23c03SJunchao Zhang #if defined(PETSC_HAVE_COMPLEX)
475*40e23c03SJunchao Zhang   ierr = MPIPetsc_Type_compare(unit,MPIU_COMPLEX,&isPetscComplex);CHKERRQ(ierr);
476*40e23c03SJunchao Zhang   ierr = MPIPetsc_Type_compare_contig(unit,MPIU_COMPLEX,&nPetscComplexContig);CHKERRQ(ierr);
477*40e23c03SJunchao Zhang #endif
478*40e23c03SJunchao Zhang   ierr = MPIPetsc_Type_compare(unit,MPI_2INT,&is2Int);CHKERRQ(ierr);
479*40e23c03SJunchao Zhang   ierr = MPIPetsc_Type_compare(unit,MPIU_2INT,&is2PetscInt);CHKERRQ(ierr);
480*40e23c03SJunchao Zhang   ierr = MPI_Type_get_envelope(unit,&ni,&na,&nd,&combiner);CHKERRQ(ierr);
481*40e23c03SJunchao Zhang   link->isbuiltin = (combiner == MPI_COMBINER_NAMED) ? PETSC_TRUE : PETSC_FALSE;
482*40e23c03SJunchao Zhang   link->bs = 1;
483*40e23c03SJunchao Zhang 
484*40e23c03SJunchao Zhang   if (isSignedChar) {PackInit_SignedChar(link); PackInit_Logical_SignedChar(link); PackInit_Bitwise_SignedChar(link); link->basicunit = MPI_SIGNED_CHAR;}
485*40e23c03SJunchao Zhang   else if (isUnsignedChar) {PackInit_UnsignedChar(link); PackInit_Logical_UnsignedChar(link); PackInit_Bitwise_UnsignedChar(link); link->basicunit = MPI_UNSIGNED_CHAR;}
486*40e23c03SJunchao Zhang   else if (isInt) {PackInit_int(link); PackInit_Logical_int(link); PackInit_Bitwise_int(link); link->basicunit = MPI_INT;}
487*40e23c03SJunchao Zhang   else if (isPetscInt) {PackInit_PetscInt(link); PackInit_Logical_PetscInt(link); PackInit_Bitwise_PetscInt(link); link->basicunit = MPIU_INT;}
488*40e23c03SJunchao Zhang   else if (isPetscReal) {PackInit_PetscReal(link); PackInit_Logical_PetscReal(link); link->basicunit = MPIU_REAL;}
489*40e23c03SJunchao Zhang #if defined(PETSC_HAVE_COMPLEX)
490*40e23c03SJunchao Zhang   else if (isPetscComplex) {PackInit_PetscComplex_1(link); link->basicunit = MPIU_COMPLEX;}
491*40e23c03SJunchao Zhang #endif
492*40e23c03SJunchao Zhang   else if (is2Int) {PackInit_int_int(link); link->basicunit = MPI_2INT;}
493*40e23c03SJunchao Zhang   else if (is2PetscInt) {PackInit_PetscInt_PetscInt(link); link->basicunit = MPIU_2INT;}
494*40e23c03SJunchao Zhang   else if (nPetscIntContig) {
495*40e23c03SJunchao Zhang     if (nPetscIntContig%7 == 0) PackInit_PetscInt_7(link);
496*40e23c03SJunchao Zhang     else if (nPetscIntContig%5 == 0) PackInit_PetscInt_5(link);
497*40e23c03SJunchao Zhang     else if (nPetscIntContig%4 == 0) PackInit_PetscInt_4(link);
498*40e23c03SJunchao Zhang     else if (nPetscIntContig%3 == 0) PackInit_PetscInt_3(link);
499*40e23c03SJunchao Zhang     else if (nPetscIntContig%2 == 0) PackInit_PetscInt_2(link);
500*40e23c03SJunchao Zhang     else PackInit_PetscInt(link);
501*40e23c03SJunchao Zhang     link->bs = nPetscIntContig;
502*40e23c03SJunchao Zhang     link->unitbytes *= nPetscIntContig;
503*40e23c03SJunchao Zhang     link->basicunit = MPIU_INT;
504*40e23c03SJunchao Zhang   } else if (nPetscRealContig) {
505*40e23c03SJunchao Zhang     if (nPetscRealContig%7 == 0) PackInit_PetscReal_7(link);
506*40e23c03SJunchao Zhang     else if (nPetscRealContig%5 == 0) PackInit_PetscReal_5(link);
507*40e23c03SJunchao Zhang     else if (nPetscRealContig%4 == 0) PackInit_PetscReal_4(link);
508*40e23c03SJunchao Zhang     else if (nPetscRealContig%3 == 0) PackInit_PetscReal_3(link);
509*40e23c03SJunchao Zhang     else if (nPetscRealContig%2 == 0) PackInit_PetscReal_2(link);
510*40e23c03SJunchao Zhang     else PackInit_PetscReal(link);
511*40e23c03SJunchao Zhang     link->bs = nPetscRealContig;
512*40e23c03SJunchao Zhang     link->unitbytes *= nPetscRealContig;
513*40e23c03SJunchao Zhang     link->basicunit = MPIU_REAL;
514*40e23c03SJunchao Zhang #if defined(PETSC_HAVE_COMPLEX)
515*40e23c03SJunchao Zhang   } else if (nPetscComplexContig) {
516*40e23c03SJunchao Zhang     if (nPetscComplexContig%7 == 0) PackInit_PetscComplex_7(link);
517*40e23c03SJunchao Zhang     else if (nPetscComplexContig%5 == 0) PackInit_PetscComplex_5(link);
518*40e23c03SJunchao Zhang     else if (nPetscComplexContig%4 == 0) PackInit_PetscComplex_4(link);
519*40e23c03SJunchao Zhang     else if (nPetscComplexContig%3 == 0) PackInit_PetscComplex_3(link);
520*40e23c03SJunchao Zhang     else if (nPetscComplexContig%2 == 0) PackInit_PetscComplex_2(link);
521*40e23c03SJunchao Zhang     else PackInit_PetscComplex_1(link);
522*40e23c03SJunchao Zhang     link->bs = nPetscComplexContig;
523*40e23c03SJunchao Zhang     link->unitbytes *= nPetscComplexContig;
524*40e23c03SJunchao Zhang     link->basicunit = MPIU_COMPLEX;
525*40e23c03SJunchao Zhang #endif
526*40e23c03SJunchao Zhang   } else {
527*40e23c03SJunchao Zhang     MPI_Aint lb,bytes;
528*40e23c03SJunchao Zhang     ierr = MPI_Type_get_extent(unit,&lb,&bytes);CHKERRQ(ierr);
529*40e23c03SJunchao Zhang     if (lb != 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Datatype with nonzero lower bound %ld\n",(long)lb);
530*40e23c03SJunchao Zhang     if (bytes % sizeof(int)) { /* If the type size is not multiple of int */
531*40e23c03SJunchao Zhang       if      (bytes%4 == 0) {PackInit_block_char_4(link); link->bs = bytes/4;} /* Note the basic type is char[4] */
532*40e23c03SJunchao Zhang       else if (bytes%2 == 0) {PackInit_block_char_2(link); link->bs = bytes/2;}
533*40e23c03SJunchao Zhang       else                   {PackInit_block_char_1(link); link->bs = bytes/1;}
534*40e23c03SJunchao Zhang       link->unitbytes = bytes;
535*40e23c03SJunchao Zhang       link->basicunit = MPI_CHAR;
536*40e23c03SJunchao Zhang     } else {
537*40e23c03SJunchao Zhang       PetscInt nInt = bytes / sizeof(int);
538*40e23c03SJunchao Zhang       if      (nInt%8 == 0)  {PackInit_block_int_8(link);  link->bs = nInt/8;} /* Note the basic type is int[8] */
539*40e23c03SJunchao Zhang       else if (nInt%4 == 0)  {PackInit_block_int_4(link);  link->bs = nInt/4;}
540*40e23c03SJunchao Zhang       else if (nInt%2 == 0)  {PackInit_block_int_2(link);  link->bs = nInt/2;}
541*40e23c03SJunchao Zhang       else                   {PackInit_block_int_1(link);  link->bs = nInt/1;}
542*40e23c03SJunchao Zhang       link->unitbytes = bytes;
543*40e23c03SJunchao Zhang       link->basicunit = MPI_INT;
544*40e23c03SJunchao Zhang     }
545*40e23c03SJunchao Zhang   }
546*40e23c03SJunchao Zhang   if (link->isbuiltin) link->unit = unit; /* builtin datatypes are common. Make it fast */
547*40e23c03SJunchao Zhang   else {ierr = MPI_Type_dup(unit,&link->unit);CHKERRQ(ierr);}
548*40e23c03SJunchao Zhang   PetscFunctionReturn(0);
549*40e23c03SJunchao Zhang }
550*40e23c03SJunchao Zhang 
551*40e23c03SJunchao Zhang PetscErrorCode PetscSFPackGetUnpackAndOp(PetscSF sf,PetscSFPack link,MPI_Op op,PetscErrorCode (**UnpackAndOp)(PetscInt,PetscInt,const PetscInt*,PetscInt,PetscSFPackOpt,void*,const void*))
552*40e23c03SJunchao Zhang {
553*40e23c03SJunchao Zhang   PetscFunctionBegin;
554*40e23c03SJunchao Zhang   *UnpackAndOp = NULL;
555*40e23c03SJunchao Zhang   if (op == MPIU_REPLACE) *UnpackAndOp = link->UnpackAndInsert;
556*40e23c03SJunchao Zhang   else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->UnpackAndAdd;
557*40e23c03SJunchao Zhang   else if (op == MPI_PROD) *UnpackAndOp = link->UnpackAndMult;
558*40e23c03SJunchao Zhang   else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->UnpackAndMax;
559*40e23c03SJunchao Zhang   else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->UnpackAndMin;
560*40e23c03SJunchao Zhang   else if (op == MPI_LAND)   *UnpackAndOp = link->UnpackAndLAND;
561*40e23c03SJunchao Zhang   else if (op == MPI_BAND)   *UnpackAndOp = link->UnpackAndBAND;
562*40e23c03SJunchao Zhang   else if (op == MPI_LOR)    *UnpackAndOp = link->UnpackAndLOR;
563*40e23c03SJunchao Zhang   else if (op == MPI_BOR)    *UnpackAndOp = link->UnpackAndBOR;
564*40e23c03SJunchao Zhang   else if (op == MPI_LXOR)   *UnpackAndOp = link->UnpackAndLXOR;
565*40e23c03SJunchao Zhang   else if (op == MPI_BXOR)   *UnpackAndOp = link->UnpackAndBXOR;
566*40e23c03SJunchao Zhang   else if (op == MPI_MAXLOC) *UnpackAndOp = link->UnpackAndMaxloc;
567*40e23c03SJunchao Zhang   else if (op == MPI_MINLOC) *UnpackAndOp = link->UnpackAndMinloc;
568*40e23c03SJunchao Zhang   else *UnpackAndOp = NULL;
569*40e23c03SJunchao Zhang   PetscFunctionReturn(0);
570*40e23c03SJunchao Zhang }
571*40e23c03SJunchao Zhang 
572*40e23c03SJunchao Zhang PetscErrorCode PetscSFPackGetFetchAndOp(PetscSF sf,PetscSFPack link,MPI_Op op,PetscErrorCode (**FetchAndOp)(PetscInt,PetscInt,const PetscInt*,PetscInt,PetscSFPackOpt,void*,void*))
573*40e23c03SJunchao Zhang {
574*40e23c03SJunchao Zhang   PetscFunctionBegin;
575*40e23c03SJunchao Zhang   *FetchAndOp = NULL;
576*40e23c03SJunchao Zhang   if (op == MPIU_REPLACE) *FetchAndOp = link->FetchAndInsert;
577*40e23c03SJunchao Zhang   else if (op == MPI_SUM || op == MPIU_SUM) *FetchAndOp = link->FetchAndAdd;
578*40e23c03SJunchao Zhang   else if (op == MPI_MAX || op == MPIU_MAX) *FetchAndOp = link->FetchAndMax;
579*40e23c03SJunchao Zhang   else if (op == MPI_MIN || op == MPIU_MIN) *FetchAndOp = link->FetchAndMin;
580*40e23c03SJunchao Zhang   else if (op == MPI_MAXLOC) *FetchAndOp = link->FetchAndMaxloc;
581*40e23c03SJunchao Zhang   else if (op == MPI_MINLOC) *FetchAndOp = link->FetchAndMinloc;
582*40e23c03SJunchao Zhang   else if (op == MPI_PROD)   *FetchAndOp = link->FetchAndMult;
583*40e23c03SJunchao Zhang   else if (op == MPI_LAND)   *FetchAndOp = link->FetchAndLAND;
584*40e23c03SJunchao Zhang   else if (op == MPI_BAND)   *FetchAndOp = link->FetchAndBAND;
585*40e23c03SJunchao Zhang   else if (op == MPI_LOR)    *FetchAndOp = link->FetchAndLOR;
586*40e23c03SJunchao Zhang   else if (op == MPI_BOR)    *FetchAndOp = link->FetchAndBOR;
587*40e23c03SJunchao Zhang   else if (op == MPI_LXOR)   *FetchAndOp = link->FetchAndLXOR;
588*40e23c03SJunchao Zhang   else if (op == MPI_BXOR)   *FetchAndOp = link->FetchAndBXOR;
589*40e23c03SJunchao Zhang   else SETERRQ(PetscObjectComm((PetscObject)sf),PETSC_ERR_SUP,"No support for MPI_Op");
590*40e23c03SJunchao Zhang   PetscFunctionReturn(0);
591*40e23c03SJunchao Zhang }
592*40e23c03SJunchao Zhang 
593*40e23c03SJunchao Zhang /*
594*40e23c03SJunchao Zhang   Setup pack/unpack optimization plans based on indice patterns available
595*40e23c03SJunchao Zhang 
596*40e23c03SJunchao Zhang    Input Parameters:
597*40e23c03SJunchao Zhang   +  n       - number of target processors
598*40e23c03SJunchao Zhang   .  offset  - [n+1] for the i-th processor, its associated indices are idx[offset[i], offset[i+1])
599*40e23c03SJunchao Zhang   -  idx     - [] array storing indices. Its length is offset[n+1]
600*40e23c03SJunchao Zhang 
601*40e23c03SJunchao Zhang    Output Parameters:
602*40e23c03SJunchao Zhang   +  opt    - the optimization
603*40e23c03SJunchao Zhang */
604*40e23c03SJunchao Zhang PetscErrorCode PetscSFPackSetupOptimization(PetscInt n,const PetscInt *offset,const PetscInt *idx,PetscSFPackOpt *out)
605*40e23c03SJunchao Zhang {
606*40e23c03SJunchao Zhang   PetscErrorCode ierr;
607*40e23c03SJunchao Zhang   PetscInt       i,j,k,n_copies,tot_copies=0,step;
608*40e23c03SJunchao Zhang   PetscBool      strided,has_strided=PETSC_FALSE,has_optimized=PETSC_FALSE;
609*40e23c03SJunchao Zhang   PetscSFPackOpt opt;
610*40e23c03SJunchao Zhang 
611*40e23c03SJunchao Zhang   PetscFunctionBegin;
612*40e23c03SJunchao Zhang   ierr = PetscCalloc1(1,&opt);CHKERRQ(ierr);
613*40e23c03SJunchao Zhang   ierr = PetscCalloc2(n,&opt->optimized,n+1,&opt->copy_offset);CHKERRQ(ierr);
614*40e23c03SJunchao Zhang 
615*40e23c03SJunchao Zhang   /* Check if the indices are piece-wise contiguous (if yes, we can optimize a packing with mulitple memcpy's ) */
616*40e23c03SJunchao Zhang   for (i=0; i<n; i++) { /* for each target processor */
617*40e23c03SJunchao Zhang     /* Scan indices to count n_copies -- the number of contiguous pieces for i-th target */
618*40e23c03SJunchao Zhang     n_copies = 1;
619*40e23c03SJunchao Zhang     for (j=offset[i]; j<offset[i+1]-1; j++) {
620*40e23c03SJunchao Zhang       if (idx[j]+1 != idx[j+1]) n_copies++;
621*40e23c03SJunchao Zhang     }
622*40e23c03SJunchao Zhang     /* If the average length (in no. of indices) of contiguous pieces is long enough, say >=32,
623*40e23c03SJunchao Zhang        then it is worth using memcpy for this target. 32 is an arbitrarily chosen number.
624*40e23c03SJunchao Zhang      */
625*40e23c03SJunchao Zhang     if ((offset[i+1]-offset[i])/n_copies >= 32) {
626*40e23c03SJunchao Zhang       opt->optimized[i] = PETSC_TRUE;
627*40e23c03SJunchao Zhang       has_optimized     = PETSC_TRUE;
628*40e23c03SJunchao Zhang       tot_copies       += n_copies;
629*40e23c03SJunchao Zhang     }
630*40e23c03SJunchao Zhang   }
631*40e23c03SJunchao Zhang 
632*40e23c03SJunchao Zhang   /* Setup memcpy plan for each contiguous piece */
633*40e23c03SJunchao Zhang   k    = 0; /* k-th copy */
634*40e23c03SJunchao Zhang   ierr = PetscMalloc2(tot_copies,&opt->copy_start,tot_copies,&opt->copy_length);CHKERRQ(ierr);
635*40e23c03SJunchao Zhang   for (i=0; i<n; i++) { /* for each target processor procs[i] */
636*40e23c03SJunchao Zhang     if (opt->optimized[i]) {
637*40e23c03SJunchao Zhang       n_copies           = 1;
638*40e23c03SJunchao Zhang       opt->copy_start[k] = idx[offset[i]];
639*40e23c03SJunchao Zhang       for (j=offset[i]; j<offset[i+1]-1; j++) {
640*40e23c03SJunchao Zhang         if (idx[j]+1 != idx[j+1]) { /* meet end of a copy (and next copy must exist) */
641*40e23c03SJunchao Zhang           n_copies++;
642*40e23c03SJunchao Zhang           opt->copy_start[k+1] = idx[j+1];
643*40e23c03SJunchao Zhang           opt->copy_length[k]  = idx[j] - opt->copy_start[k] + 1;
644*40e23c03SJunchao Zhang           k++;
645*40e23c03SJunchao Zhang         }
646*40e23c03SJunchao Zhang       }
647*40e23c03SJunchao Zhang       /* Set copy length of the last copy for this target */
648*40e23c03SJunchao Zhang       opt->copy_length[k] = idx[j] - opt->copy_start[k] + 1;
649*40e23c03SJunchao Zhang       k++;
650*40e23c03SJunchao Zhang     }
651*40e23c03SJunchao Zhang     /* Set offset for next target. When optimized[i]=false, copy_offsets[i]=copy_offsets[i+1] */
652*40e23c03SJunchao Zhang     opt->copy_offset[i+1] = k;
653*40e23c03SJunchao Zhang   }
654*40e23c03SJunchao Zhang 
655*40e23c03SJunchao Zhang   /* Last chance! If the indices do not have long contiguous pieces, are they strided? */
656*40e23c03SJunchao Zhang   ierr = PetscMalloc3(n,&opt->stride_first,n,&opt->stride_step,n,&opt->stride_n);CHKERRQ(ierr);
657*40e23c03SJunchao Zhang   for (i=0; i<n; i++) { /* for each remote */
658*40e23c03SJunchao Zhang     if (!opt->optimized[i] && (offset[i+1] - offset[i]) >= 16) { /* few indices (<16) are not worth striding */
659*40e23c03SJunchao Zhang       strided = PETSC_TRUE;
660*40e23c03SJunchao Zhang       step    = idx[offset[i]+1] - idx[offset[i]];
661*40e23c03SJunchao Zhang       for (j=offset[i]; j<offset[i+1]-1; j++) {
662*40e23c03SJunchao Zhang         if (idx[j]+step != idx[j+1]) { strided = PETSC_FALSE; break; }
663*40e23c03SJunchao Zhang       }
664*40e23c03SJunchao Zhang       if (strided) {
665*40e23c03SJunchao Zhang         opt->optimized[i]    = PETSC_TRUE;
666*40e23c03SJunchao Zhang         opt->stride_first[i] = idx[offset[i]];
667*40e23c03SJunchao Zhang         opt->stride_step[i]  = step;
668*40e23c03SJunchao Zhang         opt->stride_n[i]     = offset[i+1] - offset[i];
669*40e23c03SJunchao Zhang         has_strided          = PETSC_TRUE;
670*40e23c03SJunchao Zhang         has_optimized        = PETSC_TRUE;
671*40e23c03SJunchao Zhang       }
672*40e23c03SJunchao Zhang     }
673*40e23c03SJunchao Zhang   }
674*40e23c03SJunchao Zhang   /* If no target has been stride-optimized or optimized, free related arrays to save memory */
675*40e23c03SJunchao Zhang   if (!has_strided) {ierr = PetscFree3(opt->stride_first,opt->stride_step,opt->stride_n);CHKERRQ(ierr);}
676*40e23c03SJunchao Zhang   if (!has_optimized) {
677*40e23c03SJunchao Zhang     ierr = PetscFree2(opt->optimized,opt->copy_offset);CHKERRQ(ierr);
678*40e23c03SJunchao Zhang     ierr = PetscFree2(opt->copy_start,opt->copy_length);CHKERRQ(ierr);
679*40e23c03SJunchao Zhang     ierr = PetscFree(opt);CHKERRQ(ierr);
680*40e23c03SJunchao Zhang     *out = NULL;
681*40e23c03SJunchao Zhang   } else *out = opt;
682*40e23c03SJunchao Zhang   PetscFunctionReturn(0);
683*40e23c03SJunchao Zhang }
684*40e23c03SJunchao Zhang 
685*40e23c03SJunchao Zhang PetscErrorCode PetscSFPackDestoryOptimization(PetscSFPackOpt *out)
686*40e23c03SJunchao Zhang {
687*40e23c03SJunchao Zhang   PetscErrorCode ierr;
688*40e23c03SJunchao Zhang   PetscSFPackOpt opt = *out;
689*40e23c03SJunchao Zhang 
690*40e23c03SJunchao Zhang   PetscFunctionBegin;
691*40e23c03SJunchao Zhang   if (opt) {
692*40e23c03SJunchao Zhang     ierr = PetscFree2(opt->optimized,opt->copy_offset);CHKERRQ(ierr);
693*40e23c03SJunchao Zhang     ierr = PetscFree2(opt->copy_start,opt->copy_length);CHKERRQ(ierr);
694*40e23c03SJunchao Zhang     ierr = PetscFree3(opt->stride_first,opt->stride_step,opt->stride_n);CHKERRQ(ierr);
695*40e23c03SJunchao Zhang     ierr = PetscFree(opt);CHKERRQ(ierr);
696*40e23c03SJunchao Zhang     *out = NULL;
697*40e23c03SJunchao Zhang   }
698*40e23c03SJunchao Zhang   PetscFunctionReturn(0);
699*40e23c03SJunchao Zhang }
700