xref: /petsc/src/vec/is/sf/impls/basic/sfpack.c (revision 9d1c8add3f910d1f6c9c3b06f1a2a7ffcd567655)
140e23c03SJunchao Zhang 
240e23c03SJunchao Zhang #include <../src/vec/is/sf/impls/basic/sfpack.h>
340e23c03SJunchao Zhang #include <../src/vec/is/sf/impls/basic/sfbasic.h>
440e23c03SJunchao Zhang 
/*
 * MPI_Reduce_local is not really useful because it can't handle sparse data and it vectorizes "in the wrong direction",
 * therefore we pack data types manually. This file defines packing routines for the standard data types.
 */
940e23c03SJunchao Zhang 
/* Token-pasting helpers. CPPJoin2(x,y) yields xy and CPPJoin3_(x,y,z) yields xy_z.
   Each public macro forwards to an _exp helper so that arguments which are themselves
   macros get expanded before they are pasted together. */
#define CPPJoin2_exp(x,y)      x##y
#define CPPJoin2(x,y)          CPPJoin2_exp(x,y)
#define CPPJoin3_exp_(x,y,z)   x##y##_##z
#define CPPJoin3_(x,y,z)       CPPJoin3_exp_(x,y,z)
1440e23c03SJunchao Zhang 
/* Statement filters, passed as the FILTER argument of the DEF_Action* macros.
   EXECUTE emits the statement unchanged. It deliberately adds no braces, because the
   statement may be a declaration that must stay visible in the enclosing scope.
   IGNORE discards the statement and leaves an empty statement in its place. */
#define EXECUTE(stmt)   stmt
#define IGNORE(stmt)    do { } while (0)
1740e23c03SJunchao Zhang 
/* Reduction kernels. Each macro stores into r the result of combining s and t, and all
   take the same four arguments (r,s,op,t) so that they can be used interchangeably as the
   APPLY argument of DEF_ActionAndOp.

   BINARY_OP    op is an infix operator such as +, *, && etc.
   FUNCTION_OP  op is called like a function, such as PetscMax, PetscMin
   LXOR_OP      logical exclusive OR; op is accepted for uniformity but not used.
                s and t are parenthesized before negation so that expression arguments
                (e.g. a-b) are negated as a whole.
   PAIRTYPE_OP  applies the infix operator op member-wise to the .a and .b fields
 */
#define BINARY_OP(r,s,op,t)   do {(r) = (s) op (t);  } while(0)
#define FUNCTION_OP(r,s,op,t) do {(r) = op((s),(t)); } while(0)
#define LXOR_OP(r,s,op,t)     do {(r) = (!(s)) != (!(t));} while(0)
#define PAIRTYPE_OP(r,s,op,t) do {(r).a = (s).a op (t).a; (r).b = (s).b op (t).b;} while(0)
2240e23c03SJunchao Zhang 
/* Generated type names:
   BlockType(basic,len)   expands to _blocktype_<basic>_<len>,   the typename for struct {basic v[len];}
   PairType(first,second) expands to _pairtype_<first>_<second>, the typename for struct {first a; second b;}
 */
#define BlockType(basic,len)    CPPJoin3_(_blocktype_,basic,len)
#define PairType(first,second)  CPPJoin3_(_pairtype_,first,second)
2540e23c03SJunchao Zhang 
/* DEF_PackFunc - macro generating the routine Pack_<type>_<BS>, which copies selected entries
   of the unpacked array into consecutive positions of the packed buffer.

   Arguments of the macro:
   +type      Type of the basic data in an entry, i.e., int, PetscInt, PetscReal etc. It is not the type of an entry.
   -BS        Block size for vectorization. It is a factor of bs.

   Arguments of the generated Pack routine:
   +n         Number of entries to pack. Each entry is of type 'unit', the arg used in calls like PetscSFBcastBegin(sf,unit,..).
              If idx is not NULL, then n also indicates the number of indices in idx[]
   .bs        Number of basic types in an entry. Ex. if unit is MPI_2INT, then bs=2 and the basic type is int
   .idx       Indices of entries. NULL means contiguous indices [0,n)
   .r         Do packing for the r-th target processor
   .opt       Pack optimization plans. NULL means no plan.
   .unpacked  Address of the unpacked data
   -packed    Address of the packed data
 */
#define DEF_PackFunc(type,BS) \
  static PetscErrorCode CPPJoin3_(Pack_,type,BS)(PetscInt n,PetscInt bs,const PetscInt *idx,PetscInt r,PetscSFPackOpt opt,const void *unpacked,void *packed) { \
    PetscErrorCode ierr;                                                                                   \
    const type     *src = (const type*)unpacked,*seg;                                                      \
    type           *dst = (type*)packed;                                                                   \
    PetscInt       ent,blk,c,len,stride;                                                                   \
    PetscFunctionBegin;                                                                                    \
    if (!idx) { /* contiguous entries: a single bulk copy */                                               \
      ierr = PetscArraycpy(dst,src,bs*n);CHKERRQ(ierr);                                                    \
    } else if (!opt || !opt->optimized[r]) { /* no optimized plan for r: gather through idx[] */           \
      for (ent=0; ent<n; ent++) {                                                                          \
        for (blk=0; blk<bs; blk+=BS) {                                                                     \
          for (c=blk; c<blk+BS; c++) { /* fixed trip count BS */                                           \
            dst[ent*bs+c] = src[idx[ent]*bs+c];                                                            \
          }                                                                                                \
        }                                                                                                  \
      }                                                                                                    \
    } else if (opt->copy_offset[r] != opt->copy_offset[r+1]) { /* idx[] is piece-wise contiguous */        \
      for (ent=opt->copy_offset[r]; ent<opt->copy_offset[r+1]; ent++) { /* one copy per piece */           \
        len  = opt->copy_length[ent]*bs; /* piece length counted in basic types */                         \
        seg  = src + opt->copy_start[ent]*bs;                                                              \
        ierr = PetscArraycpy(dst,seg,len);CHKERRQ(ierr);                                                   \
        dst += len;                                                                                        \
      }                                                                                                    \
    } else { /* idx[] is strided */                                                                        \
      src   += opt->stride_first[r]*bs;                                                                    \
      stride = opt->stride_step[r];                                                                        \
      for (ent=0; ent<opt->stride_n[r]; ent++) {                                                           \
        for (c=0; c<bs; c++) dst[ent*bs+c] = src[ent*stride*bs+c];                                         \
      }                                                                                                    \
    }                                                                                                      \
    PetscFunctionReturn(0);                                                                                \
  }
7440e23c03SJunchao Zhang 
/* DEF_Action - macro generating an UnpackAndInsert or FetchAndInsert routine

   Arguments:
  +action     Unpack or Fetch
  .type       Type of the data
  .BS         Block size for vectorization
  .FILTER     Macro defining what to do with a statement, either EXECUTE or IGNORE
  .ctype      Type with or without the const qualifier, i.e., const type or type
  -cvoid      void with or without the const qualifier, i.e., const void or void

  Notes:
   The generated routine overwrites the selected entries of unpacked with the packed data.
   Statements wrapped in FILTER additionally write the previous values of those entries
   back into packed; they are only emitted when FILTER is EXECUTE.

   This macro is not combined with DEF_ActionAndOp because we want to use memcpy in this macro.
   Two arguments, ctype and cvoid, are used instead of a single constness argument to avoid the
   compilation warning "empty macro arguments are undefined in ISO C90": with one constness
   argument we would sometimes pass 'const' and sometimes have to pass nothing.
 */
#define DEF_Action(action,type,BS,FILTER,ctype,cvoid)               \
  static PetscErrorCode CPPJoin3_(action##AndInsert_,type,BS)(PetscInt n,PetscInt bs,const PetscInt *idx,PetscInt r,PetscSFPackOpt opt,void *unpacked,cvoid *packed) { \
    PetscErrorCode ierr;                                                                                   \
    type           *unp = (type*)unpacked,*seg;                                                            \
    ctype          *pk  = (ctype*)packed;                                                                  \
    PetscInt       ent,blk,c,len,stride;                                                                   \
    PetscFunctionBegin;                                                                                    \
    if (!idx) { /* contiguous entries: bulk copies, with a scratch buffer holding the old values */        \
      FILTER(type *saved);                                                                                 \
      FILTER(ierr = PetscMalloc1(bs*n,&saved);CHKERRQ(ierr));                                              \
      FILTER(ierr = PetscArraycpy(saved,unp,bs*n);CHKERRQ(ierr));                                          \
             ierr = PetscArraycpy(unp,pk,bs*n);CHKERRQ(ierr);                                              \
      FILTER(ierr = PetscArraycpy(pk,saved,bs*n);CHKERRQ(ierr));                                           \
      FILTER(ierr = PetscFree(saved);CHKERRQ(ierr));                                                       \
    } else if (!opt || !opt->optimized[r]) { /* no optimized plan for r: scatter through idx[] */          \
      for (ent=0; ent<n; ent++) {                                                                          \
        for (blk=0; blk<bs; blk+=BS) {                                                                     \
          for (c=blk; c<blk+BS; c++) {                                                                     \
            FILTER(type old = unp[idx[ent]*bs+c]);                                                         \
            unp[idx[ent]*bs+c] = pk[ent*bs+c];                                                             \
            FILTER(pk[ent*bs+c] = old);                                                                    \
          }                                                                                                \
        }                                                                                                  \
      }                                                                                                    \
    } else if (opt->copy_offset[r] != opt->copy_offset[r+1]) { /* idx[] is piece-wise contiguous */        \
      FILTER(type *saved);                                                                                 \
      FILTER(ierr = PetscMalloc1(bs*n,&saved);CHKERRQ(ierr)); /* maximal buffer, reused by every piece */  \
      for (ent=opt->copy_offset[r]; ent<opt->copy_offset[r+1]; ent++) { /* ent-th piece */                 \
        len = opt->copy_length[ent]*bs; /* piece length counted in basic types */                          \
        seg = unp + opt->copy_start[ent]*bs;                                                               \
        FILTER(ierr = PetscArraycpy(saved,seg,len);CHKERRQ(ierr));                                         \
               ierr = PetscArraycpy(seg,pk,len);CHKERRQ(ierr);                                             \
        FILTER(ierr = PetscArraycpy(pk,saved,len);CHKERRQ(ierr));                                          \
        pk += len;                                                                                         \
      }                                                                                                    \
      FILTER(ierr = PetscFree(saved);CHKERRQ(ierr));                                                       \
    } else { /* idx[] is strided */                                                                        \
      unp   += opt->stride_first[r]*bs;                                                                    \
      stride = opt->stride_step[r];                                                                        \
      for (ent=0; ent<opt->stride_n[r]; ent++) {                                                           \
        for (c=0; c<bs; c++) {                                                                             \
          FILTER(type old = unp[ent*stride*bs+c]);                                                         \
          unp[ent*stride*bs+c] = pk[ent*bs+c];                                                             \
          FILTER(pk[ent*bs+c] = old);                                                                      \
        }                                                                                                  \
      }                                                                                                    \
    }                                                                                                      \
    PetscFunctionReturn(0);                                                                                \
  }
14140e23c03SJunchao Zhang 
/* DEF_ActionAndOp - macro generating an UnpackAnd<opname> or FetchAnd<opname> routine.
   Op can not be Insert, Maxloc or Minloc

   Arguments:
  +action     Unpack or Fetch
  .opname     Name of the Op, such as Add, Mult, LAND, etc.
  .type       Type of the data
  .BS         Block size for vectorization
  .op         Operator for the op, such as +, *, &&, ||, PetscMax, PetscMin, etc.
  .APPLY      Macro defining application of the op. Could be BINARY_OP, FUNCTION_OP, LXOR_OP or PAIRTYPE_OP
  .FILTER     Macro defining what to do with a statement, either EXECUTE or IGNORE
  .ctype      Type with or without the const qualifier, i.e., const type or type
  -cvoid      void with or without the const qualifier, i.e., const void or void

  Notes:
   For every selected entry the generated routine reads the current unpacked value, stores
   APPLY(current,op,packed) back into unpacked, and, only when FILTER is EXECUTE, writes the
   value read before the update into packed.
 */
#define DEF_ActionAndOp(action,opname,type,BS,op,APPLY,FILTER,ctype,cvoid) \
  static PetscErrorCode CPPJoin3_(action##And##opname##_,type,BS)(PetscInt n,PetscInt bs,const PetscInt *idx,PetscInt r,PetscSFPackOpt opt,void *unpacked,cvoid *packed) { \
    type     *unp = (type*)unpacked,*seg,old;                                                              \
    ctype    *pk  = (ctype*)packed;                                                                        \
    PetscInt ent,blk,c,len,stride;                                                                         \
    PetscFunctionBegin;                                                                                    \
    if (!idx) { /* contiguous entries: one flat loop over all n*bs basic values */                         \
      for (ent=0; ent<n*bs; ent++) {                                                                       \
        old = unp[ent];                                                                                    \
        APPLY(unp[ent],old,op,pk[ent]);                                                                    \
        FILTER(pk[ent] = old);                                                                             \
      }                                                                                                    \
    } else if (!opt || !opt->optimized[r]) { /* no optimized plan for r: go through idx[] */               \
      for (ent=0; ent<n; ent++) {                                                                          \
        for (blk=0; blk<bs; blk+=BS) {                                                                     \
          for (c=blk; c<blk+BS; c++) {                                                                     \
            old = unp[idx[ent]*bs+c];                                                                      \
            APPLY(unp[idx[ent]*bs+c],old,op,pk[ent*bs+c]);                                                 \
            FILTER(pk[ent*bs+c] = old);                                                                    \
          }                                                                                                \
        }                                                                                                  \
      }                                                                                                    \
    } else if (opt->copy_offset[r] != opt->copy_offset[r+1]) { /* idx[] is piece-wise contiguous */        \
      for (ent=opt->copy_offset[r]; ent<opt->copy_offset[r+1]; ent++) { /* ent-th piece */                 \
        len = opt->copy_length[ent]*bs; /* piece length counted in basic types */                          \
        seg = unp + opt->copy_start[ent]*bs;                                                               \
        for (c=0; c<len; c++) {                                                                            \
          old = seg[c];                                                                                    \
          APPLY(seg[c],old,op,pk[c]);                                                                      \
          FILTER(pk[c] = old);                                                                             \
        }                                                                                                  \
        pk += len;                                                                                         \
      }                                                                                                    \
    } else { /* idx[] is strided */                                                                        \
      unp   += opt->stride_first[r]*bs;                                                                    \
      stride = opt->stride_step[r];                                                                        \
      for (ent=0; ent<opt->stride_n[r]; ent++) {                                                           \
        for (c=0; c<bs; c++) {                                                                             \
          old = unp[ent*stride*bs+c];                                                                      \
          APPLY(unp[ent*stride*bs+c],old,op,pk[ent*bs+c]);                                                 \
          FILTER(pk[ent*bs+c] = old);                                                                      \
        }                                                                                                  \
      }                                                                                                    \
    }                                                                                                      \
    PetscFunctionReturn(0);                                                                                \
  }
20240e23c03SJunchao Zhang 
/* DEF_ActionAndXloc - macro generating an UnpackAndMaxloc(Minloc) or FetchAndMaxloc(Minloc) routine

   Arguments:
  +action     Unpack or Fetch
  .locname    Max or Min
  .type1      Type of the first data in a pair type
  .type2      Type of the second data in a pair type, usually PetscMPIInt for MPI ranks.
  .op         > or <
  .FILTER     Macro defining what to do with a statement, either EXECUTE or IGNORE
  .ctype      Type with or without the const qualifier, i.e., const PairType(type1,type2) or PairType(type1,type2)
  -cvoid      void with or without the const qualifier, i.e., const void or void

  Notes:
   For each of the n entries, the unpacked pair is replaced by the packed pair when the packed
   .a compares (op) against the unpacked .a; when both .a are equal the smaller .b is kept.
   When FILTER is EXECUTE the pair held in unpacked before the update is written back to packed.

   As in the other routines of this file, a NULL idx means contiguous indices [0,n).
   The arguments bs, r and opt are not used by the generated routine.
 */
#define DEF_ActionAndXloc(action,locname,type1,type2,op,FILTER,ctype,cvoid) \
  static PetscErrorCode CPPJoin3_(action##And##locname##loc_,PairType(type1,type2),1)(PetscInt n,PetscInt bs,const PetscInt *idx,PetscInt r,PetscSFPackOpt opt,void *unpacked,cvoid *packed) { \
    PairType(type1,type2) *u = (PairType(type1,type2)*)unpacked;                                           \
    ctype                 *p = (ctype*)packed;                                                             \
    PetscInt              i;                                                                               \
    PetscFunctionBegin; /* pairs with the PetscFunctionReturn below */                                     \
    for (i=0; i<n; i++) {                                                                                  \
      PetscInt j = idx ? idx[i] : i; /* NULL idx means contiguous indices */                               \
      FILTER(PairType(type1,type2) v = u[j]);                                                              \
      if (p[i].a op u[j].a) {                                                                              \
        u[j] = p[i];                                                                                       \
      } else if (p[i].a == u[j].a) {                                                                       \
        u[j].b = PetscMin(u[j].b,p[i].b); /* Minimal rank. Ref MPI MAXLOC */                               \
      }                                                                                                    \
      FILTER(p[i] = v);                                                                                    \
    }                                                                                                      \
    PetscFunctionReturn(0);                                                                                \
  }
23240e23c03SJunchao Zhang 
23340e23c03SJunchao Zhang 
/* Pack/unpack/fetch ops for all types: generates Pack_<type>_<BS>, UnpackAndInsert_<type>_<BS>
   and FetchAndInsert_<type>_<BS>. Unpack only reads the packed buffer (const, IGNORE);
   Fetch also writes to it (non-const, EXECUTE).
   The last line carries no line continuation, so the macro ends with its own text and
   cannot absorb whatever follows it in the file. */
#define DEF_PackNoInit(type,BS)                                                         \
  DEF_PackFunc(type,BS)                                                                 \
  DEF_Action(Unpack,type,BS,IGNORE,const type,const void)                               \
  DEF_Action(Fetch, type,BS,EXECUTE,type,void)

24040e23c03SJunchao Zhang 
/* Types supporting arithmetic: everything from DEF_PackNoInit plus the Unpack and Fetch
   variants of Add (+) and Mult (*) */
#define DEF_PackAddNoInit(type,BS)                                                      \
  DEF_PackNoInit(type,BS)                                                               \
  DEF_ActionAndOp(Unpack,Add, type,BS,+,BINARY_OP,IGNORE, const type,const void)        \
  DEF_ActionAndOp(Fetch, Add, type,BS,+,BINARY_OP,EXECUTE,type,void)                    \
  DEF_ActionAndOp(Unpack,Mult,type,BS,*,BINARY_OP,IGNORE, const type,const void)        \
  DEF_ActionAndOp(Fetch, Mult,type,BS,*,BINARY_OP,EXECUTE,type,void)
24840e23c03SJunchao Zhang 
/* Basic types: generates the Insert/Add/Mult kernels for (type,BS) and the initializer
   PackInit_<type>_<BS>(link), which stores those kernels in the link and sets
   link->unitbytes to sizeof(type) */
#define DEF_Pack(type,BS)                                                               \
  DEF_PackAddNoInit(type,BS)                                                            \
  static void CPPJoin3_(PackInit_,type,BS)(PetscSFPack link) {                          \
    link->unitbytes       = sizeof(type);                                               \
    link->Pack            = CPPJoin3_(Pack_,type,BS);                                   \
    /* unpack kernels */                                                                \
    link->UnpackAndInsert = CPPJoin3_(UnpackAndInsert_,type,BS);                        \
    link->UnpackAndAdd    = CPPJoin3_(UnpackAndAdd_,type,BS);                           \
    link->UnpackAndMult   = CPPJoin3_(UnpackAndMult_,type,BS);                          \
    /* fetch kernels */                                                                 \
    link->FetchAndInsert  = CPPJoin3_(FetchAndInsert_,type,BS);                         \
    link->FetchAndAdd     = CPPJoin3_(FetchAndAdd_,type,BS);                            \
    link->FetchAndMult    = CPPJoin3_(FetchAndMult_,type,BS);                           \
  }
26240e23c03SJunchao Zhang 
/* Comparable types: on top of the Insert/Add/Mult kernels, generates Max and Min kernels
   (via PetscMax/PetscMin), all with block size 1, and the initializer PackInit_<type>(link)
   that stores them in the link and sets link->unitbytes to sizeof(type) */
#define DEF_PackCmp(type)                                                               \
  DEF_PackAddNoInit(type,1)                                                             \
  DEF_ActionAndOp(Unpack,Max,type,1,PetscMax,FUNCTION_OP,IGNORE, const type,const void) \
  DEF_ActionAndOp(Fetch, Max,type,1,PetscMax,FUNCTION_OP,EXECUTE,type,void)             \
  DEF_ActionAndOp(Unpack,Min,type,1,PetscMin,FUNCTION_OP,IGNORE, const type,const void) \
  DEF_ActionAndOp(Fetch, Min,type,1,PetscMin,FUNCTION_OP,EXECUTE,type,void)             \
  static void CPPJoin2(PackInit_,type)(PetscSFPack link) {                              \
    link->unitbytes       = sizeof(type);                                               \
    link->Pack            = CPPJoin3_(Pack_,type,1);                                    \
    /* unpack kernels */                                                                \
    link->UnpackAndInsert = CPPJoin3_(UnpackAndInsert_,type,1);                         \
    link->UnpackAndAdd    = CPPJoin3_(UnpackAndAdd_,type,1);                            \
    link->UnpackAndMult   = CPPJoin3_(UnpackAndMult_,type,1);                           \
    link->UnpackAndMax    = CPPJoin3_(UnpackAndMax_,type,1);                            \
    link->UnpackAndMin    = CPPJoin3_(UnpackAndMin_,type,1);                            \
    /* fetch kernels */                                                                 \
    link->FetchAndInsert  = CPPJoin3_(FetchAndInsert_,type,1);                          \
    link->FetchAndAdd     = CPPJoin3_(FetchAndAdd_,type,1);                             \
    link->FetchAndMult    = CPPJoin3_(FetchAndMult_,type,1);                            \
    link->FetchAndMax     = CPPJoin3_(FetchAndMax_,type,1);                             \
    link->FetchAndMin     = CPPJoin3_(FetchAndMin_,type,1);                             \
  }
28440e23c03SJunchao Zhang 
/* Logical types: generates the LAND, LOR and LXOR kernels with block size 1 and the
   initializer PackInit_Logical_<type>(link), which stores only these six kernels in the link.

   LXOR_OP does not use its operator argument. The & passed for it is a placeholder, given
   only to avoid the compilation warning "empty macro arguments are undefined in ISO C90".
 */
#define DEF_PackLog(type)                                                               \
  DEF_ActionAndOp(Unpack,LAND,type,1,&&,BINARY_OP,IGNORE, const type,const void)        \
  DEF_ActionAndOp(Fetch, LAND,type,1,&&,BINARY_OP,EXECUTE,type,void)                    \
  DEF_ActionAndOp(Unpack,LOR, type,1,||,BINARY_OP,IGNORE, const type,const void)        \
  DEF_ActionAndOp(Fetch, LOR, type,1,||,BINARY_OP,EXECUTE,type,void)                    \
  DEF_ActionAndOp(Unpack,LXOR,type,1,&, LXOR_OP,  IGNORE, const type,const void)        \
  DEF_ActionAndOp(Fetch, LXOR,type,1,&, LXOR_OP,  EXECUTE,type,void)                    \
  static void CPPJoin2(PackInit_Logical_,type)(PetscSFPack link) {                      \
    /* unpack kernels */                                                                \
    link->UnpackAndLAND   = CPPJoin3_(UnpackAndLAND_,type,1);                           \
    link->UnpackAndLOR    = CPPJoin3_(UnpackAndLOR_,type,1);                            \
    link->UnpackAndLXOR   = CPPJoin3_(UnpackAndLXOR_,type,1);                           \
    /* fetch kernels */                                                                 \
    link->FetchAndLAND    = CPPJoin3_(FetchAndLAND_,type,1);                            \
    link->FetchAndLOR     = CPPJoin3_(FetchAndLOR_,type,1);                             \
    link->FetchAndLXOR    = CPPJoin3_(FetchAndLXOR_,type,1);                            \
  }
30440e23c03SJunchao Zhang 
30540e23c03SJunchao Zhang 
/* Bitwise types.
   DEF_PackBit(type) expands DEF_ActionAndOp for the bitwise operators &, | and ^ (both the Unpack and
   the Fetch flavor) and defines PackInit_Bitwise_<type>(), which stores the resulting
   <Action>And{BAND,BOR,BXOR}_<type>_1 routines in the matching fields of a link.
 */
#define DEF_PackBit(type)                                                               \
  /* bitwise AND */                                                                     \
  DEF_ActionAndOp(Unpack,BAND,type,1,&,BINARY_OP,IGNORE,const type,const void)          \
  DEF_ActionAndOp(Fetch, BAND,type,1,&,BINARY_OP,EXECUTE,type,void)                     \
  /* bitwise OR */                                                                      \
  DEF_ActionAndOp(Unpack,BOR, type,1,|,BINARY_OP,IGNORE,const type,const void)          \
  DEF_ActionAndOp(Fetch, BOR, type,1,|,BINARY_OP,EXECUTE,type,void)                     \
  /* bitwise exclusive OR */                                                            \
  DEF_ActionAndOp(Unpack,BXOR,type,1,^,BINARY_OP,IGNORE,const type,const void)          \
  DEF_ActionAndOp(Fetch, BXOR,type,1,^,BINARY_OP,EXECUTE,type,void)                     \
  static void CPPJoin2(PackInit_Bitwise_,type)(PetscSFPack link) {                      \
    link->UnpackAndBAND   = CPPJoin3_(UnpackAndBAND_,type,1);                           \
    link->FetchAndBAND    = CPPJoin3_(FetchAndBAND_, type,1);                           \
    link->UnpackAndBOR    = CPPJoin3_(UnpackAndBOR_, type,1);                           \
    link->FetchAndBOR     = CPPJoin3_(FetchAndBOR_,  type,1);                           \
    link->UnpackAndBXOR   = CPPJoin3_(UnpackAndBXOR_,type,1);                           \
    link->FetchAndBXOR    = CPPJoin3_(FetchAndBXOR_, type,1);                           \
  }
32240e23c03SJunchao Zhang 
32340e23c03SJunchao Zhang 
/* Pair types.
   DEF_PackPair(type1,type2) declares struct {type1 a; type2 b;} under the name PairType(type1,type2),
   expands the generator macros for pack, insert, add (PAIRTYPE_OP applies the operator to .a and .b
   separately) and max/min-with-location, and defines PackInit_<type1>_<type2>(), which stores the
   generated routines and the size of the pair struct in a link.
   NOTE(review): the exact routines produced by DEF_PackFunc/DEF_Action/DEF_ActionAndXloc are defined
   elsewhere in this file; the names used below are what PackInit expects them to produce.
 */
#define DEF_PackPair(type1,type2)                                                                                   \
  typedef struct {type1 a; type2 b;} PairType(type1,type2);                                                         \
  DEF_PackFunc(PairType(type1,type2),1)                                                                             \
  /* Unpack flavors: the incoming buffer is read-only */                                                            \
  DEF_Action(Unpack,PairType(type1,type2),1,IGNORE,const PairType(type1,type2),const void)                          \
  DEF_ActionAndOp(Unpack,Add,PairType(type1,type2),1,+,PAIRTYPE_OP,IGNORE,const PairType(type1,type2),const void)   \
  DEF_ActionAndXloc(Unpack,Max,type1,type2,>,IGNORE,const PairType(type1,type2),const void)                         \
  DEF_ActionAndXloc(Unpack,Min,type1,type2,<,IGNORE,const PairType(type1,type2),const void)                         \
  /* Fetch flavors: the buffer is writable */                                                                       \
  DEF_Action(Fetch, PairType(type1,type2),1,EXECUTE,PairType(type1,type2),void)                                     \
  DEF_ActionAndOp(Fetch, Add,PairType(type1,type2),1,+,PAIRTYPE_OP,EXECUTE,PairType(type1,type2),void)              \
  DEF_ActionAndXloc(Fetch, Max,type1,type2,>,EXECUTE,PairType(type1,type2),void)                                    \
  DEF_ActionAndXloc(Fetch, Min,type1,type2,<,EXECUTE,PairType(type1,type2),void)                                    \
  static void CPPJoin3_(PackInit_,type1,type2)(PetscSFPack link) {                                                  \
    link->unitbytes       = sizeof(PairType(type1,type2));                                                          \
    link->Pack            = CPPJoin3_(Pack_,           PairType(type1,type2),1);                                    \
    link->UnpackAndInsert = CPPJoin3_(UnpackAndInsert_,PairType(type1,type2),1);                                    \
    link->FetchAndInsert  = CPPJoin3_(FetchAndInsert_, PairType(type1,type2),1);                                    \
    link->UnpackAndAdd    = CPPJoin3_(UnpackAndAdd_,   PairType(type1,type2),1);                                    \
    link->FetchAndAdd     = CPPJoin3_(FetchAndAdd_,    PairType(type1,type2),1);                                    \
    link->UnpackAndMaxloc = CPPJoin3_(UnpackAndMaxloc_,PairType(type1,type2),1);                                    \
    link->FetchAndMaxloc  = CPPJoin3_(FetchAndMaxloc_, PairType(type1,type2),1);                                    \
    link->UnpackAndMinloc = CPPJoin3_(UnpackAndMinloc_,PairType(type1,type2),1);                                    \
    link->FetchAndMinloc  = CPPJoin3_(FetchAndMinloc_, PairType(type1,type2),1);                                    \
  }
34840e23c03SJunchao Zhang 
34940e23c03SJunchao Zhang 
/* Opaque ("dumb") blocks of data: only Pack, UnpackAndInsert and FetchAndInsert are installed, no
   reduction routine. DEF_Block(type,count) declares struct {type v[count];} under the name
   BlockType(type,count) and defines PackInit_block_<type>_<count>(), which stores the three routines
   and the size of the block struct in a link.
 */
#define DEF_Block(type,count)                                                           \
  typedef struct {type v[count];} BlockType(type,count);                                \
  DEF_PackNoInit(BlockType(type,count),1)                                               \
  static void CPPJoin3_(PackInit_block_,type,count)(PetscSFPack link) {                 \
    link->unitbytes       = sizeof(BlockType(type,count));                              \
    link->Pack            = CPPJoin3_(Pack_,           BlockType(type,count),1);        \
    link->UnpackAndInsert = CPPJoin3_(UnpackAndInsert_,BlockType(type,count),1);        \
    link->FetchAndInsert  = CPPJoin3_(FetchAndInsert_, BlockType(type,count),1);        \
  }
36040e23c03SJunchao Zhang 
36140e23c03SJunchao Zhang /* The typedef is used to get a typename without space that CPPJoin can handle */
36240e23c03SJunchao Zhang typedef signed char SignedChar;
36340e23c03SJunchao Zhang typedef unsigned char UnsignedChar;
36440e23c03SJunchao Zhang 
36540e23c03SJunchao Zhang DEF_PackCmp(SignedChar)
36640e23c03SJunchao Zhang DEF_PackBit(SignedChar)
36740e23c03SJunchao Zhang DEF_PackLog(SignedChar)
36840e23c03SJunchao Zhang DEF_PackCmp(UnsignedChar)
36940e23c03SJunchao Zhang DEF_PackBit(UnsignedChar)
37040e23c03SJunchao Zhang DEF_PackLog(UnsignedChar)
37140e23c03SJunchao Zhang DEF_PackCmp(int)
37240e23c03SJunchao Zhang DEF_PackBit(int)
37340e23c03SJunchao Zhang DEF_PackLog(int)
37440e23c03SJunchao Zhang DEF_PackCmp(PetscInt)
37540e23c03SJunchao Zhang DEF_PackBit(PetscInt)
37640e23c03SJunchao Zhang DEF_PackLog(PetscInt)
37740e23c03SJunchao Zhang DEF_Pack(PetscInt,2)
37840e23c03SJunchao Zhang DEF_Pack(PetscInt,3)
37940e23c03SJunchao Zhang DEF_Pack(PetscInt,4)
38040e23c03SJunchao Zhang DEF_Pack(PetscInt,5)
38140e23c03SJunchao Zhang DEF_Pack(PetscInt,7)
38240e23c03SJunchao Zhang DEF_PackCmp(PetscReal)
38340e23c03SJunchao Zhang DEF_PackLog(PetscReal)
38440e23c03SJunchao Zhang DEF_Pack(PetscReal,2)
38540e23c03SJunchao Zhang DEF_Pack(PetscReal,3)
38640e23c03SJunchao Zhang DEF_Pack(PetscReal,4)
38740e23c03SJunchao Zhang DEF_Pack(PetscReal,5)
38840e23c03SJunchao Zhang DEF_Pack(PetscReal,7)
38940e23c03SJunchao Zhang #if defined(PETSC_HAVE_COMPLEX)
39040e23c03SJunchao Zhang DEF_Pack(PetscComplex,1)
39140e23c03SJunchao Zhang DEF_Pack(PetscComplex,2)
39240e23c03SJunchao Zhang DEF_Pack(PetscComplex,3)
39340e23c03SJunchao Zhang DEF_Pack(PetscComplex,4)
39440e23c03SJunchao Zhang DEF_Pack(PetscComplex,5)
39540e23c03SJunchao Zhang DEF_Pack(PetscComplex,7)
39640e23c03SJunchao Zhang #endif
39740e23c03SJunchao Zhang DEF_PackPair(int,int)
39840e23c03SJunchao Zhang DEF_PackPair(PetscInt,PetscInt)
39940e23c03SJunchao Zhang DEF_Block(int,1)
40040e23c03SJunchao Zhang DEF_Block(int,2)
40140e23c03SJunchao Zhang DEF_Block(int,4)
40240e23c03SJunchao Zhang DEF_Block(int,8)
40340e23c03SJunchao Zhang DEF_Block(char,1)
40440e23c03SJunchao Zhang DEF_Block(char,2)
40540e23c03SJunchao Zhang DEF_Block(char,4)
40640e23c03SJunchao Zhang 
40740e23c03SJunchao Zhang #if !defined(PETSC_HAVE_MPI_TYPE_DUP)
40840e23c03SJunchao Zhang PETSC_STATIC_INLINE int MPI_Type_dup(MPI_Datatype datatype,MPI_Datatype *newtype)
40940e23c03SJunchao Zhang {
41040e23c03SJunchao Zhang   int ierr;
41140e23c03SJunchao Zhang   ierr = MPI_Type_contiguous(1,datatype,newtype); if (ierr) return ierr;
41240e23c03SJunchao Zhang   ierr = MPI_Type_commit(newtype); if (ierr) return ierr;
41340e23c03SJunchao Zhang   return MPI_SUCCESS;
41440e23c03SJunchao Zhang }
41540e23c03SJunchao Zhang #endif
41640e23c03SJunchao Zhang 
417*9d1c8addSJunchao Zhang PetscErrorCode PetscSFPackGetInUse(PetscSF sf,MPI_Datatype unit,const void *rkey,const void *lkey,PetscCopyMode cmode,PetscSFPack *mylink)
41840e23c03SJunchao Zhang {
41940e23c03SJunchao Zhang   PetscErrorCode    ierr;
42040e23c03SJunchao Zhang   PetscSFPack       link,*p;
42140e23c03SJunchao Zhang   PetscSF_Basic     *bas=(PetscSF_Basic*)sf->data;
42240e23c03SJunchao Zhang 
42340e23c03SJunchao Zhang   PetscFunctionBegin;
42440e23c03SJunchao Zhang   /* Look for types in cache */
42540e23c03SJunchao Zhang   for (p=&bas->inuse; (link=*p); p=&link->next) {
42640e23c03SJunchao Zhang     PetscBool match;
42740e23c03SJunchao Zhang     ierr = MPIPetsc_Type_compare(unit,link->unit,&match);CHKERRQ(ierr);
428*9d1c8addSJunchao Zhang     if (match && (rkey == link->rkey) && (lkey == link->lkey)) {
42940e23c03SJunchao Zhang       switch (cmode) {
43040e23c03SJunchao Zhang       case PETSC_OWN_POINTER: *p = link->next; break; /* Remove from inuse list */
43140e23c03SJunchao Zhang       case PETSC_USE_POINTER: break;
43240e23c03SJunchao Zhang       default: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"invalid cmode");
43340e23c03SJunchao Zhang       }
43440e23c03SJunchao Zhang       *mylink = link;
43540e23c03SJunchao Zhang       PetscFunctionReturn(0);
43640e23c03SJunchao Zhang     }
43740e23c03SJunchao Zhang   }
43840e23c03SJunchao Zhang   SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Could not find pack");
43940e23c03SJunchao Zhang   PetscFunctionReturn(0);
44040e23c03SJunchao Zhang }
44140e23c03SJunchao Zhang 
44240e23c03SJunchao Zhang PetscErrorCode PetscSFPackReclaim(PetscSF sf,PetscSFPack *link)
44340e23c03SJunchao Zhang {
44440e23c03SJunchao Zhang   PetscSF_Basic     *bas=(PetscSF_Basic*)sf->data;
44540e23c03SJunchao Zhang 
44640e23c03SJunchao Zhang   PetscFunctionBegin;
447*9d1c8addSJunchao Zhang   (*link)->rkey = NULL;
448*9d1c8addSJunchao Zhang   (*link)->lkey = NULL;
44940e23c03SJunchao Zhang   (*link)->next = bas->avail;
45040e23c03SJunchao Zhang   bas->avail    = *link;
45140e23c03SJunchao Zhang   *link         = NULL;
45240e23c03SJunchao Zhang   PetscFunctionReturn(0);
45340e23c03SJunchao Zhang }
45440e23c03SJunchao Zhang 
455*9d1c8addSJunchao Zhang /* Error out on unsupported overlapped communications */
456*9d1c8addSJunchao Zhang PetscErrorCode PetscSFPackSetErrorOnUnsupportedOverlap(PetscSF sf,MPI_Datatype unit,const void *rkey,const void *lkey)
457*9d1c8addSJunchao Zhang {
458*9d1c8addSJunchao Zhang   PetscErrorCode    ierr;
459*9d1c8addSJunchao Zhang   PetscSFPack       link,*p;
460*9d1c8addSJunchao Zhang   PetscSF_Basic     *bas=(PetscSF_Basic*)sf->data;
461*9d1c8addSJunchao Zhang   PetscBool         match;
462*9d1c8addSJunchao Zhang 
463*9d1c8addSJunchao Zhang   PetscFunctionBegin;
464*9d1c8addSJunchao Zhang   /* Look up links in use and error out if there is a match */
465*9d1c8addSJunchao Zhang   for (p=&bas->inuse; (link=*p); p=&link->next) {
466*9d1c8addSJunchao Zhang     ierr = MPIPetsc_Type_compare(unit,link->unit,&match);CHKERRQ(ierr);
467*9d1c8addSJunchao Zhang     if (match && (rkey == link->rkey) && (lkey == link->lkey)) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for overlapped PetscSF communications with the same SF, rootdata, leafdatadata and data type. You can undo the overlap to avoid the error.");
468*9d1c8addSJunchao Zhang   }
469*9d1c8addSJunchao Zhang   PetscFunctionReturn(0);
470*9d1c8addSJunchao Zhang }
471*9d1c8addSJunchao Zhang 
47240e23c03SJunchao Zhang PetscErrorCode PetscSFPackSetupType(PetscSFPack link,MPI_Datatype unit)
47340e23c03SJunchao Zhang {
47440e23c03SJunchao Zhang   PetscErrorCode ierr;
47540e23c03SJunchao Zhang   PetscBool      isInt,isPetscInt,isPetscReal,is2Int,is2PetscInt,isSignedChar,isUnsignedChar;
47640e23c03SJunchao Zhang   PetscInt       nPetscIntContig,nPetscRealContig;
47740e23c03SJunchao Zhang   PetscMPIInt    ni,na,nd,combiner;
47840e23c03SJunchao Zhang #if defined(PETSC_HAVE_COMPLEX)
47940e23c03SJunchao Zhang   PetscBool isPetscComplex;
48040e23c03SJunchao Zhang   PetscInt nPetscComplexContig;
48140e23c03SJunchao Zhang #endif
48240e23c03SJunchao Zhang 
48340e23c03SJunchao Zhang   PetscFunctionBegin;
48440e23c03SJunchao Zhang   ierr = MPIPetsc_Type_compare(unit,MPI_SIGNED_CHAR,&isSignedChar);CHKERRQ(ierr);
48540e23c03SJunchao Zhang   ierr = MPIPetsc_Type_compare(unit,MPI_UNSIGNED_CHAR,&isUnsignedChar);CHKERRQ(ierr);
48640e23c03SJunchao Zhang   /* MPI_CHAR is treated below as a dumb block type that does not support reduction according to MPI standard */
48740e23c03SJunchao Zhang   ierr = MPIPetsc_Type_compare(unit,MPI_INT,&isInt);CHKERRQ(ierr);
48840e23c03SJunchao Zhang   ierr = MPIPetsc_Type_compare(unit,MPIU_INT,&isPetscInt);CHKERRQ(ierr);
48940e23c03SJunchao Zhang   ierr = MPIPetsc_Type_compare_contig(unit,MPIU_INT,&nPetscIntContig);CHKERRQ(ierr);
49040e23c03SJunchao Zhang   ierr = MPIPetsc_Type_compare(unit,MPIU_REAL,&isPetscReal);CHKERRQ(ierr);
49140e23c03SJunchao Zhang   ierr = MPIPetsc_Type_compare_contig(unit,MPIU_REAL,&nPetscRealContig);CHKERRQ(ierr);
49240e23c03SJunchao Zhang #if defined(PETSC_HAVE_COMPLEX)
49340e23c03SJunchao Zhang   ierr = MPIPetsc_Type_compare(unit,MPIU_COMPLEX,&isPetscComplex);CHKERRQ(ierr);
49440e23c03SJunchao Zhang   ierr = MPIPetsc_Type_compare_contig(unit,MPIU_COMPLEX,&nPetscComplexContig);CHKERRQ(ierr);
49540e23c03SJunchao Zhang #endif
49640e23c03SJunchao Zhang   ierr = MPIPetsc_Type_compare(unit,MPI_2INT,&is2Int);CHKERRQ(ierr);
49740e23c03SJunchao Zhang   ierr = MPIPetsc_Type_compare(unit,MPIU_2INT,&is2PetscInt);CHKERRQ(ierr);
49840e23c03SJunchao Zhang   ierr = MPI_Type_get_envelope(unit,&ni,&na,&nd,&combiner);CHKERRQ(ierr);
49940e23c03SJunchao Zhang   link->isbuiltin = (combiner == MPI_COMBINER_NAMED) ? PETSC_TRUE : PETSC_FALSE;
50040e23c03SJunchao Zhang   link->bs = 1;
50140e23c03SJunchao Zhang 
50240e23c03SJunchao Zhang   if (isSignedChar) {PackInit_SignedChar(link); PackInit_Logical_SignedChar(link); PackInit_Bitwise_SignedChar(link); link->basicunit = MPI_SIGNED_CHAR;}
50340e23c03SJunchao Zhang   else if (isUnsignedChar) {PackInit_UnsignedChar(link); PackInit_Logical_UnsignedChar(link); PackInit_Bitwise_UnsignedChar(link); link->basicunit = MPI_UNSIGNED_CHAR;}
50440e23c03SJunchao Zhang   else if (isInt) {PackInit_int(link); PackInit_Logical_int(link); PackInit_Bitwise_int(link); link->basicunit = MPI_INT;}
50540e23c03SJunchao Zhang   else if (isPetscInt) {PackInit_PetscInt(link); PackInit_Logical_PetscInt(link); PackInit_Bitwise_PetscInt(link); link->basicunit = MPIU_INT;}
50640e23c03SJunchao Zhang   else if (isPetscReal) {PackInit_PetscReal(link); PackInit_Logical_PetscReal(link); link->basicunit = MPIU_REAL;}
50740e23c03SJunchao Zhang #if defined(PETSC_HAVE_COMPLEX)
50840e23c03SJunchao Zhang   else if (isPetscComplex) {PackInit_PetscComplex_1(link); link->basicunit = MPIU_COMPLEX;}
50940e23c03SJunchao Zhang #endif
51040e23c03SJunchao Zhang   else if (is2Int) {PackInit_int_int(link); link->basicunit = MPI_2INT;}
51140e23c03SJunchao Zhang   else if (is2PetscInt) {PackInit_PetscInt_PetscInt(link); link->basicunit = MPIU_2INT;}
51240e23c03SJunchao Zhang   else if (nPetscIntContig) {
51340e23c03SJunchao Zhang     if (nPetscIntContig%7 == 0) PackInit_PetscInt_7(link);
51440e23c03SJunchao Zhang     else if (nPetscIntContig%5 == 0) PackInit_PetscInt_5(link);
51540e23c03SJunchao Zhang     else if (nPetscIntContig%4 == 0) PackInit_PetscInt_4(link);
51640e23c03SJunchao Zhang     else if (nPetscIntContig%3 == 0) PackInit_PetscInt_3(link);
51740e23c03SJunchao Zhang     else if (nPetscIntContig%2 == 0) PackInit_PetscInt_2(link);
51840e23c03SJunchao Zhang     else PackInit_PetscInt(link);
51940e23c03SJunchao Zhang     link->bs = nPetscIntContig;
52040e23c03SJunchao Zhang     link->unitbytes *= nPetscIntContig;
52140e23c03SJunchao Zhang     link->basicunit = MPIU_INT;
52240e23c03SJunchao Zhang   } else if (nPetscRealContig) {
52340e23c03SJunchao Zhang     if (nPetscRealContig%7 == 0) PackInit_PetscReal_7(link);
52440e23c03SJunchao Zhang     else if (nPetscRealContig%5 == 0) PackInit_PetscReal_5(link);
52540e23c03SJunchao Zhang     else if (nPetscRealContig%4 == 0) PackInit_PetscReal_4(link);
52640e23c03SJunchao Zhang     else if (nPetscRealContig%3 == 0) PackInit_PetscReal_3(link);
52740e23c03SJunchao Zhang     else if (nPetscRealContig%2 == 0) PackInit_PetscReal_2(link);
52840e23c03SJunchao Zhang     else PackInit_PetscReal(link);
52940e23c03SJunchao Zhang     link->bs = nPetscRealContig;
53040e23c03SJunchao Zhang     link->unitbytes *= nPetscRealContig;
53140e23c03SJunchao Zhang     link->basicunit = MPIU_REAL;
53240e23c03SJunchao Zhang #if defined(PETSC_HAVE_COMPLEX)
53340e23c03SJunchao Zhang   } else if (nPetscComplexContig) {
53440e23c03SJunchao Zhang     if (nPetscComplexContig%7 == 0) PackInit_PetscComplex_7(link);
53540e23c03SJunchao Zhang     else if (nPetscComplexContig%5 == 0) PackInit_PetscComplex_5(link);
53640e23c03SJunchao Zhang     else if (nPetscComplexContig%4 == 0) PackInit_PetscComplex_4(link);
53740e23c03SJunchao Zhang     else if (nPetscComplexContig%3 == 0) PackInit_PetscComplex_3(link);
53840e23c03SJunchao Zhang     else if (nPetscComplexContig%2 == 0) PackInit_PetscComplex_2(link);
53940e23c03SJunchao Zhang     else PackInit_PetscComplex_1(link);
54040e23c03SJunchao Zhang     link->bs = nPetscComplexContig;
54140e23c03SJunchao Zhang     link->unitbytes *= nPetscComplexContig;
54240e23c03SJunchao Zhang     link->basicunit = MPIU_COMPLEX;
54340e23c03SJunchao Zhang #endif
54440e23c03SJunchao Zhang   } else {
54540e23c03SJunchao Zhang     MPI_Aint lb,bytes;
54640e23c03SJunchao Zhang     ierr = MPI_Type_get_extent(unit,&lb,&bytes);CHKERRQ(ierr);
54740e23c03SJunchao Zhang     if (lb != 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Datatype with nonzero lower bound %ld\n",(long)lb);
54840e23c03SJunchao Zhang     if (bytes % sizeof(int)) { /* If the type size is not multiple of int */
54940e23c03SJunchao Zhang       if      (bytes%4 == 0) {PackInit_block_char_4(link); link->bs = bytes/4;} /* Note the basic type is char[4] */
55040e23c03SJunchao Zhang       else if (bytes%2 == 0) {PackInit_block_char_2(link); link->bs = bytes/2;}
55140e23c03SJunchao Zhang       else                   {PackInit_block_char_1(link); link->bs = bytes/1;}
55240e23c03SJunchao Zhang       link->unitbytes = bytes;
55340e23c03SJunchao Zhang       link->basicunit = MPI_CHAR;
55440e23c03SJunchao Zhang     } else {
55540e23c03SJunchao Zhang       PetscInt nInt = bytes / sizeof(int);
55640e23c03SJunchao Zhang       if      (nInt%8 == 0)  {PackInit_block_int_8(link);  link->bs = nInt/8;} /* Note the basic type is int[8] */
55740e23c03SJunchao Zhang       else if (nInt%4 == 0)  {PackInit_block_int_4(link);  link->bs = nInt/4;}
55840e23c03SJunchao Zhang       else if (nInt%2 == 0)  {PackInit_block_int_2(link);  link->bs = nInt/2;}
55940e23c03SJunchao Zhang       else                   {PackInit_block_int_1(link);  link->bs = nInt/1;}
56040e23c03SJunchao Zhang       link->unitbytes = bytes;
56140e23c03SJunchao Zhang       link->basicunit = MPI_INT;
56240e23c03SJunchao Zhang     }
56340e23c03SJunchao Zhang   }
56440e23c03SJunchao Zhang   if (link->isbuiltin) link->unit = unit; /* builtin datatypes are common. Make it fast */
56540e23c03SJunchao Zhang   else {ierr = MPI_Type_dup(unit,&link->unit);CHKERRQ(ierr);}
56640e23c03SJunchao Zhang   PetscFunctionReturn(0);
56740e23c03SJunchao Zhang }
56840e23c03SJunchao Zhang 
56940e23c03SJunchao Zhang PetscErrorCode PetscSFPackGetUnpackAndOp(PetscSF sf,PetscSFPack link,MPI_Op op,PetscErrorCode (**UnpackAndOp)(PetscInt,PetscInt,const PetscInt*,PetscInt,PetscSFPackOpt,void*,const void*))
57040e23c03SJunchao Zhang {
57140e23c03SJunchao Zhang   PetscFunctionBegin;
57240e23c03SJunchao Zhang   *UnpackAndOp = NULL;
57340e23c03SJunchao Zhang   if (op == MPIU_REPLACE) *UnpackAndOp = link->UnpackAndInsert;
57440e23c03SJunchao Zhang   else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->UnpackAndAdd;
57540e23c03SJunchao Zhang   else if (op == MPI_PROD) *UnpackAndOp = link->UnpackAndMult;
57640e23c03SJunchao Zhang   else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->UnpackAndMax;
57740e23c03SJunchao Zhang   else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->UnpackAndMin;
57840e23c03SJunchao Zhang   else if (op == MPI_LAND)   *UnpackAndOp = link->UnpackAndLAND;
57940e23c03SJunchao Zhang   else if (op == MPI_BAND)   *UnpackAndOp = link->UnpackAndBAND;
58040e23c03SJunchao Zhang   else if (op == MPI_LOR)    *UnpackAndOp = link->UnpackAndLOR;
58140e23c03SJunchao Zhang   else if (op == MPI_BOR)    *UnpackAndOp = link->UnpackAndBOR;
58240e23c03SJunchao Zhang   else if (op == MPI_LXOR)   *UnpackAndOp = link->UnpackAndLXOR;
58340e23c03SJunchao Zhang   else if (op == MPI_BXOR)   *UnpackAndOp = link->UnpackAndBXOR;
58440e23c03SJunchao Zhang   else if (op == MPI_MAXLOC) *UnpackAndOp = link->UnpackAndMaxloc;
58540e23c03SJunchao Zhang   else if (op == MPI_MINLOC) *UnpackAndOp = link->UnpackAndMinloc;
58640e23c03SJunchao Zhang   else *UnpackAndOp = NULL;
58740e23c03SJunchao Zhang   PetscFunctionReturn(0);
58840e23c03SJunchao Zhang }
58940e23c03SJunchao Zhang 
59040e23c03SJunchao Zhang PetscErrorCode PetscSFPackGetFetchAndOp(PetscSF sf,PetscSFPack link,MPI_Op op,PetscErrorCode (**FetchAndOp)(PetscInt,PetscInt,const PetscInt*,PetscInt,PetscSFPackOpt,void*,void*))
59140e23c03SJunchao Zhang {
59240e23c03SJunchao Zhang   PetscFunctionBegin;
59340e23c03SJunchao Zhang   *FetchAndOp = NULL;
59440e23c03SJunchao Zhang   if (op == MPIU_REPLACE) *FetchAndOp = link->FetchAndInsert;
59540e23c03SJunchao Zhang   else if (op == MPI_SUM || op == MPIU_SUM) *FetchAndOp = link->FetchAndAdd;
59640e23c03SJunchao Zhang   else if (op == MPI_MAX || op == MPIU_MAX) *FetchAndOp = link->FetchAndMax;
59740e23c03SJunchao Zhang   else if (op == MPI_MIN || op == MPIU_MIN) *FetchAndOp = link->FetchAndMin;
59840e23c03SJunchao Zhang   else if (op == MPI_MAXLOC) *FetchAndOp = link->FetchAndMaxloc;
59940e23c03SJunchao Zhang   else if (op == MPI_MINLOC) *FetchAndOp = link->FetchAndMinloc;
60040e23c03SJunchao Zhang   else if (op == MPI_PROD)   *FetchAndOp = link->FetchAndMult;
60140e23c03SJunchao Zhang   else if (op == MPI_LAND)   *FetchAndOp = link->FetchAndLAND;
60240e23c03SJunchao Zhang   else if (op == MPI_BAND)   *FetchAndOp = link->FetchAndBAND;
60340e23c03SJunchao Zhang   else if (op == MPI_LOR)    *FetchAndOp = link->FetchAndLOR;
60440e23c03SJunchao Zhang   else if (op == MPI_BOR)    *FetchAndOp = link->FetchAndBOR;
60540e23c03SJunchao Zhang   else if (op == MPI_LXOR)   *FetchAndOp = link->FetchAndLXOR;
60640e23c03SJunchao Zhang   else if (op == MPI_BXOR)   *FetchAndOp = link->FetchAndBXOR;
60740e23c03SJunchao Zhang   else SETERRQ(PetscObjectComm((PetscObject)sf),PETSC_ERR_SUP,"No support for MPI_Op");
60840e23c03SJunchao Zhang   PetscFunctionReturn(0);
60940e23c03SJunchao Zhang }
61040e23c03SJunchao Zhang 
61140e23c03SJunchao Zhang /*
61240e23c03SJunchao Zhang   Setup pack/unpack optimization plans based on indice patterns available
61340e23c03SJunchao Zhang 
61440e23c03SJunchao Zhang    Input Parameters:
61540e23c03SJunchao Zhang   +  n       - number of target processors
61640e23c03SJunchao Zhang   .  offset  - [n+1] for the i-th processor, its associated indices are idx[offset[i], offset[i+1])
61740e23c03SJunchao Zhang   -  idx     - [] array storing indices. Its length is offset[n+1]
61840e23c03SJunchao Zhang 
61940e23c03SJunchao Zhang    Output Parameters:
62040e23c03SJunchao Zhang   +  opt    - the optimization
62140e23c03SJunchao Zhang */
62240e23c03SJunchao Zhang PetscErrorCode PetscSFPackSetupOptimization(PetscInt n,const PetscInt *offset,const PetscInt *idx,PetscSFPackOpt *out)
62340e23c03SJunchao Zhang {
62440e23c03SJunchao Zhang   PetscErrorCode ierr;
62540e23c03SJunchao Zhang   PetscInt       i,j,k,n_copies,tot_copies=0,step;
62640e23c03SJunchao Zhang   PetscBool      strided,has_strided=PETSC_FALSE,has_optimized=PETSC_FALSE;
62740e23c03SJunchao Zhang   PetscSFPackOpt opt;
62840e23c03SJunchao Zhang 
62940e23c03SJunchao Zhang   PetscFunctionBegin;
63040e23c03SJunchao Zhang   ierr = PetscCalloc1(1,&opt);CHKERRQ(ierr);
63140e23c03SJunchao Zhang   ierr = PetscCalloc2(n,&opt->optimized,n+1,&opt->copy_offset);CHKERRQ(ierr);
63240e23c03SJunchao Zhang 
63340e23c03SJunchao Zhang   /* Check if the indices are piece-wise contiguous (if yes, we can optimize a packing with mulitple memcpy's ) */
63440e23c03SJunchao Zhang   for (i=0; i<n; i++) { /* for each target processor */
63540e23c03SJunchao Zhang     /* Scan indices to count n_copies -- the number of contiguous pieces for i-th target */
63640e23c03SJunchao Zhang     n_copies = 1;
63740e23c03SJunchao Zhang     for (j=offset[i]; j<offset[i+1]-1; j++) {
63840e23c03SJunchao Zhang       if (idx[j]+1 != idx[j+1]) n_copies++;
63940e23c03SJunchao Zhang     }
64040e23c03SJunchao Zhang     /* If the average length (in no. of indices) of contiguous pieces is long enough, say >=32,
64140e23c03SJunchao Zhang        then it is worth using memcpy for this target. 32 is an arbitrarily chosen number.
64240e23c03SJunchao Zhang      */
64340e23c03SJunchao Zhang     if ((offset[i+1]-offset[i])/n_copies >= 32) {
64440e23c03SJunchao Zhang       opt->optimized[i] = PETSC_TRUE;
64540e23c03SJunchao Zhang       has_optimized     = PETSC_TRUE;
64640e23c03SJunchao Zhang       tot_copies       += n_copies;
64740e23c03SJunchao Zhang     }
64840e23c03SJunchao Zhang   }
64940e23c03SJunchao Zhang 
65040e23c03SJunchao Zhang   /* Setup memcpy plan for each contiguous piece */
65140e23c03SJunchao Zhang   k    = 0; /* k-th copy */
65240e23c03SJunchao Zhang   ierr = PetscMalloc2(tot_copies,&opt->copy_start,tot_copies,&opt->copy_length);CHKERRQ(ierr);
65340e23c03SJunchao Zhang   for (i=0; i<n; i++) { /* for each target processor procs[i] */
65440e23c03SJunchao Zhang     if (opt->optimized[i]) {
65540e23c03SJunchao Zhang       n_copies           = 1;
65640e23c03SJunchao Zhang       opt->copy_start[k] = idx[offset[i]];
65740e23c03SJunchao Zhang       for (j=offset[i]; j<offset[i+1]-1; j++) {
65840e23c03SJunchao Zhang         if (idx[j]+1 != idx[j+1]) { /* meet end of a copy (and next copy must exist) */
65940e23c03SJunchao Zhang           n_copies++;
66040e23c03SJunchao Zhang           opt->copy_start[k+1] = idx[j+1];
66140e23c03SJunchao Zhang           opt->copy_length[k]  = idx[j] - opt->copy_start[k] + 1;
66240e23c03SJunchao Zhang           k++;
66340e23c03SJunchao Zhang         }
66440e23c03SJunchao Zhang       }
66540e23c03SJunchao Zhang       /* Set copy length of the last copy for this target */
66640e23c03SJunchao Zhang       opt->copy_length[k] = idx[j] - opt->copy_start[k] + 1;
66740e23c03SJunchao Zhang       k++;
66840e23c03SJunchao Zhang     }
66940e23c03SJunchao Zhang     /* Set offset for next target. When optimized[i]=false, copy_offsets[i]=copy_offsets[i+1] */
67040e23c03SJunchao Zhang     opt->copy_offset[i+1] = k;
67140e23c03SJunchao Zhang   }
67240e23c03SJunchao Zhang 
67340e23c03SJunchao Zhang   /* Last chance! If the indices do not have long contiguous pieces, are they strided? */
67440e23c03SJunchao Zhang   ierr = PetscMalloc3(n,&opt->stride_first,n,&opt->stride_step,n,&opt->stride_n);CHKERRQ(ierr);
67540e23c03SJunchao Zhang   for (i=0; i<n; i++) { /* for each remote */
67640e23c03SJunchao Zhang     if (!opt->optimized[i] && (offset[i+1] - offset[i]) >= 16) { /* few indices (<16) are not worth striding */
67740e23c03SJunchao Zhang       strided = PETSC_TRUE;
67840e23c03SJunchao Zhang       step    = idx[offset[i]+1] - idx[offset[i]];
67940e23c03SJunchao Zhang       for (j=offset[i]; j<offset[i+1]-1; j++) {
68040e23c03SJunchao Zhang         if (idx[j]+step != idx[j+1]) { strided = PETSC_FALSE; break; }
68140e23c03SJunchao Zhang       }
68240e23c03SJunchao Zhang       if (strided) {
68340e23c03SJunchao Zhang         opt->optimized[i]    = PETSC_TRUE;
68440e23c03SJunchao Zhang         opt->stride_first[i] = idx[offset[i]];
68540e23c03SJunchao Zhang         opt->stride_step[i]  = step;
68640e23c03SJunchao Zhang         opt->stride_n[i]     = offset[i+1] - offset[i];
68740e23c03SJunchao Zhang         has_strided          = PETSC_TRUE;
68840e23c03SJunchao Zhang         has_optimized        = PETSC_TRUE;
68940e23c03SJunchao Zhang       }
69040e23c03SJunchao Zhang     }
69140e23c03SJunchao Zhang   }
69240e23c03SJunchao Zhang   /* If no target has been stride-optimized or optimized, free related arrays to save memory */
69340e23c03SJunchao Zhang   if (!has_strided) {ierr = PetscFree3(opt->stride_first,opt->stride_step,opt->stride_n);CHKERRQ(ierr);}
69440e23c03SJunchao Zhang   if (!has_optimized) {
69540e23c03SJunchao Zhang     ierr = PetscFree2(opt->optimized,opt->copy_offset);CHKERRQ(ierr);
69640e23c03SJunchao Zhang     ierr = PetscFree2(opt->copy_start,opt->copy_length);CHKERRQ(ierr);
69740e23c03SJunchao Zhang     ierr = PetscFree(opt);CHKERRQ(ierr);
69840e23c03SJunchao Zhang     *out = NULL;
69940e23c03SJunchao Zhang   } else *out = opt;
70040e23c03SJunchao Zhang   PetscFunctionReturn(0);
70140e23c03SJunchao Zhang }
70240e23c03SJunchao Zhang 
70340e23c03SJunchao Zhang PetscErrorCode PetscSFPackDestoryOptimization(PetscSFPackOpt *out)
70440e23c03SJunchao Zhang {
70540e23c03SJunchao Zhang   PetscErrorCode ierr;
70640e23c03SJunchao Zhang   PetscSFPackOpt opt = *out;
70740e23c03SJunchao Zhang 
70840e23c03SJunchao Zhang   PetscFunctionBegin;
70940e23c03SJunchao Zhang   if (opt) {
71040e23c03SJunchao Zhang     ierr = PetscFree2(opt->optimized,opt->copy_offset);CHKERRQ(ierr);
71140e23c03SJunchao Zhang     ierr = PetscFree2(opt->copy_start,opt->copy_length);CHKERRQ(ierr);
71240e23c03SJunchao Zhang     ierr = PetscFree3(opt->stride_first,opt->stride_step,opt->stride_n);CHKERRQ(ierr);
71340e23c03SJunchao Zhang     ierr = PetscFree(opt);CHKERRQ(ierr);
71440e23c03SJunchao Zhang     *out = NULL;
71540e23c03SJunchao Zhang   }
71640e23c03SJunchao Zhang   PetscFunctionReturn(0);
71740e23c03SJunchao Zhang }
718