140e23c03SJunchao Zhang 240e23c03SJunchao Zhang #include <../src/vec/is/sf/impls/basic/sfpack.h> 340e23c03SJunchao Zhang #include <../src/vec/is/sf/impls/basic/sfbasic.h> 440e23c03SJunchao Zhang 540e23c03SJunchao Zhang /* 640e23c03SJunchao Zhang * MPI_Reduce_local is not really useful because it can't handle sparse data and it vectorizes "in the wrong direction", 740e23c03SJunchao Zhang * therefore we pack data types manually. This file defines packing routines for the standard data types. 840e23c03SJunchao Zhang */ 940e23c03SJunchao Zhang 10*b23bfdefSJunchao Zhang #define CPPJoin2(a,b) a ##_## b 11*b23bfdefSJunchao Zhang #define CPPJoin3(a,b,c) a ##_## b ##_## c 12*b23bfdefSJunchao Zhang #define CPPJoin4_(a,b,c,d) a##b##_##c##_##d 13*b23bfdefSJunchao Zhang #define CPPJoin4(a,b,c,d) CPPJoin4_(a##_,b,c,d) 1440e23c03SJunchao Zhang 1540e23c03SJunchao Zhang #define EXECUTE(statement) statement /* no braces since the statement might declare a variable; braces impose an unwanted scope */ 1640e23c03SJunchao Zhang #define IGNORE(statement) do {} while(0) 1740e23c03SJunchao Zhang 1840e23c03SJunchao Zhang #define BINARY_OP(r,s,op,t) do {(r) = (s) op (t); } while(0) /* binary ops in the middle such as +, *, && etc. */ 1940e23c03SJunchao Zhang #define FUNCTION_OP(r,s,op,t) do {(r) = op((s),(t)); } while(0) /* ops like a function, such as PetscMax, PetscMin */ 2040e23c03SJunchao Zhang #define LXOR_OP(r,s,op,t) do {(r) = (!s) != (!t);} while(0) /* logical exclusive OR */ 2140e23c03SJunchao Zhang #define PAIRTYPE_OP(r,s,op,t) do {(r).a = (s).a op (t).a; (r).b = (s).b op (t).b;} while(0) 2240e23c03SJunchao Zhang 23*b23bfdefSJunchao Zhang #define PairType(Type1,Type2) Type1##_##Type2 /* typename for struct {Type1 a; Type2 b;} */ 2440e23c03SJunchao Zhang 2540e23c03SJunchao Zhang /* DEF_PackFunc - macro defining a Pack routine 2640e23c03SJunchao Zhang 2740e23c03SJunchao Zhang Arguments of the macro: 28*b23bfdefSJunchao Zhang +Type Type of the basic data in an entry, i.e., int, PetscInt, PetscReal etc. It is not the type of an entry. 29*b23bfdefSJunchao Zhang .BS Block size for vectorization. It is a factor of bs. 30*b23bfdefSJunchao Zhang -EQ (bs == BS) ? 1 : 0. EQ is a compile-time const to help compiler optimizations. See below. 3140e23c03SJunchao Zhang 3240e23c03SJunchao Zhang Arguments of the Pack routine: 33*b23bfdefSJunchao Zhang +count Number of indices in idx[] 34*b23bfdefSJunchao Zhang .idx Indices of entries to packed. NULL means contiguous indices, that is [0,count) 3540e23c03SJunchao Zhang .bs Number of basic types in an entry. Ex. if unit is MPI_2INT, then bs=2 and the basic type is int 36*b23bfdefSJunchao Zhang .opt Pack optimization plans. NULL means no plan at all. 37*b23bfdefSJunchao Zhang .unpacked Address of the unpacked data. The entries will be packed are unpacked[idx[i]],for i in [0,count) 38*b23bfdefSJunchao Zhang -packed Address of the packed data for each rank 3940e23c03SJunchao Zhang */ 40*b23bfdefSJunchao Zhang #define DEF_PackFunc(Type,BS,EQ) \ 41*b23bfdefSJunchao Zhang static PetscErrorCode CPPJoin4(Pack,Type,BS,EQ)(PetscInt count,const PetscInt *idx,PetscInt bs,PetscSFPackOpt opt,const void *unpacked,void *packed) \ 42*b23bfdefSJunchao Zhang { \ 4340e23c03SJunchao Zhang PetscErrorCode ierr; \ 44*b23bfdefSJunchao Zhang const Type *u = (const Type*)unpacked,*u2; \ 45*b23bfdefSJunchao Zhang Type *p = (Type*)packed,*p2; \ 46*b23bfdefSJunchao Zhang PetscInt i,j,k,l,r,step; \ 47*b23bfdefSJunchao Zhang const PetscInt *idx2,M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */ \ 48*b23bfdefSJunchao Zhang const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */ \ 4940e23c03SJunchao Zhang PetscFunctionBegin; \ 50*b23bfdefSJunchao Zhang if (!idx) {ierr = PetscArraycpy(p,u,MBS*count);CHKERRQ(ierr);} /* Indices are contiguous */ \ 51*b23bfdefSJunchao Zhang else if (!opt) { /* No optimizations available */ \ 52*b23bfdefSJunchao Zhang for (i=0; i<count; i++) \ 53*b23bfdefSJunchao Zhang for (j=0; j<M; j++) /* Decent compilers should fuse this loop when M = const 1 */ \ 54*b23bfdefSJunchao Zhang for (k=0; k<BS; k++) /* Compiler either fuses (BS=1) or vectorizes (BS=2,4,8,etc) this loop */ \ 55*b23bfdefSJunchao Zhang p[i*MBS+j*BS+k] = u[idx[i]*MBS+j*BS+k]; \ 56*b23bfdefSJunchao Zhang } else { \ 57*b23bfdefSJunchao Zhang for (r=0; r<opt->n; r++) { \ 58*b23bfdefSJunchao Zhang p2 = p + opt->offset[r]*MBS; \ 59*b23bfdefSJunchao Zhang if (opt->type[r] == PETSCSF_PACKOPT_NONE) { \ 60*b23bfdefSJunchao Zhang idx2 = idx + opt->offset[r]; \ 61*b23bfdefSJunchao Zhang for (i=0; i<opt->offset[r+1]-opt->offset[r]; i++) \ 62*b23bfdefSJunchao Zhang for (j=0; j<M; j++) \ 63*b23bfdefSJunchao Zhang for (k=0; k<BS; k++) \ 64*b23bfdefSJunchao Zhang p2[i*MBS+j*BS+k] = u[idx2[i]*MBS+j*BS+k]; \ 65*b23bfdefSJunchao Zhang } else if (opt->type[r] == PETSCSF_PACKOPT_MULTICOPY) { \ 6640e23c03SJunchao Zhang for (i=opt->copy_offset[r]; i<opt->copy_offset[r+1]; i++) { \ 67*b23bfdefSJunchao Zhang u2 = u + idx[opt->copy_start[i]]*MBS; \ 68*b23bfdefSJunchao Zhang l = opt->copy_length[i]*MBS; /* length in basic type such as MPI_INT */ \ 69*b23bfdefSJunchao Zhang ierr = PetscArraycpy(p2,u2,l);CHKERRQ(ierr); \ 70*b23bfdefSJunchao Zhang p2 += l; \ 7140e23c03SJunchao Zhang } \ 72*b23bfdefSJunchao Zhang } else if (opt->type[r] == PETSCSF_PACKOPT_STRIDE) { \ 73*b23bfdefSJunchao Zhang u2 = u + idx[opt->offset[r]]*MBS; \ 7440e23c03SJunchao Zhang step = opt->stride_step[r]; \ 7540e23c03SJunchao Zhang for (i=0; i<opt->stride_n[r]; i++) \ 76*b23bfdefSJunchao Zhang for (j=0; j<MBS; j++) p2[i*MBS+j] = u2[i*step*MBS+j]; \ 77*b23bfdefSJunchao Zhang } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Unknown SFPack optimzation type %D",opt->type[r]); \ 7840e23c03SJunchao Zhang } \ 7940e23c03SJunchao Zhang } \ 8040e23c03SJunchao Zhang PetscFunctionReturn(0); \ 8140e23c03SJunchao Zhang } 8240e23c03SJunchao Zhang 8340e23c03SJunchao Zhang /* DEF_Action - macro defining a Unpack(Fetch)AndInsert routine 8440e23c03SJunchao Zhang 8540e23c03SJunchao Zhang Arguments: 8640e23c03SJunchao Zhang +action Unpack or Fetch 87*b23bfdefSJunchao Zhang .Type Type of the data 8840e23c03SJunchao Zhang .BS Block size for vectorization 89*b23bfdefSJunchao Zhang .EQ (bs == BS) ? 1 : 0. EQ is a compile-time const. 9040e23c03SJunchao Zhang .FILTER Macro defining what to do with a statement, either EXECUTE or IGNORE 91*b23bfdefSJunchao Zhang .CType Type with or without the const qualifier, i.e., const Type or Type 92*b23bfdefSJunchao Zhang .Cvoid void with or without the const qualifier, i.e., const void or void 9340e23c03SJunchao Zhang 9440e23c03SJunchao Zhang Notes: 9540e23c03SJunchao Zhang This macro is not combined with DEF_ActionAndOp because we want to use memcpy in this macro. 96*b23bfdefSJunchao Zhang The two arguments CType and Cvoid are used (instead of one constness argument), because we want to 9740e23c03SJunchao Zhang get rid of compilation warning "empty macro arguments are undefined in ISO C90". With one constness argument, 9840e23c03SJunchao Zhang sometimes we input 'const', sometimes we have to input empty. 99*b23bfdefSJunchao Zhang 100*b23bfdefSJunchao Zhang If action is Fetch, we may do Malloc/Free in the routine. It is costly but the expectation is that this case is really rare. 10140e23c03SJunchao Zhang */ 102*b23bfdefSJunchao Zhang #define DEF_Action(action,Type,BS,EQ,FILTER,CType,Cvoid) \ 103*b23bfdefSJunchao Zhang static PetscErrorCode CPPJoin4(action##AndInsert,Type,BS,EQ)(PetscInt count,const PetscInt *idx,PetscInt bs,PetscSFPackOpt opt,void *unpacked,Cvoid *packed) \ 104*b23bfdefSJunchao Zhang { \ 10540e23c03SJunchao Zhang PetscErrorCode ierr; \ 106*b23bfdefSJunchao Zhang Type *u = (Type*)unpacked,*u2; \ 107*b23bfdefSJunchao Zhang CType *p = (CType*)packed,*p2; \ 108*b23bfdefSJunchao Zhang PetscInt i,j,k,l,r,step; \ 109*b23bfdefSJunchao Zhang const PetscInt *idx2,M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */ \ 110*b23bfdefSJunchao Zhang const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */ \ 11140e23c03SJunchao Zhang PetscFunctionBegin; \ 112*b23bfdefSJunchao Zhang if (!idx) { \ 113*b23bfdefSJunchao Zhang FILTER(Type *v); \ 114*b23bfdefSJunchao Zhang FILTER(ierr = PetscMalloc1(count*MBS,&v);CHKERRQ(ierr)); \ 115*b23bfdefSJunchao Zhang FILTER(ierr = PetscArraycpy(v,u,count*MBS);CHKERRQ(ierr)); \ 116*b23bfdefSJunchao Zhang ierr = PetscArraycpy(u,p,count*MBS);CHKERRQ(ierr); \ 117*b23bfdefSJunchao Zhang FILTER(ierr = PetscArraycpy(p,v,count*MBS);CHKERRQ(ierr)); \ 11840e23c03SJunchao Zhang FILTER(ierr = PetscFree(v);CHKERRQ(ierr)); \ 119*b23bfdefSJunchao Zhang } else if (!opt) { /* No optimizations available */ \ 120*b23bfdefSJunchao Zhang for (i=0; i<count; i++) \ 121*b23bfdefSJunchao Zhang for (j=0; j<M; j++) \ 122*b23bfdefSJunchao Zhang for (k=0; k<BS; k++) { \ 123*b23bfdefSJunchao Zhang FILTER(Type t = u[idx[i]*MBS+j*BS+k]); \ 124*b23bfdefSJunchao Zhang u[idx[i]*MBS+j*BS+k] = p[i*MBS+j*BS+k]; \ 125*b23bfdefSJunchao Zhang FILTER(p[i*MBS+j*BS+k] = t); \ 12640e23c03SJunchao Zhang } \ 127*b23bfdefSJunchao Zhang } else { \ 128*b23bfdefSJunchao Zhang for (r=0; r<opt->n; r++) { \ 129*b23bfdefSJunchao Zhang p2 = p + opt->offset[r]*MBS; \ 130*b23bfdefSJunchao Zhang if (opt->type[r] == PETSCSF_PACKOPT_NONE) { \ 131*b23bfdefSJunchao Zhang idx2 = idx + opt->offset[r]; \ 132*b23bfdefSJunchao Zhang for (i=0; i<opt->offset[r+1]-opt->offset[r]; i++) \ 133*b23bfdefSJunchao Zhang for (j=0; j<M; j++) \ 134*b23bfdefSJunchao Zhang for (k=0; k<BS; k++) { \ 135*b23bfdefSJunchao Zhang FILTER(Type t = u[idx2[i]*MBS+j*BS+k]); \ 136*b23bfdefSJunchao Zhang u[idx2[i]*MBS+j*BS+k] = p2[i*MBS+j*BS+k]; \ 137*b23bfdefSJunchao Zhang FILTER(p2[i*MBS+j*BS+k] = t); \ 13840e23c03SJunchao Zhang } \ 139*b23bfdefSJunchao Zhang } else if (opt->type[r] == PETSCSF_PACKOPT_MULTICOPY) { \ 140*b23bfdefSJunchao Zhang FILTER(Type *v); \ 141*b23bfdefSJunchao Zhang FILTER(ierr = PetscMalloc1((opt->offset[r+1]-opt->offset[r])*MBS,&v);CHKERRQ(ierr)); /* max buf */ \ 14240e23c03SJunchao Zhang for (i=opt->copy_offset[r]; i<opt->copy_offset[r+1]; i++) { /* i-th piece */ \ 143*b23bfdefSJunchao Zhang u2 = u + idx[opt->copy_start[i]]*MBS; \ 144*b23bfdefSJunchao Zhang l = opt->copy_length[i]*MBS; \ 145da2e4c71SJunchao Zhang FILTER(ierr = PetscArraycpy(v,u2,l);CHKERRQ(ierr)); \ 146*b23bfdefSJunchao Zhang ierr = PetscArraycpy(u2,p2,l);CHKERRQ(ierr); \ 147*b23bfdefSJunchao Zhang FILTER(ierr = PetscArraycpy(p2,v,l);CHKERRQ(ierr)); \ 148*b23bfdefSJunchao Zhang p2 += l; \ 14940e23c03SJunchao Zhang } \ 15040e23c03SJunchao Zhang FILTER(ierr = PetscFree(v);CHKERRQ(ierr)); \ 151*b23bfdefSJunchao Zhang } else if (opt->type[r] == PETSCSF_PACKOPT_STRIDE) { \ 152*b23bfdefSJunchao Zhang u2 = u + idx[opt->offset[r]]*MBS; \ 15340e23c03SJunchao Zhang step = opt->stride_step[r]; \ 15440e23c03SJunchao Zhang for (i=0; i<opt->stride_n[r]; i++) \ 155*b23bfdefSJunchao Zhang for (j=0; j<M; j++) \ 156*b23bfdefSJunchao Zhang for (k=0; k<BS; k++) { \ 157*b23bfdefSJunchao Zhang FILTER(Type t = u2[i*step*MBS+j*BS+k]); \ 158*b23bfdefSJunchao Zhang u2[i*step*MBS+j*BS+k] = p2[i*MBS+j*BS+k]; \ 159*b23bfdefSJunchao Zhang FILTER(p2[i*MBS+j*BS+k] = t); \ 16040e23c03SJunchao Zhang } \ 161*b23bfdefSJunchao Zhang } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Unknown SFPack optimzation type %D",opt->type[r]); \ 16240e23c03SJunchao Zhang } \ 16340e23c03SJunchao Zhang } \ 16440e23c03SJunchao Zhang PetscFunctionReturn(0); \ 16540e23c03SJunchao Zhang } 16640e23c03SJunchao Zhang 16740e23c03SJunchao Zhang /* DEF_ActionAndOp - macro defining a Unpack(Fetch)AndOp routine. Op can not be Insert, Maxloc or Minloc 16840e23c03SJunchao Zhang 16940e23c03SJunchao Zhang Arguments: 17040e23c03SJunchao Zhang +action Unpack or Fetch 17140e23c03SJunchao Zhang .opname Name of the Op, such as Add, Mult, LAND, etc. 172*b23bfdefSJunchao Zhang .Type Type of the data 17340e23c03SJunchao Zhang .BS Block size for vectorization 174*b23bfdefSJunchao Zhang .EQ (bs == BS) ? 1 : 0. EQ is a compile-time const. 17540e23c03SJunchao Zhang .op Operator for the op, such as +, *, &&, ||, PetscMax, PetscMin, etc. 17640e23c03SJunchao Zhang .APPLY Macro defining application of the op. Could be BINARY_OP, FUNCTION_OP, LXOR_OP or PAIRTYPE_OP 17740e23c03SJunchao Zhang .FILTER Macro defining what to do with a statement, either EXECUTE or IGNORE 178*b23bfdefSJunchao Zhang .CType Type with or without the const qualifier, i.e., const Type or Type 179*b23bfdefSJunchao Zhang -Cvoid void with or without the const qualifier, i.e., const void or void 18040e23c03SJunchao Zhang */ 181*b23bfdefSJunchao Zhang #define DEF_ActionAndOp(action,opname,Type,BS,EQ,op,APPLY,FILTER,CType,Cvoid) \ 182*b23bfdefSJunchao Zhang static PetscErrorCode CPPJoin4(action##And##opname,Type,BS,EQ)(PetscInt count,const PetscInt *idx,PetscInt bs,PetscSFPackOpt opt,void *unpacked,Cvoid *packed) \ 183*b23bfdefSJunchao Zhang { \ 184*b23bfdefSJunchao Zhang Type *u = (Type*)unpacked,*u2,t; \ 185*b23bfdefSJunchao Zhang CType *p = (CType*)packed,*p2; \ 186*b23bfdefSJunchao Zhang PetscInt i,j,k,l,r,step; \ 187*b23bfdefSJunchao Zhang const PetscInt *idx2,M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */ \ 188*b23bfdefSJunchao Zhang const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */ \ 18940e23c03SJunchao Zhang PetscFunctionBegin; \ 190*b23bfdefSJunchao Zhang if (!idx) { \ 191*b23bfdefSJunchao Zhang for (i=0; i<count; i++) \ 192*b23bfdefSJunchao Zhang for (j=0; j<M; j++) \ 193*b23bfdefSJunchao Zhang for (k=0; k<BS; k++) { \ 194*b23bfdefSJunchao Zhang t = u[i*MBS+j*BS+k]; \ 195*b23bfdefSJunchao Zhang APPLY (u[i*MBS+j*BS+k],t,op,p[i*MBS+j*BS+k]); \ 196*b23bfdefSJunchao Zhang FILTER(p[i*MBS+j*BS+k] = t); \ 19740e23c03SJunchao Zhang } \ 198*b23bfdefSJunchao Zhang } else if (!opt) { /* No optimizations available */ \ 199*b23bfdefSJunchao Zhang for (i=0; i<count; i++) \ 200*b23bfdefSJunchao Zhang for (j=0; j<M; j++) \ 201*b23bfdefSJunchao Zhang for (k=0; k<BS; k++) { \ 202*b23bfdefSJunchao Zhang t = u[idx[i]*MBS+j*BS+k]; \ 203*b23bfdefSJunchao Zhang APPLY (u[idx[i]*MBS+j*BS+k],t,op,p[i*MBS+j*BS+k]); \ 204*b23bfdefSJunchao Zhang FILTER(p[i*MBS+j*BS+k] = t); \ 20540e23c03SJunchao Zhang } \ 206*b23bfdefSJunchao Zhang } else { \ 207*b23bfdefSJunchao Zhang for (r=0; r<opt->n; r++) { \ 208*b23bfdefSJunchao Zhang p2 = p + opt->offset[r]*MBS; \ 209*b23bfdefSJunchao Zhang if (opt->type[r] == PETSCSF_PACKOPT_NONE) { \ 210*b23bfdefSJunchao Zhang idx2 = idx + opt->offset[r]; \ 211*b23bfdefSJunchao Zhang for (i=0; i<opt->offset[r+1]-opt->offset[r]; i++) \ 212*b23bfdefSJunchao Zhang for (j=0; j<M; j++) \ 213*b23bfdefSJunchao Zhang for (k=0; k<BS; k++) { \ 214*b23bfdefSJunchao Zhang t = u[idx2[i]*MBS+j*BS+k]; \ 215*b23bfdefSJunchao Zhang APPLY (u[idx2[i]*MBS+j*BS+k],t,op,p2[i*MBS+j*BS+k]); \ 216*b23bfdefSJunchao Zhang FILTER(p2[i*MBS+j*BS+k] = t); \ 21740e23c03SJunchao Zhang } \ 218*b23bfdefSJunchao Zhang } else if (opt->type[r] == PETSCSF_PACKOPT_MULTICOPY) { \ 21940e23c03SJunchao Zhang for (i=opt->copy_offset[r]; i<opt->copy_offset[r+1]; i++) { /* i-th piece */ \ 220*b23bfdefSJunchao Zhang u2 = u + idx[opt->copy_start[i]]*MBS; \ 221*b23bfdefSJunchao Zhang l = opt->copy_length[i]*MBS; \ 22240e23c03SJunchao Zhang for (j=0; j<l; j++) { \ 22340e23c03SJunchao Zhang t = u2[j]; \ 224*b23bfdefSJunchao Zhang APPLY (u2[j],t,op,p2[j]); \ 225*b23bfdefSJunchao Zhang FILTER(p2[j] = t); \ 22640e23c03SJunchao Zhang } \ 227*b23bfdefSJunchao Zhang p2 += l; \ 22840e23c03SJunchao Zhang } \ 229*b23bfdefSJunchao Zhang } else if (opt->type[r] == PETSCSF_PACKOPT_STRIDE) { \ 230*b23bfdefSJunchao Zhang u2 = u + idx[opt->offset[r]]*MBS; \ 23140e23c03SJunchao Zhang step = opt->stride_step[r]; \ 23240e23c03SJunchao Zhang for (i=0; i<opt->stride_n[r]; i++) \ 233*b23bfdefSJunchao Zhang for (j=0; j<M; j++) \ 234*b23bfdefSJunchao Zhang for (k=0; k<BS; k++) { \ 235*b23bfdefSJunchao Zhang t = u2[i*step*MBS+j*BS+k]; \ 236*b23bfdefSJunchao Zhang APPLY (u2[i*step*MBS+j*BS+k],t,op,p2[i*MBS+j*BS+k]); \ 237*b23bfdefSJunchao Zhang FILTER(p2[i*MBS+j*BS+k] = t); \ 23840e23c03SJunchao Zhang } \ 239*b23bfdefSJunchao Zhang } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Unknown SFPack optimzation type %D",opt->type[r]); \ 24040e23c03SJunchao Zhang } \ 24140e23c03SJunchao Zhang } \ 24240e23c03SJunchao Zhang PetscFunctionReturn(0); \ 24340e23c03SJunchao Zhang } 24440e23c03SJunchao Zhang 24540e23c03SJunchao Zhang /* DEF_ActionAndXloc - macro defining a Unpack(Fetch)AndMaxloc(Minloc) routine 24640e23c03SJunchao Zhang 24740e23c03SJunchao Zhang Arguments: 24840e23c03SJunchao Zhang +Action Unpack or Fetch 24940e23c03SJunchao Zhang .locname Max or Min 25040e23c03SJunchao Zhang .type1 Type of the first data in a pair type 25140e23c03SJunchao Zhang .type2 Type of the second data in a pair type, usually PetscMPIInt for MPI ranks. 25240e23c03SJunchao Zhang .op > or < 25340e23c03SJunchao Zhang .FILTER Macro defining what to do with a statement, either EXECUTE or IGNORE 254*b23bfdefSJunchao Zhang .CType Type with or without the const qualifier, i.e., const PairType(Type1,Type2) or PairType(Type1,Type2) 255*b23bfdefSJunchao Zhang -Cvoid void with or without the const qualifier, i.e., const void or void 25640e23c03SJunchao Zhang */ 257*b23bfdefSJunchao Zhang #define DEF_ActionAndXloc(action,locname,Type1,Type2,op,FILTER,CType,Cvoid) \ 258*b23bfdefSJunchao Zhang static PetscErrorCode CPPJoin4(action##And##locname##loc,PairType(Type1,Type2),1,1)(PetscInt count,const PetscInt *idx,PetscInt bs,PetscSFPackOpt opt,void *unpacked,Cvoid *packed) { \ 259*b23bfdefSJunchao Zhang PairType(Type1,Type2) *u = (PairType(Type1,Type2)*)unpacked; \ 260*b23bfdefSJunchao Zhang CType *p = (CType*)packed; \ 261*b23bfdefSJunchao Zhang PetscInt i,j; \ 262*b23bfdefSJunchao Zhang for (i=0; i<count; i++) { \ 263*b23bfdefSJunchao Zhang j = idx? idx[i] : i; \ 264*b23bfdefSJunchao Zhang FILTER(PairType(Type1,Type2) v = u[j]); \ 26540e23c03SJunchao Zhang if (p[i].a op u[j].a) { \ 26640e23c03SJunchao Zhang u[j] = p[i]; \ 26740e23c03SJunchao Zhang } else if (p[i].a == u[j].a) { \ 26840e23c03SJunchao Zhang u[j].b = PetscMin(u[j].b,p[i].b); /* Minimal rank. Ref MPI MAXLOC */ \ 26940e23c03SJunchao Zhang } \ 27040e23c03SJunchao Zhang FILTER(p[i] = v); \ 27140e23c03SJunchao Zhang } \ 27240e23c03SJunchao Zhang PetscFunctionReturn(0); \ 27340e23c03SJunchao Zhang } 27440e23c03SJunchao Zhang 275*b23bfdefSJunchao Zhang /* Pack, Unpack/Fetch ops */ 276*b23bfdefSJunchao Zhang #define DEF_Pack(Type,BS,EQ) \ 277*b23bfdefSJunchao Zhang DEF_PackFunc(Type,BS,EQ) \ 278*b23bfdefSJunchao Zhang DEF_Action(Unpack,Type,BS,EQ,IGNORE,const Type,const void) \ 279*b23bfdefSJunchao Zhang DEF_Action(Fetch, Type,BS,EQ,EXECUTE,Type,void) \ 280*b23bfdefSJunchao Zhang static void CPPJoin4(PackInit_Pack,Type,BS,EQ)(PetscSFPack link) { \ 281*b23bfdefSJunchao Zhang link->Pack = CPPJoin4(Pack, Type,BS,EQ); \ 282*b23bfdefSJunchao Zhang link->UnpackAndInsert = CPPJoin4(UnpackAndInsert,Type,BS,EQ); \ 283*b23bfdefSJunchao Zhang link->FetchAndInsert = CPPJoin4(FetchAndInsert, Type,BS,EQ); \ 284*b23bfdefSJunchao Zhang link->unitbytes = sizeof(Type); \ 28540e23c03SJunchao Zhang } 28640e23c03SJunchao Zhang 287*b23bfdefSJunchao Zhang /* Add, Mult ops */ 288*b23bfdefSJunchao Zhang #define DEF_Add(Type,BS,EQ) \ 289*b23bfdefSJunchao Zhang DEF_ActionAndOp(Unpack,Add, Type,BS,EQ,+,BINARY_OP,IGNORE,const Type,const void) \ 290*b23bfdefSJunchao Zhang DEF_ActionAndOp(Unpack,Mult,Type,BS,EQ,*,BINARY_OP,IGNORE,const Type,const void) \ 291*b23bfdefSJunchao Zhang DEF_ActionAndOp(Fetch, Add, Type,BS,EQ,+,BINARY_OP,EXECUTE,Type,void) \ 292*b23bfdefSJunchao Zhang DEF_ActionAndOp(Fetch, Mult,Type,BS,EQ,*,BINARY_OP,EXECUTE,Type,void) \ 293*b23bfdefSJunchao Zhang static void CPPJoin4(PackInit_Add,Type,BS,EQ)(PetscSFPack link) { \ 294*b23bfdefSJunchao Zhang link->UnpackAndAdd = CPPJoin4(UnpackAndAdd, Type,BS,EQ); \ 295*b23bfdefSJunchao Zhang link->UnpackAndMult = CPPJoin4(UnpackAndMult, Type,BS,EQ); \ 296*b23bfdefSJunchao Zhang link->FetchAndAdd = CPPJoin4(FetchAndAdd, Type,BS,EQ); \ 297*b23bfdefSJunchao Zhang link->FetchAndMult = CPPJoin4(FetchAndMult, Type,BS,EQ); \ 29840e23c03SJunchao Zhang } 29940e23c03SJunchao Zhang 300*b23bfdefSJunchao Zhang /* Max, Min ops */ 301*b23bfdefSJunchao Zhang #define DEF_Cmp(Type,BS,EQ) \ 302*b23bfdefSJunchao Zhang DEF_ActionAndOp(Unpack,Max,Type,BS,EQ,PetscMax,FUNCTION_OP,IGNORE,const Type,const void) \ 303*b23bfdefSJunchao Zhang DEF_ActionAndOp(Unpack,Min,Type,BS,EQ,PetscMin,FUNCTION_OP,IGNORE,const Type,const void) \ 304*b23bfdefSJunchao Zhang DEF_ActionAndOp(Fetch, Max,Type,BS,EQ,PetscMax,FUNCTION_OP,EXECUTE,Type,void) \ 305*b23bfdefSJunchao Zhang DEF_ActionAndOp(Fetch, Min,Type,BS,EQ,PetscMin,FUNCTION_OP,EXECUTE,Type,void) \ 306*b23bfdefSJunchao Zhang static void CPPJoin4(PackInit_Compare,Type,BS,EQ)(PetscSFPack link) { \ 307*b23bfdefSJunchao Zhang link->UnpackAndMax = CPPJoin4(UnpackAndMax, Type,BS,EQ); \ 308*b23bfdefSJunchao Zhang link->UnpackAndMin = CPPJoin4(UnpackAndMin, Type,BS,EQ); \ 309*b23bfdefSJunchao Zhang link->FetchAndMax = CPPJoin4(FetchAndMax , Type,BS,EQ); \ 310*b23bfdefSJunchao Zhang link->FetchAndMin = CPPJoin4(FetchAndMin , Type,BS,EQ); \ 311*b23bfdefSJunchao Zhang } 312*b23bfdefSJunchao Zhang 313*b23bfdefSJunchao Zhang /* Logical ops. 314*b23bfdefSJunchao Zhang The operator in LXOR_OP should be empty but is &. It is not used. Put here to avoid 31540e23c03SJunchao Zhang the compilation warning "empty macro arguments are undefined in ISO C90" 31640e23c03SJunchao Zhang */ 317*b23bfdefSJunchao Zhang #define DEF_Log(Type,BS,EQ) \ 318*b23bfdefSJunchao Zhang DEF_ActionAndOp(Unpack,LAND,Type,BS,EQ,&&,BINARY_OP,IGNORE,const Type,const void) \ 319*b23bfdefSJunchao Zhang DEF_ActionAndOp(Unpack,LOR, Type,BS,EQ,||,BINARY_OP,IGNORE,const Type,const void) \ 320*b23bfdefSJunchao Zhang DEF_ActionAndOp(Unpack,LXOR,Type,BS,EQ,&, LXOR_OP, IGNORE,const Type,const void) \ 321*b23bfdefSJunchao Zhang DEF_ActionAndOp(Fetch, LAND,Type,BS,EQ,&&,BINARY_OP,EXECUTE,Type,void) \ 322*b23bfdefSJunchao Zhang DEF_ActionAndOp(Fetch, LOR, Type,BS,EQ,||,BINARY_OP,EXECUTE,Type,void) \ 323*b23bfdefSJunchao Zhang DEF_ActionAndOp(Fetch, LXOR,Type,BS,EQ,&, LXOR_OP, EXECUTE,Type,void) \ 324*b23bfdefSJunchao Zhang static void CPPJoin4(PackInit_Logical,Type,BS,EQ)(PetscSFPack link) { \ 325*b23bfdefSJunchao Zhang link->UnpackAndLAND = CPPJoin4(UnpackAndLAND,Type,BS,EQ); \ 326*b23bfdefSJunchao Zhang link->UnpackAndLOR = CPPJoin4(UnpackAndLOR, Type,BS,EQ); \ 327*b23bfdefSJunchao Zhang link->UnpackAndLXOR = CPPJoin4(UnpackAndLXOR,Type,BS,EQ); \ 328*b23bfdefSJunchao Zhang link->FetchAndLAND = CPPJoin4(FetchAndLAND, Type,BS,EQ); \ 329*b23bfdefSJunchao Zhang link->FetchAndLOR = CPPJoin4(FetchAndLOR, Type,BS,EQ); \ 330*b23bfdefSJunchao Zhang link->FetchAndLXOR = CPPJoin4(FetchAndLXOR, Type,BS,EQ); \ 33140e23c03SJunchao Zhang } 33240e23c03SJunchao Zhang 333*b23bfdefSJunchao Zhang /* Bitwise ops */ 334*b23bfdefSJunchao Zhang #define DEF_Bit(Type,BS,EQ) \ 335*b23bfdefSJunchao Zhang DEF_ActionAndOp(Unpack,BAND,Type,BS,EQ,&,BINARY_OP,IGNORE,const Type,const void) \ 336*b23bfdefSJunchao Zhang DEF_ActionAndOp(Unpack,BOR, Type,BS,EQ,|,BINARY_OP,IGNORE,const Type,const void) \ 337*b23bfdefSJunchao Zhang DEF_ActionAndOp(Unpack,BXOR,Type,BS,EQ,^,BINARY_OP,IGNORE,const Type,const void) \ 338*b23bfdefSJunchao Zhang DEF_ActionAndOp(Fetch, BAND,Type,BS,EQ,&,BINARY_OP,EXECUTE,Type,void) \ 339*b23bfdefSJunchao Zhang DEF_ActionAndOp(Fetch, BOR, Type,BS,EQ,|,BINARY_OP,EXECUTE,Type,void) \ 340*b23bfdefSJunchao Zhang DEF_ActionAndOp(Fetch, BXOR,Type,BS,EQ,^,BINARY_OP,EXECUTE,Type,void) \ 341*b23bfdefSJunchao Zhang static void CPPJoin4(PackInit_Bitwise,Type,BS,EQ)(PetscSFPack link) { \ 342*b23bfdefSJunchao Zhang link->UnpackAndBAND = CPPJoin4(UnpackAndBAND,Type,BS,EQ); \ 343*b23bfdefSJunchao Zhang link->UnpackAndBOR = CPPJoin4(UnpackAndBOR, Type,BS,EQ); \ 344*b23bfdefSJunchao Zhang link->UnpackAndBXOR = CPPJoin4(UnpackAndBXOR,Type,BS,EQ); \ 345*b23bfdefSJunchao Zhang link->FetchAndBAND = CPPJoin4(FetchAndBAND, Type,BS,EQ); \ 346*b23bfdefSJunchao Zhang link->FetchAndBOR = CPPJoin4(FetchAndBOR, Type,BS,EQ); \ 347*b23bfdefSJunchao Zhang link->FetchAndBXOR = CPPJoin4(FetchAndBXOR, Type,BS,EQ); \ 34840e23c03SJunchao Zhang } 34940e23c03SJunchao Zhang 350*b23bfdefSJunchao Zhang /* Maxloc, Minloc */ 351*b23bfdefSJunchao Zhang #define DEF_Xloc(Type1,Type2) \ 352*b23bfdefSJunchao Zhang DEF_ActionAndXloc(Unpack,Max,Type1,Type2,>,IGNORE,const PairType(Type1,Type2),const void) \ 353*b23bfdefSJunchao Zhang DEF_ActionAndXloc(Unpack,Min,Type1,Type2,<,IGNORE,const PairType(Type1,Type2),const void) \ 354*b23bfdefSJunchao Zhang DEF_ActionAndXloc(Fetch, Max,Type1,Type2,>,EXECUTE,PairType(Type1,Type2),void) \ 355*b23bfdefSJunchao Zhang DEF_ActionAndXloc(Fetch, Min,Type1,Type2,<,EXECUTE,PairType(Type1,Type2),void) \ 356*b23bfdefSJunchao Zhang static void CPPJoin3(PackInit_Xloc,Type1,Type2)(PetscSFPack link) { \ 357*b23bfdefSJunchao Zhang link->UnpackAndMaxloc = CPPJoin4(UnpackAndMaxloc,PairType(Type1,Type2),1,1); \ 358*b23bfdefSJunchao Zhang link->UnpackAndMinloc = CPPJoin4(UnpackAndMinloc,PairType(Type1,Type2),1,1); \ 359*b23bfdefSJunchao Zhang link->FetchAndMaxloc = CPPJoin4(FetchAndMaxloc, PairType(Type1,Type2),1,1); \ 360*b23bfdefSJunchao Zhang link->FetchAndMinloc = CPPJoin4(FetchAndMinloc, PairType(Type1,Type2),1,1); \ 36140e23c03SJunchao Zhang } 36240e23c03SJunchao Zhang 363*b23bfdefSJunchao Zhang #define DEF_IntegerType(Type,BS,EQ) \ 364*b23bfdefSJunchao Zhang DEF_Pack(Type,BS,EQ) \ 365*b23bfdefSJunchao Zhang DEF_Add(Type,BS,EQ) \ 366*b23bfdefSJunchao Zhang DEF_Cmp(Type,BS,EQ) \ 367*b23bfdefSJunchao Zhang DEF_Log(Type,BS,EQ) \ 368*b23bfdefSJunchao Zhang DEF_Bit(Type,BS,EQ) \ 369*b23bfdefSJunchao Zhang static void CPPJoin4(PackInit_IntegerType,Type,BS,EQ)(PetscSFPack link) { \ 370*b23bfdefSJunchao Zhang CPPJoin4(PackInit_Pack,Type,BS,EQ)(link); \ 371*b23bfdefSJunchao Zhang CPPJoin4(PackInit_Add,Type,BS,EQ)(link); \ 372*b23bfdefSJunchao Zhang CPPJoin4(PackInit_Compare,Type,BS,EQ)(link); \ 373*b23bfdefSJunchao Zhang CPPJoin4(PackInit_Logical,Type,BS,EQ)(link); \ 374*b23bfdefSJunchao Zhang CPPJoin4(PackInit_Bitwise,Type,BS,EQ)(link); \ 37540e23c03SJunchao Zhang } 37640e23c03SJunchao Zhang 377*b23bfdefSJunchao Zhang #define DEF_RealType(Type,BS,EQ) \ 378*b23bfdefSJunchao Zhang DEF_Pack(Type,BS,EQ) \ 379*b23bfdefSJunchao Zhang DEF_Add(Type,BS,EQ) \ 380*b23bfdefSJunchao Zhang DEF_Cmp(Type,BS,EQ) \ 381*b23bfdefSJunchao Zhang DEF_Log(Type,BS,EQ) \ 382*b23bfdefSJunchao Zhang static void CPPJoin4(PackInit_RealType,Type,BS,EQ)(PetscSFPack link) { \ 383*b23bfdefSJunchao Zhang CPPJoin4(PackInit_Pack,Type,BS,EQ)(link); \ 384*b23bfdefSJunchao Zhang CPPJoin4(PackInit_Add,Type,BS,EQ)(link); \ 385*b23bfdefSJunchao Zhang CPPJoin4(PackInit_Compare,Type,BS,EQ)(link); \ 386*b23bfdefSJunchao Zhang CPPJoin4(PackInit_Logical,Type,BS,EQ)(link); \ 387*b23bfdefSJunchao Zhang } 38840e23c03SJunchao Zhang 38940e23c03SJunchao Zhang #if defined(PETSC_HAVE_COMPLEX) 390*b23bfdefSJunchao Zhang #define DEF_ComplexType(Type,BS,EQ) \ 391*b23bfdefSJunchao Zhang DEF_Pack(Type,BS,EQ) \ 392*b23bfdefSJunchao Zhang DEF_Add(Type,BS,EQ) \ 393*b23bfdefSJunchao Zhang static void CPPJoin4(PackInit_ComplexType,Type,BS,EQ)(PetscSFPack link) { \ 394*b23bfdefSJunchao Zhang CPPJoin4(PackInit_Pack,Type,BS,EQ)(link); \ 395*b23bfdefSJunchao Zhang CPPJoin4(PackInit_Add,Type,BS,EQ)(link); \ 396*b23bfdefSJunchao Zhang } 39740e23c03SJunchao Zhang #endif 398*b23bfdefSJunchao Zhang 399*b23bfdefSJunchao Zhang #define DEF_DumbType(Type,BS,EQ) \ 400*b23bfdefSJunchao Zhang DEF_Pack(Type,BS,EQ) \ 401*b23bfdefSJunchao Zhang static void CPPJoin4(PackInit_DumbType,Type,BS,EQ)(PetscSFPack link) { \ 402*b23bfdefSJunchao Zhang CPPJoin4(PackInit_Pack,Type,BS,EQ)(link); \ 403*b23bfdefSJunchao Zhang } 404*b23bfdefSJunchao Zhang 405*b23bfdefSJunchao Zhang /* Maxloc, Minloc */ 406*b23bfdefSJunchao Zhang #define DEF_PairType(Type1,Type2) \ 407*b23bfdefSJunchao Zhang typedef struct {Type1 a; Type2 b;} PairType(Type1,Type2); \ 408*b23bfdefSJunchao Zhang DEF_Pack(PairType(Type1,Type2),1,1) \ 409*b23bfdefSJunchao Zhang DEF_Xloc(Type1,Type2) \ 410*b23bfdefSJunchao Zhang static void CPPJoin3(PackInit_PairType,Type1,Type2)(PetscSFPack link) { \ 411*b23bfdefSJunchao Zhang CPPJoin4(PackInit_Pack,PairType(Type1,Type2),1,1)(link); \ 412*b23bfdefSJunchao Zhang CPPJoin3(PackInit_Xloc,Type1,Type2)(link); \ 413*b23bfdefSJunchao Zhang } 414*b23bfdefSJunchao Zhang 415*b23bfdefSJunchao Zhang DEF_IntegerType(PetscInt,1,1) /* unit = 1 MPIU_INT */ 416*b23bfdefSJunchao Zhang DEF_IntegerType(PetscInt,2,1) /* unit = 2 MPIU_INTs */ 417*b23bfdefSJunchao Zhang DEF_IntegerType(PetscInt,4,1) /* unit = 4 MPIU_INTs */ 418*b23bfdefSJunchao Zhang DEF_IntegerType(PetscInt,8,1) /* unit = 8 MPIU_INTs */ 419*b23bfdefSJunchao Zhang DEF_IntegerType(PetscInt,1,0) /* unit = 1*n MPIU_INTs, n>1 */ 420*b23bfdefSJunchao Zhang DEF_IntegerType(PetscInt,2,0) /* unit = 2*n MPIU_INTs, n>1 */ 421*b23bfdefSJunchao Zhang DEF_IntegerType(PetscInt,4,0) /* unit = 4*n MPIU_INTs, n>1 */ 422*b23bfdefSJunchao Zhang DEF_IntegerType(PetscInt,8,0) /* unit = 8*n MPIU_INTs, n>1. Routines with bigger BS are tried first. */ 423*b23bfdefSJunchao Zhang 424*b23bfdefSJunchao Zhang #if defined(PETSC_USE_64BIT_INDICES) /* Do not need (though it is OK) to generate redundant functions if PetscInt is int */ 425*b23bfdefSJunchao Zhang DEF_IntegerType(int,1,1) 426*b23bfdefSJunchao Zhang DEF_IntegerType(int,2,1) 427*b23bfdefSJunchao Zhang DEF_IntegerType(int,4,1) 428*b23bfdefSJunchao Zhang DEF_IntegerType(int,8,1) 429*b23bfdefSJunchao Zhang DEF_IntegerType(int,1,0) 430*b23bfdefSJunchao Zhang DEF_IntegerType(int,2,0) 431*b23bfdefSJunchao Zhang DEF_IntegerType(int,4,0) 432*b23bfdefSJunchao Zhang DEF_IntegerType(int,8,0) 433*b23bfdefSJunchao Zhang #endif 434*b23bfdefSJunchao Zhang 435*b23bfdefSJunchao Zhang /* The typedefs are used to get a typename without space that CPPJoin can handle */ 436*b23bfdefSJunchao Zhang typedef signed char SignedChar; 437*b23bfdefSJunchao Zhang DEF_IntegerType(SignedChar,1,1) 438*b23bfdefSJunchao Zhang DEF_IntegerType(SignedChar,2,1) 439*b23bfdefSJunchao Zhang DEF_IntegerType(SignedChar,4,1) 440*b23bfdefSJunchao Zhang DEF_IntegerType(SignedChar,8,1) 441*b23bfdefSJunchao Zhang DEF_IntegerType(SignedChar,1,0) 442*b23bfdefSJunchao Zhang DEF_IntegerType(SignedChar,2,0) 443*b23bfdefSJunchao Zhang DEF_IntegerType(SignedChar,4,0) 444*b23bfdefSJunchao Zhang DEF_IntegerType(SignedChar,8,0) 445*b23bfdefSJunchao Zhang 446*b23bfdefSJunchao Zhang typedef unsigned char UnsignedChar; 447*b23bfdefSJunchao Zhang DEF_IntegerType(UnsignedChar,1,1) 448*b23bfdefSJunchao Zhang DEF_IntegerType(UnsignedChar,2,1) 449*b23bfdefSJunchao Zhang DEF_IntegerType(UnsignedChar,4,1) 450*b23bfdefSJunchao Zhang DEF_IntegerType(UnsignedChar,8,1) 451*b23bfdefSJunchao Zhang DEF_IntegerType(UnsignedChar,1,0) 452*b23bfdefSJunchao Zhang DEF_IntegerType(UnsignedChar,2,0) 453*b23bfdefSJunchao Zhang DEF_IntegerType(UnsignedChar,4,0) 454*b23bfdefSJunchao Zhang DEF_IntegerType(UnsignedChar,8,0) 455*b23bfdefSJunchao Zhang 456*b23bfdefSJunchao Zhang DEF_RealType(PetscReal,1,1) 457*b23bfdefSJunchao Zhang DEF_RealType(PetscReal,2,1) 458*b23bfdefSJunchao Zhang DEF_RealType(PetscReal,4,1) 459*b23bfdefSJunchao Zhang DEF_RealType(PetscReal,8,1) 460*b23bfdefSJunchao Zhang DEF_RealType(PetscReal,1,0) 461*b23bfdefSJunchao Zhang DEF_RealType(PetscReal,2,0) 462*b23bfdefSJunchao Zhang DEF_RealType(PetscReal,4,0) 463*b23bfdefSJunchao Zhang DEF_RealType(PetscReal,8,0) 464*b23bfdefSJunchao Zhang 465*b23bfdefSJunchao Zhang #if defined(PETSC_HAVE_COMPLEX) 466*b23bfdefSJunchao Zhang DEF_ComplexType(PetscComplex,1,1) 467*b23bfdefSJunchao Zhang DEF_ComplexType(PetscComplex,2,1) 468*b23bfdefSJunchao Zhang DEF_ComplexType(PetscComplex,4,1) 469*b23bfdefSJunchao Zhang DEF_ComplexType(PetscComplex,8,1) 470*b23bfdefSJunchao Zhang DEF_ComplexType(PetscComplex,1,0) 471*b23bfdefSJunchao Zhang DEF_ComplexType(PetscComplex,2,0) 472*b23bfdefSJunchao Zhang DEF_ComplexType(PetscComplex,4,0) 473*b23bfdefSJunchao Zhang DEF_ComplexType(PetscComplex,8,0) 474*b23bfdefSJunchao Zhang #endif 475*b23bfdefSJunchao Zhang 476*b23bfdefSJunchao Zhang DEF_PairType(int,int) 477*b23bfdefSJunchao Zhang DEF_PairType(PetscInt,PetscInt) 478*b23bfdefSJunchao Zhang 479*b23bfdefSJunchao Zhang /* If we don't know the basic type, we treat it as a stream of chars or ints */ 480*b23bfdefSJunchao Zhang DEF_DumbType(char,1,1) 481*b23bfdefSJunchao Zhang DEF_DumbType(char,2,1) 482*b23bfdefSJunchao Zhang DEF_DumbType(char,4,1) 483*b23bfdefSJunchao Zhang DEF_DumbType(char,1,0) 484*b23bfdefSJunchao Zhang DEF_DumbType(char,2,0) 485*b23bfdefSJunchao Zhang DEF_DumbType(char,4,0) 486*b23bfdefSJunchao Zhang 487*b23bfdefSJunchao Zhang typedef int DumbInt; /* To differentiate with regular int used above */ 488*b23bfdefSJunchao Zhang DEF_DumbType(DumbInt,1,1) 489*b23bfdefSJunchao Zhang DEF_DumbType(DumbInt,2,1) 490*b23bfdefSJunchao Zhang DEF_DumbType(DumbInt,4,1) 491*b23bfdefSJunchao Zhang DEF_DumbType(DumbInt,8,1) 492*b23bfdefSJunchao Zhang DEF_DumbType(DumbInt,1,0) 493*b23bfdefSJunchao Zhang DEF_DumbType(DumbInt,2,0) 494*b23bfdefSJunchao Zhang DEF_DumbType(DumbInt,4,0) 495*b23bfdefSJunchao Zhang DEF_DumbType(DumbInt,8,0) 49640e23c03SJunchao Zhang 49740e23c03SJunchao Zhang #if !defined(PETSC_HAVE_MPI_TYPE_DUP) 49840e23c03SJunchao Zhang PETSC_STATIC_INLINE int MPI_Type_dup(MPI_Datatype datatype,MPI_Datatype *newtype) 49940e23c03SJunchao Zhang { 50040e23c03SJunchao Zhang int ierr; 50140e23c03SJunchao Zhang ierr = MPI_Type_contiguous(1,datatype,newtype); if (ierr) return ierr; 50240e23c03SJunchao Zhang ierr = MPI_Type_commit(newtype); if (ierr) return ierr; 50340e23c03SJunchao Zhang return MPI_SUCCESS; 50440e23c03SJunchao Zhang } 50540e23c03SJunchao Zhang #endif 50640e23c03SJunchao Zhang 5079d1c8addSJunchao Zhang PetscErrorCode PetscSFPackGetInUse(PetscSF sf,MPI_Datatype unit,const void *rkey,const void *lkey,PetscCopyMode cmode,PetscSFPack *mylink) 50840e23c03SJunchao Zhang { 50940e23c03SJunchao Zhang PetscErrorCode ierr; 51040e23c03SJunchao Zhang PetscSFPack link,*p; 51140e23c03SJunchao Zhang PetscSF_Basic *bas=(PetscSF_Basic*)sf->data; 51240e23c03SJunchao Zhang 51340e23c03SJunchao Zhang PetscFunctionBegin; 51440e23c03SJunchao Zhang /* Look for types in cache */ 51540e23c03SJunchao Zhang for (p=&bas->inuse; (link=*p); p=&link->next) { 51640e23c03SJunchao Zhang PetscBool match; 51740e23c03SJunchao Zhang ierr = MPIPetsc_Type_compare(unit,link->unit,&match);CHKERRQ(ierr); 5189d1c8addSJunchao Zhang if (match && (rkey == link->rkey) && (lkey == link->lkey)) { 51940e23c03SJunchao Zhang switch (cmode) { 52040e23c03SJunchao Zhang case PETSC_OWN_POINTER: *p = link->next; break; /* Remove from inuse list */ 52140e23c03SJunchao Zhang case PETSC_USE_POINTER: break; 52240e23c03SJunchao Zhang default: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"invalid cmode"); 52340e23c03SJunchao Zhang } 52440e23c03SJunchao Zhang *mylink = link; 52540e23c03SJunchao Zhang PetscFunctionReturn(0); 52640e23c03SJunchao Zhang } 52740e23c03SJunchao Zhang } 52840e23c03SJunchao Zhang SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Could not find pack"); 52940e23c03SJunchao Zhang PetscFunctionReturn(0); 53040e23c03SJunchao Zhang } 53140e23c03SJunchao Zhang 53240e23c03SJunchao Zhang PetscErrorCode PetscSFPackReclaim(PetscSF sf,PetscSFPack *link) 53340e23c03SJunchao Zhang { 53440e23c03SJunchao Zhang PetscSF_Basic *bas=(PetscSF_Basic*)sf->data; 53540e23c03SJunchao Zhang 53640e23c03SJunchao Zhang PetscFunctionBegin; 5379d1c8addSJunchao Zhang (*link)->rkey = NULL; 5389d1c8addSJunchao Zhang (*link)->lkey = NULL; 53940e23c03SJunchao Zhang (*link)->next = bas->avail; 54040e23c03SJunchao Zhang bas->avail = *link; 54140e23c03SJunchao Zhang *link = NULL; 54240e23c03SJunchao Zhang PetscFunctionReturn(0); 54340e23c03SJunchao Zhang } 54440e23c03SJunchao Zhang 5459d1c8addSJunchao Zhang /* Error out on unsupported overlapped communications */ 5469d1c8addSJunchao Zhang PetscErrorCode PetscSFPackSetErrorOnUnsupportedOverlap(PetscSF sf,MPI_Datatype unit,const void *rkey,const void *lkey) 5479d1c8addSJunchao Zhang { 5489d1c8addSJunchao Zhang PetscErrorCode ierr; 5499d1c8addSJunchao Zhang PetscSFPack link,*p; 5509d1c8addSJunchao Zhang PetscSF_Basic *bas=(PetscSF_Basic*)sf->data; 5519d1c8addSJunchao Zhang PetscBool match; 5529d1c8addSJunchao Zhang 5539d1c8addSJunchao Zhang PetscFunctionBegin; 55418fb5014SJunchao Zhang /* Look up links in use and error out if there is a match. When both rootdata and leafdata are NULL, ignore 55518fb5014SJunchao Zhang the potential overlapping since this process does not participate in communication. Overlapping is harmless. 55618fb5014SJunchao Zhang */ 55718fb5014SJunchao Zhang if (rkey || lkey) { 5589d1c8addSJunchao Zhang for (p=&bas->inuse; (link=*p); p=&link->next) { 5599d1c8addSJunchao Zhang ierr = MPIPetsc_Type_compare(unit,link->unit,&match);CHKERRQ(ierr); 56033c49614SJunchao Zhang if (match && (rkey == link->rkey) && (lkey == link->lkey)) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for overlapped PetscSF communications with the same SF, rootdata(%p), leafdata(%p) and data type. You can undo the overlap to avoid the error.",rkey,lkey); 5619d1c8addSJunchao Zhang } 56218fb5014SJunchao Zhang } 5639d1c8addSJunchao Zhang PetscFunctionReturn(0); 5649d1c8addSJunchao Zhang } 5659d1c8addSJunchao Zhang 56640e23c03SJunchao Zhang PetscErrorCode PetscSFPackSetupType(PetscSFPack link,MPI_Datatype unit) 56740e23c03SJunchao Zhang { 56840e23c03SJunchao Zhang PetscErrorCode ierr; 569*b23bfdefSJunchao Zhang PetscInt nSignedChar=0,nUnsignedChar=0,nInt=0,nPetscInt=0,nPetscReal=0; 570*b23bfdefSJunchao Zhang PetscBool is2Int,is2PetscInt; 57140e23c03SJunchao Zhang PetscMPIInt ni,na,nd,combiner; 57240e23c03SJunchao Zhang #if defined(PETSC_HAVE_COMPLEX) 573*b23bfdefSJunchao Zhang PetscInt nPetscComplex=0; 57440e23c03SJunchao Zhang #endif 57540e23c03SJunchao Zhang 57640e23c03SJunchao Zhang PetscFunctionBegin; 577*b23bfdefSJunchao Zhang ierr = MPIPetsc_Type_compare_contig(unit,MPI_SIGNED_CHAR, &nSignedChar);CHKERRQ(ierr); 578*b23bfdefSJunchao Zhang ierr = MPIPetsc_Type_compare_contig(unit,MPI_UNSIGNED_CHAR,&nUnsignedChar);CHKERRQ(ierr); 579*b23bfdefSJunchao Zhang /* MPI_CHAR is treated below as a dumb type that does not support reduction according to MPI standard */ 580*b23bfdefSJunchao Zhang ierr = MPIPetsc_Type_compare_contig(unit,MPI_INT, &nInt);CHKERRQ(ierr); 581*b23bfdefSJunchao Zhang ierr = MPIPetsc_Type_compare_contig(unit,MPIU_INT, &nPetscInt);CHKERRQ(ierr); 582*b23bfdefSJunchao Zhang ierr = MPIPetsc_Type_compare_contig(unit,MPIU_REAL,&nPetscReal);CHKERRQ(ierr); 58340e23c03SJunchao Zhang #if defined(PETSC_HAVE_COMPLEX) 584*b23bfdefSJunchao Zhang ierr = MPIPetsc_Type_compare_contig(unit,MPIU_COMPLEX,&nPetscComplex);CHKERRQ(ierr); 58540e23c03SJunchao Zhang #endif 58640e23c03SJunchao Zhang ierr = MPIPetsc_Type_compare(unit,MPI_2INT,&is2Int);CHKERRQ(ierr); 58740e23c03SJunchao Zhang ierr = MPIPetsc_Type_compare(unit,MPIU_2INT,&is2PetscInt);CHKERRQ(ierr); 588*b23bfdefSJunchao Zhang /* TODO: shaell we also handle Fortran MPI_2REAL? */ 58940e23c03SJunchao Zhang ierr = MPI_Type_get_envelope(unit,&ni,&na,&nd,&combiner);CHKERRQ(ierr); 59040e23c03SJunchao Zhang link->isbuiltin = (combiner == MPI_COMBINER_NAMED) ? PETSC_TRUE : PETSC_FALSE; 591*b23bfdefSJunchao Zhang link->bs = 1; /* default */ 59240e23c03SJunchao Zhang 593*b23bfdefSJunchao Zhang if (nPetscReal) { 594*b23bfdefSJunchao Zhang if (nPetscReal == 8) PackInit_RealType_PetscReal_8_1(link); else if (nPetscReal%8 == 0) PackInit_RealType_PetscReal_8_0(link); 595*b23bfdefSJunchao Zhang else if (nPetscReal == 4) PackInit_RealType_PetscReal_4_1(link); else if (nPetscReal%4 == 0) PackInit_RealType_PetscReal_4_0(link); 596*b23bfdefSJunchao Zhang else if (nPetscReal == 2) PackInit_RealType_PetscReal_2_1(link); else if (nPetscReal%2 == 0) PackInit_RealType_PetscReal_2_0(link); 597*b23bfdefSJunchao Zhang else if (nPetscReal == 1) PackInit_RealType_PetscReal_1_1(link); else if (nPetscReal%1 == 0) PackInit_RealType_PetscReal_1_0(link); 598*b23bfdefSJunchao Zhang link->bs = nPetscReal; 599*b23bfdefSJunchao Zhang link->unitbytes *= nPetscReal; 60040e23c03SJunchao Zhang link->basicunit = MPIU_REAL; 601*b23bfdefSJunchao Zhang } else if (nPetscInt) { 602*b23bfdefSJunchao Zhang if (nPetscInt == 8) PackInit_IntegerType_PetscInt_8_1(link); else if (nPetscInt%8 == 0) PackInit_IntegerType_PetscInt_8_0(link); 603*b23bfdefSJunchao Zhang else if (nPetscInt == 4) PackInit_IntegerType_PetscInt_4_1(link); else if (nPetscInt%4 == 0) PackInit_IntegerType_PetscInt_4_0(link); 604*b23bfdefSJunchao Zhang else if (nPetscInt == 2) PackInit_IntegerType_PetscInt_2_1(link); else if (nPetscInt%2 == 0) PackInit_IntegerType_PetscInt_2_0(link); 605*b23bfdefSJunchao Zhang else if (nPetscInt == 1) PackInit_IntegerType_PetscInt_1_1(link); else if (nPetscInt%1 == 0) PackInit_IntegerType_PetscInt_1_0(link); 606*b23bfdefSJunchao Zhang link->bs = nPetscInt; 607*b23bfdefSJunchao Zhang link->unitbytes *= nPetscInt; 608*b23bfdefSJunchao Zhang link->basicunit = MPIU_INT; 609*b23bfdefSJunchao Zhang #if defined(PETSC_USE_64BIT_INDICES) 610*b23bfdefSJunchao Zhang } else if (nInt) { 611*b23bfdefSJunchao Zhang if (nInt == 8) PackInit_IntegerType_int_8_1(link); else if (nInt%8 == 0) PackInit_IntegerType_int_8_0(link); 612*b23bfdefSJunchao Zhang else if (nInt == 4) PackInit_IntegerType_int_4_1(link); else if (nInt%4 == 0) PackInit_IntegerType_int_4_0(link); 613*b23bfdefSJunchao Zhang else if (nInt == 2) PackInit_IntegerType_int_2_1(link); else if (nInt%2 == 0) PackInit_IntegerType_int_2_0(link); 614*b23bfdefSJunchao Zhang else if (nInt == 1) PackInit_IntegerType_int_1_1(link); else if (nInt%1 == 0) PackInit_IntegerType_int_1_0(link); 615*b23bfdefSJunchao Zhang link->bs = nInt; 616*b23bfdefSJunchao Zhang link->unitbytes *= nInt; 617*b23bfdefSJunchao Zhang link->basicunit = MPI_INT; 618*b23bfdefSJunchao Zhang #endif 619*b23bfdefSJunchao Zhang } else if (nSignedChar) { 620*b23bfdefSJunchao Zhang if (nSignedChar == 8) PackInit_IntegerType_SignedChar_8_1(link); else if (nSignedChar%8 == 0) PackInit_IntegerType_SignedChar_8_0(link); 621*b23bfdefSJunchao Zhang else if (nSignedChar == 4) PackInit_IntegerType_SignedChar_4_1(link); else if (nSignedChar%4 == 0) PackInit_IntegerType_SignedChar_4_0(link); 622*b23bfdefSJunchao Zhang else if (nSignedChar == 2) PackInit_IntegerType_SignedChar_2_1(link); else if (nSignedChar%2 == 0) PackInit_IntegerType_SignedChar_2_0(link); 623*b23bfdefSJunchao Zhang else if (nSignedChar == 1) PackInit_IntegerType_SignedChar_1_1(link); else if (nSignedChar%1 == 0) PackInit_IntegerType_SignedChar_1_0(link); 624*b23bfdefSJunchao Zhang link->bs = nSignedChar; 625*b23bfdefSJunchao Zhang link->unitbytes *= nSignedChar; 626*b23bfdefSJunchao Zhang link->basicunit = MPI_SIGNED_CHAR; 627*b23bfdefSJunchao Zhang } else if (nUnsignedChar) { 628*b23bfdefSJunchao Zhang if (nUnsignedChar == 8) PackInit_IntegerType_UnsignedChar_8_1(link); else if (nUnsignedChar%8 == 0) PackInit_IntegerType_UnsignedChar_8_0(link); 629*b23bfdefSJunchao Zhang else if (nUnsignedChar == 4) PackInit_IntegerType_UnsignedChar_4_1(link); else if (nUnsignedChar%4 == 0) PackInit_IntegerType_UnsignedChar_4_0(link); 630*b23bfdefSJunchao Zhang else if (nUnsignedChar == 2) PackInit_IntegerType_UnsignedChar_2_1(link); else if (nUnsignedChar%2 == 0) PackInit_IntegerType_UnsignedChar_2_0(link); 631*b23bfdefSJunchao Zhang else if (nUnsignedChar == 1) PackInit_IntegerType_UnsignedChar_1_1(link); else if (nUnsignedChar%1 == 0) PackInit_IntegerType_UnsignedChar_1_0(link); 632*b23bfdefSJunchao Zhang link->bs = nUnsignedChar; 633*b23bfdefSJunchao Zhang link->unitbytes *= nUnsignedChar; 634*b23bfdefSJunchao Zhang link->basicunit = MPI_UNSIGNED_CHAR; 63540e23c03SJunchao Zhang #if defined(PETSC_HAVE_COMPLEX) 636*b23bfdefSJunchao Zhang } else if (nPetscComplex) { 637*b23bfdefSJunchao Zhang if (nPetscComplex == 8) PackInit_ComplexType_PetscComplex_8_1(link); else if (nPetscComplex%8 == 0) PackInit_ComplexType_PetscComplex_8_0(link); 638*b23bfdefSJunchao Zhang else if (nPetscComplex == 4) PackInit_ComplexType_PetscComplex_4_1(link); else if (nPetscComplex%4 == 0) PackInit_ComplexType_PetscComplex_4_0(link); 639*b23bfdefSJunchao Zhang else if (nPetscComplex == 2) PackInit_ComplexType_PetscComplex_2_1(link); else if (nPetscComplex%2 == 0) PackInit_ComplexType_PetscComplex_2_0(link); 640*b23bfdefSJunchao Zhang else if (nPetscComplex == 1) PackInit_ComplexType_PetscComplex_1_1(link); else if (nPetscComplex%1 == 0) PackInit_ComplexType_PetscComplex_1_0(link); 641*b23bfdefSJunchao Zhang link->bs = nPetscComplex; 642*b23bfdefSJunchao Zhang link->unitbytes *= nPetscComplex; 64340e23c03SJunchao Zhang link->basicunit = MPIU_COMPLEX; 64440e23c03SJunchao Zhang #endif 645*b23bfdefSJunchao Zhang } else if (is2Int) { 646*b23bfdefSJunchao Zhang PackInit_PairType_int_int(link); 647*b23bfdefSJunchao Zhang link->bs = 1; 648*b23bfdefSJunchao Zhang link->basicunit = MPI_2INT; 649*b23bfdefSJunchao Zhang } else if (is2PetscInt) { 650*b23bfdefSJunchao Zhang PackInit_PairType_PetscInt_PetscInt(link); 651*b23bfdefSJunchao Zhang link->bs = 1; 652*b23bfdefSJunchao Zhang link->basicunit = MPIU_2INT; 65340e23c03SJunchao Zhang } else { 654*b23bfdefSJunchao Zhang MPI_Aint lb,nbyte; 655*b23bfdefSJunchao Zhang ierr = MPI_Type_get_extent(unit,&lb,&nbyte);CHKERRQ(ierr); 65640e23c03SJunchao Zhang if (lb != 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Datatype with nonzero lower bound %ld\n",(long)lb); 657*b23bfdefSJunchao Zhang if (nbyte % sizeof(DumbInt)) { /* If the type size is not multiple of int */ 658*b23bfdefSJunchao Zhang if (nbyte == 4) {PackInit_DumbType_char_4_1(link); link->bs = nbyte/4;} else if (nbyte%4 == 0) {PackInit_DumbType_char_4_0(link); link->bs = nbyte/4;} /* Note the basic type is char[4] */ 659*b23bfdefSJunchao Zhang else if (nbyte == 2) {PackInit_DumbType_char_2_1(link); link->bs = nbyte/2;} else if (nbyte%2 == 0) {PackInit_DumbType_char_2_0(link); link->bs = nbyte/2;} 660*b23bfdefSJunchao Zhang else if (nbyte == 1) {PackInit_DumbType_char_1_1(link); link->bs = nbyte/1;} else if (nbyte%1 == 0) {PackInit_DumbType_char_1_0(link); link->bs = nbyte/1;} 661*b23bfdefSJunchao Zhang link->unitbytes = nbyte; 662*b23bfdefSJunchao Zhang link->basicunit = MPI_BYTE; 66340e23c03SJunchao Zhang } else { 664*b23bfdefSJunchao Zhang nInt = nbyte / sizeof(DumbInt); 665*b23bfdefSJunchao Zhang if (nInt == 8) {PackInit_DumbType_DumbInt_8_1(link); link->bs = nInt/8;} else if (nInt%8 == 0) {PackInit_DumbType_DumbInt_8_0(link); link->bs = nInt/8;} 666*b23bfdefSJunchao Zhang else if (nInt == 4) {PackInit_DumbType_DumbInt_4_1(link); link->bs = nInt/4;} else if (nInt%4 == 0) {PackInit_DumbType_DumbInt_4_0(link); link->bs = nInt/4;} 667*b23bfdefSJunchao Zhang else if (nInt == 2) {PackInit_DumbType_DumbInt_2_1(link); link->bs = nInt/2;} else if (nInt%2 == 0) {PackInit_DumbType_DumbInt_2_0(link); link->bs = nInt/2;} 668*b23bfdefSJunchao Zhang else if (nInt == 1) {PackInit_DumbType_DumbInt_1_1(link); link->bs = nInt/1;} else if (nInt%1 == 0) {PackInit_DumbType_DumbInt_1_0(link); link->bs = nInt/1;} 669*b23bfdefSJunchao Zhang link->unitbytes = nbyte; 67040e23c03SJunchao Zhang link->basicunit = MPI_INT; 67140e23c03SJunchao Zhang } 67240e23c03SJunchao Zhang } 673*b23bfdefSJunchao Zhang 67440e23c03SJunchao Zhang if (link->isbuiltin) link->unit = unit; /* builtin datatypes are common. Make it fast */ 67540e23c03SJunchao Zhang else {ierr = MPI_Type_dup(unit,&link->unit);CHKERRQ(ierr);} 67640e23c03SJunchao Zhang PetscFunctionReturn(0); 67740e23c03SJunchao Zhang } 67840e23c03SJunchao Zhang 679*b23bfdefSJunchao Zhang PetscErrorCode PetscSFPackGetUnpackAndOp(PetscSF sf,PetscSFPack link,MPI_Op op,PetscErrorCode (**UnpackAndOp)(PetscInt,const PetscInt*,PetscInt,PetscSFPackOpt,void*,const void*)) 68040e23c03SJunchao Zhang { 68140e23c03SJunchao Zhang PetscFunctionBegin; 68240e23c03SJunchao Zhang *UnpackAndOp = NULL; 68340e23c03SJunchao Zhang if (op == MPIU_REPLACE) *UnpackAndOp = link->UnpackAndInsert; 68440e23c03SJunchao Zhang else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->UnpackAndAdd; 68540e23c03SJunchao Zhang else if (op == MPI_PROD) *UnpackAndOp = link->UnpackAndMult; 68640e23c03SJunchao Zhang else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->UnpackAndMax; 68740e23c03SJunchao Zhang else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->UnpackAndMin; 68840e23c03SJunchao Zhang else if (op == MPI_LAND) *UnpackAndOp = link->UnpackAndLAND; 68940e23c03SJunchao Zhang else if (op == MPI_BAND) *UnpackAndOp = link->UnpackAndBAND; 69040e23c03SJunchao Zhang else if (op == MPI_LOR) *UnpackAndOp = link->UnpackAndLOR; 69140e23c03SJunchao Zhang else if (op == MPI_BOR) *UnpackAndOp = link->UnpackAndBOR; 69240e23c03SJunchao Zhang else if (op == MPI_LXOR) *UnpackAndOp = link->UnpackAndLXOR; 69340e23c03SJunchao Zhang else if (op == MPI_BXOR) *UnpackAndOp = link->UnpackAndBXOR; 69440e23c03SJunchao Zhang else if (op == MPI_MAXLOC) *UnpackAndOp = link->UnpackAndMaxloc; 69540e23c03SJunchao Zhang else if (op == MPI_MINLOC) *UnpackAndOp = link->UnpackAndMinloc; 69640e23c03SJunchao Zhang else *UnpackAndOp = NULL; 69740e23c03SJunchao Zhang PetscFunctionReturn(0); 69840e23c03SJunchao Zhang } 69940e23c03SJunchao Zhang 700*b23bfdefSJunchao Zhang PetscErrorCode PetscSFPackGetFetchAndOp(PetscSF sf,PetscSFPack link,MPI_Op op,PetscErrorCode (**FetchAndOp)(PetscInt,const PetscInt*,PetscInt,PetscSFPackOpt,void*,void*)) 70140e23c03SJunchao Zhang { 70240e23c03SJunchao Zhang PetscFunctionBegin; 70340e23c03SJunchao Zhang *FetchAndOp = NULL; 70440e23c03SJunchao Zhang if (op == MPIU_REPLACE) *FetchAndOp = link->FetchAndInsert; 70540e23c03SJunchao Zhang else if (op == MPI_SUM || op == MPIU_SUM) *FetchAndOp = link->FetchAndAdd; 70640e23c03SJunchao Zhang else if (op == MPI_MAX || op == MPIU_MAX) *FetchAndOp = link->FetchAndMax; 70740e23c03SJunchao Zhang else if (op == MPI_MIN || op == MPIU_MIN) *FetchAndOp = link->FetchAndMin; 70840e23c03SJunchao Zhang else if (op == MPI_MAXLOC) *FetchAndOp = link->FetchAndMaxloc; 70940e23c03SJunchao Zhang else if (op == MPI_MINLOC) *FetchAndOp = link->FetchAndMinloc; 71040e23c03SJunchao Zhang else if (op == MPI_PROD) *FetchAndOp = link->FetchAndMult; 71140e23c03SJunchao Zhang else if (op == MPI_LAND) *FetchAndOp = link->FetchAndLAND; 71240e23c03SJunchao Zhang else if (op == MPI_BAND) *FetchAndOp = link->FetchAndBAND; 71340e23c03SJunchao Zhang else if (op == MPI_LOR) *FetchAndOp = link->FetchAndLOR; 71440e23c03SJunchao Zhang else if (op == MPI_BOR) *FetchAndOp = link->FetchAndBOR; 71540e23c03SJunchao Zhang else if (op == MPI_LXOR) *FetchAndOp = link->FetchAndLXOR; 71640e23c03SJunchao Zhang else if (op == MPI_BXOR) *FetchAndOp = link->FetchAndBXOR; 71740e23c03SJunchao Zhang else SETERRQ(PetscObjectComm((PetscObject)sf),PETSC_ERR_SUP,"No support for MPI_Op"); 71840e23c03SJunchao Zhang PetscFunctionReturn(0); 71940e23c03SJunchao Zhang } 72040e23c03SJunchao Zhang 72140e23c03SJunchao Zhang /* 72240e23c03SJunchao Zhang Setup pack/unpack optimization plans based on indice patterns available 72340e23c03SJunchao Zhang 72440e23c03SJunchao Zhang Input Parameters: 725*b23bfdefSJunchao Zhang + n - Number of target ranks 726*b23bfdefSJunchao Zhang . offset - [n+1] For the i-th rank, its associated indices are idx[offset[i], offset[i+1]). offset[0] is not necessarily 0. 727*b23bfdefSJunchao Zhang - idx - [*] Array storing indices 72840e23c03SJunchao Zhang 72940e23c03SJunchao Zhang Output Parameters: 73040e23c03SJunchao Zhang + opt - the optimization 73140e23c03SJunchao Zhang */ 73240e23c03SJunchao Zhang PetscErrorCode PetscSFPackSetupOptimization(PetscInt n,const PetscInt *offset,const PetscInt *idx,PetscSFPackOpt *out) 73340e23c03SJunchao Zhang { 73440e23c03SJunchao Zhang PetscErrorCode ierr; 73540e23c03SJunchao Zhang PetscInt i,j,k,n_copies,tot_copies=0,step; 736*b23bfdefSJunchao Zhang PetscBool strided,optimized=PETSC_FALSE; 73740e23c03SJunchao Zhang PetscSFPackOpt opt; 73840e23c03SJunchao Zhang 73940e23c03SJunchao Zhang PetscFunctionBegin; 740*b23bfdefSJunchao Zhang if (!n) { 741*b23bfdefSJunchao Zhang *out = NULL; 742*b23bfdefSJunchao Zhang PetscFunctionReturn(0); 743*b23bfdefSJunchao Zhang } 744*b23bfdefSJunchao Zhang 74540e23c03SJunchao Zhang ierr = PetscCalloc1(1,&opt);CHKERRQ(ierr); 746*b23bfdefSJunchao Zhang ierr = PetscCalloc3(n,&opt->type,n+1,&opt->offset,n+1,&opt->copy_offset);CHKERRQ(ierr); 747*b23bfdefSJunchao Zhang ierr = PetscArraycpy(opt->offset,offset,n+1);CHKERRQ(ierr); 748*b23bfdefSJunchao Zhang if (offset[0]) {for (i=0; i<n+1; i++) opt->offset[i] -= offset[0];} /* Zero-base offset[]. Note the packing routine is Pack(count, idx[], ...*/ 749*b23bfdefSJunchao Zhang 750*b23bfdefSJunchao Zhang opt->n = n; 75140e23c03SJunchao Zhang 75240e23c03SJunchao Zhang /* Check if the indices are piece-wise contiguous (if yes, we can optimize a packing with mulitple memcpy's ) */ 75340e23c03SJunchao Zhang for (i=0; i<n; i++) { /* for each target processor */ 75440e23c03SJunchao Zhang /* Scan indices to count n_copies -- the number of contiguous pieces for i-th target */ 75540e23c03SJunchao Zhang n_copies = 1; 75640e23c03SJunchao Zhang for (j=offset[i]; j<offset[i+1]-1; j++) { 75740e23c03SJunchao Zhang if (idx[j]+1 != idx[j+1]) n_copies++; 75840e23c03SJunchao Zhang } 75940e23c03SJunchao Zhang /* If the average length (in no. of indices) of contiguous pieces is long enough, say >=32, 76040e23c03SJunchao Zhang then it is worth using memcpy for this target. 32 is an arbitrarily chosen number. 76140e23c03SJunchao Zhang */ 76240e23c03SJunchao Zhang if ((offset[i+1]-offset[i])/n_copies >= 32) { 763*b23bfdefSJunchao Zhang opt->type[i] = PETSCSF_PACKOPT_MULTICOPY; 764*b23bfdefSJunchao Zhang optimized = PETSC_TRUE; 76540e23c03SJunchao Zhang tot_copies += n_copies; 76640e23c03SJunchao Zhang } 76740e23c03SJunchao Zhang } 76840e23c03SJunchao Zhang 76940e23c03SJunchao Zhang /* Setup memcpy plan for each contiguous piece */ 77040e23c03SJunchao Zhang k = 0; /* k-th copy */ 771*b23bfdefSJunchao Zhang ierr = PetscMalloc4(tot_copies,&opt->copy_start,tot_copies,&opt->copy_length,n,&opt->stride_step,n,&opt->stride_n);CHKERRQ(ierr); 772*b23bfdefSJunchao Zhang for (i=0; i<n; i++) { /* for each target processor */ 773*b23bfdefSJunchao Zhang if (opt->type[i] == PETSCSF_PACKOPT_MULTICOPY) { 77440e23c03SJunchao Zhang n_copies = 1; 775*b23bfdefSJunchao Zhang opt->copy_start[k] = offset[i] - offset[0]; 77640e23c03SJunchao Zhang for (j=offset[i]; j<offset[i+1]-1; j++) { 77740e23c03SJunchao Zhang if (idx[j]+1 != idx[j+1]) { /* meet end of a copy (and next copy must exist) */ 77840e23c03SJunchao Zhang n_copies++; 779*b23bfdefSJunchao Zhang opt->copy_start[k+1] = j-offset[0]+1; 780*b23bfdefSJunchao Zhang opt->copy_length[k] = opt->copy_start[k+1] - opt->copy_start[k]; 78140e23c03SJunchao Zhang k++; 78240e23c03SJunchao Zhang } 78340e23c03SJunchao Zhang } 78440e23c03SJunchao Zhang /* Set copy length of the last copy for this target */ 785*b23bfdefSJunchao Zhang opt->copy_length[k] = j-offset[0]+1 - opt->copy_start[k]; 78640e23c03SJunchao Zhang k++; 78740e23c03SJunchao Zhang } 788*b23bfdefSJunchao Zhang /* Set offset for next target. When opt->type[i]=PETSCSF_PACKOPT_NONE, copy_offsets[i]=copy_offsets[i+1] */ 78940e23c03SJunchao Zhang opt->copy_offset[i+1] = k; 79040e23c03SJunchao Zhang } 79140e23c03SJunchao Zhang 79240e23c03SJunchao Zhang /* Last chance! If the indices do not have long contiguous pieces, are they strided? */ 79340e23c03SJunchao Zhang for (i=0; i<n; i++) { /* for each remote */ 794*b23bfdefSJunchao Zhang if (opt->type[i]==PETSCSF_PACKOPT_NONE && (offset[i+1] - offset[i]) >= 16) { /* few indices (<16) are not worth striding */ 79540e23c03SJunchao Zhang strided = PETSC_TRUE; 79640e23c03SJunchao Zhang step = idx[offset[i]+1] - idx[offset[i]]; 79740e23c03SJunchao Zhang for (j=offset[i]; j<offset[i+1]-1; j++) { 79840e23c03SJunchao Zhang if (idx[j]+step != idx[j+1]) { strided = PETSC_FALSE; break; } 79940e23c03SJunchao Zhang } 80040e23c03SJunchao Zhang if (strided) { 801*b23bfdefSJunchao Zhang opt->type[i] = PETSCSF_PACKOPT_STRIDE; 80240e23c03SJunchao Zhang opt->stride_step[i] = step; 80340e23c03SJunchao Zhang opt->stride_n[i] = offset[i+1] - offset[i]; 804*b23bfdefSJunchao Zhang optimized = PETSC_TRUE; 80540e23c03SJunchao Zhang } 80640e23c03SJunchao Zhang } 80740e23c03SJunchao Zhang } 808*b23bfdefSJunchao Zhang /* If no rank gets optimized, free arrays to save memory */ 809*b23bfdefSJunchao Zhang if (!optimized) { 810*b23bfdefSJunchao Zhang ierr = PetscFree3(opt->type,opt->offset,opt->copy_offset);CHKERRQ(ierr); 811*b23bfdefSJunchao Zhang ierr = PetscFree4(opt->copy_start,opt->copy_length,opt->stride_step,opt->stride_n);CHKERRQ(ierr); 81240e23c03SJunchao Zhang ierr = PetscFree(opt);CHKERRQ(ierr); 81340e23c03SJunchao Zhang *out = NULL; 81440e23c03SJunchao Zhang } else *out = opt; 81540e23c03SJunchao Zhang PetscFunctionReturn(0); 81640e23c03SJunchao Zhang } 81740e23c03SJunchao Zhang 81840e23c03SJunchao Zhang PetscErrorCode PetscSFPackDestoryOptimization(PetscSFPackOpt *out) 81940e23c03SJunchao Zhang { 82040e23c03SJunchao Zhang PetscErrorCode ierr; 82140e23c03SJunchao Zhang PetscSFPackOpt opt = *out; 82240e23c03SJunchao Zhang 82340e23c03SJunchao Zhang PetscFunctionBegin; 82440e23c03SJunchao Zhang if (opt) { 825*b23bfdefSJunchao Zhang ierr = PetscFree3(opt->type,opt->offset,opt->copy_offset);CHKERRQ(ierr); 826*b23bfdefSJunchao Zhang ierr = PetscFree4(opt->copy_start,opt->copy_length,opt->stride_step,opt->stride_n);CHKERRQ(ierr); 82740e23c03SJunchao Zhang ierr = PetscFree(opt);CHKERRQ(ierr); 82840e23c03SJunchao Zhang *out = NULL; 82940e23c03SJunchao Zhang } 83040e23c03SJunchao Zhang PetscFunctionReturn(0); 83140e23c03SJunchao Zhang } 832