1*40e23c03SJunchao Zhang 2*40e23c03SJunchao Zhang #include <../src/vec/is/sf/impls/basic/sfpack.h> 3*40e23c03SJunchao Zhang #include <../src/vec/is/sf/impls/basic/sfbasic.h> 4*40e23c03SJunchao Zhang 5*40e23c03SJunchao Zhang /* 6*40e23c03SJunchao Zhang * MPI_Reduce_local is not really useful because it can't handle sparse data and it vectorizes "in the wrong direction", 7*40e23c03SJunchao Zhang * therefore we pack data types manually. This file defines packing routines for the standard data types. 8*40e23c03SJunchao Zhang */ 9*40e23c03SJunchao Zhang 10*40e23c03SJunchao Zhang #define CPPJoin2_exp(a,b) a ## b 11*40e23c03SJunchao Zhang #define CPPJoin2(a,b) CPPJoin2_exp(a,b) 12*40e23c03SJunchao Zhang #define CPPJoin3_exp_(a,b,c) a ## b ## _ ## c 13*40e23c03SJunchao Zhang #define CPPJoin3_(a,b,c) CPPJoin3_exp_(a,b,c) 14*40e23c03SJunchao Zhang 15*40e23c03SJunchao Zhang #define EXECUTE(statement) statement /* no braces since the statement might declare a variable; braces impose an unwanted scope */ 16*40e23c03SJunchao Zhang #define IGNORE(statement) do {} while(0) 17*40e23c03SJunchao Zhang 18*40e23c03SJunchao Zhang #define BINARY_OP(r,s,op,t) do {(r) = (s) op (t); } while(0) /* binary ops in the middle such as +, *, && etc. */ 19*40e23c03SJunchao Zhang #define FUNCTION_OP(r,s,op,t) do {(r) = op((s),(t)); } while(0) /* ops like a function, such as PetscMax, PetscMin */ 20*40e23c03SJunchao Zhang #define LXOR_OP(r,s,op,t) do {(r) = (!s) != (!t);} while(0) /* logical exclusive OR */ 21*40e23c03SJunchao Zhang #define PAIRTYPE_OP(r,s,op,t) do {(r).a = (s).a op (t).a; (r).b = (s).b op (t).b;} while(0) 22*40e23c03SJunchao Zhang 23*40e23c03SJunchao Zhang #define BlockType(type,count) CPPJoin3_(_blocktype_,type,count) /* typename for struct {type v[count];} */ 24*40e23c03SJunchao Zhang #define PairType(type1,type2) CPPJoin3_(_pairtype_,type1,type2) /* typename for struct {type1 a; type2 b;} */ 25*40e23c03SJunchao Zhang 26*40e23c03SJunchao Zhang /* DEF_PackFunc - macro defining a Pack routine 27*40e23c03SJunchao Zhang 28*40e23c03SJunchao Zhang Arguments of the macro: 29*40e23c03SJunchao Zhang +type Type of the basic data in an entry, i.e., int, PetscInt, PetscReal etc. It is not the type of an entry. 30*40e23c03SJunchao Zhang -BS Block size for vectorization. It is a factor of bs. 31*40e23c03SJunchao Zhang 32*40e23c03SJunchao Zhang Arguments of the Pack routine: 33*40e23c03SJunchao Zhang +n Number of entries to pack. Each entry is of type 'unit'. Here the unit is the arg used in calls like PetscSFBcastBegin(sf,unit,..). 34*40e23c03SJunchao Zhang If idx in not NULL, then n also indicates the number of indices in idx[] 35*40e23c03SJunchao Zhang .bs Number of basic types in an entry. Ex. if unit is MPI_2INT, then bs=2 and the basic type is int 36*40e23c03SJunchao Zhang .idx Indices of entries. NULL means contiguous indices [0,n) 37*40e23c03SJunchao Zhang .r Do packing for the r-th target processor 38*40e23c03SJunchao Zhang .opt Pack optimization plans. NULL means no plan. 39*40e23c03SJunchao Zhang .unpacked Address of the unpacked data 40*40e23c03SJunchao Zhang -packed Address of the packed data 41*40e23c03SJunchao Zhang */ 42*40e23c03SJunchao Zhang #define DEF_PackFunc(type,BS) \ 43*40e23c03SJunchao Zhang static PetscErrorCode CPPJoin3_(Pack_,type,BS)(PetscInt n,PetscInt bs,const PetscInt *idx,PetscInt r,PetscSFPackOpt opt,const void *unpacked,void *packed) { \ 44*40e23c03SJunchao Zhang PetscErrorCode ierr; \ 45*40e23c03SJunchao Zhang const type *u = (const type*)unpacked,*u2; \ 46*40e23c03SJunchao Zhang type *p = (type*)packed; \ 47*40e23c03SJunchao Zhang PetscInt i,j,k,l,step; \ 48*40e23c03SJunchao Zhang PetscFunctionBegin; \ 49*40e23c03SJunchao Zhang if (!idx) { /* idx[] is contiguous */ \ 50*40e23c03SJunchao Zhang ierr = PetscMemcpy(p,u,sizeof(type)*bs*n);CHKERRQ(ierr); \ 51*40e23c03SJunchao Zhang } else if (!opt || !opt->optimized[r]) { /* idx[] is not optimized*/ \ 52*40e23c03SJunchao Zhang for (i=0; i<n; i++) \ 53*40e23c03SJunchao Zhang for (j=0; j<bs; j+=BS) \ 54*40e23c03SJunchao Zhang for (k=j; k<j+BS; k++) \ 55*40e23c03SJunchao Zhang p[i*bs+k] = u[idx[i]*bs+k]; \ 56*40e23c03SJunchao Zhang } else { /* idx[] is optimized*/ \ 57*40e23c03SJunchao Zhang if (opt->copy_offset[r] != opt->copy_offset[r+1]) { /* idx[] is piece-wise contiguous */ \ 58*40e23c03SJunchao Zhang for (i=opt->copy_offset[r]; i<opt->copy_offset[r+1]; i++) { \ 59*40e23c03SJunchao Zhang l = opt->copy_length[i]*bs; /* length in types */ \ 60*40e23c03SJunchao Zhang u2 = u + opt->copy_start[i]*bs; \ 61*40e23c03SJunchao Zhang ierr = PetscMemcpy(p,u2,l*sizeof(type));CHKERRQ(ierr); \ 62*40e23c03SJunchao Zhang p += l; \ 63*40e23c03SJunchao Zhang } \ 64*40e23c03SJunchao Zhang } else { /* idx[] is strided */ \ 65*40e23c03SJunchao Zhang u += opt->stride_first[r]*bs; \ 66*40e23c03SJunchao Zhang step = opt->stride_step[r]; \ 67*40e23c03SJunchao Zhang for (i=0; i<opt->stride_n[r]; i++) \ 68*40e23c03SJunchao Zhang for (j=0; j<bs; j++) \ 69*40e23c03SJunchao Zhang p[i*bs+j] = u[i*step*bs+j]; \ 70*40e23c03SJunchao Zhang } \ 71*40e23c03SJunchao Zhang } \ 72*40e23c03SJunchao Zhang PetscFunctionReturn(0); \ 73*40e23c03SJunchao Zhang } 74*40e23c03SJunchao Zhang 75*40e23c03SJunchao Zhang /* DEF_Action - macro defining a Unpack(Fetch)AndInsert routine 76*40e23c03SJunchao Zhang 77*40e23c03SJunchao Zhang Arguments: 78*40e23c03SJunchao Zhang +action Unpack or Fetch 79*40e23c03SJunchao Zhang .type Type of the data 80*40e23c03SJunchao Zhang .BS Block size for vectorization 81*40e23c03SJunchao Zhang .FILTER Macro defining what to do with a statement, either EXECUTE or IGNORE 82*40e23c03SJunchao Zhang .ctype Type with or without the const qualifier, i.e., const type or type 83*40e23c03SJunchao Zhang .cvoid void with or without the const qualifier, i.e., const void or void 84*40e23c03SJunchao Zhang 85*40e23c03SJunchao Zhang Notes: 86*40e23c03SJunchao Zhang This macro is not combined with DEF_ActionAndOp because we want to use memcpy in this macro. 87*40e23c03SJunchao Zhang The two arguments ctype and cvoid are used (instead of one constness argument), because we want to 88*40e23c03SJunchao Zhang get rid of compilation warning "empty macro arguments are undefined in ISO C90". With one constness argument, 89*40e23c03SJunchao Zhang sometimes we input 'const', sometimes we have to input empty. 90*40e23c03SJunchao Zhang */ 91*40e23c03SJunchao Zhang #define DEF_Action(action,type,BS,FILTER,ctype,cvoid) \ 92*40e23c03SJunchao Zhang static PetscErrorCode CPPJoin3_(action##AndInsert_,type,BS)(PetscInt n,PetscInt bs,const PetscInt *idx,PetscInt r,PetscSFPackOpt opt,void *unpacked,cvoid *packed) { \ 93*40e23c03SJunchao Zhang PetscErrorCode ierr; \ 94*40e23c03SJunchao Zhang type *u = (type*)unpacked,*u2; \ 95*40e23c03SJunchao Zhang ctype *p = (ctype*)packed; \ 96*40e23c03SJunchao Zhang PetscInt i,j,k,l,step; \ 97*40e23c03SJunchao Zhang PetscFunctionBegin; \ 98*40e23c03SJunchao Zhang if (!idx) { /* idx[] is contiguous */ \ 99*40e23c03SJunchao Zhang FILTER(type *v); \ 100*40e23c03SJunchao Zhang FILTER(ierr = PetscMalloc1(bs*n,&v);CHKERRQ(ierr)); \ 101*40e23c03SJunchao Zhang FILTER(ierr = PetscMemcpy(v,u,sizeof(type)*bs*n);CHKERRQ(ierr)); \ 102*40e23c03SJunchao Zhang ierr = PetscMemcpy(u,p,sizeof(type)*bs*n);CHKERRQ(ierr); \ 103*40e23c03SJunchao Zhang FILTER(ierr = PetscMemcpy(p,v,sizeof(type)*bs*n);CHKERRQ(ierr)); \ 104*40e23c03SJunchao Zhang FILTER(ierr = PetscFree(v);CHKERRQ(ierr)); \ 105*40e23c03SJunchao Zhang } else if (!opt || !opt->optimized[r]) { /* idx[] is not optimized*/ \ 106*40e23c03SJunchao Zhang for (i=0; i<n; i++) { \ 107*40e23c03SJunchao Zhang for (j=0; j<bs; j+=BS) { \ 108*40e23c03SJunchao Zhang for (k=j; k<j+BS; k++) { \ 109*40e23c03SJunchao Zhang FILTER(type t = u[idx[i]*bs+k]); \ 110*40e23c03SJunchao Zhang u[idx[i]*bs+k] = p[i*bs+k]; \ 111*40e23c03SJunchao Zhang FILTER(p[i*bs+k] = t); \ 112*40e23c03SJunchao Zhang } \ 113*40e23c03SJunchao Zhang } \ 114*40e23c03SJunchao Zhang } \ 115*40e23c03SJunchao Zhang } else { /* idx[] is optimized*/ \ 116*40e23c03SJunchao Zhang if (opt->copy_offset[r] != opt->copy_offset[r+1]) { /* idx[] is piece-wise contiguous */ \ 117*40e23c03SJunchao Zhang FILTER(type *v); \ 118*40e23c03SJunchao Zhang FILTER(ierr = PetscMalloc1(bs*n,&v);CHKERRQ(ierr)); /* maximal buffer */ \ 119*40e23c03SJunchao Zhang for (i=opt->copy_offset[r]; i<opt->copy_offset[r+1]; i++) { /* i-th piece */ \ 120*40e23c03SJunchao Zhang l = opt->copy_length[i]*bs; /* length in types */ \ 121*40e23c03SJunchao Zhang u2 = u + opt->copy_start[i]*bs; \ 122*40e23c03SJunchao Zhang FILTER(ierr = PetscMemcpy(v,u2,l*sizeof(type));CHKERRQ(ierr)); \ 123*40e23c03SJunchao Zhang ierr = PetscMemcpy(u2,p,l*sizeof(type));CHKERRQ(ierr); \ 124*40e23c03SJunchao Zhang FILTER(ierr = PetscMemcpy(p,v,l*sizeof(type));CHKERRQ(ierr)); \ 125*40e23c03SJunchao Zhang p += l; \ 126*40e23c03SJunchao Zhang } \ 127*40e23c03SJunchao Zhang FILTER(ierr = PetscFree(v);CHKERRQ(ierr)); \ 128*40e23c03SJunchao Zhang } else { /* idx[] is strided */ \ 129*40e23c03SJunchao Zhang u += opt->stride_first[r]*bs; \ 130*40e23c03SJunchao Zhang step = opt->stride_step[r]; \ 131*40e23c03SJunchao Zhang for (i=0; i<opt->stride_n[r]; i++) \ 132*40e23c03SJunchao Zhang for (j=0; j<bs; j++) { \ 133*40e23c03SJunchao Zhang FILTER(type t = u[i*step*bs+j]); \ 134*40e23c03SJunchao Zhang u[i*step*bs+j] = p[i*bs+j]; \ 135*40e23c03SJunchao Zhang FILTER(p[i*bs+j] = t); \ 136*40e23c03SJunchao Zhang } \ 137*40e23c03SJunchao Zhang } \ 138*40e23c03SJunchao Zhang } \ 139*40e23c03SJunchao Zhang PetscFunctionReturn(0); \ 140*40e23c03SJunchao Zhang } 141*40e23c03SJunchao Zhang 142*40e23c03SJunchao Zhang /* DEF_ActionAndOp - macro defining a Unpack(Fetch)AndOp routine. Op can not be Insert, Maxloc or Minloc 143*40e23c03SJunchao Zhang 144*40e23c03SJunchao Zhang Arguments: 145*40e23c03SJunchao Zhang +action Unpack or Fetch 146*40e23c03SJunchao Zhang .opname Name of the Op, such as Add, Mult, LAND, etc. 147*40e23c03SJunchao Zhang .type Type of the data 148*40e23c03SJunchao Zhang .BS Block size for vectorization 149*40e23c03SJunchao Zhang .op Operator for the op, such as +, *, &&, ||, PetscMax, PetscMin, etc. 150*40e23c03SJunchao Zhang .APPLY Macro defining application of the op. Could be BINARY_OP, FUNCTION_OP, LXOR_OP or PAIRTYPE_OP 151*40e23c03SJunchao Zhang .FILTER Macro defining what to do with a statement, either EXECUTE or IGNORE 152*40e23c03SJunchao Zhang .ctype Type with or without the const qualifier, i.e., const type or type 153*40e23c03SJunchao Zhang -cvoid void with or without the const qualifier, i.e., const void or void 154*40e23c03SJunchao Zhang */ 155*40e23c03SJunchao Zhang #define DEF_ActionAndOp(action,opname,type,BS,op,APPLY,FILTER,ctype,cvoid) \ 156*40e23c03SJunchao Zhang static PetscErrorCode CPPJoin3_(action##And##opname##_,type,BS)(PetscInt n,PetscInt bs,const PetscInt *idx,PetscInt r,PetscSFPackOpt opt,void *unpacked,cvoid *packed) { \ 157*40e23c03SJunchao Zhang type *u = (type*)unpacked,*u2,t; \ 158*40e23c03SJunchao Zhang ctype *p = (ctype*)packed; \ 159*40e23c03SJunchao Zhang PetscInt i,j,k,l,step; \ 160*40e23c03SJunchao Zhang PetscFunctionBegin; \ 161*40e23c03SJunchao Zhang if (!idx) { /* idx[] is contiguous */ \ 162*40e23c03SJunchao Zhang for (i=0; i<n*bs; i++) { \ 163*40e23c03SJunchao Zhang t = u[i]; \ 164*40e23c03SJunchao Zhang APPLY(u[i],t,op,p[i]); \ 165*40e23c03SJunchao Zhang FILTER(p[i] = t); \ 166*40e23c03SJunchao Zhang } \ 167*40e23c03SJunchao Zhang } else if (!opt || !opt->optimized[r]) { /* idx[] is not optimized*/ \ 168*40e23c03SJunchao Zhang for (i=0; i<n; i++) { \ 169*40e23c03SJunchao Zhang for (j=0; j<bs; j+=BS) { \ 170*40e23c03SJunchao Zhang for (k=j; k<j+BS; k++) { \ 171*40e23c03SJunchao Zhang t = u[idx[i]*bs+k]; \ 172*40e23c03SJunchao Zhang APPLY(u[idx[i]*bs+k],t,op,p[i*bs+k]); \ 173*40e23c03SJunchao Zhang FILTER(p[i*bs+k] = t); \ 174*40e23c03SJunchao Zhang } \ 175*40e23c03SJunchao Zhang } \ 176*40e23c03SJunchao Zhang } \ 177*40e23c03SJunchao Zhang } else { /* idx[] is optimized*/ \ 178*40e23c03SJunchao Zhang if (opt->copy_offset[r] != opt->copy_offset[r+1]) { /* idx[] is piece-wise contiguous */ \ 179*40e23c03SJunchao Zhang for (i=opt->copy_offset[r]; i<opt->copy_offset[r+1]; i++) { /* i-th piece */ \ 180*40e23c03SJunchao Zhang l = opt->copy_length[i]*bs; /* length in types */ \ 181*40e23c03SJunchao Zhang u2 = u + opt->copy_start[i]*bs; \ 182*40e23c03SJunchao Zhang for (j=0; j<l; j++) { \ 183*40e23c03SJunchao Zhang t = u2[j]; \ 184*40e23c03SJunchao Zhang APPLY(u2[j],t,op,p[j]); \ 185*40e23c03SJunchao Zhang FILTER(p[j] = t); \ 186*40e23c03SJunchao Zhang } \ 187*40e23c03SJunchao Zhang p += l; \ 188*40e23c03SJunchao Zhang } \ 189*40e23c03SJunchao Zhang } else { /* idx[] is strided */ \ 190*40e23c03SJunchao Zhang u += opt->stride_first[r]*bs; \ 191*40e23c03SJunchao Zhang step = opt->stride_step[r]; \ 192*40e23c03SJunchao Zhang for (i=0; i<opt->stride_n[r]; i++) \ 193*40e23c03SJunchao Zhang for (j=0; j<bs; j++) { \ 194*40e23c03SJunchao Zhang t = u[i*step*bs+j]; \ 195*40e23c03SJunchao Zhang APPLY(u[i*step*bs+j],t,op,p[i*bs+j]); \ 196*40e23c03SJunchao Zhang FILTER(p[i*bs+j] = t); \ 197*40e23c03SJunchao Zhang } \ 198*40e23c03SJunchao Zhang } \ 199*40e23c03SJunchao Zhang } \ 200*40e23c03SJunchao Zhang PetscFunctionReturn(0); \ 201*40e23c03SJunchao Zhang } 202*40e23c03SJunchao Zhang 203*40e23c03SJunchao Zhang /* DEF_ActionAndXloc - macro defining a Unpack(Fetch)AndMaxloc(Minloc) routine 204*40e23c03SJunchao Zhang 205*40e23c03SJunchao Zhang Arguments: 206*40e23c03SJunchao Zhang +Action Unpack or Fetch 207*40e23c03SJunchao Zhang .locname Max or Min 208*40e23c03SJunchao Zhang .type1 Type of the first data in a pair type 209*40e23c03SJunchao Zhang .type2 Type of the second data in a pair type, usually PetscMPIInt for MPI ranks. 210*40e23c03SJunchao Zhang .op > or < 211*40e23c03SJunchao Zhang .FILTER Macro defining what to do with a statement, either EXECUTE or IGNORE 212*40e23c03SJunchao Zhang .ctype Type with or without the const qualifier, i.e., const PairType(type1,type2) or PairType(type1,type2) 213*40e23c03SJunchao Zhang -cvoid void with or without the const qualifier, i.e., const void or void 214*40e23c03SJunchao Zhang */ 215*40e23c03SJunchao Zhang #define DEF_ActionAndXloc(action,locname,type1,type2,op,FILTER,ctype,cvoid) \ 216*40e23c03SJunchao Zhang static PetscErrorCode CPPJoin3_(action##And##locname##loc_,PairType(type1,type2),1)(PetscInt n,PetscInt bs,const PetscInt *idx,PetscInt r,PetscSFPackOpt opt,void *unpacked,cvoid *packed) { \ 217*40e23c03SJunchao Zhang PairType(type1,type2) *u = (PairType(type1,type2)*)unpacked; \ 218*40e23c03SJunchao Zhang ctype *p = (ctype*)packed; \ 219*40e23c03SJunchao Zhang PetscInt i; \ 220*40e23c03SJunchao Zhang for (i=0; i<n; i++) { \ 221*40e23c03SJunchao Zhang PetscInt j = idx[i]; \ 222*40e23c03SJunchao Zhang FILTER(PairType(type1,type2) v = u[j]); \ 223*40e23c03SJunchao Zhang if (p[i].a op u[j].a) { \ 224*40e23c03SJunchao Zhang u[j] = p[i]; \ 225*40e23c03SJunchao Zhang } else if (p[i].a == u[j].a) { \ 226*40e23c03SJunchao Zhang u[j].b = PetscMin(u[j].b,p[i].b); /* Minimal rank. Ref MPI MAXLOC */ \ 227*40e23c03SJunchao Zhang } \ 228*40e23c03SJunchao Zhang FILTER(p[i] = v); \ 229*40e23c03SJunchao Zhang } \ 230*40e23c03SJunchao Zhang PetscFunctionReturn(0); \ 231*40e23c03SJunchao Zhang } 232*40e23c03SJunchao Zhang 233*40e23c03SJunchao Zhang 234*40e23c03SJunchao Zhang /* Pack/unpack/fetch ops for all types */ 235*40e23c03SJunchao Zhang #define DEF_PackNoInit(type,BS) \ 236*40e23c03SJunchao Zhang DEF_PackFunc(type,BS) \ 237*40e23c03SJunchao Zhang DEF_Action(Unpack,type,BS,IGNORE,const type,const void) \ 238*40e23c03SJunchao Zhang DEF_Action(Fetch, type,BS,EXECUTE,type,void) \ 239*40e23c03SJunchao Zhang 240*40e23c03SJunchao Zhang 241*40e23c03SJunchao Zhang /* Extra addition ops for types supporting them */ 242*40e23c03SJunchao Zhang #define DEF_PackAddNoInit(type,BS) \ 243*40e23c03SJunchao Zhang DEF_PackNoInit(type,BS) \ 244*40e23c03SJunchao Zhang DEF_ActionAndOp(Unpack,Add, type,BS,+,BINARY_OP,IGNORE,const type,const void) \ 245*40e23c03SJunchao Zhang DEF_ActionAndOp(Unpack,Mult,type,BS,*,BINARY_OP,IGNORE,const type,const void) \ 246*40e23c03SJunchao Zhang DEF_ActionAndOp(Fetch, Add, type,BS,+,BINARY_OP,EXECUTE,type,void) \ 247*40e23c03SJunchao Zhang DEF_ActionAndOp(Fetch, Mult,type,BS,*,BINARY_OP,EXECUTE,type,void) 248*40e23c03SJunchao Zhang 249*40e23c03SJunchao Zhang /* Basic types */ 250*40e23c03SJunchao Zhang #define DEF_Pack(type,BS) \ 251*40e23c03SJunchao Zhang DEF_PackAddNoInit(type,BS) \ 252*40e23c03SJunchao Zhang static void CPPJoin3_(PackInit_,type,BS)(PetscSFPack link) { \ 253*40e23c03SJunchao Zhang link->Pack = CPPJoin3_(Pack_, type,BS); \ 254*40e23c03SJunchao Zhang link->UnpackAndInsert = CPPJoin3_(UnpackAndInsert_,type,BS); \ 255*40e23c03SJunchao Zhang link->UnpackAndAdd = CPPJoin3_(UnpackAndAdd_, type,BS); \ 256*40e23c03SJunchao Zhang link->UnpackAndMult = CPPJoin3_(UnpackAndMult_, type,BS); \ 257*40e23c03SJunchao Zhang link->FetchAndInsert = CPPJoin3_(FetchAndInsert_, type,BS); \ 258*40e23c03SJunchao Zhang link->FetchAndAdd = CPPJoin3_(FetchAndAdd_, type,BS); \ 259*40e23c03SJunchao Zhang link->FetchAndMult = CPPJoin3_(FetchAndMult_, type,BS); \ 260*40e23c03SJunchao Zhang link->unitbytes = sizeof(type); \ 261*40e23c03SJunchao Zhang } 262*40e23c03SJunchao Zhang 263*40e23c03SJunchao Zhang /* Comparable types */ 264*40e23c03SJunchao Zhang #define DEF_PackCmp(type) \ 265*40e23c03SJunchao Zhang DEF_PackAddNoInit(type,1) \ 266*40e23c03SJunchao Zhang DEF_ActionAndOp(Unpack,Max,type,1,PetscMax,FUNCTION_OP,IGNORE,const type,const void) \ 267*40e23c03SJunchao Zhang DEF_ActionAndOp(Unpack,Min,type,1,PetscMin,FUNCTION_OP,IGNORE,const type,const void) \ 268*40e23c03SJunchao Zhang DEF_ActionAndOp(Fetch, Max,type,1,PetscMax,FUNCTION_OP,EXECUTE,type,void) \ 269*40e23c03SJunchao Zhang DEF_ActionAndOp(Fetch, Min,type,1,PetscMin,FUNCTION_OP,EXECUTE,type,void) \ 270*40e23c03SJunchao Zhang static void CPPJoin2(PackInit_,type)(PetscSFPack link) { \ 271*40e23c03SJunchao Zhang link->Pack = CPPJoin3_(Pack_, type,1); \ 272*40e23c03SJunchao Zhang link->UnpackAndInsert = CPPJoin3_(UnpackAndInsert_,type,1); \ 273*40e23c03SJunchao Zhang link->UnpackAndAdd = CPPJoin3_(UnpackAndAdd_, type,1); \ 274*40e23c03SJunchao Zhang link->UnpackAndMult = CPPJoin3_(UnpackAndMult_, type,1); \ 275*40e23c03SJunchao Zhang link->UnpackAndMax = CPPJoin3_(UnpackAndMax_, type,1); \ 276*40e23c03SJunchao Zhang link->UnpackAndMin = CPPJoin3_(UnpackAndMin_, type,1); \ 277*40e23c03SJunchao Zhang link->FetchAndInsert = CPPJoin3_(FetchAndInsert_, type,1); \ 278*40e23c03SJunchao Zhang link->FetchAndAdd = CPPJoin3_(FetchAndAdd_ , type,1); \ 279*40e23c03SJunchao Zhang link->FetchAndMult = CPPJoin3_(FetchAndMult_, type,1); \ 280*40e23c03SJunchao Zhang link->FetchAndMax = CPPJoin3_(FetchAndMax_ , type,1); \ 281*40e23c03SJunchao Zhang link->FetchAndMin = CPPJoin3_(FetchAndMin_ , type,1); \ 282*40e23c03SJunchao Zhang link->unitbytes = sizeof(type); \ 283*40e23c03SJunchao Zhang } 284*40e23c03SJunchao Zhang 285*40e23c03SJunchao Zhang /* Logical Types */ 286*40e23c03SJunchao Zhang /* The operator in LXOR_OP should be empty but is &. It is not used. Put here to avoid 287*40e23c03SJunchao Zhang the compilation warning "empty macro arguments are undefined in ISO C90" 288*40e23c03SJunchao Zhang */ 289*40e23c03SJunchao Zhang #define DEF_PackLog(type) \ 290*40e23c03SJunchao Zhang DEF_ActionAndOp(Unpack,LAND,type,1,&&,BINARY_OP,IGNORE,const type,const void) \ 291*40e23c03SJunchao Zhang DEF_ActionAndOp(Unpack,LOR, type,1,||,BINARY_OP,IGNORE,const type,const void) \ 292*40e23c03SJunchao Zhang DEF_ActionAndOp(Unpack,LXOR,type,1,&, LXOR_OP, IGNORE,const type,const void) \ 293*40e23c03SJunchao Zhang DEF_ActionAndOp(Fetch, LAND,type,1,&&,BINARY_OP,EXECUTE,type,void) \ 294*40e23c03SJunchao Zhang DEF_ActionAndOp(Fetch, LOR, type,1,||,BINARY_OP,EXECUTE,type,void) \ 295*40e23c03SJunchao Zhang DEF_ActionAndOp(Fetch, LXOR,type,1,&, LXOR_OP, EXECUTE,type,void) \ 296*40e23c03SJunchao Zhang static void CPPJoin2(PackInit_Logical_,type)(PetscSFPack link) { \ 297*40e23c03SJunchao Zhang link->UnpackAndLAND = CPPJoin3_(UnpackAndLAND_,type,1); \ 298*40e23c03SJunchao Zhang link->UnpackAndLOR = CPPJoin3_(UnpackAndLOR_, type,1); \ 299*40e23c03SJunchao Zhang link->UnpackAndLXOR = CPPJoin3_(UnpackAndLXOR_,type,1); \ 300*40e23c03SJunchao Zhang link->FetchAndLAND = CPPJoin3_(FetchAndLAND_, type,1); \ 301*40e23c03SJunchao Zhang link->FetchAndLOR = CPPJoin3_(FetchAndLOR_, type,1); \ 302*40e23c03SJunchao Zhang link->FetchAndLXOR = CPPJoin3_(FetchAndLXOR_, type,1); \ 303*40e23c03SJunchao Zhang } 304*40e23c03SJunchao Zhang 305*40e23c03SJunchao Zhang 306*40e23c03SJunchao Zhang /* Bitwise Types */ 307*40e23c03SJunchao Zhang #define DEF_PackBit(type) \ 308*40e23c03SJunchao Zhang DEF_ActionAndOp(Unpack,BAND,type,1,&,BINARY_OP,IGNORE,const type,const void) \ 309*40e23c03SJunchao Zhang DEF_ActionAndOp(Unpack,BOR, type,1,|,BINARY_OP,IGNORE,const type,const void) \ 310*40e23c03SJunchao Zhang DEF_ActionAndOp(Unpack,BXOR,type,1,^,BINARY_OP,IGNORE,const type,const void) \ 311*40e23c03SJunchao Zhang DEF_ActionAndOp(Fetch, BAND,type,1,&,BINARY_OP,EXECUTE,type,void) \ 312*40e23c03SJunchao Zhang DEF_ActionAndOp(Fetch, BOR, type,1,|,BINARY_OP,EXECUTE,type,void) \ 313*40e23c03SJunchao Zhang DEF_ActionAndOp(Fetch, BXOR,type,1,^,BINARY_OP,EXECUTE,type,void) \ 314*40e23c03SJunchao Zhang static void CPPJoin2(PackInit_Bitwise_,type)(PetscSFPack link) { \ 315*40e23c03SJunchao Zhang link->UnpackAndBAND = CPPJoin3_(UnpackAndBAND_,type,1); \ 316*40e23c03SJunchao Zhang link->UnpackAndBOR = CPPJoin3_(UnpackAndBOR_, type,1); \ 317*40e23c03SJunchao Zhang link->UnpackAndBXOR = CPPJoin3_(UnpackAndBXOR_,type,1); \ 318*40e23c03SJunchao Zhang link->FetchAndBAND = CPPJoin3_(FetchAndBAND_, type,1); \ 319*40e23c03SJunchao Zhang link->FetchAndBOR = CPPJoin3_(FetchAndBOR_, type,1); \ 320*40e23c03SJunchao Zhang link->FetchAndBXOR = CPPJoin3_(FetchAndBXOR_, type,1); \ 321*40e23c03SJunchao Zhang } 322*40e23c03SJunchao Zhang 323*40e23c03SJunchao Zhang 324*40e23c03SJunchao Zhang /* Pair types */ 325*40e23c03SJunchao Zhang #define DEF_PackPair(type1,type2) \ 326*40e23c03SJunchao Zhang typedef struct {type1 a; type2 b;} PairType(type1,type2); \ 327*40e23c03SJunchao Zhang DEF_PackFunc(PairType(type1,type2),1) \ 328*40e23c03SJunchao Zhang DEF_Action(Unpack,PairType(type1,type2),1,IGNORE,const PairType(type1,type2),const void) \ 329*40e23c03SJunchao Zhang DEF_Action(Fetch, PairType(type1,type2),1,EXECUTE,PairType(type1,type2),void) \ 330*40e23c03SJunchao Zhang DEF_ActionAndOp(Unpack,Add,PairType(type1,type2),1,+,PAIRTYPE_OP,IGNORE,const PairType(type1,type2),const void) \ 331*40e23c03SJunchao Zhang DEF_ActionAndOp(Fetch, Add,PairType(type1,type2),1,+,PAIRTYPE_OP,EXECUTE,PairType(type1,type2),void) \ 332*40e23c03SJunchao Zhang DEF_ActionAndXloc(Unpack,Max,type1,type2,>,IGNORE,const PairType(type1,type2),const void) \ 333*40e23c03SJunchao Zhang DEF_ActionAndXloc(Unpack,Min,type1,type2,<,IGNORE,const PairType(type1,type2),const void) \ 334*40e23c03SJunchao Zhang DEF_ActionAndXloc(Fetch, Max,type1,type2,>,EXECUTE,PairType(type1,type2),void) \ 335*40e23c03SJunchao Zhang DEF_ActionAndXloc(Fetch, Min,type1,type2,<,EXECUTE,PairType(type1,type2),void) \ 336*40e23c03SJunchao Zhang static void CPPJoin3_(PackInit_,type1,type2)(PetscSFPack link) { \ 337*40e23c03SJunchao Zhang link->Pack = CPPJoin3_(Pack_, PairType(type1,type2),1); \ 338*40e23c03SJunchao Zhang link->UnpackAndInsert = CPPJoin3_(UnpackAndInsert_,PairType(type1,type2),1); \ 339*40e23c03SJunchao Zhang link->UnpackAndAdd = CPPJoin3_(UnpackAndAdd_, PairType(type1,type2),1); \ 340*40e23c03SJunchao Zhang link->UnpackAndMaxloc = CPPJoin3_(UnpackAndMaxloc_,PairType(type1,type2),1); \ 341*40e23c03SJunchao Zhang link->UnpackAndMinloc = CPPJoin3_(UnpackAndMinloc_,PairType(type1,type2),1); \ 342*40e23c03SJunchao Zhang link->FetchAndInsert = CPPJoin3_(FetchAndInsert_, PairType(type1,type2),1); \ 343*40e23c03SJunchao Zhang link->FetchAndAdd = CPPJoin3_(FetchAndAdd_, PairType(type1,type2),1); \ 344*40e23c03SJunchao Zhang link->FetchAndMaxloc = CPPJoin3_(FetchAndMaxloc_, PairType(type1,type2),1); \ 345*40e23c03SJunchao Zhang link->FetchAndMinloc = CPPJoin3_(FetchAndMinloc_, PairType(type1,type2),1); \ 346*40e23c03SJunchao Zhang link->unitbytes = sizeof(PairType(type1,type2)); \ 347*40e23c03SJunchao Zhang } 348*40e23c03SJunchao Zhang 349*40e23c03SJunchao Zhang 350*40e23c03SJunchao Zhang /* Currently only dumb blocks of data */ 351*40e23c03SJunchao Zhang #define DEF_Block(type,count) \ 352*40e23c03SJunchao Zhang typedef struct {type v[count];} BlockType(type,count); \ 353*40e23c03SJunchao Zhang DEF_PackNoInit(BlockType(type,count),1) \ 354*40e23c03SJunchao Zhang static void CPPJoin3_(PackInit_block_,type,count)(PetscSFPack link) { \ 355*40e23c03SJunchao Zhang link->Pack = CPPJoin3_(Pack_, BlockType(type,count),1); \ 356*40e23c03SJunchao Zhang link->UnpackAndInsert = CPPJoin3_(UnpackAndInsert_,BlockType(type,count),1); \ 357*40e23c03SJunchao Zhang link->FetchAndInsert = CPPJoin3_(FetchAndInsert_, BlockType(type,count),1); \ 358*40e23c03SJunchao Zhang link->unitbytes = sizeof(BlockType(type,count)); \ 359*40e23c03SJunchao Zhang } 360*40e23c03SJunchao Zhang 361*40e23c03SJunchao Zhang /* The typedef is used to get a typename without space that CPPJoin can handle */ 362*40e23c03SJunchao Zhang typedef signed char SignedChar; 363*40e23c03SJunchao Zhang typedef unsigned char UnsignedChar; 364*40e23c03SJunchao Zhang 365*40e23c03SJunchao Zhang DEF_PackCmp(SignedChar) 366*40e23c03SJunchao Zhang DEF_PackBit(SignedChar) 367*40e23c03SJunchao Zhang DEF_PackLog(SignedChar) 368*40e23c03SJunchao Zhang DEF_PackCmp(UnsignedChar) 369*40e23c03SJunchao Zhang DEF_PackBit(UnsignedChar) 370*40e23c03SJunchao Zhang DEF_PackLog(UnsignedChar) 371*40e23c03SJunchao Zhang DEF_PackCmp(int) 372*40e23c03SJunchao Zhang DEF_PackBit(int) 373*40e23c03SJunchao Zhang DEF_PackLog(int) 374*40e23c03SJunchao Zhang DEF_PackCmp(PetscInt) 375*40e23c03SJunchao Zhang DEF_PackBit(PetscInt) 376*40e23c03SJunchao Zhang DEF_PackLog(PetscInt) 377*40e23c03SJunchao Zhang DEF_Pack(PetscInt,2) 378*40e23c03SJunchao Zhang DEF_Pack(PetscInt,3) 379*40e23c03SJunchao Zhang DEF_Pack(PetscInt,4) 380*40e23c03SJunchao Zhang DEF_Pack(PetscInt,5) 381*40e23c03SJunchao Zhang DEF_Pack(PetscInt,7) 382*40e23c03SJunchao Zhang DEF_PackCmp(PetscReal) 383*40e23c03SJunchao Zhang DEF_PackLog(PetscReal) 384*40e23c03SJunchao Zhang DEF_Pack(PetscReal,2) 385*40e23c03SJunchao Zhang DEF_Pack(PetscReal,3) 386*40e23c03SJunchao Zhang DEF_Pack(PetscReal,4) 387*40e23c03SJunchao Zhang DEF_Pack(PetscReal,5) 388*40e23c03SJunchao Zhang DEF_Pack(PetscReal,7) 389*40e23c03SJunchao Zhang #if defined(PETSC_HAVE_COMPLEX) 390*40e23c03SJunchao Zhang DEF_Pack(PetscComplex,1) 391*40e23c03SJunchao Zhang DEF_Pack(PetscComplex,2) 392*40e23c03SJunchao Zhang DEF_Pack(PetscComplex,3) 393*40e23c03SJunchao Zhang DEF_Pack(PetscComplex,4) 394*40e23c03SJunchao Zhang DEF_Pack(PetscComplex,5) 395*40e23c03SJunchao Zhang DEF_Pack(PetscComplex,7) 396*40e23c03SJunchao Zhang #endif 397*40e23c03SJunchao Zhang DEF_PackPair(int,int) 398*40e23c03SJunchao Zhang DEF_PackPair(PetscInt,PetscInt) 399*40e23c03SJunchao Zhang DEF_Block(int,1) 400*40e23c03SJunchao Zhang DEF_Block(int,2) 401*40e23c03SJunchao Zhang DEF_Block(int,4) 402*40e23c03SJunchao Zhang DEF_Block(int,8) 403*40e23c03SJunchao Zhang DEF_Block(char,1) 404*40e23c03SJunchao Zhang DEF_Block(char,2) 405*40e23c03SJunchao Zhang DEF_Block(char,4) 406*40e23c03SJunchao Zhang 407*40e23c03SJunchao Zhang #if !defined(PETSC_HAVE_MPI_TYPE_DUP) 408*40e23c03SJunchao Zhang PETSC_STATIC_INLINE int MPI_Type_dup(MPI_Datatype datatype,MPI_Datatype *newtype) 409*40e23c03SJunchao Zhang { 410*40e23c03SJunchao Zhang int ierr; 411*40e23c03SJunchao Zhang ierr = MPI_Type_contiguous(1,datatype,newtype); if (ierr) return ierr; 412*40e23c03SJunchao Zhang ierr = MPI_Type_commit(newtype); if (ierr) return ierr; 413*40e23c03SJunchao Zhang return MPI_SUCCESS; 414*40e23c03SJunchao Zhang } 415*40e23c03SJunchao Zhang #endif 416*40e23c03SJunchao Zhang 417*40e23c03SJunchao Zhang PetscErrorCode PetscSFPackGetInUse(PetscSF sf,MPI_Datatype unit,const void *key,PetscCopyMode cmode,PetscSFPack *mylink) 418*40e23c03SJunchao Zhang { 419*40e23c03SJunchao Zhang PetscErrorCode ierr; 420*40e23c03SJunchao Zhang PetscSFPack link,*p; 421*40e23c03SJunchao Zhang PetscSF_Basic *bas=(PetscSF_Basic*)sf->data; 422*40e23c03SJunchao Zhang 423*40e23c03SJunchao Zhang PetscFunctionBegin; 424*40e23c03SJunchao Zhang /* Look for types in cache */ 425*40e23c03SJunchao Zhang for (p=&bas->inuse; (link=*p); p=&link->next) { 426*40e23c03SJunchao Zhang PetscBool match; 427*40e23c03SJunchao Zhang ierr = MPIPetsc_Type_compare(unit,link->unit,&match);CHKERRQ(ierr); 428*40e23c03SJunchao Zhang if (match && (key == link->key)) { 429*40e23c03SJunchao Zhang switch (cmode) { 430*40e23c03SJunchao Zhang case PETSC_OWN_POINTER: *p = link->next; break; /* Remove from inuse list */ 431*40e23c03SJunchao Zhang case PETSC_USE_POINTER: break; 432*40e23c03SJunchao Zhang default: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"invalid cmode"); 433*40e23c03SJunchao Zhang } 434*40e23c03SJunchao Zhang *mylink = link; 435*40e23c03SJunchao Zhang PetscFunctionReturn(0); 436*40e23c03SJunchao Zhang } 437*40e23c03SJunchao Zhang } 438*40e23c03SJunchao Zhang SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Could not find pack"); 439*40e23c03SJunchao Zhang PetscFunctionReturn(0); 440*40e23c03SJunchao Zhang } 441*40e23c03SJunchao Zhang 442*40e23c03SJunchao Zhang PetscErrorCode PetscSFPackReclaim(PetscSF sf,PetscSFPack *link) 443*40e23c03SJunchao Zhang { 444*40e23c03SJunchao Zhang PetscSF_Basic *bas=(PetscSF_Basic*)sf->data; 445*40e23c03SJunchao Zhang 446*40e23c03SJunchao Zhang PetscFunctionBegin; 447*40e23c03SJunchao Zhang (*link)->key = NULL; 448*40e23c03SJunchao Zhang (*link)->next = bas->avail; 449*40e23c03SJunchao Zhang bas->avail = *link; 450*40e23c03SJunchao Zhang *link = NULL; 451*40e23c03SJunchao Zhang PetscFunctionReturn(0); 452*40e23c03SJunchao Zhang } 453*40e23c03SJunchao Zhang 454*40e23c03SJunchao Zhang PetscErrorCode PetscSFPackSetupType(PetscSFPack link,MPI_Datatype unit) 455*40e23c03SJunchao Zhang { 456*40e23c03SJunchao Zhang PetscErrorCode ierr; 457*40e23c03SJunchao Zhang PetscBool isInt,isPetscInt,isPetscReal,is2Int,is2PetscInt,isSignedChar,isUnsignedChar; 458*40e23c03SJunchao Zhang PetscInt nPetscIntContig,nPetscRealContig; 459*40e23c03SJunchao Zhang PetscMPIInt ni,na,nd,combiner; 460*40e23c03SJunchao Zhang #if defined(PETSC_HAVE_COMPLEX) 461*40e23c03SJunchao Zhang PetscBool isPetscComplex; 462*40e23c03SJunchao Zhang PetscInt nPetscComplexContig; 463*40e23c03SJunchao Zhang #endif 464*40e23c03SJunchao Zhang 465*40e23c03SJunchao Zhang PetscFunctionBegin; 466*40e23c03SJunchao Zhang ierr = MPIPetsc_Type_compare(unit,MPI_SIGNED_CHAR,&isSignedChar);CHKERRQ(ierr); 467*40e23c03SJunchao Zhang ierr = MPIPetsc_Type_compare(unit,MPI_UNSIGNED_CHAR,&isUnsignedChar);CHKERRQ(ierr); 468*40e23c03SJunchao Zhang /* MPI_CHAR is treated below as a dumb block type that does not support reduction according to MPI standard */ 469*40e23c03SJunchao Zhang ierr = MPIPetsc_Type_compare(unit,MPI_INT,&isInt);CHKERRQ(ierr); 470*40e23c03SJunchao Zhang ierr = MPIPetsc_Type_compare(unit,MPIU_INT,&isPetscInt);CHKERRQ(ierr); 471*40e23c03SJunchao Zhang ierr = MPIPetsc_Type_compare_contig(unit,MPIU_INT,&nPetscIntContig);CHKERRQ(ierr); 472*40e23c03SJunchao Zhang ierr = MPIPetsc_Type_compare(unit,MPIU_REAL,&isPetscReal);CHKERRQ(ierr); 473*40e23c03SJunchao Zhang ierr = MPIPetsc_Type_compare_contig(unit,MPIU_REAL,&nPetscRealContig);CHKERRQ(ierr); 474*40e23c03SJunchao Zhang #if defined(PETSC_HAVE_COMPLEX) 475*40e23c03SJunchao Zhang ierr = MPIPetsc_Type_compare(unit,MPIU_COMPLEX,&isPetscComplex);CHKERRQ(ierr); 476*40e23c03SJunchao Zhang ierr = MPIPetsc_Type_compare_contig(unit,MPIU_COMPLEX,&nPetscComplexContig);CHKERRQ(ierr); 477*40e23c03SJunchao Zhang #endif 478*40e23c03SJunchao Zhang ierr = MPIPetsc_Type_compare(unit,MPI_2INT,&is2Int);CHKERRQ(ierr); 479*40e23c03SJunchao Zhang ierr = MPIPetsc_Type_compare(unit,MPIU_2INT,&is2PetscInt);CHKERRQ(ierr); 480*40e23c03SJunchao Zhang ierr = MPI_Type_get_envelope(unit,&ni,&na,&nd,&combiner);CHKERRQ(ierr); 481*40e23c03SJunchao Zhang link->isbuiltin = (combiner == MPI_COMBINER_NAMED) ? PETSC_TRUE : PETSC_FALSE; 482*40e23c03SJunchao Zhang link->bs = 1; 483*40e23c03SJunchao Zhang 484*40e23c03SJunchao Zhang if (isSignedChar) {PackInit_SignedChar(link); PackInit_Logical_SignedChar(link); PackInit_Bitwise_SignedChar(link); link->basicunit = MPI_SIGNED_CHAR;} 485*40e23c03SJunchao Zhang else if (isUnsignedChar) {PackInit_UnsignedChar(link); PackInit_Logical_UnsignedChar(link); PackInit_Bitwise_UnsignedChar(link); link->basicunit = MPI_UNSIGNED_CHAR;} 486*40e23c03SJunchao Zhang else if (isInt) {PackInit_int(link); PackInit_Logical_int(link); PackInit_Bitwise_int(link); link->basicunit = MPI_INT;} 487*40e23c03SJunchao Zhang else if (isPetscInt) {PackInit_PetscInt(link); PackInit_Logical_PetscInt(link); PackInit_Bitwise_PetscInt(link); link->basicunit = MPIU_INT;} 488*40e23c03SJunchao Zhang else if (isPetscReal) {PackInit_PetscReal(link); PackInit_Logical_PetscReal(link); link->basicunit = MPIU_REAL;} 489*40e23c03SJunchao Zhang #if defined(PETSC_HAVE_COMPLEX) 490*40e23c03SJunchao Zhang else if (isPetscComplex) {PackInit_PetscComplex_1(link); link->basicunit = MPIU_COMPLEX;} 491*40e23c03SJunchao Zhang #endif 492*40e23c03SJunchao Zhang else if (is2Int) {PackInit_int_int(link); link->basicunit = MPI_2INT;} 493*40e23c03SJunchao Zhang else if (is2PetscInt) {PackInit_PetscInt_PetscInt(link); link->basicunit = MPIU_2INT;} 494*40e23c03SJunchao Zhang else if (nPetscIntContig) { 495*40e23c03SJunchao Zhang if (nPetscIntContig%7 == 0) PackInit_PetscInt_7(link); 496*40e23c03SJunchao Zhang else if (nPetscIntContig%5 == 0) PackInit_PetscInt_5(link); 497*40e23c03SJunchao Zhang else if (nPetscIntContig%4 == 0) PackInit_PetscInt_4(link); 498*40e23c03SJunchao Zhang else if (nPetscIntContig%3 == 0) PackInit_PetscInt_3(link); 499*40e23c03SJunchao Zhang else if (nPetscIntContig%2 == 0) PackInit_PetscInt_2(link); 500*40e23c03SJunchao Zhang else PackInit_PetscInt(link); 501*40e23c03SJunchao Zhang link->bs = nPetscIntContig; 502*40e23c03SJunchao Zhang link->unitbytes *= nPetscIntContig; 503*40e23c03SJunchao Zhang link->basicunit = MPIU_INT; 504*40e23c03SJunchao Zhang } else if (nPetscRealContig) { 505*40e23c03SJunchao Zhang if (nPetscRealContig%7 == 0) PackInit_PetscReal_7(link); 506*40e23c03SJunchao Zhang else if (nPetscRealContig%5 == 0) PackInit_PetscReal_5(link); 507*40e23c03SJunchao Zhang else if (nPetscRealContig%4 == 0) PackInit_PetscReal_4(link); 508*40e23c03SJunchao Zhang else if (nPetscRealContig%3 == 0) PackInit_PetscReal_3(link); 509*40e23c03SJunchao Zhang else if (nPetscRealContig%2 == 0) PackInit_PetscReal_2(link); 510*40e23c03SJunchao Zhang else PackInit_PetscReal(link); 511*40e23c03SJunchao Zhang link->bs = nPetscRealContig; 512*40e23c03SJunchao Zhang link->unitbytes *= nPetscRealContig; 513*40e23c03SJunchao Zhang link->basicunit = MPIU_REAL; 514*40e23c03SJunchao Zhang #if defined(PETSC_HAVE_COMPLEX) 515*40e23c03SJunchao Zhang } else if (nPetscComplexContig) { 516*40e23c03SJunchao Zhang if (nPetscComplexContig%7 == 0) PackInit_PetscComplex_7(link); 517*40e23c03SJunchao Zhang else if (nPetscComplexContig%5 == 0) PackInit_PetscComplex_5(link); 518*40e23c03SJunchao Zhang else if (nPetscComplexContig%4 == 0) PackInit_PetscComplex_4(link); 519*40e23c03SJunchao Zhang else if (nPetscComplexContig%3 == 0) PackInit_PetscComplex_3(link); 520*40e23c03SJunchao Zhang else if (nPetscComplexContig%2 == 0) PackInit_PetscComplex_2(link); 521*40e23c03SJunchao Zhang else PackInit_PetscComplex_1(link); 522*40e23c03SJunchao Zhang link->bs = nPetscComplexContig; 523*40e23c03SJunchao Zhang link->unitbytes *= nPetscComplexContig; 524*40e23c03SJunchao Zhang link->basicunit = MPIU_COMPLEX; 525*40e23c03SJunchao Zhang #endif 526*40e23c03SJunchao Zhang } else { 527*40e23c03SJunchao Zhang MPI_Aint lb,bytes; 528*40e23c03SJunchao Zhang ierr = MPI_Type_get_extent(unit,&lb,&bytes);CHKERRQ(ierr); 529*40e23c03SJunchao Zhang if (lb != 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Datatype with nonzero lower bound %ld\n",(long)lb); 530*40e23c03SJunchao Zhang if (bytes % sizeof(int)) { /* If the type size is not multiple of int */ 531*40e23c03SJunchao Zhang if (bytes%4 == 0) {PackInit_block_char_4(link); link->bs = bytes/4;} /* Note the basic type is char[4] */ 532*40e23c03SJunchao Zhang else if (bytes%2 == 0) {PackInit_block_char_2(link); link->bs = bytes/2;} 533*40e23c03SJunchao Zhang else {PackInit_block_char_1(link); link->bs = bytes/1;} 534*40e23c03SJunchao Zhang link->unitbytes = bytes; 535*40e23c03SJunchao Zhang link->basicunit = MPI_CHAR; 536*40e23c03SJunchao Zhang } else { 537*40e23c03SJunchao Zhang PetscInt nInt = bytes / sizeof(int); 538*40e23c03SJunchao Zhang if (nInt%8 == 0) {PackInit_block_int_8(link); link->bs = nInt/8;} /* Note the basic type is int[8] */ 539*40e23c03SJunchao Zhang else if (nInt%4 == 0) {PackInit_block_int_4(link); link->bs = nInt/4;} 540*40e23c03SJunchao Zhang else if (nInt%2 == 0) {PackInit_block_int_2(link); link->bs = nInt/2;} 541*40e23c03SJunchao Zhang else {PackInit_block_int_1(link); link->bs = nInt/1;} 542*40e23c03SJunchao Zhang link->unitbytes = bytes; 543*40e23c03SJunchao Zhang link->basicunit = MPI_INT; 544*40e23c03SJunchao Zhang } 545*40e23c03SJunchao Zhang } 546*40e23c03SJunchao Zhang if (link->isbuiltin) link->unit = unit; /* builtin datatypes are common. Make it fast */ 547*40e23c03SJunchao Zhang else {ierr = MPI_Type_dup(unit,&link->unit);CHKERRQ(ierr);} 548*40e23c03SJunchao Zhang PetscFunctionReturn(0); 549*40e23c03SJunchao Zhang } 550*40e23c03SJunchao Zhang 551*40e23c03SJunchao Zhang PetscErrorCode PetscSFPackGetUnpackAndOp(PetscSF sf,PetscSFPack link,MPI_Op op,PetscErrorCode (**UnpackAndOp)(PetscInt,PetscInt,const PetscInt*,PetscInt,PetscSFPackOpt,void*,const void*)) 552*40e23c03SJunchao Zhang { 553*40e23c03SJunchao Zhang PetscFunctionBegin; 554*40e23c03SJunchao Zhang *UnpackAndOp = NULL; 555*40e23c03SJunchao Zhang if (op == MPIU_REPLACE) *UnpackAndOp = link->UnpackAndInsert; 556*40e23c03SJunchao Zhang else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->UnpackAndAdd; 557*40e23c03SJunchao Zhang else if (op == MPI_PROD) *UnpackAndOp = link->UnpackAndMult; 558*40e23c03SJunchao Zhang else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->UnpackAndMax; 559*40e23c03SJunchao Zhang else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->UnpackAndMin; 560*40e23c03SJunchao Zhang else if (op == MPI_LAND) *UnpackAndOp = link->UnpackAndLAND; 561*40e23c03SJunchao Zhang else if (op == MPI_BAND) *UnpackAndOp = link->UnpackAndBAND; 562*40e23c03SJunchao Zhang else if (op == MPI_LOR) *UnpackAndOp = link->UnpackAndLOR; 563*40e23c03SJunchao Zhang else if (op == MPI_BOR) *UnpackAndOp = link->UnpackAndBOR; 564*40e23c03SJunchao Zhang else if (op == MPI_LXOR) *UnpackAndOp = link->UnpackAndLXOR; 565*40e23c03SJunchao Zhang else if (op == MPI_BXOR) *UnpackAndOp = link->UnpackAndBXOR; 566*40e23c03SJunchao Zhang else if (op == MPI_MAXLOC) *UnpackAndOp = link->UnpackAndMaxloc; 567*40e23c03SJunchao Zhang else if (op == MPI_MINLOC) *UnpackAndOp = link->UnpackAndMinloc; 568*40e23c03SJunchao Zhang else *UnpackAndOp = NULL; 569*40e23c03SJunchao Zhang PetscFunctionReturn(0); 570*40e23c03SJunchao Zhang } 571*40e23c03SJunchao Zhang 572*40e23c03SJunchao Zhang PetscErrorCode PetscSFPackGetFetchAndOp(PetscSF sf,PetscSFPack link,MPI_Op op,PetscErrorCode (**FetchAndOp)(PetscInt,PetscInt,const PetscInt*,PetscInt,PetscSFPackOpt,void*,void*)) 573*40e23c03SJunchao Zhang { 574*40e23c03SJunchao Zhang PetscFunctionBegin; 575*40e23c03SJunchao Zhang *FetchAndOp = NULL; 576*40e23c03SJunchao Zhang if (op == MPIU_REPLACE) *FetchAndOp = link->FetchAndInsert; 577*40e23c03SJunchao Zhang else if (op == MPI_SUM || op == MPIU_SUM) *FetchAndOp = link->FetchAndAdd; 578*40e23c03SJunchao Zhang else if (op == MPI_MAX || op == MPIU_MAX) *FetchAndOp = link->FetchAndMax; 579*40e23c03SJunchao Zhang else if (op == MPI_MIN || op == MPIU_MIN) *FetchAndOp = link->FetchAndMin; 580*40e23c03SJunchao Zhang else if (op == MPI_MAXLOC) *FetchAndOp = link->FetchAndMaxloc; 581*40e23c03SJunchao Zhang else if (op == MPI_MINLOC) *FetchAndOp = link->FetchAndMinloc; 582*40e23c03SJunchao Zhang else if (op == MPI_PROD) *FetchAndOp = link->FetchAndMult; 583*40e23c03SJunchao Zhang else if (op == MPI_LAND) *FetchAndOp = link->FetchAndLAND; 584*40e23c03SJunchao Zhang else if (op == MPI_BAND) *FetchAndOp = link->FetchAndBAND; 585*40e23c03SJunchao Zhang else if (op == MPI_LOR) *FetchAndOp = link->FetchAndLOR; 586*40e23c03SJunchao Zhang else if (op == MPI_BOR) *FetchAndOp = link->FetchAndBOR; 587*40e23c03SJunchao Zhang else if (op == MPI_LXOR) *FetchAndOp = link->FetchAndLXOR; 588*40e23c03SJunchao Zhang else if (op == MPI_BXOR) *FetchAndOp = link->FetchAndBXOR; 589*40e23c03SJunchao Zhang else SETERRQ(PetscObjectComm((PetscObject)sf),PETSC_ERR_SUP,"No support for MPI_Op"); 590*40e23c03SJunchao Zhang PetscFunctionReturn(0); 591*40e23c03SJunchao Zhang } 592*40e23c03SJunchao Zhang 593*40e23c03SJunchao Zhang /* 594*40e23c03SJunchao Zhang Setup pack/unpack optimization plans based on indice patterns available 595*40e23c03SJunchao Zhang 596*40e23c03SJunchao Zhang Input Parameters: 597*40e23c03SJunchao Zhang + n - number of target processors 598*40e23c03SJunchao Zhang . offset - [n+1] for the i-th processor, its associated indices are idx[offset[i], offset[i+1]) 599*40e23c03SJunchao Zhang - idx - [] array storing indices. Its length is offset[n+1] 600*40e23c03SJunchao Zhang 601*40e23c03SJunchao Zhang Output Parameters: 602*40e23c03SJunchao Zhang + opt - the optimization 603*40e23c03SJunchao Zhang */ 604*40e23c03SJunchao Zhang PetscErrorCode PetscSFPackSetupOptimization(PetscInt n,const PetscInt *offset,const PetscInt *idx,PetscSFPackOpt *out) 605*40e23c03SJunchao Zhang { 606*40e23c03SJunchao Zhang PetscErrorCode ierr; 607*40e23c03SJunchao Zhang PetscInt i,j,k,n_copies,tot_copies=0,step; 608*40e23c03SJunchao Zhang PetscBool strided,has_strided=PETSC_FALSE,has_optimized=PETSC_FALSE; 609*40e23c03SJunchao Zhang PetscSFPackOpt opt; 610*40e23c03SJunchao Zhang 611*40e23c03SJunchao Zhang PetscFunctionBegin; 612*40e23c03SJunchao Zhang ierr = PetscCalloc1(1,&opt);CHKERRQ(ierr); 613*40e23c03SJunchao Zhang ierr = PetscCalloc2(n,&opt->optimized,n+1,&opt->copy_offset);CHKERRQ(ierr); 614*40e23c03SJunchao Zhang 615*40e23c03SJunchao Zhang /* Check if the indices are piece-wise contiguous (if yes, we can optimize a packing with mulitple memcpy's ) */ 616*40e23c03SJunchao Zhang for (i=0; i<n; i++) { /* for each target processor */ 617*40e23c03SJunchao Zhang /* Scan indices to count n_copies -- the number of contiguous pieces for i-th target */ 618*40e23c03SJunchao Zhang n_copies = 1; 619*40e23c03SJunchao Zhang for (j=offset[i]; j<offset[i+1]-1; j++) { 620*40e23c03SJunchao Zhang if (idx[j]+1 != idx[j+1]) n_copies++; 621*40e23c03SJunchao Zhang } 622*40e23c03SJunchao Zhang /* If the average length (in no. of indices) of contiguous pieces is long enough, say >=32, 623*40e23c03SJunchao Zhang then it is worth using memcpy for this target. 32 is an arbitrarily chosen number. 624*40e23c03SJunchao Zhang */ 625*40e23c03SJunchao Zhang if ((offset[i+1]-offset[i])/n_copies >= 32) { 626*40e23c03SJunchao Zhang opt->optimized[i] = PETSC_TRUE; 627*40e23c03SJunchao Zhang has_optimized = PETSC_TRUE; 628*40e23c03SJunchao Zhang tot_copies += n_copies; 629*40e23c03SJunchao Zhang } 630*40e23c03SJunchao Zhang } 631*40e23c03SJunchao Zhang 632*40e23c03SJunchao Zhang /* Setup memcpy plan for each contiguous piece */ 633*40e23c03SJunchao Zhang k = 0; /* k-th copy */ 634*40e23c03SJunchao Zhang ierr = PetscMalloc2(tot_copies,&opt->copy_start,tot_copies,&opt->copy_length);CHKERRQ(ierr); 635*40e23c03SJunchao Zhang for (i=0; i<n; i++) { /* for each target processor procs[i] */ 636*40e23c03SJunchao Zhang if (opt->optimized[i]) { 637*40e23c03SJunchao Zhang n_copies = 1; 638*40e23c03SJunchao Zhang opt->copy_start[k] = idx[offset[i]]; 639*40e23c03SJunchao Zhang for (j=offset[i]; j<offset[i+1]-1; j++) { 640*40e23c03SJunchao Zhang if (idx[j]+1 != idx[j+1]) { /* meet end of a copy (and next copy must exist) */ 641*40e23c03SJunchao Zhang n_copies++; 642*40e23c03SJunchao Zhang opt->copy_start[k+1] = idx[j+1]; 643*40e23c03SJunchao Zhang opt->copy_length[k] = idx[j] - opt->copy_start[k] + 1; 644*40e23c03SJunchao Zhang k++; 645*40e23c03SJunchao Zhang } 646*40e23c03SJunchao Zhang } 647*40e23c03SJunchao Zhang /* Set copy length of the last copy for this target */ 648*40e23c03SJunchao Zhang opt->copy_length[k] = idx[j] - opt->copy_start[k] + 1; 649*40e23c03SJunchao Zhang k++; 650*40e23c03SJunchao Zhang } 651*40e23c03SJunchao Zhang /* Set offset for next target. When optimized[i]=false, copy_offsets[i]=copy_offsets[i+1] */ 652*40e23c03SJunchao Zhang opt->copy_offset[i+1] = k; 653*40e23c03SJunchao Zhang } 654*40e23c03SJunchao Zhang 655*40e23c03SJunchao Zhang /* Last chance! If the indices do not have long contiguous pieces, are they strided? */ 656*40e23c03SJunchao Zhang ierr = PetscMalloc3(n,&opt->stride_first,n,&opt->stride_step,n,&opt->stride_n);CHKERRQ(ierr); 657*40e23c03SJunchao Zhang for (i=0; i<n; i++) { /* for each remote */ 658*40e23c03SJunchao Zhang if (!opt->optimized[i] && (offset[i+1] - offset[i]) >= 16) { /* few indices (<16) are not worth striding */ 659*40e23c03SJunchao Zhang strided = PETSC_TRUE; 660*40e23c03SJunchao Zhang step = idx[offset[i]+1] - idx[offset[i]]; 661*40e23c03SJunchao Zhang for (j=offset[i]; j<offset[i+1]-1; j++) { 662*40e23c03SJunchao Zhang if (idx[j]+step != idx[j+1]) { strided = PETSC_FALSE; break; } 663*40e23c03SJunchao Zhang } 664*40e23c03SJunchao Zhang if (strided) { 665*40e23c03SJunchao Zhang opt->optimized[i] = PETSC_TRUE; 666*40e23c03SJunchao Zhang opt->stride_first[i] = idx[offset[i]]; 667*40e23c03SJunchao Zhang opt->stride_step[i] = step; 668*40e23c03SJunchao Zhang opt->stride_n[i] = offset[i+1] - offset[i]; 669*40e23c03SJunchao Zhang has_strided = PETSC_TRUE; 670*40e23c03SJunchao Zhang has_optimized = PETSC_TRUE; 671*40e23c03SJunchao Zhang } 672*40e23c03SJunchao Zhang } 673*40e23c03SJunchao Zhang } 674*40e23c03SJunchao Zhang /* If no target has been stride-optimized or optimized, free related arrays to save memory */ 675*40e23c03SJunchao Zhang if (!has_strided) {ierr = PetscFree3(opt->stride_first,opt->stride_step,opt->stride_n);CHKERRQ(ierr);} 676*40e23c03SJunchao Zhang if (!has_optimized) { 677*40e23c03SJunchao Zhang ierr = PetscFree2(opt->optimized,opt->copy_offset);CHKERRQ(ierr); 678*40e23c03SJunchao Zhang ierr = PetscFree2(opt->copy_start,opt->copy_length);CHKERRQ(ierr); 679*40e23c03SJunchao Zhang ierr = PetscFree(opt);CHKERRQ(ierr); 680*40e23c03SJunchao Zhang *out = NULL; 681*40e23c03SJunchao Zhang } else *out = opt; 682*40e23c03SJunchao Zhang PetscFunctionReturn(0); 683*40e23c03SJunchao Zhang } 684*40e23c03SJunchao Zhang 685*40e23c03SJunchao Zhang PetscErrorCode PetscSFPackDestoryOptimization(PetscSFPackOpt *out) 686*40e23c03SJunchao Zhang { 687*40e23c03SJunchao Zhang PetscErrorCode ierr; 688*40e23c03SJunchao Zhang PetscSFPackOpt opt = *out; 689*40e23c03SJunchao Zhang 690*40e23c03SJunchao Zhang PetscFunctionBegin; 691*40e23c03SJunchao Zhang if (opt) { 692*40e23c03SJunchao Zhang ierr = PetscFree2(opt->optimized,opt->copy_offset);CHKERRQ(ierr); 693*40e23c03SJunchao Zhang ierr = PetscFree2(opt->copy_start,opt->copy_length);CHKERRQ(ierr); 694*40e23c03SJunchao Zhang ierr = PetscFree3(opt->stride_first,opt->stride_step,opt->stride_n);CHKERRQ(ierr); 695*40e23c03SJunchao Zhang ierr = PetscFree(opt);CHKERRQ(ierr); 696*40e23c03SJunchao Zhang *out = NULL; 697*40e23c03SJunchao Zhang } 698*40e23c03SJunchao Zhang PetscFunctionReturn(0); 699*40e23c03SJunchao Zhang } 700