140e23c03SJunchao Zhang 240e23c03SJunchao Zhang #include <../src/vec/is/sf/impls/basic/sfpack.h> 340e23c03SJunchao Zhang #include <../src/vec/is/sf/impls/basic/sfbasic.h> 440e23c03SJunchao Zhang 540e23c03SJunchao Zhang /* 640e23c03SJunchao Zhang * MPI_Reduce_local is not really useful because it can't handle sparse data and it vectorizes "in the wrong direction", 740e23c03SJunchao Zhang * therefore we pack data types manually. This file defines packing routines for the standard data types. 840e23c03SJunchao Zhang */ 940e23c03SJunchao Zhang 1040e23c03SJunchao Zhang #define CPPJoin2_exp(a,b) a ## b 1140e23c03SJunchao Zhang #define CPPJoin2(a,b) CPPJoin2_exp(a,b) 1240e23c03SJunchao Zhang #define CPPJoin3_exp_(a,b,c) a ## b ## _ ## c 1340e23c03SJunchao Zhang #define CPPJoin3_(a,b,c) CPPJoin3_exp_(a,b,c) 1440e23c03SJunchao Zhang 1540e23c03SJunchao Zhang #define EXECUTE(statement) statement /* no braces since the statement might declare a variable; braces impose an unwanted scope */ 1640e23c03SJunchao Zhang #define IGNORE(statement) do {} while(0) 1740e23c03SJunchao Zhang 1840e23c03SJunchao Zhang #define BINARY_OP(r,s,op,t) do {(r) = (s) op (t); } while(0) /* binary ops in the middle such as +, *, && etc. */ 1940e23c03SJunchao Zhang #define FUNCTION_OP(r,s,op,t) do {(r) = op((s),(t)); } while(0) /* ops like a function, such as PetscMax, PetscMin */ 2040e23c03SJunchao Zhang #define LXOR_OP(r,s,op,t) do {(r) = (!s) != (!t);} while(0) /* logical exclusive OR */ 2140e23c03SJunchao Zhang #define PAIRTYPE_OP(r,s,op,t) do {(r).a = (s).a op (t).a; (r).b = (s).b op (t).b;} while(0) 2240e23c03SJunchao Zhang 2340e23c03SJunchao Zhang #define BlockType(type,count) CPPJoin3_(_blocktype_,type,count) /* typename for struct {type v[count];} */ 2440e23c03SJunchao Zhang #define PairType(type1,type2) CPPJoin3_(_pairtype_,type1,type2) /* typename for struct {type1 a; type2 b;} */ 2540e23c03SJunchao Zhang 2640e23c03SJunchao Zhang /* DEF_PackFunc - macro defining a Pack routine 2740e23c03SJunchao Zhang 2840e23c03SJunchao Zhang Arguments of the macro: 2940e23c03SJunchao Zhang +type Type of the basic data in an entry, i.e., int, PetscInt, PetscReal etc. It is not the type of an entry. 3040e23c03SJunchao Zhang -BS Block size for vectorization. It is a factor of bs. 3140e23c03SJunchao Zhang 3240e23c03SJunchao Zhang Arguments of the Pack routine: 3340e23c03SJunchao Zhang +n Number of entries to pack. Each entry is of type 'unit'. Here the unit is the arg used in calls like PetscSFBcastBegin(sf,unit,..). 3440e23c03SJunchao Zhang If idx in not NULL, then n also indicates the number of indices in idx[] 3540e23c03SJunchao Zhang .bs Number of basic types in an entry. Ex. if unit is MPI_2INT, then bs=2 and the basic type is int 3640e23c03SJunchao Zhang .idx Indices of entries. NULL means contiguous indices [0,n) 3740e23c03SJunchao Zhang .r Do packing for the r-th target processor 3840e23c03SJunchao Zhang .opt Pack optimization plans. NULL means no plan. 3940e23c03SJunchao Zhang .unpacked Address of the unpacked data 4040e23c03SJunchao Zhang -packed Address of the packed data 4140e23c03SJunchao Zhang */ 4240e23c03SJunchao Zhang #define DEF_PackFunc(type,BS) \ 4340e23c03SJunchao Zhang static PetscErrorCode CPPJoin3_(Pack_,type,BS)(PetscInt n,PetscInt bs,const PetscInt *idx,PetscInt r,PetscSFPackOpt opt,const void *unpacked,void *packed) { \ 4440e23c03SJunchao Zhang PetscErrorCode ierr; \ 4540e23c03SJunchao Zhang const type *u = (const type*)unpacked,*u2; \ 4640e23c03SJunchao Zhang type *p = (type*)packed; \ 4740e23c03SJunchao Zhang PetscInt i,j,k,l,step; \ 4840e23c03SJunchao Zhang PetscFunctionBegin; \ 4940e23c03SJunchao Zhang if (!idx) { /* idx[] is contiguous */ \ 50da2e4c71SJunchao Zhang ierr = PetscArraycpy(p,u,bs*n);CHKERRQ(ierr); \ 5140e23c03SJunchao Zhang } else if (!opt || !opt->optimized[r]) { /* idx[] is not optimized*/ \ 5240e23c03SJunchao Zhang for (i=0; i<n; i++) \ 5340e23c03SJunchao Zhang for (j=0; j<bs; j+=BS) \ 5440e23c03SJunchao Zhang for (k=j; k<j+BS; k++) \ 5540e23c03SJunchao Zhang p[i*bs+k] = u[idx[i]*bs+k]; \ 5640e23c03SJunchao Zhang } else { /* idx[] is optimized*/ \ 5740e23c03SJunchao Zhang if (opt->copy_offset[r] != opt->copy_offset[r+1]) { /* idx[] is piece-wise contiguous */ \ 5840e23c03SJunchao Zhang for (i=opt->copy_offset[r]; i<opt->copy_offset[r+1]; i++) { \ 5940e23c03SJunchao Zhang l = opt->copy_length[i]*bs; /* length in types */ \ 6040e23c03SJunchao Zhang u2 = u + opt->copy_start[i]*bs; \ 61da2e4c71SJunchao Zhang ierr = PetscArraycpy(p,u2,l);CHKERRQ(ierr); \ 6240e23c03SJunchao Zhang p += l; \ 6340e23c03SJunchao Zhang } \ 6440e23c03SJunchao Zhang } else { /* idx[] is strided */ \ 6540e23c03SJunchao Zhang u += opt->stride_first[r]*bs; \ 6640e23c03SJunchao Zhang step = opt->stride_step[r]; \ 6740e23c03SJunchao Zhang for (i=0; i<opt->stride_n[r]; i++) \ 6840e23c03SJunchao Zhang for (j=0; j<bs; j++) \ 6940e23c03SJunchao Zhang p[i*bs+j] = u[i*step*bs+j]; \ 7040e23c03SJunchao Zhang } \ 7140e23c03SJunchao Zhang } \ 7240e23c03SJunchao Zhang PetscFunctionReturn(0); \ 7340e23c03SJunchao Zhang } 7440e23c03SJunchao Zhang 7540e23c03SJunchao Zhang /* DEF_Action - macro defining a Unpack(Fetch)AndInsert routine 7640e23c03SJunchao Zhang 7740e23c03SJunchao Zhang Arguments: 7840e23c03SJunchao Zhang +action Unpack or Fetch 7940e23c03SJunchao Zhang .type Type of the data 8040e23c03SJunchao Zhang .BS Block size for vectorization 8140e23c03SJunchao Zhang .FILTER Macro defining what to do with a statement, either EXECUTE or IGNORE 8240e23c03SJunchao Zhang .ctype Type with or without the const qualifier, i.e., const type or type 8340e23c03SJunchao Zhang .cvoid void with or without the const qualifier, i.e., const void or void 8440e23c03SJunchao Zhang 8540e23c03SJunchao Zhang Notes: 8640e23c03SJunchao Zhang This macro is not combined with DEF_ActionAndOp because we want to use memcpy in this macro. 8740e23c03SJunchao Zhang The two arguments ctype and cvoid are used (instead of one constness argument), because we want to 8840e23c03SJunchao Zhang get rid of compilation warning "empty macro arguments are undefined in ISO C90". With one constness argument, 8940e23c03SJunchao Zhang sometimes we input 'const', sometimes we have to input empty. 9040e23c03SJunchao Zhang */ 9140e23c03SJunchao Zhang #define DEF_Action(action,type,BS,FILTER,ctype,cvoid) \ 9240e23c03SJunchao Zhang static PetscErrorCode CPPJoin3_(action##AndInsert_,type,BS)(PetscInt n,PetscInt bs,const PetscInt *idx,PetscInt r,PetscSFPackOpt opt,void *unpacked,cvoid *packed) { \ 9340e23c03SJunchao Zhang PetscErrorCode ierr; \ 9440e23c03SJunchao Zhang type *u = (type*)unpacked,*u2; \ 9540e23c03SJunchao Zhang ctype *p = (ctype*)packed; \ 9640e23c03SJunchao Zhang PetscInt i,j,k,l,step; \ 9740e23c03SJunchao Zhang PetscFunctionBegin; \ 9840e23c03SJunchao Zhang if (!idx) { /* idx[] is contiguous */ \ 9940e23c03SJunchao Zhang FILTER(type *v); \ 10040e23c03SJunchao Zhang FILTER(ierr = PetscMalloc1(bs*n,&v);CHKERRQ(ierr)); \ 101da2e4c71SJunchao Zhang FILTER(ierr = PetscArraycpy(v,u,bs*n);CHKERRQ(ierr)); \ 102da2e4c71SJunchao Zhang ierr = PetscArraycpy(u,p,bs*n);CHKERRQ(ierr); \ 103da2e4c71SJunchao Zhang FILTER(ierr = PetscArraycpy(p,v,bs*n);CHKERRQ(ierr)); \ 10440e23c03SJunchao Zhang FILTER(ierr = PetscFree(v);CHKERRQ(ierr)); \ 10540e23c03SJunchao Zhang } else if (!opt || !opt->optimized[r]) { /* idx[] is not optimized*/ \ 10640e23c03SJunchao Zhang for (i=0; i<n; i++) { \ 10740e23c03SJunchao Zhang for (j=0; j<bs; j+=BS) { \ 10840e23c03SJunchao Zhang for (k=j; k<j+BS; k++) { \ 10940e23c03SJunchao Zhang FILTER(type t = u[idx[i]*bs+k]); \ 11040e23c03SJunchao Zhang u[idx[i]*bs+k] = p[i*bs+k]; \ 11140e23c03SJunchao Zhang FILTER(p[i*bs+k] = t); \ 11240e23c03SJunchao Zhang } \ 11340e23c03SJunchao Zhang } \ 11440e23c03SJunchao Zhang } \ 11540e23c03SJunchao Zhang } else { /* idx[] is optimized*/ \ 11640e23c03SJunchao Zhang if (opt->copy_offset[r] != opt->copy_offset[r+1]) { /* idx[] is piece-wise contiguous */ \ 11740e23c03SJunchao Zhang FILTER(type *v); \ 11840e23c03SJunchao Zhang FILTER(ierr = PetscMalloc1(bs*n,&v);CHKERRQ(ierr)); /* maximal buffer */ \ 11940e23c03SJunchao Zhang for (i=opt->copy_offset[r]; i<opt->copy_offset[r+1]; i++) { /* i-th piece */ \ 12040e23c03SJunchao Zhang l = opt->copy_length[i]*bs; /* length in types */ \ 12140e23c03SJunchao Zhang u2 = u + opt->copy_start[i]*bs; \ 122da2e4c71SJunchao Zhang FILTER(ierr = PetscArraycpy(v,u2,l);CHKERRQ(ierr)); \ 123da2e4c71SJunchao Zhang ierr = PetscArraycpy(u2,p,l);CHKERRQ(ierr); \ 124da2e4c71SJunchao Zhang FILTER(ierr = PetscArraycpy(p,v,l);CHKERRQ(ierr)); \ 12540e23c03SJunchao Zhang p += l; \ 12640e23c03SJunchao Zhang } \ 12740e23c03SJunchao Zhang FILTER(ierr = PetscFree(v);CHKERRQ(ierr)); \ 12840e23c03SJunchao Zhang } else { /* idx[] is strided */ \ 12940e23c03SJunchao Zhang u += opt->stride_first[r]*bs; \ 13040e23c03SJunchao Zhang step = opt->stride_step[r]; \ 13140e23c03SJunchao Zhang for (i=0; i<opt->stride_n[r]; i++) \ 13240e23c03SJunchao Zhang for (j=0; j<bs; j++) { \ 13340e23c03SJunchao Zhang FILTER(type t = u[i*step*bs+j]); \ 13440e23c03SJunchao Zhang u[i*step*bs+j] = p[i*bs+j]; \ 13540e23c03SJunchao Zhang FILTER(p[i*bs+j] = t); \ 13640e23c03SJunchao Zhang } \ 13740e23c03SJunchao Zhang } \ 13840e23c03SJunchao Zhang } \ 13940e23c03SJunchao Zhang PetscFunctionReturn(0); \ 14040e23c03SJunchao Zhang } 14140e23c03SJunchao Zhang 14240e23c03SJunchao Zhang /* DEF_ActionAndOp - macro defining a Unpack(Fetch)AndOp routine. Op can not be Insert, Maxloc or Minloc 14340e23c03SJunchao Zhang 14440e23c03SJunchao Zhang Arguments: 14540e23c03SJunchao Zhang +action Unpack or Fetch 14640e23c03SJunchao Zhang .opname Name of the Op, such as Add, Mult, LAND, etc. 14740e23c03SJunchao Zhang .type Type of the data 14840e23c03SJunchao Zhang .BS Block size for vectorization 14940e23c03SJunchao Zhang .op Operator for the op, such as +, *, &&, ||, PetscMax, PetscMin, etc. 15040e23c03SJunchao Zhang .APPLY Macro defining application of the op. Could be BINARY_OP, FUNCTION_OP, LXOR_OP or PAIRTYPE_OP 15140e23c03SJunchao Zhang .FILTER Macro defining what to do with a statement, either EXECUTE or IGNORE 15240e23c03SJunchao Zhang .ctype Type with or without the const qualifier, i.e., const type or type 15340e23c03SJunchao Zhang -cvoid void with or without the const qualifier, i.e., const void or void 15440e23c03SJunchao Zhang */ 15540e23c03SJunchao Zhang #define DEF_ActionAndOp(action,opname,type,BS,op,APPLY,FILTER,ctype,cvoid) \ 15640e23c03SJunchao Zhang static PetscErrorCode CPPJoin3_(action##And##opname##_,type,BS)(PetscInt n,PetscInt bs,const PetscInt *idx,PetscInt r,PetscSFPackOpt opt,void *unpacked,cvoid *packed) { \ 15740e23c03SJunchao Zhang type *u = (type*)unpacked,*u2,t; \ 15840e23c03SJunchao Zhang ctype *p = (ctype*)packed; \ 15940e23c03SJunchao Zhang PetscInt i,j,k,l,step; \ 16040e23c03SJunchao Zhang PetscFunctionBegin; \ 16140e23c03SJunchao Zhang if (!idx) { /* idx[] is contiguous */ \ 16240e23c03SJunchao Zhang for (i=0; i<n*bs; i++) { \ 16340e23c03SJunchao Zhang t = u[i]; \ 16440e23c03SJunchao Zhang APPLY(u[i],t,op,p[i]); \ 16540e23c03SJunchao Zhang FILTER(p[i] = t); \ 16640e23c03SJunchao Zhang } \ 16740e23c03SJunchao Zhang } else if (!opt || !opt->optimized[r]) { /* idx[] is not optimized*/ \ 16840e23c03SJunchao Zhang for (i=0; i<n; i++) { \ 16940e23c03SJunchao Zhang for (j=0; j<bs; j+=BS) { \ 17040e23c03SJunchao Zhang for (k=j; k<j+BS; k++) { \ 17140e23c03SJunchao Zhang t = u[idx[i]*bs+k]; \ 17240e23c03SJunchao Zhang APPLY(u[idx[i]*bs+k],t,op,p[i*bs+k]); \ 17340e23c03SJunchao Zhang FILTER(p[i*bs+k] = t); \ 17440e23c03SJunchao Zhang } \ 17540e23c03SJunchao Zhang } \ 17640e23c03SJunchao Zhang } \ 17740e23c03SJunchao Zhang } else { /* idx[] is optimized*/ \ 17840e23c03SJunchao Zhang if (opt->copy_offset[r] != opt->copy_offset[r+1]) { /* idx[] is piece-wise contiguous */ \ 17940e23c03SJunchao Zhang for (i=opt->copy_offset[r]; i<opt->copy_offset[r+1]; i++) { /* i-th piece */ \ 18040e23c03SJunchao Zhang l = opt->copy_length[i]*bs; /* length in types */ \ 18140e23c03SJunchao Zhang u2 = u + opt->copy_start[i]*bs; \ 18240e23c03SJunchao Zhang for (j=0; j<l; j++) { \ 18340e23c03SJunchao Zhang t = u2[j]; \ 18440e23c03SJunchao Zhang APPLY(u2[j],t,op,p[j]); \ 18540e23c03SJunchao Zhang FILTER(p[j] = t); \ 18640e23c03SJunchao Zhang } \ 18740e23c03SJunchao Zhang p += l; \ 18840e23c03SJunchao Zhang } \ 18940e23c03SJunchao Zhang } else { /* idx[] is strided */ \ 19040e23c03SJunchao Zhang u += opt->stride_first[r]*bs; \ 19140e23c03SJunchao Zhang step = opt->stride_step[r]; \ 19240e23c03SJunchao Zhang for (i=0; i<opt->stride_n[r]; i++) \ 19340e23c03SJunchao Zhang for (j=0; j<bs; j++) { \ 19440e23c03SJunchao Zhang t = u[i*step*bs+j]; \ 19540e23c03SJunchao Zhang APPLY(u[i*step*bs+j],t,op,p[i*bs+j]); \ 19640e23c03SJunchao Zhang FILTER(p[i*bs+j] = t); \ 19740e23c03SJunchao Zhang } \ 19840e23c03SJunchao Zhang } \ 19940e23c03SJunchao Zhang } \ 20040e23c03SJunchao Zhang PetscFunctionReturn(0); \ 20140e23c03SJunchao Zhang } 20240e23c03SJunchao Zhang 20340e23c03SJunchao Zhang /* DEF_ActionAndXloc - macro defining a Unpack(Fetch)AndMaxloc(Minloc) routine 20440e23c03SJunchao Zhang 20540e23c03SJunchao Zhang Arguments: 20640e23c03SJunchao Zhang +Action Unpack or Fetch 20740e23c03SJunchao Zhang .locname Max or Min 20840e23c03SJunchao Zhang .type1 Type of the first data in a pair type 20940e23c03SJunchao Zhang .type2 Type of the second data in a pair type, usually PetscMPIInt for MPI ranks. 21040e23c03SJunchao Zhang .op > or < 21140e23c03SJunchao Zhang .FILTER Macro defining what to do with a statement, either EXECUTE or IGNORE 21240e23c03SJunchao Zhang .ctype Type with or without the const qualifier, i.e., const PairType(type1,type2) or PairType(type1,type2) 21340e23c03SJunchao Zhang -cvoid void with or without the const qualifier, i.e., const void or void 21440e23c03SJunchao Zhang */ 21540e23c03SJunchao Zhang #define DEF_ActionAndXloc(action,locname,type1,type2,op,FILTER,ctype,cvoid) \ 21640e23c03SJunchao Zhang static PetscErrorCode CPPJoin3_(action##And##locname##loc_,PairType(type1,type2),1)(PetscInt n,PetscInt bs,const PetscInt *idx,PetscInt r,PetscSFPackOpt opt,void *unpacked,cvoid *packed) { \ 21740e23c03SJunchao Zhang PairType(type1,type2) *u = (PairType(type1,type2)*)unpacked; \ 21840e23c03SJunchao Zhang ctype *p = (ctype*)packed; \ 21940e23c03SJunchao Zhang PetscInt i; \ 22040e23c03SJunchao Zhang for (i=0; i<n; i++) { \ 22140e23c03SJunchao Zhang PetscInt j = idx[i]; \ 22240e23c03SJunchao Zhang FILTER(PairType(type1,type2) v = u[j]); \ 22340e23c03SJunchao Zhang if (p[i].a op u[j].a) { \ 22440e23c03SJunchao Zhang u[j] = p[i]; \ 22540e23c03SJunchao Zhang } else if (p[i].a == u[j].a) { \ 22640e23c03SJunchao Zhang u[j].b = PetscMin(u[j].b,p[i].b); /* Minimal rank. Ref MPI MAXLOC */ \ 22740e23c03SJunchao Zhang } \ 22840e23c03SJunchao Zhang FILTER(p[i] = v); \ 22940e23c03SJunchao Zhang } \ 23040e23c03SJunchao Zhang PetscFunctionReturn(0); \ 23140e23c03SJunchao Zhang } 23240e23c03SJunchao Zhang 23340e23c03SJunchao Zhang 23440e23c03SJunchao Zhang /* Pack/unpack/fetch ops for all types */ 23540e23c03SJunchao Zhang #define DEF_PackNoInit(type,BS) \ 23640e23c03SJunchao Zhang DEF_PackFunc(type,BS) \ 23740e23c03SJunchao Zhang DEF_Action(Unpack,type,BS,IGNORE,const type,const void) \ 23840e23c03SJunchao Zhang DEF_Action(Fetch, type,BS,EXECUTE,type,void) \ 23940e23c03SJunchao Zhang 24040e23c03SJunchao Zhang 24140e23c03SJunchao Zhang /* Extra addition ops for types supporting them */ 24240e23c03SJunchao Zhang #define DEF_PackAddNoInit(type,BS) \ 24340e23c03SJunchao Zhang DEF_PackNoInit(type,BS) \ 24440e23c03SJunchao Zhang DEF_ActionAndOp(Unpack,Add, type,BS,+,BINARY_OP,IGNORE,const type,const void) \ 24540e23c03SJunchao Zhang DEF_ActionAndOp(Unpack,Mult,type,BS,*,BINARY_OP,IGNORE,const type,const void) \ 24640e23c03SJunchao Zhang DEF_ActionAndOp(Fetch, Add, type,BS,+,BINARY_OP,EXECUTE,type,void) \ 24740e23c03SJunchao Zhang DEF_ActionAndOp(Fetch, Mult,type,BS,*,BINARY_OP,EXECUTE,type,void) 24840e23c03SJunchao Zhang 24940e23c03SJunchao Zhang /* Basic types */ 25040e23c03SJunchao Zhang #define DEF_Pack(type,BS) \ 25140e23c03SJunchao Zhang DEF_PackAddNoInit(type,BS) \ 25240e23c03SJunchao Zhang static void CPPJoin3_(PackInit_,type,BS)(PetscSFPack link) { \ 25340e23c03SJunchao Zhang link->Pack = CPPJoin3_(Pack_, type,BS); \ 25440e23c03SJunchao Zhang link->UnpackAndInsert = CPPJoin3_(UnpackAndInsert_,type,BS); \ 25540e23c03SJunchao Zhang link->UnpackAndAdd = CPPJoin3_(UnpackAndAdd_, type,BS); \ 25640e23c03SJunchao Zhang link->UnpackAndMult = CPPJoin3_(UnpackAndMult_, type,BS); \ 25740e23c03SJunchao Zhang link->FetchAndInsert = CPPJoin3_(FetchAndInsert_, type,BS); \ 25840e23c03SJunchao Zhang link->FetchAndAdd = CPPJoin3_(FetchAndAdd_, type,BS); \ 25940e23c03SJunchao Zhang link->FetchAndMult = CPPJoin3_(FetchAndMult_, type,BS); \ 26040e23c03SJunchao Zhang link->unitbytes = sizeof(type); \ 26140e23c03SJunchao Zhang } 26240e23c03SJunchao Zhang 26340e23c03SJunchao Zhang /* Comparable types */ 26440e23c03SJunchao Zhang #define DEF_PackCmp(type) \ 26540e23c03SJunchao Zhang DEF_PackAddNoInit(type,1) \ 26640e23c03SJunchao Zhang DEF_ActionAndOp(Unpack,Max,type,1,PetscMax,FUNCTION_OP,IGNORE,const type,const void) \ 26740e23c03SJunchao Zhang DEF_ActionAndOp(Unpack,Min,type,1,PetscMin,FUNCTION_OP,IGNORE,const type,const void) \ 26840e23c03SJunchao Zhang DEF_ActionAndOp(Fetch, Max,type,1,PetscMax,FUNCTION_OP,EXECUTE,type,void) \ 26940e23c03SJunchao Zhang DEF_ActionAndOp(Fetch, Min,type,1,PetscMin,FUNCTION_OP,EXECUTE,type,void) \ 27040e23c03SJunchao Zhang static void CPPJoin2(PackInit_,type)(PetscSFPack link) { \ 27140e23c03SJunchao Zhang link->Pack = CPPJoin3_(Pack_, type,1); \ 27240e23c03SJunchao Zhang link->UnpackAndInsert = CPPJoin3_(UnpackAndInsert_,type,1); \ 27340e23c03SJunchao Zhang link->UnpackAndAdd = CPPJoin3_(UnpackAndAdd_, type,1); \ 27440e23c03SJunchao Zhang link->UnpackAndMult = CPPJoin3_(UnpackAndMult_, type,1); \ 27540e23c03SJunchao Zhang link->UnpackAndMax = CPPJoin3_(UnpackAndMax_, type,1); \ 27640e23c03SJunchao Zhang link->UnpackAndMin = CPPJoin3_(UnpackAndMin_, type,1); \ 27740e23c03SJunchao Zhang link->FetchAndInsert = CPPJoin3_(FetchAndInsert_, type,1); \ 27840e23c03SJunchao Zhang link->FetchAndAdd = CPPJoin3_(FetchAndAdd_ , type,1); \ 27940e23c03SJunchao Zhang link->FetchAndMult = CPPJoin3_(FetchAndMult_, type,1); \ 28040e23c03SJunchao Zhang link->FetchAndMax = CPPJoin3_(FetchAndMax_ , type,1); \ 28140e23c03SJunchao Zhang link->FetchAndMin = CPPJoin3_(FetchAndMin_ , type,1); \ 28240e23c03SJunchao Zhang link->unitbytes = sizeof(type); \ 28340e23c03SJunchao Zhang } 28440e23c03SJunchao Zhang 28540e23c03SJunchao Zhang /* Logical Types */ 28640e23c03SJunchao Zhang /* The operator in LXOR_OP should be empty but is &. It is not used. Put here to avoid 28740e23c03SJunchao Zhang the compilation warning "empty macro arguments are undefined in ISO C90" 28840e23c03SJunchao Zhang */ 28940e23c03SJunchao Zhang #define DEF_PackLog(type) \ 29040e23c03SJunchao Zhang DEF_ActionAndOp(Unpack,LAND,type,1,&&,BINARY_OP,IGNORE,const type,const void) \ 29140e23c03SJunchao Zhang DEF_ActionAndOp(Unpack,LOR, type,1,||,BINARY_OP,IGNORE,const type,const void) \ 29240e23c03SJunchao Zhang DEF_ActionAndOp(Unpack,LXOR,type,1,&, LXOR_OP, IGNORE,const type,const void) \ 29340e23c03SJunchao Zhang DEF_ActionAndOp(Fetch, LAND,type,1,&&,BINARY_OP,EXECUTE,type,void) \ 29440e23c03SJunchao Zhang DEF_ActionAndOp(Fetch, LOR, type,1,||,BINARY_OP,EXECUTE,type,void) \ 29540e23c03SJunchao Zhang DEF_ActionAndOp(Fetch, LXOR,type,1,&, LXOR_OP, EXECUTE,type,void) \ 29640e23c03SJunchao Zhang static void CPPJoin2(PackInit_Logical_,type)(PetscSFPack link) { \ 29740e23c03SJunchao Zhang link->UnpackAndLAND = CPPJoin3_(UnpackAndLAND_,type,1); \ 29840e23c03SJunchao Zhang link->UnpackAndLOR = CPPJoin3_(UnpackAndLOR_, type,1); \ 29940e23c03SJunchao Zhang link->UnpackAndLXOR = CPPJoin3_(UnpackAndLXOR_,type,1); \ 30040e23c03SJunchao Zhang link->FetchAndLAND = CPPJoin3_(FetchAndLAND_, type,1); \ 30140e23c03SJunchao Zhang link->FetchAndLOR = CPPJoin3_(FetchAndLOR_, type,1); \ 30240e23c03SJunchao Zhang link->FetchAndLXOR = CPPJoin3_(FetchAndLXOR_, type,1); \ 30340e23c03SJunchao Zhang } 30440e23c03SJunchao Zhang 30540e23c03SJunchao Zhang 30640e23c03SJunchao Zhang /* Bitwise Types */ 30740e23c03SJunchao Zhang #define DEF_PackBit(type) \ 30840e23c03SJunchao Zhang DEF_ActionAndOp(Unpack,BAND,type,1,&,BINARY_OP,IGNORE,const type,const void) \ 30940e23c03SJunchao Zhang DEF_ActionAndOp(Unpack,BOR, type,1,|,BINARY_OP,IGNORE,const type,const void) \ 31040e23c03SJunchao Zhang DEF_ActionAndOp(Unpack,BXOR,type,1,^,BINARY_OP,IGNORE,const type,const void) \ 31140e23c03SJunchao Zhang DEF_ActionAndOp(Fetch, BAND,type,1,&,BINARY_OP,EXECUTE,type,void) \ 31240e23c03SJunchao Zhang DEF_ActionAndOp(Fetch, BOR, type,1,|,BINARY_OP,EXECUTE,type,void) \ 31340e23c03SJunchao Zhang DEF_ActionAndOp(Fetch, BXOR,type,1,^,BINARY_OP,EXECUTE,type,void) \ 31440e23c03SJunchao Zhang static void CPPJoin2(PackInit_Bitwise_,type)(PetscSFPack link) { \ 31540e23c03SJunchao Zhang link->UnpackAndBAND = CPPJoin3_(UnpackAndBAND_,type,1); \ 31640e23c03SJunchao Zhang link->UnpackAndBOR = CPPJoin3_(UnpackAndBOR_, type,1); \ 31740e23c03SJunchao Zhang link->UnpackAndBXOR = CPPJoin3_(UnpackAndBXOR_,type,1); \ 31840e23c03SJunchao Zhang link->FetchAndBAND = CPPJoin3_(FetchAndBAND_, type,1); \ 31940e23c03SJunchao Zhang link->FetchAndBOR = CPPJoin3_(FetchAndBOR_, type,1); \ 32040e23c03SJunchao Zhang link->FetchAndBXOR = CPPJoin3_(FetchAndBXOR_, type,1); \ 32140e23c03SJunchao Zhang } 32240e23c03SJunchao Zhang 32340e23c03SJunchao Zhang 32440e23c03SJunchao Zhang /* Pair types */ 32540e23c03SJunchao Zhang #define DEF_PackPair(type1,type2) \ 32640e23c03SJunchao Zhang typedef struct {type1 a; type2 b;} PairType(type1,type2); \ 32740e23c03SJunchao Zhang DEF_PackFunc(PairType(type1,type2),1) \ 32840e23c03SJunchao Zhang DEF_Action(Unpack,PairType(type1,type2),1,IGNORE,const PairType(type1,type2),const void) \ 32940e23c03SJunchao Zhang DEF_Action(Fetch, PairType(type1,type2),1,EXECUTE,PairType(type1,type2),void) \ 33040e23c03SJunchao Zhang DEF_ActionAndOp(Unpack,Add,PairType(type1,type2),1,+,PAIRTYPE_OP,IGNORE,const PairType(type1,type2),const void) \ 33140e23c03SJunchao Zhang DEF_ActionAndOp(Fetch, Add,PairType(type1,type2),1,+,PAIRTYPE_OP,EXECUTE,PairType(type1,type2),void) \ 33240e23c03SJunchao Zhang DEF_ActionAndXloc(Unpack,Max,type1,type2,>,IGNORE,const PairType(type1,type2),const void) \ 33340e23c03SJunchao Zhang DEF_ActionAndXloc(Unpack,Min,type1,type2,<,IGNORE,const PairType(type1,type2),const void) \ 33440e23c03SJunchao Zhang DEF_ActionAndXloc(Fetch, Max,type1,type2,>,EXECUTE,PairType(type1,type2),void) \ 33540e23c03SJunchao Zhang DEF_ActionAndXloc(Fetch, Min,type1,type2,<,EXECUTE,PairType(type1,type2),void) \ 33640e23c03SJunchao Zhang static void CPPJoin3_(PackInit_,type1,type2)(PetscSFPack link) { \ 33740e23c03SJunchao Zhang link->Pack = CPPJoin3_(Pack_, PairType(type1,type2),1); \ 33840e23c03SJunchao Zhang link->UnpackAndInsert = CPPJoin3_(UnpackAndInsert_,PairType(type1,type2),1); \ 33940e23c03SJunchao Zhang link->UnpackAndAdd = CPPJoin3_(UnpackAndAdd_, PairType(type1,type2),1); \ 34040e23c03SJunchao Zhang link->UnpackAndMaxloc = CPPJoin3_(UnpackAndMaxloc_,PairType(type1,type2),1); \ 34140e23c03SJunchao Zhang link->UnpackAndMinloc = CPPJoin3_(UnpackAndMinloc_,PairType(type1,type2),1); \ 34240e23c03SJunchao Zhang link->FetchAndInsert = CPPJoin3_(FetchAndInsert_, PairType(type1,type2),1); \ 34340e23c03SJunchao Zhang link->FetchAndAdd = CPPJoin3_(FetchAndAdd_, PairType(type1,type2),1); \ 34440e23c03SJunchao Zhang link->FetchAndMaxloc = CPPJoin3_(FetchAndMaxloc_, PairType(type1,type2),1); \ 34540e23c03SJunchao Zhang link->FetchAndMinloc = CPPJoin3_(FetchAndMinloc_, PairType(type1,type2),1); \ 34640e23c03SJunchao Zhang link->unitbytes = sizeof(PairType(type1,type2)); \ 34740e23c03SJunchao Zhang } 34840e23c03SJunchao Zhang 34940e23c03SJunchao Zhang 35040e23c03SJunchao Zhang /* Currently only dumb blocks of data */ 35140e23c03SJunchao Zhang #define DEF_Block(type,count) \ 35240e23c03SJunchao Zhang typedef struct {type v[count];} BlockType(type,count); \ 35340e23c03SJunchao Zhang DEF_PackNoInit(BlockType(type,count),1) \ 35440e23c03SJunchao Zhang static void CPPJoin3_(PackInit_block_,type,count)(PetscSFPack link) { \ 35540e23c03SJunchao Zhang link->Pack = CPPJoin3_(Pack_, BlockType(type,count),1); \ 35640e23c03SJunchao Zhang link->UnpackAndInsert = CPPJoin3_(UnpackAndInsert_,BlockType(type,count),1); \ 35740e23c03SJunchao Zhang link->FetchAndInsert = CPPJoin3_(FetchAndInsert_, BlockType(type,count),1); \ 35840e23c03SJunchao Zhang link->unitbytes = sizeof(BlockType(type,count)); \ 35940e23c03SJunchao Zhang } 36040e23c03SJunchao Zhang 36140e23c03SJunchao Zhang /* The typedef is used to get a typename without space that CPPJoin can handle */ 36240e23c03SJunchao Zhang typedef signed char SignedChar; 36340e23c03SJunchao Zhang typedef unsigned char UnsignedChar; 36440e23c03SJunchao Zhang 36540e23c03SJunchao Zhang DEF_PackCmp(SignedChar) 36640e23c03SJunchao Zhang DEF_PackBit(SignedChar) 36740e23c03SJunchao Zhang DEF_PackLog(SignedChar) 36840e23c03SJunchao Zhang DEF_PackCmp(UnsignedChar) 36940e23c03SJunchao Zhang DEF_PackBit(UnsignedChar) 37040e23c03SJunchao Zhang DEF_PackLog(UnsignedChar) 37140e23c03SJunchao Zhang DEF_PackCmp(int) 37240e23c03SJunchao Zhang DEF_PackBit(int) 37340e23c03SJunchao Zhang DEF_PackLog(int) 37440e23c03SJunchao Zhang DEF_PackCmp(PetscInt) 37540e23c03SJunchao Zhang DEF_PackBit(PetscInt) 37640e23c03SJunchao Zhang DEF_PackLog(PetscInt) 37740e23c03SJunchao Zhang DEF_Pack(PetscInt,2) 37840e23c03SJunchao Zhang DEF_Pack(PetscInt,3) 37940e23c03SJunchao Zhang DEF_Pack(PetscInt,4) 38040e23c03SJunchao Zhang DEF_Pack(PetscInt,5) 38140e23c03SJunchao Zhang DEF_Pack(PetscInt,7) 38240e23c03SJunchao Zhang DEF_PackCmp(PetscReal) 38340e23c03SJunchao Zhang DEF_PackLog(PetscReal) 38440e23c03SJunchao Zhang DEF_Pack(PetscReal,2) 38540e23c03SJunchao Zhang DEF_Pack(PetscReal,3) 38640e23c03SJunchao Zhang DEF_Pack(PetscReal,4) 38740e23c03SJunchao Zhang DEF_Pack(PetscReal,5) 38840e23c03SJunchao Zhang DEF_Pack(PetscReal,7) 38940e23c03SJunchao Zhang #if defined(PETSC_HAVE_COMPLEX) 39040e23c03SJunchao Zhang DEF_Pack(PetscComplex,1) 39140e23c03SJunchao Zhang DEF_Pack(PetscComplex,2) 39240e23c03SJunchao Zhang DEF_Pack(PetscComplex,3) 39340e23c03SJunchao Zhang DEF_Pack(PetscComplex,4) 39440e23c03SJunchao Zhang DEF_Pack(PetscComplex,5) 39540e23c03SJunchao Zhang DEF_Pack(PetscComplex,7) 39640e23c03SJunchao Zhang #endif 39740e23c03SJunchao Zhang DEF_PackPair(int,int) 39840e23c03SJunchao Zhang DEF_PackPair(PetscInt,PetscInt) 39940e23c03SJunchao Zhang DEF_Block(int,1) 40040e23c03SJunchao Zhang DEF_Block(int,2) 40140e23c03SJunchao Zhang DEF_Block(int,4) 40240e23c03SJunchao Zhang DEF_Block(int,8) 40340e23c03SJunchao Zhang DEF_Block(char,1) 40440e23c03SJunchao Zhang DEF_Block(char,2) 40540e23c03SJunchao Zhang DEF_Block(char,4) 40640e23c03SJunchao Zhang 40740e23c03SJunchao Zhang #if !defined(PETSC_HAVE_MPI_TYPE_DUP) 40840e23c03SJunchao Zhang PETSC_STATIC_INLINE int MPI_Type_dup(MPI_Datatype datatype,MPI_Datatype *newtype) 40940e23c03SJunchao Zhang { 41040e23c03SJunchao Zhang int ierr; 41140e23c03SJunchao Zhang ierr = MPI_Type_contiguous(1,datatype,newtype); if (ierr) return ierr; 41240e23c03SJunchao Zhang ierr = MPI_Type_commit(newtype); if (ierr) return ierr; 41340e23c03SJunchao Zhang return MPI_SUCCESS; 41440e23c03SJunchao Zhang } 41540e23c03SJunchao Zhang #endif 41640e23c03SJunchao Zhang 4179d1c8addSJunchao Zhang PetscErrorCode PetscSFPackGetInUse(PetscSF sf,MPI_Datatype unit,const void *rkey,const void *lkey,PetscCopyMode cmode,PetscSFPack *mylink) 41840e23c03SJunchao Zhang { 41940e23c03SJunchao Zhang PetscErrorCode ierr; 42040e23c03SJunchao Zhang PetscSFPack link,*p; 42140e23c03SJunchao Zhang PetscSF_Basic *bas=(PetscSF_Basic*)sf->data; 42240e23c03SJunchao Zhang 42340e23c03SJunchao Zhang PetscFunctionBegin; 42440e23c03SJunchao Zhang /* Look for types in cache */ 42540e23c03SJunchao Zhang for (p=&bas->inuse; (link=*p); p=&link->next) { 42640e23c03SJunchao Zhang PetscBool match; 42740e23c03SJunchao Zhang ierr = MPIPetsc_Type_compare(unit,link->unit,&match);CHKERRQ(ierr); 4289d1c8addSJunchao Zhang if (match && (rkey == link->rkey) && (lkey == link->lkey)) { 42940e23c03SJunchao Zhang switch (cmode) { 43040e23c03SJunchao Zhang case PETSC_OWN_POINTER: *p = link->next; break; /* Remove from inuse list */ 43140e23c03SJunchao Zhang case PETSC_USE_POINTER: break; 43240e23c03SJunchao Zhang default: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"invalid cmode"); 43340e23c03SJunchao Zhang } 43440e23c03SJunchao Zhang *mylink = link; 43540e23c03SJunchao Zhang PetscFunctionReturn(0); 43640e23c03SJunchao Zhang } 43740e23c03SJunchao Zhang } 43840e23c03SJunchao Zhang SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Could not find pack"); 43940e23c03SJunchao Zhang PetscFunctionReturn(0); 44040e23c03SJunchao Zhang } 44140e23c03SJunchao Zhang 44240e23c03SJunchao Zhang PetscErrorCode PetscSFPackReclaim(PetscSF sf,PetscSFPack *link) 44340e23c03SJunchao Zhang { 44440e23c03SJunchao Zhang PetscSF_Basic *bas=(PetscSF_Basic*)sf->data; 44540e23c03SJunchao Zhang 44640e23c03SJunchao Zhang PetscFunctionBegin; 4479d1c8addSJunchao Zhang (*link)->rkey = NULL; 4489d1c8addSJunchao Zhang (*link)->lkey = NULL; 44940e23c03SJunchao Zhang (*link)->next = bas->avail; 45040e23c03SJunchao Zhang bas->avail = *link; 45140e23c03SJunchao Zhang *link = NULL; 45240e23c03SJunchao Zhang PetscFunctionReturn(0); 45340e23c03SJunchao Zhang } 45440e23c03SJunchao Zhang 4559d1c8addSJunchao Zhang /* Error out on unsupported overlapped communications */ 4569d1c8addSJunchao Zhang PetscErrorCode PetscSFPackSetErrorOnUnsupportedOverlap(PetscSF sf,MPI_Datatype unit,const void *rkey,const void *lkey) 4579d1c8addSJunchao Zhang { 4589d1c8addSJunchao Zhang PetscErrorCode ierr; 4599d1c8addSJunchao Zhang PetscSFPack link,*p; 4609d1c8addSJunchao Zhang PetscSF_Basic *bas=(PetscSF_Basic*)sf->data; 4619d1c8addSJunchao Zhang PetscBool match; 4629d1c8addSJunchao Zhang 4639d1c8addSJunchao Zhang PetscFunctionBegin; 464*18fb5014SJunchao Zhang /* Look up links in use and error out if there is a match. When both rootdata and leafdata are NULL, ignore 465*18fb5014SJunchao Zhang the potential overlapping since this process does not participate in communication. Overlapping is harmless. 466*18fb5014SJunchao Zhang */ 467*18fb5014SJunchao Zhang if (rkey || lkey) { 4689d1c8addSJunchao Zhang for (p=&bas->inuse; (link=*p); p=&link->next) { 4699d1c8addSJunchao Zhang ierr = MPIPetsc_Type_compare(unit,link->unit,&match);CHKERRQ(ierr); 47033c49614SJunchao Zhang if (match && (rkey == link->rkey) && (lkey == link->lkey)) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for overlapped PetscSF communications with the same SF, rootdata(%p), leafdata(%p) and data type. You can undo the overlap to avoid the error.",rkey,lkey); 4719d1c8addSJunchao Zhang } 472*18fb5014SJunchao Zhang } 4739d1c8addSJunchao Zhang PetscFunctionReturn(0); 4749d1c8addSJunchao Zhang } 4759d1c8addSJunchao Zhang 47640e23c03SJunchao Zhang PetscErrorCode PetscSFPackSetupType(PetscSFPack link,MPI_Datatype unit) 47740e23c03SJunchao Zhang { 47840e23c03SJunchao Zhang PetscErrorCode ierr; 47940e23c03SJunchao Zhang PetscBool isInt,isPetscInt,isPetscReal,is2Int,is2PetscInt,isSignedChar,isUnsignedChar; 48040e23c03SJunchao Zhang PetscInt nPetscIntContig,nPetscRealContig; 48140e23c03SJunchao Zhang PetscMPIInt ni,na,nd,combiner; 48240e23c03SJunchao Zhang #if defined(PETSC_HAVE_COMPLEX) 48340e23c03SJunchao Zhang PetscBool isPetscComplex; 48440e23c03SJunchao Zhang PetscInt nPetscComplexContig; 48540e23c03SJunchao Zhang #endif 48640e23c03SJunchao Zhang 48740e23c03SJunchao Zhang PetscFunctionBegin; 48840e23c03SJunchao Zhang ierr = MPIPetsc_Type_compare(unit,MPI_SIGNED_CHAR,&isSignedChar);CHKERRQ(ierr); 48940e23c03SJunchao Zhang ierr = MPIPetsc_Type_compare(unit,MPI_UNSIGNED_CHAR,&isUnsignedChar);CHKERRQ(ierr); 49040e23c03SJunchao Zhang /* MPI_CHAR is treated below as a dumb block type that does not support reduction according to MPI standard */ 49140e23c03SJunchao Zhang ierr = MPIPetsc_Type_compare(unit,MPI_INT,&isInt);CHKERRQ(ierr); 49240e23c03SJunchao Zhang ierr = MPIPetsc_Type_compare(unit,MPIU_INT,&isPetscInt);CHKERRQ(ierr); 49340e23c03SJunchao Zhang ierr = MPIPetsc_Type_compare_contig(unit,MPIU_INT,&nPetscIntContig);CHKERRQ(ierr); 49440e23c03SJunchao Zhang ierr = MPIPetsc_Type_compare(unit,MPIU_REAL,&isPetscReal);CHKERRQ(ierr); 49540e23c03SJunchao Zhang ierr = MPIPetsc_Type_compare_contig(unit,MPIU_REAL,&nPetscRealContig);CHKERRQ(ierr); 49640e23c03SJunchao Zhang #if defined(PETSC_HAVE_COMPLEX) 49740e23c03SJunchao Zhang ierr = MPIPetsc_Type_compare(unit,MPIU_COMPLEX,&isPetscComplex);CHKERRQ(ierr); 49840e23c03SJunchao Zhang ierr = MPIPetsc_Type_compare_contig(unit,MPIU_COMPLEX,&nPetscComplexContig);CHKERRQ(ierr); 49940e23c03SJunchao Zhang #endif 50040e23c03SJunchao Zhang ierr = MPIPetsc_Type_compare(unit,MPI_2INT,&is2Int);CHKERRQ(ierr); 50140e23c03SJunchao Zhang ierr = MPIPetsc_Type_compare(unit,MPIU_2INT,&is2PetscInt);CHKERRQ(ierr); 50240e23c03SJunchao Zhang ierr = MPI_Type_get_envelope(unit,&ni,&na,&nd,&combiner);CHKERRQ(ierr); 50340e23c03SJunchao Zhang link->isbuiltin = (combiner == MPI_COMBINER_NAMED) ? PETSC_TRUE : PETSC_FALSE; 50440e23c03SJunchao Zhang link->bs = 1; 50540e23c03SJunchao Zhang 50640e23c03SJunchao Zhang if (isSignedChar) {PackInit_SignedChar(link); PackInit_Logical_SignedChar(link); PackInit_Bitwise_SignedChar(link); link->basicunit = MPI_SIGNED_CHAR;} 50740e23c03SJunchao Zhang else if (isUnsignedChar) {PackInit_UnsignedChar(link); PackInit_Logical_UnsignedChar(link); PackInit_Bitwise_UnsignedChar(link); link->basicunit = MPI_UNSIGNED_CHAR;} 50840e23c03SJunchao Zhang else if (isInt) {PackInit_int(link); PackInit_Logical_int(link); PackInit_Bitwise_int(link); link->basicunit = MPI_INT;} 50940e23c03SJunchao Zhang else if (isPetscInt) {PackInit_PetscInt(link); PackInit_Logical_PetscInt(link); PackInit_Bitwise_PetscInt(link); link->basicunit = MPIU_INT;} 51040e23c03SJunchao Zhang else if (isPetscReal) {PackInit_PetscReal(link); PackInit_Logical_PetscReal(link); link->basicunit = MPIU_REAL;} 51140e23c03SJunchao Zhang #if defined(PETSC_HAVE_COMPLEX) 51240e23c03SJunchao Zhang else if (isPetscComplex) {PackInit_PetscComplex_1(link); link->basicunit = MPIU_COMPLEX;} 51340e23c03SJunchao Zhang #endif 51440e23c03SJunchao Zhang else if (is2Int) {PackInit_int_int(link); link->basicunit = MPI_2INT;} 51540e23c03SJunchao Zhang else if (is2PetscInt) {PackInit_PetscInt_PetscInt(link); link->basicunit = MPIU_2INT;} 51640e23c03SJunchao Zhang else if (nPetscIntContig) { 51740e23c03SJunchao Zhang if (nPetscIntContig%7 == 0) PackInit_PetscInt_7(link); 51840e23c03SJunchao Zhang else if (nPetscIntContig%5 == 0) PackInit_PetscInt_5(link); 51940e23c03SJunchao Zhang else if (nPetscIntContig%4 == 0) PackInit_PetscInt_4(link); 52040e23c03SJunchao Zhang else if (nPetscIntContig%3 == 0) PackInit_PetscInt_3(link); 52140e23c03SJunchao Zhang else if (nPetscIntContig%2 == 0) PackInit_PetscInt_2(link); 52240e23c03SJunchao Zhang else PackInit_PetscInt(link); 52340e23c03SJunchao Zhang link->bs = nPetscIntContig; 52440e23c03SJunchao Zhang link->unitbytes *= nPetscIntContig; 52540e23c03SJunchao Zhang link->basicunit = MPIU_INT; 52640e23c03SJunchao Zhang } else if (nPetscRealContig) { 52740e23c03SJunchao Zhang if (nPetscRealContig%7 == 0) PackInit_PetscReal_7(link); 52840e23c03SJunchao Zhang else if (nPetscRealContig%5 == 0) PackInit_PetscReal_5(link); 52940e23c03SJunchao Zhang else if (nPetscRealContig%4 == 0) PackInit_PetscReal_4(link); 53040e23c03SJunchao Zhang else if (nPetscRealContig%3 == 0) PackInit_PetscReal_3(link); 53140e23c03SJunchao Zhang else if (nPetscRealContig%2 == 0) PackInit_PetscReal_2(link); 53240e23c03SJunchao Zhang else PackInit_PetscReal(link); 53340e23c03SJunchao Zhang link->bs = nPetscRealContig; 53440e23c03SJunchao Zhang link->unitbytes *= nPetscRealContig; 53540e23c03SJunchao Zhang link->basicunit = MPIU_REAL; 53640e23c03SJunchao Zhang #if defined(PETSC_HAVE_COMPLEX) 53740e23c03SJunchao Zhang } else if (nPetscComplexContig) { 53840e23c03SJunchao Zhang if (nPetscComplexContig%7 == 0) PackInit_PetscComplex_7(link); 53940e23c03SJunchao Zhang else if (nPetscComplexContig%5 == 0) PackInit_PetscComplex_5(link); 54040e23c03SJunchao Zhang else if (nPetscComplexContig%4 == 0) PackInit_PetscComplex_4(link); 54140e23c03SJunchao Zhang else if (nPetscComplexContig%3 == 0) PackInit_PetscComplex_3(link); 54240e23c03SJunchao Zhang else if (nPetscComplexContig%2 == 0) PackInit_PetscComplex_2(link); 54340e23c03SJunchao Zhang else PackInit_PetscComplex_1(link); 54440e23c03SJunchao Zhang link->bs = nPetscComplexContig; 54540e23c03SJunchao Zhang link->unitbytes *= nPetscComplexContig; 54640e23c03SJunchao Zhang link->basicunit = MPIU_COMPLEX; 54740e23c03SJunchao Zhang #endif 54840e23c03SJunchao Zhang } else { 54940e23c03SJunchao Zhang MPI_Aint lb,bytes; 55040e23c03SJunchao Zhang ierr = MPI_Type_get_extent(unit,&lb,&bytes);CHKERRQ(ierr); 55140e23c03SJunchao Zhang if (lb != 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Datatype with nonzero lower bound %ld\n",(long)lb); 55240e23c03SJunchao Zhang if (bytes % sizeof(int)) { /* If the type size is not multiple of int */ 55340e23c03SJunchao Zhang if (bytes%4 == 0) {PackInit_block_char_4(link); link->bs = bytes/4;} /* Note the basic type is char[4] */ 55440e23c03SJunchao Zhang else if (bytes%2 == 0) {PackInit_block_char_2(link); link->bs = bytes/2;} 55540e23c03SJunchao Zhang else {PackInit_block_char_1(link); link->bs = bytes/1;} 55640e23c03SJunchao Zhang link->unitbytes = bytes; 55740e23c03SJunchao Zhang link->basicunit = MPI_CHAR; 55840e23c03SJunchao Zhang } else { 55940e23c03SJunchao Zhang PetscInt nInt = bytes / sizeof(int); 56040e23c03SJunchao Zhang if (nInt%8 == 0) {PackInit_block_int_8(link); link->bs = nInt/8;} /* Note the basic type is int[8] */ 56140e23c03SJunchao Zhang else if (nInt%4 == 0) {PackInit_block_int_4(link); link->bs = nInt/4;} 56240e23c03SJunchao Zhang else if (nInt%2 == 0) {PackInit_block_int_2(link); link->bs = nInt/2;} 56340e23c03SJunchao Zhang else {PackInit_block_int_1(link); link->bs = nInt/1;} 56440e23c03SJunchao Zhang link->unitbytes = bytes; 56540e23c03SJunchao Zhang link->basicunit = MPI_INT; 56640e23c03SJunchao Zhang } 56740e23c03SJunchao Zhang } 56840e23c03SJunchao Zhang if (link->isbuiltin) link->unit = unit; /* builtin datatypes are common. Make it fast */ 56940e23c03SJunchao Zhang else {ierr = MPI_Type_dup(unit,&link->unit);CHKERRQ(ierr);} 57040e23c03SJunchao Zhang PetscFunctionReturn(0); 57140e23c03SJunchao Zhang } 57240e23c03SJunchao Zhang 57340e23c03SJunchao Zhang PetscErrorCode PetscSFPackGetUnpackAndOp(PetscSF sf,PetscSFPack link,MPI_Op op,PetscErrorCode (**UnpackAndOp)(PetscInt,PetscInt,const PetscInt*,PetscInt,PetscSFPackOpt,void*,const void*)) 57440e23c03SJunchao Zhang { 57540e23c03SJunchao Zhang PetscFunctionBegin; 57640e23c03SJunchao Zhang *UnpackAndOp = NULL; 57740e23c03SJunchao Zhang if (op == MPIU_REPLACE) *UnpackAndOp = link->UnpackAndInsert; 57840e23c03SJunchao Zhang else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->UnpackAndAdd; 57940e23c03SJunchao Zhang else if (op == MPI_PROD) *UnpackAndOp = link->UnpackAndMult; 58040e23c03SJunchao Zhang else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->UnpackAndMax; 58140e23c03SJunchao Zhang else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->UnpackAndMin; 58240e23c03SJunchao Zhang else if (op == MPI_LAND) *UnpackAndOp = link->UnpackAndLAND; 58340e23c03SJunchao Zhang else if (op == MPI_BAND) *UnpackAndOp = link->UnpackAndBAND; 58440e23c03SJunchao Zhang else if (op == MPI_LOR) *UnpackAndOp = link->UnpackAndLOR; 58540e23c03SJunchao Zhang else if (op == MPI_BOR) *UnpackAndOp = link->UnpackAndBOR; 58640e23c03SJunchao Zhang else if (op == MPI_LXOR) *UnpackAndOp = link->UnpackAndLXOR; 58740e23c03SJunchao Zhang else if (op == MPI_BXOR) *UnpackAndOp = link->UnpackAndBXOR; 58840e23c03SJunchao Zhang else if (op == MPI_MAXLOC) *UnpackAndOp = link->UnpackAndMaxloc; 58940e23c03SJunchao Zhang else if (op == MPI_MINLOC) *UnpackAndOp = link->UnpackAndMinloc; 59040e23c03SJunchao Zhang else *UnpackAndOp = NULL; 59140e23c03SJunchao Zhang PetscFunctionReturn(0); 59240e23c03SJunchao Zhang } 59340e23c03SJunchao Zhang 59440e23c03SJunchao Zhang PetscErrorCode PetscSFPackGetFetchAndOp(PetscSF sf,PetscSFPack link,MPI_Op op,PetscErrorCode (**FetchAndOp)(PetscInt,PetscInt,const PetscInt*,PetscInt,PetscSFPackOpt,void*,void*)) 59540e23c03SJunchao Zhang { 59640e23c03SJunchao Zhang PetscFunctionBegin; 59740e23c03SJunchao Zhang *FetchAndOp = NULL; 59840e23c03SJunchao Zhang if (op == MPIU_REPLACE) *FetchAndOp = link->FetchAndInsert; 59940e23c03SJunchao Zhang else if (op == MPI_SUM || op == MPIU_SUM) *FetchAndOp = link->FetchAndAdd; 60040e23c03SJunchao Zhang else if (op == MPI_MAX || op == MPIU_MAX) *FetchAndOp = link->FetchAndMax; 60140e23c03SJunchao Zhang else if (op == MPI_MIN || op == MPIU_MIN) *FetchAndOp = link->FetchAndMin; 60240e23c03SJunchao Zhang else if (op == MPI_MAXLOC) *FetchAndOp = link->FetchAndMaxloc; 60340e23c03SJunchao Zhang else if (op == MPI_MINLOC) *FetchAndOp = link->FetchAndMinloc; 60440e23c03SJunchao Zhang else if (op == MPI_PROD) *FetchAndOp = link->FetchAndMult; 60540e23c03SJunchao Zhang else if (op == MPI_LAND) *FetchAndOp = link->FetchAndLAND; 60640e23c03SJunchao Zhang else if (op == MPI_BAND) *FetchAndOp = link->FetchAndBAND; 60740e23c03SJunchao Zhang else if (op == MPI_LOR) *FetchAndOp = link->FetchAndLOR; 60840e23c03SJunchao Zhang else if (op == MPI_BOR) *FetchAndOp = link->FetchAndBOR; 60940e23c03SJunchao Zhang else if (op == MPI_LXOR) *FetchAndOp = link->FetchAndLXOR; 61040e23c03SJunchao Zhang else if (op == MPI_BXOR) *FetchAndOp = link->FetchAndBXOR; 61140e23c03SJunchao Zhang else SETERRQ(PetscObjectComm((PetscObject)sf),PETSC_ERR_SUP,"No support for MPI_Op"); 61240e23c03SJunchao Zhang PetscFunctionReturn(0); 61340e23c03SJunchao Zhang } 61440e23c03SJunchao Zhang 61540e23c03SJunchao Zhang /* 61640e23c03SJunchao Zhang Setup pack/unpack optimization plans based on indice patterns available 61740e23c03SJunchao Zhang 61840e23c03SJunchao Zhang Input Parameters: 61940e23c03SJunchao Zhang + n - number of target processors 62040e23c03SJunchao Zhang . offset - [n+1] for the i-th processor, its associated indices are idx[offset[i], offset[i+1]) 62140e23c03SJunchao Zhang - idx - [] array storing indices. Its length is offset[n+1] 62240e23c03SJunchao Zhang 62340e23c03SJunchao Zhang Output Parameters: 62440e23c03SJunchao Zhang + opt - the optimization 62540e23c03SJunchao Zhang */ 62640e23c03SJunchao Zhang PetscErrorCode PetscSFPackSetupOptimization(PetscInt n,const PetscInt *offset,const PetscInt *idx,PetscSFPackOpt *out) 62740e23c03SJunchao Zhang { 62840e23c03SJunchao Zhang PetscErrorCode ierr; 62940e23c03SJunchao Zhang PetscInt i,j,k,n_copies,tot_copies=0,step; 63040e23c03SJunchao Zhang PetscBool strided,has_strided=PETSC_FALSE,has_optimized=PETSC_FALSE; 63140e23c03SJunchao Zhang PetscSFPackOpt opt; 63240e23c03SJunchao Zhang 63340e23c03SJunchao Zhang PetscFunctionBegin; 63440e23c03SJunchao Zhang ierr = PetscCalloc1(1,&opt);CHKERRQ(ierr); 63540e23c03SJunchao Zhang ierr = PetscCalloc2(n,&opt->optimized,n+1,&opt->copy_offset);CHKERRQ(ierr); 63640e23c03SJunchao Zhang 63740e23c03SJunchao Zhang /* Check if the indices are piece-wise contiguous (if yes, we can optimize a packing with mulitple memcpy's ) */ 63840e23c03SJunchao Zhang for (i=0; i<n; i++) { /* for each target processor */ 63940e23c03SJunchao Zhang /* Scan indices to count n_copies -- the number of contiguous pieces for i-th target */ 64040e23c03SJunchao Zhang n_copies = 1; 64140e23c03SJunchao Zhang for (j=offset[i]; j<offset[i+1]-1; j++) { 64240e23c03SJunchao Zhang if (idx[j]+1 != idx[j+1]) n_copies++; 64340e23c03SJunchao Zhang } 64440e23c03SJunchao Zhang /* If the average length (in no. of indices) of contiguous pieces is long enough, say >=32, 64540e23c03SJunchao Zhang then it is worth using memcpy for this target. 32 is an arbitrarily chosen number. 64640e23c03SJunchao Zhang */ 64740e23c03SJunchao Zhang if ((offset[i+1]-offset[i])/n_copies >= 32) { 64840e23c03SJunchao Zhang opt->optimized[i] = PETSC_TRUE; 64940e23c03SJunchao Zhang has_optimized = PETSC_TRUE; 65040e23c03SJunchao Zhang tot_copies += n_copies; 65140e23c03SJunchao Zhang } 65240e23c03SJunchao Zhang } 65340e23c03SJunchao Zhang 65440e23c03SJunchao Zhang /* Setup memcpy plan for each contiguous piece */ 65540e23c03SJunchao Zhang k = 0; /* k-th copy */ 65640e23c03SJunchao Zhang ierr = PetscMalloc2(tot_copies,&opt->copy_start,tot_copies,&opt->copy_length);CHKERRQ(ierr); 65740e23c03SJunchao Zhang for (i=0; i<n; i++) { /* for each target processor procs[i] */ 65840e23c03SJunchao Zhang if (opt->optimized[i]) { 65940e23c03SJunchao Zhang n_copies = 1; 66040e23c03SJunchao Zhang opt->copy_start[k] = idx[offset[i]]; 66140e23c03SJunchao Zhang for (j=offset[i]; j<offset[i+1]-1; j++) { 66240e23c03SJunchao Zhang if (idx[j]+1 != idx[j+1]) { /* meet end of a copy (and next copy must exist) */ 66340e23c03SJunchao Zhang n_copies++; 66440e23c03SJunchao Zhang opt->copy_start[k+1] = idx[j+1]; 66540e23c03SJunchao Zhang opt->copy_length[k] = idx[j] - opt->copy_start[k] + 1; 66640e23c03SJunchao Zhang k++; 66740e23c03SJunchao Zhang } 66840e23c03SJunchao Zhang } 66940e23c03SJunchao Zhang /* Set copy length of the last copy for this target */ 67040e23c03SJunchao Zhang opt->copy_length[k] = idx[j] - opt->copy_start[k] + 1; 67140e23c03SJunchao Zhang k++; 67240e23c03SJunchao Zhang } 67340e23c03SJunchao Zhang /* Set offset for next target. When optimized[i]=false, copy_offsets[i]=copy_offsets[i+1] */ 67440e23c03SJunchao Zhang opt->copy_offset[i+1] = k; 67540e23c03SJunchao Zhang } 67640e23c03SJunchao Zhang 67740e23c03SJunchao Zhang /* Last chance! If the indices do not have long contiguous pieces, are they strided? */ 67840e23c03SJunchao Zhang ierr = PetscMalloc3(n,&opt->stride_first,n,&opt->stride_step,n,&opt->stride_n);CHKERRQ(ierr); 67940e23c03SJunchao Zhang for (i=0; i<n; i++) { /* for each remote */ 68040e23c03SJunchao Zhang if (!opt->optimized[i] && (offset[i+1] - offset[i]) >= 16) { /* few indices (<16) are not worth striding */ 68140e23c03SJunchao Zhang strided = PETSC_TRUE; 68240e23c03SJunchao Zhang step = idx[offset[i]+1] - idx[offset[i]]; 68340e23c03SJunchao Zhang for (j=offset[i]; j<offset[i+1]-1; j++) { 68440e23c03SJunchao Zhang if (idx[j]+step != idx[j+1]) { strided = PETSC_FALSE; break; } 68540e23c03SJunchao Zhang } 68640e23c03SJunchao Zhang if (strided) { 68740e23c03SJunchao Zhang opt->optimized[i] = PETSC_TRUE; 68840e23c03SJunchao Zhang opt->stride_first[i] = idx[offset[i]]; 68940e23c03SJunchao Zhang opt->stride_step[i] = step; 69040e23c03SJunchao Zhang opt->stride_n[i] = offset[i+1] - offset[i]; 69140e23c03SJunchao Zhang has_strided = PETSC_TRUE; 69240e23c03SJunchao Zhang has_optimized = PETSC_TRUE; 69340e23c03SJunchao Zhang } 69440e23c03SJunchao Zhang } 69540e23c03SJunchao Zhang } 69640e23c03SJunchao Zhang /* If no target has been stride-optimized or optimized, free related arrays to save memory */ 69740e23c03SJunchao Zhang if (!has_strided) {ierr = PetscFree3(opt->stride_first,opt->stride_step,opt->stride_n);CHKERRQ(ierr);} 69840e23c03SJunchao Zhang if (!has_optimized) { 69940e23c03SJunchao Zhang ierr = PetscFree2(opt->optimized,opt->copy_offset);CHKERRQ(ierr); 70040e23c03SJunchao Zhang ierr = PetscFree2(opt->copy_start,opt->copy_length);CHKERRQ(ierr); 70140e23c03SJunchao Zhang ierr = PetscFree(opt);CHKERRQ(ierr); 70240e23c03SJunchao Zhang *out = NULL; 70340e23c03SJunchao Zhang } else *out = opt; 70440e23c03SJunchao Zhang PetscFunctionReturn(0); 70540e23c03SJunchao Zhang } 70640e23c03SJunchao Zhang 70740e23c03SJunchao Zhang PetscErrorCode PetscSFPackDestoryOptimization(PetscSFPackOpt *out) 70840e23c03SJunchao Zhang { 70940e23c03SJunchao Zhang PetscErrorCode ierr; 71040e23c03SJunchao Zhang PetscSFPackOpt opt = *out; 71140e23c03SJunchao Zhang 71240e23c03SJunchao Zhang PetscFunctionBegin; 71340e23c03SJunchao Zhang if (opt) { 71440e23c03SJunchao Zhang ierr = PetscFree2(opt->optimized,opt->copy_offset);CHKERRQ(ierr); 71540e23c03SJunchao Zhang ierr = PetscFree2(opt->copy_start,opt->copy_length);CHKERRQ(ierr); 71640e23c03SJunchao Zhang ierr = PetscFree3(opt->stride_first,opt->stride_step,opt->stride_n);CHKERRQ(ierr); 71740e23c03SJunchao Zhang ierr = PetscFree(opt);CHKERRQ(ierr); 71840e23c03SJunchao Zhang *out = NULL; 71940e23c03SJunchao Zhang } 72040e23c03SJunchao Zhang PetscFunctionReturn(0); 72140e23c03SJunchao Zhang } 722