140e23c03SJunchao Zhang 240e23c03SJunchao Zhang #include <../src/vec/is/sf/impls/basic/sfpack.h> 340e23c03SJunchao Zhang #include <../src/vec/is/sf/impls/basic/sfbasic.h> 440e23c03SJunchao Zhang 5eb02082bSJunchao Zhang #if defined(PETSC_HAVE_CUDA) 6eb02082bSJunchao Zhang #include <cuda_runtime.h> 7eb02082bSJunchao Zhang #endif 840e23c03SJunchao Zhang /* 940e23c03SJunchao Zhang * MPI_Reduce_local is not really useful because it can't handle sparse data and it vectorizes "in the wrong direction", 1040e23c03SJunchao Zhang * therefore we pack data types manually. This file defines packing routines for the standard data types. 1140e23c03SJunchao Zhang */ 1240e23c03SJunchao Zhang 13b23bfdefSJunchao Zhang #define CPPJoin2(a,b) a ##_## b 14b23bfdefSJunchao Zhang #define CPPJoin3(a,b,c) a ##_## b ##_## c 15b23bfdefSJunchao Zhang #define CPPJoin4_(a,b,c,d) a##b##_##c##_##d 16b23bfdefSJunchao Zhang #define CPPJoin4(a,b,c,d) CPPJoin4_(a##_,b,c,d) 1740e23c03SJunchao Zhang 1840e23c03SJunchao Zhang #define EXECUTE(statement) statement /* no braces since the statement might declare a variable; braces impose an unwanted scope */ 1940e23c03SJunchao Zhang #define IGNORE(statement) do {} while(0) 2040e23c03SJunchao Zhang 2140e23c03SJunchao Zhang #define BINARY_OP(r,s,op,t) do {(r) = (s) op (t); } while(0) /* binary ops in the middle such as +, *, && etc. */ 2240e23c03SJunchao Zhang #define FUNCTION_OP(r,s,op,t) do {(r) = op((s),(t)); } while(0) /* ops like a function, such as PetscMax, PetscMin */ 23eb02082bSJunchao Zhang #define LXOR_OP(r,s,op,t) do {(r) = (!(s)) != (!(t));} while(0) /* logical exclusive OR */ 2440e23c03SJunchao Zhang #define PAIRTYPE_OP(r,s,op,t) do {(r).a = (s).a op (t).a; (r).b = (s).b op (t).b;} while(0) 2540e23c03SJunchao Zhang 26b23bfdefSJunchao Zhang #define PairType(Type1,Type2) Type1##_##Type2 /* typename for struct {Type1 a; Type2 b;} */ 2740e23c03SJunchao Zhang 2840e23c03SJunchao Zhang /* DEF_PackFunc - macro defining a Pack routine 2940e23c03SJunchao Zhang 3040e23c03SJunchao Zhang Arguments of the macro: 31b23bfdefSJunchao Zhang +Type Type of the basic data in an entry, i.e., int, PetscInt, PetscReal etc. It is not the type of an entry. 32b23bfdefSJunchao Zhang .BS Block size for vectorization. It is a factor of bs. 33b23bfdefSJunchao Zhang -EQ (bs == BS) ? 1 : 0. EQ is a compile-time const to help compiler optimizations. See below. 3440e23c03SJunchao Zhang 3540e23c03SJunchao Zhang Arguments of the Pack routine: 36b23bfdefSJunchao Zhang +count Number of indices in idx[] 37b23bfdefSJunchao Zhang .idx Indices of entries to packed. NULL means contiguous indices, that is [0,count) 38eb02082bSJunchao Zhang .link Provide a context for the current call, such as link->bs, number of basic types in an entry. Ex. if unit is MPI_2INT, then bs=2 and the basic type is int. 39b23bfdefSJunchao Zhang .opt Pack optimization plans. NULL means no plan at all. 40b23bfdefSJunchao Zhang .unpacked Address of the unpacked data. The entries will be packed are unpacked[idx[i]],for i in [0,count) 41b23bfdefSJunchao Zhang -packed Address of the packed data for each rank 4240e23c03SJunchao Zhang */ 43b23bfdefSJunchao Zhang #define DEF_PackFunc(Type,BS,EQ) \ 44eb02082bSJunchao Zhang static PetscErrorCode CPPJoin4(Pack,Type,BS,EQ)(PetscInt count,const PetscInt *idx,PetscSFPack link,PetscSFPackOpt opt,const void *unpacked,void *packed) \ 45b23bfdefSJunchao Zhang { \ 4640e23c03SJunchao Zhang PetscErrorCode ierr; \ 47b23bfdefSJunchao Zhang const Type *u = (const Type*)unpacked,*u2; \ 48b23bfdefSJunchao Zhang Type *p = (Type*)packed,*p2; \ 49eb02082bSJunchao Zhang PetscInt i,j,k,l,r,step,bs=link->bs; \ 50b23bfdefSJunchao Zhang const PetscInt *idx2,M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */ \ 51b23bfdefSJunchao Zhang const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */ \ 5240e23c03SJunchao Zhang PetscFunctionBegin; \ 53b23bfdefSJunchao Zhang if (!idx) {ierr = PetscArraycpy(p,u,MBS*count);CHKERRQ(ierr);} /* Indices are contiguous */ \ 54b23bfdefSJunchao Zhang else if (!opt) { /* No optimizations available */ \ 55b23bfdefSJunchao Zhang for (i=0; i<count; i++) \ 56eb02082bSJunchao Zhang for (j=0; j<M; j++) /* Decent compilers should eliminate this loop when M = const 1 */ \ 57eb02082bSJunchao Zhang for (k=0; k<BS; k++) /* Compiler either unrolls (BS=1) or vectorizes (BS=2,4,8,etc) this loop */ \ 58b23bfdefSJunchao Zhang p[i*MBS+j*BS+k] = u[idx[i]*MBS+j*BS+k]; \ 59b23bfdefSJunchao Zhang } else { \ 60b23bfdefSJunchao Zhang for (r=0; r<opt->n; r++) { \ 61b23bfdefSJunchao Zhang p2 = p + opt->offset[r]*MBS; \ 62b23bfdefSJunchao Zhang if (opt->type[r] == PETSCSF_PACKOPT_NONE) { \ 63b23bfdefSJunchao Zhang idx2 = idx + opt->offset[r]; \ 64b23bfdefSJunchao Zhang for (i=0; i<opt->offset[r+1]-opt->offset[r]; i++) \ 65b23bfdefSJunchao Zhang for (j=0; j<M; j++) \ 66b23bfdefSJunchao Zhang for (k=0; k<BS; k++) \ 67b23bfdefSJunchao Zhang p2[i*MBS+j*BS+k] = u[idx2[i]*MBS+j*BS+k]; \ 68b23bfdefSJunchao Zhang } else if (opt->type[r] == PETSCSF_PACKOPT_MULTICOPY) { \ 6940e23c03SJunchao Zhang for (i=opt->copy_offset[r]; i<opt->copy_offset[r+1]; i++) { \ 70b23bfdefSJunchao Zhang u2 = u + idx[opt->copy_start[i]]*MBS; \ 71b23bfdefSJunchao Zhang l = opt->copy_length[i]*MBS; /* length in basic type such as MPI_INT */ \ 72b23bfdefSJunchao Zhang ierr = PetscArraycpy(p2,u2,l);CHKERRQ(ierr); \ 73b23bfdefSJunchao Zhang p2 += l; \ 7440e23c03SJunchao Zhang } \ 75b23bfdefSJunchao Zhang } else if (opt->type[r] == PETSCSF_PACKOPT_STRIDE) { \ 76b23bfdefSJunchao Zhang u2 = u + idx[opt->offset[r]]*MBS; \ 7740e23c03SJunchao Zhang step = opt->stride_step[r]; \ 7840e23c03SJunchao Zhang for (i=0; i<opt->stride_n[r]; i++) \ 79b23bfdefSJunchao Zhang for (j=0; j<MBS; j++) p2[i*MBS+j] = u2[i*step*MBS+j]; \ 80b23bfdefSJunchao Zhang } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Unknown SFPack optimzation type %D",opt->type[r]); \ 8140e23c03SJunchao Zhang } \ 8240e23c03SJunchao Zhang } \ 8340e23c03SJunchao Zhang PetscFunctionReturn(0); \ 8440e23c03SJunchao Zhang } 8540e23c03SJunchao Zhang 8640e23c03SJunchao Zhang /* DEF_Action - macro defining a Unpack(Fetch)AndInsert routine 8740e23c03SJunchao Zhang 8840e23c03SJunchao Zhang Arguments: 8940e23c03SJunchao Zhang +action Unpack or Fetch 90b23bfdefSJunchao Zhang .Type Type of the data 9140e23c03SJunchao Zhang .BS Block size for vectorization 92b23bfdefSJunchao Zhang .EQ (bs == BS) ? 1 : 0. EQ is a compile-time const. 9340e23c03SJunchao Zhang .FILTER Macro defining what to do with a statement, either EXECUTE or IGNORE 94b23bfdefSJunchao Zhang .CType Type with or without the const qualifier, i.e., const Type or Type 95b23bfdefSJunchao Zhang .Cvoid void with or without the const qualifier, i.e., const void or void 9640e23c03SJunchao Zhang 9740e23c03SJunchao Zhang Notes: 9840e23c03SJunchao Zhang This macro is not combined with DEF_ActionAndOp because we want to use memcpy in this macro. 99b23bfdefSJunchao Zhang The two arguments CType and Cvoid are used (instead of one constness argument), because we want to 10040e23c03SJunchao Zhang get rid of compilation warning "empty macro arguments are undefined in ISO C90". With one constness argument, 10140e23c03SJunchao Zhang sometimes we input 'const', sometimes we have to input empty. 102b23bfdefSJunchao Zhang 103b23bfdefSJunchao Zhang If action is Fetch, we may do Malloc/Free in the routine. It is costly but the expectation is that this case is really rare. 10440e23c03SJunchao Zhang */ 105b23bfdefSJunchao Zhang #define DEF_Action(action,Type,BS,EQ,FILTER,CType,Cvoid) \ 106eb02082bSJunchao Zhang static PetscErrorCode CPPJoin4(action##AndInsert,Type,BS,EQ)(PetscInt count,const PetscInt *idx,PetscSFPack link,PetscSFPackOpt opt,void *unpacked,Cvoid *packed) \ 107b23bfdefSJunchao Zhang { \ 10840e23c03SJunchao Zhang PetscErrorCode ierr; \ 109b23bfdefSJunchao Zhang Type *u = (Type*)unpacked,*u2; \ 110b23bfdefSJunchao Zhang CType *p = (CType*)packed,*p2; \ 111eb02082bSJunchao Zhang PetscInt i,j,k,l,r,step,bs=link->bs; \ 112b23bfdefSJunchao Zhang const PetscInt *idx2,M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */ \ 113b23bfdefSJunchao Zhang const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */ \ 11440e23c03SJunchao Zhang PetscFunctionBegin; \ 115b23bfdefSJunchao Zhang if (!idx) { \ 116b23bfdefSJunchao Zhang FILTER(Type *v); \ 117b23bfdefSJunchao Zhang FILTER(ierr = PetscMalloc1(count*MBS,&v);CHKERRQ(ierr)); \ 118b23bfdefSJunchao Zhang FILTER(ierr = PetscArraycpy(v,u,count*MBS);CHKERRQ(ierr)); \ 119b23bfdefSJunchao Zhang ierr = PetscArraycpy(u,p,count*MBS);CHKERRQ(ierr); \ 120b23bfdefSJunchao Zhang FILTER(ierr = PetscArraycpy(p,v,count*MBS);CHKERRQ(ierr)); \ 12140e23c03SJunchao Zhang FILTER(ierr = PetscFree(v);CHKERRQ(ierr)); \ 122b23bfdefSJunchao Zhang } else if (!opt) { /* No optimizations available */ \ 123b23bfdefSJunchao Zhang for (i=0; i<count; i++) \ 124b23bfdefSJunchao Zhang for (j=0; j<M; j++) \ 125b23bfdefSJunchao Zhang for (k=0; k<BS; k++) { \ 126b23bfdefSJunchao Zhang FILTER(Type t = u[idx[i]*MBS+j*BS+k]); \ 127b23bfdefSJunchao Zhang u[idx[i]*MBS+j*BS+k] = p[i*MBS+j*BS+k]; \ 128b23bfdefSJunchao Zhang FILTER(p[i*MBS+j*BS+k] = t); \ 12940e23c03SJunchao Zhang } \ 130b23bfdefSJunchao Zhang } else { \ 131b23bfdefSJunchao Zhang for (r=0; r<opt->n; r++) { \ 132b23bfdefSJunchao Zhang p2 = p + opt->offset[r]*MBS; \ 133b23bfdefSJunchao Zhang if (opt->type[r] == PETSCSF_PACKOPT_NONE) { \ 134b23bfdefSJunchao Zhang idx2 = idx + opt->offset[r]; \ 135b23bfdefSJunchao Zhang for (i=0; i<opt->offset[r+1]-opt->offset[r]; i++) \ 136b23bfdefSJunchao Zhang for (j=0; j<M; j++) \ 137b23bfdefSJunchao Zhang for (k=0; k<BS; k++) { \ 138b23bfdefSJunchao Zhang FILTER(Type t = u[idx2[i]*MBS+j*BS+k]); \ 139b23bfdefSJunchao Zhang u[idx2[i]*MBS+j*BS+k] = p2[i*MBS+j*BS+k]; \ 140b23bfdefSJunchao Zhang FILTER(p2[i*MBS+j*BS+k] = t); \ 14140e23c03SJunchao Zhang } \ 142b23bfdefSJunchao Zhang } else if (opt->type[r] == PETSCSF_PACKOPT_MULTICOPY) { \ 143b23bfdefSJunchao Zhang FILTER(Type *v); \ 144b23bfdefSJunchao Zhang FILTER(ierr = PetscMalloc1((opt->offset[r+1]-opt->offset[r])*MBS,&v);CHKERRQ(ierr)); /* max buf */ \ 14540e23c03SJunchao Zhang for (i=opt->copy_offset[r]; i<opt->copy_offset[r+1]; i++) { /* i-th piece */ \ 146b23bfdefSJunchao Zhang u2 = u + idx[opt->copy_start[i]]*MBS; \ 147b23bfdefSJunchao Zhang l = opt->copy_length[i]*MBS; \ 148da2e4c71SJunchao Zhang FILTER(ierr = PetscArraycpy(v,u2,l);CHKERRQ(ierr)); \ 149b23bfdefSJunchao Zhang ierr = PetscArraycpy(u2,p2,l);CHKERRQ(ierr); \ 150b23bfdefSJunchao Zhang FILTER(ierr = PetscArraycpy(p2,v,l);CHKERRQ(ierr)); \ 151b23bfdefSJunchao Zhang p2 += l; \ 15240e23c03SJunchao Zhang } \ 15340e23c03SJunchao Zhang FILTER(ierr = PetscFree(v);CHKERRQ(ierr)); \ 154b23bfdefSJunchao Zhang } else if (opt->type[r] == PETSCSF_PACKOPT_STRIDE) { \ 155b23bfdefSJunchao Zhang u2 = u + idx[opt->offset[r]]*MBS; \ 15640e23c03SJunchao Zhang step = opt->stride_step[r]; \ 15740e23c03SJunchao Zhang for (i=0; i<opt->stride_n[r]; i++) \ 158b23bfdefSJunchao Zhang for (j=0; j<M; j++) \ 159b23bfdefSJunchao Zhang for (k=0; k<BS; k++) { \ 160b23bfdefSJunchao Zhang FILTER(Type t = u2[i*step*MBS+j*BS+k]); \ 161b23bfdefSJunchao Zhang u2[i*step*MBS+j*BS+k] = p2[i*MBS+j*BS+k]; \ 162b23bfdefSJunchao Zhang FILTER(p2[i*MBS+j*BS+k] = t); \ 16340e23c03SJunchao Zhang } \ 164b23bfdefSJunchao Zhang } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Unknown SFPack optimzation type %D",opt->type[r]); \ 16540e23c03SJunchao Zhang } \ 16640e23c03SJunchao Zhang } \ 16740e23c03SJunchao Zhang PetscFunctionReturn(0); \ 16840e23c03SJunchao Zhang } 16940e23c03SJunchao Zhang 17040e23c03SJunchao Zhang /* DEF_ActionAndOp - macro defining a Unpack(Fetch)AndOp routine. Op can not be Insert, Maxloc or Minloc 17140e23c03SJunchao Zhang 17240e23c03SJunchao Zhang Arguments: 17340e23c03SJunchao Zhang +action Unpack or Fetch 17440e23c03SJunchao Zhang .opname Name of the Op, such as Add, Mult, LAND, etc. 175b23bfdefSJunchao Zhang .Type Type of the data 17640e23c03SJunchao Zhang .BS Block size for vectorization 177b23bfdefSJunchao Zhang .EQ (bs == BS) ? 1 : 0. EQ is a compile-time const. 17840e23c03SJunchao Zhang .op Operator for the op, such as +, *, &&, ||, PetscMax, PetscMin, etc. 17940e23c03SJunchao Zhang .APPLY Macro defining application of the op. Could be BINARY_OP, FUNCTION_OP, LXOR_OP or PAIRTYPE_OP 18040e23c03SJunchao Zhang .FILTER Macro defining what to do with a statement, either EXECUTE or IGNORE 181b23bfdefSJunchao Zhang .CType Type with or without the const qualifier, i.e., const Type or Type 182b23bfdefSJunchao Zhang -Cvoid void with or without the const qualifier, i.e., const void or void 18340e23c03SJunchao Zhang */ 184b23bfdefSJunchao Zhang #define DEF_ActionAndOp(action,opname,Type,BS,EQ,op,APPLY,FILTER,CType,Cvoid) \ 185eb02082bSJunchao Zhang static PetscErrorCode CPPJoin4(action##And##opname,Type,BS,EQ)(PetscInt count,const PetscInt *idx,PetscSFPack link,PetscSFPackOpt opt,void *unpacked,Cvoid *packed) \ 186b23bfdefSJunchao Zhang { \ 187b23bfdefSJunchao Zhang Type *u = (Type*)unpacked,*u2,t; \ 188b23bfdefSJunchao Zhang CType *p = (CType*)packed,*p2; \ 189eb02082bSJunchao Zhang PetscInt i,j,k,l,r,step,bs=link->bs; \ 190b23bfdefSJunchao Zhang const PetscInt *idx2,M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */ \ 191b23bfdefSJunchao Zhang const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */ \ 19240e23c03SJunchao Zhang PetscFunctionBegin; \ 193b23bfdefSJunchao Zhang if (!idx) { \ 194b23bfdefSJunchao Zhang for (i=0; i<count; i++) \ 195b23bfdefSJunchao Zhang for (j=0; j<M; j++) \ 196b23bfdefSJunchao Zhang for (k=0; k<BS; k++) { \ 197b23bfdefSJunchao Zhang t = u[i*MBS+j*BS+k]; \ 198b23bfdefSJunchao Zhang APPLY (u[i*MBS+j*BS+k],t,op,p[i*MBS+j*BS+k]); \ 199b23bfdefSJunchao Zhang FILTER(p[i*MBS+j*BS+k] = t); \ 20040e23c03SJunchao Zhang } \ 201b23bfdefSJunchao Zhang } else if (!opt) { /* No optimizations available */ \ 202b23bfdefSJunchao Zhang for (i=0; i<count; i++) \ 203b23bfdefSJunchao Zhang for (j=0; j<M; j++) \ 204b23bfdefSJunchao Zhang for (k=0; k<BS; k++) { \ 205b23bfdefSJunchao Zhang t = u[idx[i]*MBS+j*BS+k]; \ 206b23bfdefSJunchao Zhang APPLY (u[idx[i]*MBS+j*BS+k],t,op,p[i*MBS+j*BS+k]); \ 207b23bfdefSJunchao Zhang FILTER(p[i*MBS+j*BS+k] = t); \ 20840e23c03SJunchao Zhang } \ 209b23bfdefSJunchao Zhang } else { \ 210b23bfdefSJunchao Zhang for (r=0; r<opt->n; r++) { \ 211b23bfdefSJunchao Zhang p2 = p + opt->offset[r]*MBS; \ 212b23bfdefSJunchao Zhang if (opt->type[r] == PETSCSF_PACKOPT_NONE) { \ 213b23bfdefSJunchao Zhang idx2 = idx + opt->offset[r]; \ 214b23bfdefSJunchao Zhang for (i=0; i<opt->offset[r+1]-opt->offset[r]; i++) \ 215b23bfdefSJunchao Zhang for (j=0; j<M; j++) \ 216b23bfdefSJunchao Zhang for (k=0; k<BS; k++) { \ 217b23bfdefSJunchao Zhang t = u[idx2[i]*MBS+j*BS+k]; \ 218b23bfdefSJunchao Zhang APPLY (u[idx2[i]*MBS+j*BS+k],t,op,p2[i*MBS+j*BS+k]); \ 219b23bfdefSJunchao Zhang FILTER(p2[i*MBS+j*BS+k] = t); \ 22040e23c03SJunchao Zhang } \ 221b23bfdefSJunchao Zhang } else if (opt->type[r] == PETSCSF_PACKOPT_MULTICOPY) { \ 22240e23c03SJunchao Zhang for (i=opt->copy_offset[r]; i<opt->copy_offset[r+1]; i++) { /* i-th piece */ \ 223b23bfdefSJunchao Zhang u2 = u + idx[opt->copy_start[i]]*MBS; \ 224b23bfdefSJunchao Zhang l = opt->copy_length[i]*MBS; \ 22540e23c03SJunchao Zhang for (j=0; j<l; j++) { \ 22640e23c03SJunchao Zhang t = u2[j]; \ 227b23bfdefSJunchao Zhang APPLY (u2[j],t,op,p2[j]); \ 228b23bfdefSJunchao Zhang FILTER(p2[j] = t); \ 22940e23c03SJunchao Zhang } \ 230b23bfdefSJunchao Zhang p2 += l; \ 23140e23c03SJunchao Zhang } \ 232b23bfdefSJunchao Zhang } else if (opt->type[r] == PETSCSF_PACKOPT_STRIDE) { \ 233b23bfdefSJunchao Zhang u2 = u + idx[opt->offset[r]]*MBS; \ 23440e23c03SJunchao Zhang step = opt->stride_step[r]; \ 23540e23c03SJunchao Zhang for (i=0; i<opt->stride_n[r]; i++) \ 236b23bfdefSJunchao Zhang for (j=0; j<M; j++) \ 237b23bfdefSJunchao Zhang for (k=0; k<BS; k++) { \ 238b23bfdefSJunchao Zhang t = u2[i*step*MBS+j*BS+k]; \ 239b23bfdefSJunchao Zhang APPLY (u2[i*step*MBS+j*BS+k],t,op,p2[i*MBS+j*BS+k]); \ 240b23bfdefSJunchao Zhang FILTER(p2[i*MBS+j*BS+k] = t); \ 24140e23c03SJunchao Zhang } \ 242b23bfdefSJunchao Zhang } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Unknown SFPack optimzation type %D",opt->type[r]); \ 24340e23c03SJunchao Zhang } \ 24440e23c03SJunchao Zhang } \ 24540e23c03SJunchao Zhang PetscFunctionReturn(0); \ 24640e23c03SJunchao Zhang } 24740e23c03SJunchao Zhang 24840e23c03SJunchao Zhang /* DEF_ActionAndXloc - macro defining a Unpack(Fetch)AndMaxloc(Minloc) routine 24940e23c03SJunchao Zhang 25040e23c03SJunchao Zhang Arguments: 25140e23c03SJunchao Zhang +Action Unpack or Fetch 25240e23c03SJunchao Zhang .locname Max or Min 25340e23c03SJunchao Zhang .type1 Type of the first data in a pair type 25440e23c03SJunchao Zhang .type2 Type of the second data in a pair type, usually PetscMPIInt for MPI ranks. 25540e23c03SJunchao Zhang .op > or < 25640e23c03SJunchao Zhang .FILTER Macro defining what to do with a statement, either EXECUTE or IGNORE 257b23bfdefSJunchao Zhang .CType Type with or without the const qualifier, i.e., const PairType(Type1,Type2) or PairType(Type1,Type2) 258b23bfdefSJunchao Zhang -Cvoid void with or without the const qualifier, i.e., const void or void 25940e23c03SJunchao Zhang */ 260b23bfdefSJunchao Zhang #define DEF_ActionAndXloc(action,locname,Type1,Type2,op,FILTER,CType,Cvoid) \ 261eb02082bSJunchao Zhang static PetscErrorCode CPPJoin4(action##And##locname##loc,PairType(Type1,Type2),1,1)(PetscInt count,const PetscInt *idx,PetscSFPack link,PetscSFPackOpt opt,void *unpacked,Cvoid *packed) { \ 262b23bfdefSJunchao Zhang PairType(Type1,Type2) *u = (PairType(Type1,Type2)*)unpacked; \ 263b23bfdefSJunchao Zhang CType *p = (CType*)packed; \ 264b23bfdefSJunchao Zhang PetscInt i,j; \ 265b23bfdefSJunchao Zhang for (i=0; i<count; i++) { \ 266eb02082bSJunchao Zhang FILTER(PairType(Type1,Type2) v); \ 267b23bfdefSJunchao Zhang j = idx? idx[i] : i; \ 268eb02082bSJunchao Zhang FILTER(v = u[j]); \ 26940e23c03SJunchao Zhang if (p[i].a op u[j].a) { \ 27040e23c03SJunchao Zhang u[j] = p[i]; \ 27140e23c03SJunchao Zhang } else if (p[i].a == u[j].a) { \ 27240e23c03SJunchao Zhang u[j].b = PetscMin(u[j].b,p[i].b); /* Minimal rank. Ref MPI MAXLOC */ \ 27340e23c03SJunchao Zhang } \ 27440e23c03SJunchao Zhang FILTER(p[i] = v); \ 27540e23c03SJunchao Zhang } \ 27640e23c03SJunchao Zhang PetscFunctionReturn(0); \ 27740e23c03SJunchao Zhang } 27840e23c03SJunchao Zhang 279b23bfdefSJunchao Zhang /* Pack, Unpack/Fetch ops */ 280b23bfdefSJunchao Zhang #define DEF_Pack(Type,BS,EQ) \ 281b23bfdefSJunchao Zhang DEF_PackFunc(Type,BS,EQ) \ 282b23bfdefSJunchao Zhang DEF_Action(Unpack,Type,BS,EQ,IGNORE,const Type,const void) \ 283b23bfdefSJunchao Zhang DEF_Action(Fetch, Type,BS,EQ,EXECUTE,Type,void) \ 284b23bfdefSJunchao Zhang static void CPPJoin4(PackInit_Pack,Type,BS,EQ)(PetscSFPack link) { \ 285eb02082bSJunchao Zhang link->h_Pack = CPPJoin4(Pack, Type,BS,EQ); \ 286eb02082bSJunchao Zhang link->h_UnpackAndInsert = CPPJoin4(UnpackAndInsert,Type,BS,EQ); \ 287eb02082bSJunchao Zhang link->h_FetchAndInsert = CPPJoin4(FetchAndInsert, Type,BS,EQ); \ 28840e23c03SJunchao Zhang } 28940e23c03SJunchao Zhang 290b23bfdefSJunchao Zhang /* Add, Mult ops */ 291b23bfdefSJunchao Zhang #define DEF_Add(Type,BS,EQ) \ 292b23bfdefSJunchao Zhang DEF_ActionAndOp(Unpack,Add, Type,BS,EQ,+,BINARY_OP,IGNORE,const Type,const void) \ 293b23bfdefSJunchao Zhang DEF_ActionAndOp(Unpack,Mult,Type,BS,EQ,*,BINARY_OP,IGNORE,const Type,const void) \ 294b23bfdefSJunchao Zhang DEF_ActionAndOp(Fetch, Add, Type,BS,EQ,+,BINARY_OP,EXECUTE,Type,void) \ 295b23bfdefSJunchao Zhang DEF_ActionAndOp(Fetch, Mult,Type,BS,EQ,*,BINARY_OP,EXECUTE,Type,void) \ 296b23bfdefSJunchao Zhang static void CPPJoin4(PackInit_Add,Type,BS,EQ)(PetscSFPack link) { \ 297eb02082bSJunchao Zhang link->h_UnpackAndAdd = CPPJoin4(UnpackAndAdd, Type,BS,EQ); \ 298eb02082bSJunchao Zhang link->h_UnpackAndMult = CPPJoin4(UnpackAndMult, Type,BS,EQ); \ 299eb02082bSJunchao Zhang link->h_FetchAndAdd = CPPJoin4(FetchAndAdd, Type,BS,EQ); \ 300eb02082bSJunchao Zhang link->h_FetchAndMult = CPPJoin4(FetchAndMult, Type,BS,EQ); \ 30140e23c03SJunchao Zhang } 30240e23c03SJunchao Zhang 303b23bfdefSJunchao Zhang /* Max, Min ops */ 304b23bfdefSJunchao Zhang #define DEF_Cmp(Type,BS,EQ) \ 305b23bfdefSJunchao Zhang DEF_ActionAndOp(Unpack,Max,Type,BS,EQ,PetscMax,FUNCTION_OP,IGNORE,const Type,const void) \ 306b23bfdefSJunchao Zhang DEF_ActionAndOp(Unpack,Min,Type,BS,EQ,PetscMin,FUNCTION_OP,IGNORE,const Type,const void) \ 307b23bfdefSJunchao Zhang DEF_ActionAndOp(Fetch, Max,Type,BS,EQ,PetscMax,FUNCTION_OP,EXECUTE,Type,void) \ 308b23bfdefSJunchao Zhang DEF_ActionAndOp(Fetch, Min,Type,BS,EQ,PetscMin,FUNCTION_OP,EXECUTE,Type,void) \ 309b23bfdefSJunchao Zhang static void CPPJoin4(PackInit_Compare,Type,BS,EQ)(PetscSFPack link) { \ 310eb02082bSJunchao Zhang link->h_UnpackAndMax = CPPJoin4(UnpackAndMax, Type,BS,EQ); \ 311eb02082bSJunchao Zhang link->h_UnpackAndMin = CPPJoin4(UnpackAndMin, Type,BS,EQ); \ 312eb02082bSJunchao Zhang link->h_FetchAndMax = CPPJoin4(FetchAndMax , Type,BS,EQ); \ 313eb02082bSJunchao Zhang link->h_FetchAndMin = CPPJoin4(FetchAndMin , Type,BS,EQ); \ 314b23bfdefSJunchao Zhang } 315b23bfdefSJunchao Zhang 316b23bfdefSJunchao Zhang /* Logical ops. 317b23bfdefSJunchao Zhang The operator in LXOR_OP should be empty but is &. It is not used. Put here to avoid 31840e23c03SJunchao Zhang the compilation warning "empty macro arguments are undefined in ISO C90" 31940e23c03SJunchao Zhang */ 320b23bfdefSJunchao Zhang #define DEF_Log(Type,BS,EQ) \ 321b23bfdefSJunchao Zhang DEF_ActionAndOp(Unpack,LAND,Type,BS,EQ,&&,BINARY_OP,IGNORE,const Type,const void) \ 322b23bfdefSJunchao Zhang DEF_ActionAndOp(Unpack,LOR, Type,BS,EQ,||,BINARY_OP,IGNORE,const Type,const void) \ 323b23bfdefSJunchao Zhang DEF_ActionAndOp(Unpack,LXOR,Type,BS,EQ,&, LXOR_OP, IGNORE,const Type,const void) \ 324b23bfdefSJunchao Zhang DEF_ActionAndOp(Fetch, LAND,Type,BS,EQ,&&,BINARY_OP,EXECUTE,Type,void) \ 325b23bfdefSJunchao Zhang DEF_ActionAndOp(Fetch, LOR, Type,BS,EQ,||,BINARY_OP,EXECUTE,Type,void) \ 326b23bfdefSJunchao Zhang DEF_ActionAndOp(Fetch, LXOR,Type,BS,EQ,&, LXOR_OP, EXECUTE,Type,void) \ 327b23bfdefSJunchao Zhang static void CPPJoin4(PackInit_Logical,Type,BS,EQ)(PetscSFPack link) { \ 328eb02082bSJunchao Zhang link->h_UnpackAndLAND = CPPJoin4(UnpackAndLAND,Type,BS,EQ); \ 329eb02082bSJunchao Zhang link->h_UnpackAndLOR = CPPJoin4(UnpackAndLOR, Type,BS,EQ); \ 330eb02082bSJunchao Zhang link->h_UnpackAndLXOR = CPPJoin4(UnpackAndLXOR,Type,BS,EQ); \ 331eb02082bSJunchao Zhang link->h_FetchAndLAND = CPPJoin4(FetchAndLAND, Type,BS,EQ); \ 332eb02082bSJunchao Zhang link->h_FetchAndLOR = CPPJoin4(FetchAndLOR, Type,BS,EQ); \ 333eb02082bSJunchao Zhang link->h_FetchAndLXOR = CPPJoin4(FetchAndLXOR, Type,BS,EQ); \ 33440e23c03SJunchao Zhang } 33540e23c03SJunchao Zhang 336b23bfdefSJunchao Zhang /* Bitwise ops */ 337b23bfdefSJunchao Zhang #define DEF_Bit(Type,BS,EQ) \ 338b23bfdefSJunchao Zhang DEF_ActionAndOp(Unpack,BAND,Type,BS,EQ,&,BINARY_OP,IGNORE,const Type,const void) \ 339b23bfdefSJunchao Zhang DEF_ActionAndOp(Unpack,BOR, Type,BS,EQ,|,BINARY_OP,IGNORE,const Type,const void) \ 340b23bfdefSJunchao Zhang DEF_ActionAndOp(Unpack,BXOR,Type,BS,EQ,^,BINARY_OP,IGNORE,const Type,const void) \ 341b23bfdefSJunchao Zhang DEF_ActionAndOp(Fetch, BAND,Type,BS,EQ,&,BINARY_OP,EXECUTE,Type,void) \ 342b23bfdefSJunchao Zhang DEF_ActionAndOp(Fetch, BOR, Type,BS,EQ,|,BINARY_OP,EXECUTE,Type,void) \ 343b23bfdefSJunchao Zhang DEF_ActionAndOp(Fetch, BXOR,Type,BS,EQ,^,BINARY_OP,EXECUTE,Type,void) \ 344b23bfdefSJunchao Zhang static void CPPJoin4(PackInit_Bitwise,Type,BS,EQ)(PetscSFPack link) { \ 345eb02082bSJunchao Zhang link->h_UnpackAndBAND = CPPJoin4(UnpackAndBAND,Type,BS,EQ); \ 346eb02082bSJunchao Zhang link->h_UnpackAndBOR = CPPJoin4(UnpackAndBOR, Type,BS,EQ); \ 347eb02082bSJunchao Zhang link->h_UnpackAndBXOR = CPPJoin4(UnpackAndBXOR,Type,BS,EQ); \ 348eb02082bSJunchao Zhang link->h_FetchAndBAND = CPPJoin4(FetchAndBAND, Type,BS,EQ); \ 349eb02082bSJunchao Zhang link->h_FetchAndBOR = CPPJoin4(FetchAndBOR, Type,BS,EQ); \ 350eb02082bSJunchao Zhang link->h_FetchAndBXOR = CPPJoin4(FetchAndBXOR, Type,BS,EQ); \ 35140e23c03SJunchao Zhang } 35240e23c03SJunchao Zhang 353b23bfdefSJunchao Zhang /* Maxloc, Minloc */ 354b23bfdefSJunchao Zhang #define DEF_Xloc(Type1,Type2) \ 355b23bfdefSJunchao Zhang DEF_ActionAndXloc(Unpack,Max,Type1,Type2,>,IGNORE,const PairType(Type1,Type2),const void) \ 356b23bfdefSJunchao Zhang DEF_ActionAndXloc(Unpack,Min,Type1,Type2,<,IGNORE,const PairType(Type1,Type2),const void) \ 357b23bfdefSJunchao Zhang DEF_ActionAndXloc(Fetch, Max,Type1,Type2,>,EXECUTE,PairType(Type1,Type2),void) \ 358b23bfdefSJunchao Zhang DEF_ActionAndXloc(Fetch, Min,Type1,Type2,<,EXECUTE,PairType(Type1,Type2),void) \ 359b23bfdefSJunchao Zhang static void CPPJoin3(PackInit_Xloc,Type1,Type2)(PetscSFPack link) { \ 360eb02082bSJunchao Zhang link->h_UnpackAndMaxloc = CPPJoin4(UnpackAndMaxloc,PairType(Type1,Type2),1,1); \ 361eb02082bSJunchao Zhang link->h_UnpackAndMinloc = CPPJoin4(UnpackAndMinloc,PairType(Type1,Type2),1,1); \ 362eb02082bSJunchao Zhang link->h_FetchAndMaxloc = CPPJoin4(FetchAndMaxloc, PairType(Type1,Type2),1,1); \ 363eb02082bSJunchao Zhang link->h_FetchAndMinloc = CPPJoin4(FetchAndMinloc, PairType(Type1,Type2),1,1); \ 36440e23c03SJunchao Zhang } 36540e23c03SJunchao Zhang 366b23bfdefSJunchao Zhang #define DEF_IntegerType(Type,BS,EQ) \ 367b23bfdefSJunchao Zhang DEF_Pack(Type,BS,EQ) \ 368b23bfdefSJunchao Zhang DEF_Add(Type,BS,EQ) \ 369b23bfdefSJunchao Zhang DEF_Cmp(Type,BS,EQ) \ 370b23bfdefSJunchao Zhang DEF_Log(Type,BS,EQ) \ 371b23bfdefSJunchao Zhang DEF_Bit(Type,BS,EQ) \ 372b23bfdefSJunchao Zhang static void CPPJoin4(PackInit_IntegerType,Type,BS,EQ)(PetscSFPack link) { \ 373b23bfdefSJunchao Zhang CPPJoin4(PackInit_Pack,Type,BS,EQ)(link); \ 374b23bfdefSJunchao Zhang CPPJoin4(PackInit_Add,Type,BS,EQ)(link); \ 375b23bfdefSJunchao Zhang CPPJoin4(PackInit_Compare,Type,BS,EQ)(link); \ 376b23bfdefSJunchao Zhang CPPJoin4(PackInit_Logical,Type,BS,EQ)(link); \ 377b23bfdefSJunchao Zhang CPPJoin4(PackInit_Bitwise,Type,BS,EQ)(link); \ 37840e23c03SJunchao Zhang } 37940e23c03SJunchao Zhang 380b23bfdefSJunchao Zhang #define DEF_RealType(Type,BS,EQ) \ 381b23bfdefSJunchao Zhang DEF_Pack(Type,BS,EQ) \ 382b23bfdefSJunchao Zhang DEF_Add(Type,BS,EQ) \ 383b23bfdefSJunchao Zhang DEF_Cmp(Type,BS,EQ) \ 384b23bfdefSJunchao Zhang static void CPPJoin4(PackInit_RealType,Type,BS,EQ)(PetscSFPack link) { \ 385b23bfdefSJunchao Zhang CPPJoin4(PackInit_Pack,Type,BS,EQ)(link); \ 386b23bfdefSJunchao Zhang CPPJoin4(PackInit_Add,Type,BS,EQ)(link); \ 387b23bfdefSJunchao Zhang CPPJoin4(PackInit_Compare,Type,BS,EQ)(link); \ 388b23bfdefSJunchao Zhang } 38940e23c03SJunchao Zhang 39040e23c03SJunchao Zhang #if defined(PETSC_HAVE_COMPLEX) 391b23bfdefSJunchao Zhang #define DEF_ComplexType(Type,BS,EQ) \ 392b23bfdefSJunchao Zhang DEF_Pack(Type,BS,EQ) \ 393b23bfdefSJunchao Zhang DEF_Add(Type,BS,EQ) \ 394b23bfdefSJunchao Zhang static void CPPJoin4(PackInit_ComplexType,Type,BS,EQ)(PetscSFPack link) { \ 395b23bfdefSJunchao Zhang CPPJoin4(PackInit_Pack,Type,BS,EQ)(link); \ 396b23bfdefSJunchao Zhang CPPJoin4(PackInit_Add,Type,BS,EQ)(link); \ 397b23bfdefSJunchao Zhang } 39840e23c03SJunchao Zhang #endif 399b23bfdefSJunchao Zhang 400b23bfdefSJunchao Zhang #define DEF_DumbType(Type,BS,EQ) \ 401b23bfdefSJunchao Zhang DEF_Pack(Type,BS,EQ) \ 402b23bfdefSJunchao Zhang static void CPPJoin4(PackInit_DumbType,Type,BS,EQ)(PetscSFPack link) { \ 403b23bfdefSJunchao Zhang CPPJoin4(PackInit_Pack,Type,BS,EQ)(link); \ 404b23bfdefSJunchao Zhang } 405b23bfdefSJunchao Zhang 406b23bfdefSJunchao Zhang /* Maxloc, Minloc */ 407b23bfdefSJunchao Zhang #define DEF_PairType(Type1,Type2) \ 408b23bfdefSJunchao Zhang typedef struct {Type1 a; Type2 b;} PairType(Type1,Type2); \ 409b23bfdefSJunchao Zhang DEF_Pack(PairType(Type1,Type2),1,1) \ 410b23bfdefSJunchao Zhang DEF_Xloc(Type1,Type2) \ 411b23bfdefSJunchao Zhang static void CPPJoin3(PackInit_PairType,Type1,Type2)(PetscSFPack link) { \ 412b23bfdefSJunchao Zhang CPPJoin4(PackInit_Pack,PairType(Type1,Type2),1,1)(link); \ 413b23bfdefSJunchao Zhang CPPJoin3(PackInit_Xloc,Type1,Type2)(link); \ 414b23bfdefSJunchao Zhang } 415b23bfdefSJunchao Zhang 416b23bfdefSJunchao Zhang DEF_IntegerType(PetscInt,1,1) /* unit = 1 MPIU_INT */ 417b23bfdefSJunchao Zhang DEF_IntegerType(PetscInt,2,1) /* unit = 2 MPIU_INTs */ 418b23bfdefSJunchao Zhang DEF_IntegerType(PetscInt,4,1) /* unit = 4 MPIU_INTs */ 419b23bfdefSJunchao Zhang DEF_IntegerType(PetscInt,8,1) /* unit = 8 MPIU_INTs */ 420b23bfdefSJunchao Zhang DEF_IntegerType(PetscInt,1,0) /* unit = 1*n MPIU_INTs, n>1 */ 421b23bfdefSJunchao Zhang DEF_IntegerType(PetscInt,2,0) /* unit = 2*n MPIU_INTs, n>1 */ 422b23bfdefSJunchao Zhang DEF_IntegerType(PetscInt,4,0) /* unit = 4*n MPIU_INTs, n>1 */ 423b23bfdefSJunchao Zhang DEF_IntegerType(PetscInt,8,0) /* unit = 8*n MPIU_INTs, n>1. Routines with bigger BS are tried first. */ 424b23bfdefSJunchao Zhang 425b23bfdefSJunchao Zhang #if defined(PETSC_USE_64BIT_INDICES) /* Do not need (though it is OK) to generate redundant functions if PetscInt is int */ 426b23bfdefSJunchao Zhang DEF_IntegerType(int,1,1) 427b23bfdefSJunchao Zhang DEF_IntegerType(int,2,1) 428b23bfdefSJunchao Zhang DEF_IntegerType(int,4,1) 429b23bfdefSJunchao Zhang DEF_IntegerType(int,8,1) 430b23bfdefSJunchao Zhang DEF_IntegerType(int,1,0) 431b23bfdefSJunchao Zhang DEF_IntegerType(int,2,0) 432b23bfdefSJunchao Zhang DEF_IntegerType(int,4,0) 433b23bfdefSJunchao Zhang DEF_IntegerType(int,8,0) 434b23bfdefSJunchao Zhang #endif 435b23bfdefSJunchao Zhang 436b23bfdefSJunchao Zhang /* The typedefs are used to get a typename without space that CPPJoin can handle */ 437b23bfdefSJunchao Zhang typedef signed char SignedChar; 438b23bfdefSJunchao Zhang DEF_IntegerType(SignedChar,1,1) 439b23bfdefSJunchao Zhang DEF_IntegerType(SignedChar,2,1) 440b23bfdefSJunchao Zhang DEF_IntegerType(SignedChar,4,1) 441b23bfdefSJunchao Zhang DEF_IntegerType(SignedChar,8,1) 442b23bfdefSJunchao Zhang DEF_IntegerType(SignedChar,1,0) 443b23bfdefSJunchao Zhang DEF_IntegerType(SignedChar,2,0) 444b23bfdefSJunchao Zhang DEF_IntegerType(SignedChar,4,0) 445b23bfdefSJunchao Zhang DEF_IntegerType(SignedChar,8,0) 446b23bfdefSJunchao Zhang 447b23bfdefSJunchao Zhang typedef unsigned char UnsignedChar; 448b23bfdefSJunchao Zhang DEF_IntegerType(UnsignedChar,1,1) 449b23bfdefSJunchao Zhang DEF_IntegerType(UnsignedChar,2,1) 450b23bfdefSJunchao Zhang DEF_IntegerType(UnsignedChar,4,1) 451b23bfdefSJunchao Zhang DEF_IntegerType(UnsignedChar,8,1) 452b23bfdefSJunchao Zhang DEF_IntegerType(UnsignedChar,1,0) 453b23bfdefSJunchao Zhang DEF_IntegerType(UnsignedChar,2,0) 454b23bfdefSJunchao Zhang DEF_IntegerType(UnsignedChar,4,0) 455b23bfdefSJunchao Zhang DEF_IntegerType(UnsignedChar,8,0) 456b23bfdefSJunchao Zhang 457b23bfdefSJunchao Zhang DEF_RealType(PetscReal,1,1) 458b23bfdefSJunchao Zhang DEF_RealType(PetscReal,2,1) 459b23bfdefSJunchao Zhang DEF_RealType(PetscReal,4,1) 460b23bfdefSJunchao Zhang DEF_RealType(PetscReal,8,1) 461b23bfdefSJunchao Zhang DEF_RealType(PetscReal,1,0) 462b23bfdefSJunchao Zhang DEF_RealType(PetscReal,2,0) 463b23bfdefSJunchao Zhang DEF_RealType(PetscReal,4,0) 464b23bfdefSJunchao Zhang DEF_RealType(PetscReal,8,0) 465b23bfdefSJunchao Zhang 466b23bfdefSJunchao Zhang #if defined(PETSC_HAVE_COMPLEX) 467b23bfdefSJunchao Zhang DEF_ComplexType(PetscComplex,1,1) 468b23bfdefSJunchao Zhang DEF_ComplexType(PetscComplex,2,1) 469b23bfdefSJunchao Zhang DEF_ComplexType(PetscComplex,4,1) 470b23bfdefSJunchao Zhang DEF_ComplexType(PetscComplex,8,1) 471b23bfdefSJunchao Zhang DEF_ComplexType(PetscComplex,1,0) 472b23bfdefSJunchao Zhang DEF_ComplexType(PetscComplex,2,0) 473b23bfdefSJunchao Zhang DEF_ComplexType(PetscComplex,4,0) 474b23bfdefSJunchao Zhang DEF_ComplexType(PetscComplex,8,0) 475b23bfdefSJunchao Zhang #endif 476b23bfdefSJunchao Zhang 477b23bfdefSJunchao Zhang DEF_PairType(int,int) 478b23bfdefSJunchao Zhang DEF_PairType(PetscInt,PetscInt) 479b23bfdefSJunchao Zhang 480b23bfdefSJunchao Zhang /* If we don't know the basic type, we treat it as a stream of chars or ints */ 481b23bfdefSJunchao Zhang DEF_DumbType(char,1,1) 482b23bfdefSJunchao Zhang DEF_DumbType(char,2,1) 483b23bfdefSJunchao Zhang DEF_DumbType(char,4,1) 484b23bfdefSJunchao Zhang DEF_DumbType(char,1,0) 485b23bfdefSJunchao Zhang DEF_DumbType(char,2,0) 486b23bfdefSJunchao Zhang DEF_DumbType(char,4,0) 487b23bfdefSJunchao Zhang 488eb02082bSJunchao Zhang typedef int DumbInt; /* To have a different name than 'int' used above. The name is used to make routine names. */ 489b23bfdefSJunchao Zhang DEF_DumbType(DumbInt,1,1) 490b23bfdefSJunchao Zhang DEF_DumbType(DumbInt,2,1) 491b23bfdefSJunchao Zhang DEF_DumbType(DumbInt,4,1) 492b23bfdefSJunchao Zhang DEF_DumbType(DumbInt,8,1) 493b23bfdefSJunchao Zhang DEF_DumbType(DumbInt,1,0) 494b23bfdefSJunchao Zhang DEF_DumbType(DumbInt,2,0) 495b23bfdefSJunchao Zhang DEF_DumbType(DumbInt,4,0) 496b23bfdefSJunchao Zhang DEF_DumbType(DumbInt,8,0) 49740e23c03SJunchao Zhang 49840e23c03SJunchao Zhang #if !defined(PETSC_HAVE_MPI_TYPE_DUP) 49940e23c03SJunchao Zhang PETSC_STATIC_INLINE int MPI_Type_dup(MPI_Datatype datatype,MPI_Datatype *newtype) 50040e23c03SJunchao Zhang { 50140e23c03SJunchao Zhang int ierr; 50240e23c03SJunchao Zhang ierr = MPI_Type_contiguous(1,datatype,newtype); if (ierr) return ierr; 50340e23c03SJunchao Zhang ierr = MPI_Type_commit(newtype); if (ierr) return ierr; 50440e23c03SJunchao Zhang return MPI_SUCCESS; 50540e23c03SJunchao Zhang } 50640e23c03SJunchao Zhang #endif 50740e23c03SJunchao Zhang 508*637e6665SJunchao Zhang PetscErrorCode PetscSFPackGetInUse(PetscSF sf,MPI_Datatype unit,const void *rootdata,const void *leafdata,PetscCopyMode cmode,PetscSFPack *mylink) 50940e23c03SJunchao Zhang { 51040e23c03SJunchao Zhang PetscErrorCode ierr; 51140e23c03SJunchao Zhang PetscSFPack link,*p; 51240e23c03SJunchao Zhang PetscSF_Basic *bas=(PetscSF_Basic*)sf->data; 51340e23c03SJunchao Zhang 51440e23c03SJunchao Zhang PetscFunctionBegin; 51540e23c03SJunchao Zhang /* Look for types in cache */ 51640e23c03SJunchao Zhang for (p=&bas->inuse; (link=*p); p=&link->next) { 51740e23c03SJunchao Zhang PetscBool match; 51840e23c03SJunchao Zhang ierr = MPIPetsc_Type_compare(unit,link->unit,&match);CHKERRQ(ierr); 519*637e6665SJunchao Zhang if (match && (rootdata == link->rootdata) && (leafdata == link->leafdata)) { 52040e23c03SJunchao Zhang switch (cmode) { 52140e23c03SJunchao Zhang case PETSC_OWN_POINTER: *p = link->next; break; /* Remove from inuse list */ 52240e23c03SJunchao Zhang case PETSC_USE_POINTER: break; 52340e23c03SJunchao Zhang default: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"invalid cmode"); 52440e23c03SJunchao Zhang } 52540e23c03SJunchao Zhang *mylink = link; 52640e23c03SJunchao Zhang PetscFunctionReturn(0); 52740e23c03SJunchao Zhang } 52840e23c03SJunchao Zhang } 52940e23c03SJunchao Zhang SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Could not find pack"); 53040e23c03SJunchao Zhang PetscFunctionReturn(0); 53140e23c03SJunchao Zhang } 53240e23c03SJunchao Zhang 53340e23c03SJunchao Zhang PetscErrorCode PetscSFPackReclaim(PetscSF sf,PetscSFPack *link) 53440e23c03SJunchao Zhang { 53540e23c03SJunchao Zhang PetscSF_Basic *bas=(PetscSF_Basic*)sf->data; 53640e23c03SJunchao Zhang 53740e23c03SJunchao Zhang PetscFunctionBegin; 538*637e6665SJunchao Zhang (*link)->rootdata = NULL; 539*637e6665SJunchao Zhang (*link)->leafdata = NULL; 54040e23c03SJunchao Zhang (*link)->next = bas->avail; 54140e23c03SJunchao Zhang bas->avail = *link; 54240e23c03SJunchao Zhang *link = NULL; 54340e23c03SJunchao Zhang PetscFunctionReturn(0); 54440e23c03SJunchao Zhang } 54540e23c03SJunchao Zhang 546eb02082bSJunchao Zhang /* Destroy all links, i.e., PetscSFPacks in the linked list, usually named 'avail' */ 54764f49babSJed Brown PetscErrorCode PetscSFPackDestroyAvailable(PetscSFPack *avail) 548eb02082bSJunchao Zhang { 549eb02082bSJunchao Zhang PetscErrorCode ierr; 550eb02082bSJunchao Zhang PetscSFPack link=*avail,next; 551eb02082bSJunchao Zhang PetscInt i; 552eb02082bSJunchao Zhang 553eb02082bSJunchao Zhang PetscFunctionBegin; 554eb02082bSJunchao Zhang for (; link; link=next) { 555eb02082bSJunchao Zhang next = link->next; 556eb02082bSJunchao Zhang if (!link->isbuiltin) {ierr = MPI_Type_free(&link->unit);CHKERRQ(ierr);} 557eb02082bSJunchao Zhang for (i=0; i<(link->nrootreqs+link->nleafreqs)*4; i++) { /* Persistent reqs must be freed. */ 558eb02082bSJunchao Zhang if (link->reqs[i] != MPI_REQUEST_NULL) {ierr = MPI_Request_free(&link->reqs[i]);CHKERRQ(ierr);} 559eb02082bSJunchao Zhang } 560eb02082bSJunchao Zhang ierr = PetscFree(link->reqs);CHKERRQ(ierr); 561eb02082bSJunchao Zhang ierr = PetscFreeWithMemType(PETSC_MEMTYPE_HOST,link->rootbuf[PETSC_MEMTYPE_HOST]);CHKERRQ(ierr); 562eb02082bSJunchao Zhang ierr = PetscFreeWithMemType(PETSC_MEMTYPE_HOST,link->leafbuf[PETSC_MEMTYPE_HOST]);CHKERRQ(ierr); 563eb02082bSJunchao Zhang ierr = PetscFreeWithMemType(PETSC_MEMTYPE_HOST,link->selfbuf[PETSC_MEMTYPE_HOST]);CHKERRQ(ierr); 564eb02082bSJunchao Zhang 565eb02082bSJunchao Zhang #if defined(PETSC_HAVE_CUDA) 566eb02082bSJunchao Zhang ierr = PetscFreeWithMemType(PETSC_MEMTYPE_DEVICE,link->rootbuf[PETSC_MEMTYPE_DEVICE]);CHKERRQ(ierr); 567eb02082bSJunchao Zhang ierr = PetscFreeWithMemType(PETSC_MEMTYPE_DEVICE,link->leafbuf[PETSC_MEMTYPE_DEVICE]);CHKERRQ(ierr); 568eb02082bSJunchao Zhang ierr = PetscFreeWithMemType(PETSC_MEMTYPE_DEVICE,link->selfbuf[PETSC_MEMTYPE_DEVICE]);CHKERRQ(ierr); 569eb02082bSJunchao Zhang if (link->stream) {cudaError_t err = cudaStreamDestroy(link->stream);CHKERRCUDA(err); link->stream = NULL;} 570eb02082bSJunchao Zhang #endif 571eb02082bSJunchao Zhang ierr = PetscFree(link);CHKERRQ(ierr); 572eb02082bSJunchao Zhang } 573eb02082bSJunchao Zhang *avail = NULL; 574eb02082bSJunchao Zhang PetscFunctionReturn(0); 575eb02082bSJunchao Zhang } 576eb02082bSJunchao Zhang 5779d1c8addSJunchao Zhang /* Error out on unsupported overlapped communications */ 578*637e6665SJunchao Zhang PetscErrorCode PetscSFPackSetErrorOnUnsupportedOverlap(PetscSF sf,MPI_Datatype unit,const void *rootdata,const void *leafdata) 5799d1c8addSJunchao Zhang { 5809d1c8addSJunchao Zhang PetscErrorCode ierr; 5819d1c8addSJunchao Zhang PetscSFPack link,*p; 5829d1c8addSJunchao Zhang PetscSF_Basic *bas=(PetscSF_Basic*)sf->data; 5839d1c8addSJunchao Zhang PetscBool match; 5849d1c8addSJunchao Zhang 5859d1c8addSJunchao Zhang PetscFunctionBegin; 58618fb5014SJunchao Zhang /* Look up links in use and error out if there is a match. When both rootdata and leafdata are NULL, ignore 58718fb5014SJunchao Zhang the potential overlapping since this process does not participate in communication. Overlapping is harmless. 58818fb5014SJunchao Zhang */ 589*637e6665SJunchao Zhang if (rootdata || leafdata) { 5909d1c8addSJunchao Zhang for (p=&bas->inuse; (link=*p); p=&link->next) { 5919d1c8addSJunchao Zhang ierr = MPIPetsc_Type_compare(unit,link->unit,&match);CHKERRQ(ierr); 592*637e6665SJunchao Zhang if (match && (rootdata == link->rootdata) && (leafdata == link->leafdata)) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for overlapped PetscSF communications with the same SF, rootdata(%p), leafdata(%p) and data type. You can undo the overlap to avoid the error.",rootdata,leafdata); 5939d1c8addSJunchao Zhang } 59418fb5014SJunchao Zhang } 5959d1c8addSJunchao Zhang PetscFunctionReturn(0); 5969d1c8addSJunchao Zhang } 5979d1c8addSJunchao Zhang 598eb02082bSJunchao Zhang PetscErrorCode PetscSFPackSetUp_Host(PetscSF sf,PetscSFPack link,MPI_Datatype unit) 59940e23c03SJunchao Zhang { 60040e23c03SJunchao Zhang PetscErrorCode ierr; 601b23bfdefSJunchao Zhang PetscInt nSignedChar=0,nUnsignedChar=0,nInt=0,nPetscInt=0,nPetscReal=0; 602b23bfdefSJunchao Zhang PetscBool is2Int,is2PetscInt; 60340e23c03SJunchao Zhang PetscMPIInt ni,na,nd,combiner; 60440e23c03SJunchao Zhang #if defined(PETSC_HAVE_COMPLEX) 605b23bfdefSJunchao Zhang PetscInt nPetscComplex=0; 60640e23c03SJunchao Zhang #endif 60740e23c03SJunchao Zhang 60840e23c03SJunchao Zhang PetscFunctionBegin; 609b23bfdefSJunchao Zhang ierr = MPIPetsc_Type_compare_contig(unit,MPI_SIGNED_CHAR, &nSignedChar);CHKERRQ(ierr); 610b23bfdefSJunchao Zhang ierr = MPIPetsc_Type_compare_contig(unit,MPI_UNSIGNED_CHAR,&nUnsignedChar);CHKERRQ(ierr); 611b23bfdefSJunchao Zhang /* MPI_CHAR is treated below as a dumb type that does not support reduction according to MPI standard */ 612b23bfdefSJunchao Zhang ierr = MPIPetsc_Type_compare_contig(unit,MPI_INT, &nInt);CHKERRQ(ierr); 613b23bfdefSJunchao Zhang ierr = MPIPetsc_Type_compare_contig(unit,MPIU_INT, &nPetscInt);CHKERRQ(ierr); 614b23bfdefSJunchao Zhang ierr = MPIPetsc_Type_compare_contig(unit,MPIU_REAL,&nPetscReal);CHKERRQ(ierr); 61540e23c03SJunchao Zhang #if defined(PETSC_HAVE_COMPLEX) 616b23bfdefSJunchao Zhang ierr = MPIPetsc_Type_compare_contig(unit,MPIU_COMPLEX,&nPetscComplex);CHKERRQ(ierr); 61740e23c03SJunchao Zhang #endif 61840e23c03SJunchao Zhang ierr = MPIPetsc_Type_compare(unit,MPI_2INT,&is2Int);CHKERRQ(ierr); 61940e23c03SJunchao Zhang ierr = MPIPetsc_Type_compare(unit,MPIU_2INT,&is2PetscInt);CHKERRQ(ierr); 620b23bfdefSJunchao Zhang /* TODO: shaell we also handle Fortran MPI_2REAL? */ 62140e23c03SJunchao Zhang ierr = MPI_Type_get_envelope(unit,&ni,&na,&nd,&combiner);CHKERRQ(ierr); 6225ad15460SJunchao Zhang link->isbuiltin = (combiner == MPI_COMBINER_NAMED) ? PETSC_TRUE : PETSC_FALSE; /* unit is MPI builtin */ 623b23bfdefSJunchao Zhang link->bs = 1; /* default */ 62440e23c03SJunchao Zhang 625eb02082bSJunchao Zhang if (is2Int) { 626eb02082bSJunchao Zhang PackInit_PairType_int_int(link); 627eb02082bSJunchao Zhang link->bs = 1; 628eb02082bSJunchao Zhang link->unitbytes = 2*sizeof(int); 6295ad15460SJunchao Zhang link->isbuiltin = PETSC_TRUE; /* unit is PETSc builtin */ 630eb02082bSJunchao Zhang link->basicunit = MPI_2INT; 6315ad15460SJunchao Zhang link->unit = MPI_2INT; 632eb02082bSJunchao Zhang } else if (is2PetscInt) { /* TODO: when is2PetscInt and nPetscInt=2, we don't know which path to take. The two paths support different ops. */ 633eb02082bSJunchao Zhang PackInit_PairType_PetscInt_PetscInt(link); 634eb02082bSJunchao Zhang link->bs = 1; 635eb02082bSJunchao Zhang link->unitbytes = 2*sizeof(PetscInt); 636eb02082bSJunchao Zhang link->basicunit = MPIU_2INT; 6375ad15460SJunchao Zhang link->isbuiltin = PETSC_TRUE; /* unit is PETSc builtin */ 6385ad15460SJunchao Zhang link->unit = MPIU_2INT; 639eb02082bSJunchao Zhang } else if (nPetscReal) { 640b23bfdefSJunchao Zhang if (nPetscReal == 8) PackInit_RealType_PetscReal_8_1(link); else if (nPetscReal%8 == 0) PackInit_RealType_PetscReal_8_0(link); 641b23bfdefSJunchao Zhang else if (nPetscReal == 4) PackInit_RealType_PetscReal_4_1(link); else if (nPetscReal%4 == 0) PackInit_RealType_PetscReal_4_0(link); 642b23bfdefSJunchao Zhang else if (nPetscReal == 2) PackInit_RealType_PetscReal_2_1(link); else if (nPetscReal%2 == 0) PackInit_RealType_PetscReal_2_0(link); 643b23bfdefSJunchao Zhang else if (nPetscReal == 1) PackInit_RealType_PetscReal_1_1(link); else if (nPetscReal%1 == 0) PackInit_RealType_PetscReal_1_0(link); 644b23bfdefSJunchao Zhang link->bs = nPetscReal; 645eb02082bSJunchao Zhang link->unitbytes = nPetscReal*sizeof(PetscReal); 64640e23c03SJunchao Zhang link->basicunit = MPIU_REAL; 6475ad15460SJunchao Zhang if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPIU_REAL;} 648b23bfdefSJunchao Zhang } else if (nPetscInt) { 649b23bfdefSJunchao Zhang if (nPetscInt == 8) PackInit_IntegerType_PetscInt_8_1(link); else if (nPetscInt%8 == 0) PackInit_IntegerType_PetscInt_8_0(link); 650b23bfdefSJunchao Zhang else if (nPetscInt == 4) PackInit_IntegerType_PetscInt_4_1(link); else if (nPetscInt%4 == 0) PackInit_IntegerType_PetscInt_4_0(link); 651b23bfdefSJunchao Zhang else if (nPetscInt == 2) PackInit_IntegerType_PetscInt_2_1(link); else if (nPetscInt%2 == 0) PackInit_IntegerType_PetscInt_2_0(link); 652b23bfdefSJunchao Zhang else if (nPetscInt == 1) PackInit_IntegerType_PetscInt_1_1(link); else if (nPetscInt%1 == 0) PackInit_IntegerType_PetscInt_1_0(link); 653b23bfdefSJunchao Zhang link->bs = nPetscInt; 654eb02082bSJunchao Zhang link->unitbytes = nPetscInt*sizeof(PetscInt); 655b23bfdefSJunchao Zhang link->basicunit = MPIU_INT; 6565ad15460SJunchao Zhang if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPIU_INT;} 657b23bfdefSJunchao Zhang #if defined(PETSC_USE_64BIT_INDICES) 658b23bfdefSJunchao Zhang } else if (nInt) { 659b23bfdefSJunchao Zhang if (nInt == 8) PackInit_IntegerType_int_8_1(link); else if (nInt%8 == 0) PackInit_IntegerType_int_8_0(link); 660b23bfdefSJunchao Zhang else if (nInt == 4) PackInit_IntegerType_int_4_1(link); else if (nInt%4 == 0) PackInit_IntegerType_int_4_0(link); 661b23bfdefSJunchao Zhang else if (nInt == 2) PackInit_IntegerType_int_2_1(link); else if (nInt%2 == 0) PackInit_IntegerType_int_2_0(link); 662b23bfdefSJunchao Zhang else if (nInt == 1) PackInit_IntegerType_int_1_1(link); else if (nInt%1 == 0) PackInit_IntegerType_int_1_0(link); 663b23bfdefSJunchao Zhang link->bs = nInt; 664eb02082bSJunchao Zhang link->unitbytes = nInt*sizeof(int); 665b23bfdefSJunchao Zhang link->basicunit = MPI_INT; 6665ad15460SJunchao Zhang if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPI_INT;} 667b23bfdefSJunchao Zhang #endif 668b23bfdefSJunchao Zhang } else if (nSignedChar) { 669b23bfdefSJunchao Zhang if (nSignedChar == 8) PackInit_IntegerType_SignedChar_8_1(link); else if (nSignedChar%8 == 0) PackInit_IntegerType_SignedChar_8_0(link); 670b23bfdefSJunchao Zhang else if (nSignedChar == 4) PackInit_IntegerType_SignedChar_4_1(link); else if (nSignedChar%4 == 0) PackInit_IntegerType_SignedChar_4_0(link); 671b23bfdefSJunchao Zhang else if (nSignedChar == 2) PackInit_IntegerType_SignedChar_2_1(link); else if (nSignedChar%2 == 0) PackInit_IntegerType_SignedChar_2_0(link); 672b23bfdefSJunchao Zhang else if (nSignedChar == 1) PackInit_IntegerType_SignedChar_1_1(link); else if (nSignedChar%1 == 0) PackInit_IntegerType_SignedChar_1_0(link); 673b23bfdefSJunchao Zhang link->bs = nSignedChar; 674eb02082bSJunchao Zhang link->unitbytes = nSignedChar*sizeof(SignedChar); 675b23bfdefSJunchao Zhang link->basicunit = MPI_SIGNED_CHAR; 6765ad15460SJunchao Zhang if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPI_SIGNED_CHAR;} 677b23bfdefSJunchao Zhang } else if (nUnsignedChar) { 678b23bfdefSJunchao Zhang if (nUnsignedChar == 8) PackInit_IntegerType_UnsignedChar_8_1(link); else if (nUnsignedChar%8 == 0) PackInit_IntegerType_UnsignedChar_8_0(link); 679b23bfdefSJunchao Zhang else if (nUnsignedChar == 4) PackInit_IntegerType_UnsignedChar_4_1(link); else if (nUnsignedChar%4 == 0) PackInit_IntegerType_UnsignedChar_4_0(link); 680b23bfdefSJunchao Zhang else if (nUnsignedChar == 2) PackInit_IntegerType_UnsignedChar_2_1(link); else if (nUnsignedChar%2 == 0) PackInit_IntegerType_UnsignedChar_2_0(link); 681b23bfdefSJunchao Zhang else if (nUnsignedChar == 1) PackInit_IntegerType_UnsignedChar_1_1(link); else if (nUnsignedChar%1 == 0) PackInit_IntegerType_UnsignedChar_1_0(link); 682b23bfdefSJunchao Zhang link->bs = nUnsignedChar; 683eb02082bSJunchao Zhang link->unitbytes = nUnsignedChar*sizeof(UnsignedChar); 684b23bfdefSJunchao Zhang link->basicunit = MPI_UNSIGNED_CHAR; 6855ad15460SJunchao Zhang if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPI_UNSIGNED_CHAR;} 68640e23c03SJunchao Zhang #if defined(PETSC_HAVE_COMPLEX) 687b23bfdefSJunchao Zhang } else if (nPetscComplex) { 688b23bfdefSJunchao Zhang if (nPetscComplex == 8) PackInit_ComplexType_PetscComplex_8_1(link); else if (nPetscComplex%8 == 0) PackInit_ComplexType_PetscComplex_8_0(link); 689b23bfdefSJunchao Zhang else if (nPetscComplex == 4) PackInit_ComplexType_PetscComplex_4_1(link); else if (nPetscComplex%4 == 0) PackInit_ComplexType_PetscComplex_4_0(link); 690b23bfdefSJunchao Zhang else if (nPetscComplex == 2) PackInit_ComplexType_PetscComplex_2_1(link); else if (nPetscComplex%2 == 0) PackInit_ComplexType_PetscComplex_2_0(link); 691b23bfdefSJunchao Zhang else if (nPetscComplex == 1) PackInit_ComplexType_PetscComplex_1_1(link); else if (nPetscComplex%1 == 0) PackInit_ComplexType_PetscComplex_1_0(link); 692b23bfdefSJunchao Zhang link->bs = nPetscComplex; 693eb02082bSJunchao Zhang link->unitbytes = nPetscComplex*sizeof(PetscComplex); 69440e23c03SJunchao Zhang link->basicunit = MPIU_COMPLEX; 6955ad15460SJunchao Zhang if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPIU_COMPLEX;} 69640e23c03SJunchao Zhang #endif 69740e23c03SJunchao Zhang } else { 698b23bfdefSJunchao Zhang MPI_Aint lb,nbyte; 699b23bfdefSJunchao Zhang ierr = MPI_Type_get_extent(unit,&lb,&nbyte);CHKERRQ(ierr); 70040e23c03SJunchao Zhang if (lb != 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Datatype with nonzero lower bound %ld\n",(long)lb); 701eb02082bSJunchao Zhang if (nbyte % sizeof(int)) { /* If the type size is not multiple of int */ 702eb02082bSJunchao Zhang if (nbyte == 4) PackInit_DumbType_char_4_1(link); else if (nbyte%4 == 0) PackInit_DumbType_char_4_0(link); 703eb02082bSJunchao Zhang else if (nbyte == 2) PackInit_DumbType_char_2_1(link); else if (nbyte%2 == 0) PackInit_DumbType_char_2_0(link); 704eb02082bSJunchao Zhang else if (nbyte == 1) PackInit_DumbType_char_1_1(link); else if (nbyte%1 == 0) PackInit_DumbType_char_1_0(link); 705eb02082bSJunchao Zhang link->bs = nbyte; 706b23bfdefSJunchao Zhang link->unitbytes = nbyte; 707b23bfdefSJunchao Zhang link->basicunit = MPI_BYTE; 70840e23c03SJunchao Zhang } else { 709eb02082bSJunchao Zhang nInt = nbyte / sizeof(int); 710eb02082bSJunchao Zhang if (nInt == 8) PackInit_DumbType_DumbInt_8_1(link); else if (nInt%8 == 0) PackInit_DumbType_DumbInt_8_0(link); 711eb02082bSJunchao Zhang else if (nInt == 4) PackInit_DumbType_DumbInt_4_1(link); else if (nInt%4 == 0) PackInit_DumbType_DumbInt_4_0(link); 712eb02082bSJunchao Zhang else if (nInt == 2) PackInit_DumbType_DumbInt_2_1(link); else if (nInt%2 == 0) PackInit_DumbType_DumbInt_2_0(link); 713eb02082bSJunchao Zhang else if (nInt == 1) PackInit_DumbType_DumbInt_1_1(link); else if (nInt%1 == 0) PackInit_DumbType_DumbInt_1_0(link); 714eb02082bSJunchao Zhang link->bs = nInt; 715b23bfdefSJunchao Zhang link->unitbytes = nbyte; 71640e23c03SJunchao Zhang link->basicunit = MPI_INT; 71740e23c03SJunchao Zhang } 7185ad15460SJunchao Zhang if (link->isbuiltin) link->unit = unit; 71940e23c03SJunchao Zhang } 720b23bfdefSJunchao Zhang 7215ad15460SJunchao Zhang if (!link->isbuiltin) {ierr = MPI_Type_dup(unit,&link->unit);CHKERRQ(ierr);} 72240e23c03SJunchao Zhang PetscFunctionReturn(0); 72340e23c03SJunchao Zhang } 72440e23c03SJunchao Zhang 725eb02082bSJunchao Zhang PetscErrorCode PetscSFPackGetUnpackAndOp(PetscSFPack link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**UnpackAndOp)(PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,void*,const void*)) 72640e23c03SJunchao Zhang { 72740e23c03SJunchao Zhang PetscFunctionBegin; 72840e23c03SJunchao Zhang *UnpackAndOp = NULL; 729eb02082bSJunchao Zhang if (mtype == PETSC_MEMTYPE_HOST) { 730eb02082bSJunchao Zhang if (op == MPIU_REPLACE) *UnpackAndOp = link->h_UnpackAndInsert; 731eb02082bSJunchao Zhang else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->h_UnpackAndAdd; 732eb02082bSJunchao Zhang else if (op == MPI_PROD) *UnpackAndOp = link->h_UnpackAndMult; 733eb02082bSJunchao Zhang else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->h_UnpackAndMax; 734eb02082bSJunchao Zhang else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->h_UnpackAndMin; 735eb02082bSJunchao Zhang else if (op == MPI_LAND) *UnpackAndOp = link->h_UnpackAndLAND; 736eb02082bSJunchao Zhang else if (op == MPI_BAND) *UnpackAndOp = link->h_UnpackAndBAND; 737eb02082bSJunchao Zhang else if (op == MPI_LOR) *UnpackAndOp = link->h_UnpackAndLOR; 738eb02082bSJunchao Zhang else if (op == MPI_BOR) *UnpackAndOp = link->h_UnpackAndBOR; 739eb02082bSJunchao Zhang else if (op == MPI_LXOR) *UnpackAndOp = link->h_UnpackAndLXOR; 740eb02082bSJunchao Zhang else if (op == MPI_BXOR) *UnpackAndOp = link->h_UnpackAndBXOR; 741eb02082bSJunchao Zhang else if (op == MPI_MAXLOC) *UnpackAndOp = link->h_UnpackAndMaxloc; 742eb02082bSJunchao Zhang else if (op == MPI_MINLOC) *UnpackAndOp = link->h_UnpackAndMinloc; 743eb02082bSJunchao Zhang } 744eb02082bSJunchao Zhang #if defined(PETSC_HAVE_CUDA) 745eb02082bSJunchao Zhang else if (mtype == PETSC_MEMTYPE_DEVICE && !atomic) { 746eb02082bSJunchao Zhang if (op == MPIU_REPLACE) *UnpackAndOp = link->d_UnpackAndInsert; 747eb02082bSJunchao Zhang else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->d_UnpackAndAdd; 748eb02082bSJunchao Zhang else if (op == MPI_PROD) *UnpackAndOp = link->d_UnpackAndMult; 749eb02082bSJunchao Zhang else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->d_UnpackAndMax; 750eb02082bSJunchao Zhang else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->d_UnpackAndMin; 751eb02082bSJunchao Zhang else if (op == MPI_LAND) *UnpackAndOp = link->d_UnpackAndLAND; 752eb02082bSJunchao Zhang else if (op == MPI_BAND) *UnpackAndOp = link->d_UnpackAndBAND; 753eb02082bSJunchao Zhang else if (op == MPI_LOR) *UnpackAndOp = link->d_UnpackAndLOR; 754eb02082bSJunchao Zhang else if (op == MPI_BOR) *UnpackAndOp = link->d_UnpackAndBOR; 755eb02082bSJunchao Zhang else if (op == MPI_LXOR) *UnpackAndOp = link->d_UnpackAndLXOR; 756eb02082bSJunchao Zhang else if (op == MPI_BXOR) *UnpackAndOp = link->d_UnpackAndBXOR; 757eb02082bSJunchao Zhang else if (op == MPI_MAXLOC) *UnpackAndOp = link->d_UnpackAndMaxloc; 758eb02082bSJunchao Zhang else if (op == MPI_MINLOC) *UnpackAndOp = link->d_UnpackAndMinloc; 759eb02082bSJunchao Zhang } else if (mtype == PETSC_MEMTYPE_DEVICE && atomic) { 760eb02082bSJunchao Zhang if (op == MPIU_REPLACE) *UnpackAndOp = link->da_UnpackAndInsert; 761eb02082bSJunchao Zhang else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->da_UnpackAndAdd; 762eb02082bSJunchao Zhang else if (op == MPI_PROD) *UnpackAndOp = link->da_UnpackAndMult; 763eb02082bSJunchao Zhang else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->da_UnpackAndMax; 764eb02082bSJunchao Zhang else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->da_UnpackAndMin; 765eb02082bSJunchao Zhang else if (op == MPI_LAND) *UnpackAndOp = link->da_UnpackAndLAND; 766eb02082bSJunchao Zhang else if (op == MPI_BAND) *UnpackAndOp = link->da_UnpackAndBAND; 767eb02082bSJunchao Zhang else if (op == MPI_LOR) *UnpackAndOp = link->da_UnpackAndLOR; 768eb02082bSJunchao Zhang else if (op == MPI_BOR) *UnpackAndOp = link->da_UnpackAndBOR; 769eb02082bSJunchao Zhang else if (op == MPI_LXOR) *UnpackAndOp = link->da_UnpackAndLXOR; 770eb02082bSJunchao Zhang else if (op == MPI_BXOR) *UnpackAndOp = link->da_UnpackAndBXOR; 771eb02082bSJunchao Zhang else if (op == MPI_MAXLOC) *UnpackAndOp = link->da_UnpackAndMaxloc; 772eb02082bSJunchao Zhang else if (op == MPI_MINLOC) *UnpackAndOp = link->da_UnpackAndMinloc; 773eb02082bSJunchao Zhang } 774eb02082bSJunchao Zhang #endif 775eb02082bSJunchao Zhang else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Wrong PetscMemType %D",mtype); 776eb02082bSJunchao Zhang 77740e23c03SJunchao Zhang PetscFunctionReturn(0); 77840e23c03SJunchao Zhang } 77940e23c03SJunchao Zhang 780eb02082bSJunchao Zhang PetscErrorCode PetscSFPackGetFetchAndOp(PetscSFPack link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**FetchAndOp)(PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,void*,void*)) 78140e23c03SJunchao Zhang { 78240e23c03SJunchao Zhang PetscFunctionBegin; 78340e23c03SJunchao Zhang *FetchAndOp = NULL; 784eb02082bSJunchao Zhang if (mtype == PETSC_MEMTYPE_HOST) { 785eb02082bSJunchao Zhang if (op == MPIU_REPLACE) *FetchAndOp = link->h_FetchAndInsert; 786eb02082bSJunchao Zhang else if (op == MPI_SUM || op == MPIU_SUM) *FetchAndOp = link->h_FetchAndAdd; 787eb02082bSJunchao Zhang else if (op == MPI_MAX || op == MPIU_MAX) *FetchAndOp = link->h_FetchAndMax; 788eb02082bSJunchao Zhang else if (op == MPI_MIN || op == MPIU_MIN) *FetchAndOp = link->h_FetchAndMin; 789eb02082bSJunchao Zhang else if (op == MPI_MAXLOC) *FetchAndOp = link->h_FetchAndMaxloc; 790eb02082bSJunchao Zhang else if (op == MPI_MINLOC) *FetchAndOp = link->h_FetchAndMinloc; 791eb02082bSJunchao Zhang else if (op == MPI_PROD) *FetchAndOp = link->h_FetchAndMult; 792eb02082bSJunchao Zhang else if (op == MPI_LAND) *FetchAndOp = link->h_FetchAndLAND; 793eb02082bSJunchao Zhang else if (op == MPI_BAND) *FetchAndOp = link->h_FetchAndBAND; 794eb02082bSJunchao Zhang else if (op == MPI_LOR) *FetchAndOp = link->h_FetchAndLOR; 795eb02082bSJunchao Zhang else if (op == MPI_BOR) *FetchAndOp = link->h_FetchAndBOR; 796eb02082bSJunchao Zhang else if (op == MPI_LXOR) *FetchAndOp = link->h_FetchAndLXOR; 797eb02082bSJunchao Zhang else if (op == MPI_BXOR) *FetchAndOp = link->h_FetchAndBXOR; 798eb02082bSJunchao Zhang else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for MPI_Op"); 799eb02082bSJunchao Zhang } 800eb02082bSJunchao Zhang #if defined(PETSC_HAVE_CUDA) 801eb02082bSJunchao Zhang else if (mtype == PETSC_MEMTYPE_DEVICE && !atomic) { 802eb02082bSJunchao Zhang if (op == MPIU_REPLACE) *FetchAndOp = link->d_FetchAndInsert; 803eb02082bSJunchao Zhang else if (op == MPI_SUM || op == MPIU_SUM) *FetchAndOp = link->d_FetchAndAdd; 804eb02082bSJunchao Zhang else if (op == MPI_MAX || op == MPIU_MAX) *FetchAndOp = link->d_FetchAndMax; 805eb02082bSJunchao Zhang else if (op == MPI_MIN || op == MPIU_MIN) *FetchAndOp = link->d_FetchAndMin; 806eb02082bSJunchao Zhang else if (op == MPI_MAXLOC) *FetchAndOp = link->d_FetchAndMaxloc; 807eb02082bSJunchao Zhang else if (op == MPI_MINLOC) *FetchAndOp = link->d_FetchAndMinloc; 808eb02082bSJunchao Zhang else if (op == MPI_PROD) *FetchAndOp = link->d_FetchAndMult; 809eb02082bSJunchao Zhang else if (op == MPI_LAND) *FetchAndOp = link->d_FetchAndLAND; 810eb02082bSJunchao Zhang else if (op == MPI_BAND) *FetchAndOp = link->d_FetchAndBAND; 811eb02082bSJunchao Zhang else if (op == MPI_LOR) *FetchAndOp = link->d_FetchAndLOR; 812eb02082bSJunchao Zhang else if (op == MPI_BOR) *FetchAndOp = link->d_FetchAndBOR; 813eb02082bSJunchao Zhang else if (op == MPI_LXOR) *FetchAndOp = link->d_FetchAndLXOR; 814eb02082bSJunchao Zhang else if (op == MPI_BXOR) *FetchAndOp = link->d_FetchAndBXOR; 815eb02082bSJunchao Zhang else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for MPI_Op"); 816eb02082bSJunchao Zhang } else if (mtype == PETSC_MEMTYPE_DEVICE && atomic) { 817eb02082bSJunchao Zhang if (op == MPIU_REPLACE) *FetchAndOp = link->da_FetchAndInsert; 818eb02082bSJunchao Zhang else if (op == MPI_SUM || op == MPIU_SUM) *FetchAndOp = link->da_FetchAndAdd; 819eb02082bSJunchao Zhang else if (op == MPI_MAX || op == MPIU_MAX) *FetchAndOp = link->da_FetchAndMax; 820eb02082bSJunchao Zhang else if (op == MPI_MIN || op == MPIU_MIN) *FetchAndOp = link->da_FetchAndMin; 821eb02082bSJunchao Zhang else if (op == MPI_MAXLOC) *FetchAndOp = link->da_FetchAndMaxloc; 822eb02082bSJunchao Zhang else if (op == MPI_MINLOC) *FetchAndOp = link->da_FetchAndMinloc; 823eb02082bSJunchao Zhang else if (op == MPI_PROD) *FetchAndOp = link->da_FetchAndMult; 824eb02082bSJunchao Zhang else if (op == MPI_LAND) *FetchAndOp = link->da_FetchAndLAND; 825eb02082bSJunchao Zhang else if (op == MPI_BAND) *FetchAndOp = link->da_FetchAndBAND; 826eb02082bSJunchao Zhang else if (op == MPI_LOR) *FetchAndOp = link->da_FetchAndLOR; 827eb02082bSJunchao Zhang else if (op == MPI_BOR) *FetchAndOp = link->da_FetchAndBOR; 828eb02082bSJunchao Zhang else if (op == MPI_LXOR) *FetchAndOp = link->da_FetchAndLXOR; 829eb02082bSJunchao Zhang else if (op == MPI_BXOR) *FetchAndOp = link->da_FetchAndBXOR; 830eb02082bSJunchao Zhang else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for MPI_Op"); 831eb02082bSJunchao Zhang } 832eb02082bSJunchao Zhang #endif 833eb02082bSJunchao Zhang else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Wrong PetscMemType %D",mtype); 83440e23c03SJunchao Zhang PetscFunctionReturn(0); 83540e23c03SJunchao Zhang } 83640e23c03SJunchao Zhang 83740e23c03SJunchao Zhang /* 838eb02082bSJunchao Zhang Create pack/unpack optimization plans based on indice patterns available 83940e23c03SJunchao Zhang 84040e23c03SJunchao Zhang Input Parameters: 841b23bfdefSJunchao Zhang + n - Number of target ranks 842eb02082bSJunchao Zhang . offset - [n+1] For the i-th rank, its associated indices are idx[offset[i], offset[i+1]). offset[0] needs not to be 0. 843b23bfdefSJunchao Zhang - idx - [*] Array storing indices 84440e23c03SJunchao Zhang 84540e23c03SJunchao Zhang Output Parameters: 846eb02082bSJunchao Zhang + opt - Optimization plans. Maybe NULL if no optimization can be built. 84740e23c03SJunchao Zhang */ 848eb02082bSJunchao Zhang PetscErrorCode PetscSFPackOptCreate(PetscInt n,const PetscInt *offset,const PetscInt *idx,PetscSFPackOpt *out) 84940e23c03SJunchao Zhang { 85040e23c03SJunchao Zhang PetscErrorCode ierr; 85140e23c03SJunchao Zhang PetscInt i,j,k,n_copies,tot_copies=0,step; 852b23bfdefSJunchao Zhang PetscBool strided,optimized=PETSC_FALSE; 85340e23c03SJunchao Zhang PetscSFPackOpt opt; 85440e23c03SJunchao Zhang 85540e23c03SJunchao Zhang PetscFunctionBegin; 856b23bfdefSJunchao Zhang if (!n) { 857b23bfdefSJunchao Zhang *out = NULL; 858b23bfdefSJunchao Zhang PetscFunctionReturn(0); 859b23bfdefSJunchao Zhang } 860b23bfdefSJunchao Zhang 86140e23c03SJunchao Zhang ierr = PetscCalloc1(1,&opt);CHKERRQ(ierr); 862b23bfdefSJunchao Zhang ierr = PetscCalloc3(n,&opt->type,n+1,&opt->offset,n+1,&opt->copy_offset);CHKERRQ(ierr); 863b23bfdefSJunchao Zhang ierr = PetscArraycpy(opt->offset,offset,n+1);CHKERRQ(ierr); 864b23bfdefSJunchao Zhang if (offset[0]) {for (i=0; i<n+1; i++) opt->offset[i] -= offset[0];} /* Zero-base offset[]. Note the packing routine is Pack(count, idx[], ...*/ 865b23bfdefSJunchao Zhang 866b23bfdefSJunchao Zhang opt->n = n; 86740e23c03SJunchao Zhang 86840e23c03SJunchao Zhang /* Check if the indices are piece-wise contiguous (if yes, we can optimize a packing with mulitple memcpy's ) */ 86940e23c03SJunchao Zhang for (i=0; i<n; i++) { /* for each target processor */ 87040e23c03SJunchao Zhang /* Scan indices to count n_copies -- the number of contiguous pieces for i-th target */ 87140e23c03SJunchao Zhang n_copies = 1; 87240e23c03SJunchao Zhang for (j=offset[i]; j<offset[i+1]-1; j++) { 87340e23c03SJunchao Zhang if (idx[j]+1 != idx[j+1]) n_copies++; 87440e23c03SJunchao Zhang } 87540e23c03SJunchao Zhang /* If the average length (in no. of indices) of contiguous pieces is long enough, say >=32, 87640e23c03SJunchao Zhang then it is worth using memcpy for this target. 32 is an arbitrarily chosen number. 87740e23c03SJunchao Zhang */ 87840e23c03SJunchao Zhang if ((offset[i+1]-offset[i])/n_copies >= 32) { 879b23bfdefSJunchao Zhang opt->type[i] = PETSCSF_PACKOPT_MULTICOPY; 880b23bfdefSJunchao Zhang optimized = PETSC_TRUE; 88140e23c03SJunchao Zhang tot_copies += n_copies; 88240e23c03SJunchao Zhang } 88340e23c03SJunchao Zhang } 88440e23c03SJunchao Zhang 88540e23c03SJunchao Zhang /* Setup memcpy plan for each contiguous piece */ 88640e23c03SJunchao Zhang k = 0; /* k-th copy */ 887b23bfdefSJunchao Zhang ierr = PetscMalloc4(tot_copies,&opt->copy_start,tot_copies,&opt->copy_length,n,&opt->stride_step,n,&opt->stride_n);CHKERRQ(ierr); 888b23bfdefSJunchao Zhang for (i=0; i<n; i++) { /* for each target processor */ 889b23bfdefSJunchao Zhang if (opt->type[i] == PETSCSF_PACKOPT_MULTICOPY) { 89040e23c03SJunchao Zhang n_copies = 1; 891b23bfdefSJunchao Zhang opt->copy_start[k] = offset[i] - offset[0]; 89240e23c03SJunchao Zhang for (j=offset[i]; j<offset[i+1]-1; j++) { 89340e23c03SJunchao Zhang if (idx[j]+1 != idx[j+1]) { /* meet end of a copy (and next copy must exist) */ 89440e23c03SJunchao Zhang n_copies++; 895b23bfdefSJunchao Zhang opt->copy_start[k+1] = j-offset[0]+1; 896b23bfdefSJunchao Zhang opt->copy_length[k] = opt->copy_start[k+1] - opt->copy_start[k]; 89740e23c03SJunchao Zhang k++; 89840e23c03SJunchao Zhang } 89940e23c03SJunchao Zhang } 90040e23c03SJunchao Zhang /* Set copy length of the last copy for this target */ 901b23bfdefSJunchao Zhang opt->copy_length[k] = j-offset[0]+1 - opt->copy_start[k]; 90240e23c03SJunchao Zhang k++; 90340e23c03SJunchao Zhang } 904b23bfdefSJunchao Zhang /* Set offset for next target. When opt->type[i]=PETSCSF_PACKOPT_NONE, copy_offsets[i]=copy_offsets[i+1] */ 90540e23c03SJunchao Zhang opt->copy_offset[i+1] = k; 90640e23c03SJunchao Zhang } 90740e23c03SJunchao Zhang 90840e23c03SJunchao Zhang /* Last chance! If the indices do not have long contiguous pieces, are they strided? */ 90940e23c03SJunchao Zhang for (i=0; i<n; i++) { /* for each remote */ 910b23bfdefSJunchao Zhang if (opt->type[i]==PETSCSF_PACKOPT_NONE && (offset[i+1] - offset[i]) >= 16) { /* few indices (<16) are not worth striding */ 91140e23c03SJunchao Zhang strided = PETSC_TRUE; 91240e23c03SJunchao Zhang step = idx[offset[i]+1] - idx[offset[i]]; 91340e23c03SJunchao Zhang for (j=offset[i]; j<offset[i+1]-1; j++) { 91440e23c03SJunchao Zhang if (idx[j]+step != idx[j+1]) { strided = PETSC_FALSE; break; } 91540e23c03SJunchao Zhang } 91640e23c03SJunchao Zhang if (strided) { 917b23bfdefSJunchao Zhang opt->type[i] = PETSCSF_PACKOPT_STRIDE; 91840e23c03SJunchao Zhang opt->stride_step[i] = step; 91940e23c03SJunchao Zhang opt->stride_n[i] = offset[i+1] - offset[i]; 920b23bfdefSJunchao Zhang optimized = PETSC_TRUE; 92140e23c03SJunchao Zhang } 92240e23c03SJunchao Zhang } 92340e23c03SJunchao Zhang } 924b23bfdefSJunchao Zhang /* If no rank gets optimized, free arrays to save memory */ 925b23bfdefSJunchao Zhang if (!optimized) { 926b23bfdefSJunchao Zhang ierr = PetscFree3(opt->type,opt->offset,opt->copy_offset);CHKERRQ(ierr); 927b23bfdefSJunchao Zhang ierr = PetscFree4(opt->copy_start,opt->copy_length,opt->stride_step,opt->stride_n);CHKERRQ(ierr); 92840e23c03SJunchao Zhang ierr = PetscFree(opt);CHKERRQ(ierr); 92940e23c03SJunchao Zhang *out = NULL; 93040e23c03SJunchao Zhang } else *out = opt; 93140e23c03SJunchao Zhang PetscFunctionReturn(0); 93240e23c03SJunchao Zhang } 93340e23c03SJunchao Zhang 93464f49babSJed Brown PetscErrorCode PetscSFPackOptDestroy(PetscSFPackOpt *out) 93540e23c03SJunchao Zhang { 93640e23c03SJunchao Zhang PetscErrorCode ierr; 93740e23c03SJunchao Zhang PetscSFPackOpt opt = *out; 93840e23c03SJunchao Zhang 93940e23c03SJunchao Zhang PetscFunctionBegin; 94040e23c03SJunchao Zhang if (opt) { 941b23bfdefSJunchao Zhang ierr = PetscFree3(opt->type,opt->offset,opt->copy_offset);CHKERRQ(ierr); 942b23bfdefSJunchao Zhang ierr = PetscFree4(opt->copy_start,opt->copy_length,opt->stride_step,opt->stride_n);CHKERRQ(ierr); 94340e23c03SJunchao Zhang ierr = PetscFree(opt);CHKERRQ(ierr); 94440e23c03SJunchao Zhang *out = NULL; 94540e23c03SJunchao Zhang } 94640e23c03SJunchao Zhang PetscFunctionReturn(0); 94740e23c03SJunchao Zhang } 948