140e23c03SJunchao Zhang 240e23c03SJunchao Zhang #include <../src/vec/is/sf/impls/basic/sfpack.h> 340e23c03SJunchao Zhang #include <../src/vec/is/sf/impls/basic/sfbasic.h> 440e23c03SJunchao Zhang 5eb02082bSJunchao Zhang #if defined(PETSC_HAVE_CUDA) 6eb02082bSJunchao Zhang #include <cuda_runtime.h> 7eb02082bSJunchao Zhang #endif 840e23c03SJunchao Zhang /* 940e23c03SJunchao Zhang * MPI_Reduce_local is not really useful because it can't handle sparse data and it vectorizes "in the wrong direction", 1040e23c03SJunchao Zhang * therefore we pack data types manually. This file defines packing routines for the standard data types. 1140e23c03SJunchao Zhang */ 1240e23c03SJunchao Zhang 13*cd620004SJunchao Zhang #define CPPJoin4(a,b,c,d) a##_##b##_##c##_##d 1440e23c03SJunchao Zhang 15*cd620004SJunchao Zhang /* Operations working like s += t */ 16*cd620004SJunchao Zhang #define OP_BINARY(op,s,t) do {(s) = (s) op (t); } while(0) /* binary ops in the middle such as +, *, && etc. */ 17*cd620004SJunchao Zhang #define OP_FUNCTION(op,s,t) do {(s) = op((s),(t)); } while(0) /* ops like a function, such as PetscMax, PetscMin */ 18*cd620004SJunchao Zhang #define OP_LXOR(op,s,t) do {(s) = (!(s)) != (!(t));} while(0) /* logical exclusive OR */ 19*cd620004SJunchao Zhang #define OP_ASSIGN(op,s,t) do {(s) = (t);} while(0) 20*cd620004SJunchao Zhang /* Ref MPI MAXLOC */ 21*cd620004SJunchao Zhang #define OP_XLOC(op,s,t) \ 22*cd620004SJunchao Zhang do { \ 23*cd620004SJunchao Zhang if ((s).u == (t).u) (s).i = PetscMin((s).i,(t).i); \ 24*cd620004SJunchao Zhang else if (!((s).u op (t).u)) s = t; \ 25*cd620004SJunchao Zhang } while(0) 2640e23c03SJunchao Zhang 2740e23c03SJunchao Zhang /* DEF_PackFunc - macro defining a Pack routine 2840e23c03SJunchao Zhang 2940e23c03SJunchao Zhang Arguments of the macro: 30b23bfdefSJunchao Zhang +Type Type of the basic data in an entry, i.e., int, PetscInt, PetscReal etc. It is not the type of an entry. 31b23bfdefSJunchao Zhang .BS Block size for vectorization. It is a factor of bs. 32b23bfdefSJunchao Zhang -EQ (bs == BS) ? 1 : 0. EQ is a compile-time const to help compiler optimizations. See below. 3340e23c03SJunchao Zhang 3440e23c03SJunchao Zhang Arguments of the Pack routine: 35*cd620004SJunchao Zhang +count Number of indices in idx[]. 36*cd620004SJunchao Zhang .start If indices are contiguous, it is the first index; otherwise, not used. 37*cd620004SJunchao Zhang .idx Indices of entries to packed. NULL means contiguous indices, that is [start,start+count). 38eb02082bSJunchao Zhang .link Provide a context for the current call, such as link->bs, number of basic types in an entry. Ex. if unit is MPI_2INT, then bs=2 and the basic type is int. 39*cd620004SJunchao Zhang .opt Per-remote pack optimization plan. NULL means no such plan. 40*cd620004SJunchao Zhang .unpacked Address of the unpacked data. The entries will be packed are unpacked[idx[i]],for i in [0,count). 41*cd620004SJunchao Zhang -packed Address of the packed data. 4240e23c03SJunchao Zhang */ 43b23bfdefSJunchao Zhang #define DEF_PackFunc(Type,BS,EQ) \ 44*cd620004SJunchao Zhang static PetscErrorCode CPPJoin4(Pack,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,const PetscInt *idx,PetscSFPackOpt opt,const void *unpacked,void *packed) \ 45b23bfdefSJunchao Zhang { \ 4640e23c03SJunchao Zhang PetscErrorCode ierr; \ 47b23bfdefSJunchao Zhang const Type *u = (const Type*)unpacked,*u2; \ 48b23bfdefSJunchao Zhang Type *p = (Type*)packed,*p2; \ 49eb02082bSJunchao Zhang PetscInt i,j,k,l,r,step,bs=link->bs; \ 50b23bfdefSJunchao Zhang const PetscInt *idx2,M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */ \ 51b23bfdefSJunchao Zhang const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */ \ 5240e23c03SJunchao Zhang PetscFunctionBegin; \ 53*cd620004SJunchao Zhang if (!idx) {ierr = PetscArraycpy(p,u+start*bs,MBS*count);CHKERRQ(ierr);} /* Indices are contiguous */ \ 54b23bfdefSJunchao Zhang else if (!opt) { /* No optimizations available */ \ 55b23bfdefSJunchao Zhang for (i=0; i<count; i++) \ 56eb02082bSJunchao Zhang for (j=0; j<M; j++) /* Decent compilers should eliminate this loop when M = const 1 */ \ 57eb02082bSJunchao Zhang for (k=0; k<BS; k++) /* Compiler either unrolls (BS=1) or vectorizes (BS=2,4,8,etc) this loop */ \ 58b23bfdefSJunchao Zhang p[i*MBS+j*BS+k] = u[idx[i]*MBS+j*BS+k]; \ 59b23bfdefSJunchao Zhang } else { \ 60b23bfdefSJunchao Zhang for (r=0; r<opt->n; r++) { \ 61b23bfdefSJunchao Zhang p2 = p + opt->offset[r]*MBS; \ 62b23bfdefSJunchao Zhang if (opt->type[r] == PETSCSF_PACKOPT_NONE) { \ 63b23bfdefSJunchao Zhang idx2 = idx + opt->offset[r]; \ 64b23bfdefSJunchao Zhang for (i=0; i<opt->offset[r+1]-opt->offset[r]; i++) \ 65b23bfdefSJunchao Zhang for (j=0; j<M; j++) \ 66b23bfdefSJunchao Zhang for (k=0; k<BS; k++) \ 67b23bfdefSJunchao Zhang p2[i*MBS+j*BS+k] = u[idx2[i]*MBS+j*BS+k]; \ 68b23bfdefSJunchao Zhang } else if (opt->type[r] == PETSCSF_PACKOPT_MULTICOPY) { \ 6940e23c03SJunchao Zhang for (i=opt->copy_offset[r]; i<opt->copy_offset[r+1]; i++) { \ 70b23bfdefSJunchao Zhang u2 = u + idx[opt->copy_start[i]]*MBS; \ 71b23bfdefSJunchao Zhang l = opt->copy_length[i]*MBS; /* length in basic type such as MPI_INT */ \ 72b23bfdefSJunchao Zhang ierr = PetscArraycpy(p2,u2,l);CHKERRQ(ierr); \ 73b23bfdefSJunchao Zhang p2 += l; \ 7440e23c03SJunchao Zhang } \ 75b23bfdefSJunchao Zhang } else if (opt->type[r] == PETSCSF_PACKOPT_STRIDE) { \ 76b23bfdefSJunchao Zhang u2 = u + idx[opt->offset[r]]*MBS; \ 7740e23c03SJunchao Zhang step = opt->stride_step[r]; \ 7840e23c03SJunchao Zhang for (i=0; i<opt->stride_n[r]; i++) \ 79b23bfdefSJunchao Zhang for (j=0; j<MBS; j++) p2[i*MBS+j] = u2[i*step*MBS+j]; \ 80b23bfdefSJunchao Zhang } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Unknown SFPack optimzation type %D",opt->type[r]); \ 8140e23c03SJunchao Zhang } \ 8240e23c03SJunchao Zhang } \ 8340e23c03SJunchao Zhang PetscFunctionReturn(0); \ 8440e23c03SJunchao Zhang } 8540e23c03SJunchao Zhang 86*cd620004SJunchao Zhang /* DEF_Action - macro defining a UnpackAndInsert routine that unpacks data from a contiguous buffer 87*cd620004SJunchao Zhang and inserts into a sparse array. 8840e23c03SJunchao Zhang 8940e23c03SJunchao Zhang Arguments: 90b23bfdefSJunchao Zhang .Type Type of the data 9140e23c03SJunchao Zhang .BS Block size for vectorization 92b23bfdefSJunchao Zhang .EQ (bs == BS) ? 1 : 0. EQ is a compile-time const. 9340e23c03SJunchao Zhang 9440e23c03SJunchao Zhang Notes: 9540e23c03SJunchao Zhang This macro is not combined with DEF_ActionAndOp because we want to use memcpy in this macro. 9640e23c03SJunchao Zhang */ 97*cd620004SJunchao Zhang #define DEF_UnpackFunc(Type,BS,EQ) \ 98*cd620004SJunchao Zhang static PetscErrorCode CPPJoin4(UnpackAndInsert,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,const PetscInt *idx,PetscSFPackOpt opt,void *unpacked,const void *packed) \ 99b23bfdefSJunchao Zhang { \ 10040e23c03SJunchao Zhang PetscErrorCode ierr; \ 101b23bfdefSJunchao Zhang Type *u = (Type*)unpacked,*u2; \ 102*cd620004SJunchao Zhang const Type *p = (const Type*)packed,*p2; \ 103eb02082bSJunchao Zhang PetscInt i,j,k,l,r,step,bs=link->bs; \ 104b23bfdefSJunchao Zhang const PetscInt *idx2,M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */ \ 105b23bfdefSJunchao Zhang const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */ \ 10640e23c03SJunchao Zhang PetscFunctionBegin; \ 107b23bfdefSJunchao Zhang if (!idx) { \ 108*cd620004SJunchao Zhang u2 = u + start*bs; \ 109*cd620004SJunchao Zhang if (u2 != p) {ierr = PetscArraycpy(u2,p,count*MBS);CHKERRQ(ierr);} \ 110b23bfdefSJunchao Zhang } else if (!opt) { /* No optimizations available */ \ 111b23bfdefSJunchao Zhang for (i=0; i<count; i++) \ 112b23bfdefSJunchao Zhang for (j=0; j<M; j++) \ 113*cd620004SJunchao Zhang for (k=0; k<BS; k++) u[idx[i]*MBS+j*BS+k] = p[i*MBS+j*BS+k]; \ 114b23bfdefSJunchao Zhang } else { \ 115b23bfdefSJunchao Zhang for (r=0; r<opt->n; r++) { \ 116b23bfdefSJunchao Zhang p2 = p + opt->offset[r]*MBS; \ 117b23bfdefSJunchao Zhang if (opt->type[r] == PETSCSF_PACKOPT_NONE) { \ 118b23bfdefSJunchao Zhang idx2 = idx + opt->offset[r]; \ 119b23bfdefSJunchao Zhang for (i=0; i<opt->offset[r+1]-opt->offset[r]; i++) \ 120b23bfdefSJunchao Zhang for (j=0; j<M; j++) \ 121*cd620004SJunchao Zhang for (k=0; k<BS; k++) u[idx2[i]*MBS+j*BS+k] = p2[i*MBS+j*BS+k]; \ 122b23bfdefSJunchao Zhang } else if (opt->type[r] == PETSCSF_PACKOPT_MULTICOPY) { \ 12340e23c03SJunchao Zhang for (i=opt->copy_offset[r]; i<opt->copy_offset[r+1]; i++) { /* i-th piece */ \ 124b23bfdefSJunchao Zhang u2 = u + idx[opt->copy_start[i]]*MBS; \ 125b23bfdefSJunchao Zhang l = opt->copy_length[i]*MBS; \ 126b23bfdefSJunchao Zhang ierr = PetscArraycpy(u2,p2,l);CHKERRQ(ierr); \ 127b23bfdefSJunchao Zhang p2 += l; \ 12840e23c03SJunchao Zhang } \ 129b23bfdefSJunchao Zhang } else if (opt->type[r] == PETSCSF_PACKOPT_STRIDE) { \ 130b23bfdefSJunchao Zhang u2 = u + idx[opt->offset[r]]*MBS; \ 13140e23c03SJunchao Zhang step = opt->stride_step[r]; \ 13240e23c03SJunchao Zhang for (i=0; i<opt->stride_n[r]; i++) \ 133b23bfdefSJunchao Zhang for (j=0; j<M; j++) \ 134*cd620004SJunchao Zhang for (k=0; k<BS; k++) u2[i*step*MBS+j*BS+k] = p2[i*MBS+j*BS+k]; \ 135b23bfdefSJunchao Zhang } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Unknown SFPack optimzation type %D",opt->type[r]); \ 13640e23c03SJunchao Zhang } \ 13740e23c03SJunchao Zhang } \ 13840e23c03SJunchao Zhang PetscFunctionReturn(0); \ 13940e23c03SJunchao Zhang } 14040e23c03SJunchao Zhang 141*cd620004SJunchao Zhang /* DEF_UnpackAndOp - macro defining a UnpackAndOp routine where Op should not be Insert 14240e23c03SJunchao Zhang 14340e23c03SJunchao Zhang Arguments: 144*cd620004SJunchao Zhang +Opname Name of the Op, such as Add, Mult, LAND, etc. 145b23bfdefSJunchao Zhang .Type Type of the data 14640e23c03SJunchao Zhang .BS Block size for vectorization 147b23bfdefSJunchao Zhang .EQ (bs == BS) ? 1 : 0. EQ is a compile-time const. 148*cd620004SJunchao Zhang .Op Operator for the op, such as +, *, &&, ||, PetscMax, PetscMin, etc. 149*cd620004SJunchao Zhang .OpApply Macro defining application of the op. Could be OP_BINARY, OP_FUNCTION, OP_LXOR 15040e23c03SJunchao Zhang */ 151*cd620004SJunchao Zhang #define DEF_UnpackAndOp(Type,BS,EQ,Opname,Op,OpApply) \ 152*cd620004SJunchao Zhang static PetscErrorCode CPPJoin4(UnpackAnd##Opname,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,const PetscInt *idx,PetscSFPackOpt opt,void *unpacked,const void *packed) \ 153b23bfdefSJunchao Zhang { \ 154*cd620004SJunchao Zhang Type *u = (Type*)unpacked,*u2; \ 155*cd620004SJunchao Zhang const Type *p = (const Type*)packed,*p2; \ 156eb02082bSJunchao Zhang PetscInt i,j,k,l,r,step,bs=link->bs; \ 157b23bfdefSJunchao Zhang const PetscInt *idx2,M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */ \ 158b23bfdefSJunchao Zhang const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */ \ 15940e23c03SJunchao Zhang PetscFunctionBegin; \ 160b23bfdefSJunchao Zhang if (!idx) { \ 161*cd620004SJunchao Zhang u += start*bs; \ 162*cd620004SJunchao Zhang for (i=0; i<count; i++) \ 163*cd620004SJunchao Zhang for (j=0; j<M; j++) \ 164*cd620004SJunchao Zhang for (k=0; k<BS; k++) \ 165*cd620004SJunchao Zhang OpApply(Op,u[i*MBS+j*BS+k],p[i*MBS+j*BS+k]); \ 166*cd620004SJunchao Zhang } else if (!opt) { /* No optimizations available */ \ 167*cd620004SJunchao Zhang for (i=0; i<count; i++) \ 168*cd620004SJunchao Zhang for (j=0; j<M; j++) \ 169*cd620004SJunchao Zhang for (k=0; k<BS; k++) \ 170*cd620004SJunchao Zhang OpApply(Op,u[idx[i]*MBS+j*BS+k],p[i*MBS+j*BS+k]); \ 171*cd620004SJunchao Zhang } else { \ 172*cd620004SJunchao Zhang for (r=0; r<opt->n; r++) { \ 173*cd620004SJunchao Zhang p2 = p + opt->offset[r]*MBS; \ 174*cd620004SJunchao Zhang if (opt->type[r] == PETSCSF_PACKOPT_NONE) { \ 175*cd620004SJunchao Zhang idx2 = idx + opt->offset[r]; \ 176*cd620004SJunchao Zhang for (i=0; i<opt->offset[r+1]-opt->offset[r]; i++) \ 177*cd620004SJunchao Zhang for (j=0; j<M; j++) \ 178*cd620004SJunchao Zhang for (k=0; k<BS; k++) \ 179*cd620004SJunchao Zhang OpApply(Op,u[idx2[i]*MBS+j*BS+k],p2[i*MBS+j*BS+k]); \ 180*cd620004SJunchao Zhang } else if (opt->type[r] == PETSCSF_PACKOPT_MULTICOPY) { \ 181*cd620004SJunchao Zhang for (i=opt->copy_offset[r]; i<opt->copy_offset[r+1]; i++) { /* i-th piece */ \ 182*cd620004SJunchao Zhang u2 = u + idx[opt->copy_start[i]]*MBS; \ 183*cd620004SJunchao Zhang l = opt->copy_length[i]*MBS; \ 184*cd620004SJunchao Zhang for (j=0; j<l; j++) OpApply(Op,u2[j],p2[j]); \ 185*cd620004SJunchao Zhang p2 += l; \ 186*cd620004SJunchao Zhang } \ 187*cd620004SJunchao Zhang } else if (opt->type[r] == PETSCSF_PACKOPT_STRIDE) { \ 188*cd620004SJunchao Zhang u2 = u + idx[opt->offset[r]]*MBS; \ 189*cd620004SJunchao Zhang step = opt->stride_step[r]; \ 190*cd620004SJunchao Zhang for (i=0; i<opt->stride_n[r]; i++) \ 191*cd620004SJunchao Zhang for (j=0; j<M; j++) \ 192*cd620004SJunchao Zhang for (k=0; k<BS; k++) \ 193*cd620004SJunchao Zhang OpApply(Op,u2[i*step*MBS+j*BS+k],p2[i*MBS+j*BS+k]); \ 194*cd620004SJunchao Zhang } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Unknown SFPack optimzation type %D",opt->type[r]); \ 195*cd620004SJunchao Zhang } \ 196*cd620004SJunchao Zhang } \ 197*cd620004SJunchao Zhang PetscFunctionReturn(0); \ 198*cd620004SJunchao Zhang } 199*cd620004SJunchao Zhang 200*cd620004SJunchao Zhang #define DEF_FetchAndOp(Type,BS,EQ,Opname,Op,OpApply) \ 201*cd620004SJunchao Zhang static PetscErrorCode CPPJoin4(FetchAnd##Opname,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,const PetscInt *idx,PetscSFPackOpt opt,void *unpacked,void *packed) \ 202*cd620004SJunchao Zhang { \ 203*cd620004SJunchao Zhang Type *u = (Type*)unpacked,*p = (Type*)packed,t; \ 204*cd620004SJunchao Zhang PetscInt i,j,k,bs=link->bs; \ 205*cd620004SJunchao Zhang const PetscInt M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */ \ 206*cd620004SJunchao Zhang const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */ \ 207*cd620004SJunchao Zhang PetscFunctionBegin; \ 208*cd620004SJunchao Zhang if (!idx) { \ 209*cd620004SJunchao Zhang u += start*bs; \ 210b23bfdefSJunchao Zhang for (i=0; i<count; i++) \ 211b23bfdefSJunchao Zhang for (j=0; j<M; j++) \ 212b23bfdefSJunchao Zhang for (k=0; k<BS; k++) { \ 213b23bfdefSJunchao Zhang t = u[i*MBS+j*BS+k]; \ 214*cd620004SJunchao Zhang OpApply(Op,u[i*MBS+j*BS+k],p[i*MBS+j*BS+k]); \ 215*cd620004SJunchao Zhang p[i*MBS+j*BS+k] = t; \ 21640e23c03SJunchao Zhang } \ 217*cd620004SJunchao Zhang } else { \ 218b23bfdefSJunchao Zhang for (i=0; i<count; i++) \ 219b23bfdefSJunchao Zhang for (j=0; j<M; j++) \ 220b23bfdefSJunchao Zhang for (k=0; k<BS; k++) { \ 221b23bfdefSJunchao Zhang t = u[idx[i]*MBS+j*BS+k]; \ 222*cd620004SJunchao Zhang OpApply(Op,u[idx[i]*MBS+j*BS+k],p[i*MBS+j*BS+k]); \ 223*cd620004SJunchao Zhang p[i*MBS+j*BS+k] = t; \ 224*cd620004SJunchao Zhang } \ 225*cd620004SJunchao Zhang } \ 226*cd620004SJunchao Zhang PetscFunctionReturn(0); \ 227*cd620004SJunchao Zhang } 228*cd620004SJunchao Zhang 229*cd620004SJunchao Zhang #define DEF_ScatterAndOp(Type,BS,EQ,Opname,Op,OpApply) \ 230*cd620004SJunchao Zhang static PetscErrorCode CPPJoin4(ScatterAnd##Opname,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt startx,const PetscInt *idx,const void *xdata,PetscInt starty,const PetscInt *idy,void *ydata) \ 231*cd620004SJunchao Zhang { \ 232*cd620004SJunchao Zhang const Type *x = (const Type*)xdata; \ 233*cd620004SJunchao Zhang Type *y = (Type*)ydata; \ 234*cd620004SJunchao Zhang PetscInt i,j,k,bs = link->bs; \ 235*cd620004SJunchao Zhang const PetscInt M = (EQ) ? 1 : bs/BS; \ 236*cd620004SJunchao Zhang const PetscInt MBS = M*BS; \ 237*cd620004SJunchao Zhang PetscFunctionBegin; \ 238*cd620004SJunchao Zhang if (!idx) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Petsc should use UnpackAndOp instead of ScatterAndOp");\ 239*cd620004SJunchao Zhang if (!idy) { \ 240*cd620004SJunchao Zhang y += starty*bs; \ 241*cd620004SJunchao Zhang for (i=0; i<count; i++) \ 242*cd620004SJunchao Zhang for (j=0; j<M; j++) \ 243*cd620004SJunchao Zhang for (k=0; k<BS; k++) \ 244*cd620004SJunchao Zhang OpApply(Op,y[i*MBS+j*BS+k],x[idx[i]*MBS+j*BS+k]); \ 245*cd620004SJunchao Zhang } else { \ 246*cd620004SJunchao Zhang for (i=0; i<count; i++) \ 247*cd620004SJunchao Zhang for (j=0; j<M; j++) \ 248*cd620004SJunchao Zhang for (k=0; k<BS; k++) \ 249*cd620004SJunchao Zhang OpApply(Op,y[idy[i]*MBS+j*BS+k],x[idx[i]*MBS+j*BS+k]); \ 250*cd620004SJunchao Zhang } \ 251*cd620004SJunchao Zhang PetscFunctionReturn(0); \ 252*cd620004SJunchao Zhang } 253*cd620004SJunchao Zhang 254*cd620004SJunchao Zhang #define DEF_FetchAndOpLocal(Type,BS,EQ,Opname,Op,OpApply) \ 255*cd620004SJunchao Zhang static PetscErrorCode CPPJoin4(FetchAnd##Opname##Local,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt rootstart,const PetscInt *rootindices,void *rootdata,PetscInt leafstart,const PetscInt *leafindices,const void *leafdata,void *leafupdate) \ 256*cd620004SJunchao Zhang { \ 257*cd620004SJunchao Zhang Type *x = (Type*)rootdata,*y2 = (Type*)leafupdate; \ 258*cd620004SJunchao Zhang const Type *y = (const Type*)leafdata; \ 259*cd620004SJunchao Zhang PetscInt i,j,k,bs = link->bs; \ 260*cd620004SJunchao Zhang const PetscInt M = (EQ) ? 1 : bs/BS; \ 261*cd620004SJunchao Zhang const PetscInt MBS = M*BS; \ 262*cd620004SJunchao Zhang PetscFunctionBegin; \ 263*cd620004SJunchao Zhang if (!rootindices) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Petsc should use FetchAndOp instead of FetchAndOpLocal");\ 264*cd620004SJunchao Zhang if (!leafindices) { \ 265*cd620004SJunchao Zhang y += leafstart*bs; y2 += leafstart*bs; \ 266*cd620004SJunchao Zhang for (i=0; i<count; i++) \ 267*cd620004SJunchao Zhang for (j=0; j<M; j++) \ 268*cd620004SJunchao Zhang for (k=0; k<BS; k++) { \ 269*cd620004SJunchao Zhang y2[i*MBS+j*BS+k] = x[rootindices[i]*MBS+j*BS+k]; \ 270*cd620004SJunchao Zhang OpApply(Op,x[rootindices[i]*MBS+j*BS+k],y[i*MBS+j*BS+k]); \ 27140e23c03SJunchao Zhang } \ 272b23bfdefSJunchao Zhang } else { \ 273*cd620004SJunchao Zhang for (i=0; i<count; i++) \ 274b23bfdefSJunchao Zhang for (j=0; j<M; j++) \ 275b23bfdefSJunchao Zhang for (k=0; k<BS; k++) { \ 276*cd620004SJunchao Zhang y2[leafindices[i]*MBS+j*BS+k] = x[rootindices[i]*MBS+j*BS+k]; \ 277*cd620004SJunchao Zhang OpApply(Op,x[rootindices[i]*MBS+j*BS+k],y[leafindices[i]*MBS+j*BS+k]); \ 27840e23c03SJunchao Zhang } \ 27940e23c03SJunchao Zhang } \ 28040e23c03SJunchao Zhang PetscFunctionReturn(0); \ 28140e23c03SJunchao Zhang } 28240e23c03SJunchao Zhang 28340e23c03SJunchao Zhang 284b23bfdefSJunchao Zhang /* Pack, Unpack/Fetch ops */ 285b23bfdefSJunchao Zhang #define DEF_Pack(Type,BS,EQ) \ 286b23bfdefSJunchao Zhang DEF_PackFunc(Type,BS,EQ) \ 287*cd620004SJunchao Zhang DEF_UnpackFunc(Type,BS,EQ) \ 288*cd620004SJunchao Zhang DEF_ScatterAndOp(Type,BS,EQ,Insert,=,OP_ASSIGN) \ 289*cd620004SJunchao Zhang static void CPPJoin4(PackInit_Pack,Type,BS,EQ)(PetscSFLink link) { \ 290eb02082bSJunchao Zhang link->h_Pack = CPPJoin4(Pack, Type,BS,EQ); \ 291eb02082bSJunchao Zhang link->h_UnpackAndInsert = CPPJoin4(UnpackAndInsert,Type,BS,EQ); \ 292*cd620004SJunchao Zhang link->h_ScatterAndInsert= CPPJoin4(ScatterAndInsert,Type,BS,EQ); \ 29340e23c03SJunchao Zhang } 29440e23c03SJunchao Zhang 295b23bfdefSJunchao Zhang /* Add, Mult ops */ 296b23bfdefSJunchao Zhang #define DEF_Add(Type,BS,EQ) \ 297*cd620004SJunchao Zhang DEF_UnpackAndOp (Type,BS,EQ,Add, +,OP_BINARY) \ 298*cd620004SJunchao Zhang DEF_UnpackAndOp (Type,BS,EQ,Mult,*,OP_BINARY) \ 299*cd620004SJunchao Zhang DEF_FetchAndOp (Type,BS,EQ,Add, +,OP_BINARY) \ 300*cd620004SJunchao Zhang DEF_ScatterAndOp (Type,BS,EQ,Add, +,OP_BINARY) \ 301*cd620004SJunchao Zhang DEF_ScatterAndOp (Type,BS,EQ,Mult,*,OP_BINARY) \ 302*cd620004SJunchao Zhang DEF_FetchAndOpLocal(Type,BS,EQ,Add, +,OP_BINARY) \ 303*cd620004SJunchao Zhang static void CPPJoin4(PackInit_Add,Type,BS,EQ)(PetscSFLink link) { \ 304eb02082bSJunchao Zhang link->h_UnpackAndAdd = CPPJoin4(UnpackAndAdd, Type,BS,EQ); \ 305eb02082bSJunchao Zhang link->h_UnpackAndMult = CPPJoin4(UnpackAndMult, Type,BS,EQ); \ 306eb02082bSJunchao Zhang link->h_FetchAndAdd = CPPJoin4(FetchAndAdd, Type,BS,EQ); \ 307*cd620004SJunchao Zhang link->h_ScatterAndAdd = CPPJoin4(ScatterAndAdd, Type,BS,EQ); \ 308*cd620004SJunchao Zhang link->h_ScatterAndMult = CPPJoin4(ScatterAndMult, Type,BS,EQ); \ 309*cd620004SJunchao Zhang link->h_FetchAndAddLocal = CPPJoin4(FetchAndAddLocal,Type,BS,EQ); \ 31040e23c03SJunchao Zhang } 31140e23c03SJunchao Zhang 312b23bfdefSJunchao Zhang /* Max, Min ops */ 313b23bfdefSJunchao Zhang #define DEF_Cmp(Type,BS,EQ) \ 314*cd620004SJunchao Zhang DEF_UnpackAndOp (Type,BS,EQ,Max,PetscMax,OP_FUNCTION) \ 315*cd620004SJunchao Zhang DEF_UnpackAndOp (Type,BS,EQ,Min,PetscMin,OP_FUNCTION) \ 316*cd620004SJunchao Zhang DEF_ScatterAndOp(Type,BS,EQ,Max,PetscMax,OP_FUNCTION) \ 317*cd620004SJunchao Zhang DEF_ScatterAndOp(Type,BS,EQ,Min,PetscMin,OP_FUNCTION) \ 318*cd620004SJunchao Zhang static void CPPJoin4(PackInit_Compare,Type,BS,EQ)(PetscSFLink link) { \ 319eb02082bSJunchao Zhang link->h_UnpackAndMax = CPPJoin4(UnpackAndMax, Type,BS,EQ); \ 320eb02082bSJunchao Zhang link->h_UnpackAndMin = CPPJoin4(UnpackAndMin, Type,BS,EQ); \ 321*cd620004SJunchao Zhang link->h_ScatterAndMax = CPPJoin4(ScatterAndMax, Type,BS,EQ); \ 322*cd620004SJunchao Zhang link->h_ScatterAndMin = CPPJoin4(ScatterAndMin, Type,BS,EQ); \ 323b23bfdefSJunchao Zhang } 324b23bfdefSJunchao Zhang 325b23bfdefSJunchao Zhang /* Logical ops. 326*cd620004SJunchao Zhang The operator in OP_LXOR should be empty but is ||. It is not used. Put here to avoid 32740e23c03SJunchao Zhang the compilation warning "empty macro arguments are undefined in ISO C90" 32840e23c03SJunchao Zhang */ 329b23bfdefSJunchao Zhang #define DEF_Log(Type,BS,EQ) \ 330*cd620004SJunchao Zhang DEF_UnpackAndOp (Type,BS,EQ,LAND,&&,OP_BINARY) \ 331*cd620004SJunchao Zhang DEF_UnpackAndOp (Type,BS,EQ,LOR, ||,OP_BINARY) \ 332*cd620004SJunchao Zhang DEF_UnpackAndOp (Type,BS,EQ,LXOR,||, OP_LXOR) \ 333*cd620004SJunchao Zhang DEF_ScatterAndOp(Type,BS,EQ,LAND,&&,OP_BINARY) \ 334*cd620004SJunchao Zhang DEF_ScatterAndOp(Type,BS,EQ,LOR, ||,OP_BINARY) \ 335*cd620004SJunchao Zhang DEF_ScatterAndOp(Type,BS,EQ,LXOR,||, OP_LXOR) \ 336*cd620004SJunchao Zhang static void CPPJoin4(PackInit_Logical,Type,BS,EQ)(PetscSFLink link) { \ 337eb02082bSJunchao Zhang link->h_UnpackAndLAND = CPPJoin4(UnpackAndLAND, Type,BS,EQ); \ 338eb02082bSJunchao Zhang link->h_UnpackAndLOR = CPPJoin4(UnpackAndLOR, Type,BS,EQ); \ 339eb02082bSJunchao Zhang link->h_UnpackAndLXOR = CPPJoin4(UnpackAndLXOR, Type,BS,EQ); \ 340*cd620004SJunchao Zhang link->h_ScatterAndLAND = CPPJoin4(ScatterAndLAND,Type,BS,EQ); \ 341*cd620004SJunchao Zhang link->h_ScatterAndLOR = CPPJoin4(ScatterAndLOR, Type,BS,EQ); \ 342*cd620004SJunchao Zhang link->h_ScatterAndLXOR = CPPJoin4(ScatterAndLXOR,Type,BS,EQ); \ 34340e23c03SJunchao Zhang } 34440e23c03SJunchao Zhang 345b23bfdefSJunchao Zhang /* Bitwise ops */ 346b23bfdefSJunchao Zhang #define DEF_Bit(Type,BS,EQ) \ 347*cd620004SJunchao Zhang DEF_UnpackAndOp (Type,BS,EQ,BAND,&,OP_BINARY) \ 348*cd620004SJunchao Zhang DEF_UnpackAndOp (Type,BS,EQ,BOR, |,OP_BINARY) \ 349*cd620004SJunchao Zhang DEF_UnpackAndOp (Type,BS,EQ,BXOR,^,OP_BINARY) \ 350*cd620004SJunchao Zhang DEF_ScatterAndOp(Type,BS,EQ,BAND,&,OP_BINARY) \ 351*cd620004SJunchao Zhang DEF_ScatterAndOp(Type,BS,EQ,BOR, |,OP_BINARY) \ 352*cd620004SJunchao Zhang DEF_ScatterAndOp(Type,BS,EQ,BXOR,^,OP_BINARY) \ 353*cd620004SJunchao Zhang static void CPPJoin4(PackInit_Bitwise,Type,BS,EQ)(PetscSFLink link) { \ 354eb02082bSJunchao Zhang link->h_UnpackAndBAND = CPPJoin4(UnpackAndBAND, Type,BS,EQ); \ 355eb02082bSJunchao Zhang link->h_UnpackAndBOR = CPPJoin4(UnpackAndBOR, Type,BS,EQ); \ 356eb02082bSJunchao Zhang link->h_UnpackAndBXOR = CPPJoin4(UnpackAndBXOR, Type,BS,EQ); \ 357*cd620004SJunchao Zhang link->h_ScatterAndBAND = CPPJoin4(ScatterAndBAND,Type,BS,EQ); \ 358*cd620004SJunchao Zhang link->h_ScatterAndBOR = CPPJoin4(ScatterAndBOR, Type,BS,EQ); \ 359*cd620004SJunchao Zhang link->h_ScatterAndBXOR = CPPJoin4(ScatterAndBXOR,Type,BS,EQ); \ 36040e23c03SJunchao Zhang } 36140e23c03SJunchao Zhang 362*cd620004SJunchao Zhang /* Maxloc, Minloc ops */ 363*cd620004SJunchao Zhang #define DEF_Xloc(Type,BS,EQ) \ 364*cd620004SJunchao Zhang DEF_UnpackAndOp (Type,BS,EQ,Max,>,OP_XLOC) \ 365*cd620004SJunchao Zhang DEF_UnpackAndOp (Type,BS,EQ,Min,<,OP_XLOC) \ 366*cd620004SJunchao Zhang DEF_ScatterAndOp(Type,BS,EQ,Max,>,OP_XLOC) \ 367*cd620004SJunchao Zhang DEF_ScatterAndOp(Type,BS,EQ,Min,<,OP_XLOC) \ 368*cd620004SJunchao Zhang static void CPPJoin4(PackInit_Xloc,Type,BS,EQ)(PetscSFLink link) { \ 369*cd620004SJunchao Zhang link->h_UnpackAndMaxloc = CPPJoin4(UnpackAndMax, Type,BS,EQ); \ 370*cd620004SJunchao Zhang link->h_UnpackAndMinloc = CPPJoin4(UnpackAndMin, Type,BS,EQ); \ 371*cd620004SJunchao Zhang link->h_ScatterAndMaxloc = CPPJoin4(ScatterAndMax,Type,BS,EQ); \ 372*cd620004SJunchao Zhang link->h_ScatterAndMinloc = CPPJoin4(ScatterAndMin,Type,BS,EQ); \ 37340e23c03SJunchao Zhang } 37440e23c03SJunchao Zhang 375b23bfdefSJunchao Zhang #define DEF_IntegerType(Type,BS,EQ) \ 376b23bfdefSJunchao Zhang DEF_Pack(Type,BS,EQ) \ 377b23bfdefSJunchao Zhang DEF_Add(Type,BS,EQ) \ 378b23bfdefSJunchao Zhang DEF_Cmp(Type,BS,EQ) \ 379b23bfdefSJunchao Zhang DEF_Log(Type,BS,EQ) \ 380b23bfdefSJunchao Zhang DEF_Bit(Type,BS,EQ) \ 381*cd620004SJunchao Zhang static void CPPJoin4(PackInit_IntegerType,Type,BS,EQ)(PetscSFLink link) { \ 382b23bfdefSJunchao Zhang CPPJoin4(PackInit_Pack,Type,BS,EQ)(link); \ 383b23bfdefSJunchao Zhang CPPJoin4(PackInit_Add,Type,BS,EQ)(link); \ 384b23bfdefSJunchao Zhang CPPJoin4(PackInit_Compare,Type,BS,EQ)(link); \ 385b23bfdefSJunchao Zhang CPPJoin4(PackInit_Logical,Type,BS,EQ)(link); \ 386b23bfdefSJunchao Zhang CPPJoin4(PackInit_Bitwise,Type,BS,EQ)(link); \ 38740e23c03SJunchao Zhang } 38840e23c03SJunchao Zhang 389b23bfdefSJunchao Zhang #define DEF_RealType(Type,BS,EQ) \ 390b23bfdefSJunchao Zhang DEF_Pack(Type,BS,EQ) \ 391b23bfdefSJunchao Zhang DEF_Add(Type,BS,EQ) \ 392b23bfdefSJunchao Zhang DEF_Cmp(Type,BS,EQ) \ 393*cd620004SJunchao Zhang static void CPPJoin4(PackInit_RealType,Type,BS,EQ)(PetscSFLink link) { \ 394b23bfdefSJunchao Zhang CPPJoin4(PackInit_Pack,Type,BS,EQ)(link); \ 395b23bfdefSJunchao Zhang CPPJoin4(PackInit_Add,Type,BS,EQ)(link); \ 396b23bfdefSJunchao Zhang CPPJoin4(PackInit_Compare,Type,BS,EQ)(link); \ 397b23bfdefSJunchao Zhang } 39840e23c03SJunchao Zhang 39940e23c03SJunchao Zhang #if defined(PETSC_HAVE_COMPLEX) 400b23bfdefSJunchao Zhang #define DEF_ComplexType(Type,BS,EQ) \ 401b23bfdefSJunchao Zhang DEF_Pack(Type,BS,EQ) \ 402b23bfdefSJunchao Zhang DEF_Add(Type,BS,EQ) \ 403*cd620004SJunchao Zhang static void CPPJoin4(PackInit_ComplexType,Type,BS,EQ)(PetscSFLink link) { \ 404b23bfdefSJunchao Zhang CPPJoin4(PackInit_Pack,Type,BS,EQ)(link); \ 405b23bfdefSJunchao Zhang CPPJoin4(PackInit_Add,Type,BS,EQ)(link); \ 406b23bfdefSJunchao Zhang } 40740e23c03SJunchao Zhang #endif 408b23bfdefSJunchao Zhang 409b23bfdefSJunchao Zhang #define DEF_DumbType(Type,BS,EQ) \ 410b23bfdefSJunchao Zhang DEF_Pack(Type,BS,EQ) \ 411*cd620004SJunchao Zhang static void CPPJoin4(PackInit_DumbType,Type,BS,EQ)(PetscSFLink link) { \ 412b23bfdefSJunchao Zhang CPPJoin4(PackInit_Pack,Type,BS,EQ)(link); \ 413b23bfdefSJunchao Zhang } 414b23bfdefSJunchao Zhang 415b23bfdefSJunchao Zhang /* Maxloc, Minloc */ 416*cd620004SJunchao Zhang #define DEF_PairType(Type,BS,EQ) \ 417*cd620004SJunchao Zhang DEF_Pack(Type,BS,EQ) \ 418*cd620004SJunchao Zhang DEF_Xloc(Type,BS,EQ) \ 419*cd620004SJunchao Zhang static void CPPJoin4(PackInit_PairType,Type,BS,EQ)(PetscSFLink link) { \ 420*cd620004SJunchao Zhang CPPJoin4(PackInit_Pack,Type,BS,EQ)(link); \ 421*cd620004SJunchao Zhang CPPJoin4(PackInit_Xloc,Type,BS,EQ)(link); \ 422b23bfdefSJunchao Zhang } 423b23bfdefSJunchao Zhang 424b23bfdefSJunchao Zhang DEF_IntegerType(PetscInt,1,1) /* unit = 1 MPIU_INT */ 425b23bfdefSJunchao Zhang DEF_IntegerType(PetscInt,2,1) /* unit = 2 MPIU_INTs */ 426b23bfdefSJunchao Zhang DEF_IntegerType(PetscInt,4,1) /* unit = 4 MPIU_INTs */ 427b23bfdefSJunchao Zhang DEF_IntegerType(PetscInt,8,1) /* unit = 8 MPIU_INTs */ 428b23bfdefSJunchao Zhang DEF_IntegerType(PetscInt,1,0) /* unit = 1*n MPIU_INTs, n>1 */ 429b23bfdefSJunchao Zhang DEF_IntegerType(PetscInt,2,0) /* unit = 2*n MPIU_INTs, n>1 */ 430b23bfdefSJunchao Zhang DEF_IntegerType(PetscInt,4,0) /* unit = 4*n MPIU_INTs, n>1 */ 431b23bfdefSJunchao Zhang DEF_IntegerType(PetscInt,8,0) /* unit = 8*n MPIU_INTs, n>1. Routines with bigger BS are tried first. */ 432b23bfdefSJunchao Zhang 433b23bfdefSJunchao Zhang #if defined(PETSC_USE_64BIT_INDICES) /* Do not need (though it is OK) to generate redundant functions if PetscInt is int */ 434b23bfdefSJunchao Zhang DEF_IntegerType(int,1,1) 435b23bfdefSJunchao Zhang DEF_IntegerType(int,2,1) 436b23bfdefSJunchao Zhang DEF_IntegerType(int,4,1) 437b23bfdefSJunchao Zhang DEF_IntegerType(int,8,1) 438b23bfdefSJunchao Zhang DEF_IntegerType(int,1,0) 439b23bfdefSJunchao Zhang DEF_IntegerType(int,2,0) 440b23bfdefSJunchao Zhang DEF_IntegerType(int,4,0) 441b23bfdefSJunchao Zhang DEF_IntegerType(int,8,0) 442b23bfdefSJunchao Zhang #endif 443b23bfdefSJunchao Zhang 444b23bfdefSJunchao Zhang /* The typedefs are used to get a typename without space that CPPJoin can handle */ 445b23bfdefSJunchao Zhang typedef signed char SignedChar; 446b23bfdefSJunchao Zhang DEF_IntegerType(SignedChar,1,1) 447b23bfdefSJunchao Zhang DEF_IntegerType(SignedChar,2,1) 448b23bfdefSJunchao Zhang DEF_IntegerType(SignedChar,4,1) 449b23bfdefSJunchao Zhang DEF_IntegerType(SignedChar,8,1) 450b23bfdefSJunchao Zhang DEF_IntegerType(SignedChar,1,0) 451b23bfdefSJunchao Zhang DEF_IntegerType(SignedChar,2,0) 452b23bfdefSJunchao Zhang DEF_IntegerType(SignedChar,4,0) 453b23bfdefSJunchao Zhang DEF_IntegerType(SignedChar,8,0) 454b23bfdefSJunchao Zhang 455b23bfdefSJunchao Zhang typedef unsigned char UnsignedChar; 456b23bfdefSJunchao Zhang DEF_IntegerType(UnsignedChar,1,1) 457b23bfdefSJunchao Zhang DEF_IntegerType(UnsignedChar,2,1) 458b23bfdefSJunchao Zhang DEF_IntegerType(UnsignedChar,4,1) 459b23bfdefSJunchao Zhang DEF_IntegerType(UnsignedChar,8,1) 460b23bfdefSJunchao Zhang DEF_IntegerType(UnsignedChar,1,0) 461b23bfdefSJunchao Zhang DEF_IntegerType(UnsignedChar,2,0) 462b23bfdefSJunchao Zhang DEF_IntegerType(UnsignedChar,4,0) 463b23bfdefSJunchao Zhang DEF_IntegerType(UnsignedChar,8,0) 464b23bfdefSJunchao Zhang 465b23bfdefSJunchao Zhang DEF_RealType(PetscReal,1,1) 466b23bfdefSJunchao Zhang DEF_RealType(PetscReal,2,1) 467b23bfdefSJunchao Zhang DEF_RealType(PetscReal,4,1) 468b23bfdefSJunchao Zhang DEF_RealType(PetscReal,8,1) 469b23bfdefSJunchao Zhang DEF_RealType(PetscReal,1,0) 470b23bfdefSJunchao Zhang DEF_RealType(PetscReal,2,0) 471b23bfdefSJunchao Zhang DEF_RealType(PetscReal,4,0) 472b23bfdefSJunchao Zhang DEF_RealType(PetscReal,8,0) 473b23bfdefSJunchao Zhang 474b23bfdefSJunchao Zhang #if defined(PETSC_HAVE_COMPLEX) 475b23bfdefSJunchao Zhang DEF_ComplexType(PetscComplex,1,1) 476b23bfdefSJunchao Zhang DEF_ComplexType(PetscComplex,2,1) 477b23bfdefSJunchao Zhang DEF_ComplexType(PetscComplex,4,1) 478b23bfdefSJunchao Zhang DEF_ComplexType(PetscComplex,8,1) 479b23bfdefSJunchao Zhang DEF_ComplexType(PetscComplex,1,0) 480b23bfdefSJunchao Zhang DEF_ComplexType(PetscComplex,2,0) 481b23bfdefSJunchao Zhang DEF_ComplexType(PetscComplex,4,0) 482b23bfdefSJunchao Zhang DEF_ComplexType(PetscComplex,8,0) 483b23bfdefSJunchao Zhang #endif 484b23bfdefSJunchao Zhang 485*cd620004SJunchao Zhang #define PairType(Type1,Type2) Type1##_##Type2 486*cd620004SJunchao Zhang typedef struct {int u; int i;} PairType(int,int); 487*cd620004SJunchao Zhang typedef struct {PetscInt u; PetscInt i;} PairType(PetscInt,PetscInt); 488*cd620004SJunchao Zhang DEF_PairType(PairType(int,int),1,1) 489*cd620004SJunchao Zhang DEF_PairType(PairType(PetscInt,PetscInt),1,1) 490b23bfdefSJunchao Zhang 491b23bfdefSJunchao Zhang /* If we don't know the basic type, we treat it as a stream of chars or ints */ 492b23bfdefSJunchao Zhang DEF_DumbType(char,1,1) 493b23bfdefSJunchao Zhang DEF_DumbType(char,2,1) 494b23bfdefSJunchao Zhang DEF_DumbType(char,4,1) 495b23bfdefSJunchao Zhang DEF_DumbType(char,1,0) 496b23bfdefSJunchao Zhang DEF_DumbType(char,2,0) 497b23bfdefSJunchao Zhang DEF_DumbType(char,4,0) 498b23bfdefSJunchao Zhang 499eb02082bSJunchao Zhang typedef int DumbInt; /* To have a different name than 'int' used above. The name is used to make routine names. */ 500b23bfdefSJunchao Zhang DEF_DumbType(DumbInt,1,1) 501b23bfdefSJunchao Zhang DEF_DumbType(DumbInt,2,1) 502b23bfdefSJunchao Zhang DEF_DumbType(DumbInt,4,1) 503b23bfdefSJunchao Zhang DEF_DumbType(DumbInt,8,1) 504b23bfdefSJunchao Zhang DEF_DumbType(DumbInt,1,0) 505b23bfdefSJunchao Zhang DEF_DumbType(DumbInt,2,0) 506b23bfdefSJunchao Zhang DEF_DumbType(DumbInt,4,0) 507b23bfdefSJunchao Zhang DEF_DumbType(DumbInt,8,0) 50840e23c03SJunchao Zhang 50940e23c03SJunchao Zhang #if !defined(PETSC_HAVE_MPI_TYPE_DUP) 51040e23c03SJunchao Zhang PETSC_STATIC_INLINE int MPI_Type_dup(MPI_Datatype datatype,MPI_Datatype *newtype) 51140e23c03SJunchao Zhang { 51240e23c03SJunchao Zhang int ierr; 51340e23c03SJunchao Zhang ierr = MPI_Type_contiguous(1,datatype,newtype); if (ierr) return ierr; 51440e23c03SJunchao Zhang ierr = MPI_Type_commit(newtype); if (ierr) return ierr; 51540e23c03SJunchao Zhang return MPI_SUCCESS; 51640e23c03SJunchao Zhang } 51740e23c03SJunchao Zhang #endif 51840e23c03SJunchao Zhang 519*cd620004SJunchao Zhang /* 520*cd620004SJunchao Zhang The routine Creates a communication link for the given operation. It first looks up its link cache. If 521*cd620004SJunchao Zhang there is a free & suitable one, it uses it. Otherwise it creates a new one. 522*cd620004SJunchao Zhang 523*cd620004SJunchao Zhang A link contains buffers and MPI requests for send/recv. It also contains pack/unpack routines to pack/unpack 524*cd620004SJunchao Zhang root/leafdata to/from these buffers. Buffers are allocated at our discretion. When we find root/leafata 525*cd620004SJunchao Zhang can be directly passed to MPI, we won't allocate them. Even we allocate buffers, we only allocate 526*cd620004SJunchao Zhang those that are needed by the given `sfop` and `op`, in other words, we do lazy memory-allocation. 527*cd620004SJunchao Zhang 528*cd620004SJunchao Zhang The routine also allocates buffers on CPU when one does not use gpu-aware MPI but data is on GPU. 529*cd620004SJunchao Zhang 530*cd620004SJunchao Zhang In SFBasic, MPI requests are persistent. They are init'ed until we try to get requests from a link. 531*cd620004SJunchao Zhang 532*cd620004SJunchao Zhang The routine is shared by SFBasic and SFNeighbor based on the fact they all deal with sparse graphs and 533*cd620004SJunchao Zhang need pack/unpack data. 534*cd620004SJunchao Zhang */ 535*cd620004SJunchao Zhang PetscErrorCode PetscSFLinkCreate(PetscSF sf,MPI_Datatype unit,PetscMemType rootmtype,const void *rootdata,PetscMemType leafmtype,const void *leafdata,MPI_Op op,PetscSFOperation sfop,PetscSFLink *mylink) 53640e23c03SJunchao Zhang { 53740e23c03SJunchao Zhang PetscErrorCode ierr; 538*cd620004SJunchao Zhang PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 539*cd620004SJunchao Zhang PetscInt i,j,k,nrootreqs,nleafreqs,nreqs; 540*cd620004SJunchao Zhang PetscSFLink *p,link; 541*cd620004SJunchao Zhang PetscSFDirection direction; 542*cd620004SJunchao Zhang MPI_Request *reqs = NULL; 543*cd620004SJunchao Zhang PetscBool match,rootdirect[2],leafdirect[2]; 544*cd620004SJunchao Zhang PetscMemType rootmtype_mpi,leafmtype_mpi; /* mtypes seen by MPI */ 545*cd620004SJunchao Zhang PetscInt rootdirect_mpi,leafdirect_mpi; /* root/leafdirect seen by MPI*/ 546*cd620004SJunchao Zhang 547*cd620004SJunchao Zhang PetscFunctionBegin; 548*cd620004SJunchao Zhang ierr = PetscSFSetErrorOnUnsupportedOverlap(sf,unit,rootdata,leafdata);CHKERRQ(ierr); 549*cd620004SJunchao Zhang 550*cd620004SJunchao Zhang /* Can we directly use root/leafdirect with the given sf, sfop and op? */ 551*cd620004SJunchao Zhang for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) { 552*cd620004SJunchao Zhang if (sfop == PETSCSF_BCAST) { 553*cd620004SJunchao Zhang rootdirect[i] = bas->rootcontig[i]; /* Pack roots */ 554*cd620004SJunchao Zhang leafdirect[i] = (sf->leafcontig[i] && op == MPIU_REPLACE) ? PETSC_TRUE : PETSC_FALSE; /* Unpack leaves */ 555*cd620004SJunchao Zhang } else if (sfop == PETSCSF_REDUCE) { 556*cd620004SJunchao Zhang leafdirect[i] = sf->leafcontig[i]; /* Pack leaves */ 557*cd620004SJunchao Zhang rootdirect[i] = (bas->rootcontig[i] && op == MPIU_REPLACE) ? PETSC_TRUE : PETSC_FALSE; /* Unpack roots */ 558*cd620004SJunchao Zhang } else { /* PETSCSF_FETCH */ 559*cd620004SJunchao Zhang rootdirect[i] = PETSC_FALSE; /* FETCH always need a separate rootbuf */ 560*cd620004SJunchao Zhang leafdirect[i] = PETSC_FALSE; /* We also force allocating a separate leafbuf so that leafdata and leafupdate can share mpi requests */ 561*cd620004SJunchao Zhang } 562*cd620004SJunchao Zhang } 563*cd620004SJunchao Zhang 564*cd620004SJunchao Zhang if (use_gpu_aware_mpi) { 565*cd620004SJunchao Zhang rootmtype_mpi = rootmtype; 566*cd620004SJunchao Zhang leafmtype_mpi = leafmtype; 567*cd620004SJunchao Zhang } else { 568*cd620004SJunchao Zhang rootmtype_mpi = leafmtype_mpi = PETSC_MEMTYPE_HOST; 569*cd620004SJunchao Zhang } 570*cd620004SJunchao Zhang /* Will root/leafdata be directly accessed by MPI? Without use_gpu_aware_mpi, device data is bufferred on host and then passed to MPI */ 571*cd620004SJunchao Zhang rootdirect_mpi = rootdirect[PETSCSF_REMOTE] && (rootmtype_mpi == rootmtype)? 1 : 0; 572*cd620004SJunchao Zhang leafdirect_mpi = leafdirect[PETSCSF_REMOTE] && (leafmtype_mpi == leafmtype)? 1 : 0; 573*cd620004SJunchao Zhang 574*cd620004SJunchao Zhang direction = (sfop == PETSCSF_BCAST)? PETSCSF_ROOT2LEAF : PETSCSF_LEAF2ROOT; 575*cd620004SJunchao Zhang nrootreqs = bas->nrootreqs; 576*cd620004SJunchao Zhang nleafreqs = sf->nleafreqs; 577*cd620004SJunchao Zhang 578*cd620004SJunchao Zhang /* Look for free links in cache */ 579*cd620004SJunchao Zhang for (p=&bas->avail; (link=*p); p=&link->next) { 580*cd620004SJunchao Zhang ierr = MPIPetsc_Type_compare(unit,link->unit,&match);CHKERRQ(ierr); 581*cd620004SJunchao Zhang if (match) { 582*cd620004SJunchao Zhang /* If root/leafdata will be directly passed to MPI, test if the data used to initialized the MPI requests matches with current. 583*cd620004SJunchao Zhang If not, free old requests. New requests will be lazily init'ed until one calls PetscSFLinkGetMPIBuffersAndRequests(). 584*cd620004SJunchao Zhang */ 585*cd620004SJunchao Zhang if (rootdirect_mpi && sf->persistent && link->rootreqsinited[direction][rootmtype][1] && link->rootdatadirect[direction][rootmtype] != rootdata) { 586*cd620004SJunchao Zhang reqs = link->rootreqs[direction][rootmtype][1]; /* Here, rootmtype = rootmtype_mpi */ 587*cd620004SJunchao Zhang for (i=0; i<nrootreqs; i++) {if (reqs[i] != MPI_REQUEST_NULL) {ierr = MPI_Request_free(&reqs[i]);CHKERRQ(ierr);}} 588*cd620004SJunchao Zhang link->rootreqsinited[direction][rootmtype][1] = PETSC_FALSE; 589*cd620004SJunchao Zhang } 590*cd620004SJunchao Zhang if (leafdirect_mpi && sf->persistent && link->leafreqsinited[direction][leafmtype][1] && link->leafdatadirect[direction][leafmtype] != leafdata) { 591*cd620004SJunchao Zhang reqs = link->leafreqs[direction][leafmtype][1]; 592*cd620004SJunchao Zhang for (i=0; i<nleafreqs; i++) {if (reqs[i] != MPI_REQUEST_NULL) {ierr = MPI_Request_free(&reqs[i]);CHKERRQ(ierr);}} 593*cd620004SJunchao Zhang link->leafreqsinited[direction][leafmtype][1] = PETSC_FALSE; 594*cd620004SJunchao Zhang } 595*cd620004SJunchao Zhang *p = link->next; /* Remove from available list */ 596*cd620004SJunchao Zhang goto found; 597*cd620004SJunchao Zhang } 598*cd620004SJunchao Zhang } 599*cd620004SJunchao Zhang 600*cd620004SJunchao Zhang ierr = PetscNew(&link);CHKERRQ(ierr); 601*cd620004SJunchao Zhang ierr = PetscSFLinkSetUp_Host(sf,link,unit);CHKERRQ(ierr); 602*cd620004SJunchao Zhang ierr = PetscCommGetNewTag(PetscObjectComm((PetscObject)sf),&link->tag);CHKERRQ(ierr); /* One tag per link */ 603*cd620004SJunchao Zhang 604*cd620004SJunchao Zhang nreqs = (nrootreqs+nleafreqs)*8; 605*cd620004SJunchao Zhang ierr = PetscMalloc1(nreqs,&link->reqs);CHKERRQ(ierr); 606*cd620004SJunchao Zhang for (i=0; i<nreqs; i++) link->reqs[i] = MPI_REQUEST_NULL; /* Initialized to NULL so that we know which need to be freed in Destroy */ 607*cd620004SJunchao Zhang 608*cd620004SJunchao Zhang for (i=0; i<2; i++) { /* Two communication directions */ 609*cd620004SJunchao Zhang for (j=0; j<2; j++) { /* Two memory types */ 610*cd620004SJunchao Zhang for (k=0; k<2; k++) { /* root/leafdirect 0 or 1 */ 611*cd620004SJunchao Zhang link->rootreqs[i][j][k] = link->reqs + nrootreqs*(4*i+2*j+k); 612*cd620004SJunchao Zhang link->leafreqs[i][j][k] = link->reqs + nrootreqs*8 + nleafreqs*(4*i+2*j+k); 613*cd620004SJunchao Zhang } 614*cd620004SJunchao Zhang } 615*cd620004SJunchao Zhang } 616*cd620004SJunchao Zhang 617*cd620004SJunchao Zhang found: 618*cd620004SJunchao Zhang if ((rootmtype == PETSC_MEMTYPE_DEVICE || leafmtype == PETSC_MEMTYPE_DEVICE) && !link->deviceinited) {ierr = PetscSFLinkSetUp_Device(sf,link,unit);CHKERRQ(ierr);} 619*cd620004SJunchao Zhang 620*cd620004SJunchao Zhang /* Allocate buffers along root/leafdata */ 621*cd620004SJunchao Zhang for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) { 622*cd620004SJunchao Zhang /* For local communication, buffers are only needed when roots and leaves have different mtypes */ 623*cd620004SJunchao Zhang if (i == PETSCSF_LOCAL && rootmtype == leafmtype) continue; 624*cd620004SJunchao Zhang if (bas->rootbuflen[i]) { 625*cd620004SJunchao Zhang if (rootdirect[i]) { /* Aha, we disguise rootdata as rootbuf */ 626*cd620004SJunchao Zhang link->rootbuf[i][rootmtype] = (char*)rootdata + bas->rootstart[i]*link->unitbytes; 627*cd620004SJunchao Zhang } else { /* Have to have a separate rootbuf */ 628*cd620004SJunchao Zhang if (!link->rootbuf_alloc[i][rootmtype]) { 629*cd620004SJunchao Zhang ierr = PetscMallocWithMemType(rootmtype,bas->rootbuflen[i]*link->unitbytes,(void**)&link->rootbuf_alloc[i][rootmtype]);CHKERRQ(ierr); 630*cd620004SJunchao Zhang } 631*cd620004SJunchao Zhang link->rootbuf[i][rootmtype] = link->rootbuf_alloc[i][rootmtype]; 632*cd620004SJunchao Zhang } 633*cd620004SJunchao Zhang } 634*cd620004SJunchao Zhang 635*cd620004SJunchao Zhang if (sf->leafbuflen[i]) { 636*cd620004SJunchao Zhang if (leafdirect[i]) { 637*cd620004SJunchao Zhang link->leafbuf[i][leafmtype] = (char*)leafdata + sf->leafstart[i]*link->unitbytes; 638*cd620004SJunchao Zhang } else { 639*cd620004SJunchao Zhang if (!link->leafbuf_alloc[i][leafmtype]) { 640*cd620004SJunchao Zhang ierr = PetscMallocWithMemType(leafmtype,sf->leafbuflen[i]*link->unitbytes,(void**)&link->leafbuf_alloc[i][leafmtype]);CHKERRQ(ierr); 641*cd620004SJunchao Zhang } 642*cd620004SJunchao Zhang link->leafbuf[i][leafmtype] = link->leafbuf_alloc[i][leafmtype]; 643*cd620004SJunchao Zhang } 644*cd620004SJunchao Zhang } 645*cd620004SJunchao Zhang } 646*cd620004SJunchao Zhang 647*cd620004SJunchao Zhang /* Allocate buffers on host for buffering data on device in cast not use_gpu_aware_mpi */ 648*cd620004SJunchao Zhang if (rootmtype == PETSC_MEMTYPE_DEVICE && rootmtype_mpi == PETSC_MEMTYPE_HOST) { 649*cd620004SJunchao Zhang if(!link->rootbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]) { 650*cd620004SJunchao Zhang ierr = PetscMalloc(bas->rootbuflen[PETSCSF_REMOTE]*link->unitbytes,&link->rootbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]);CHKERRQ(ierr); 651*cd620004SJunchao Zhang } 652*cd620004SJunchao Zhang link->rootbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST] = link->rootbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]; 653*cd620004SJunchao Zhang } 654*cd620004SJunchao Zhang if (leafmtype == PETSC_MEMTYPE_DEVICE && leafmtype_mpi == PETSC_MEMTYPE_HOST) { 655*cd620004SJunchao Zhang if (!link->leafbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]) { 656*cd620004SJunchao Zhang ierr = PetscMalloc(sf->leafbuflen[PETSCSF_REMOTE]*link->unitbytes,&link->leafbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]);CHKERRQ(ierr); 657*cd620004SJunchao Zhang } 658*cd620004SJunchao Zhang link->leafbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST] = link->leafbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]; 659*cd620004SJunchao Zhang } 660*cd620004SJunchao Zhang 661*cd620004SJunchao Zhang /* Set `current` state of the link. They may change between different SF invocations with the same link */ 662*cd620004SJunchao Zhang if (sf->persistent) { /* If data is directly passed to MPI and inits MPI requests, record the data for comparison on future invocations */ 663*cd620004SJunchao Zhang if (rootdirect_mpi) link->rootdatadirect[direction][rootmtype] = rootdata; 664*cd620004SJunchao Zhang if (leafdirect_mpi) link->leafdatadirect[direction][leafmtype] = leafdata; 665*cd620004SJunchao Zhang } 666*cd620004SJunchao Zhang 667*cd620004SJunchao Zhang link->rootdata = rootdata; /* root/leafdata are keys to look up links in PetscSFXxxEnd */ 668*cd620004SJunchao Zhang link->leafdata = leafdata; 669*cd620004SJunchao Zhang for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) { 670*cd620004SJunchao Zhang link->rootdirect[i] = rootdirect[i]; 671*cd620004SJunchao Zhang link->leafdirect[i] = leafdirect[i]; 672*cd620004SJunchao Zhang } 673*cd620004SJunchao Zhang link->rootdirect_mpi = rootdirect_mpi; 674*cd620004SJunchao Zhang link->leafdirect_mpi = leafdirect_mpi; 675*cd620004SJunchao Zhang link->rootmtype = rootmtype; 676*cd620004SJunchao Zhang link->leafmtype = leafmtype; 677*cd620004SJunchao Zhang link->rootmtype_mpi = rootmtype_mpi; 678*cd620004SJunchao Zhang link->leafmtype_mpi = leafmtype_mpi; 679*cd620004SJunchao Zhang 680*cd620004SJunchao Zhang link->next = bas->inuse; 681*cd620004SJunchao Zhang bas->inuse = link; 682*cd620004SJunchao Zhang *mylink = link; 683*cd620004SJunchao Zhang PetscFunctionReturn(0); 684*cd620004SJunchao Zhang } 685*cd620004SJunchao Zhang 686*cd620004SJunchao Zhang /* Return root/leaf buffers and MPI requests attached to the link for MPI communication in the given direction. 687*cd620004SJunchao Zhang If the sf uses persistent requests and the requests have not been initialized, then initialize them. 688*cd620004SJunchao Zhang */ 689*cd620004SJunchao Zhang PetscErrorCode PetscSFLinkGetMPIBuffersAndRequests(PetscSF sf,PetscSFLink link,PetscSFDirection direction,void **rootbuf, void **leafbuf,MPI_Request **rootreqs,MPI_Request **leafreqs) 690*cd620004SJunchao Zhang { 691*cd620004SJunchao Zhang PetscErrorCode ierr; 692*cd620004SJunchao Zhang PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 693*cd620004SJunchao Zhang PetscInt i,j,nrootranks,ndrootranks,nleafranks,ndleafranks; 694*cd620004SJunchao Zhang const PetscInt *rootoffset,*leafoffset; 695*cd620004SJunchao Zhang PetscMPIInt n; 696*cd620004SJunchao Zhang MPI_Aint disp; 697*cd620004SJunchao Zhang MPI_Comm comm = PetscObjectComm((PetscObject)sf); 698*cd620004SJunchao Zhang MPI_Datatype unit = link->unit; 699*cd620004SJunchao Zhang const PetscMemType rootmtype_mpi = link->rootmtype_mpi,leafmtype_mpi = link->leafmtype_mpi; /* Used to select buffers passed to MPI */ 700*cd620004SJunchao Zhang const PetscInt rootdirect_mpi = link->rootdirect_mpi,leafdirect_mpi = link->leafdirect_mpi; 701*cd620004SJunchao Zhang 702*cd620004SJunchao Zhang PetscFunctionBegin; 703*cd620004SJunchao Zhang /* Init persistent MPI requests if not yet. Currently only SFBasic uses persistent MPI */ 704*cd620004SJunchao Zhang if (sf->persistent) { 705*cd620004SJunchao Zhang if (rootreqs && bas->rootbuflen[PETSCSF_REMOTE] && !link->rootreqsinited[direction][rootmtype_mpi][rootdirect_mpi]) { 706*cd620004SJunchao Zhang ierr = PetscSFGetRootInfo_Basic(sf,&nrootranks,&ndrootranks,NULL,&rootoffset,NULL);CHKERRQ(ierr); 707*cd620004SJunchao Zhang if (direction == PETSCSF_LEAF2ROOT) { 708*cd620004SJunchao Zhang for (i=ndrootranks,j=0; i<nrootranks; i++,j++) { 709*cd620004SJunchao Zhang disp = (rootoffset[i] - rootoffset[ndrootranks])*link->unitbytes; 710*cd620004SJunchao Zhang ierr = PetscMPIIntCast(rootoffset[i+1]-rootoffset[i],&n);CHKERRQ(ierr); 711*cd620004SJunchao Zhang ierr = MPI_Recv_init(link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi]+disp,n,unit,bas->iranks[i],link->tag,comm,link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi]+j);CHKERRQ(ierr); 712*cd620004SJunchao Zhang } 713*cd620004SJunchao Zhang } else { /* PETSCSF_ROOT2LEAF */ 714*cd620004SJunchao Zhang for (i=ndrootranks,j=0; i<nrootranks; i++,j++) { 715*cd620004SJunchao Zhang disp = (rootoffset[i] - rootoffset[ndrootranks])*link->unitbytes; 716*cd620004SJunchao Zhang ierr = PetscMPIIntCast(rootoffset[i+1]-rootoffset[i],&n);CHKERRQ(ierr); 717*cd620004SJunchao Zhang ierr = MPI_Send_init(link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi]+disp,n,unit,bas->iranks[i],link->tag,comm,link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi]+j);CHKERRQ(ierr); 718*cd620004SJunchao Zhang } 719*cd620004SJunchao Zhang } 720*cd620004SJunchao Zhang link->rootreqsinited[direction][rootmtype_mpi][rootdirect_mpi] = PETSC_TRUE; 721*cd620004SJunchao Zhang } 722*cd620004SJunchao Zhang 723*cd620004SJunchao Zhang if (leafreqs && sf->leafbuflen[PETSCSF_REMOTE] && !link->leafreqsinited[direction][leafmtype_mpi][leafdirect_mpi]) { 724*cd620004SJunchao Zhang ierr = PetscSFGetLeafInfo_Basic(sf,&nleafranks,&ndleafranks,NULL,&leafoffset,NULL,NULL);CHKERRQ(ierr); 725*cd620004SJunchao Zhang if (direction == PETSCSF_LEAF2ROOT) { 726*cd620004SJunchao Zhang for (i=ndleafranks,j=0; i<nleafranks; i++,j++) { 727*cd620004SJunchao Zhang disp = (leafoffset[i] - leafoffset[ndleafranks])*link->unitbytes; 728*cd620004SJunchao Zhang ierr = PetscMPIIntCast(leafoffset[i+1]-leafoffset[i],&n);CHKERRQ(ierr); 729*cd620004SJunchao Zhang ierr = MPI_Send_init(link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi]+disp,n,unit,sf->ranks[i],link->tag,comm,link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi]+j);CHKERRQ(ierr); 730*cd620004SJunchao Zhang } 731*cd620004SJunchao Zhang } else { /* PETSCSF_ROOT2LEAF */ 732*cd620004SJunchao Zhang for (i=ndleafranks,j=0; i<nleafranks; i++,j++) { 733*cd620004SJunchao Zhang disp = (leafoffset[i] - leafoffset[ndleafranks])*link->unitbytes; 734*cd620004SJunchao Zhang ierr = PetscMPIIntCast(leafoffset[i+1]-leafoffset[i],&n);CHKERRQ(ierr); 735*cd620004SJunchao Zhang ierr = MPI_Recv_init(link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi]+disp,n,unit,sf->ranks[i],link->tag,comm,link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi]+j);CHKERRQ(ierr); 736*cd620004SJunchao Zhang } 737*cd620004SJunchao Zhang } 738*cd620004SJunchao Zhang link->leafreqsinited[direction][leafmtype_mpi][leafdirect_mpi] = PETSC_TRUE; 739*cd620004SJunchao Zhang } 740*cd620004SJunchao Zhang } 741*cd620004SJunchao Zhang if (rootbuf) *rootbuf = link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi]; 742*cd620004SJunchao Zhang if (leafbuf) *leafbuf = link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi]; 743*cd620004SJunchao Zhang if (rootreqs) *rootreqs = link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi]; 744*cd620004SJunchao Zhang if (leafreqs) *leafreqs = link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi]; 745*cd620004SJunchao Zhang PetscFunctionReturn(0); 746*cd620004SJunchao Zhang } 747*cd620004SJunchao Zhang 748*cd620004SJunchao Zhang 749*cd620004SJunchao Zhang PetscErrorCode PetscSFLinkGetInUse(PetscSF sf,MPI_Datatype unit,const void *rootdata,const void *leafdata,PetscCopyMode cmode,PetscSFLink *mylink) 750*cd620004SJunchao Zhang { 751*cd620004SJunchao Zhang PetscErrorCode ierr; 752*cd620004SJunchao Zhang PetscSFLink link,*p; 75340e23c03SJunchao Zhang PetscSF_Basic *bas=(PetscSF_Basic*)sf->data; 75440e23c03SJunchao Zhang 75540e23c03SJunchao Zhang PetscFunctionBegin; 75640e23c03SJunchao Zhang /* Look for types in cache */ 75740e23c03SJunchao Zhang for (p=&bas->inuse; (link=*p); p=&link->next) { 75840e23c03SJunchao Zhang PetscBool match; 75940e23c03SJunchao Zhang ierr = MPIPetsc_Type_compare(unit,link->unit,&match);CHKERRQ(ierr); 760637e6665SJunchao Zhang if (match && (rootdata == link->rootdata) && (leafdata == link->leafdata)) { 76140e23c03SJunchao Zhang switch (cmode) { 76240e23c03SJunchao Zhang case PETSC_OWN_POINTER: *p = link->next; break; /* Remove from inuse list */ 76340e23c03SJunchao Zhang case PETSC_USE_POINTER: break; 76440e23c03SJunchao Zhang default: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"invalid cmode"); 76540e23c03SJunchao Zhang } 76640e23c03SJunchao Zhang *mylink = link; 76740e23c03SJunchao Zhang PetscFunctionReturn(0); 76840e23c03SJunchao Zhang } 76940e23c03SJunchao Zhang } 77040e23c03SJunchao Zhang SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Could not find pack"); 77140e23c03SJunchao Zhang PetscFunctionReturn(0); 77240e23c03SJunchao Zhang } 77340e23c03SJunchao Zhang 774*cd620004SJunchao Zhang PetscErrorCode PetscSFLinkReclaim(PetscSF sf,PetscSFLink *link) 77540e23c03SJunchao Zhang { 77640e23c03SJunchao Zhang PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 77740e23c03SJunchao Zhang 77840e23c03SJunchao Zhang PetscFunctionBegin; 779637e6665SJunchao Zhang (*link)->rootdata = NULL; 780637e6665SJunchao Zhang (*link)->leafdata = NULL; 78140e23c03SJunchao Zhang (*link)->next = bas->avail; 78240e23c03SJunchao Zhang bas->avail = *link; 78340e23c03SJunchao Zhang *link = NULL; 78440e23c03SJunchao Zhang PetscFunctionReturn(0); 78540e23c03SJunchao Zhang } 78640e23c03SJunchao Zhang 787*cd620004SJunchao Zhang /* Destroy all links chained in 'avail' */ 788*cd620004SJunchao Zhang PetscErrorCode PetscSFLinkDestroy(PetscSF sf,PetscSFLink *avail) 789eb02082bSJunchao Zhang { 790eb02082bSJunchao Zhang PetscErrorCode ierr; 791*cd620004SJunchao Zhang PetscSFLink link = *avail,next; 792*cd620004SJunchao Zhang PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 793*cd620004SJunchao Zhang PetscInt i,nreqs = (bas->nrootreqs+sf->nleafreqs)*8; 794eb02082bSJunchao Zhang 795eb02082bSJunchao Zhang PetscFunctionBegin; 796eb02082bSJunchao Zhang for (; link; link=next) { 797eb02082bSJunchao Zhang next = link->next; 798eb02082bSJunchao Zhang if (!link->isbuiltin) {ierr = MPI_Type_free(&link->unit);CHKERRQ(ierr);} 799*cd620004SJunchao Zhang for (i=0; i<nreqs; i++) { /* Persistent reqs must be freed. */ 800eb02082bSJunchao Zhang if (link->reqs[i] != MPI_REQUEST_NULL) {ierr = MPI_Request_free(&link->reqs[i]);CHKERRQ(ierr);} 801eb02082bSJunchao Zhang } 802eb02082bSJunchao Zhang ierr = PetscFree(link->reqs);CHKERRQ(ierr); 803*cd620004SJunchao Zhang for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) { 80451ccb202SJunchao Zhang #if defined(PETSC_HAVE_CUDA) 805*cd620004SJunchao Zhang ierr = PetscFreeWithMemType(PETSC_MEMTYPE_DEVICE,link->rootbuf_alloc[i][PETSC_MEMTYPE_DEVICE]);CHKERRQ(ierr); 806*cd620004SJunchao Zhang ierr = PetscFreeWithMemType(PETSC_MEMTYPE_DEVICE,link->leafbuf_alloc[i][PETSC_MEMTYPE_DEVICE]);CHKERRQ(ierr); 807eb02082bSJunchao Zhang if (link->stream) {cudaError_t err = cudaStreamDestroy(link->stream);CHKERRCUDA(err); link->stream = NULL;} 808eb02082bSJunchao Zhang #endif 809*cd620004SJunchao Zhang ierr = PetscFree(link->rootbuf_alloc[i][PETSC_MEMTYPE_HOST]);CHKERRQ(ierr); 810*cd620004SJunchao Zhang ierr = PetscFree(link->leafbuf_alloc[i][PETSC_MEMTYPE_HOST]);CHKERRQ(ierr); 811*cd620004SJunchao Zhang } 812eb02082bSJunchao Zhang ierr = PetscFree(link);CHKERRQ(ierr); 813eb02082bSJunchao Zhang } 814eb02082bSJunchao Zhang *avail = NULL; 815eb02082bSJunchao Zhang PetscFunctionReturn(0); 816eb02082bSJunchao Zhang } 817eb02082bSJunchao Zhang 818*cd620004SJunchao Zhang #if defined(PETSC_USE_DEBUG) 8199d1c8addSJunchao Zhang /* Error out on unsupported overlapped communications */ 820*cd620004SJunchao Zhang PetscErrorCode PetscSFSetErrorOnUnsupportedOverlap(PetscSF sf,MPI_Datatype unit,const void *rootdata,const void *leafdata) 8219d1c8addSJunchao Zhang { 8229d1c8addSJunchao Zhang PetscErrorCode ierr; 823*cd620004SJunchao Zhang PetscSFLink link,*p; 8249d1c8addSJunchao Zhang PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 8259d1c8addSJunchao Zhang PetscBool match; 8269d1c8addSJunchao Zhang 8279d1c8addSJunchao Zhang PetscFunctionBegin; 82818fb5014SJunchao Zhang /* Look up links in use and error out if there is a match. When both rootdata and leafdata are NULL, ignore 82918fb5014SJunchao Zhang the potential overlapping since this process does not participate in communication. Overlapping is harmless. 83018fb5014SJunchao Zhang */ 831637e6665SJunchao Zhang if (rootdata || leafdata) { 8329d1c8addSJunchao Zhang for (p=&bas->inuse; (link=*p); p=&link->next) { 8339d1c8addSJunchao Zhang ierr = MPIPetsc_Type_compare(unit,link->unit,&match);CHKERRQ(ierr); 834*cd620004SJunchao Zhang if (match && (rootdata == link->rootdata) && (leafdata == link->leafdata)) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_SUP,"Overlapped PetscSF with the same rootdata(%p), leafdata(%p) and data type. Undo the overlapping to avoid the error.",rootdata,leafdata); 8359d1c8addSJunchao Zhang } 83618fb5014SJunchao Zhang } 8379d1c8addSJunchao Zhang PetscFunctionReturn(0); 8389d1c8addSJunchao Zhang } 839*cd620004SJunchao Zhang #endif 8409d1c8addSJunchao Zhang 841*cd620004SJunchao Zhang PetscErrorCode PetscSFLinkSetUp_Host(PetscSF sf,PetscSFLink link,MPI_Datatype unit) 84240e23c03SJunchao Zhang { 84340e23c03SJunchao Zhang PetscErrorCode ierr; 844b23bfdefSJunchao Zhang PetscInt nSignedChar=0,nUnsignedChar=0,nInt=0,nPetscInt=0,nPetscReal=0; 845b23bfdefSJunchao Zhang PetscBool is2Int,is2PetscInt; 84640e23c03SJunchao Zhang PetscMPIInt ni,na,nd,combiner; 84740e23c03SJunchao Zhang #if defined(PETSC_HAVE_COMPLEX) 848b23bfdefSJunchao Zhang PetscInt nPetscComplex=0; 84940e23c03SJunchao Zhang #endif 85040e23c03SJunchao Zhang 85140e23c03SJunchao Zhang PetscFunctionBegin; 852b23bfdefSJunchao Zhang ierr = MPIPetsc_Type_compare_contig(unit,MPI_SIGNED_CHAR, &nSignedChar);CHKERRQ(ierr); 853b23bfdefSJunchao Zhang ierr = MPIPetsc_Type_compare_contig(unit,MPI_UNSIGNED_CHAR,&nUnsignedChar);CHKERRQ(ierr); 854b23bfdefSJunchao Zhang /* MPI_CHAR is treated below as a dumb type that does not support reduction according to MPI standard */ 855b23bfdefSJunchao Zhang ierr = MPIPetsc_Type_compare_contig(unit,MPI_INT, &nInt);CHKERRQ(ierr); 856b23bfdefSJunchao Zhang ierr = MPIPetsc_Type_compare_contig(unit,MPIU_INT, &nPetscInt);CHKERRQ(ierr); 857b23bfdefSJunchao Zhang ierr = MPIPetsc_Type_compare_contig(unit,MPIU_REAL,&nPetscReal);CHKERRQ(ierr); 85840e23c03SJunchao Zhang #if defined(PETSC_HAVE_COMPLEX) 859b23bfdefSJunchao Zhang ierr = MPIPetsc_Type_compare_contig(unit,MPIU_COMPLEX,&nPetscComplex);CHKERRQ(ierr); 86040e23c03SJunchao Zhang #endif 86140e23c03SJunchao Zhang ierr = MPIPetsc_Type_compare(unit,MPI_2INT,&is2Int);CHKERRQ(ierr); 86240e23c03SJunchao Zhang ierr = MPIPetsc_Type_compare(unit,MPIU_2INT,&is2PetscInt);CHKERRQ(ierr); 863b23bfdefSJunchao Zhang /* TODO: shaell we also handle Fortran MPI_2REAL? */ 86440e23c03SJunchao Zhang ierr = MPI_Type_get_envelope(unit,&ni,&na,&nd,&combiner);CHKERRQ(ierr); 8655ad15460SJunchao Zhang link->isbuiltin = (combiner == MPI_COMBINER_NAMED) ? PETSC_TRUE : PETSC_FALSE; /* unit is MPI builtin */ 866b23bfdefSJunchao Zhang link->bs = 1; /* default */ 86740e23c03SJunchao Zhang 868eb02082bSJunchao Zhang if (is2Int) { 869*cd620004SJunchao Zhang PackInit_PairType_int_int_1_1(link); 870eb02082bSJunchao Zhang link->bs = 1; 871eb02082bSJunchao Zhang link->unitbytes = 2*sizeof(int); 8725ad15460SJunchao Zhang link->isbuiltin = PETSC_TRUE; /* unit is PETSc builtin */ 873eb02082bSJunchao Zhang link->basicunit = MPI_2INT; 8745ad15460SJunchao Zhang link->unit = MPI_2INT; 875eb02082bSJunchao Zhang } else if (is2PetscInt) { /* TODO: when is2PetscInt and nPetscInt=2, we don't know which path to take. The two paths support different ops. */ 876*cd620004SJunchao Zhang PackInit_PairType_PetscInt_PetscInt_1_1(link); 877eb02082bSJunchao Zhang link->bs = 1; 878eb02082bSJunchao Zhang link->unitbytes = 2*sizeof(PetscInt); 879eb02082bSJunchao Zhang link->basicunit = MPIU_2INT; 8805ad15460SJunchao Zhang link->isbuiltin = PETSC_TRUE; /* unit is PETSc builtin */ 8815ad15460SJunchao Zhang link->unit = MPIU_2INT; 882eb02082bSJunchao Zhang } else if (nPetscReal) { 883b23bfdefSJunchao Zhang if (nPetscReal == 8) PackInit_RealType_PetscReal_8_1(link); else if (nPetscReal%8 == 0) PackInit_RealType_PetscReal_8_0(link); 884b23bfdefSJunchao Zhang else if (nPetscReal == 4) PackInit_RealType_PetscReal_4_1(link); else if (nPetscReal%4 == 0) PackInit_RealType_PetscReal_4_0(link); 885b23bfdefSJunchao Zhang else if (nPetscReal == 2) PackInit_RealType_PetscReal_2_1(link); else if (nPetscReal%2 == 0) PackInit_RealType_PetscReal_2_0(link); 886b23bfdefSJunchao Zhang else if (nPetscReal == 1) PackInit_RealType_PetscReal_1_1(link); else if (nPetscReal%1 == 0) PackInit_RealType_PetscReal_1_0(link); 887b23bfdefSJunchao Zhang link->bs = nPetscReal; 888eb02082bSJunchao Zhang link->unitbytes = nPetscReal*sizeof(PetscReal); 88940e23c03SJunchao Zhang link->basicunit = MPIU_REAL; 8905ad15460SJunchao Zhang if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPIU_REAL;} 891b23bfdefSJunchao Zhang } else if (nPetscInt) { 892b23bfdefSJunchao Zhang if (nPetscInt == 8) PackInit_IntegerType_PetscInt_8_1(link); else if (nPetscInt%8 == 0) PackInit_IntegerType_PetscInt_8_0(link); 893b23bfdefSJunchao Zhang else if (nPetscInt == 4) PackInit_IntegerType_PetscInt_4_1(link); else if (nPetscInt%4 == 0) PackInit_IntegerType_PetscInt_4_0(link); 894b23bfdefSJunchao Zhang else if (nPetscInt == 2) PackInit_IntegerType_PetscInt_2_1(link); else if (nPetscInt%2 == 0) PackInit_IntegerType_PetscInt_2_0(link); 895b23bfdefSJunchao Zhang else if (nPetscInt == 1) PackInit_IntegerType_PetscInt_1_1(link); else if (nPetscInt%1 == 0) PackInit_IntegerType_PetscInt_1_0(link); 896b23bfdefSJunchao Zhang link->bs = nPetscInt; 897eb02082bSJunchao Zhang link->unitbytes = nPetscInt*sizeof(PetscInt); 898b23bfdefSJunchao Zhang link->basicunit = MPIU_INT; 8995ad15460SJunchao Zhang if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPIU_INT;} 900b23bfdefSJunchao Zhang #if defined(PETSC_USE_64BIT_INDICES) 901b23bfdefSJunchao Zhang } else if (nInt) { 902b23bfdefSJunchao Zhang if (nInt == 8) PackInit_IntegerType_int_8_1(link); else if (nInt%8 == 0) PackInit_IntegerType_int_8_0(link); 903b23bfdefSJunchao Zhang else if (nInt == 4) PackInit_IntegerType_int_4_1(link); else if (nInt%4 == 0) PackInit_IntegerType_int_4_0(link); 904b23bfdefSJunchao Zhang else if (nInt == 2) PackInit_IntegerType_int_2_1(link); else if (nInt%2 == 0) PackInit_IntegerType_int_2_0(link); 905b23bfdefSJunchao Zhang else if (nInt == 1) PackInit_IntegerType_int_1_1(link); else if (nInt%1 == 0) PackInit_IntegerType_int_1_0(link); 906b23bfdefSJunchao Zhang link->bs = nInt; 907eb02082bSJunchao Zhang link->unitbytes = nInt*sizeof(int); 908b23bfdefSJunchao Zhang link->basicunit = MPI_INT; 9095ad15460SJunchao Zhang if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPI_INT;} 910b23bfdefSJunchao Zhang #endif 911b23bfdefSJunchao Zhang } else if (nSignedChar) { 912b23bfdefSJunchao Zhang if (nSignedChar == 8) PackInit_IntegerType_SignedChar_8_1(link); else if (nSignedChar%8 == 0) PackInit_IntegerType_SignedChar_8_0(link); 913b23bfdefSJunchao Zhang else if (nSignedChar == 4) PackInit_IntegerType_SignedChar_4_1(link); else if (nSignedChar%4 == 0) PackInit_IntegerType_SignedChar_4_0(link); 914b23bfdefSJunchao Zhang else if (nSignedChar == 2) PackInit_IntegerType_SignedChar_2_1(link); else if (nSignedChar%2 == 0) PackInit_IntegerType_SignedChar_2_0(link); 915b23bfdefSJunchao Zhang else if (nSignedChar == 1) PackInit_IntegerType_SignedChar_1_1(link); else if (nSignedChar%1 == 0) PackInit_IntegerType_SignedChar_1_0(link); 916b23bfdefSJunchao Zhang link->bs = nSignedChar; 917eb02082bSJunchao Zhang link->unitbytes = nSignedChar*sizeof(SignedChar); 918b23bfdefSJunchao Zhang link->basicunit = MPI_SIGNED_CHAR; 9195ad15460SJunchao Zhang if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPI_SIGNED_CHAR;} 920b23bfdefSJunchao Zhang } else if (nUnsignedChar) { 921b23bfdefSJunchao Zhang if (nUnsignedChar == 8) PackInit_IntegerType_UnsignedChar_8_1(link); else if (nUnsignedChar%8 == 0) PackInit_IntegerType_UnsignedChar_8_0(link); 922b23bfdefSJunchao Zhang else if (nUnsignedChar == 4) PackInit_IntegerType_UnsignedChar_4_1(link); else if (nUnsignedChar%4 == 0) PackInit_IntegerType_UnsignedChar_4_0(link); 923b23bfdefSJunchao Zhang else if (nUnsignedChar == 2) PackInit_IntegerType_UnsignedChar_2_1(link); else if (nUnsignedChar%2 == 0) PackInit_IntegerType_UnsignedChar_2_0(link); 924b23bfdefSJunchao Zhang else if (nUnsignedChar == 1) PackInit_IntegerType_UnsignedChar_1_1(link); else if (nUnsignedChar%1 == 0) PackInit_IntegerType_UnsignedChar_1_0(link); 925b23bfdefSJunchao Zhang link->bs = nUnsignedChar; 926eb02082bSJunchao Zhang link->unitbytes = nUnsignedChar*sizeof(UnsignedChar); 927b23bfdefSJunchao Zhang link->basicunit = MPI_UNSIGNED_CHAR; 9285ad15460SJunchao Zhang if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPI_UNSIGNED_CHAR;} 92940e23c03SJunchao Zhang #if defined(PETSC_HAVE_COMPLEX) 930b23bfdefSJunchao Zhang } else if (nPetscComplex) { 931b23bfdefSJunchao Zhang if (nPetscComplex == 8) PackInit_ComplexType_PetscComplex_8_1(link); else if (nPetscComplex%8 == 0) PackInit_ComplexType_PetscComplex_8_0(link); 932b23bfdefSJunchao Zhang else if (nPetscComplex == 4) PackInit_ComplexType_PetscComplex_4_1(link); else if (nPetscComplex%4 == 0) PackInit_ComplexType_PetscComplex_4_0(link); 933b23bfdefSJunchao Zhang else if (nPetscComplex == 2) PackInit_ComplexType_PetscComplex_2_1(link); else if (nPetscComplex%2 == 0) PackInit_ComplexType_PetscComplex_2_0(link); 934b23bfdefSJunchao Zhang else if (nPetscComplex == 1) PackInit_ComplexType_PetscComplex_1_1(link); else if (nPetscComplex%1 == 0) PackInit_ComplexType_PetscComplex_1_0(link); 935b23bfdefSJunchao Zhang link->bs = nPetscComplex; 936eb02082bSJunchao Zhang link->unitbytes = nPetscComplex*sizeof(PetscComplex); 93740e23c03SJunchao Zhang link->basicunit = MPIU_COMPLEX; 9385ad15460SJunchao Zhang if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPIU_COMPLEX;} 93940e23c03SJunchao Zhang #endif 94040e23c03SJunchao Zhang } else { 941b23bfdefSJunchao Zhang MPI_Aint lb,nbyte; 942b23bfdefSJunchao Zhang ierr = MPI_Type_get_extent(unit,&lb,&nbyte);CHKERRQ(ierr); 94340e23c03SJunchao Zhang if (lb != 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Datatype with nonzero lower bound %ld\n",(long)lb); 944eb02082bSJunchao Zhang if (nbyte % sizeof(int)) { /* If the type size is not multiple of int */ 945eb02082bSJunchao Zhang if (nbyte == 4) PackInit_DumbType_char_4_1(link); else if (nbyte%4 == 0) PackInit_DumbType_char_4_0(link); 946eb02082bSJunchao Zhang else if (nbyte == 2) PackInit_DumbType_char_2_1(link); else if (nbyte%2 == 0) PackInit_DumbType_char_2_0(link); 947eb02082bSJunchao Zhang else if (nbyte == 1) PackInit_DumbType_char_1_1(link); else if (nbyte%1 == 0) PackInit_DumbType_char_1_0(link); 948eb02082bSJunchao Zhang link->bs = nbyte; 949b23bfdefSJunchao Zhang link->unitbytes = nbyte; 950b23bfdefSJunchao Zhang link->basicunit = MPI_BYTE; 95140e23c03SJunchao Zhang } else { 952eb02082bSJunchao Zhang nInt = nbyte / sizeof(int); 953eb02082bSJunchao Zhang if (nInt == 8) PackInit_DumbType_DumbInt_8_1(link); else if (nInt%8 == 0) PackInit_DumbType_DumbInt_8_0(link); 954eb02082bSJunchao Zhang else if (nInt == 4) PackInit_DumbType_DumbInt_4_1(link); else if (nInt%4 == 0) PackInit_DumbType_DumbInt_4_0(link); 955eb02082bSJunchao Zhang else if (nInt == 2) PackInit_DumbType_DumbInt_2_1(link); else if (nInt%2 == 0) PackInit_DumbType_DumbInt_2_0(link); 956eb02082bSJunchao Zhang else if (nInt == 1) PackInit_DumbType_DumbInt_1_1(link); else if (nInt%1 == 0) PackInit_DumbType_DumbInt_1_0(link); 957eb02082bSJunchao Zhang link->bs = nInt; 958b23bfdefSJunchao Zhang link->unitbytes = nbyte; 95940e23c03SJunchao Zhang link->basicunit = MPI_INT; 96040e23c03SJunchao Zhang } 9615ad15460SJunchao Zhang if (link->isbuiltin) link->unit = unit; 96240e23c03SJunchao Zhang } 963b23bfdefSJunchao Zhang 9645ad15460SJunchao Zhang if (!link->isbuiltin) {ierr = MPI_Type_dup(unit,&link->unit);CHKERRQ(ierr);} 96540e23c03SJunchao Zhang PetscFunctionReturn(0); 96640e23c03SJunchao Zhang } 96740e23c03SJunchao Zhang 968*cd620004SJunchao Zhang PetscErrorCode PetscSFLinkGetUnpackAndOp(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*)) 96940e23c03SJunchao Zhang { 97040e23c03SJunchao Zhang PetscFunctionBegin; 97140e23c03SJunchao Zhang *UnpackAndOp = NULL; 972eb02082bSJunchao Zhang if (mtype == PETSC_MEMTYPE_HOST) { 973eb02082bSJunchao Zhang if (op == MPIU_REPLACE) *UnpackAndOp = link->h_UnpackAndInsert; 974eb02082bSJunchao Zhang else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->h_UnpackAndAdd; 975eb02082bSJunchao Zhang else if (op == MPI_PROD) *UnpackAndOp = link->h_UnpackAndMult; 976eb02082bSJunchao Zhang else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->h_UnpackAndMax; 977eb02082bSJunchao Zhang else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->h_UnpackAndMin; 978eb02082bSJunchao Zhang else if (op == MPI_LAND) *UnpackAndOp = link->h_UnpackAndLAND; 979eb02082bSJunchao Zhang else if (op == MPI_BAND) *UnpackAndOp = link->h_UnpackAndBAND; 980eb02082bSJunchao Zhang else if (op == MPI_LOR) *UnpackAndOp = link->h_UnpackAndLOR; 981eb02082bSJunchao Zhang else if (op == MPI_BOR) *UnpackAndOp = link->h_UnpackAndBOR; 982eb02082bSJunchao Zhang else if (op == MPI_LXOR) *UnpackAndOp = link->h_UnpackAndLXOR; 983eb02082bSJunchao Zhang else if (op == MPI_BXOR) *UnpackAndOp = link->h_UnpackAndBXOR; 984eb02082bSJunchao Zhang else if (op == MPI_MAXLOC) *UnpackAndOp = link->h_UnpackAndMaxloc; 985eb02082bSJunchao Zhang else if (op == MPI_MINLOC) *UnpackAndOp = link->h_UnpackAndMinloc; 986eb02082bSJunchao Zhang } 987eb02082bSJunchao Zhang #if defined(PETSC_HAVE_CUDA) 988eb02082bSJunchao Zhang else if (mtype == PETSC_MEMTYPE_DEVICE && !atomic) { 989eb02082bSJunchao Zhang if (op == MPIU_REPLACE) *UnpackAndOp = link->d_UnpackAndInsert; 990eb02082bSJunchao Zhang else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->d_UnpackAndAdd; 991eb02082bSJunchao Zhang else if (op == MPI_PROD) *UnpackAndOp = link->d_UnpackAndMult; 992eb02082bSJunchao Zhang else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->d_UnpackAndMax; 993eb02082bSJunchao Zhang else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->d_UnpackAndMin; 994eb02082bSJunchao Zhang else if (op == MPI_LAND) *UnpackAndOp = link->d_UnpackAndLAND; 995eb02082bSJunchao Zhang else if (op == MPI_BAND) *UnpackAndOp = link->d_UnpackAndBAND; 996eb02082bSJunchao Zhang else if (op == MPI_LOR) *UnpackAndOp = link->d_UnpackAndLOR; 997eb02082bSJunchao Zhang else if (op == MPI_BOR) *UnpackAndOp = link->d_UnpackAndBOR; 998eb02082bSJunchao Zhang else if (op == MPI_LXOR) *UnpackAndOp = link->d_UnpackAndLXOR; 999eb02082bSJunchao Zhang else if (op == MPI_BXOR) *UnpackAndOp = link->d_UnpackAndBXOR; 1000eb02082bSJunchao Zhang else if (op == MPI_MAXLOC) *UnpackAndOp = link->d_UnpackAndMaxloc; 1001eb02082bSJunchao Zhang else if (op == MPI_MINLOC) *UnpackAndOp = link->d_UnpackAndMinloc; 1002eb02082bSJunchao Zhang } else if (mtype == PETSC_MEMTYPE_DEVICE && atomic) { 1003eb02082bSJunchao Zhang if (op == MPIU_REPLACE) *UnpackAndOp = link->da_UnpackAndInsert; 1004eb02082bSJunchao Zhang else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->da_UnpackAndAdd; 1005eb02082bSJunchao Zhang else if (op == MPI_PROD) *UnpackAndOp = link->da_UnpackAndMult; 1006eb02082bSJunchao Zhang else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->da_UnpackAndMax; 1007eb02082bSJunchao Zhang else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->da_UnpackAndMin; 1008eb02082bSJunchao Zhang else if (op == MPI_LAND) *UnpackAndOp = link->da_UnpackAndLAND; 1009eb02082bSJunchao Zhang else if (op == MPI_BAND) *UnpackAndOp = link->da_UnpackAndBAND; 1010eb02082bSJunchao Zhang else if (op == MPI_LOR) *UnpackAndOp = link->da_UnpackAndLOR; 1011eb02082bSJunchao Zhang else if (op == MPI_BOR) *UnpackAndOp = link->da_UnpackAndBOR; 1012eb02082bSJunchao Zhang else if (op == MPI_LXOR) *UnpackAndOp = link->da_UnpackAndLXOR; 1013eb02082bSJunchao Zhang else if (op == MPI_BXOR) *UnpackAndOp = link->da_UnpackAndBXOR; 1014eb02082bSJunchao Zhang else if (op == MPI_MAXLOC) *UnpackAndOp = link->da_UnpackAndMaxloc; 1015eb02082bSJunchao Zhang else if (op == MPI_MINLOC) *UnpackAndOp = link->da_UnpackAndMinloc; 1016eb02082bSJunchao Zhang } 1017eb02082bSJunchao Zhang #endif 101840e23c03SJunchao Zhang PetscFunctionReturn(0); 101940e23c03SJunchao Zhang } 102040e23c03SJunchao Zhang 1021*cd620004SJunchao Zhang PetscErrorCode PetscSFLinkGetScatterAndOp(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**ScatterAndOp)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*)) 102240e23c03SJunchao Zhang { 102340e23c03SJunchao Zhang PetscFunctionBegin; 1024*cd620004SJunchao Zhang *ScatterAndOp = NULL; 1025eb02082bSJunchao Zhang if (mtype == PETSC_MEMTYPE_HOST) { 1026*cd620004SJunchao Zhang if (op == MPIU_REPLACE) *ScatterAndOp = link->h_ScatterAndInsert; 1027*cd620004SJunchao Zhang else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->h_ScatterAndAdd; 1028*cd620004SJunchao Zhang else if (op == MPI_PROD) *ScatterAndOp = link->h_ScatterAndMult; 1029*cd620004SJunchao Zhang else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->h_ScatterAndMax; 1030*cd620004SJunchao Zhang else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->h_ScatterAndMin; 1031*cd620004SJunchao Zhang else if (op == MPI_LAND) *ScatterAndOp = link->h_ScatterAndLAND; 1032*cd620004SJunchao Zhang else if (op == MPI_BAND) *ScatterAndOp = link->h_ScatterAndBAND; 1033*cd620004SJunchao Zhang else if (op == MPI_LOR) *ScatterAndOp = link->h_ScatterAndLOR; 1034*cd620004SJunchao Zhang else if (op == MPI_BOR) *ScatterAndOp = link->h_ScatterAndBOR; 1035*cd620004SJunchao Zhang else if (op == MPI_LXOR) *ScatterAndOp = link->h_ScatterAndLXOR; 1036*cd620004SJunchao Zhang else if (op == MPI_BXOR) *ScatterAndOp = link->h_ScatterAndBXOR; 1037*cd620004SJunchao Zhang else if (op == MPI_MAXLOC) *ScatterAndOp = link->h_ScatterAndMaxloc; 1038*cd620004SJunchao Zhang else if (op == MPI_MINLOC) *ScatterAndOp = link->h_ScatterAndMinloc; 1039eb02082bSJunchao Zhang } 1040eb02082bSJunchao Zhang #if defined(PETSC_HAVE_CUDA) 1041eb02082bSJunchao Zhang else if (mtype == PETSC_MEMTYPE_DEVICE && !atomic) { 1042*cd620004SJunchao Zhang if (op == MPIU_REPLACE) *ScatterAndOp = link->d_ScatterAndInsert; 1043*cd620004SJunchao Zhang else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->d_ScatterAndAdd; 1044*cd620004SJunchao Zhang else if (op == MPI_PROD) *ScatterAndOp = link->d_ScatterAndMult; 1045*cd620004SJunchao Zhang else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->d_ScatterAndMax; 1046*cd620004SJunchao Zhang else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->d_ScatterAndMin; 1047*cd620004SJunchao Zhang else if (op == MPI_LAND) *ScatterAndOp = link->d_ScatterAndLAND; 1048*cd620004SJunchao Zhang else if (op == MPI_BAND) *ScatterAndOp = link->d_ScatterAndBAND; 1049*cd620004SJunchao Zhang else if (op == MPI_LOR) *ScatterAndOp = link->d_ScatterAndLOR; 1050*cd620004SJunchao Zhang else if (op == MPI_BOR) *ScatterAndOp = link->d_ScatterAndBOR; 1051*cd620004SJunchao Zhang else if (op == MPI_LXOR) *ScatterAndOp = link->d_ScatterAndLXOR; 1052*cd620004SJunchao Zhang else if (op == MPI_BXOR) *ScatterAndOp = link->d_ScatterAndBXOR; 1053*cd620004SJunchao Zhang else if (op == MPI_MAXLOC) *ScatterAndOp = link->d_ScatterAndMaxloc; 1054*cd620004SJunchao Zhang else if (op == MPI_MINLOC) *ScatterAndOp = link->d_ScatterAndMinloc; 1055eb02082bSJunchao Zhang } else if (mtype == PETSC_MEMTYPE_DEVICE && atomic) { 1056*cd620004SJunchao Zhang if (op == MPIU_REPLACE) *ScatterAndOp = link->da_ScatterAndInsert; 1057*cd620004SJunchao Zhang else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->da_ScatterAndAdd; 1058*cd620004SJunchao Zhang else if (op == MPI_PROD) *ScatterAndOp = link->da_ScatterAndMult; 1059*cd620004SJunchao Zhang else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->da_ScatterAndMax; 1060*cd620004SJunchao Zhang else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->da_ScatterAndMin; 1061*cd620004SJunchao Zhang else if (op == MPI_LAND) *ScatterAndOp = link->da_ScatterAndLAND; 1062*cd620004SJunchao Zhang else if (op == MPI_BAND) *ScatterAndOp = link->da_ScatterAndBAND; 1063*cd620004SJunchao Zhang else if (op == MPI_LOR) *ScatterAndOp = link->da_ScatterAndLOR; 1064*cd620004SJunchao Zhang else if (op == MPI_BOR) *ScatterAndOp = link->da_ScatterAndBOR; 1065*cd620004SJunchao Zhang else if (op == MPI_LXOR) *ScatterAndOp = link->da_ScatterAndLXOR; 1066*cd620004SJunchao Zhang else if (op == MPI_BXOR) *ScatterAndOp = link->da_ScatterAndBXOR; 1067*cd620004SJunchao Zhang else if (op == MPI_MAXLOC) *ScatterAndOp = link->da_ScatterAndMaxloc; 1068*cd620004SJunchao Zhang else if (op == MPI_MINLOC) *ScatterAndOp = link->da_ScatterAndMinloc; 1069eb02082bSJunchao Zhang } 1070eb02082bSJunchao Zhang #endif 1071*cd620004SJunchao Zhang PetscFunctionReturn(0); 1072*cd620004SJunchao Zhang } 1073*cd620004SJunchao Zhang 1074*cd620004SJunchao Zhang PetscErrorCode PetscSFLinkGetFetchAndOp(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**FetchAndOp)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,void*)) 1075*cd620004SJunchao Zhang { 1076*cd620004SJunchao Zhang PetscFunctionBegin; 1077*cd620004SJunchao Zhang *FetchAndOp = NULL; 1078*cd620004SJunchao Zhang if (op != MPI_SUM && op != MPIU_SUM) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for MPI_Op in FetchAndOp"); 1079*cd620004SJunchao Zhang if (mtype == PETSC_MEMTYPE_HOST) *FetchAndOp = link->h_FetchAndAdd; 1080*cd620004SJunchao Zhang #if defined(PETSC_HAVE_CUDA) 1081*cd620004SJunchao Zhang else if (mtype == PETSC_MEMTYPE_DEVICE && !atomic) *FetchAndOp = link->d_FetchAndAdd; 1082*cd620004SJunchao Zhang else if (mtype == PETSC_MEMTYPE_DEVICE && atomic) *FetchAndOp = link->da_FetchAndAdd; 1083*cd620004SJunchao Zhang #endif 1084*cd620004SJunchao Zhang PetscFunctionReturn(0); 1085*cd620004SJunchao Zhang } 1086*cd620004SJunchao Zhang 1087*cd620004SJunchao Zhang PetscErrorCode PetscSFLinkGetFetchAndOpLocal(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**FetchAndOpLocal)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,void*,PetscInt,const PetscInt*,const void*,void*)) 1088*cd620004SJunchao Zhang { 1089*cd620004SJunchao Zhang PetscFunctionBegin; 1090*cd620004SJunchao Zhang *FetchAndOpLocal = NULL; 1091*cd620004SJunchao Zhang if (op != MPI_SUM && op != MPIU_SUM) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for MPI_Op in FetchAndOp"); 1092*cd620004SJunchao Zhang if (mtype == PETSC_MEMTYPE_HOST) *FetchAndOpLocal = link->h_FetchAndAddLocal; 1093*cd620004SJunchao Zhang #if defined(PETSC_HAVE_CUDA) 1094*cd620004SJunchao Zhang else if (mtype == PETSC_MEMTYPE_DEVICE && !atomic) *FetchAndOpLocal = link->d_FetchAndAddLocal; 1095*cd620004SJunchao Zhang else if (mtype == PETSC_MEMTYPE_DEVICE && atomic) *FetchAndOpLocal = link->da_FetchAndAddLocal; 1096*cd620004SJunchao Zhang #endif 1097*cd620004SJunchao Zhang PetscFunctionReturn(0); 1098*cd620004SJunchao Zhang } 1099*cd620004SJunchao Zhang 1100*cd620004SJunchao Zhang /*============================================================================= 1101*cd620004SJunchao Zhang A set of helper routines for Pack/Unpack/Scatter on GPUs 1102*cd620004SJunchao Zhang ============================================================================*/ 1103*cd620004SJunchao Zhang #if defined(PETSC_HAVE_CUDA) 1104*cd620004SJunchao Zhang /* If SF does not know which stream root/leafdata is being computed on, it has to sync the device to 1105*cd620004SJunchao Zhang make sure the data is ready for packing. 1106*cd620004SJunchao Zhang */ 1107*cd620004SJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkSyncDeviceBeforePackData(PetscSF sf,PetscSFLink link) 1108*cd620004SJunchao Zhang { 1109*cd620004SJunchao Zhang PetscFunctionBegin; 1110*cd620004SJunchao Zhang if (sf->use_default_stream) PetscFunctionReturn(0); 1111*cd620004SJunchao Zhang if (link->rootmtype == PETSC_MEMTYPE_DEVICE || link->leafmtype == PETSC_MEMTYPE_DEVICE) { 1112*cd620004SJunchao Zhang cudaError_t cerr = cudaDeviceSynchronize();CHKERRCUDA(cerr); 1113*cd620004SJunchao Zhang } 1114*cd620004SJunchao Zhang PetscFunctionReturn(0); 1115*cd620004SJunchao Zhang } 1116*cd620004SJunchao Zhang 1117*cd620004SJunchao Zhang /* PetscSFLinkSyncStreamAfterPackXxxData routines make sure root/leafbuf for the remote is ready for MPI */ 1118*cd620004SJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkSyncStreamAfterPackRootData(PetscSF sf,PetscSFLink link) 1119*cd620004SJunchao Zhang { 1120*cd620004SJunchao Zhang PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 1121*cd620004SJunchao Zhang 1122*cd620004SJunchao Zhang PetscFunctionBegin; 1123*cd620004SJunchao Zhang /* rootdata is on device && send non-empty to remote && (we called a packing kernel || we async-copied rootdata from device to host) */ 1124*cd620004SJunchao Zhang if (link->rootmtype == PETSC_MEMTYPE_DEVICE && bas->rootbuflen[PETSCSF_REMOTE] && (!link->rootdirect[PETSCSF_REMOTE] || !use_gpu_aware_mpi)) { 1125*cd620004SJunchao Zhang cudaError_t cerr = cudaStreamSynchronize(link->stream);CHKERRCUDA(cerr); 1126*cd620004SJunchao Zhang } 1127*cd620004SJunchao Zhang PetscFunctionReturn(0); 1128*cd620004SJunchao Zhang } 1129*cd620004SJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkSyncStreamAfterPackLeafData(PetscSF sf,PetscSFLink link) 1130*cd620004SJunchao Zhang { 1131*cd620004SJunchao Zhang PetscFunctionBegin; 1132*cd620004SJunchao Zhang if (link->leafmtype == PETSC_MEMTYPE_DEVICE && sf->leafbuflen[PETSCSF_REMOTE] && (!link->leafdirect[PETSCSF_REMOTE] || !use_gpu_aware_mpi)) { 1133*cd620004SJunchao Zhang cudaError_t cerr = cudaStreamSynchronize(link->stream);CHKERRCUDA(cerr); 1134*cd620004SJunchao Zhang } 1135*cd620004SJunchao Zhang PetscFunctionReturn(0); 1136*cd620004SJunchao Zhang } 1137*cd620004SJunchao Zhang 1138*cd620004SJunchao Zhang /* PetscSFLinkSyncStreamAfterUnpackXxx routines make sure root/leafdata (local & remote) is ready to use for SF callers, when SF 1139*cd620004SJunchao Zhang does not know which stream the callers will use. 1140*cd620004SJunchao Zhang */ 1141*cd620004SJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkSyncStreamAfterUnpackRootData(PetscSF sf,PetscSFLink link) 1142*cd620004SJunchao Zhang { 1143*cd620004SJunchao Zhang PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 1144*cd620004SJunchao Zhang 1145*cd620004SJunchao Zhang PetscFunctionBegin; 1146*cd620004SJunchao Zhang if (sf->use_default_stream) PetscFunctionReturn(0); 1147*cd620004SJunchao Zhang if (link->rootmtype == PETSC_MEMTYPE_DEVICE && (bas->rootbuflen[PETSCSF_LOCAL] || bas->rootbuflen[PETSCSF_REMOTE])) { 1148*cd620004SJunchao Zhang cudaError_t cerr = cudaStreamSynchronize(link->stream);CHKERRCUDA(cerr); 1149*cd620004SJunchao Zhang } 1150*cd620004SJunchao Zhang PetscFunctionReturn(0); 1151*cd620004SJunchao Zhang } 1152*cd620004SJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkSyncStreamAfterUnpackLeafData(PetscSF sf,PetscSFLink link) 1153*cd620004SJunchao Zhang { 1154*cd620004SJunchao Zhang PetscFunctionBegin; 1155*cd620004SJunchao Zhang if (sf->use_default_stream) PetscFunctionReturn(0); 1156*cd620004SJunchao Zhang if (link->leafmtype == PETSC_MEMTYPE_DEVICE && (sf->leafbuflen[PETSCSF_LOCAL] || sf->leafbuflen[PETSCSF_REMOTE])) { 1157*cd620004SJunchao Zhang cudaError_t cerr = cudaStreamSynchronize(link->stream);CHKERRCUDA(cerr); 1158*cd620004SJunchao Zhang } 1159*cd620004SJunchao Zhang PetscFunctionReturn(0); 1160*cd620004SJunchao Zhang } 1161*cd620004SJunchao Zhang 1162*cd620004SJunchao Zhang /* PetscSFLinkCopyXxxxBufferInCaseNotUseGpuAwareMPI routines are simple: if not use_gpu_aware_mpi, we need 1163*cd620004SJunchao Zhang to copy the buffer from GPU to CPU before MPI calls, and from CPU to GPU after MPI calls. 1164*cd620004SJunchao Zhang */ 1165*cd620004SJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(PetscSF sf,PetscSFLink link,PetscBool device2host) 1166*cd620004SJunchao Zhang { 1167*cd620004SJunchao Zhang PetscErrorCode ierr; 1168*cd620004SJunchao Zhang cudaError_t cerr; 1169*cd620004SJunchao Zhang PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 1170*cd620004SJunchao Zhang 1171*cd620004SJunchao Zhang PetscFunctionBegin; 1172*cd620004SJunchao Zhang if (link->rootmtype == PETSC_MEMTYPE_DEVICE && (link->rootmtype_mpi != link->rootmtype) && bas->rootbuflen[PETSCSF_REMOTE]) { 1173*cd620004SJunchao Zhang void *h_buf = link->rootbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]; 1174*cd620004SJunchao Zhang void *d_buf = link->rootbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_DEVICE]; 1175*cd620004SJunchao Zhang size_t count = bas->rootbuflen[PETSCSF_REMOTE]*link->unitbytes; 1176*cd620004SJunchao Zhang if (device2host) { 1177*cd620004SJunchao Zhang cerr = cudaMemcpyAsync(h_buf,d_buf,count,cudaMemcpyDeviceToHost,link->stream);CHKERRCUDA(cerr); 1178*cd620004SJunchao Zhang ierr = PetscLogGpuToCpu(count);CHKERRQ(ierr); 1179*cd620004SJunchao Zhang } else { 1180*cd620004SJunchao Zhang cerr = cudaMemcpyAsync(d_buf,h_buf,count,cudaMemcpyHostToDevice,link->stream);CHKERRCUDA(cerr); 1181*cd620004SJunchao Zhang ierr = PetscLogCpuToGpu(count);CHKERRQ(ierr); 1182*cd620004SJunchao Zhang } 1183*cd620004SJunchao Zhang } 1184*cd620004SJunchao Zhang PetscFunctionReturn(0); 1185*cd620004SJunchao Zhang } 1186*cd620004SJunchao Zhang 1187*cd620004SJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(PetscSF sf,PetscSFLink link,PetscBool device2host) 1188*cd620004SJunchao Zhang { 1189*cd620004SJunchao Zhang PetscErrorCode ierr; 1190*cd620004SJunchao Zhang cudaError_t cerr; 1191*cd620004SJunchao Zhang 1192*cd620004SJunchao Zhang PetscFunctionBegin; 1193*cd620004SJunchao Zhang if (link->leafmtype == PETSC_MEMTYPE_DEVICE && (link->leafmtype_mpi != link->leafmtype) && sf->leafbuflen[PETSCSF_REMOTE]) { 1194*cd620004SJunchao Zhang void *h_buf = link->leafbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]; 1195*cd620004SJunchao Zhang void *d_buf = link->leafbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_DEVICE]; 1196*cd620004SJunchao Zhang size_t count = sf->leafbuflen[PETSCSF_REMOTE]*link->unitbytes; 1197*cd620004SJunchao Zhang if (device2host) { 1198*cd620004SJunchao Zhang cerr = cudaMemcpyAsync(h_buf,d_buf,count,cudaMemcpyDeviceToHost,link->stream);CHKERRCUDA(cerr); 1199*cd620004SJunchao Zhang ierr = PetscLogGpuToCpu(count);CHKERRQ(ierr); 1200*cd620004SJunchao Zhang } else { 1201*cd620004SJunchao Zhang cerr = cudaMemcpyAsync(d_buf,h_buf,count,cudaMemcpyHostToDevice,link->stream);CHKERRCUDA(cerr); 1202*cd620004SJunchao Zhang ierr = PetscLogCpuToGpu(count);CHKERRQ(ierr); 1203*cd620004SJunchao Zhang } 1204*cd620004SJunchao Zhang } 1205*cd620004SJunchao Zhang PetscFunctionReturn(0); 1206*cd620004SJunchao Zhang } 1207*cd620004SJunchao Zhang #else 1208*cd620004SJunchao Zhang 1209*cd620004SJunchao Zhang #define PetscSFLinkSyncDeviceBeforePackData(a,b) 0 1210*cd620004SJunchao Zhang #define PetscSFLinkSyncStreamAfterPackRootData(a,b) 0 1211*cd620004SJunchao Zhang #define PetscSFLinkSyncStreamAfterPackLeafData(a,b) 0 1212*cd620004SJunchao Zhang #define PetscSFLinkSyncStreamAfterUnpackRootData(a,b) 0 1213*cd620004SJunchao Zhang #define PetscSFLinkSyncStreamAfterUnpackLeafData(a,b) 0 1214*cd620004SJunchao Zhang #define PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(a,b,c) 0 1215*cd620004SJunchao Zhang #define PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(a,b,c) 0 1216*cd620004SJunchao Zhang 1217*cd620004SJunchao Zhang #endif 1218*cd620004SJunchao Zhang 1219*cd620004SJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkLogFlopsAfterUnpackRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,MPI_Op op) 1220*cd620004SJunchao Zhang { 1221*cd620004SJunchao Zhang PetscErrorCode ierr; 1222*cd620004SJunchao Zhang PetscLogDouble flops; 1223*cd620004SJunchao Zhang PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 1224*cd620004SJunchao Zhang 1225*cd620004SJunchao Zhang 1226*cd620004SJunchao Zhang PetscFunctionBegin; 1227*cd620004SJunchao Zhang if (op != MPIU_REPLACE && link->basicunit == MPIU_SCALAR) { /* op is a reduction on PetscScalars */ 1228*cd620004SJunchao Zhang flops = bas->rootbuflen[scope]*link->bs; /* # of roots in buffer x # of scalars in unit */ 1229*cd620004SJunchao Zhang #if defined(PETSC_HAVE_CUDA) 1230*cd620004SJunchao Zhang if (link->rootmtype == PETSC_MEMTYPE_DEVICE) {ierr = PetscLogGpuFlops(flops);CHKERRQ(ierr);} else 1231*cd620004SJunchao Zhang #endif 1232*cd620004SJunchao Zhang {ierr = PetscLogFlops(flops);CHKERRQ(ierr);} 1233*cd620004SJunchao Zhang } 1234*cd620004SJunchao Zhang PetscFunctionReturn(0); 1235*cd620004SJunchao Zhang } 1236*cd620004SJunchao Zhang 1237*cd620004SJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkLogFlopsAfterUnpackLeafData(PetscSF sf,PetscSFLink link,PetscSFScope scope,MPI_Op op) 1238*cd620004SJunchao Zhang { 1239*cd620004SJunchao Zhang PetscLogDouble flops; 1240*cd620004SJunchao Zhang PetscErrorCode ierr; 1241*cd620004SJunchao Zhang 1242*cd620004SJunchao Zhang PetscFunctionBegin; 1243*cd620004SJunchao Zhang if (op != MPIU_REPLACE && link->basicunit == MPIU_SCALAR) { /* op is a reduction on PetscScalars */ 1244*cd620004SJunchao Zhang flops = sf->leafbuflen[scope]*link->bs; /* # of roots in buffer x # of scalars in unit */ 1245*cd620004SJunchao Zhang #if defined(PETSC_HAVE_CUDA) 1246*cd620004SJunchao Zhang if (link->leafmtype == PETSC_MEMTYPE_DEVICE) {ierr = PetscLogGpuFlops(flops);CHKERRQ(ierr);} else 1247*cd620004SJunchao Zhang #endif 1248*cd620004SJunchao Zhang {ierr = PetscLogFlops(flops);CHKERRQ(ierr);} 1249*cd620004SJunchao Zhang } 1250*cd620004SJunchao Zhang PetscFunctionReturn(0); 1251*cd620004SJunchao Zhang } 1252*cd620004SJunchao Zhang 1253*cd620004SJunchao Zhang /* When SF could not find a proper UnpackAndOp() from link, it falls back to MPI_Reduce_local. 1254*cd620004SJunchao Zhang Input Arguments: 1255*cd620004SJunchao Zhang +sf - The StarForest 1256*cd620004SJunchao Zhang .link - The link 1257*cd620004SJunchao Zhang .count - Number of entries to unpack 1258*cd620004SJunchao Zhang .start - The first index, significent when indices=NULL 1259*cd620004SJunchao Zhang .indices - Indices of entries in <data>. If NULL, it means indices are contiguous and the first is given in <start> 1260*cd620004SJunchao Zhang .buf - A contiguous buffer to unpack from 1261*cd620004SJunchao Zhang -op - Operation after unpack 1262*cd620004SJunchao Zhang 1263*cd620004SJunchao Zhang Output Arguments: 1264*cd620004SJunchao Zhang .data - The data to unpack to 1265*cd620004SJunchao Zhang */ 1266*cd620004SJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkUnpackDataWithMPIReduceLocal(PetscSF sf,PetscSFLink link,PetscInt count,PetscInt start,const PetscInt *indices,void *data,const void *buf,MPI_Op op) 1267*cd620004SJunchao Zhang { 1268*cd620004SJunchao Zhang PetscFunctionBegin; 1269*cd620004SJunchao Zhang #if defined(PETSC_HAVE_MPI_REDUCE_LOCAL) 1270*cd620004SJunchao Zhang { 1271*cd620004SJunchao Zhang PetscErrorCode ierr; 1272*cd620004SJunchao Zhang PetscInt i; 1273*cd620004SJunchao Zhang PetscMPIInt n; 1274*cd620004SJunchao Zhang if (indices) { 1275*cd620004SJunchao Zhang /* Note we use link->unit instead of link->basicunit. When op can be mapped to MPI_SUM etc, it operates on 1276*cd620004SJunchao Zhang basic units of a root/leaf element-wisely. Otherwise, it is meant to operate on a whole root/leaf. 1277*cd620004SJunchao Zhang */ 1278*cd620004SJunchao Zhang for (i=0; i<count; i++) {ierr = MPI_Reduce_local((const char*)buf+i*link->unitbytes,(char*)data+indices[i]*link->unitbytes,1,link->unit,op);CHKERRQ(ierr);} 1279*cd620004SJunchao Zhang } else { 1280*cd620004SJunchao Zhang ierr = PetscMPIIntCast(count,&n);CHKERRQ(ierr); 1281*cd620004SJunchao Zhang ierr = MPI_Reduce_local(buf,(char*)data+start*link->unitbytes,n,link->unit,op);CHKERRQ(ierr); 1282*cd620004SJunchao Zhang } 1283*cd620004SJunchao Zhang } 1284*cd620004SJunchao Zhang #else 1285*cd620004SJunchao Zhang SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No unpacking reduction operation for this MPI_Op"); 1286*cd620004SJunchao Zhang #endif 1287*cd620004SJunchao Zhang PetscFunctionReturn(0); 1288*cd620004SJunchao Zhang } 1289*cd620004SJunchao Zhang 1290*cd620004SJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkScatterDataWithMPIReduceLocal(PetscSF sf,PetscSFLink link,PetscInt count,const PetscInt *idx,const void *xdata,PetscInt starty,const PetscInt *idy,void *ydata,MPI_Op op) 1291*cd620004SJunchao Zhang { 1292*cd620004SJunchao Zhang PetscFunctionBegin; 1293*cd620004SJunchao Zhang #if defined(PETSC_HAVE_MPI_REDUCE_LOCAL) 1294*cd620004SJunchao Zhang { 1295*cd620004SJunchao Zhang PetscErrorCode ierr; 1296*cd620004SJunchao Zhang PetscInt i,disp; 1297*cd620004SJunchao Zhang for (i=0; i<count; i++) { 1298*cd620004SJunchao Zhang disp = idy? idy[i] : starty + i; 1299*cd620004SJunchao Zhang ierr = MPI_Reduce_local((const char*)xdata+idx[i]*link->unitbytes,(char*)ydata+disp*link->unitbytes,1,link->unit,op);CHKERRQ(ierr); 1300*cd620004SJunchao Zhang } 1301*cd620004SJunchao Zhang } 1302*cd620004SJunchao Zhang #else 1303*cd620004SJunchao Zhang SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No unpacking reduction operation for this MPI_Op"); 1304*cd620004SJunchao Zhang #endif 1305*cd620004SJunchao Zhang PetscFunctionReturn(0); 1306*cd620004SJunchao Zhang } 1307*cd620004SJunchao Zhang 1308*cd620004SJunchao Zhang /*============================================================================= 1309*cd620004SJunchao Zhang Pack/Unpack/Fetch/Scatter routines 1310*cd620004SJunchao Zhang ============================================================================*/ 1311*cd620004SJunchao Zhang 1312*cd620004SJunchao Zhang /* Pack rootdata to rootbuf 1313*cd620004SJunchao Zhang Input Arguments: 1314*cd620004SJunchao Zhang + sf - The SF this packing works on. 1315*cd620004SJunchao Zhang . link - It gives the memtype of the roots and also provides root buffer. 1316*cd620004SJunchao Zhang . scope - PETSCSF_LOCAL or PETSCSF_REMOTE. Note SF has the ability to do local and remote communications separately. 1317*cd620004SJunchao Zhang - rootdata - Where to read the roots. 1318*cd620004SJunchao Zhang 1319*cd620004SJunchao Zhang Notes: 1320*cd620004SJunchao Zhang When rootdata can be directly used as root buffer, the routine is almost a no-op. After the call, root data is 1321*cd620004SJunchao Zhang in a place where the underlying MPI is ready can access (use_gpu_aware_mpi or not) 1322*cd620004SJunchao Zhang */ 1323*cd620004SJunchao Zhang PetscErrorCode PetscSFLinkPackRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,const void *rootdata) 1324*cd620004SJunchao Zhang { 1325*cd620004SJunchao Zhang PetscErrorCode ierr; 1326*cd620004SJunchao Zhang PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 1327*cd620004SJunchao Zhang const PetscInt *rootindices = NULL; 1328*cd620004SJunchao Zhang PetscInt count,start; 1329*cd620004SJunchao Zhang PetscErrorCode (*Pack)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,const void*,void*) = NULL; 1330*cd620004SJunchao Zhang PetscMemType rootmtype = link->rootmtype; 1331*cd620004SJunchao Zhang 1332*cd620004SJunchao Zhang PetscFunctionBegin; 1333*cd620004SJunchao Zhang ierr = PetscLogEventBegin(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr); 1334*cd620004SJunchao Zhang if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkSyncDeviceBeforePackData(sf,link);CHKERRQ(ierr);} 1335*cd620004SJunchao Zhang if (!link->rootdirect[scope] && bas->rootbuflen[scope]) { /* If rootdata works directly as rootbuf, skip packing */ 1336*cd620004SJunchao Zhang ierr = PetscSFLinkGetRootIndices(sf,link,rootmtype,scope,&count,&start,&rootindices);CHKERRQ(ierr); 1337*cd620004SJunchao Zhang ierr = PetscSFLinkGetPack(link,rootmtype,&Pack);CHKERRQ(ierr); 1338*cd620004SJunchao Zhang ierr = (*Pack)(link,count,start,rootindices,bas->rootpackopt[scope],rootdata,link->rootbuf[scope][rootmtype]);CHKERRQ(ierr); 1339*cd620004SJunchao Zhang } 1340*cd620004SJunchao Zhang if (scope == PETSCSF_REMOTE) { 1341*cd620004SJunchao Zhang ierr = PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_TRUE/*device2host*/);CHKERRQ(ierr); 1342*cd620004SJunchao Zhang ierr = PetscSFLinkSyncStreamAfterPackRootData(sf,link);CHKERRQ(ierr); 1343*cd620004SJunchao Zhang } 1344*cd620004SJunchao Zhang ierr = PetscLogEventEnd(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr); 1345*cd620004SJunchao Zhang PetscFunctionReturn(0); 1346*cd620004SJunchao Zhang } 1347*cd620004SJunchao Zhang 1348*cd620004SJunchao Zhang /* Pack leafdata to leafbuf */ 1349*cd620004SJunchao Zhang PetscErrorCode PetscSFLinkPackLeafData(PetscSF sf,PetscSFLink link,PetscSFScope scope,const void *leafdata) 1350*cd620004SJunchao Zhang { 1351*cd620004SJunchao Zhang PetscErrorCode ierr; 1352*cd620004SJunchao Zhang const PetscInt *leafindices = NULL; 1353*cd620004SJunchao Zhang PetscInt count,start; 1354*cd620004SJunchao Zhang PetscErrorCode (*Pack)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,const void*,void*) = NULL; 1355*cd620004SJunchao Zhang PetscMemType leafmtype = link->leafmtype; 1356*cd620004SJunchao Zhang 1357*cd620004SJunchao Zhang PetscFunctionBegin; 1358*cd620004SJunchao Zhang ierr = PetscLogEventBegin(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr); 1359*cd620004SJunchao Zhang if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkSyncDeviceBeforePackData(sf,link);CHKERRQ(ierr);} 1360*cd620004SJunchao Zhang if (!link->leafdirect[scope] && sf->leafbuflen[scope]) { /* If leafdata works directly as rootbuf, skip packing */ 1361*cd620004SJunchao Zhang ierr = PetscSFLinkGetLeafIndices(sf,link,leafmtype,scope,&count,&start,&leafindices);CHKERRQ(ierr); 1362*cd620004SJunchao Zhang ierr = PetscSFLinkGetPack(link,leafmtype,&Pack);CHKERRQ(ierr); 1363*cd620004SJunchao Zhang ierr = (*Pack)(link,count,start,leafindices,sf->leafpackopt[scope],leafdata,link->leafbuf[scope][leafmtype]);CHKERRQ(ierr); 1364*cd620004SJunchao Zhang } 1365*cd620004SJunchao Zhang if (scope == PETSCSF_REMOTE) { 1366*cd620004SJunchao Zhang ierr = PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_TRUE/*device2host*/);CHKERRQ(ierr); 1367*cd620004SJunchao Zhang ierr = PetscSFLinkSyncStreamAfterPackLeafData(sf,link);CHKERRQ(ierr); 1368*cd620004SJunchao Zhang } 1369*cd620004SJunchao Zhang ierr = PetscLogEventEnd(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr); 1370*cd620004SJunchao Zhang PetscFunctionReturn(0); 1371*cd620004SJunchao Zhang } 1372*cd620004SJunchao Zhang 1373*cd620004SJunchao Zhang /* Unpack rootbuf to rootdata */ 1374*cd620004SJunchao Zhang PetscErrorCode PetscSFLinkUnpackRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,void *rootdata,MPI_Op op) 1375*cd620004SJunchao Zhang { 1376*cd620004SJunchao Zhang PetscErrorCode ierr; 1377*cd620004SJunchao Zhang const PetscInt *rootindices = NULL; 1378*cd620004SJunchao Zhang PetscInt count,start; 1379*cd620004SJunchao Zhang PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 1380*cd620004SJunchao Zhang PetscErrorCode (*UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*) = NULL; 1381*cd620004SJunchao Zhang PetscMemType rootmtype = link->rootmtype; 1382*cd620004SJunchao Zhang 1383*cd620004SJunchao Zhang PetscFunctionBegin; 1384*cd620004SJunchao Zhang ierr = PetscLogEventBegin(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr); 1385*cd620004SJunchao Zhang if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_FALSE);CHKERRQ(ierr);} 1386*cd620004SJunchao Zhang if (!link->rootdirect[scope] && bas->rootbuflen[scope]) { /* If rootdata works directly as rootbuf, skip unpacking */ 1387*cd620004SJunchao Zhang ierr = PetscSFLinkGetUnpackAndOp(link,rootmtype,op,bas->rootdups[scope],&UnpackAndOp);CHKERRQ(ierr); 1388*cd620004SJunchao Zhang if (UnpackAndOp) { 1389*cd620004SJunchao Zhang ierr = PetscSFLinkGetRootIndices(sf,link,rootmtype,scope,&count,&start,&rootindices);CHKERRQ(ierr); 1390*cd620004SJunchao Zhang ierr = (*UnpackAndOp)(link,count,start,rootindices,bas->rootpackopt[scope],rootdata,link->rootbuf[scope][rootmtype]);CHKERRQ(ierr); 1391*cd620004SJunchao Zhang } else { 1392*cd620004SJunchao Zhang ierr = PetscSFLinkGetRootIndices(sf,link,PETSC_MEMTYPE_HOST,scope,&count,&start,&rootindices);CHKERRQ(ierr); 1393*cd620004SJunchao Zhang ierr = PetscSFLinkUnpackDataWithMPIReduceLocal(sf,link,count,start,rootindices,rootdata,link->rootbuf[scope][rootmtype],op);CHKERRQ(ierr); 1394*cd620004SJunchao Zhang } 1395*cd620004SJunchao Zhang } 1396*cd620004SJunchao Zhang if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkSyncStreamAfterUnpackRootData(sf,link);CHKERRQ(ierr);} 1397*cd620004SJunchao Zhang ierr = PetscSFLinkLogFlopsAfterUnpackRootData(sf,link,scope,op);CHKERRQ(ierr); 1398*cd620004SJunchao Zhang ierr = PetscLogEventEnd(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr); 1399*cd620004SJunchao Zhang PetscFunctionReturn(0); 1400*cd620004SJunchao Zhang } 1401*cd620004SJunchao Zhang 1402*cd620004SJunchao Zhang /* Unpack leafbuf to leafdata */ 1403*cd620004SJunchao Zhang PetscErrorCode PetscSFLinkUnpackLeafData(PetscSF sf,PetscSFLink link,PetscSFScope scope,void *leafdata,MPI_Op op) 1404*cd620004SJunchao Zhang { 1405*cd620004SJunchao Zhang PetscErrorCode ierr; 1406*cd620004SJunchao Zhang const PetscInt *leafindices = NULL; 1407*cd620004SJunchao Zhang PetscInt count,start; 1408*cd620004SJunchao Zhang PetscErrorCode (*UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*) = NULL; 1409*cd620004SJunchao Zhang PetscMemType leafmtype = link->leafmtype; 1410*cd620004SJunchao Zhang 1411*cd620004SJunchao Zhang PetscFunctionBegin; 1412*cd620004SJunchao Zhang ierr = PetscLogEventBegin(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr); 1413*cd620004SJunchao Zhang if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_FALSE);CHKERRQ(ierr);} 1414*cd620004SJunchao Zhang if (!link->leafdirect[scope] && sf->leafbuflen[scope]) { /* If leafdata works directly as rootbuf, skip unpacking */ 1415*cd620004SJunchao Zhang ierr = PetscSFLinkGetUnpackAndOp(link,leafmtype,op,sf->leafdups[scope],&UnpackAndOp);CHKERRQ(ierr); 1416*cd620004SJunchao Zhang if (UnpackAndOp) { 1417*cd620004SJunchao Zhang ierr = PetscSFLinkGetLeafIndices(sf,link,leafmtype,scope,&count,&start,&leafindices);CHKERRQ(ierr); 1418*cd620004SJunchao Zhang ierr = (*UnpackAndOp)(link,count,start,leafindices,sf->leafpackopt[scope],leafdata,link->leafbuf[scope][leafmtype]);CHKERRQ(ierr); 1419*cd620004SJunchao Zhang } else { 1420*cd620004SJunchao Zhang ierr = PetscSFLinkGetLeafIndices(sf,link,PETSC_MEMTYPE_HOST,scope,&count,&start,&leafindices);CHKERRQ(ierr); 1421*cd620004SJunchao Zhang ierr = PetscSFLinkUnpackDataWithMPIReduceLocal(sf,link,count,start,leafindices,leafdata,link->leafbuf[scope][leafmtype],op);CHKERRQ(ierr); 1422*cd620004SJunchao Zhang } 1423*cd620004SJunchao Zhang } 1424*cd620004SJunchao Zhang if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkSyncStreamAfterUnpackLeafData(sf,link);CHKERRQ(ierr);} 1425*cd620004SJunchao Zhang ierr = PetscSFLinkLogFlopsAfterUnpackLeafData(sf,link,scope,op);CHKERRQ(ierr); 1426*cd620004SJunchao Zhang ierr = PetscLogEventEnd(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr); 1427*cd620004SJunchao Zhang PetscFunctionReturn(0); 1428*cd620004SJunchao Zhang } 1429*cd620004SJunchao Zhang 1430*cd620004SJunchao Zhang /* FetchAndOp rootdata with rootbuf */ 1431*cd620004SJunchao Zhang PetscErrorCode PetscSFLinkFetchRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,void *rootdata,MPI_Op op) 1432*cd620004SJunchao Zhang { 1433*cd620004SJunchao Zhang PetscErrorCode ierr; 1434*cd620004SJunchao Zhang const PetscInt *rootindices = NULL; 1435*cd620004SJunchao Zhang PetscInt count,start; 1436*cd620004SJunchao Zhang PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 1437*cd620004SJunchao Zhang PetscErrorCode (*FetchAndOp)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,void*) = NULL; 1438*cd620004SJunchao Zhang PetscMemType rootmtype = link->rootmtype; 1439*cd620004SJunchao Zhang 1440*cd620004SJunchao Zhang PetscFunctionBegin; 1441*cd620004SJunchao Zhang ierr = PetscLogEventBegin(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr); 1442*cd620004SJunchao Zhang if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_FALSE);CHKERRQ(ierr);} 1443*cd620004SJunchao Zhang if (bas->rootbuflen[scope]) { 1444*cd620004SJunchao Zhang /* Do FetchAndOp on rootdata with rootbuf */ 1445*cd620004SJunchao Zhang ierr = PetscSFLinkGetFetchAndOp(link,rootmtype,op,bas->rootdups[scope],&FetchAndOp);CHKERRQ(ierr); 1446*cd620004SJunchao Zhang ierr = PetscSFLinkGetRootIndices(sf,link,rootmtype,scope,&count,&start,&rootindices);CHKERRQ(ierr); 1447*cd620004SJunchao Zhang ierr = (*FetchAndOp)(link,count,start,rootindices,bas->rootpackopt[scope],rootdata,link->rootbuf[scope][rootmtype]);CHKERRQ(ierr); 1448*cd620004SJunchao Zhang } 1449*cd620004SJunchao Zhang if (scope == PETSCSF_REMOTE) { 1450*cd620004SJunchao Zhang ierr = PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_TRUE);CHKERRQ(ierr); 1451*cd620004SJunchao Zhang ierr = PetscSFLinkSyncStreamAfterUnpackRootData(sf,link);CHKERRQ(ierr); 1452*cd620004SJunchao Zhang } 1453*cd620004SJunchao Zhang ierr = PetscSFLinkLogFlopsAfterUnpackRootData(sf,link,scope,op);CHKERRQ(ierr); 1454*cd620004SJunchao Zhang ierr = PetscLogEventEnd(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr); 1455*cd620004SJunchao Zhang PetscFunctionReturn(0); 1456*cd620004SJunchao Zhang } 1457*cd620004SJunchao Zhang 1458*cd620004SJunchao Zhang /* Bcast rootdata to leafdata locally (i.e., only for local communication - PETSCSF_LOCAL) */ 1459*cd620004SJunchao Zhang PetscErrorCode PetscSFLinkBcastAndOpLocal(PetscSF sf,PetscSFLink link,const void *rootdata,void *leafdata,MPI_Op op) 1460*cd620004SJunchao Zhang { 1461*cd620004SJunchao Zhang PetscErrorCode ierr; 1462*cd620004SJunchao Zhang const PetscInt *rootindices = NULL,*leafindices = NULL; 1463*cd620004SJunchao Zhang PetscInt count,rootstart,leafstart; 1464*cd620004SJunchao Zhang PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 1465*cd620004SJunchao Zhang PetscErrorCode (*UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*) = NULL; 1466*cd620004SJunchao Zhang PetscErrorCode (*ScatterAndOp)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*) = NULL; 1467*cd620004SJunchao Zhang const PetscMemType rootmtype = link->rootmtype,leafmtype = link->leafmtype; 1468*cd620004SJunchao Zhang 1469*cd620004SJunchao Zhang PetscFunctionBegin; 1470*cd620004SJunchao Zhang if (!bas->rootbuflen[PETSCSF_LOCAL]) PetscFunctionReturn(0); 1471*cd620004SJunchao Zhang if (rootmtype != leafmtype) { /* Uncommon case */ 1472*cd620004SJunchao Zhang /* The local communication has to go through pack and unpack */ 1473*cd620004SJunchao Zhang ierr = PetscSFLinkPackRootData(sf,link,PETSCSF_LOCAL,rootdata);CHKERRQ(ierr); 1474*cd620004SJunchao Zhang ierr = PetscMemcpyWithMemType(leafmtype,rootmtype,link->leafbuf[PETSCSF_LOCAL][leafmtype],link->rootbuf[PETSCSF_LOCAL][rootmtype],sf->leafbuflen[PETSCSF_LOCAL]*link->unitbytes);CHKERRQ(ierr); 1475*cd620004SJunchao Zhang ierr = PetscSFLinkUnpackLeafData(sf,link,PETSCSF_LOCAL,leafdata,op);CHKERRQ(ierr); 1476*cd620004SJunchao Zhang } else { 1477*cd620004SJunchao Zhang if (bas->rootcontig[PETSCSF_LOCAL]) { /* If root indices are contiguous, Scatter becomes Unpack */ 1478*cd620004SJunchao Zhang ierr = PetscSFLinkGetUnpackAndOp(link,leafmtype,op,sf->leafdups[PETSCSF_LOCAL],&UnpackAndOp);CHKERRQ(ierr); 1479*cd620004SJunchao Zhang rootdata = (const char*)rootdata + bas->rootstart[PETSCSF_LOCAL]*link->unitbytes; /* Make rootdata point to start of the buffer */ 1480*cd620004SJunchao Zhang if (UnpackAndOp) { 1481*cd620004SJunchao Zhang ierr = PetscSFLinkGetLeafIndices(sf,link,leafmtype,PETSCSF_LOCAL,&count,&leafstart,&leafindices);CHKERRQ(ierr); 1482*cd620004SJunchao Zhang ierr = (*UnpackAndOp)(link,count,leafstart,leafindices,sf->leafpackopt[PETSCSF_LOCAL],leafdata,rootdata);CHKERRQ(ierr); 1483*cd620004SJunchao Zhang } else { 1484*cd620004SJunchao Zhang ierr = PetscSFLinkGetLeafIndices(sf,link,PETSC_MEMTYPE_HOST,PETSCSF_LOCAL,&count,&leafstart,&leafindices);CHKERRQ(ierr); 1485*cd620004SJunchao Zhang ierr = PetscSFLinkUnpackDataWithMPIReduceLocal(sf,link,count,leafstart,leafindices,leafdata,rootdata,op);CHKERRQ(ierr); 1486*cd620004SJunchao Zhang } 1487*cd620004SJunchao Zhang } else { /* ScatterAndOp */ 1488*cd620004SJunchao Zhang ierr = PetscSFLinkGetScatterAndOp(link,leafmtype,op,sf->leafdups[PETSCSF_LOCAL],&ScatterAndOp);CHKERRQ(ierr); 1489*cd620004SJunchao Zhang if (ScatterAndOp) { 1490*cd620004SJunchao Zhang ierr = PetscSFLinkGetRootIndices(sf,link,rootmtype,PETSCSF_LOCAL,&count,&rootstart,&rootindices);CHKERRQ(ierr); 1491*cd620004SJunchao Zhang ierr = PetscSFLinkGetLeafIndices(sf,link,leafmtype,PETSCSF_LOCAL,&count,&leafstart,&leafindices);CHKERRQ(ierr); 1492*cd620004SJunchao Zhang ierr = (*ScatterAndOp)(link,count,rootstart,rootindices,rootdata,leafstart,leafindices,leafdata);CHKERRQ(ierr); 1493*cd620004SJunchao Zhang } else { 1494*cd620004SJunchao Zhang ierr = PetscSFLinkGetRootIndices(sf,link,PETSC_MEMTYPE_HOST,PETSCSF_LOCAL,&count,NULL,&rootindices);CHKERRQ(ierr); 1495*cd620004SJunchao Zhang ierr = PetscSFLinkGetLeafIndices(sf,link,PETSC_MEMTYPE_HOST,PETSCSF_LOCAL,NULL,&leafstart,&leafindices);CHKERRQ(ierr); 1496*cd620004SJunchao Zhang ierr = PetscSFLinkScatterDataWithMPIReduceLocal(sf,link,count,rootindices,rootdata,leafstart,leafindices,leafdata,op);CHKERRQ(ierr); 1497*cd620004SJunchao Zhang } 1498*cd620004SJunchao Zhang } 1499*cd620004SJunchao Zhang } 1500*cd620004SJunchao Zhang PetscFunctionReturn(0); 1501*cd620004SJunchao Zhang } 1502*cd620004SJunchao Zhang 1503*cd620004SJunchao Zhang /* Reduce leafdata to rootdata locally */ 1504*cd620004SJunchao Zhang PetscErrorCode PetscSFLinkReduceLocal(PetscSF sf,PetscSFLink link,const void *leafdata,void *rootdata,MPI_Op op) 1505*cd620004SJunchao Zhang { 1506*cd620004SJunchao Zhang PetscErrorCode ierr; 1507*cd620004SJunchao Zhang const PetscInt *rootindices = NULL,*leafindices = NULL; 1508*cd620004SJunchao Zhang PetscInt count,rootstart,leafstart; 1509*cd620004SJunchao Zhang PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 1510*cd620004SJunchao Zhang PetscErrorCode (*UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*) = NULL; 1511*cd620004SJunchao Zhang PetscErrorCode (*ScatterAndOp)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*) = NULL; 1512*cd620004SJunchao Zhang const PetscMemType rootmtype = link->rootmtype,leafmtype = link->leafmtype; 1513*cd620004SJunchao Zhang 1514*cd620004SJunchao Zhang PetscFunctionBegin; 1515*cd620004SJunchao Zhang if (!sf->leafbuflen[PETSCSF_LOCAL]) PetscFunctionReturn(0); 1516*cd620004SJunchao Zhang if (rootmtype != leafmtype) { 1517*cd620004SJunchao Zhang /* The local communication has to go through pack and unpack */ 1518*cd620004SJunchao Zhang ierr = PetscSFLinkPackLeafData(sf,link,PETSCSF_LOCAL,leafdata);CHKERRQ(ierr); 1519*cd620004SJunchao Zhang ierr = PetscMemcpyWithMemType(rootmtype,leafmtype,link->rootbuf[PETSCSF_LOCAL][rootmtype],link->leafbuf[PETSCSF_LOCAL][leafmtype],bas->rootbuflen[PETSCSF_LOCAL]*link->unitbytes);CHKERRQ(ierr); 1520*cd620004SJunchao Zhang ierr = PetscSFLinkUnpackRootData(sf,link,PETSCSF_LOCAL,rootdata,op);CHKERRQ(ierr); 1521*cd620004SJunchao Zhang } else { 1522*cd620004SJunchao Zhang if (sf->leafcontig[PETSCSF_LOCAL]) { 1523*cd620004SJunchao Zhang /* If leaf indices are contiguous, Scatter becomes Unpack */ 1524*cd620004SJunchao Zhang ierr = PetscSFLinkGetUnpackAndOp(link,rootmtype,op,bas->rootdups[PETSCSF_LOCAL],&UnpackAndOp);CHKERRQ(ierr); 1525*cd620004SJunchao Zhang leafdata = (const char*)leafdata + sf->leafstart[PETSCSF_LOCAL]*link->unitbytes; /* Make leafdata point to start of the buffer */ 1526*cd620004SJunchao Zhang if (UnpackAndOp) { 1527*cd620004SJunchao Zhang ierr = PetscSFLinkGetRootIndices(sf,link,rootmtype,PETSCSF_LOCAL,&count,&rootstart,&rootindices);CHKERRQ(ierr); 1528*cd620004SJunchao Zhang ierr = (*UnpackAndOp)(link,count,rootstart,rootindices,bas->rootpackopt[PETSCSF_LOCAL],rootdata,leafdata);CHKERRQ(ierr); 1529*cd620004SJunchao Zhang } else { 1530*cd620004SJunchao Zhang ierr = PetscSFLinkGetRootIndices(sf,link,PETSC_MEMTYPE_HOST,PETSCSF_LOCAL,&count,&rootstart,&rootindices);CHKERRQ(ierr); 1531*cd620004SJunchao Zhang ierr = PetscSFLinkUnpackDataWithMPIReduceLocal(sf,link,count,rootstart,rootindices,rootdata,leafdata,op);CHKERRQ(ierr); 1532*cd620004SJunchao Zhang } 1533*cd620004SJunchao Zhang } else { 1534*cd620004SJunchao Zhang ierr = PetscSFLinkGetScatterAndOp(link,rootmtype,op,bas->rootdups[PETSCSF_LOCAL],&ScatterAndOp);CHKERRQ(ierr); 1535*cd620004SJunchao Zhang if (ScatterAndOp) { 1536*cd620004SJunchao Zhang ierr = PetscSFLinkGetRootIndices(sf,link,rootmtype,PETSCSF_LOCAL,&count,&rootstart,&rootindices);CHKERRQ(ierr); 1537*cd620004SJunchao Zhang ierr = PetscSFLinkGetLeafIndices(sf,link,leafmtype,PETSCSF_LOCAL,NULL,&leafstart,&leafindices);CHKERRQ(ierr); 1538*cd620004SJunchao Zhang ierr = (*ScatterAndOp)(link,count,leafstart,leafindices,leafdata,rootstart,rootindices,rootdata);CHKERRQ(ierr); 1539*cd620004SJunchao Zhang } else { 1540*cd620004SJunchao Zhang ierr = PetscSFLinkGetRootIndices(sf,link,PETSC_MEMTYPE_HOST,PETSCSF_LOCAL,&count,&rootstart,&rootindices);CHKERRQ(ierr); 1541*cd620004SJunchao Zhang ierr = PetscSFLinkGetLeafIndices(sf,link,PETSC_MEMTYPE_HOST,PETSCSF_LOCAL,NULL,NULL,&leafindices);CHKERRQ(ierr); 1542*cd620004SJunchao Zhang ierr = PetscSFLinkScatterDataWithMPIReduceLocal(sf,link,count,leafindices,leafdata,rootstart,rootindices,rootdata,op);CHKERRQ(ierr); 1543*cd620004SJunchao Zhang } 1544*cd620004SJunchao Zhang } 1545*cd620004SJunchao Zhang } 1546*cd620004SJunchao Zhang PetscFunctionReturn(0); 1547*cd620004SJunchao Zhang } 1548*cd620004SJunchao Zhang 1549*cd620004SJunchao Zhang /* Fetch rootdata to leafdata and leafupdate locally */ 1550*cd620004SJunchao Zhang PetscErrorCode PetscSFLinkFetchAndOpLocal(PetscSF sf,PetscSFLink link,void *rootdata,const void *leafdata,void *leafupdate,MPI_Op op) 1551*cd620004SJunchao Zhang { 1552*cd620004SJunchao Zhang PetscErrorCode ierr; 1553*cd620004SJunchao Zhang const PetscInt *rootindices = NULL,*leafindices = NULL; 1554*cd620004SJunchao Zhang PetscInt count,rootstart,leafstart; 1555*cd620004SJunchao Zhang PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 1556*cd620004SJunchao Zhang PetscErrorCode (*FetchAndOpLocal)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,void*,PetscInt,const PetscInt*,const void*,void*) = NULL; 1557*cd620004SJunchao Zhang const PetscMemType rootmtype = link->rootmtype,leafmtype = link->leafmtype; 1558*cd620004SJunchao Zhang 1559*cd620004SJunchao Zhang PetscFunctionBegin; 1560*cd620004SJunchao Zhang if (!bas->rootbuflen[PETSCSF_LOCAL]) PetscFunctionReturn(0); 1561*cd620004SJunchao Zhang if (rootmtype != leafmtype) { 1562*cd620004SJunchao Zhang /* The local communication has to go through pack and unpack */ 1563*cd620004SJunchao Zhang SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Doing PetscSFFetchAndOp with rootdata and leafdata on opposite side of CPU and GPU"); 1564*cd620004SJunchao Zhang } else { 1565*cd620004SJunchao Zhang ierr = PetscSFLinkGetRootIndices(sf,link,rootmtype,PETSCSF_LOCAL,&count,&rootstart,&rootindices);CHKERRQ(ierr); 1566*cd620004SJunchao Zhang ierr = PetscSFLinkGetLeafIndices(sf,link,leafmtype,PETSCSF_LOCAL,NULL,&leafstart,&leafindices);CHKERRQ(ierr); 1567*cd620004SJunchao Zhang ierr = PetscSFLinkGetFetchAndOpLocal(link,rootmtype,op,bas->rootdups[PETSCSF_LOCAL],&FetchAndOpLocal);CHKERRQ(ierr); 1568*cd620004SJunchao Zhang ierr = (*FetchAndOpLocal)(link,count,rootstart,rootindices,rootdata,leafstart,leafindices,leafdata,leafupdate);CHKERRQ(ierr); 1569*cd620004SJunchao Zhang } 157040e23c03SJunchao Zhang PetscFunctionReturn(0); 157140e23c03SJunchao Zhang } 157240e23c03SJunchao Zhang 157340e23c03SJunchao Zhang /* 1574*cd620004SJunchao Zhang Create per-rank pack/unpack optimizations based on indice patterns 157540e23c03SJunchao Zhang 157640e23c03SJunchao Zhang Input Parameters: 1577b23bfdefSJunchao Zhang + n - Number of target ranks 1578eb02082bSJunchao Zhang . offset - [n+1] For the i-th rank, its associated indices are idx[offset[i], offset[i+1]). offset[0] needs not to be 0. 1579b23bfdefSJunchao Zhang - idx - [*] Array storing indices 158040e23c03SJunchao Zhang 158140e23c03SJunchao Zhang Output Parameters: 1582*cd620004SJunchao Zhang + opt - Pack optimizations. NULL if no optimizations. 158340e23c03SJunchao Zhang */ 1584*cd620004SJunchao Zhang PetscErrorCode PetscSFCreatePackOpt(PetscInt n,const PetscInt *offset,const PetscInt *idx,PetscSFPackOpt *out) 158540e23c03SJunchao Zhang { 158640e23c03SJunchao Zhang PetscErrorCode ierr; 158740e23c03SJunchao Zhang PetscInt i,j,k,n_copies,tot_copies = 0,step; 1588b23bfdefSJunchao Zhang PetscBool strided,optimized = PETSC_FALSE; 158940e23c03SJunchao Zhang PetscSFPackOpt opt; 159040e23c03SJunchao Zhang 159140e23c03SJunchao Zhang PetscFunctionBegin; 1592b23bfdefSJunchao Zhang if (!n) { 1593b23bfdefSJunchao Zhang *out = NULL; 1594b23bfdefSJunchao Zhang PetscFunctionReturn(0); 1595b23bfdefSJunchao Zhang } 1596b23bfdefSJunchao Zhang 159740e23c03SJunchao Zhang ierr = PetscCalloc1(1,&opt);CHKERRQ(ierr); 1598b23bfdefSJunchao Zhang ierr = PetscCalloc3(n,&opt->type,n+1,&opt->offset,n+1,&opt->copy_offset);CHKERRQ(ierr); 1599b23bfdefSJunchao Zhang ierr = PetscArraycpy(opt->offset,offset,n+1);CHKERRQ(ierr); 1600*cd620004SJunchao Zhang /* Make opt->offset[] zero-based. If one calls this routine with non-zero offset[0], one should use packing routine in this way, Pack(count,idx+offset[0],packopt,...) */ 1601*cd620004SJunchao Zhang if (offset[0]) {for (i=0; i<n+1; i++) opt->offset[i] -= offset[0];} 1602b23bfdefSJunchao Zhang 1603b23bfdefSJunchao Zhang opt->n = n; 160440e23c03SJunchao Zhang 1605fb61b9e4SStefano Zampini /* Check if the indices are piece-wise contiguous (if yes, we can optimize a packing with multiple memcpy's ) */ 160640e23c03SJunchao Zhang for (i=0; i<n; i++) { /* for each target processor */ 160740e23c03SJunchao Zhang /* Scan indices to count n_copies -- the number of contiguous pieces for i-th target */ 160840e23c03SJunchao Zhang n_copies = 1; 160940e23c03SJunchao Zhang for (j=offset[i]; j<offset[i+1]-1; j++) { 161040e23c03SJunchao Zhang if (idx[j]+1 != idx[j+1]) n_copies++; 161140e23c03SJunchao Zhang } 161240e23c03SJunchao Zhang /* If the average length (in no. of indices) of contiguous pieces is long enough, say >=32, 161340e23c03SJunchao Zhang then it is worth using memcpy for this target. 32 is an arbitrarily chosen number. 161440e23c03SJunchao Zhang */ 161540e23c03SJunchao Zhang if ((offset[i+1]-offset[i])/n_copies >= 32) { 1616b23bfdefSJunchao Zhang opt->type[i] = PETSCSF_PACKOPT_MULTICOPY; 1617b23bfdefSJunchao Zhang optimized = PETSC_TRUE; 161840e23c03SJunchao Zhang tot_copies += n_copies; 161940e23c03SJunchao Zhang } 162040e23c03SJunchao Zhang } 162140e23c03SJunchao Zhang 162240e23c03SJunchao Zhang /* Setup memcpy plan for each contiguous piece */ 162340e23c03SJunchao Zhang k = 0; /* k-th copy */ 1624b23bfdefSJunchao Zhang ierr = PetscMalloc4(tot_copies,&opt->copy_start,tot_copies,&opt->copy_length,n,&opt->stride_step,n,&opt->stride_n);CHKERRQ(ierr); 1625b23bfdefSJunchao Zhang for (i=0; i<n; i++) { /* for each target processor */ 1626b23bfdefSJunchao Zhang if (opt->type[i] == PETSCSF_PACKOPT_MULTICOPY) { 162740e23c03SJunchao Zhang n_copies = 1; 1628b23bfdefSJunchao Zhang opt->copy_start[k] = offset[i] - offset[0]; 162940e23c03SJunchao Zhang for (j=offset[i]; j<offset[i+1]-1; j++) { 163040e23c03SJunchao Zhang if (idx[j]+1 != idx[j+1]) { /* meet end of a copy (and next copy must exist) */ 163140e23c03SJunchao Zhang n_copies++; 1632b23bfdefSJunchao Zhang opt->copy_start[k+1] = j-offset[0]+1; 1633b23bfdefSJunchao Zhang opt->copy_length[k] = opt->copy_start[k+1] - opt->copy_start[k]; 163440e23c03SJunchao Zhang k++; 163540e23c03SJunchao Zhang } 163640e23c03SJunchao Zhang } 163740e23c03SJunchao Zhang /* Set copy length of the last copy for this target */ 1638b23bfdefSJunchao Zhang opt->copy_length[k] = j-offset[0]+1 - opt->copy_start[k]; 163940e23c03SJunchao Zhang k++; 164040e23c03SJunchao Zhang } 1641b23bfdefSJunchao Zhang /* Set offset for next target. When opt->type[i]=PETSCSF_PACKOPT_NONE, copy_offsets[i]=copy_offsets[i+1] */ 164240e23c03SJunchao Zhang opt->copy_offset[i+1] = k; 164340e23c03SJunchao Zhang } 164440e23c03SJunchao Zhang 164540e23c03SJunchao Zhang /* Last chance! If the indices do not have long contiguous pieces, are they strided? */ 164640e23c03SJunchao Zhang for (i=0; i<n; i++) { /* for each remote */ 1647b23bfdefSJunchao Zhang if (opt->type[i]==PETSCSF_PACKOPT_NONE && (offset[i+1] - offset[i]) >= 16) { /* few indices (<16) are not worth striding */ 164840e23c03SJunchao Zhang strided = PETSC_TRUE; 164940e23c03SJunchao Zhang step = idx[offset[i]+1] - idx[offset[i]]; 165040e23c03SJunchao Zhang for (j=offset[i]; j<offset[i+1]-1; j++) { 165140e23c03SJunchao Zhang if (idx[j]+step != idx[j+1]) { strided = PETSC_FALSE; break; } 165240e23c03SJunchao Zhang } 165340e23c03SJunchao Zhang if (strided) { 1654b23bfdefSJunchao Zhang opt->type[i] = PETSCSF_PACKOPT_STRIDE; 165540e23c03SJunchao Zhang opt->stride_step[i] = step; 165640e23c03SJunchao Zhang opt->stride_n[i] = offset[i+1] - offset[i]; 1657b23bfdefSJunchao Zhang optimized = PETSC_TRUE; 165840e23c03SJunchao Zhang } 165940e23c03SJunchao Zhang } 166040e23c03SJunchao Zhang } 1661b23bfdefSJunchao Zhang /* If no rank gets optimized, free arrays to save memory */ 1662b23bfdefSJunchao Zhang if (!optimized) { 1663b23bfdefSJunchao Zhang ierr = PetscFree3(opt->type,opt->offset,opt->copy_offset);CHKERRQ(ierr); 1664b23bfdefSJunchao Zhang ierr = PetscFree4(opt->copy_start,opt->copy_length,opt->stride_step,opt->stride_n);CHKERRQ(ierr); 166540e23c03SJunchao Zhang ierr = PetscFree(opt);CHKERRQ(ierr); 166640e23c03SJunchao Zhang *out = NULL; 166740e23c03SJunchao Zhang } else *out = opt; 166840e23c03SJunchao Zhang PetscFunctionReturn(0); 166940e23c03SJunchao Zhang } 167040e23c03SJunchao Zhang 1671*cd620004SJunchao Zhang PetscErrorCode PetscSFDestroyPackOpt(PetscSFPackOpt *out) 167240e23c03SJunchao Zhang { 167340e23c03SJunchao Zhang PetscErrorCode ierr; 167440e23c03SJunchao Zhang PetscSFPackOpt opt = *out; 167540e23c03SJunchao Zhang 167640e23c03SJunchao Zhang PetscFunctionBegin; 167740e23c03SJunchao Zhang if (opt) { 1678b23bfdefSJunchao Zhang ierr = PetscFree3(opt->type,opt->offset,opt->copy_offset);CHKERRQ(ierr); 1679b23bfdefSJunchao Zhang ierr = PetscFree4(opt->copy_start,opt->copy_length,opt->stride_step,opt->stride_n);CHKERRQ(ierr); 168040e23c03SJunchao Zhang ierr = PetscFree(opt);CHKERRQ(ierr); 168140e23c03SJunchao Zhang *out = NULL; 168240e23c03SJunchao Zhang } 168340e23c03SJunchao Zhang PetscFunctionReturn(0); 168440e23c03SJunchao Zhang } 1685*cd620004SJunchao Zhang 1686*cd620004SJunchao Zhang PetscErrorCode PetscSFSetUpPackFields(PetscSF sf) 1687*cd620004SJunchao Zhang { 1688*cd620004SJunchao Zhang PetscErrorCode ierr; 1689*cd620004SJunchao Zhang PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 1690*cd620004SJunchao Zhang PetscInt i,j; 1691*cd620004SJunchao Zhang 1692*cd620004SJunchao Zhang PetscFunctionBegin; 1693*cd620004SJunchao Zhang /* [0] for PETSCSF_LOCAL and [1] for PETSCSF_REMOTE in the following */ 1694*cd620004SJunchao Zhang for (i=0; i<2; i++) { /* Set defaults */ 1695*cd620004SJunchao Zhang sf->leafstart[i] = 0; 1696*cd620004SJunchao Zhang sf->leafcontig[i] = PETSC_TRUE; 1697*cd620004SJunchao Zhang sf->leafdups[i] = PETSC_FALSE; 1698*cd620004SJunchao Zhang bas->rootstart[i] = 0; 1699*cd620004SJunchao Zhang bas->rootcontig[i] = PETSC_TRUE; 1700*cd620004SJunchao Zhang bas->rootdups[i] = PETSC_FALSE; 1701*cd620004SJunchao Zhang } 1702*cd620004SJunchao Zhang 1703*cd620004SJunchao Zhang sf->leafbuflen[0] = sf->roffset[sf->ndranks]; 1704*cd620004SJunchao Zhang sf->leafbuflen[1] = sf->roffset[sf->nranks] - sf->roffset[sf->ndranks]; 1705*cd620004SJunchao Zhang 1706*cd620004SJunchao Zhang if (sf->leafbuflen[0]) sf->leafstart[0] = sf->rmine[0]; 1707*cd620004SJunchao Zhang if (sf->leafbuflen[1]) sf->leafstart[1] = sf->rmine[sf->roffset[sf->ndranks]]; 1708*cd620004SJunchao Zhang 1709*cd620004SJunchao Zhang /* Are leaf indices for self and remote contiguous? If yes, it is best for pack/unpack */ 1710*cd620004SJunchao Zhang for (i=0; i<sf->roffset[sf->ndranks]; i++) { /* self */ 1711*cd620004SJunchao Zhang if (sf->rmine[i] != sf->leafstart[0]+i) {sf->leafcontig[0] = PETSC_FALSE; break;} 1712*cd620004SJunchao Zhang } 1713*cd620004SJunchao Zhang for (i=sf->roffset[sf->ndranks],j=0; i<sf->roffset[sf->nranks]; i++,j++) { /* remote */ 1714*cd620004SJunchao Zhang if (sf->rmine[i] != sf->leafstart[1]+j) {sf->leafcontig[1] = PETSC_FALSE; break;} 1715*cd620004SJunchao Zhang } 1716*cd620004SJunchao Zhang 1717*cd620004SJunchao Zhang /* If not, see if we can have per-rank optimizations by doing index analysis */ 1718*cd620004SJunchao Zhang if (!sf->leafcontig[0]) {ierr = PetscSFCreatePackOpt(sf->ndranks, sf->roffset, sf->rmine, &sf->leafpackopt[0]);CHKERRQ(ierr);} 1719*cd620004SJunchao Zhang if (!sf->leafcontig[1]) {ierr = PetscSFCreatePackOpt(sf->nranks-sf->ndranks, sf->roffset+sf->ndranks, sf->rmine, &sf->leafpackopt[1]);CHKERRQ(ierr);} 1720*cd620004SJunchao Zhang 1721*cd620004SJunchao Zhang /* Are root indices for self and remote contiguous? */ 1722*cd620004SJunchao Zhang bas->rootbuflen[0] = bas->ioffset[bas->ndiranks]; 1723*cd620004SJunchao Zhang bas->rootbuflen[1] = bas->ioffset[bas->niranks] - bas->ioffset[bas->ndiranks]; 1724*cd620004SJunchao Zhang 1725*cd620004SJunchao Zhang if (bas->rootbuflen[0]) bas->rootstart[0] = bas->irootloc[0]; 1726*cd620004SJunchao Zhang if (bas->rootbuflen[1]) bas->rootstart[1] = bas->irootloc[bas->ioffset[bas->ndiranks]]; 1727*cd620004SJunchao Zhang 1728*cd620004SJunchao Zhang for (i=0; i<bas->ioffset[bas->ndiranks]; i++) { 1729*cd620004SJunchao Zhang if (bas->irootloc[i] != bas->rootstart[0]+i) {bas->rootcontig[0] = PETSC_FALSE; break;} 1730*cd620004SJunchao Zhang } 1731*cd620004SJunchao Zhang for (i=bas->ioffset[bas->ndiranks],j=0; i<bas->ioffset[bas->niranks]; i++,j++) { 1732*cd620004SJunchao Zhang if (bas->irootloc[i] != bas->rootstart[1]+j) {bas->rootcontig[1] = PETSC_FALSE; break;} 1733*cd620004SJunchao Zhang } 1734*cd620004SJunchao Zhang 1735*cd620004SJunchao Zhang if (!bas->rootcontig[0]) {ierr = PetscSFCreatePackOpt(bas->ndiranks, bas->ioffset, bas->irootloc, &bas->rootpackopt[0]);CHKERRQ(ierr);} 1736*cd620004SJunchao Zhang if (!bas->rootcontig[1]) {ierr = PetscSFCreatePackOpt(bas->niranks-bas->ndiranks, bas->ioffset+bas->ndiranks, bas->irootloc, &bas->rootpackopt[1]);CHKERRQ(ierr);} 1737*cd620004SJunchao Zhang 1738*cd620004SJunchao Zhang #if defined(PETSC_HAVE_CUDA) 1739*cd620004SJunchao Zhang /* Check dups in indices so that CUDA unpacking kernels can use cheaper regular instructions instead of atomics when they know there are no data race chances */ 1740*cd620004SJunchao Zhang if (!sf->leafcontig[0]) {ierr = PetscCheckDupsInt(sf->leafbuflen[0], sf->rmine, &sf->leafdups[0]);CHKERRQ(ierr);} 1741*cd620004SJunchao Zhang if (!sf->leafcontig[1]) {ierr = PetscCheckDupsInt(sf->leafbuflen[1], sf->rmine+sf->roffset[sf->ndranks], &sf->leafdups[1]);CHKERRQ(ierr);} 1742*cd620004SJunchao Zhang if (!bas->rootcontig[0]) {ierr = PetscCheckDupsInt(bas->rootbuflen[0], bas->irootloc, &bas->rootdups[0]);CHKERRQ(ierr);} 1743*cd620004SJunchao Zhang if (!bas->rootcontig[1]) {ierr = PetscCheckDupsInt(bas->rootbuflen[1], bas->irootloc+bas->ioffset[bas->ndiranks], &bas->rootdups[1]);CHKERRQ(ierr);} 1744*cd620004SJunchao Zhang #endif 1745*cd620004SJunchao Zhang 1746*cd620004SJunchao Zhang PetscFunctionReturn(0); 1747*cd620004SJunchao Zhang } 1748*cd620004SJunchao Zhang 1749*cd620004SJunchao Zhang PetscErrorCode PetscSFResetPackFields(PetscSF sf) 1750*cd620004SJunchao Zhang { 1751*cd620004SJunchao Zhang PetscErrorCode ierr; 1752*cd620004SJunchao Zhang PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 1753*cd620004SJunchao Zhang PetscInt i; 1754*cd620004SJunchao Zhang 1755*cd620004SJunchao Zhang PetscFunctionBegin; 1756*cd620004SJunchao Zhang for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) { 1757*cd620004SJunchao Zhang ierr = PetscSFDestroyPackOpt(&sf->leafpackopt[i]);CHKERRQ(ierr); 1758*cd620004SJunchao Zhang ierr = PetscSFDestroyPackOpt(&bas->rootpackopt[i]);CHKERRQ(ierr); 1759*cd620004SJunchao Zhang } 1760*cd620004SJunchao Zhang PetscFunctionReturn(0); 1761*cd620004SJunchao Zhang } 1762