140e23c03SJunchao Zhang 240e23c03SJunchao Zhang #include <../src/vec/is/sf/impls/basic/sfpack.h> 340e23c03SJunchao Zhang #include <../src/vec/is/sf/impls/basic/sfbasic.h> 440e23c03SJunchao Zhang 5eb02082bSJunchao Zhang #if defined(PETSC_HAVE_CUDA) 6eb02082bSJunchao Zhang #include <cuda_runtime.h> 7eb02082bSJunchao Zhang #endif 840e23c03SJunchao Zhang /* 940e23c03SJunchao Zhang * MPI_Reduce_local is not really useful because it can't handle sparse data and it vectorizes "in the wrong direction", 1040e23c03SJunchao Zhang * therefore we pack data types manually. This file defines packing routines for the standard data types. 1140e23c03SJunchao Zhang */ 1240e23c03SJunchao Zhang 13cd620004SJunchao Zhang #define CPPJoin4(a,b,c,d) a##_##b##_##c##_##d 1440e23c03SJunchao Zhang 15cd620004SJunchao Zhang /* Operations working like s += t */ 16cd620004SJunchao Zhang #define OP_BINARY(op,s,t) do {(s) = (s) op (t); } while(0) /* binary ops in the middle such as +, *, && etc. */ 17cd620004SJunchao Zhang #define OP_FUNCTION(op,s,t) do {(s) = op((s),(t)); } while(0) /* ops like a function, such as PetscMax, PetscMin */ 18cd620004SJunchao Zhang #define OP_LXOR(op,s,t) do {(s) = (!(s)) != (!(t));} while(0) /* logical exclusive OR */ 19cd620004SJunchao Zhang #define OP_ASSIGN(op,s,t) do {(s) = (t);} while(0) 20cd620004SJunchao Zhang /* Ref MPI MAXLOC */ 21cd620004SJunchao Zhang #define OP_XLOC(op,s,t) \ 22cd620004SJunchao Zhang do { \ 23cd620004SJunchao Zhang if ((s).u == (t).u) (s).i = PetscMin((s).i,(t).i); \ 24cd620004SJunchao Zhang else if (!((s).u op (t).u)) s = t; \ 25cd620004SJunchao Zhang } while(0) 2640e23c03SJunchao Zhang 2740e23c03SJunchao Zhang /* DEF_PackFunc - macro defining a Pack routine 2840e23c03SJunchao Zhang 2940e23c03SJunchao Zhang Arguments of the macro: 30b23bfdefSJunchao Zhang +Type Type of the basic data in an entry, i.e., int, PetscInt, PetscReal etc. It is not the type of an entry. 31*fcc7397dSJunchao Zhang .BS Block size for vectorization. It is a factor of bsz. 32b23bfdefSJunchao Zhang -EQ (bs == BS) ? 1 : 0. EQ is a compile-time const to help compiler optimizations. See below. 3340e23c03SJunchao Zhang 3440e23c03SJunchao Zhang Arguments of the Pack routine: 35cd620004SJunchao Zhang +count Number of indices in idx[]. 36*fcc7397dSJunchao Zhang .start When opt and idx are NULL, it means indices are contiguous & start is the first index; otherwise, not used. 37*fcc7397dSJunchao Zhang .opt Per-pack optimization plan. NULL means no such plan. 38*fcc7397dSJunchao Zhang .idx Indices of entries to packed. 39eb02082bSJunchao Zhang .link Provide a context for the current call, such as link->bs, number of basic types in an entry. Ex. if unit is MPI_2INT, then bs=2 and the basic type is int. 40cd620004SJunchao Zhang .unpacked Address of the unpacked data. The entries will be packed are unpacked[idx[i]],for i in [0,count). 41cd620004SJunchao Zhang -packed Address of the packed data. 4240e23c03SJunchao Zhang */ 43b23bfdefSJunchao Zhang #define DEF_PackFunc(Type,BS,EQ) \ 44*fcc7397dSJunchao Zhang static PetscErrorCode CPPJoin4(Pack,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,const void *unpacked,void *packed) \ 45b23bfdefSJunchao Zhang { \ 4640e23c03SJunchao Zhang PetscErrorCode ierr; \ 47b23bfdefSJunchao Zhang const Type *u = (const Type*)unpacked,*u2; \ 48b23bfdefSJunchao Zhang Type *p = (Type*)packed,*p2; \ 49*fcc7397dSJunchao Zhang PetscInt i,j,k,X,Y,r,bs=link->bs; \ 50*fcc7397dSJunchao Zhang const PetscInt M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */ \ 51b23bfdefSJunchao Zhang const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */ \ 5240e23c03SJunchao Zhang PetscFunctionBegin; \ 53*fcc7397dSJunchao Zhang if (!idx) {ierr = PetscArraycpy(p,u+start*MBS,MBS*count);CHKERRQ(ierr);}/* idx[] are contiguous */ \ 54*fcc7397dSJunchao Zhang else if (opt) { /* has optimizations available */ \ 55*fcc7397dSJunchao Zhang p2 = p; \ 56*fcc7397dSJunchao Zhang for (r=0; r<opt->n; r++) { \ 57*fcc7397dSJunchao Zhang u2 = u + opt->start[r]*MBS; \ 58*fcc7397dSJunchao Zhang X = opt->X[r]; \ 59*fcc7397dSJunchao Zhang Y = opt->Y[r]; \ 60*fcc7397dSJunchao Zhang for (k=0; k<opt->dz[r]; k++) \ 61*fcc7397dSJunchao Zhang for (j=0; j<opt->dy[r]; j++) { \ 62*fcc7397dSJunchao Zhang ierr = PetscArraycpy(p2,u2+(X*Y*k+X*j)*MBS,opt->dx[r]*MBS);CHKERRQ(ierr); \ 63*fcc7397dSJunchao Zhang p2 += opt->dx[r]*MBS; \ 64*fcc7397dSJunchao Zhang } \ 65*fcc7397dSJunchao Zhang } \ 66*fcc7397dSJunchao Zhang } else { \ 67b23bfdefSJunchao Zhang for (i=0; i<count; i++) \ 68eb02082bSJunchao Zhang for (j=0; j<M; j++) /* Decent compilers should eliminate this loop when M = const 1 */ \ 69eb02082bSJunchao Zhang for (k=0; k<BS; k++) /* Compiler either unrolls (BS=1) or vectorizes (BS=2,4,8,etc) this loop */ \ 70b23bfdefSJunchao Zhang p[i*MBS+j*BS+k] = u[idx[i]*MBS+j*BS+k]; \ 7140e23c03SJunchao Zhang } \ 7240e23c03SJunchao Zhang PetscFunctionReturn(0); \ 7340e23c03SJunchao Zhang } 7440e23c03SJunchao Zhang 75cd620004SJunchao Zhang /* DEF_Action - macro defining a UnpackAndInsert routine that unpacks data from a contiguous buffer 76cd620004SJunchao Zhang and inserts into a sparse array. 7740e23c03SJunchao Zhang 7840e23c03SJunchao Zhang Arguments: 79b23bfdefSJunchao Zhang .Type Type of the data 8040e23c03SJunchao Zhang .BS Block size for vectorization 81b23bfdefSJunchao Zhang .EQ (bs == BS) ? 1 : 0. EQ is a compile-time const. 8240e23c03SJunchao Zhang 8340e23c03SJunchao Zhang Notes: 8440e23c03SJunchao Zhang This macro is not combined with DEF_ActionAndOp because we want to use memcpy in this macro. 8540e23c03SJunchao Zhang */ 86cd620004SJunchao Zhang #define DEF_UnpackFunc(Type,BS,EQ) \ 87*fcc7397dSJunchao Zhang static PetscErrorCode CPPJoin4(UnpackAndInsert,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,void *unpacked,const void *packed) \ 88b23bfdefSJunchao Zhang { \ 8940e23c03SJunchao Zhang PetscErrorCode ierr; \ 90b23bfdefSJunchao Zhang Type *u = (Type*)unpacked,*u2; \ 91*fcc7397dSJunchao Zhang const Type *p = (const Type*)packed; \ 92*fcc7397dSJunchao Zhang PetscInt i,j,k,X,Y,r,bs=link->bs; \ 93*fcc7397dSJunchao Zhang const PetscInt M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */ \ 94b23bfdefSJunchao Zhang const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */ \ 9540e23c03SJunchao Zhang PetscFunctionBegin; \ 96b23bfdefSJunchao Zhang if (!idx) { \ 97*fcc7397dSJunchao Zhang u += start*MBS; \ 98*fcc7397dSJunchao Zhang if (u != p) {ierr = PetscArraycpy(u,p,count*MBS);CHKERRQ(ierr);} \ 99*fcc7397dSJunchao Zhang } else if (opt) { /* has optimizations available */ \ 100*fcc7397dSJunchao Zhang for (r=0; r<opt->n; r++) { \ 101*fcc7397dSJunchao Zhang u2 = u + opt->start[r]*MBS; \ 102*fcc7397dSJunchao Zhang X = opt->X[r]; \ 103*fcc7397dSJunchao Zhang Y = opt->Y[r]; \ 104*fcc7397dSJunchao Zhang for (k=0; k<opt->dz[r]; k++) \ 105*fcc7397dSJunchao Zhang for (j=0; j<opt->dy[r]; j++) { \ 106*fcc7397dSJunchao Zhang ierr = PetscArraycpy(u2+(X*Y*k+X*j)*MBS,p,opt->dx[r]*MBS);CHKERRQ(ierr); \ 107*fcc7397dSJunchao Zhang p += opt->dx[r]*MBS; \ 108*fcc7397dSJunchao Zhang } \ 109*fcc7397dSJunchao Zhang } \ 110*fcc7397dSJunchao Zhang } else { \ 111b23bfdefSJunchao Zhang for (i=0; i<count; i++) \ 112b23bfdefSJunchao Zhang for (j=0; j<M; j++) \ 113cd620004SJunchao Zhang for (k=0; k<BS; k++) u[idx[i]*MBS+j*BS+k] = p[i*MBS+j*BS+k]; \ 11440e23c03SJunchao Zhang } \ 11540e23c03SJunchao Zhang PetscFunctionReturn(0); \ 11640e23c03SJunchao Zhang } 11740e23c03SJunchao Zhang 118cd620004SJunchao Zhang /* DEF_UnpackAndOp - macro defining a UnpackAndOp routine where Op should not be Insert 11940e23c03SJunchao Zhang 12040e23c03SJunchao Zhang Arguments: 121cd620004SJunchao Zhang +Opname Name of the Op, such as Add, Mult, LAND, etc. 122b23bfdefSJunchao Zhang .Type Type of the data 12340e23c03SJunchao Zhang .BS Block size for vectorization 124b23bfdefSJunchao Zhang .EQ (bs == BS) ? 1 : 0. EQ is a compile-time const. 125cd620004SJunchao Zhang .Op Operator for the op, such as +, *, &&, ||, PetscMax, PetscMin, etc. 126cd620004SJunchao Zhang .OpApply Macro defining application of the op. Could be OP_BINARY, OP_FUNCTION, OP_LXOR 12740e23c03SJunchao Zhang */ 128cd620004SJunchao Zhang #define DEF_UnpackAndOp(Type,BS,EQ,Opname,Op,OpApply) \ 129*fcc7397dSJunchao Zhang static PetscErrorCode CPPJoin4(UnpackAnd##Opname,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,void *unpacked,const void *packed) \ 130b23bfdefSJunchao Zhang { \ 131cd620004SJunchao Zhang Type *u = (Type*)unpacked,*u2; \ 132*fcc7397dSJunchao Zhang const Type *p = (const Type*)packed; \ 133*fcc7397dSJunchao Zhang PetscInt i,j,k,X,Y,r,bs=link->bs; \ 134*fcc7397dSJunchao Zhang const PetscInt M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */ \ 135b23bfdefSJunchao Zhang const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */ \ 13640e23c03SJunchao Zhang PetscFunctionBegin; \ 137b23bfdefSJunchao Zhang if (!idx) { \ 138*fcc7397dSJunchao Zhang u += start*MBS; \ 139cd620004SJunchao Zhang for (i=0; i<count; i++) \ 140cd620004SJunchao Zhang for (j=0; j<M; j++) \ 141cd620004SJunchao Zhang for (k=0; k<BS; k++) \ 142cd620004SJunchao Zhang OpApply(Op,u[i*MBS+j*BS+k],p[i*MBS+j*BS+k]); \ 143*fcc7397dSJunchao Zhang } else if (opt) { /* idx[] has patterns */ \ 144*fcc7397dSJunchao Zhang for (r=0; r<opt->n; r++) { \ 145*fcc7397dSJunchao Zhang u2 = u + opt->start[r]*MBS; \ 146*fcc7397dSJunchao Zhang X = opt->X[r]; \ 147*fcc7397dSJunchao Zhang Y = opt->Y[r]; \ 148*fcc7397dSJunchao Zhang for (k=0; k<opt->dz[r]; k++) \ 149*fcc7397dSJunchao Zhang for (j=0; j<opt->dy[r]; j++) { \ 150*fcc7397dSJunchao Zhang for (i=0; i<opt->dx[r]*MBS; i++) OpApply(Op,u2[(X*Y*k+X*j)*MBS+i],p[i]); \ 151*fcc7397dSJunchao Zhang p += opt->dx[r]*MBS; \ 152*fcc7397dSJunchao Zhang } \ 153*fcc7397dSJunchao Zhang } \ 154*fcc7397dSJunchao Zhang } else { \ 155cd620004SJunchao Zhang for (i=0; i<count; i++) \ 156cd620004SJunchao Zhang for (j=0; j<M; j++) \ 157cd620004SJunchao Zhang for (k=0; k<BS; k++) \ 158cd620004SJunchao Zhang OpApply(Op,u[idx[i]*MBS+j*BS+k],p[i*MBS+j*BS+k]); \ 159cd620004SJunchao Zhang } \ 160cd620004SJunchao Zhang PetscFunctionReturn(0); \ 161cd620004SJunchao Zhang } 162cd620004SJunchao Zhang 163cd620004SJunchao Zhang #define DEF_FetchAndOp(Type,BS,EQ,Opname,Op,OpApply) \ 164*fcc7397dSJunchao Zhang static PetscErrorCode CPPJoin4(FetchAnd##Opname,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,void *unpacked,void *packed) \ 165cd620004SJunchao Zhang { \ 166*fcc7397dSJunchao Zhang Type *u = (Type*)unpacked,*p = (Type*)packed,tmp; \ 167*fcc7397dSJunchao Zhang PetscInt i,j,k,r,l,bs=link->bs; \ 168*fcc7397dSJunchao Zhang const PetscInt M = (EQ) ? 1 : bs/BS; \ 169*fcc7397dSJunchao Zhang const PetscInt MBS = M*BS; \ 170cd620004SJunchao Zhang PetscFunctionBegin; \ 171*fcc7397dSJunchao Zhang for (i=0; i<count; i++) { \ 172*fcc7397dSJunchao Zhang r = (!idx ? start+i : idx[i])*MBS; \ 173*fcc7397dSJunchao Zhang l = i*MBS; \ 174b23bfdefSJunchao Zhang for (j=0; j<M; j++) \ 175b23bfdefSJunchao Zhang for (k=0; k<BS; k++) { \ 176*fcc7397dSJunchao Zhang tmp = u[r+j*BS+k]; \ 177*fcc7397dSJunchao Zhang OpApply(Op,u[r+j*BS+k],p[l+j*BS+k]); \ 178*fcc7397dSJunchao Zhang p[l+j*BS+k] = tmp; \ 179cd620004SJunchao Zhang } \ 180cd620004SJunchao Zhang } \ 181cd620004SJunchao Zhang PetscFunctionReturn(0); \ 182cd620004SJunchao Zhang } 183cd620004SJunchao Zhang 184cd620004SJunchao Zhang #define DEF_ScatterAndOp(Type,BS,EQ,Opname,Op,OpApply) \ 185*fcc7397dSJunchao Zhang static PetscErrorCode CPPJoin4(ScatterAnd##Opname,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt srcStart,PetscSFPackOpt srcOpt,const PetscInt *srcIdx,const void *src,PetscInt dstStart,PetscSFPackOpt dstOpt,const PetscInt *dstIdx,void *dst) \ 186cd620004SJunchao Zhang { \ 187*fcc7397dSJunchao Zhang PetscErrorCode ierr; \ 188*fcc7397dSJunchao Zhang const Type *u = (const Type*)src; \ 189*fcc7397dSJunchao Zhang Type *v = (Type*)dst; \ 190*fcc7397dSJunchao Zhang PetscInt i,j,k,s,t,X,Y,bs = link->bs; \ 191cd620004SJunchao Zhang const PetscInt M = (EQ) ? 1 : bs/BS; \ 192cd620004SJunchao Zhang const PetscInt MBS = M*BS; \ 193cd620004SJunchao Zhang PetscFunctionBegin; \ 194*fcc7397dSJunchao Zhang if (!srcIdx) { /* src is contiguous */ \ 195*fcc7397dSJunchao Zhang u += srcStart*MBS; \ 196*fcc7397dSJunchao Zhang ierr = CPPJoin4(UnpackAnd##Opname,Type,BS,EQ)(link,count,dstStart,dstOpt,dstIdx,dst,u);CHKERRQ(ierr); \ 197*fcc7397dSJunchao Zhang } else if (srcOpt && !dstIdx) { /* src is 3D, dst is contiguous */ \ 198*fcc7397dSJunchao Zhang u += srcOpt->start[0]*MBS; \ 199*fcc7397dSJunchao Zhang v += dstStart*MBS; \ 200*fcc7397dSJunchao Zhang X = srcOpt->X[0]; Y = srcOpt->Y[0]; \ 201*fcc7397dSJunchao Zhang for (k=0; k<srcOpt->dz[0]; k++) \ 202*fcc7397dSJunchao Zhang for (j=0; j<srcOpt->dy[0]; j++) { \ 203*fcc7397dSJunchao Zhang for (i=0; i<srcOpt->dx[0]*MBS; i++) OpApply(Op,v[i],u[(X*Y*k+X*j)*MBS+i]); \ 204*fcc7397dSJunchao Zhang v += srcOpt->dx[0]*MBS; \ 205*fcc7397dSJunchao Zhang } \ 206*fcc7397dSJunchao Zhang } else { /* all other cases */ \ 207*fcc7397dSJunchao Zhang for (i=0; i<count; i++) { \ 208*fcc7397dSJunchao Zhang s = (!srcIdx ? srcStart+i : srcIdx[i])*MBS; \ 209*fcc7397dSJunchao Zhang t = (!dstIdx ? dstStart+i : dstIdx[i])*MBS; \ 210cd620004SJunchao Zhang for (j=0; j<M; j++) \ 211*fcc7397dSJunchao Zhang for (k=0; k<BS; k++) OpApply(Op,v[t+j*BS+k],u[s+j*BS+k]); \ 212*fcc7397dSJunchao Zhang } \ 213cd620004SJunchao Zhang } \ 214cd620004SJunchao Zhang PetscFunctionReturn(0); \ 215cd620004SJunchao Zhang } 216cd620004SJunchao Zhang 217cd620004SJunchao Zhang #define DEF_FetchAndOpLocal(Type,BS,EQ,Opname,Op,OpApply) \ 218*fcc7397dSJunchao Zhang static PetscErrorCode CPPJoin4(FetchAnd##Opname##Local,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt rootstart,PetscSFPackOpt rootopt,const PetscInt *rootidx,void *rootdata,PetscInt leafstart,PetscSFPackOpt leafopt,const PetscInt *leafidx,const void *leafdata,void *leafupdate) \ 219cd620004SJunchao Zhang { \ 220*fcc7397dSJunchao Zhang Type *rdata = (Type*)rootdata,*lupdate = (Type*)leafupdate; \ 221*fcc7397dSJunchao Zhang const Type *ldata = (const Type*)leafdata; \ 222*fcc7397dSJunchao Zhang PetscInt i,j,k,r,l,bs = link->bs; \ 223cd620004SJunchao Zhang const PetscInt M = (EQ) ? 1 : bs/BS; \ 224cd620004SJunchao Zhang const PetscInt MBS = M*BS; \ 225cd620004SJunchao Zhang PetscFunctionBegin; \ 226*fcc7397dSJunchao Zhang for (i=0; i<count; i++) { \ 227*fcc7397dSJunchao Zhang r = (rootidx ? rootidx[i] : rootstart+i)*MBS; \ 228*fcc7397dSJunchao Zhang l = (leafidx ? leafidx[i] : leafstart+i)*MBS; \ 229cd620004SJunchao Zhang for (j=0; j<M; j++) \ 230cd620004SJunchao Zhang for (k=0; k<BS; k++) { \ 231*fcc7397dSJunchao Zhang lupdate[l+j*BS+k] = rdata[r+j*BS+k]; \ 232*fcc7397dSJunchao Zhang OpApply(Op,rdata[r+j*BS+k],ldata[l+j*BS+k]); \ 23340e23c03SJunchao Zhang } \ 23440e23c03SJunchao Zhang } \ 23540e23c03SJunchao Zhang PetscFunctionReturn(0); \ 23640e23c03SJunchao Zhang } 23740e23c03SJunchao Zhang 238b23bfdefSJunchao Zhang /* Pack, Unpack/Fetch ops */ 239b23bfdefSJunchao Zhang #define DEF_Pack(Type,BS,EQ) \ 240b23bfdefSJunchao Zhang DEF_PackFunc(Type,BS,EQ) \ 241cd620004SJunchao Zhang DEF_UnpackFunc(Type,BS,EQ) \ 242cd620004SJunchao Zhang DEF_ScatterAndOp(Type,BS,EQ,Insert,=,OP_ASSIGN) \ 243cd620004SJunchao Zhang static void CPPJoin4(PackInit_Pack,Type,BS,EQ)(PetscSFLink link) { \ 244eb02082bSJunchao Zhang link->h_Pack = CPPJoin4(Pack, Type,BS,EQ); \ 245eb02082bSJunchao Zhang link->h_UnpackAndInsert = CPPJoin4(UnpackAndInsert,Type,BS,EQ); \ 246cd620004SJunchao Zhang link->h_ScatterAndInsert= CPPJoin4(ScatterAndInsert,Type,BS,EQ); \ 24740e23c03SJunchao Zhang } 24840e23c03SJunchao Zhang 249b23bfdefSJunchao Zhang /* Add, Mult ops */ 250b23bfdefSJunchao Zhang #define DEF_Add(Type,BS,EQ) \ 251cd620004SJunchao Zhang DEF_UnpackAndOp (Type,BS,EQ,Add, +,OP_BINARY) \ 252cd620004SJunchao Zhang DEF_UnpackAndOp (Type,BS,EQ,Mult,*,OP_BINARY) \ 253cd620004SJunchao Zhang DEF_FetchAndOp (Type,BS,EQ,Add, +,OP_BINARY) \ 254cd620004SJunchao Zhang DEF_ScatterAndOp (Type,BS,EQ,Add, +,OP_BINARY) \ 255cd620004SJunchao Zhang DEF_ScatterAndOp (Type,BS,EQ,Mult,*,OP_BINARY) \ 256cd620004SJunchao Zhang DEF_FetchAndOpLocal(Type,BS,EQ,Add, +,OP_BINARY) \ 257cd620004SJunchao Zhang static void CPPJoin4(PackInit_Add,Type,BS,EQ)(PetscSFLink link) { \ 258eb02082bSJunchao Zhang link->h_UnpackAndAdd = CPPJoin4(UnpackAndAdd, Type,BS,EQ); \ 259eb02082bSJunchao Zhang link->h_UnpackAndMult = CPPJoin4(UnpackAndMult, Type,BS,EQ); \ 260eb02082bSJunchao Zhang link->h_FetchAndAdd = CPPJoin4(FetchAndAdd, Type,BS,EQ); \ 261cd620004SJunchao Zhang link->h_ScatterAndAdd = CPPJoin4(ScatterAndAdd, Type,BS,EQ); \ 262cd620004SJunchao Zhang link->h_ScatterAndMult = CPPJoin4(ScatterAndMult, Type,BS,EQ); \ 263cd620004SJunchao Zhang link->h_FetchAndAddLocal = CPPJoin4(FetchAndAddLocal,Type,BS,EQ); \ 26440e23c03SJunchao Zhang } 26540e23c03SJunchao Zhang 266b23bfdefSJunchao Zhang /* Max, Min ops */ 267b23bfdefSJunchao Zhang #define DEF_Cmp(Type,BS,EQ) \ 268cd620004SJunchao Zhang DEF_UnpackAndOp (Type,BS,EQ,Max,PetscMax,OP_FUNCTION) \ 269cd620004SJunchao Zhang DEF_UnpackAndOp (Type,BS,EQ,Min,PetscMin,OP_FUNCTION) \ 270cd620004SJunchao Zhang DEF_ScatterAndOp(Type,BS,EQ,Max,PetscMax,OP_FUNCTION) \ 271cd620004SJunchao Zhang DEF_ScatterAndOp(Type,BS,EQ,Min,PetscMin,OP_FUNCTION) \ 272cd620004SJunchao Zhang static void CPPJoin4(PackInit_Compare,Type,BS,EQ)(PetscSFLink link) { \ 273eb02082bSJunchao Zhang link->h_UnpackAndMax = CPPJoin4(UnpackAndMax, Type,BS,EQ); \ 274eb02082bSJunchao Zhang link->h_UnpackAndMin = CPPJoin4(UnpackAndMin, Type,BS,EQ); \ 275cd620004SJunchao Zhang link->h_ScatterAndMax = CPPJoin4(ScatterAndMax, Type,BS,EQ); \ 276cd620004SJunchao Zhang link->h_ScatterAndMin = CPPJoin4(ScatterAndMin, Type,BS,EQ); \ 277b23bfdefSJunchao Zhang } 278b23bfdefSJunchao Zhang 279b23bfdefSJunchao Zhang /* Logical ops. 280cd620004SJunchao Zhang The operator in OP_LXOR should be empty but is ||. It is not used. Put here to avoid 28140e23c03SJunchao Zhang the compilation warning "empty macro arguments are undefined in ISO C90" 28240e23c03SJunchao Zhang */ 283b23bfdefSJunchao Zhang #define DEF_Log(Type,BS,EQ) \ 284cd620004SJunchao Zhang DEF_UnpackAndOp (Type,BS,EQ,LAND,&&,OP_BINARY) \ 285cd620004SJunchao Zhang DEF_UnpackAndOp (Type,BS,EQ,LOR, ||,OP_BINARY) \ 286cd620004SJunchao Zhang DEF_UnpackAndOp (Type,BS,EQ,LXOR,||, OP_LXOR) \ 287cd620004SJunchao Zhang DEF_ScatterAndOp(Type,BS,EQ,LAND,&&,OP_BINARY) \ 288cd620004SJunchao Zhang DEF_ScatterAndOp(Type,BS,EQ,LOR, ||,OP_BINARY) \ 289cd620004SJunchao Zhang DEF_ScatterAndOp(Type,BS,EQ,LXOR,||, OP_LXOR) \ 290cd620004SJunchao Zhang static void CPPJoin4(PackInit_Logical,Type,BS,EQ)(PetscSFLink link) { \ 291eb02082bSJunchao Zhang link->h_UnpackAndLAND = CPPJoin4(UnpackAndLAND, Type,BS,EQ); \ 292eb02082bSJunchao Zhang link->h_UnpackAndLOR = CPPJoin4(UnpackAndLOR, Type,BS,EQ); \ 293eb02082bSJunchao Zhang link->h_UnpackAndLXOR = CPPJoin4(UnpackAndLXOR, Type,BS,EQ); \ 294cd620004SJunchao Zhang link->h_ScatterAndLAND = CPPJoin4(ScatterAndLAND,Type,BS,EQ); \ 295cd620004SJunchao Zhang link->h_ScatterAndLOR = CPPJoin4(ScatterAndLOR, Type,BS,EQ); \ 296cd620004SJunchao Zhang link->h_ScatterAndLXOR = CPPJoin4(ScatterAndLXOR,Type,BS,EQ); \ 29740e23c03SJunchao Zhang } 29840e23c03SJunchao Zhang 299b23bfdefSJunchao Zhang /* Bitwise ops */ 300b23bfdefSJunchao Zhang #define DEF_Bit(Type,BS,EQ) \ 301cd620004SJunchao Zhang DEF_UnpackAndOp (Type,BS,EQ,BAND,&,OP_BINARY) \ 302cd620004SJunchao Zhang DEF_UnpackAndOp (Type,BS,EQ,BOR, |,OP_BINARY) \ 303cd620004SJunchao Zhang DEF_UnpackAndOp (Type,BS,EQ,BXOR,^,OP_BINARY) \ 304cd620004SJunchao Zhang DEF_ScatterAndOp(Type,BS,EQ,BAND,&,OP_BINARY) \ 305cd620004SJunchao Zhang DEF_ScatterAndOp(Type,BS,EQ,BOR, |,OP_BINARY) \ 306cd620004SJunchao Zhang DEF_ScatterAndOp(Type,BS,EQ,BXOR,^,OP_BINARY) \ 307cd620004SJunchao Zhang static void CPPJoin4(PackInit_Bitwise,Type,BS,EQ)(PetscSFLink link) { \ 308eb02082bSJunchao Zhang link->h_UnpackAndBAND = CPPJoin4(UnpackAndBAND, Type,BS,EQ); \ 309eb02082bSJunchao Zhang link->h_UnpackAndBOR = CPPJoin4(UnpackAndBOR, Type,BS,EQ); \ 310eb02082bSJunchao Zhang link->h_UnpackAndBXOR = CPPJoin4(UnpackAndBXOR, Type,BS,EQ); \ 311cd620004SJunchao Zhang link->h_ScatterAndBAND = CPPJoin4(ScatterAndBAND,Type,BS,EQ); \ 312cd620004SJunchao Zhang link->h_ScatterAndBOR = CPPJoin4(ScatterAndBOR, Type,BS,EQ); \ 313cd620004SJunchao Zhang link->h_ScatterAndBXOR = CPPJoin4(ScatterAndBXOR,Type,BS,EQ); \ 31440e23c03SJunchao Zhang } 31540e23c03SJunchao Zhang 316cd620004SJunchao Zhang /* Maxloc, Minloc ops */ 317cd620004SJunchao Zhang #define DEF_Xloc(Type,BS,EQ) \ 318cd620004SJunchao Zhang DEF_UnpackAndOp (Type,BS,EQ,Max,>,OP_XLOC) \ 319cd620004SJunchao Zhang DEF_UnpackAndOp (Type,BS,EQ,Min,<,OP_XLOC) \ 320cd620004SJunchao Zhang DEF_ScatterAndOp(Type,BS,EQ,Max,>,OP_XLOC) \ 321cd620004SJunchao Zhang DEF_ScatterAndOp(Type,BS,EQ,Min,<,OP_XLOC) \ 322cd620004SJunchao Zhang static void CPPJoin4(PackInit_Xloc,Type,BS,EQ)(PetscSFLink link) { \ 323cd620004SJunchao Zhang link->h_UnpackAndMaxloc = CPPJoin4(UnpackAndMax, Type,BS,EQ); \ 324cd620004SJunchao Zhang link->h_UnpackAndMinloc = CPPJoin4(UnpackAndMin, Type,BS,EQ); \ 325cd620004SJunchao Zhang link->h_ScatterAndMaxloc = CPPJoin4(ScatterAndMax,Type,BS,EQ); \ 326cd620004SJunchao Zhang link->h_ScatterAndMinloc = CPPJoin4(ScatterAndMin,Type,BS,EQ); \ 32740e23c03SJunchao Zhang } 32840e23c03SJunchao Zhang 329b23bfdefSJunchao Zhang #define DEF_IntegerType(Type,BS,EQ) \ 330b23bfdefSJunchao Zhang DEF_Pack(Type,BS,EQ) \ 331b23bfdefSJunchao Zhang DEF_Add(Type,BS,EQ) \ 332b23bfdefSJunchao Zhang DEF_Cmp(Type,BS,EQ) \ 333b23bfdefSJunchao Zhang DEF_Log(Type,BS,EQ) \ 334b23bfdefSJunchao Zhang DEF_Bit(Type,BS,EQ) \ 335cd620004SJunchao Zhang static void CPPJoin4(PackInit_IntegerType,Type,BS,EQ)(PetscSFLink link) { \ 336b23bfdefSJunchao Zhang CPPJoin4(PackInit_Pack,Type,BS,EQ)(link); \ 337b23bfdefSJunchao Zhang CPPJoin4(PackInit_Add,Type,BS,EQ)(link); \ 338b23bfdefSJunchao Zhang CPPJoin4(PackInit_Compare,Type,BS,EQ)(link); \ 339b23bfdefSJunchao Zhang CPPJoin4(PackInit_Logical,Type,BS,EQ)(link); \ 340b23bfdefSJunchao Zhang CPPJoin4(PackInit_Bitwise,Type,BS,EQ)(link); \ 34140e23c03SJunchao Zhang } 34240e23c03SJunchao Zhang 343b23bfdefSJunchao Zhang #define DEF_RealType(Type,BS,EQ) \ 344b23bfdefSJunchao Zhang DEF_Pack(Type,BS,EQ) \ 345b23bfdefSJunchao Zhang DEF_Add(Type,BS,EQ) \ 346b23bfdefSJunchao Zhang DEF_Cmp(Type,BS,EQ) \ 347cd620004SJunchao Zhang static void CPPJoin4(PackInit_RealType,Type,BS,EQ)(PetscSFLink link) { \ 348b23bfdefSJunchao Zhang CPPJoin4(PackInit_Pack,Type,BS,EQ)(link); \ 349b23bfdefSJunchao Zhang CPPJoin4(PackInit_Add,Type,BS,EQ)(link); \ 350b23bfdefSJunchao Zhang CPPJoin4(PackInit_Compare,Type,BS,EQ)(link); \ 351b23bfdefSJunchao Zhang } 35240e23c03SJunchao Zhang 35340e23c03SJunchao Zhang #if defined(PETSC_HAVE_COMPLEX) 354b23bfdefSJunchao Zhang #define DEF_ComplexType(Type,BS,EQ) \ 355b23bfdefSJunchao Zhang DEF_Pack(Type,BS,EQ) \ 356b23bfdefSJunchao Zhang DEF_Add(Type,BS,EQ) \ 357cd620004SJunchao Zhang static void CPPJoin4(PackInit_ComplexType,Type,BS,EQ)(PetscSFLink link) { \ 358b23bfdefSJunchao Zhang CPPJoin4(PackInit_Pack,Type,BS,EQ)(link); \ 359b23bfdefSJunchao Zhang CPPJoin4(PackInit_Add,Type,BS,EQ)(link); \ 360b23bfdefSJunchao Zhang } 36140e23c03SJunchao Zhang #endif 362b23bfdefSJunchao Zhang 363b23bfdefSJunchao Zhang #define DEF_DumbType(Type,BS,EQ) \ 364b23bfdefSJunchao Zhang DEF_Pack(Type,BS,EQ) \ 365cd620004SJunchao Zhang static void CPPJoin4(PackInit_DumbType,Type,BS,EQ)(PetscSFLink link) { \ 366b23bfdefSJunchao Zhang CPPJoin4(PackInit_Pack,Type,BS,EQ)(link); \ 367b23bfdefSJunchao Zhang } 368b23bfdefSJunchao Zhang 369b23bfdefSJunchao Zhang /* Maxloc, Minloc */ 370cd620004SJunchao Zhang #define DEF_PairType(Type,BS,EQ) \ 371cd620004SJunchao Zhang DEF_Pack(Type,BS,EQ) \ 372cd620004SJunchao Zhang DEF_Xloc(Type,BS,EQ) \ 373cd620004SJunchao Zhang static void CPPJoin4(PackInit_PairType,Type,BS,EQ)(PetscSFLink link) { \ 374cd620004SJunchao Zhang CPPJoin4(PackInit_Pack,Type,BS,EQ)(link); \ 375cd620004SJunchao Zhang CPPJoin4(PackInit_Xloc,Type,BS,EQ)(link); \ 376b23bfdefSJunchao Zhang } 377b23bfdefSJunchao Zhang 378b23bfdefSJunchao Zhang DEF_IntegerType(PetscInt,1,1) /* unit = 1 MPIU_INT */ 379b23bfdefSJunchao Zhang DEF_IntegerType(PetscInt,2,1) /* unit = 2 MPIU_INTs */ 380b23bfdefSJunchao Zhang DEF_IntegerType(PetscInt,4,1) /* unit = 4 MPIU_INTs */ 381b23bfdefSJunchao Zhang DEF_IntegerType(PetscInt,8,1) /* unit = 8 MPIU_INTs */ 382b23bfdefSJunchao Zhang DEF_IntegerType(PetscInt,1,0) /* unit = 1*n MPIU_INTs, n>1 */ 383b23bfdefSJunchao Zhang DEF_IntegerType(PetscInt,2,0) /* unit = 2*n MPIU_INTs, n>1 */ 384b23bfdefSJunchao Zhang DEF_IntegerType(PetscInt,4,0) /* unit = 4*n MPIU_INTs, n>1 */ 385b23bfdefSJunchao Zhang DEF_IntegerType(PetscInt,8,0) /* unit = 8*n MPIU_INTs, n>1. Routines with bigger BS are tried first. */ 386b23bfdefSJunchao Zhang 387b23bfdefSJunchao Zhang #if defined(PETSC_USE_64BIT_INDICES) /* Do not need (though it is OK) to generate redundant functions if PetscInt is int */ 388b23bfdefSJunchao Zhang DEF_IntegerType(int,1,1) 389b23bfdefSJunchao Zhang DEF_IntegerType(int,2,1) 390b23bfdefSJunchao Zhang DEF_IntegerType(int,4,1) 391b23bfdefSJunchao Zhang DEF_IntegerType(int,8,1) 392b23bfdefSJunchao Zhang DEF_IntegerType(int,1,0) 393b23bfdefSJunchao Zhang DEF_IntegerType(int,2,0) 394b23bfdefSJunchao Zhang DEF_IntegerType(int,4,0) 395b23bfdefSJunchao Zhang DEF_IntegerType(int,8,0) 396b23bfdefSJunchao Zhang #endif 397b23bfdefSJunchao Zhang 398b23bfdefSJunchao Zhang /* The typedefs are used to get a typename without space that CPPJoin can handle */ 399b23bfdefSJunchao Zhang typedef signed char SignedChar; 400b23bfdefSJunchao Zhang DEF_IntegerType(SignedChar,1,1) 401b23bfdefSJunchao Zhang DEF_IntegerType(SignedChar,2,1) 402b23bfdefSJunchao Zhang DEF_IntegerType(SignedChar,4,1) 403b23bfdefSJunchao Zhang DEF_IntegerType(SignedChar,8,1) 404b23bfdefSJunchao Zhang DEF_IntegerType(SignedChar,1,0) 405b23bfdefSJunchao Zhang DEF_IntegerType(SignedChar,2,0) 406b23bfdefSJunchao Zhang DEF_IntegerType(SignedChar,4,0) 407b23bfdefSJunchao Zhang DEF_IntegerType(SignedChar,8,0) 408b23bfdefSJunchao Zhang 409b23bfdefSJunchao Zhang typedef unsigned char UnsignedChar; 410b23bfdefSJunchao Zhang DEF_IntegerType(UnsignedChar,1,1) 411b23bfdefSJunchao Zhang DEF_IntegerType(UnsignedChar,2,1) 412b23bfdefSJunchao Zhang DEF_IntegerType(UnsignedChar,4,1) 413b23bfdefSJunchao Zhang DEF_IntegerType(UnsignedChar,8,1) 414b23bfdefSJunchao Zhang DEF_IntegerType(UnsignedChar,1,0) 415b23bfdefSJunchao Zhang DEF_IntegerType(UnsignedChar,2,0) 416b23bfdefSJunchao Zhang DEF_IntegerType(UnsignedChar,4,0) 417b23bfdefSJunchao Zhang DEF_IntegerType(UnsignedChar,8,0) 418b23bfdefSJunchao Zhang 419b23bfdefSJunchao Zhang DEF_RealType(PetscReal,1,1) 420b23bfdefSJunchao Zhang DEF_RealType(PetscReal,2,1) 421b23bfdefSJunchao Zhang DEF_RealType(PetscReal,4,1) 422b23bfdefSJunchao Zhang DEF_RealType(PetscReal,8,1) 423b23bfdefSJunchao Zhang DEF_RealType(PetscReal,1,0) 424b23bfdefSJunchao Zhang DEF_RealType(PetscReal,2,0) 425b23bfdefSJunchao Zhang DEF_RealType(PetscReal,4,0) 426b23bfdefSJunchao Zhang DEF_RealType(PetscReal,8,0) 427b23bfdefSJunchao Zhang 428b23bfdefSJunchao Zhang #if defined(PETSC_HAVE_COMPLEX) 429b23bfdefSJunchao Zhang DEF_ComplexType(PetscComplex,1,1) 430b23bfdefSJunchao Zhang DEF_ComplexType(PetscComplex,2,1) 431b23bfdefSJunchao Zhang DEF_ComplexType(PetscComplex,4,1) 432b23bfdefSJunchao Zhang DEF_ComplexType(PetscComplex,8,1) 433b23bfdefSJunchao Zhang DEF_ComplexType(PetscComplex,1,0) 434b23bfdefSJunchao Zhang DEF_ComplexType(PetscComplex,2,0) 435b23bfdefSJunchao Zhang DEF_ComplexType(PetscComplex,4,0) 436b23bfdefSJunchao Zhang DEF_ComplexType(PetscComplex,8,0) 437b23bfdefSJunchao Zhang #endif 438b23bfdefSJunchao Zhang 439cd620004SJunchao Zhang #define PairType(Type1,Type2) Type1##_##Type2 440cd620004SJunchao Zhang typedef struct {int u; int i;} PairType(int,int); 441cd620004SJunchao Zhang typedef struct {PetscInt u; PetscInt i;} PairType(PetscInt,PetscInt); 442cd620004SJunchao Zhang DEF_PairType(PairType(int,int),1,1) 443cd620004SJunchao Zhang DEF_PairType(PairType(PetscInt,PetscInt),1,1) 444b23bfdefSJunchao Zhang 445b23bfdefSJunchao Zhang /* If we don't know the basic type, we treat it as a stream of chars or ints */ 446b23bfdefSJunchao Zhang DEF_DumbType(char,1,1) 447b23bfdefSJunchao Zhang DEF_DumbType(char,2,1) 448b23bfdefSJunchao Zhang DEF_DumbType(char,4,1) 449b23bfdefSJunchao Zhang DEF_DumbType(char,1,0) 450b23bfdefSJunchao Zhang DEF_DumbType(char,2,0) 451b23bfdefSJunchao Zhang DEF_DumbType(char,4,0) 452b23bfdefSJunchao Zhang 453eb02082bSJunchao Zhang typedef int DumbInt; /* To have a different name than 'int' used above. The name is used to make routine names. */ 454b23bfdefSJunchao Zhang DEF_DumbType(DumbInt,1,1) 455b23bfdefSJunchao Zhang DEF_DumbType(DumbInt,2,1) 456b23bfdefSJunchao Zhang DEF_DumbType(DumbInt,4,1) 457b23bfdefSJunchao Zhang DEF_DumbType(DumbInt,8,1) 458b23bfdefSJunchao Zhang DEF_DumbType(DumbInt,1,0) 459b23bfdefSJunchao Zhang DEF_DumbType(DumbInt,2,0) 460b23bfdefSJunchao Zhang DEF_DumbType(DumbInt,4,0) 461b23bfdefSJunchao Zhang DEF_DumbType(DumbInt,8,0) 46240e23c03SJunchao Zhang 46340e23c03SJunchao Zhang #if !defined(PETSC_HAVE_MPI_TYPE_DUP) 46440e23c03SJunchao Zhang PETSC_STATIC_INLINE int MPI_Type_dup(MPI_Datatype datatype,MPI_Datatype *newtype) 46540e23c03SJunchao Zhang { 46640e23c03SJunchao Zhang int ierr; 46740e23c03SJunchao Zhang ierr = MPI_Type_contiguous(1,datatype,newtype); if (ierr) return ierr; 46840e23c03SJunchao Zhang ierr = MPI_Type_commit(newtype); if (ierr) return ierr; 46940e23c03SJunchao Zhang return MPI_SUCCESS; 47040e23c03SJunchao Zhang } 47140e23c03SJunchao Zhang #endif 47240e23c03SJunchao Zhang 473cd620004SJunchao Zhang /* 474cd620004SJunchao Zhang The routine Creates a communication link for the given operation. It first looks up its link cache. If 475cd620004SJunchao Zhang there is a free & suitable one, it uses it. Otherwise it creates a new one. 476cd620004SJunchao Zhang 477cd620004SJunchao Zhang A link contains buffers and MPI requests for send/recv. It also contains pack/unpack routines to pack/unpack 478cd620004SJunchao Zhang root/leafdata to/from these buffers. Buffers are allocated at our discretion. When we find root/leafata 479cd620004SJunchao Zhang can be directly passed to MPI, we won't allocate them. Even we allocate buffers, we only allocate 480cd620004SJunchao Zhang those that are needed by the given `sfop` and `op`, in other words, we do lazy memory-allocation. 481cd620004SJunchao Zhang 482cd620004SJunchao Zhang The routine also allocates buffers on CPU when one does not use gpu-aware MPI but data is on GPU. 483cd620004SJunchao Zhang 484cd620004SJunchao Zhang In SFBasic, MPI requests are persistent. They are init'ed until we try to get requests from a link. 485cd620004SJunchao Zhang 486cd620004SJunchao Zhang The routine is shared by SFBasic and SFNeighbor based on the fact they all deal with sparse graphs and 487cd620004SJunchao Zhang need pack/unpack data. 488cd620004SJunchao Zhang */ 489cd620004SJunchao Zhang PetscErrorCode PetscSFLinkCreate(PetscSF sf,MPI_Datatype unit,PetscMemType rootmtype,const void *rootdata,PetscMemType leafmtype,const void *leafdata,MPI_Op op,PetscSFOperation sfop,PetscSFLink *mylink) 49040e23c03SJunchao Zhang { 49140e23c03SJunchao Zhang PetscErrorCode ierr; 492cd620004SJunchao Zhang PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 493cd620004SJunchao Zhang PetscInt i,j,k,nrootreqs,nleafreqs,nreqs; 494cd620004SJunchao Zhang PetscSFLink *p,link; 495cd620004SJunchao Zhang PetscSFDirection direction; 496cd620004SJunchao Zhang MPI_Request *reqs = NULL; 497cd620004SJunchao Zhang PetscBool match,rootdirect[2],leafdirect[2]; 498cd620004SJunchao Zhang PetscMemType rootmtype_mpi,leafmtype_mpi; /* mtypes seen by MPI */ 499cd620004SJunchao Zhang PetscInt rootdirect_mpi,leafdirect_mpi; /* root/leafdirect seen by MPI*/ 500cd620004SJunchao Zhang 501cd620004SJunchao Zhang PetscFunctionBegin; 502cd620004SJunchao Zhang ierr = PetscSFSetErrorOnUnsupportedOverlap(sf,unit,rootdata,leafdata);CHKERRQ(ierr); 503cd620004SJunchao Zhang 504cd620004SJunchao Zhang /* Can we directly use root/leafdirect with the given sf, sfop and op? */ 505cd620004SJunchao Zhang for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) { 506cd620004SJunchao Zhang if (sfop == PETSCSF_BCAST) { 507cd620004SJunchao Zhang rootdirect[i] = bas->rootcontig[i]; /* Pack roots */ 508cd620004SJunchao Zhang leafdirect[i] = (sf->leafcontig[i] && op == MPIU_REPLACE) ? PETSC_TRUE : PETSC_FALSE; /* Unpack leaves */ 509cd620004SJunchao Zhang } else if (sfop == PETSCSF_REDUCE) { 510cd620004SJunchao Zhang leafdirect[i] = sf->leafcontig[i]; /* Pack leaves */ 511cd620004SJunchao Zhang rootdirect[i] = (bas->rootcontig[i] && op == MPIU_REPLACE) ? PETSC_TRUE : PETSC_FALSE; /* Unpack roots */ 512cd620004SJunchao Zhang } else { /* PETSCSF_FETCH */ 513cd620004SJunchao Zhang rootdirect[i] = PETSC_FALSE; /* FETCH always need a separate rootbuf */ 514cd620004SJunchao Zhang leafdirect[i] = PETSC_FALSE; /* We also force allocating a separate leafbuf so that leafdata and leafupdate can share mpi requests */ 515cd620004SJunchao Zhang } 516cd620004SJunchao Zhang } 517cd620004SJunchao Zhang 518cd620004SJunchao Zhang if (use_gpu_aware_mpi) { 519cd620004SJunchao Zhang rootmtype_mpi = rootmtype; 520cd620004SJunchao Zhang leafmtype_mpi = leafmtype; 521cd620004SJunchao Zhang } else { 522cd620004SJunchao Zhang rootmtype_mpi = leafmtype_mpi = PETSC_MEMTYPE_HOST; 523cd620004SJunchao Zhang } 524cd620004SJunchao Zhang /* Will root/leafdata be directly accessed by MPI? Without use_gpu_aware_mpi, device data is bufferred on host and then passed to MPI */ 525cd620004SJunchao Zhang rootdirect_mpi = rootdirect[PETSCSF_REMOTE] && (rootmtype_mpi == rootmtype)? 1 : 0; 526cd620004SJunchao Zhang leafdirect_mpi = leafdirect[PETSCSF_REMOTE] && (leafmtype_mpi == leafmtype)? 1 : 0; 527cd620004SJunchao Zhang 528cd620004SJunchao Zhang direction = (sfop == PETSCSF_BCAST)? PETSCSF_ROOT2LEAF : PETSCSF_LEAF2ROOT; 529cd620004SJunchao Zhang nrootreqs = bas->nrootreqs; 530cd620004SJunchao Zhang nleafreqs = sf->nleafreqs; 531cd620004SJunchao Zhang 532cd620004SJunchao Zhang /* Look for free links in cache */ 533cd620004SJunchao Zhang for (p=&bas->avail; (link=*p); p=&link->next) { 534cd620004SJunchao Zhang ierr = MPIPetsc_Type_compare(unit,link->unit,&match);CHKERRQ(ierr); 535cd620004SJunchao Zhang if (match) { 536cd620004SJunchao Zhang /* If root/leafdata will be directly passed to MPI, test if the data used to initialized the MPI requests matches with current. 537cd620004SJunchao Zhang If not, free old requests. New requests will be lazily init'ed until one calls PetscSFLinkGetMPIBuffersAndRequests(). 538cd620004SJunchao Zhang */ 539cd620004SJunchao Zhang if (rootdirect_mpi && sf->persistent && link->rootreqsinited[direction][rootmtype][1] && link->rootdatadirect[direction][rootmtype] != rootdata) { 540cd620004SJunchao Zhang reqs = link->rootreqs[direction][rootmtype][1]; /* Here, rootmtype = rootmtype_mpi */ 541cd620004SJunchao Zhang for (i=0; i<nrootreqs; i++) {if (reqs[i] != MPI_REQUEST_NULL) {ierr = MPI_Request_free(&reqs[i]);CHKERRQ(ierr);}} 542cd620004SJunchao Zhang link->rootreqsinited[direction][rootmtype][1] = PETSC_FALSE; 543cd620004SJunchao Zhang } 544cd620004SJunchao Zhang if (leafdirect_mpi && sf->persistent && link->leafreqsinited[direction][leafmtype][1] && link->leafdatadirect[direction][leafmtype] != leafdata) { 545cd620004SJunchao Zhang reqs = link->leafreqs[direction][leafmtype][1]; 546cd620004SJunchao Zhang for (i=0; i<nleafreqs; i++) {if (reqs[i] != MPI_REQUEST_NULL) {ierr = MPI_Request_free(&reqs[i]);CHKERRQ(ierr);}} 547cd620004SJunchao Zhang link->leafreqsinited[direction][leafmtype][1] = PETSC_FALSE; 548cd620004SJunchao Zhang } 549cd620004SJunchao Zhang *p = link->next; /* Remove from available list */ 550cd620004SJunchao Zhang goto found; 551cd620004SJunchao Zhang } 552cd620004SJunchao Zhang } 553cd620004SJunchao Zhang 554cd620004SJunchao Zhang ierr = PetscNew(&link);CHKERRQ(ierr); 555cd620004SJunchao Zhang ierr = PetscSFLinkSetUp_Host(sf,link,unit);CHKERRQ(ierr); 556cd620004SJunchao Zhang ierr = PetscCommGetNewTag(PetscObjectComm((PetscObject)sf),&link->tag);CHKERRQ(ierr); /* One tag per link */ 557cd620004SJunchao Zhang 558cd620004SJunchao Zhang nreqs = (nrootreqs+nleafreqs)*8; 559cd620004SJunchao Zhang ierr = PetscMalloc1(nreqs,&link->reqs);CHKERRQ(ierr); 560cd620004SJunchao Zhang for (i=0; i<nreqs; i++) link->reqs[i] = MPI_REQUEST_NULL; /* Initialized to NULL so that we know which need to be freed in Destroy */ 561cd620004SJunchao Zhang 562cd620004SJunchao Zhang for (i=0; i<2; i++) { /* Two communication directions */ 563cd620004SJunchao Zhang for (j=0; j<2; j++) { /* Two memory types */ 564cd620004SJunchao Zhang for (k=0; k<2; k++) { /* root/leafdirect 0 or 1 */ 565cd620004SJunchao Zhang link->rootreqs[i][j][k] = link->reqs + nrootreqs*(4*i+2*j+k); 566cd620004SJunchao Zhang link->leafreqs[i][j][k] = link->reqs + nrootreqs*8 + nleafreqs*(4*i+2*j+k); 567cd620004SJunchao Zhang } 568cd620004SJunchao Zhang } 569cd620004SJunchao Zhang } 570cd620004SJunchao Zhang 571cd620004SJunchao Zhang found: 572cd620004SJunchao Zhang if ((rootmtype == PETSC_MEMTYPE_DEVICE || leafmtype == PETSC_MEMTYPE_DEVICE) && !link->deviceinited) {ierr = PetscSFLinkSetUp_Device(sf,link,unit);CHKERRQ(ierr);} 573cd620004SJunchao Zhang 574cd620004SJunchao Zhang /* Allocate buffers along root/leafdata */ 575cd620004SJunchao Zhang for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) { 576cd620004SJunchao Zhang /* For local communication, buffers are only needed when roots and leaves have different mtypes */ 577cd620004SJunchao Zhang if (i == PETSCSF_LOCAL && rootmtype == leafmtype) continue; 578cd620004SJunchao Zhang if (bas->rootbuflen[i]) { 579cd620004SJunchao Zhang if (rootdirect[i]) { /* Aha, we disguise rootdata as rootbuf */ 580cd620004SJunchao Zhang link->rootbuf[i][rootmtype] = (char*)rootdata + bas->rootstart[i]*link->unitbytes; 581cd620004SJunchao Zhang } else { /* Have to have a separate rootbuf */ 582cd620004SJunchao Zhang if (!link->rootbuf_alloc[i][rootmtype]) { 583cd620004SJunchao Zhang ierr = PetscMallocWithMemType(rootmtype,bas->rootbuflen[i]*link->unitbytes,(void**)&link->rootbuf_alloc[i][rootmtype]);CHKERRQ(ierr); 584cd620004SJunchao Zhang } 585cd620004SJunchao Zhang link->rootbuf[i][rootmtype] = link->rootbuf_alloc[i][rootmtype]; 586cd620004SJunchao Zhang } 587cd620004SJunchao Zhang } 588cd620004SJunchao Zhang 589cd620004SJunchao Zhang if (sf->leafbuflen[i]) { 590cd620004SJunchao Zhang if (leafdirect[i]) { 591cd620004SJunchao Zhang link->leafbuf[i][leafmtype] = (char*)leafdata + sf->leafstart[i]*link->unitbytes; 592cd620004SJunchao Zhang } else { 593cd620004SJunchao Zhang if (!link->leafbuf_alloc[i][leafmtype]) { 594cd620004SJunchao Zhang ierr = PetscMallocWithMemType(leafmtype,sf->leafbuflen[i]*link->unitbytes,(void**)&link->leafbuf_alloc[i][leafmtype]);CHKERRQ(ierr); 595cd620004SJunchao Zhang } 596cd620004SJunchao Zhang link->leafbuf[i][leafmtype] = link->leafbuf_alloc[i][leafmtype]; 597cd620004SJunchao Zhang } 598cd620004SJunchao Zhang } 599cd620004SJunchao Zhang } 600cd620004SJunchao Zhang 601cd620004SJunchao Zhang /* Allocate buffers on host for buffering data on device in cast not use_gpu_aware_mpi */ 602cd620004SJunchao Zhang if (rootmtype == PETSC_MEMTYPE_DEVICE && rootmtype_mpi == PETSC_MEMTYPE_HOST) { 603cd620004SJunchao Zhang if(!link->rootbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]) { 604cd620004SJunchao Zhang ierr = PetscMalloc(bas->rootbuflen[PETSCSF_REMOTE]*link->unitbytes,&link->rootbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]);CHKERRQ(ierr); 605cd620004SJunchao Zhang } 606cd620004SJunchao Zhang link->rootbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST] = link->rootbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]; 607cd620004SJunchao Zhang } 608cd620004SJunchao Zhang if (leafmtype == PETSC_MEMTYPE_DEVICE && leafmtype_mpi == PETSC_MEMTYPE_HOST) { 609cd620004SJunchao Zhang if (!link->leafbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]) { 610cd620004SJunchao Zhang ierr = PetscMalloc(sf->leafbuflen[PETSCSF_REMOTE]*link->unitbytes,&link->leafbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]);CHKERRQ(ierr); 611cd620004SJunchao Zhang } 612cd620004SJunchao Zhang link->leafbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST] = link->leafbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]; 613cd620004SJunchao Zhang } 614cd620004SJunchao Zhang 615cd620004SJunchao Zhang /* Set `current` state of the link. They may change between different SF invocations with the same link */ 616cd620004SJunchao Zhang if (sf->persistent) { /* If data is directly passed to MPI and inits MPI requests, record the data for comparison on future invocations */ 617cd620004SJunchao Zhang if (rootdirect_mpi) link->rootdatadirect[direction][rootmtype] = rootdata; 618cd620004SJunchao Zhang if (leafdirect_mpi) link->leafdatadirect[direction][leafmtype] = leafdata; 619cd620004SJunchao Zhang } 620cd620004SJunchao Zhang 621cd620004SJunchao Zhang link->rootdata = rootdata; /* root/leafdata are keys to look up links in PetscSFXxxEnd */ 622cd620004SJunchao Zhang link->leafdata = leafdata; 623cd620004SJunchao Zhang for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) { 624cd620004SJunchao Zhang link->rootdirect[i] = rootdirect[i]; 625cd620004SJunchao Zhang link->leafdirect[i] = leafdirect[i]; 626cd620004SJunchao Zhang } 627cd620004SJunchao Zhang link->rootdirect_mpi = rootdirect_mpi; 628cd620004SJunchao Zhang link->leafdirect_mpi = leafdirect_mpi; 629cd620004SJunchao Zhang link->rootmtype = rootmtype; 630cd620004SJunchao Zhang link->leafmtype = leafmtype; 631cd620004SJunchao Zhang link->rootmtype_mpi = rootmtype_mpi; 632cd620004SJunchao Zhang link->leafmtype_mpi = leafmtype_mpi; 633cd620004SJunchao Zhang 634cd620004SJunchao Zhang link->next = bas->inuse; 635cd620004SJunchao Zhang bas->inuse = link; 636cd620004SJunchao Zhang *mylink = link; 637cd620004SJunchao Zhang PetscFunctionReturn(0); 638cd620004SJunchao Zhang } 639cd620004SJunchao Zhang 640cd620004SJunchao Zhang /* Return root/leaf buffers and MPI requests attached to the link for MPI communication in the given direction. 641cd620004SJunchao Zhang If the sf uses persistent requests and the requests have not been initialized, then initialize them. 642cd620004SJunchao Zhang */ 643cd620004SJunchao Zhang PetscErrorCode PetscSFLinkGetMPIBuffersAndRequests(PetscSF sf,PetscSFLink link,PetscSFDirection direction,void **rootbuf, void **leafbuf,MPI_Request **rootreqs,MPI_Request **leafreqs) 644cd620004SJunchao Zhang { 645cd620004SJunchao Zhang PetscErrorCode ierr; 646cd620004SJunchao Zhang PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 647cd620004SJunchao Zhang PetscInt i,j,nrootranks,ndrootranks,nleafranks,ndleafranks; 648cd620004SJunchao Zhang const PetscInt *rootoffset,*leafoffset; 649cd620004SJunchao Zhang PetscMPIInt n; 650cd620004SJunchao Zhang MPI_Aint disp; 651cd620004SJunchao Zhang MPI_Comm comm = PetscObjectComm((PetscObject)sf); 652cd620004SJunchao Zhang MPI_Datatype unit = link->unit; 653cd620004SJunchao Zhang const PetscMemType rootmtype_mpi = link->rootmtype_mpi,leafmtype_mpi = link->leafmtype_mpi; /* Used to select buffers passed to MPI */ 654cd620004SJunchao Zhang const PetscInt rootdirect_mpi = link->rootdirect_mpi,leafdirect_mpi = link->leafdirect_mpi; 655cd620004SJunchao Zhang 656cd620004SJunchao Zhang PetscFunctionBegin; 657cd620004SJunchao Zhang /* Init persistent MPI requests if not yet. Currently only SFBasic uses persistent MPI */ 658cd620004SJunchao Zhang if (sf->persistent) { 659cd620004SJunchao Zhang if (rootreqs && bas->rootbuflen[PETSCSF_REMOTE] && !link->rootreqsinited[direction][rootmtype_mpi][rootdirect_mpi]) { 660cd620004SJunchao Zhang ierr = PetscSFGetRootInfo_Basic(sf,&nrootranks,&ndrootranks,NULL,&rootoffset,NULL);CHKERRQ(ierr); 661cd620004SJunchao Zhang if (direction == PETSCSF_LEAF2ROOT) { 662cd620004SJunchao Zhang for (i=ndrootranks,j=0; i<nrootranks; i++,j++) { 663cd620004SJunchao Zhang disp = (rootoffset[i] - rootoffset[ndrootranks])*link->unitbytes; 664cd620004SJunchao Zhang ierr = PetscMPIIntCast(rootoffset[i+1]-rootoffset[i],&n);CHKERRQ(ierr); 665cd620004SJunchao Zhang ierr = MPI_Recv_init(link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi]+disp,n,unit,bas->iranks[i],link->tag,comm,link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi]+j);CHKERRQ(ierr); 666cd620004SJunchao Zhang } 667cd620004SJunchao Zhang } else { /* PETSCSF_ROOT2LEAF */ 668cd620004SJunchao Zhang for (i=ndrootranks,j=0; i<nrootranks; i++,j++) { 669cd620004SJunchao Zhang disp = (rootoffset[i] - rootoffset[ndrootranks])*link->unitbytes; 670cd620004SJunchao Zhang ierr = PetscMPIIntCast(rootoffset[i+1]-rootoffset[i],&n);CHKERRQ(ierr); 671cd620004SJunchao Zhang ierr = MPI_Send_init(link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi]+disp,n,unit,bas->iranks[i],link->tag,comm,link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi]+j);CHKERRQ(ierr); 672cd620004SJunchao Zhang } 673cd620004SJunchao Zhang } 674cd620004SJunchao Zhang link->rootreqsinited[direction][rootmtype_mpi][rootdirect_mpi] = PETSC_TRUE; 675cd620004SJunchao Zhang } 676cd620004SJunchao Zhang 677cd620004SJunchao Zhang if (leafreqs && sf->leafbuflen[PETSCSF_REMOTE] && !link->leafreqsinited[direction][leafmtype_mpi][leafdirect_mpi]) { 678cd620004SJunchao Zhang ierr = PetscSFGetLeafInfo_Basic(sf,&nleafranks,&ndleafranks,NULL,&leafoffset,NULL,NULL);CHKERRQ(ierr); 679cd620004SJunchao Zhang if (direction == PETSCSF_LEAF2ROOT) { 680cd620004SJunchao Zhang for (i=ndleafranks,j=0; i<nleafranks; i++,j++) { 681cd620004SJunchao Zhang disp = (leafoffset[i] - leafoffset[ndleafranks])*link->unitbytes; 682cd620004SJunchao Zhang ierr = PetscMPIIntCast(leafoffset[i+1]-leafoffset[i],&n);CHKERRQ(ierr); 683cd620004SJunchao Zhang ierr = MPI_Send_init(link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi]+disp,n,unit,sf->ranks[i],link->tag,comm,link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi]+j);CHKERRQ(ierr); 684cd620004SJunchao Zhang } 685cd620004SJunchao Zhang } else { /* PETSCSF_ROOT2LEAF */ 686cd620004SJunchao Zhang for (i=ndleafranks,j=0; i<nleafranks; i++,j++) { 687cd620004SJunchao Zhang disp = (leafoffset[i] - leafoffset[ndleafranks])*link->unitbytes; 688cd620004SJunchao Zhang ierr = PetscMPIIntCast(leafoffset[i+1]-leafoffset[i],&n);CHKERRQ(ierr); 689cd620004SJunchao Zhang ierr = MPI_Recv_init(link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi]+disp,n,unit,sf->ranks[i],link->tag,comm,link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi]+j);CHKERRQ(ierr); 690cd620004SJunchao Zhang } 691cd620004SJunchao Zhang } 692cd620004SJunchao Zhang link->leafreqsinited[direction][leafmtype_mpi][leafdirect_mpi] = PETSC_TRUE; 693cd620004SJunchao Zhang } 694cd620004SJunchao Zhang } 695cd620004SJunchao Zhang if (rootbuf) *rootbuf = link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi]; 696cd620004SJunchao Zhang if (leafbuf) *leafbuf = link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi]; 697cd620004SJunchao Zhang if (rootreqs) *rootreqs = link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi]; 698cd620004SJunchao Zhang if (leafreqs) *leafreqs = link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi]; 699cd620004SJunchao Zhang PetscFunctionReturn(0); 700cd620004SJunchao Zhang } 701cd620004SJunchao Zhang 702cd620004SJunchao Zhang 703cd620004SJunchao Zhang PetscErrorCode PetscSFLinkGetInUse(PetscSF sf,MPI_Datatype unit,const void *rootdata,const void *leafdata,PetscCopyMode cmode,PetscSFLink *mylink) 704cd620004SJunchao Zhang { 705cd620004SJunchao Zhang PetscErrorCode ierr; 706cd620004SJunchao Zhang PetscSFLink link,*p; 70740e23c03SJunchao Zhang PetscSF_Basic *bas=(PetscSF_Basic*)sf->data; 70840e23c03SJunchao Zhang 70940e23c03SJunchao Zhang PetscFunctionBegin; 71040e23c03SJunchao Zhang /* Look for types in cache */ 71140e23c03SJunchao Zhang for (p=&bas->inuse; (link=*p); p=&link->next) { 71240e23c03SJunchao Zhang PetscBool match; 71340e23c03SJunchao Zhang ierr = MPIPetsc_Type_compare(unit,link->unit,&match);CHKERRQ(ierr); 714637e6665SJunchao Zhang if (match && (rootdata == link->rootdata) && (leafdata == link->leafdata)) { 71540e23c03SJunchao Zhang switch (cmode) { 71640e23c03SJunchao Zhang case PETSC_OWN_POINTER: *p = link->next; break; /* Remove from inuse list */ 71740e23c03SJunchao Zhang case PETSC_USE_POINTER: break; 71840e23c03SJunchao Zhang default: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"invalid cmode"); 71940e23c03SJunchao Zhang } 72040e23c03SJunchao Zhang *mylink = link; 72140e23c03SJunchao Zhang PetscFunctionReturn(0); 72240e23c03SJunchao Zhang } 72340e23c03SJunchao Zhang } 72440e23c03SJunchao Zhang SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Could not find pack"); 72540e23c03SJunchao Zhang PetscFunctionReturn(0); 72640e23c03SJunchao Zhang } 72740e23c03SJunchao Zhang 728cd620004SJunchao Zhang PetscErrorCode PetscSFLinkReclaim(PetscSF sf,PetscSFLink *link) 72940e23c03SJunchao Zhang { 73040e23c03SJunchao Zhang PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 73140e23c03SJunchao Zhang 73240e23c03SJunchao Zhang PetscFunctionBegin; 733637e6665SJunchao Zhang (*link)->rootdata = NULL; 734637e6665SJunchao Zhang (*link)->leafdata = NULL; 73540e23c03SJunchao Zhang (*link)->next = bas->avail; 73640e23c03SJunchao Zhang bas->avail = *link; 73740e23c03SJunchao Zhang *link = NULL; 73840e23c03SJunchao Zhang PetscFunctionReturn(0); 73940e23c03SJunchao Zhang } 74040e23c03SJunchao Zhang 741cd620004SJunchao Zhang /* Destroy all links chained in 'avail' */ 742cd620004SJunchao Zhang PetscErrorCode PetscSFLinkDestroy(PetscSF sf,PetscSFLink *avail) 743eb02082bSJunchao Zhang { 744eb02082bSJunchao Zhang PetscErrorCode ierr; 745cd620004SJunchao Zhang PetscSFLink link = *avail,next; 746cd620004SJunchao Zhang PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 747cd620004SJunchao Zhang PetscInt i,nreqs = (bas->nrootreqs+sf->nleafreqs)*8; 748eb02082bSJunchao Zhang 749eb02082bSJunchao Zhang PetscFunctionBegin; 750eb02082bSJunchao Zhang for (; link; link=next) { 751eb02082bSJunchao Zhang next = link->next; 752eb02082bSJunchao Zhang if (!link->isbuiltin) {ierr = MPI_Type_free(&link->unit);CHKERRQ(ierr);} 753cd620004SJunchao Zhang for (i=0; i<nreqs; i++) { /* Persistent reqs must be freed. */ 754eb02082bSJunchao Zhang if (link->reqs[i] != MPI_REQUEST_NULL) {ierr = MPI_Request_free(&link->reqs[i]);CHKERRQ(ierr);} 755eb02082bSJunchao Zhang } 756eb02082bSJunchao Zhang ierr = PetscFree(link->reqs);CHKERRQ(ierr); 757cd620004SJunchao Zhang for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) { 75851ccb202SJunchao Zhang #if defined(PETSC_HAVE_CUDA) 759cd620004SJunchao Zhang ierr = PetscFreeWithMemType(PETSC_MEMTYPE_DEVICE,link->rootbuf_alloc[i][PETSC_MEMTYPE_DEVICE]);CHKERRQ(ierr); 760cd620004SJunchao Zhang ierr = PetscFreeWithMemType(PETSC_MEMTYPE_DEVICE,link->leafbuf_alloc[i][PETSC_MEMTYPE_DEVICE]);CHKERRQ(ierr); 761eb02082bSJunchao Zhang if (link->stream) {cudaError_t err = cudaStreamDestroy(link->stream);CHKERRCUDA(err); link->stream = NULL;} 762eb02082bSJunchao Zhang #endif 763cd620004SJunchao Zhang ierr = PetscFree(link->rootbuf_alloc[i][PETSC_MEMTYPE_HOST]);CHKERRQ(ierr); 764cd620004SJunchao Zhang ierr = PetscFree(link->leafbuf_alloc[i][PETSC_MEMTYPE_HOST]);CHKERRQ(ierr); 765cd620004SJunchao Zhang } 766eb02082bSJunchao Zhang ierr = PetscFree(link);CHKERRQ(ierr); 767eb02082bSJunchao Zhang } 768eb02082bSJunchao Zhang *avail = NULL; 769eb02082bSJunchao Zhang PetscFunctionReturn(0); 770eb02082bSJunchao Zhang } 771eb02082bSJunchao Zhang 772cd620004SJunchao Zhang #if defined(PETSC_USE_DEBUG) 7739d1c8addSJunchao Zhang /* Error out on unsupported overlapped communications */ 774cd620004SJunchao Zhang PetscErrorCode PetscSFSetErrorOnUnsupportedOverlap(PetscSF sf,MPI_Datatype unit,const void *rootdata,const void *leafdata) 7759d1c8addSJunchao Zhang { 7769d1c8addSJunchao Zhang PetscErrorCode ierr; 777cd620004SJunchao Zhang PetscSFLink link,*p; 7789d1c8addSJunchao Zhang PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 7799d1c8addSJunchao Zhang PetscBool match; 7809d1c8addSJunchao Zhang 7819d1c8addSJunchao Zhang PetscFunctionBegin; 78218fb5014SJunchao Zhang /* Look up links in use and error out if there is a match. When both rootdata and leafdata are NULL, ignore 78318fb5014SJunchao Zhang the potential overlapping since this process does not participate in communication. Overlapping is harmless. 78418fb5014SJunchao Zhang */ 785637e6665SJunchao Zhang if (rootdata || leafdata) { 7869d1c8addSJunchao Zhang for (p=&bas->inuse; (link=*p); p=&link->next) { 7879d1c8addSJunchao Zhang ierr = MPIPetsc_Type_compare(unit,link->unit,&match);CHKERRQ(ierr); 788cd620004SJunchao Zhang if (match && (rootdata == link->rootdata) && (leafdata == link->leafdata)) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_SUP,"Overlapped PetscSF with the same rootdata(%p), leafdata(%p) and data type. Undo the overlapping to avoid the error.",rootdata,leafdata); 7899d1c8addSJunchao Zhang } 79018fb5014SJunchao Zhang } 7919d1c8addSJunchao Zhang PetscFunctionReturn(0); 7929d1c8addSJunchao Zhang } 793cd620004SJunchao Zhang #endif 7949d1c8addSJunchao Zhang 795cd620004SJunchao Zhang PetscErrorCode PetscSFLinkSetUp_Host(PetscSF sf,PetscSFLink link,MPI_Datatype unit) 79640e23c03SJunchao Zhang { 79740e23c03SJunchao Zhang PetscErrorCode ierr; 798b23bfdefSJunchao Zhang PetscInt nSignedChar=0,nUnsignedChar=0,nInt=0,nPetscInt=0,nPetscReal=0; 799b23bfdefSJunchao Zhang PetscBool is2Int,is2PetscInt; 80040e23c03SJunchao Zhang PetscMPIInt ni,na,nd,combiner; 80140e23c03SJunchao Zhang #if defined(PETSC_HAVE_COMPLEX) 802b23bfdefSJunchao Zhang PetscInt nPetscComplex=0; 80340e23c03SJunchao Zhang #endif 80440e23c03SJunchao Zhang 80540e23c03SJunchao Zhang PetscFunctionBegin; 806b23bfdefSJunchao Zhang ierr = MPIPetsc_Type_compare_contig(unit,MPI_SIGNED_CHAR, &nSignedChar);CHKERRQ(ierr); 807b23bfdefSJunchao Zhang ierr = MPIPetsc_Type_compare_contig(unit,MPI_UNSIGNED_CHAR,&nUnsignedChar);CHKERRQ(ierr); 808b23bfdefSJunchao Zhang /* MPI_CHAR is treated below as a dumb type that does not support reduction according to MPI standard */ 809b23bfdefSJunchao Zhang ierr = MPIPetsc_Type_compare_contig(unit,MPI_INT, &nInt);CHKERRQ(ierr); 810b23bfdefSJunchao Zhang ierr = MPIPetsc_Type_compare_contig(unit,MPIU_INT, &nPetscInt);CHKERRQ(ierr); 811b23bfdefSJunchao Zhang ierr = MPIPetsc_Type_compare_contig(unit,MPIU_REAL,&nPetscReal);CHKERRQ(ierr); 81240e23c03SJunchao Zhang #if defined(PETSC_HAVE_COMPLEX) 813b23bfdefSJunchao Zhang ierr = MPIPetsc_Type_compare_contig(unit,MPIU_COMPLEX,&nPetscComplex);CHKERRQ(ierr); 81440e23c03SJunchao Zhang #endif 81540e23c03SJunchao Zhang ierr = MPIPetsc_Type_compare(unit,MPI_2INT,&is2Int);CHKERRQ(ierr); 81640e23c03SJunchao Zhang ierr = MPIPetsc_Type_compare(unit,MPIU_2INT,&is2PetscInt);CHKERRQ(ierr); 817b23bfdefSJunchao Zhang /* TODO: shaell we also handle Fortran MPI_2REAL? */ 81840e23c03SJunchao Zhang ierr = MPI_Type_get_envelope(unit,&ni,&na,&nd,&combiner);CHKERRQ(ierr); 8195ad15460SJunchao Zhang link->isbuiltin = (combiner == MPI_COMBINER_NAMED) ? PETSC_TRUE : PETSC_FALSE; /* unit is MPI builtin */ 820b23bfdefSJunchao Zhang link->bs = 1; /* default */ 82140e23c03SJunchao Zhang 822eb02082bSJunchao Zhang if (is2Int) { 823cd620004SJunchao Zhang PackInit_PairType_int_int_1_1(link); 824eb02082bSJunchao Zhang link->bs = 1; 825eb02082bSJunchao Zhang link->unitbytes = 2*sizeof(int); 8265ad15460SJunchao Zhang link->isbuiltin = PETSC_TRUE; /* unit is PETSc builtin */ 827eb02082bSJunchao Zhang link->basicunit = MPI_2INT; 8285ad15460SJunchao Zhang link->unit = MPI_2INT; 829eb02082bSJunchao Zhang } else if (is2PetscInt) { /* TODO: when is2PetscInt and nPetscInt=2, we don't know which path to take. The two paths support different ops. */ 830cd620004SJunchao Zhang PackInit_PairType_PetscInt_PetscInt_1_1(link); 831eb02082bSJunchao Zhang link->bs = 1; 832eb02082bSJunchao Zhang link->unitbytes = 2*sizeof(PetscInt); 833eb02082bSJunchao Zhang link->basicunit = MPIU_2INT; 8345ad15460SJunchao Zhang link->isbuiltin = PETSC_TRUE; /* unit is PETSc builtin */ 8355ad15460SJunchao Zhang link->unit = MPIU_2INT; 836eb02082bSJunchao Zhang } else if (nPetscReal) { 837b23bfdefSJunchao Zhang if (nPetscReal == 8) PackInit_RealType_PetscReal_8_1(link); else if (nPetscReal%8 == 0) PackInit_RealType_PetscReal_8_0(link); 838b23bfdefSJunchao Zhang else if (nPetscReal == 4) PackInit_RealType_PetscReal_4_1(link); else if (nPetscReal%4 == 0) PackInit_RealType_PetscReal_4_0(link); 839b23bfdefSJunchao Zhang else if (nPetscReal == 2) PackInit_RealType_PetscReal_2_1(link); else if (nPetscReal%2 == 0) PackInit_RealType_PetscReal_2_0(link); 840b23bfdefSJunchao Zhang else if (nPetscReal == 1) PackInit_RealType_PetscReal_1_1(link); else if (nPetscReal%1 == 0) PackInit_RealType_PetscReal_1_0(link); 841b23bfdefSJunchao Zhang link->bs = nPetscReal; 842eb02082bSJunchao Zhang link->unitbytes = nPetscReal*sizeof(PetscReal); 84340e23c03SJunchao Zhang link->basicunit = MPIU_REAL; 8445ad15460SJunchao Zhang if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPIU_REAL;} 845b23bfdefSJunchao Zhang } else if (nPetscInt) { 846b23bfdefSJunchao Zhang if (nPetscInt == 8) PackInit_IntegerType_PetscInt_8_1(link); else if (nPetscInt%8 == 0) PackInit_IntegerType_PetscInt_8_0(link); 847b23bfdefSJunchao Zhang else if (nPetscInt == 4) PackInit_IntegerType_PetscInt_4_1(link); else if (nPetscInt%4 == 0) PackInit_IntegerType_PetscInt_4_0(link); 848b23bfdefSJunchao Zhang else if (nPetscInt == 2) PackInit_IntegerType_PetscInt_2_1(link); else if (nPetscInt%2 == 0) PackInit_IntegerType_PetscInt_2_0(link); 849b23bfdefSJunchao Zhang else if (nPetscInt == 1) PackInit_IntegerType_PetscInt_1_1(link); else if (nPetscInt%1 == 0) PackInit_IntegerType_PetscInt_1_0(link); 850b23bfdefSJunchao Zhang link->bs = nPetscInt; 851eb02082bSJunchao Zhang link->unitbytes = nPetscInt*sizeof(PetscInt); 852b23bfdefSJunchao Zhang link->basicunit = MPIU_INT; 8535ad15460SJunchao Zhang if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPIU_INT;} 854b23bfdefSJunchao Zhang #if defined(PETSC_USE_64BIT_INDICES) 855b23bfdefSJunchao Zhang } else if (nInt) { 856b23bfdefSJunchao Zhang if (nInt == 8) PackInit_IntegerType_int_8_1(link); else if (nInt%8 == 0) PackInit_IntegerType_int_8_0(link); 857b23bfdefSJunchao Zhang else if (nInt == 4) PackInit_IntegerType_int_4_1(link); else if (nInt%4 == 0) PackInit_IntegerType_int_4_0(link); 858b23bfdefSJunchao Zhang else if (nInt == 2) PackInit_IntegerType_int_2_1(link); else if (nInt%2 == 0) PackInit_IntegerType_int_2_0(link); 859b23bfdefSJunchao Zhang else if (nInt == 1) PackInit_IntegerType_int_1_1(link); else if (nInt%1 == 0) PackInit_IntegerType_int_1_0(link); 860b23bfdefSJunchao Zhang link->bs = nInt; 861eb02082bSJunchao Zhang link->unitbytes = nInt*sizeof(int); 862b23bfdefSJunchao Zhang link->basicunit = MPI_INT; 8635ad15460SJunchao Zhang if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPI_INT;} 864b23bfdefSJunchao Zhang #endif 865b23bfdefSJunchao Zhang } else if (nSignedChar) { 866b23bfdefSJunchao Zhang if (nSignedChar == 8) PackInit_IntegerType_SignedChar_8_1(link); else if (nSignedChar%8 == 0) PackInit_IntegerType_SignedChar_8_0(link); 867b23bfdefSJunchao Zhang else if (nSignedChar == 4) PackInit_IntegerType_SignedChar_4_1(link); else if (nSignedChar%4 == 0) PackInit_IntegerType_SignedChar_4_0(link); 868b23bfdefSJunchao Zhang else if (nSignedChar == 2) PackInit_IntegerType_SignedChar_2_1(link); else if (nSignedChar%2 == 0) PackInit_IntegerType_SignedChar_2_0(link); 869b23bfdefSJunchao Zhang else if (nSignedChar == 1) PackInit_IntegerType_SignedChar_1_1(link); else if (nSignedChar%1 == 0) PackInit_IntegerType_SignedChar_1_0(link); 870b23bfdefSJunchao Zhang link->bs = nSignedChar; 871eb02082bSJunchao Zhang link->unitbytes = nSignedChar*sizeof(SignedChar); 872b23bfdefSJunchao Zhang link->basicunit = MPI_SIGNED_CHAR; 8735ad15460SJunchao Zhang if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPI_SIGNED_CHAR;} 874b23bfdefSJunchao Zhang } else if (nUnsignedChar) { 875b23bfdefSJunchao Zhang if (nUnsignedChar == 8) PackInit_IntegerType_UnsignedChar_8_1(link); else if (nUnsignedChar%8 == 0) PackInit_IntegerType_UnsignedChar_8_0(link); 876b23bfdefSJunchao Zhang else if (nUnsignedChar == 4) PackInit_IntegerType_UnsignedChar_4_1(link); else if (nUnsignedChar%4 == 0) PackInit_IntegerType_UnsignedChar_4_0(link); 877b23bfdefSJunchao Zhang else if (nUnsignedChar == 2) PackInit_IntegerType_UnsignedChar_2_1(link); else if (nUnsignedChar%2 == 0) PackInit_IntegerType_UnsignedChar_2_0(link); 878b23bfdefSJunchao Zhang else if (nUnsignedChar == 1) PackInit_IntegerType_UnsignedChar_1_1(link); else if (nUnsignedChar%1 == 0) PackInit_IntegerType_UnsignedChar_1_0(link); 879b23bfdefSJunchao Zhang link->bs = nUnsignedChar; 880eb02082bSJunchao Zhang link->unitbytes = nUnsignedChar*sizeof(UnsignedChar); 881b23bfdefSJunchao Zhang link->basicunit = MPI_UNSIGNED_CHAR; 8825ad15460SJunchao Zhang if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPI_UNSIGNED_CHAR;} 88340e23c03SJunchao Zhang #if defined(PETSC_HAVE_COMPLEX) 884b23bfdefSJunchao Zhang } else if (nPetscComplex) { 885b23bfdefSJunchao Zhang if (nPetscComplex == 8) PackInit_ComplexType_PetscComplex_8_1(link); else if (nPetscComplex%8 == 0) PackInit_ComplexType_PetscComplex_8_0(link); 886b23bfdefSJunchao Zhang else if (nPetscComplex == 4) PackInit_ComplexType_PetscComplex_4_1(link); else if (nPetscComplex%4 == 0) PackInit_ComplexType_PetscComplex_4_0(link); 887b23bfdefSJunchao Zhang else if (nPetscComplex == 2) PackInit_ComplexType_PetscComplex_2_1(link); else if (nPetscComplex%2 == 0) PackInit_ComplexType_PetscComplex_2_0(link); 888b23bfdefSJunchao Zhang else if (nPetscComplex == 1) PackInit_ComplexType_PetscComplex_1_1(link); else if (nPetscComplex%1 == 0) PackInit_ComplexType_PetscComplex_1_0(link); 889b23bfdefSJunchao Zhang link->bs = nPetscComplex; 890eb02082bSJunchao Zhang link->unitbytes = nPetscComplex*sizeof(PetscComplex); 89140e23c03SJunchao Zhang link->basicunit = MPIU_COMPLEX; 8925ad15460SJunchao Zhang if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPIU_COMPLEX;} 89340e23c03SJunchao Zhang #endif 89440e23c03SJunchao Zhang } else { 895b23bfdefSJunchao Zhang MPI_Aint lb,nbyte; 896b23bfdefSJunchao Zhang ierr = MPI_Type_get_extent(unit,&lb,&nbyte);CHKERRQ(ierr); 89740e23c03SJunchao Zhang if (lb != 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Datatype with nonzero lower bound %ld\n",(long)lb); 898eb02082bSJunchao Zhang if (nbyte % sizeof(int)) { /* If the type size is not multiple of int */ 899eb02082bSJunchao Zhang if (nbyte == 4) PackInit_DumbType_char_4_1(link); else if (nbyte%4 == 0) PackInit_DumbType_char_4_0(link); 900eb02082bSJunchao Zhang else if (nbyte == 2) PackInit_DumbType_char_2_1(link); else if (nbyte%2 == 0) PackInit_DumbType_char_2_0(link); 901eb02082bSJunchao Zhang else if (nbyte == 1) PackInit_DumbType_char_1_1(link); else if (nbyte%1 == 0) PackInit_DumbType_char_1_0(link); 902eb02082bSJunchao Zhang link->bs = nbyte; 903b23bfdefSJunchao Zhang link->unitbytes = nbyte; 904b23bfdefSJunchao Zhang link->basicunit = MPI_BYTE; 90540e23c03SJunchao Zhang } else { 906eb02082bSJunchao Zhang nInt = nbyte / sizeof(int); 907eb02082bSJunchao Zhang if (nInt == 8) PackInit_DumbType_DumbInt_8_1(link); else if (nInt%8 == 0) PackInit_DumbType_DumbInt_8_0(link); 908eb02082bSJunchao Zhang else if (nInt == 4) PackInit_DumbType_DumbInt_4_1(link); else if (nInt%4 == 0) PackInit_DumbType_DumbInt_4_0(link); 909eb02082bSJunchao Zhang else if (nInt == 2) PackInit_DumbType_DumbInt_2_1(link); else if (nInt%2 == 0) PackInit_DumbType_DumbInt_2_0(link); 910eb02082bSJunchao Zhang else if (nInt == 1) PackInit_DumbType_DumbInt_1_1(link); else if (nInt%1 == 0) PackInit_DumbType_DumbInt_1_0(link); 911eb02082bSJunchao Zhang link->bs = nInt; 912b23bfdefSJunchao Zhang link->unitbytes = nbyte; 91340e23c03SJunchao Zhang link->basicunit = MPI_INT; 91440e23c03SJunchao Zhang } 9155ad15460SJunchao Zhang if (link->isbuiltin) link->unit = unit; 91640e23c03SJunchao Zhang } 917b23bfdefSJunchao Zhang 9185ad15460SJunchao Zhang if (!link->isbuiltin) {ierr = MPI_Type_dup(unit,&link->unit);CHKERRQ(ierr);} 91940e23c03SJunchao Zhang PetscFunctionReturn(0); 92040e23c03SJunchao Zhang } 92140e23c03SJunchao Zhang 922*fcc7397dSJunchao Zhang PetscErrorCode PetscSFLinkGetUnpackAndOp(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*)) 92340e23c03SJunchao Zhang { 92440e23c03SJunchao Zhang PetscFunctionBegin; 92540e23c03SJunchao Zhang *UnpackAndOp = NULL; 926eb02082bSJunchao Zhang if (mtype == PETSC_MEMTYPE_HOST) { 927eb02082bSJunchao Zhang if (op == MPIU_REPLACE) *UnpackAndOp = link->h_UnpackAndInsert; 928eb02082bSJunchao Zhang else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->h_UnpackAndAdd; 929eb02082bSJunchao Zhang else if (op == MPI_PROD) *UnpackAndOp = link->h_UnpackAndMult; 930eb02082bSJunchao Zhang else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->h_UnpackAndMax; 931eb02082bSJunchao Zhang else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->h_UnpackAndMin; 932eb02082bSJunchao Zhang else if (op == MPI_LAND) *UnpackAndOp = link->h_UnpackAndLAND; 933eb02082bSJunchao Zhang else if (op == MPI_BAND) *UnpackAndOp = link->h_UnpackAndBAND; 934eb02082bSJunchao Zhang else if (op == MPI_LOR) *UnpackAndOp = link->h_UnpackAndLOR; 935eb02082bSJunchao Zhang else if (op == MPI_BOR) *UnpackAndOp = link->h_UnpackAndBOR; 936eb02082bSJunchao Zhang else if (op == MPI_LXOR) *UnpackAndOp = link->h_UnpackAndLXOR; 937eb02082bSJunchao Zhang else if (op == MPI_BXOR) *UnpackAndOp = link->h_UnpackAndBXOR; 938eb02082bSJunchao Zhang else if (op == MPI_MAXLOC) *UnpackAndOp = link->h_UnpackAndMaxloc; 939eb02082bSJunchao Zhang else if (op == MPI_MINLOC) *UnpackAndOp = link->h_UnpackAndMinloc; 940eb02082bSJunchao Zhang } 941eb02082bSJunchao Zhang #if defined(PETSC_HAVE_CUDA) 942eb02082bSJunchao Zhang else if (mtype == PETSC_MEMTYPE_DEVICE && !atomic) { 943eb02082bSJunchao Zhang if (op == MPIU_REPLACE) *UnpackAndOp = link->d_UnpackAndInsert; 944eb02082bSJunchao Zhang else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->d_UnpackAndAdd; 945eb02082bSJunchao Zhang else if (op == MPI_PROD) *UnpackAndOp = link->d_UnpackAndMult; 946eb02082bSJunchao Zhang else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->d_UnpackAndMax; 947eb02082bSJunchao Zhang else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->d_UnpackAndMin; 948eb02082bSJunchao Zhang else if (op == MPI_LAND) *UnpackAndOp = link->d_UnpackAndLAND; 949eb02082bSJunchao Zhang else if (op == MPI_BAND) *UnpackAndOp = link->d_UnpackAndBAND; 950eb02082bSJunchao Zhang else if (op == MPI_LOR) *UnpackAndOp = link->d_UnpackAndLOR; 951eb02082bSJunchao Zhang else if (op == MPI_BOR) *UnpackAndOp = link->d_UnpackAndBOR; 952eb02082bSJunchao Zhang else if (op == MPI_LXOR) *UnpackAndOp = link->d_UnpackAndLXOR; 953eb02082bSJunchao Zhang else if (op == MPI_BXOR) *UnpackAndOp = link->d_UnpackAndBXOR; 954eb02082bSJunchao Zhang else if (op == MPI_MAXLOC) *UnpackAndOp = link->d_UnpackAndMaxloc; 955eb02082bSJunchao Zhang else if (op == MPI_MINLOC) *UnpackAndOp = link->d_UnpackAndMinloc; 956eb02082bSJunchao Zhang } else if (mtype == PETSC_MEMTYPE_DEVICE && atomic) { 957eb02082bSJunchao Zhang if (op == MPIU_REPLACE) *UnpackAndOp = link->da_UnpackAndInsert; 958eb02082bSJunchao Zhang else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->da_UnpackAndAdd; 959eb02082bSJunchao Zhang else if (op == MPI_PROD) *UnpackAndOp = link->da_UnpackAndMult; 960eb02082bSJunchao Zhang else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->da_UnpackAndMax; 961eb02082bSJunchao Zhang else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->da_UnpackAndMin; 962eb02082bSJunchao Zhang else if (op == MPI_LAND) *UnpackAndOp = link->da_UnpackAndLAND; 963eb02082bSJunchao Zhang else if (op == MPI_BAND) *UnpackAndOp = link->da_UnpackAndBAND; 964eb02082bSJunchao Zhang else if (op == MPI_LOR) *UnpackAndOp = link->da_UnpackAndLOR; 965eb02082bSJunchao Zhang else if (op == MPI_BOR) *UnpackAndOp = link->da_UnpackAndBOR; 966eb02082bSJunchao Zhang else if (op == MPI_LXOR) *UnpackAndOp = link->da_UnpackAndLXOR; 967eb02082bSJunchao Zhang else if (op == MPI_BXOR) *UnpackAndOp = link->da_UnpackAndBXOR; 968eb02082bSJunchao Zhang else if (op == MPI_MAXLOC) *UnpackAndOp = link->da_UnpackAndMaxloc; 969eb02082bSJunchao Zhang else if (op == MPI_MINLOC) *UnpackAndOp = link->da_UnpackAndMinloc; 970eb02082bSJunchao Zhang } 971eb02082bSJunchao Zhang #endif 97240e23c03SJunchao Zhang PetscFunctionReturn(0); 97340e23c03SJunchao Zhang } 97440e23c03SJunchao Zhang 975*fcc7397dSJunchao Zhang PetscErrorCode PetscSFLinkGetScatterAndOp(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**ScatterAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*)) 97640e23c03SJunchao Zhang { 97740e23c03SJunchao Zhang PetscFunctionBegin; 978cd620004SJunchao Zhang *ScatterAndOp = NULL; 979eb02082bSJunchao Zhang if (mtype == PETSC_MEMTYPE_HOST) { 980cd620004SJunchao Zhang if (op == MPIU_REPLACE) *ScatterAndOp = link->h_ScatterAndInsert; 981cd620004SJunchao Zhang else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->h_ScatterAndAdd; 982cd620004SJunchao Zhang else if (op == MPI_PROD) *ScatterAndOp = link->h_ScatterAndMult; 983cd620004SJunchao Zhang else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->h_ScatterAndMax; 984cd620004SJunchao Zhang else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->h_ScatterAndMin; 985cd620004SJunchao Zhang else if (op == MPI_LAND) *ScatterAndOp = link->h_ScatterAndLAND; 986cd620004SJunchao Zhang else if (op == MPI_BAND) *ScatterAndOp = link->h_ScatterAndBAND; 987cd620004SJunchao Zhang else if (op == MPI_LOR) *ScatterAndOp = link->h_ScatterAndLOR; 988cd620004SJunchao Zhang else if (op == MPI_BOR) *ScatterAndOp = link->h_ScatterAndBOR; 989cd620004SJunchao Zhang else if (op == MPI_LXOR) *ScatterAndOp = link->h_ScatterAndLXOR; 990cd620004SJunchao Zhang else if (op == MPI_BXOR) *ScatterAndOp = link->h_ScatterAndBXOR; 991cd620004SJunchao Zhang else if (op == MPI_MAXLOC) *ScatterAndOp = link->h_ScatterAndMaxloc; 992cd620004SJunchao Zhang else if (op == MPI_MINLOC) *ScatterAndOp = link->h_ScatterAndMinloc; 993eb02082bSJunchao Zhang } 994eb02082bSJunchao Zhang #if defined(PETSC_HAVE_CUDA) 995eb02082bSJunchao Zhang else if (mtype == PETSC_MEMTYPE_DEVICE && !atomic) { 996cd620004SJunchao Zhang if (op == MPIU_REPLACE) *ScatterAndOp = link->d_ScatterAndInsert; 997cd620004SJunchao Zhang else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->d_ScatterAndAdd; 998cd620004SJunchao Zhang else if (op == MPI_PROD) *ScatterAndOp = link->d_ScatterAndMult; 999cd620004SJunchao Zhang else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->d_ScatterAndMax; 1000cd620004SJunchao Zhang else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->d_ScatterAndMin; 1001cd620004SJunchao Zhang else if (op == MPI_LAND) *ScatterAndOp = link->d_ScatterAndLAND; 1002cd620004SJunchao Zhang else if (op == MPI_BAND) *ScatterAndOp = link->d_ScatterAndBAND; 1003cd620004SJunchao Zhang else if (op == MPI_LOR) *ScatterAndOp = link->d_ScatterAndLOR; 1004cd620004SJunchao Zhang else if (op == MPI_BOR) *ScatterAndOp = link->d_ScatterAndBOR; 1005cd620004SJunchao Zhang else if (op == MPI_LXOR) *ScatterAndOp = link->d_ScatterAndLXOR; 1006cd620004SJunchao Zhang else if (op == MPI_BXOR) *ScatterAndOp = link->d_ScatterAndBXOR; 1007cd620004SJunchao Zhang else if (op == MPI_MAXLOC) *ScatterAndOp = link->d_ScatterAndMaxloc; 1008cd620004SJunchao Zhang else if (op == MPI_MINLOC) *ScatterAndOp = link->d_ScatterAndMinloc; 1009eb02082bSJunchao Zhang } else if (mtype == PETSC_MEMTYPE_DEVICE && atomic) { 1010cd620004SJunchao Zhang if (op == MPIU_REPLACE) *ScatterAndOp = link->da_ScatterAndInsert; 1011cd620004SJunchao Zhang else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->da_ScatterAndAdd; 1012cd620004SJunchao Zhang else if (op == MPI_PROD) *ScatterAndOp = link->da_ScatterAndMult; 1013cd620004SJunchao Zhang else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->da_ScatterAndMax; 1014cd620004SJunchao Zhang else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->da_ScatterAndMin; 1015cd620004SJunchao Zhang else if (op == MPI_LAND) *ScatterAndOp = link->da_ScatterAndLAND; 1016cd620004SJunchao Zhang else if (op == MPI_BAND) *ScatterAndOp = link->da_ScatterAndBAND; 1017cd620004SJunchao Zhang else if (op == MPI_LOR) *ScatterAndOp = link->da_ScatterAndLOR; 1018cd620004SJunchao Zhang else if (op == MPI_BOR) *ScatterAndOp = link->da_ScatterAndBOR; 1019cd620004SJunchao Zhang else if (op == MPI_LXOR) *ScatterAndOp = link->da_ScatterAndLXOR; 1020cd620004SJunchao Zhang else if (op == MPI_BXOR) *ScatterAndOp = link->da_ScatterAndBXOR; 1021cd620004SJunchao Zhang else if (op == MPI_MAXLOC) *ScatterAndOp = link->da_ScatterAndMaxloc; 1022cd620004SJunchao Zhang else if (op == MPI_MINLOC) *ScatterAndOp = link->da_ScatterAndMinloc; 1023eb02082bSJunchao Zhang } 1024eb02082bSJunchao Zhang #endif 1025cd620004SJunchao Zhang PetscFunctionReturn(0); 1026cd620004SJunchao Zhang } 1027cd620004SJunchao Zhang 1028*fcc7397dSJunchao Zhang PetscErrorCode PetscSFLinkGetFetchAndOp(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**FetchAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,void*)) 1029cd620004SJunchao Zhang { 1030cd620004SJunchao Zhang PetscFunctionBegin; 1031cd620004SJunchao Zhang *FetchAndOp = NULL; 1032cd620004SJunchao Zhang if (op != MPI_SUM && op != MPIU_SUM) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for MPI_Op in FetchAndOp"); 1033cd620004SJunchao Zhang if (mtype == PETSC_MEMTYPE_HOST) *FetchAndOp = link->h_FetchAndAdd; 1034cd620004SJunchao Zhang #if defined(PETSC_HAVE_CUDA) 1035cd620004SJunchao Zhang else if (mtype == PETSC_MEMTYPE_DEVICE && !atomic) *FetchAndOp = link->d_FetchAndAdd; 1036cd620004SJunchao Zhang else if (mtype == PETSC_MEMTYPE_DEVICE && atomic) *FetchAndOp = link->da_FetchAndAdd; 1037cd620004SJunchao Zhang #endif 1038cd620004SJunchao Zhang PetscFunctionReturn(0); 1039cd620004SJunchao Zhang } 1040cd620004SJunchao Zhang 1041*fcc7397dSJunchao Zhang PetscErrorCode PetscSFLinkGetFetchAndOpLocal(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**FetchAndOpLocal)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*)) 1042cd620004SJunchao Zhang { 1043cd620004SJunchao Zhang PetscFunctionBegin; 1044cd620004SJunchao Zhang *FetchAndOpLocal = NULL; 1045cd620004SJunchao Zhang if (op != MPI_SUM && op != MPIU_SUM) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for MPI_Op in FetchAndOp"); 1046cd620004SJunchao Zhang if (mtype == PETSC_MEMTYPE_HOST) *FetchAndOpLocal = link->h_FetchAndAddLocal; 1047cd620004SJunchao Zhang #if defined(PETSC_HAVE_CUDA) 1048cd620004SJunchao Zhang else if (mtype == PETSC_MEMTYPE_DEVICE && !atomic) *FetchAndOpLocal = link->d_FetchAndAddLocal; 1049cd620004SJunchao Zhang else if (mtype == PETSC_MEMTYPE_DEVICE && atomic) *FetchAndOpLocal = link->da_FetchAndAddLocal; 1050cd620004SJunchao Zhang #endif 1051cd620004SJunchao Zhang PetscFunctionReturn(0); 1052cd620004SJunchao Zhang } 1053cd620004SJunchao Zhang 1054cd620004SJunchao Zhang /*============================================================================= 1055cd620004SJunchao Zhang A set of helper routines for Pack/Unpack/Scatter on GPUs 1056cd620004SJunchao Zhang ============================================================================*/ 1057cd620004SJunchao Zhang #if defined(PETSC_HAVE_CUDA) 1058cd620004SJunchao Zhang /* If SF does not know which stream root/leafdata is being computed on, it has to sync the device to 1059cd620004SJunchao Zhang make sure the data is ready for packing. 1060cd620004SJunchao Zhang */ 1061cd620004SJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkSyncDeviceBeforePackData(PetscSF sf,PetscSFLink link) 1062cd620004SJunchao Zhang { 1063cd620004SJunchao Zhang PetscFunctionBegin; 1064cd620004SJunchao Zhang if (sf->use_default_stream) PetscFunctionReturn(0); 1065cd620004SJunchao Zhang if (link->rootmtype == PETSC_MEMTYPE_DEVICE || link->leafmtype == PETSC_MEMTYPE_DEVICE) { 1066cd620004SJunchao Zhang cudaError_t cerr = cudaDeviceSynchronize();CHKERRCUDA(cerr); 1067cd620004SJunchao Zhang } 1068cd620004SJunchao Zhang PetscFunctionReturn(0); 1069cd620004SJunchao Zhang } 1070cd620004SJunchao Zhang 1071cd620004SJunchao Zhang /* PetscSFLinkSyncStreamAfterPackXxxData routines make sure root/leafbuf for the remote is ready for MPI */ 1072cd620004SJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkSyncStreamAfterPackRootData(PetscSF sf,PetscSFLink link) 1073cd620004SJunchao Zhang { 1074cd620004SJunchao Zhang PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 1075cd620004SJunchao Zhang 1076cd620004SJunchao Zhang PetscFunctionBegin; 1077b85e67b7SJunchao Zhang /* Do nothing if we use stream aware mpi || has nothing for remote */ 1078b85e67b7SJunchao Zhang if (sf->use_stream_aware_mpi || link->rootmtype != PETSC_MEMTYPE_DEVICE || !bas->rootbuflen[PETSCSF_REMOTE]) PetscFunctionReturn(0); 1079b85e67b7SJunchao Zhang /* If we called a packing kernel || we async-copied rootdata from device to host || No cudaDeviceSynchronize was called (since default stream is assumed) */ 1080b85e67b7SJunchao Zhang if (!link->rootdirect[PETSCSF_REMOTE] || !use_gpu_aware_mpi || sf->use_default_stream) { 1081cd620004SJunchao Zhang cudaError_t cerr = cudaStreamSynchronize(link->stream);CHKERRCUDA(cerr); 1082cd620004SJunchao Zhang } 1083cd620004SJunchao Zhang PetscFunctionReturn(0); 1084cd620004SJunchao Zhang } 1085cd620004SJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkSyncStreamAfterPackLeafData(PetscSF sf,PetscSFLink link) 1086cd620004SJunchao Zhang { 1087cd620004SJunchao Zhang PetscFunctionBegin; 1088b85e67b7SJunchao Zhang /* See comments above */ 1089b85e67b7SJunchao Zhang if (sf->use_stream_aware_mpi || link->leafmtype != PETSC_MEMTYPE_DEVICE || !sf->leafbuflen[PETSCSF_REMOTE]) PetscFunctionReturn(0); 1090b85e67b7SJunchao Zhang if (!link->leafdirect[PETSCSF_REMOTE] || !use_gpu_aware_mpi || sf->use_default_stream) { 1091cd620004SJunchao Zhang cudaError_t cerr = cudaStreamSynchronize(link->stream);CHKERRCUDA(cerr); 1092cd620004SJunchao Zhang } 1093cd620004SJunchao Zhang PetscFunctionReturn(0); 1094cd620004SJunchao Zhang } 1095cd620004SJunchao Zhang 1096cd620004SJunchao Zhang /* PetscSFLinkSyncStreamAfterUnpackXxx routines make sure root/leafdata (local & remote) is ready to use for SF callers, when SF 1097cd620004SJunchao Zhang does not know which stream the callers will use. 1098cd620004SJunchao Zhang */ 1099cd620004SJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkSyncStreamAfterUnpackRootData(PetscSF sf,PetscSFLink link) 1100cd620004SJunchao Zhang { 1101cd620004SJunchao Zhang PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 1102cd620004SJunchao Zhang 1103cd620004SJunchao Zhang PetscFunctionBegin; 1104b85e67b7SJunchao Zhang /* Do nothing if we are expected to put rootdata on default stream */ 1105b85e67b7SJunchao Zhang if (sf->use_default_stream || link->rootmtype != PETSC_MEMTYPE_DEVICE) PetscFunctionReturn(0); 1106b85e67b7SJunchao Zhang /* If we have something from local, then we called a scatter kernel (on link->stream), then we must sync it; 1107b85e67b7SJunchao Zhang If we have something from remote and we called unpack kernel, then we must also sycn it. 1108b85e67b7SJunchao Zhang */ 1109b85e67b7SJunchao Zhang if (bas->rootbuflen[PETSCSF_LOCAL] || (bas->rootbuflen[PETSCSF_REMOTE] && !link->rootdirect[PETSCSF_REMOTE])) { 1110cd620004SJunchao Zhang cudaError_t cerr = cudaStreamSynchronize(link->stream);CHKERRCUDA(cerr); 1111cd620004SJunchao Zhang } 1112cd620004SJunchao Zhang PetscFunctionReturn(0); 1113cd620004SJunchao Zhang } 1114cd620004SJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkSyncStreamAfterUnpackLeafData(PetscSF sf,PetscSFLink link) 1115cd620004SJunchao Zhang { 1116cd620004SJunchao Zhang PetscFunctionBegin; 1117b85e67b7SJunchao Zhang /* See comments above */ 1118b85e67b7SJunchao Zhang if (sf->use_default_stream || link->leafmtype != PETSC_MEMTYPE_DEVICE) PetscFunctionReturn(0); 1119b85e67b7SJunchao Zhang if (sf->leafbuflen[PETSCSF_LOCAL] || (sf->leafbuflen[PETSCSF_REMOTE] && !link->leafdirect[PETSCSF_REMOTE])) { 1120cd620004SJunchao Zhang cudaError_t cerr = cudaStreamSynchronize(link->stream);CHKERRCUDA(cerr); 1121cd620004SJunchao Zhang } 1122cd620004SJunchao Zhang PetscFunctionReturn(0); 1123cd620004SJunchao Zhang } 1124cd620004SJunchao Zhang 1125cd620004SJunchao Zhang /* PetscSFLinkCopyXxxxBufferInCaseNotUseGpuAwareMPI routines are simple: if not use_gpu_aware_mpi, we need 1126cd620004SJunchao Zhang to copy the buffer from GPU to CPU before MPI calls, and from CPU to GPU after MPI calls. 1127cd620004SJunchao Zhang */ 1128cd620004SJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(PetscSF sf,PetscSFLink link,PetscBool device2host) 1129cd620004SJunchao Zhang { 1130cd620004SJunchao Zhang PetscErrorCode ierr; 1131cd620004SJunchao Zhang cudaError_t cerr; 1132cd620004SJunchao Zhang PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 1133cd620004SJunchao Zhang 1134cd620004SJunchao Zhang PetscFunctionBegin; 1135cd620004SJunchao Zhang if (link->rootmtype == PETSC_MEMTYPE_DEVICE && (link->rootmtype_mpi != link->rootmtype) && bas->rootbuflen[PETSCSF_REMOTE]) { 1136cd620004SJunchao Zhang void *h_buf = link->rootbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]; 1137cd620004SJunchao Zhang void *d_buf = link->rootbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_DEVICE]; 1138cd620004SJunchao Zhang size_t count = bas->rootbuflen[PETSCSF_REMOTE]*link->unitbytes; 1139cd620004SJunchao Zhang if (device2host) { 1140cd620004SJunchao Zhang cerr = cudaMemcpyAsync(h_buf,d_buf,count,cudaMemcpyDeviceToHost,link->stream);CHKERRCUDA(cerr); 1141cd620004SJunchao Zhang ierr = PetscLogGpuToCpu(count);CHKERRQ(ierr); 1142cd620004SJunchao Zhang } else { 1143cd620004SJunchao Zhang cerr = cudaMemcpyAsync(d_buf,h_buf,count,cudaMemcpyHostToDevice,link->stream);CHKERRCUDA(cerr); 1144cd620004SJunchao Zhang ierr = PetscLogCpuToGpu(count);CHKERRQ(ierr); 1145cd620004SJunchao Zhang } 1146cd620004SJunchao Zhang } 1147cd620004SJunchao Zhang PetscFunctionReturn(0); 1148cd620004SJunchao Zhang } 1149cd620004SJunchao Zhang 1150cd620004SJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(PetscSF sf,PetscSFLink link,PetscBool device2host) 1151cd620004SJunchao Zhang { 1152cd620004SJunchao Zhang PetscErrorCode ierr; 1153cd620004SJunchao Zhang cudaError_t cerr; 1154cd620004SJunchao Zhang 1155cd620004SJunchao Zhang PetscFunctionBegin; 1156cd620004SJunchao Zhang if (link->leafmtype == PETSC_MEMTYPE_DEVICE && (link->leafmtype_mpi != link->leafmtype) && sf->leafbuflen[PETSCSF_REMOTE]) { 1157cd620004SJunchao Zhang void *h_buf = link->leafbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]; 1158cd620004SJunchao Zhang void *d_buf = link->leafbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_DEVICE]; 1159cd620004SJunchao Zhang size_t count = sf->leafbuflen[PETSCSF_REMOTE]*link->unitbytes; 1160cd620004SJunchao Zhang if (device2host) { 1161cd620004SJunchao Zhang cerr = cudaMemcpyAsync(h_buf,d_buf,count,cudaMemcpyDeviceToHost,link->stream);CHKERRCUDA(cerr); 1162cd620004SJunchao Zhang ierr = PetscLogGpuToCpu(count);CHKERRQ(ierr); 1163cd620004SJunchao Zhang } else { 1164cd620004SJunchao Zhang cerr = cudaMemcpyAsync(d_buf,h_buf,count,cudaMemcpyHostToDevice,link->stream);CHKERRCUDA(cerr); 1165cd620004SJunchao Zhang ierr = PetscLogCpuToGpu(count);CHKERRQ(ierr); 1166cd620004SJunchao Zhang } 1167cd620004SJunchao Zhang } 1168cd620004SJunchao Zhang PetscFunctionReturn(0); 1169cd620004SJunchao Zhang } 1170cd620004SJunchao Zhang #else 1171cd620004SJunchao Zhang 1172cd620004SJunchao Zhang #define PetscSFLinkSyncDeviceBeforePackData(a,b) 0 1173cd620004SJunchao Zhang #define PetscSFLinkSyncStreamAfterPackRootData(a,b) 0 1174cd620004SJunchao Zhang #define PetscSFLinkSyncStreamAfterPackLeafData(a,b) 0 1175cd620004SJunchao Zhang #define PetscSFLinkSyncStreamAfterUnpackRootData(a,b) 0 1176cd620004SJunchao Zhang #define PetscSFLinkSyncStreamAfterUnpackLeafData(a,b) 0 1177cd620004SJunchao Zhang #define PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(a,b,c) 0 1178cd620004SJunchao Zhang #define PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(a,b,c) 0 1179cd620004SJunchao Zhang 1180cd620004SJunchao Zhang #endif 1181cd620004SJunchao Zhang 1182cd620004SJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkLogFlopsAfterUnpackRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,MPI_Op op) 1183cd620004SJunchao Zhang { 1184cd620004SJunchao Zhang PetscErrorCode ierr; 1185cd620004SJunchao Zhang PetscLogDouble flops; 1186cd620004SJunchao Zhang PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 1187cd620004SJunchao Zhang 1188cd620004SJunchao Zhang 1189cd620004SJunchao Zhang PetscFunctionBegin; 1190cd620004SJunchao Zhang if (op != MPIU_REPLACE && link->basicunit == MPIU_SCALAR) { /* op is a reduction on PetscScalars */ 1191cd620004SJunchao Zhang flops = bas->rootbuflen[scope]*link->bs; /* # of roots in buffer x # of scalars in unit */ 1192cd620004SJunchao Zhang #if defined(PETSC_HAVE_CUDA) 1193cd620004SJunchao Zhang if (link->rootmtype == PETSC_MEMTYPE_DEVICE) {ierr = PetscLogGpuFlops(flops);CHKERRQ(ierr);} else 1194cd620004SJunchao Zhang #endif 1195cd620004SJunchao Zhang {ierr = PetscLogFlops(flops);CHKERRQ(ierr);} 1196cd620004SJunchao Zhang } 1197cd620004SJunchao Zhang PetscFunctionReturn(0); 1198cd620004SJunchao Zhang } 1199cd620004SJunchao Zhang 1200cd620004SJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkLogFlopsAfterUnpackLeafData(PetscSF sf,PetscSFLink link,PetscSFScope scope,MPI_Op op) 1201cd620004SJunchao Zhang { 1202cd620004SJunchao Zhang PetscLogDouble flops; 1203cd620004SJunchao Zhang PetscErrorCode ierr; 1204cd620004SJunchao Zhang 1205cd620004SJunchao Zhang PetscFunctionBegin; 1206cd620004SJunchao Zhang if (op != MPIU_REPLACE && link->basicunit == MPIU_SCALAR) { /* op is a reduction on PetscScalars */ 1207cd620004SJunchao Zhang flops = sf->leafbuflen[scope]*link->bs; /* # of roots in buffer x # of scalars in unit */ 1208cd620004SJunchao Zhang #if defined(PETSC_HAVE_CUDA) 1209cd620004SJunchao Zhang if (link->leafmtype == PETSC_MEMTYPE_DEVICE) {ierr = PetscLogGpuFlops(flops);CHKERRQ(ierr);} else 1210cd620004SJunchao Zhang #endif 1211cd620004SJunchao Zhang {ierr = PetscLogFlops(flops);CHKERRQ(ierr);} 1212cd620004SJunchao Zhang } 1213cd620004SJunchao Zhang PetscFunctionReturn(0); 1214cd620004SJunchao Zhang } 1215cd620004SJunchao Zhang 1216cd620004SJunchao Zhang /* When SF could not find a proper UnpackAndOp() from link, it falls back to MPI_Reduce_local. 1217cd620004SJunchao Zhang Input Arguments: 1218cd620004SJunchao Zhang +sf - The StarForest 1219cd620004SJunchao Zhang .link - The link 1220cd620004SJunchao Zhang .count - Number of entries to unpack 1221cd620004SJunchao Zhang .start - The first index, significent when indices=NULL 1222cd620004SJunchao Zhang .indices - Indices of entries in <data>. If NULL, it means indices are contiguous and the first is given in <start> 1223cd620004SJunchao Zhang .buf - A contiguous buffer to unpack from 1224cd620004SJunchao Zhang -op - Operation after unpack 1225cd620004SJunchao Zhang 1226cd620004SJunchao Zhang Output Arguments: 1227cd620004SJunchao Zhang .data - The data to unpack to 1228cd620004SJunchao Zhang */ 1229cd620004SJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkUnpackDataWithMPIReduceLocal(PetscSF sf,PetscSFLink link,PetscInt count,PetscInt start,const PetscInt *indices,void *data,const void *buf,MPI_Op op) 1230cd620004SJunchao Zhang { 1231cd620004SJunchao Zhang PetscFunctionBegin; 1232cd620004SJunchao Zhang #if defined(PETSC_HAVE_MPI_REDUCE_LOCAL) 1233cd620004SJunchao Zhang { 1234cd620004SJunchao Zhang PetscErrorCode ierr; 1235cd620004SJunchao Zhang PetscInt i; 1236cd620004SJunchao Zhang PetscMPIInt n; 1237cd620004SJunchao Zhang if (indices) { 1238cd620004SJunchao Zhang /* Note we use link->unit instead of link->basicunit. When op can be mapped to MPI_SUM etc, it operates on 1239cd620004SJunchao Zhang basic units of a root/leaf element-wisely. Otherwise, it is meant to operate on a whole root/leaf. 1240cd620004SJunchao Zhang */ 1241cd620004SJunchao Zhang for (i=0; i<count; i++) {ierr = MPI_Reduce_local((const char*)buf+i*link->unitbytes,(char*)data+indices[i]*link->unitbytes,1,link->unit,op);CHKERRQ(ierr);} 1242cd620004SJunchao Zhang } else { 1243cd620004SJunchao Zhang ierr = PetscMPIIntCast(count,&n);CHKERRQ(ierr); 1244cd620004SJunchao Zhang ierr = MPI_Reduce_local(buf,(char*)data+start*link->unitbytes,n,link->unit,op);CHKERRQ(ierr); 1245cd620004SJunchao Zhang } 1246cd620004SJunchao Zhang } 1247cd620004SJunchao Zhang #else 1248cd620004SJunchao Zhang SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No unpacking reduction operation for this MPI_Op"); 1249cd620004SJunchao Zhang #endif 1250cd620004SJunchao Zhang PetscFunctionReturn(0); 1251cd620004SJunchao Zhang } 1252cd620004SJunchao Zhang 1253*fcc7397dSJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkScatterDataWithMPIReduceLocal(PetscSF sf,PetscSFLink link,PetscInt count,PetscInt srcStart,const PetscInt *srcIdx,const void *src,PetscInt dstStart,const PetscInt *dstIdx,void *dst,MPI_Op op) 1254cd620004SJunchao Zhang { 1255cd620004SJunchao Zhang PetscFunctionBegin; 1256cd620004SJunchao Zhang #if defined(PETSC_HAVE_MPI_REDUCE_LOCAL) 1257cd620004SJunchao Zhang { 1258cd620004SJunchao Zhang PetscErrorCode ierr; 1259cd620004SJunchao Zhang PetscInt i,disp; 1260*fcc7397dSJunchao Zhang if (!srcIdx) { 1261*fcc7397dSJunchao Zhang ierr = PetscSFLinkUnpackDataWithMPIReduceLocal(sf,link,count,dstStart,dstIdx,dst,(const char*)src+srcStart*link->unitbytes,op);CHKERRQ(ierr); 1262*fcc7397dSJunchao Zhang } else { 1263cd620004SJunchao Zhang for (i=0; i<count; i++) { 1264*fcc7397dSJunchao Zhang disp = dstIdx? dstIdx[i] : dstStart + i; 1265*fcc7397dSJunchao Zhang ierr = MPI_Reduce_local((const char*)src+srcIdx[i]*link->unitbytes,(char*)dst+disp*link->unitbytes,1,link->unit,op);CHKERRQ(ierr); 1266*fcc7397dSJunchao Zhang } 1267cd620004SJunchao Zhang } 1268cd620004SJunchao Zhang } 1269cd620004SJunchao Zhang #else 1270cd620004SJunchao Zhang SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No unpacking reduction operation for this MPI_Op"); 1271cd620004SJunchao Zhang #endif 1272cd620004SJunchao Zhang PetscFunctionReturn(0); 1273cd620004SJunchao Zhang } 1274cd620004SJunchao Zhang 1275cd620004SJunchao Zhang /*============================================================================= 1276cd620004SJunchao Zhang Pack/Unpack/Fetch/Scatter routines 1277cd620004SJunchao Zhang ============================================================================*/ 1278cd620004SJunchao Zhang 1279cd620004SJunchao Zhang /* Pack rootdata to rootbuf 1280cd620004SJunchao Zhang Input Arguments: 1281cd620004SJunchao Zhang + sf - The SF this packing works on. 1282cd620004SJunchao Zhang . link - It gives the memtype of the roots and also provides root buffer. 1283cd620004SJunchao Zhang . scope - PETSCSF_LOCAL or PETSCSF_REMOTE. Note SF has the ability to do local and remote communications separately. 1284cd620004SJunchao Zhang - rootdata - Where to read the roots. 1285cd620004SJunchao Zhang 1286cd620004SJunchao Zhang Notes: 1287cd620004SJunchao Zhang When rootdata can be directly used as root buffer, the routine is almost a no-op. After the call, root data is 1288cd620004SJunchao Zhang in a place where the underlying MPI is ready can access (use_gpu_aware_mpi or not) 1289cd620004SJunchao Zhang */ 1290cd620004SJunchao Zhang PetscErrorCode PetscSFLinkPackRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,const void *rootdata) 1291cd620004SJunchao Zhang { 1292cd620004SJunchao Zhang PetscErrorCode ierr; 1293cd620004SJunchao Zhang PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 1294cd620004SJunchao Zhang const PetscInt *rootindices = NULL; 1295cd620004SJunchao Zhang PetscInt count,start; 1296*fcc7397dSJunchao Zhang PetscErrorCode (*Pack)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*) = NULL; 1297cd620004SJunchao Zhang PetscMemType rootmtype = link->rootmtype; 1298*fcc7397dSJunchao Zhang PetscSFPackOpt opt = NULL; 1299*fcc7397dSJunchao Zhang 1300cd620004SJunchao Zhang 1301cd620004SJunchao Zhang PetscFunctionBegin; 1302cd620004SJunchao Zhang ierr = PetscLogEventBegin(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr); 1303cd620004SJunchao Zhang if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkSyncDeviceBeforePackData(sf,link);CHKERRQ(ierr);} 1304cd620004SJunchao Zhang if (!link->rootdirect[scope] && bas->rootbuflen[scope]) { /* If rootdata works directly as rootbuf, skip packing */ 1305*fcc7397dSJunchao Zhang ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,scope,&count,&start,&opt,&rootindices);CHKERRQ(ierr); 1306cd620004SJunchao Zhang ierr = PetscSFLinkGetPack(link,rootmtype,&Pack);CHKERRQ(ierr); 1307*fcc7397dSJunchao Zhang ierr = (*Pack)(link,count,start,opt,rootindices,rootdata,link->rootbuf[scope][rootmtype]);CHKERRQ(ierr); 1308cd620004SJunchao Zhang } 1309cd620004SJunchao Zhang if (scope == PETSCSF_REMOTE) { 1310cd620004SJunchao Zhang ierr = PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_TRUE/*device2host*/);CHKERRQ(ierr); 1311cd620004SJunchao Zhang ierr = PetscSFLinkSyncStreamAfterPackRootData(sf,link);CHKERRQ(ierr); 1312cd620004SJunchao Zhang } 1313cd620004SJunchao Zhang ierr = PetscLogEventEnd(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr); 1314cd620004SJunchao Zhang PetscFunctionReturn(0); 1315cd620004SJunchao Zhang } 1316cd620004SJunchao Zhang 1317cd620004SJunchao Zhang /* Pack leafdata to leafbuf */ 1318cd620004SJunchao Zhang PetscErrorCode PetscSFLinkPackLeafData(PetscSF sf,PetscSFLink link,PetscSFScope scope,const void *leafdata) 1319cd620004SJunchao Zhang { 1320cd620004SJunchao Zhang PetscErrorCode ierr; 1321cd620004SJunchao Zhang const PetscInt *leafindices = NULL; 1322cd620004SJunchao Zhang PetscInt count,start; 1323*fcc7397dSJunchao Zhang PetscErrorCode (*Pack)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*) = NULL; 1324cd620004SJunchao Zhang PetscMemType leafmtype = link->leafmtype; 1325*fcc7397dSJunchao Zhang PetscSFPackOpt opt = NULL; 1326cd620004SJunchao Zhang 1327cd620004SJunchao Zhang PetscFunctionBegin; 1328cd620004SJunchao Zhang ierr = PetscLogEventBegin(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr); 1329cd620004SJunchao Zhang if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkSyncDeviceBeforePackData(sf,link);CHKERRQ(ierr);} 1330cd620004SJunchao Zhang if (!link->leafdirect[scope] && sf->leafbuflen[scope]) { /* If leafdata works directly as rootbuf, skip packing */ 1331*fcc7397dSJunchao Zhang ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,scope,&count,&start,&opt,&leafindices);CHKERRQ(ierr); 1332cd620004SJunchao Zhang ierr = PetscSFLinkGetPack(link,leafmtype,&Pack);CHKERRQ(ierr); 1333*fcc7397dSJunchao Zhang ierr = (*Pack)(link,count,start,opt,leafindices,leafdata,link->leafbuf[scope][leafmtype]);CHKERRQ(ierr); 1334cd620004SJunchao Zhang } 1335cd620004SJunchao Zhang if (scope == PETSCSF_REMOTE) { 1336cd620004SJunchao Zhang ierr = PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_TRUE/*device2host*/);CHKERRQ(ierr); 1337cd620004SJunchao Zhang ierr = PetscSFLinkSyncStreamAfterPackLeafData(sf,link);CHKERRQ(ierr); 1338cd620004SJunchao Zhang } 1339cd620004SJunchao Zhang ierr = PetscLogEventEnd(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr); 1340cd620004SJunchao Zhang PetscFunctionReturn(0); 1341cd620004SJunchao Zhang } 1342cd620004SJunchao Zhang 1343cd620004SJunchao Zhang /* Unpack rootbuf to rootdata */ 1344cd620004SJunchao Zhang PetscErrorCode PetscSFLinkUnpackRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,void *rootdata,MPI_Op op) 1345cd620004SJunchao Zhang { 1346cd620004SJunchao Zhang PetscErrorCode ierr; 1347cd620004SJunchao Zhang const PetscInt *rootindices = NULL; 1348cd620004SJunchao Zhang PetscInt count,start; 1349cd620004SJunchao Zhang PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 1350*fcc7397dSJunchao Zhang PetscErrorCode (*UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*) = NULL; 1351cd620004SJunchao Zhang PetscMemType rootmtype = link->rootmtype; 1352*fcc7397dSJunchao Zhang PetscSFPackOpt opt = NULL; 1353cd620004SJunchao Zhang 1354cd620004SJunchao Zhang PetscFunctionBegin; 1355cd620004SJunchao Zhang ierr = PetscLogEventBegin(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr); 1356cd620004SJunchao Zhang if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_FALSE);CHKERRQ(ierr);} 1357cd620004SJunchao Zhang if (!link->rootdirect[scope] && bas->rootbuflen[scope]) { /* If rootdata works directly as rootbuf, skip unpacking */ 1358cd620004SJunchao Zhang ierr = PetscSFLinkGetUnpackAndOp(link,rootmtype,op,bas->rootdups[scope],&UnpackAndOp);CHKERRQ(ierr); 1359cd620004SJunchao Zhang if (UnpackAndOp) { 1360*fcc7397dSJunchao Zhang ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,scope,&count,&start,&opt,&rootindices);CHKERRQ(ierr); 1361*fcc7397dSJunchao Zhang ierr = (*UnpackAndOp)(link,count,start,opt,rootindices,rootdata,link->rootbuf[scope][rootmtype]);CHKERRQ(ierr); 1362cd620004SJunchao Zhang } else { 1363*fcc7397dSJunchao Zhang ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,scope,&count,&start,&opt,&rootindices);CHKERRQ(ierr); 1364cd620004SJunchao Zhang ierr = PetscSFLinkUnpackDataWithMPIReduceLocal(sf,link,count,start,rootindices,rootdata,link->rootbuf[scope][rootmtype],op);CHKERRQ(ierr); 1365cd620004SJunchao Zhang } 1366cd620004SJunchao Zhang } 1367cd620004SJunchao Zhang if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkSyncStreamAfterUnpackRootData(sf,link);CHKERRQ(ierr);} 1368cd620004SJunchao Zhang ierr = PetscSFLinkLogFlopsAfterUnpackRootData(sf,link,scope,op);CHKERRQ(ierr); 1369cd620004SJunchao Zhang ierr = PetscLogEventEnd(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr); 1370cd620004SJunchao Zhang PetscFunctionReturn(0); 1371cd620004SJunchao Zhang } 1372cd620004SJunchao Zhang 1373cd620004SJunchao Zhang /* Unpack leafbuf to leafdata */ 1374cd620004SJunchao Zhang PetscErrorCode PetscSFLinkUnpackLeafData(PetscSF sf,PetscSFLink link,PetscSFScope scope,void *leafdata,MPI_Op op) 1375cd620004SJunchao Zhang { 1376cd620004SJunchao Zhang PetscErrorCode ierr; 1377cd620004SJunchao Zhang const PetscInt *leafindices = NULL; 1378cd620004SJunchao Zhang PetscInt count,start; 1379*fcc7397dSJunchao Zhang PetscErrorCode (*UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*) = NULL; 1380cd620004SJunchao Zhang PetscMemType leafmtype = link->leafmtype; 1381*fcc7397dSJunchao Zhang PetscSFPackOpt opt = NULL; 1382cd620004SJunchao Zhang 1383cd620004SJunchao Zhang PetscFunctionBegin; 1384cd620004SJunchao Zhang ierr = PetscLogEventBegin(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr); 1385cd620004SJunchao Zhang if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_FALSE);CHKERRQ(ierr);} 1386cd620004SJunchao Zhang if (!link->leafdirect[scope] && sf->leafbuflen[scope]) { /* If leafdata works directly as rootbuf, skip unpacking */ 1387cd620004SJunchao Zhang ierr = PetscSFLinkGetUnpackAndOp(link,leafmtype,op,sf->leafdups[scope],&UnpackAndOp);CHKERRQ(ierr); 1388cd620004SJunchao Zhang if (UnpackAndOp) { 1389*fcc7397dSJunchao Zhang ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,scope,&count,&start,&opt,&leafindices);CHKERRQ(ierr); 1390*fcc7397dSJunchao Zhang ierr = (*UnpackAndOp)(link,count,start,opt,leafindices,leafdata,link->leafbuf[scope][leafmtype]);CHKERRQ(ierr); 1391cd620004SJunchao Zhang } else { 1392*fcc7397dSJunchao Zhang ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,scope,&count,&start,&opt,&leafindices);CHKERRQ(ierr); 1393cd620004SJunchao Zhang ierr = PetscSFLinkUnpackDataWithMPIReduceLocal(sf,link,count,start,leafindices,leafdata,link->leafbuf[scope][leafmtype],op);CHKERRQ(ierr); 1394cd620004SJunchao Zhang } 1395cd620004SJunchao Zhang } 1396cd620004SJunchao Zhang if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkSyncStreamAfterUnpackLeafData(sf,link);CHKERRQ(ierr);} 1397cd620004SJunchao Zhang ierr = PetscSFLinkLogFlopsAfterUnpackLeafData(sf,link,scope,op);CHKERRQ(ierr); 1398cd620004SJunchao Zhang ierr = PetscLogEventEnd(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr); 1399cd620004SJunchao Zhang PetscFunctionReturn(0); 1400cd620004SJunchao Zhang } 1401cd620004SJunchao Zhang 1402cd620004SJunchao Zhang /* FetchAndOp rootdata with rootbuf */ 1403cd620004SJunchao Zhang PetscErrorCode PetscSFLinkFetchRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,void *rootdata,MPI_Op op) 1404cd620004SJunchao Zhang { 1405cd620004SJunchao Zhang PetscErrorCode ierr; 1406cd620004SJunchao Zhang const PetscInt *rootindices = NULL; 1407cd620004SJunchao Zhang PetscInt count,start; 1408cd620004SJunchao Zhang PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 1409*fcc7397dSJunchao Zhang PetscErrorCode (*FetchAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,void*) = NULL; 1410cd620004SJunchao Zhang PetscMemType rootmtype = link->rootmtype; 1411*fcc7397dSJunchao Zhang PetscSFPackOpt opt = NULL; 1412cd620004SJunchao Zhang 1413cd620004SJunchao Zhang PetscFunctionBegin; 1414cd620004SJunchao Zhang ierr = PetscLogEventBegin(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr); 1415cd620004SJunchao Zhang if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_FALSE);CHKERRQ(ierr);} 1416cd620004SJunchao Zhang if (bas->rootbuflen[scope]) { 1417cd620004SJunchao Zhang /* Do FetchAndOp on rootdata with rootbuf */ 1418cd620004SJunchao Zhang ierr = PetscSFLinkGetFetchAndOp(link,rootmtype,op,bas->rootdups[scope],&FetchAndOp);CHKERRQ(ierr); 1419*fcc7397dSJunchao Zhang ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,scope,&count,&start,&opt,&rootindices);CHKERRQ(ierr); 1420*fcc7397dSJunchao Zhang ierr = (*FetchAndOp)(link,count,start,opt,rootindices,rootdata,link->rootbuf[scope][rootmtype]);CHKERRQ(ierr); 1421cd620004SJunchao Zhang } 1422cd620004SJunchao Zhang if (scope == PETSCSF_REMOTE) { 1423cd620004SJunchao Zhang ierr = PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_TRUE);CHKERRQ(ierr); 1424cd620004SJunchao Zhang ierr = PetscSFLinkSyncStreamAfterUnpackRootData(sf,link);CHKERRQ(ierr); 1425cd620004SJunchao Zhang } 1426cd620004SJunchao Zhang ierr = PetscSFLinkLogFlopsAfterUnpackRootData(sf,link,scope,op);CHKERRQ(ierr); 1427cd620004SJunchao Zhang ierr = PetscLogEventEnd(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr); 1428cd620004SJunchao Zhang PetscFunctionReturn(0); 1429cd620004SJunchao Zhang } 1430cd620004SJunchao Zhang 1431cd620004SJunchao Zhang /* Bcast rootdata to leafdata locally (i.e., only for local communication - PETSCSF_LOCAL) */ 1432cd620004SJunchao Zhang PetscErrorCode PetscSFLinkBcastAndOpLocal(PetscSF sf,PetscSFLink link,const void *rootdata,void *leafdata,MPI_Op op) 1433cd620004SJunchao Zhang { 1434cd620004SJunchao Zhang PetscErrorCode ierr; 1435cd620004SJunchao Zhang const PetscInt *rootindices = NULL,*leafindices = NULL; 1436cd620004SJunchao Zhang PetscInt count,rootstart,leafstart; 1437cd620004SJunchao Zhang PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 1438*fcc7397dSJunchao Zhang PetscErrorCode (*ScatterAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*) = NULL; 1439cd620004SJunchao Zhang const PetscMemType rootmtype = link->rootmtype,leafmtype = link->leafmtype; 1440*fcc7397dSJunchao Zhang PetscSFPackOpt leafopt = NULL,rootopt = NULL; 1441cd620004SJunchao Zhang 1442cd620004SJunchao Zhang PetscFunctionBegin; 1443cd620004SJunchao Zhang if (!bas->rootbuflen[PETSCSF_LOCAL]) PetscFunctionReturn(0); 1444cd620004SJunchao Zhang if (rootmtype != leafmtype) { /* Uncommon case */ 1445cd620004SJunchao Zhang /* The local communication has to go through pack and unpack */ 1446cd620004SJunchao Zhang ierr = PetscSFLinkPackRootData(sf,link,PETSCSF_LOCAL,rootdata);CHKERRQ(ierr); 1447f01131f0SJunchao Zhang ierr = PetscSFLinkMemcpy(sf,link,leafmtype,link->leafbuf[PETSCSF_LOCAL][leafmtype],rootmtype,link->rootbuf[PETSCSF_LOCAL][rootmtype],sf->leafbuflen[PETSCSF_LOCAL]*link->unitbytes);CHKERRQ(ierr); 1448cd620004SJunchao Zhang ierr = PetscSFLinkUnpackLeafData(sf,link,PETSCSF_LOCAL,leafdata,op);CHKERRQ(ierr); 1449cd620004SJunchao Zhang } else { 1450cd620004SJunchao Zhang ierr = PetscSFLinkGetScatterAndOp(link,leafmtype,op,sf->leafdups[PETSCSF_LOCAL],&ScatterAndOp);CHKERRQ(ierr); 1451cd620004SJunchao Zhang if (ScatterAndOp) { 1452*fcc7397dSJunchao Zhang ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,PETSCSF_LOCAL,&count,&rootstart,&rootopt,&rootindices);CHKERRQ(ierr); 1453*fcc7397dSJunchao Zhang ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,PETSCSF_LOCAL,&count,&leafstart,&leafopt,&leafindices);CHKERRQ(ierr); 1454*fcc7397dSJunchao Zhang ierr = (*ScatterAndOp)(link,count,rootstart,rootopt,rootindices,rootdata,leafstart,leafopt,leafindices,leafdata);CHKERRQ(ierr); 1455cd620004SJunchao Zhang } else { 1456*fcc7397dSJunchao Zhang ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,PETSCSF_LOCAL,&count,&rootstart,&rootopt,&rootindices);CHKERRQ(ierr); 1457*fcc7397dSJunchao Zhang ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,PETSCSF_LOCAL,&count,&leafstart,&leafopt,&leafindices);CHKERRQ(ierr); 1458*fcc7397dSJunchao Zhang ierr = PetscSFLinkScatterDataWithMPIReduceLocal(sf,link,count,rootstart,rootindices,rootdata,leafstart,leafindices,leafdata,op);CHKERRQ(ierr); 1459cd620004SJunchao Zhang } 1460cd620004SJunchao Zhang } 1461cd620004SJunchao Zhang PetscFunctionReturn(0); 1462cd620004SJunchao Zhang } 1463cd620004SJunchao Zhang 1464cd620004SJunchao Zhang /* Reduce leafdata to rootdata locally */ 1465cd620004SJunchao Zhang PetscErrorCode PetscSFLinkReduceLocal(PetscSF sf,PetscSFLink link,const void *leafdata,void *rootdata,MPI_Op op) 1466cd620004SJunchao Zhang { 1467cd620004SJunchao Zhang PetscErrorCode ierr; 1468cd620004SJunchao Zhang const PetscInt *rootindices = NULL,*leafindices = NULL; 1469cd620004SJunchao Zhang PetscInt count,rootstart,leafstart; 1470cd620004SJunchao Zhang PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 1471*fcc7397dSJunchao Zhang PetscErrorCode (*ScatterAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*) = NULL; 1472cd620004SJunchao Zhang const PetscMemType rootmtype = link->rootmtype,leafmtype = link->leafmtype; 1473*fcc7397dSJunchao Zhang PetscSFPackOpt leafopt = NULL,rootopt = NULL; 1474cd620004SJunchao Zhang 1475cd620004SJunchao Zhang PetscFunctionBegin; 1476cd620004SJunchao Zhang if (!sf->leafbuflen[PETSCSF_LOCAL]) PetscFunctionReturn(0); 1477cd620004SJunchao Zhang if (rootmtype != leafmtype) { 1478cd620004SJunchao Zhang /* The local communication has to go through pack and unpack */ 1479cd620004SJunchao Zhang ierr = PetscSFLinkPackLeafData(sf,link,PETSCSF_LOCAL,leafdata);CHKERRQ(ierr); 1480f01131f0SJunchao Zhang ierr = PetscSFLinkMemcpy(sf,link,rootmtype,link->rootbuf[PETSCSF_LOCAL][rootmtype],leafmtype,link->leafbuf[PETSCSF_LOCAL][leafmtype],bas->rootbuflen[PETSCSF_LOCAL]*link->unitbytes);CHKERRQ(ierr); 1481cd620004SJunchao Zhang ierr = PetscSFLinkUnpackRootData(sf,link,PETSCSF_LOCAL,rootdata,op);CHKERRQ(ierr); 1482cd620004SJunchao Zhang } else { 1483cd620004SJunchao Zhang ierr = PetscSFLinkGetScatterAndOp(link,rootmtype,op,bas->rootdups[PETSCSF_LOCAL],&ScatterAndOp);CHKERRQ(ierr); 1484cd620004SJunchao Zhang if (ScatterAndOp) { 1485*fcc7397dSJunchao Zhang ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,PETSCSF_LOCAL,&count,&rootstart,&rootopt,&rootindices);CHKERRQ(ierr); 1486*fcc7397dSJunchao Zhang ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,PETSCSF_LOCAL,&count,&leafstart,&leafopt,&leafindices);CHKERRQ(ierr); 1487*fcc7397dSJunchao Zhang ierr = (*ScatterAndOp)(link,count,leafstart,leafopt,leafindices,leafdata,rootstart,rootopt,rootindices,rootdata);CHKERRQ(ierr); 1488cd620004SJunchao Zhang } else { 1489*fcc7397dSJunchao Zhang ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,PETSCSF_LOCAL,&count,&rootstart,&rootopt,&rootindices);CHKERRQ(ierr); 1490*fcc7397dSJunchao Zhang ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,PETSCSF_LOCAL,&count,&leafstart,&leafopt,&leafindices);CHKERRQ(ierr); 1491*fcc7397dSJunchao Zhang ierr = PetscSFLinkScatterDataWithMPIReduceLocal(sf,link,count,leafstart,leafindices,leafdata,rootstart,rootindices,rootdata,op);CHKERRQ(ierr); 1492cd620004SJunchao Zhang } 1493cd620004SJunchao Zhang } 1494cd620004SJunchao Zhang PetscFunctionReturn(0); 1495cd620004SJunchao Zhang } 1496cd620004SJunchao Zhang 1497cd620004SJunchao Zhang /* Fetch rootdata to leafdata and leafupdate locally */ 1498cd620004SJunchao Zhang PetscErrorCode PetscSFLinkFetchAndOpLocal(PetscSF sf,PetscSFLink link,void *rootdata,const void *leafdata,void *leafupdate,MPI_Op op) 1499cd620004SJunchao Zhang { 1500cd620004SJunchao Zhang PetscErrorCode ierr; 1501cd620004SJunchao Zhang const PetscInt *rootindices = NULL,*leafindices = NULL; 1502cd620004SJunchao Zhang PetscInt count,rootstart,leafstart; 1503cd620004SJunchao Zhang PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 1504*fcc7397dSJunchao Zhang PetscErrorCode (*FetchAndOpLocal)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*) = NULL; 1505cd620004SJunchao Zhang const PetscMemType rootmtype = link->rootmtype,leafmtype = link->leafmtype; 1506*fcc7397dSJunchao Zhang PetscSFPackOpt leafopt = NULL,rootopt = NULL; 1507cd620004SJunchao Zhang 1508cd620004SJunchao Zhang PetscFunctionBegin; 1509cd620004SJunchao Zhang if (!bas->rootbuflen[PETSCSF_LOCAL]) PetscFunctionReturn(0); 1510cd620004SJunchao Zhang if (rootmtype != leafmtype) { 1511cd620004SJunchao Zhang /* The local communication has to go through pack and unpack */ 1512cd620004SJunchao Zhang SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Doing PetscSFFetchAndOp with rootdata and leafdata on opposite side of CPU and GPU"); 1513cd620004SJunchao Zhang } else { 1514*fcc7397dSJunchao Zhang ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,PETSCSF_LOCAL,&count,&rootstart,&rootopt,&rootindices);CHKERRQ(ierr); 1515*fcc7397dSJunchao Zhang ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,PETSCSF_LOCAL,&count,&leafstart,&leafopt,&leafindices);CHKERRQ(ierr); 1516cd620004SJunchao Zhang ierr = PetscSFLinkGetFetchAndOpLocal(link,rootmtype,op,bas->rootdups[PETSCSF_LOCAL],&FetchAndOpLocal);CHKERRQ(ierr); 1517*fcc7397dSJunchao Zhang ierr = (*FetchAndOpLocal)(link,count,rootstart,rootopt,rootindices,rootdata,leafstart,leafopt,leafindices,leafdata,leafupdate);CHKERRQ(ierr); 1518cd620004SJunchao Zhang } 151940e23c03SJunchao Zhang PetscFunctionReturn(0); 152040e23c03SJunchao Zhang } 152140e23c03SJunchao Zhang 152240e23c03SJunchao Zhang /* 1523cd620004SJunchao Zhang Create per-rank pack/unpack optimizations based on indice patterns 152440e23c03SJunchao Zhang 152540e23c03SJunchao Zhang Input Parameters: 1526*fcc7397dSJunchao Zhang + n - Number of destination ranks 1527eb02082bSJunchao Zhang . offset - [n+1] For the i-th rank, its associated indices are idx[offset[i], offset[i+1]). offset[0] needs not to be 0. 1528b23bfdefSJunchao Zhang - idx - [*] Array storing indices 152940e23c03SJunchao Zhang 153040e23c03SJunchao Zhang Output Parameters: 1531cd620004SJunchao Zhang + opt - Pack optimizations. NULL if no optimizations. 153240e23c03SJunchao Zhang */ 1533cd620004SJunchao Zhang PetscErrorCode PetscSFCreatePackOpt(PetscInt n,const PetscInt *offset,const PetscInt *idx,PetscSFPackOpt *out) 153440e23c03SJunchao Zhang { 153540e23c03SJunchao Zhang PetscErrorCode ierr; 1536*fcc7397dSJunchao Zhang PetscInt r,p,start,i,j,k,dx,dy,dz,dydz,m,X,Y; 1537*fcc7397dSJunchao Zhang PetscBool optimizable = PETSC_TRUE; 153840e23c03SJunchao Zhang PetscSFPackOpt opt; 153940e23c03SJunchao Zhang 154040e23c03SJunchao Zhang PetscFunctionBegin; 1541*fcc7397dSJunchao Zhang ierr = PetscMalloc1(1,&opt);CHKERRQ(ierr); 1542*fcc7397dSJunchao Zhang ierr = PetscMalloc1(7*n+2,&opt->array);CHKERRQ(ierr); 1543*fcc7397dSJunchao Zhang opt->n = opt->array[0] = n; 1544*fcc7397dSJunchao Zhang opt->offset = opt->array + 1; 1545*fcc7397dSJunchao Zhang opt->start = opt->array + n + 2; 1546*fcc7397dSJunchao Zhang opt->dx = opt->array + 2*n + 2; 1547*fcc7397dSJunchao Zhang opt->dy = opt->array + 3*n + 2; 1548*fcc7397dSJunchao Zhang opt->dz = opt->array + 4*n + 2; 1549*fcc7397dSJunchao Zhang opt->X = opt->array + 5*n + 2; 1550*fcc7397dSJunchao Zhang opt->Y = opt->array + 6*n + 2; 1551*fcc7397dSJunchao Zhang 1552*fcc7397dSJunchao Zhang for (r=0; r<n; r++) { /* For each destination rank */ 1553*fcc7397dSJunchao Zhang m = offset[r+1] - offset[r]; /* Total number of indices for this rank. We want to see if m can be factored into dx*dy*dz */ 1554*fcc7397dSJunchao Zhang p = offset[r]; 1555*fcc7397dSJunchao Zhang start = idx[p]; /* First index for this rank */ 1556*fcc7397dSJunchao Zhang p++; 1557*fcc7397dSJunchao Zhang 1558*fcc7397dSJunchao Zhang /* Search in X dimension */ 1559*fcc7397dSJunchao Zhang for (dx=1; dx<m; dx++,p++) { 1560*fcc7397dSJunchao Zhang if (start+dx != idx[p]) break; 1561b23bfdefSJunchao Zhang } 1562b23bfdefSJunchao Zhang 1563*fcc7397dSJunchao Zhang dydz = m/dx; 1564*fcc7397dSJunchao Zhang X = dydz > 1 ? (idx[p]-start) : dx; 1565*fcc7397dSJunchao Zhang /* Not optimizable if m is not a multiple of dx, or some unrecognized pattern is found */ 1566*fcc7397dSJunchao Zhang if (m%dx || X <= 0) {optimizable = PETSC_FALSE; goto finish;} 1567*fcc7397dSJunchao Zhang for (dy=1; dy<dydz; dy++) { /* Search in Y dimension */ 1568*fcc7397dSJunchao Zhang for (i=0; i<dx; i++,p++) { 1569*fcc7397dSJunchao Zhang if (start+X*dy+i != idx[p]) { 1570*fcc7397dSJunchao Zhang if (i) {optimizable = PETSC_FALSE; goto finish;} /* The pattern is violated in the middle of an x-walk */ 1571*fcc7397dSJunchao Zhang else goto Z_dimension; 157240e23c03SJunchao Zhang } 157340e23c03SJunchao Zhang } 157440e23c03SJunchao Zhang } 157540e23c03SJunchao Zhang 1576*fcc7397dSJunchao Zhang Z_dimension: 1577*fcc7397dSJunchao Zhang dz = m/(dx*dy); 1578*fcc7397dSJunchao Zhang Y = dz > 1 ? (idx[p]-start)/X : dy; 1579*fcc7397dSJunchao Zhang /* Not optimizable if m is not a multiple of dx*dy, or some unrecognized pattern is found */ 1580*fcc7397dSJunchao Zhang if (m%(dx*dy) || Y <= 0) {optimizable = PETSC_FALSE; goto finish;} 1581*fcc7397dSJunchao Zhang for (k=1; k<dz; k++) { /* Go through Z dimension to see if remaining indices follow the pattern */ 1582*fcc7397dSJunchao Zhang for (j=0; j<dy; j++) { 1583*fcc7397dSJunchao Zhang for (i=0; i<dx; i++,p++) { 1584*fcc7397dSJunchao Zhang if (start+X*Y*k+X*j+i != idx[p]) {optimizable = PETSC_FALSE; goto finish;} 158540e23c03SJunchao Zhang } 158640e23c03SJunchao Zhang } 158740e23c03SJunchao Zhang } 1588*fcc7397dSJunchao Zhang opt->start[r] = start; 1589*fcc7397dSJunchao Zhang opt->dx[r] = dx; 1590*fcc7397dSJunchao Zhang opt->dy[r] = dy; 1591*fcc7397dSJunchao Zhang opt->dz[r] = dz; 1592*fcc7397dSJunchao Zhang opt->X[r] = X; 1593*fcc7397dSJunchao Zhang opt->Y[r] = Y; 159440e23c03SJunchao Zhang } 159540e23c03SJunchao Zhang 1596*fcc7397dSJunchao Zhang finish: 1597*fcc7397dSJunchao Zhang /* If not optimizable, free arrays to save memory */ 1598*fcc7397dSJunchao Zhang if (!n || !optimizable) { 1599*fcc7397dSJunchao Zhang ierr = PetscFree(opt->array);CHKERRQ(ierr); 160040e23c03SJunchao Zhang ierr = PetscFree(opt);CHKERRQ(ierr); 160140e23c03SJunchao Zhang *out = NULL; 1602*fcc7397dSJunchao Zhang } else { 1603*fcc7397dSJunchao Zhang opt->offset[0] = 0; 1604*fcc7397dSJunchao Zhang for (r=0; r<n; r++) opt->offset[r+1] = opt->offset[r] + opt->dx[r]*opt->dy[r]*opt->dz[r]; 1605*fcc7397dSJunchao Zhang *out = opt; 1606*fcc7397dSJunchao Zhang } 160740e23c03SJunchao Zhang PetscFunctionReturn(0); 160840e23c03SJunchao Zhang } 160940e23c03SJunchao Zhang 1610*fcc7397dSJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFDestroyPackOpt(PetscMemType mtype,PetscSFPackOpt *out) 161140e23c03SJunchao Zhang { 161240e23c03SJunchao Zhang PetscErrorCode ierr; 161340e23c03SJunchao Zhang PetscSFPackOpt opt = *out; 161440e23c03SJunchao Zhang 161540e23c03SJunchao Zhang PetscFunctionBegin; 161640e23c03SJunchao Zhang if (opt) { 1617*fcc7397dSJunchao Zhang if (mtype == PETSC_MEMTYPE_HOST) {ierr = PetscFree(opt->array);CHKERRQ(ierr);} 1618*fcc7397dSJunchao Zhang #if defined(PETSC_HAVE_CUDA) 1619*fcc7397dSJunchao Zhang else {cudaError_t cerr = cudaFree(opt->array);CHKERRCUDA(cerr);opt->array=NULL;} 1620*fcc7397dSJunchao Zhang #endif 162140e23c03SJunchao Zhang ierr = PetscFree(opt);CHKERRQ(ierr); 162240e23c03SJunchao Zhang *out = NULL; 162340e23c03SJunchao Zhang } 162440e23c03SJunchao Zhang PetscFunctionReturn(0); 162540e23c03SJunchao Zhang } 1626cd620004SJunchao Zhang 1627cd620004SJunchao Zhang PetscErrorCode PetscSFSetUpPackFields(PetscSF sf) 1628cd620004SJunchao Zhang { 1629cd620004SJunchao Zhang PetscErrorCode ierr; 1630cd620004SJunchao Zhang PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 1631cd620004SJunchao Zhang PetscInt i,j; 1632cd620004SJunchao Zhang 1633cd620004SJunchao Zhang PetscFunctionBegin; 1634cd620004SJunchao Zhang /* [0] for PETSCSF_LOCAL and [1] for PETSCSF_REMOTE in the following */ 1635cd620004SJunchao Zhang for (i=0; i<2; i++) { /* Set defaults */ 1636cd620004SJunchao Zhang sf->leafstart[i] = 0; 1637cd620004SJunchao Zhang sf->leafcontig[i] = PETSC_TRUE; 1638cd620004SJunchao Zhang sf->leafdups[i] = PETSC_FALSE; 1639cd620004SJunchao Zhang bas->rootstart[i] = 0; 1640cd620004SJunchao Zhang bas->rootcontig[i] = PETSC_TRUE; 1641cd620004SJunchao Zhang bas->rootdups[i] = PETSC_FALSE; 1642cd620004SJunchao Zhang } 1643cd620004SJunchao Zhang 1644cd620004SJunchao Zhang sf->leafbuflen[0] = sf->roffset[sf->ndranks]; 1645cd620004SJunchao Zhang sf->leafbuflen[1] = sf->roffset[sf->nranks] - sf->roffset[sf->ndranks]; 1646cd620004SJunchao Zhang 1647cd620004SJunchao Zhang if (sf->leafbuflen[0]) sf->leafstart[0] = sf->rmine[0]; 1648cd620004SJunchao Zhang if (sf->leafbuflen[1]) sf->leafstart[1] = sf->rmine[sf->roffset[sf->ndranks]]; 1649cd620004SJunchao Zhang 1650cd620004SJunchao Zhang /* Are leaf indices for self and remote contiguous? If yes, it is best for pack/unpack */ 1651cd620004SJunchao Zhang for (i=0; i<sf->roffset[sf->ndranks]; i++) { /* self */ 1652cd620004SJunchao Zhang if (sf->rmine[i] != sf->leafstart[0]+i) {sf->leafcontig[0] = PETSC_FALSE; break;} 1653cd620004SJunchao Zhang } 1654cd620004SJunchao Zhang for (i=sf->roffset[sf->ndranks],j=0; i<sf->roffset[sf->nranks]; i++,j++) { /* remote */ 1655cd620004SJunchao Zhang if (sf->rmine[i] != sf->leafstart[1]+j) {sf->leafcontig[1] = PETSC_FALSE; break;} 1656cd620004SJunchao Zhang } 1657cd620004SJunchao Zhang 1658cd620004SJunchao Zhang /* If not, see if we can have per-rank optimizations by doing index analysis */ 1659cd620004SJunchao Zhang if (!sf->leafcontig[0]) {ierr = PetscSFCreatePackOpt(sf->ndranks, sf->roffset, sf->rmine, &sf->leafpackopt[0]);CHKERRQ(ierr);} 1660cd620004SJunchao Zhang if (!sf->leafcontig[1]) {ierr = PetscSFCreatePackOpt(sf->nranks-sf->ndranks, sf->roffset+sf->ndranks, sf->rmine, &sf->leafpackopt[1]);CHKERRQ(ierr);} 1661cd620004SJunchao Zhang 1662cd620004SJunchao Zhang /* Are root indices for self and remote contiguous? */ 1663cd620004SJunchao Zhang bas->rootbuflen[0] = bas->ioffset[bas->ndiranks]; 1664cd620004SJunchao Zhang bas->rootbuflen[1] = bas->ioffset[bas->niranks] - bas->ioffset[bas->ndiranks]; 1665cd620004SJunchao Zhang 1666cd620004SJunchao Zhang if (bas->rootbuflen[0]) bas->rootstart[0] = bas->irootloc[0]; 1667cd620004SJunchao Zhang if (bas->rootbuflen[1]) bas->rootstart[1] = bas->irootloc[bas->ioffset[bas->ndiranks]]; 1668cd620004SJunchao Zhang 1669cd620004SJunchao Zhang for (i=0; i<bas->ioffset[bas->ndiranks]; i++) { 1670cd620004SJunchao Zhang if (bas->irootloc[i] != bas->rootstart[0]+i) {bas->rootcontig[0] = PETSC_FALSE; break;} 1671cd620004SJunchao Zhang } 1672cd620004SJunchao Zhang for (i=bas->ioffset[bas->ndiranks],j=0; i<bas->ioffset[bas->niranks]; i++,j++) { 1673cd620004SJunchao Zhang if (bas->irootloc[i] != bas->rootstart[1]+j) {bas->rootcontig[1] = PETSC_FALSE; break;} 1674cd620004SJunchao Zhang } 1675cd620004SJunchao Zhang 1676cd620004SJunchao Zhang if (!bas->rootcontig[0]) {ierr = PetscSFCreatePackOpt(bas->ndiranks, bas->ioffset, bas->irootloc, &bas->rootpackopt[0]);CHKERRQ(ierr);} 1677cd620004SJunchao Zhang if (!bas->rootcontig[1]) {ierr = PetscSFCreatePackOpt(bas->niranks-bas->ndiranks, bas->ioffset+bas->ndiranks, bas->irootloc, &bas->rootpackopt[1]);CHKERRQ(ierr);} 1678cd620004SJunchao Zhang 1679cd620004SJunchao Zhang #if defined(PETSC_HAVE_CUDA) 1680cd620004SJunchao Zhang /* Check dups in indices so that CUDA unpacking kernels can use cheaper regular instructions instead of atomics when they know there are no data race chances */ 1681cd620004SJunchao Zhang if (!sf->leafcontig[0]) {ierr = PetscCheckDupsInt(sf->leafbuflen[0], sf->rmine, &sf->leafdups[0]);CHKERRQ(ierr);} 1682cd620004SJunchao Zhang if (!sf->leafcontig[1]) {ierr = PetscCheckDupsInt(sf->leafbuflen[1], sf->rmine+sf->roffset[sf->ndranks], &sf->leafdups[1]);CHKERRQ(ierr);} 1683cd620004SJunchao Zhang if (!bas->rootcontig[0]) {ierr = PetscCheckDupsInt(bas->rootbuflen[0], bas->irootloc, &bas->rootdups[0]);CHKERRQ(ierr);} 1684cd620004SJunchao Zhang if (!bas->rootcontig[1]) {ierr = PetscCheckDupsInt(bas->rootbuflen[1], bas->irootloc+bas->ioffset[bas->ndiranks], &bas->rootdups[1]);CHKERRQ(ierr);} 1685cd620004SJunchao Zhang #endif 1686cd620004SJunchao Zhang 1687cd620004SJunchao Zhang PetscFunctionReturn(0); 1688cd620004SJunchao Zhang } 1689cd620004SJunchao Zhang 1690cd620004SJunchao Zhang PetscErrorCode PetscSFResetPackFields(PetscSF sf) 1691cd620004SJunchao Zhang { 1692cd620004SJunchao Zhang PetscErrorCode ierr; 1693cd620004SJunchao Zhang PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 1694cd620004SJunchao Zhang PetscInt i; 1695cd620004SJunchao Zhang 1696cd620004SJunchao Zhang PetscFunctionBegin; 1697cd620004SJunchao Zhang for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) { 1698*fcc7397dSJunchao Zhang ierr = PetscSFDestroyPackOpt(PETSC_MEMTYPE_HOST,&sf->leafpackopt[i]);CHKERRQ(ierr); 1699*fcc7397dSJunchao Zhang ierr = PetscSFDestroyPackOpt(PETSC_MEMTYPE_DEVICE,&sf->leafpackopt_d[i]);CHKERRQ(ierr); 1700*fcc7397dSJunchao Zhang ierr = PetscSFDestroyPackOpt(PETSC_MEMTYPE_HOST,&bas->rootpackopt[i]);CHKERRQ(ierr); 1701*fcc7397dSJunchao Zhang ierr = PetscSFDestroyPackOpt(PETSC_MEMTYPE_DEVICE,&bas->rootpackopt_d[i]);CHKERRQ(ierr); 1702cd620004SJunchao Zhang } 1703cd620004SJunchao Zhang PetscFunctionReturn(0); 1704cd620004SJunchao Zhang } 1705