120c24465SJunchao Zhang #include "petsc/private/sfimpl.h" 240e23c03SJunchao Zhang #include <../src/vec/is/sf/impls/basic/sfpack.h> 340e23c03SJunchao Zhang #include <../src/vec/is/sf/impls/basic/sfbasic.h> 440e23c03SJunchao Zhang 57fd2d3dbSJunchao Zhang /* This is a C file that contains packing facilities, with dispatches to device if enabled. */ 67fd2d3dbSJunchao Zhang 740e23c03SJunchao Zhang /* 840e23c03SJunchao Zhang * MPI_Reduce_local is not really useful because it can't handle sparse data and it vectorizes "in the wrong direction", 940e23c03SJunchao Zhang * therefore we pack data types manually. This file defines packing routines for the standard data types. 1040e23c03SJunchao Zhang */ 1140e23c03SJunchao Zhang 12cd620004SJunchao Zhang #define CPPJoin4(a,b,c,d) a##_##b##_##c##_##d 1340e23c03SJunchao Zhang 14cd620004SJunchao Zhang /* Operations working like s += t */ 15cd620004SJunchao Zhang #define OP_BINARY(op,s,t) do {(s) = (s) op (t); } while (0) /* binary ops in the middle such as +, *, && etc. */ 16cd620004SJunchao Zhang #define OP_FUNCTION(op,s,t) do {(s) = op((s),(t)); } while (0) /* ops like a function, such as PetscMax, PetscMin */ 17cd620004SJunchao Zhang #define OP_LXOR(op,s,t) do {(s) = (!(s)) != (!(t));} while (0) /* logical exclusive OR */ 18cd620004SJunchao Zhang #define OP_ASSIGN(op,s,t) do {(s) = (t);} while (0) 19cd620004SJunchao Zhang /* Ref MPI MAXLOC */ 20cd620004SJunchao Zhang #define OP_XLOC(op,s,t) \ 21cd620004SJunchao Zhang do { \ 22cd620004SJunchao Zhang if ((s).u == (t).u) (s).i = PetscMin((s).i,(t).i); \ 23cd620004SJunchao Zhang else if (!((s).u op (t).u)) s = t; \ 24cd620004SJunchao Zhang } while (0) 2540e23c03SJunchao Zhang 2640e23c03SJunchao Zhang /* DEF_PackFunc - macro defining a Pack routine 2740e23c03SJunchao Zhang 2840e23c03SJunchao Zhang Arguments of the macro: 29b23bfdefSJunchao Zhang +Type Type of the basic data in an entry, i.e., int, PetscInt, PetscReal etc. It is not the type of an entry. 30fcc7397dSJunchao Zhang .BS Block size for vectorization. It is a factor of bsz. 31b23bfdefSJunchao Zhang -EQ (bs == BS) ? 1 : 0. EQ is a compile-time const to help compiler optimizations. See below. 3240e23c03SJunchao Zhang 3340e23c03SJunchao Zhang Arguments of the Pack routine: 34cd620004SJunchao Zhang +count Number of indices in idx[]. 35fcc7397dSJunchao Zhang .start When opt and idx are NULL, it means indices are contiguous & start is the first index; otherwise, not used. 36fcc7397dSJunchao Zhang .opt Per-pack optimization plan. NULL means no such plan. 37fcc7397dSJunchao Zhang .idx Indices of entries to packed. 38eb02082bSJunchao Zhang .link Provide a context for the current call, such as link->bs, number of basic types in an entry. Ex. if unit is MPI_2INT, then bs=2 and the basic type is int. 39cd620004SJunchao Zhang .unpacked Address of the unpacked data. The entries will be packed are unpacked[idx[i]],for i in [0,count). 40cd620004SJunchao Zhang -packed Address of the packed data. 4140e23c03SJunchao Zhang */ 42b23bfdefSJunchao Zhang #define DEF_PackFunc(Type,BS,EQ) \ 43fcc7397dSJunchao Zhang static PetscErrorCode CPPJoin4(Pack,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,const void *unpacked,void *packed) \ 44b23bfdefSJunchao Zhang { \ 45b23bfdefSJunchao Zhang const Type *u = (const Type*)unpacked,*u2; \ 46b23bfdefSJunchao Zhang Type *p = (Type*)packed,*p2; \ 47fcc7397dSJunchao Zhang PetscInt i,j,k,X,Y,r,bs=link->bs; \ 48fcc7397dSJunchao Zhang const PetscInt M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */ \ 49b23bfdefSJunchao Zhang const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */ \ 5040e23c03SJunchao Zhang PetscFunctionBegin; \ 519566063dSJacob Faibussowitsch if (!idx) PetscCall(PetscArraycpy(p,u+start*MBS,MBS*count));/* idx[] are contiguous */ \ 52fcc7397dSJunchao Zhang else if (opt) { /* has optimizations available */ \ 53fcc7397dSJunchao Zhang p2 = p; \ 54fcc7397dSJunchao Zhang for (r=0; r<opt->n; r++) { \ 55fcc7397dSJunchao Zhang u2 = u + opt->start[r]*MBS; \ 56fcc7397dSJunchao Zhang X = opt->X[r]; \ 57fcc7397dSJunchao Zhang Y = opt->Y[r]; \ 58fcc7397dSJunchao Zhang for (k=0; k<opt->dz[r]; k++) \ 59fcc7397dSJunchao Zhang for (j=0; j<opt->dy[r]; j++) { \ 609566063dSJacob Faibussowitsch PetscCall(PetscArraycpy(p2,u2+(X*Y*k+X*j)*MBS,opt->dx[r]*MBS)); \ 61fcc7397dSJunchao Zhang p2 += opt->dx[r]*MBS; \ 62fcc7397dSJunchao Zhang } \ 63fcc7397dSJunchao Zhang } \ 64fcc7397dSJunchao Zhang } else { \ 65b23bfdefSJunchao Zhang for (i=0; i<count; i++) \ 66eb02082bSJunchao Zhang for (j=0; j<M; j++) /* Decent compilers should eliminate this loop when M = const 1 */ \ 67eb02082bSJunchao Zhang for (k=0; k<BS; k++) /* Compiler either unrolls (BS=1) or vectorizes (BS=2,4,8,etc) this loop */ \ 68b23bfdefSJunchao Zhang p[i*MBS+j*BS+k] = u[idx[i]*MBS+j*BS+k]; \ 6940e23c03SJunchao Zhang } \ 7040e23c03SJunchao Zhang PetscFunctionReturn(0); \ 7140e23c03SJunchao Zhang } 7240e23c03SJunchao Zhang 73cd620004SJunchao Zhang /* DEF_Action - macro defining a UnpackAndInsert routine that unpacks data from a contiguous buffer 74cd620004SJunchao Zhang and inserts into a sparse array. 7540e23c03SJunchao Zhang 7640e23c03SJunchao Zhang Arguments: 77b23bfdefSJunchao Zhang .Type Type of the data 7840e23c03SJunchao Zhang .BS Block size for vectorization 79b23bfdefSJunchao Zhang .EQ (bs == BS) ? 1 : 0. EQ is a compile-time const. 8040e23c03SJunchao Zhang 8140e23c03SJunchao Zhang Notes: 8240e23c03SJunchao Zhang This macro is not combined with DEF_ActionAndOp because we want to use memcpy in this macro. 8340e23c03SJunchao Zhang */ 84cd620004SJunchao Zhang #define DEF_UnpackFunc(Type,BS,EQ) \ 85fcc7397dSJunchao Zhang static PetscErrorCode CPPJoin4(UnpackAndInsert,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,void *unpacked,const void *packed) \ 86b23bfdefSJunchao Zhang { \ 87b23bfdefSJunchao Zhang Type *u = (Type*)unpacked,*u2; \ 88fcc7397dSJunchao Zhang const Type *p = (const Type*)packed; \ 89fcc7397dSJunchao Zhang PetscInt i,j,k,X,Y,r,bs=link->bs; \ 90fcc7397dSJunchao Zhang const PetscInt M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */ \ 91b23bfdefSJunchao Zhang const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */ \ 9240e23c03SJunchao Zhang PetscFunctionBegin; \ 93b23bfdefSJunchao Zhang if (!idx) { \ 94fcc7397dSJunchao Zhang u += start*MBS; \ 959566063dSJacob Faibussowitsch if (u != p) PetscCall(PetscArraycpy(u,p,count*MBS)); \ 96fcc7397dSJunchao Zhang } else if (opt) { /* has optimizations available */ \ 97fcc7397dSJunchao Zhang for (r=0; r<opt->n; r++) { \ 98fcc7397dSJunchao Zhang u2 = u + opt->start[r]*MBS; \ 99fcc7397dSJunchao Zhang X = opt->X[r]; \ 100fcc7397dSJunchao Zhang Y = opt->Y[r]; \ 101fcc7397dSJunchao Zhang for (k=0; k<opt->dz[r]; k++) \ 102fcc7397dSJunchao Zhang for (j=0; j<opt->dy[r]; j++) { \ 1039566063dSJacob Faibussowitsch PetscCall(PetscArraycpy(u2+(X*Y*k+X*j)*MBS,p,opt->dx[r]*MBS)); \ 104fcc7397dSJunchao Zhang p += opt->dx[r]*MBS; \ 105fcc7397dSJunchao Zhang } \ 106fcc7397dSJunchao Zhang } \ 107fcc7397dSJunchao Zhang } else { \ 108b23bfdefSJunchao Zhang for (i=0; i<count; i++) \ 109b23bfdefSJunchao Zhang for (j=0; j<M; j++) \ 110cd620004SJunchao Zhang for (k=0; k<BS; k++) u[idx[i]*MBS+j*BS+k] = p[i*MBS+j*BS+k]; \ 11140e23c03SJunchao Zhang } \ 11240e23c03SJunchao Zhang PetscFunctionReturn(0); \ 11340e23c03SJunchao Zhang } 11440e23c03SJunchao Zhang 115cd620004SJunchao Zhang /* DEF_UnpackAndOp - macro defining a UnpackAndOp routine where Op should not be Insert 11640e23c03SJunchao Zhang 11740e23c03SJunchao Zhang Arguments: 118cd620004SJunchao Zhang +Opname Name of the Op, such as Add, Mult, LAND, etc. 119b23bfdefSJunchao Zhang .Type Type of the data 12040e23c03SJunchao Zhang .BS Block size for vectorization 121b23bfdefSJunchao Zhang .EQ (bs == BS) ? 1 : 0. EQ is a compile-time const. 122cd620004SJunchao Zhang .Op Operator for the op, such as +, *, &&, ||, PetscMax, PetscMin, etc. 123cd620004SJunchao Zhang .OpApply Macro defining application of the op. Could be OP_BINARY, OP_FUNCTION, OP_LXOR 12440e23c03SJunchao Zhang */ 125cd620004SJunchao Zhang #define DEF_UnpackAndOp(Type,BS,EQ,Opname,Op,OpApply) \ 126fcc7397dSJunchao Zhang static PetscErrorCode CPPJoin4(UnpackAnd##Opname,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,void *unpacked,const void *packed) \ 127b23bfdefSJunchao Zhang { \ 128cd620004SJunchao Zhang Type *u = (Type*)unpacked,*u2; \ 129fcc7397dSJunchao Zhang const Type *p = (const Type*)packed; \ 130fcc7397dSJunchao Zhang PetscInt i,j,k,X,Y,r,bs=link->bs; \ 131fcc7397dSJunchao Zhang const PetscInt M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */ \ 132b23bfdefSJunchao Zhang const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */ \ 13340e23c03SJunchao Zhang PetscFunctionBegin; \ 134b23bfdefSJunchao Zhang if (!idx) { \ 135fcc7397dSJunchao Zhang u += start*MBS; \ 136cd620004SJunchao Zhang for (i=0; i<count; i++) \ 137cd620004SJunchao Zhang for (j=0; j<M; j++) \ 138cd620004SJunchao Zhang for (k=0; k<BS; k++) \ 139cd620004SJunchao Zhang OpApply(Op,u[i*MBS+j*BS+k],p[i*MBS+j*BS+k]); \ 140fcc7397dSJunchao Zhang } else if (opt) { /* idx[] has patterns */ \ 141fcc7397dSJunchao Zhang for (r=0; r<opt->n; r++) { \ 142fcc7397dSJunchao Zhang u2 = u + opt->start[r]*MBS; \ 143fcc7397dSJunchao Zhang X = opt->X[r]; \ 144fcc7397dSJunchao Zhang Y = opt->Y[r]; \ 145fcc7397dSJunchao Zhang for (k=0; k<opt->dz[r]; k++) \ 146fcc7397dSJunchao Zhang for (j=0; j<opt->dy[r]; j++) { \ 147fcc7397dSJunchao Zhang for (i=0; i<opt->dx[r]*MBS; i++) OpApply(Op,u2[(X*Y*k+X*j)*MBS+i],p[i]); \ 148fcc7397dSJunchao Zhang p += opt->dx[r]*MBS; \ 149fcc7397dSJunchao Zhang } \ 150fcc7397dSJunchao Zhang } \ 151fcc7397dSJunchao Zhang } else { \ 152cd620004SJunchao Zhang for (i=0; i<count; i++) \ 153cd620004SJunchao Zhang for (j=0; j<M; j++) \ 154cd620004SJunchao Zhang for (k=0; k<BS; k++) \ 155cd620004SJunchao Zhang OpApply(Op,u[idx[i]*MBS+j*BS+k],p[i*MBS+j*BS+k]); \ 156cd620004SJunchao Zhang } \ 157cd620004SJunchao Zhang PetscFunctionReturn(0); \ 158cd620004SJunchao Zhang } 159cd620004SJunchao Zhang 160cd620004SJunchao Zhang #define DEF_FetchAndOp(Type,BS,EQ,Opname,Op,OpApply) \ 161fcc7397dSJunchao Zhang static PetscErrorCode CPPJoin4(FetchAnd##Opname,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,void *unpacked,void *packed) \ 162cd620004SJunchao Zhang { \ 163fcc7397dSJunchao Zhang Type *u = (Type*)unpacked,*p = (Type*)packed,tmp; \ 164fcc7397dSJunchao Zhang PetscInt i,j,k,r,l,bs=link->bs; \ 165fcc7397dSJunchao Zhang const PetscInt M = (EQ) ? 1 : bs/BS; \ 166fcc7397dSJunchao Zhang const PetscInt MBS = M*BS; \ 167cd620004SJunchao Zhang PetscFunctionBegin; \ 168fcc7397dSJunchao Zhang for (i=0; i<count; i++) { \ 169fcc7397dSJunchao Zhang r = (!idx ? start+i : idx[i])*MBS; \ 170fcc7397dSJunchao Zhang l = i*MBS; \ 171b23bfdefSJunchao Zhang for (j=0; j<M; j++) \ 172b23bfdefSJunchao Zhang for (k=0; k<BS; k++) { \ 173fcc7397dSJunchao Zhang tmp = u[r+j*BS+k]; \ 174fcc7397dSJunchao Zhang OpApply(Op,u[r+j*BS+k],p[l+j*BS+k]); \ 175fcc7397dSJunchao Zhang p[l+j*BS+k] = tmp; \ 176cd620004SJunchao Zhang } \ 177cd620004SJunchao Zhang } \ 178cd620004SJunchao Zhang PetscFunctionReturn(0); \ 179cd620004SJunchao Zhang } 180cd620004SJunchao Zhang 181cd620004SJunchao Zhang #define DEF_ScatterAndOp(Type,BS,EQ,Opname,Op,OpApply) \ 182fcc7397dSJunchao Zhang static PetscErrorCode CPPJoin4(ScatterAnd##Opname,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt srcStart,PetscSFPackOpt srcOpt,const PetscInt *srcIdx,const void *src,PetscInt dstStart,PetscSFPackOpt dstOpt,const PetscInt *dstIdx,void *dst) \ 183cd620004SJunchao Zhang { \ 184fcc7397dSJunchao Zhang const Type *u = (const Type*)src; \ 185fcc7397dSJunchao Zhang Type *v = (Type*)dst; \ 186fcc7397dSJunchao Zhang PetscInt i,j,k,s,t,X,Y,bs = link->bs; \ 187cd620004SJunchao Zhang const PetscInt M = (EQ) ? 1 : bs/BS; \ 188cd620004SJunchao Zhang const PetscInt MBS = M*BS; \ 189cd620004SJunchao Zhang PetscFunctionBegin; \ 190fcc7397dSJunchao Zhang if (!srcIdx) { /* src is contiguous */ \ 191fcc7397dSJunchao Zhang u += srcStart*MBS; \ 1929566063dSJacob Faibussowitsch PetscCall(CPPJoin4(UnpackAnd##Opname,Type,BS,EQ)(link,count,dstStart,dstOpt,dstIdx,dst,u)); \ 193fcc7397dSJunchao Zhang } else if (srcOpt && !dstIdx) { /* src is 3D, dst is contiguous */ \ 194fcc7397dSJunchao Zhang u += srcOpt->start[0]*MBS; \ 195fcc7397dSJunchao Zhang v += dstStart*MBS; \ 196fcc7397dSJunchao Zhang X = srcOpt->X[0]; Y = srcOpt->Y[0]; \ 197fcc7397dSJunchao Zhang for (k=0; k<srcOpt->dz[0]; k++) \ 198fcc7397dSJunchao Zhang for (j=0; j<srcOpt->dy[0]; j++) { \ 199fcc7397dSJunchao Zhang for (i=0; i<srcOpt->dx[0]*MBS; i++) OpApply(Op,v[i],u[(X*Y*k+X*j)*MBS+i]); \ 200fcc7397dSJunchao Zhang v += srcOpt->dx[0]*MBS; \ 201fcc7397dSJunchao Zhang } \ 202fcc7397dSJunchao Zhang } else { /* all other cases */ \ 203fcc7397dSJunchao Zhang for (i=0; i<count; i++) { \ 204fcc7397dSJunchao Zhang s = (!srcIdx ? srcStart+i : srcIdx[i])*MBS; \ 205fcc7397dSJunchao Zhang t = (!dstIdx ? dstStart+i : dstIdx[i])*MBS; \ 206cd620004SJunchao Zhang for (j=0; j<M; j++) \ 207fcc7397dSJunchao Zhang for (k=0; k<BS; k++) OpApply(Op,v[t+j*BS+k],u[s+j*BS+k]); \ 208fcc7397dSJunchao Zhang } \ 209cd620004SJunchao Zhang } \ 210cd620004SJunchao Zhang PetscFunctionReturn(0); \ 211cd620004SJunchao Zhang } 212cd620004SJunchao Zhang 213cd620004SJunchao Zhang #define DEF_FetchAndOpLocal(Type,BS,EQ,Opname,Op,OpApply) \ 214fcc7397dSJunchao Zhang static PetscErrorCode CPPJoin4(FetchAnd##Opname##Local,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt rootstart,PetscSFPackOpt rootopt,const PetscInt *rootidx,void *rootdata,PetscInt leafstart,PetscSFPackOpt leafopt,const PetscInt *leafidx,const void *leafdata,void *leafupdate) \ 215cd620004SJunchao Zhang { \ 216fcc7397dSJunchao Zhang Type *rdata = (Type*)rootdata,*lupdate = (Type*)leafupdate; \ 217fcc7397dSJunchao Zhang const Type *ldata = (const Type*)leafdata; \ 218fcc7397dSJunchao Zhang PetscInt i,j,k,r,l,bs = link->bs; \ 219cd620004SJunchao Zhang const PetscInt M = (EQ) ? 1 : bs/BS; \ 220cd620004SJunchao Zhang const PetscInt MBS = M*BS; \ 221cd620004SJunchao Zhang PetscFunctionBegin; \ 222fcc7397dSJunchao Zhang for (i=0; i<count; i++) { \ 223fcc7397dSJunchao Zhang r = (rootidx ? rootidx[i] : rootstart+i)*MBS; \ 224fcc7397dSJunchao Zhang l = (leafidx ? leafidx[i] : leafstart+i)*MBS; \ 225cd620004SJunchao Zhang for (j=0; j<M; j++) \ 226cd620004SJunchao Zhang for (k=0; k<BS; k++) { \ 227fcc7397dSJunchao Zhang lupdate[l+j*BS+k] = rdata[r+j*BS+k]; \ 228fcc7397dSJunchao Zhang OpApply(Op,rdata[r+j*BS+k],ldata[l+j*BS+k]); \ 22940e23c03SJunchao Zhang } \ 23040e23c03SJunchao Zhang } \ 23140e23c03SJunchao Zhang PetscFunctionReturn(0); \ 23240e23c03SJunchao Zhang } 23340e23c03SJunchao Zhang 234b23bfdefSJunchao Zhang /* Pack, Unpack/Fetch ops */ 235b23bfdefSJunchao Zhang #define DEF_Pack(Type,BS,EQ) \ 236b23bfdefSJunchao Zhang DEF_PackFunc(Type,BS,EQ) \ 237cd620004SJunchao Zhang DEF_UnpackFunc(Type,BS,EQ) \ 238cd620004SJunchao Zhang DEF_ScatterAndOp(Type,BS,EQ,Insert,=,OP_ASSIGN) \ 239cd620004SJunchao Zhang static void CPPJoin4(PackInit_Pack,Type,BS,EQ)(PetscSFLink link) { \ 240eb02082bSJunchao Zhang link->h_Pack = CPPJoin4(Pack, Type,BS,EQ); \ 241eb02082bSJunchao Zhang link->h_UnpackAndInsert = CPPJoin4(UnpackAndInsert,Type,BS,EQ); \ 242cd620004SJunchao Zhang link->h_ScatterAndInsert= CPPJoin4(ScatterAndInsert,Type,BS,EQ); \ 24340e23c03SJunchao Zhang } 24440e23c03SJunchao Zhang 245b23bfdefSJunchao Zhang /* Add, Mult ops */ 246b23bfdefSJunchao Zhang #define DEF_Add(Type,BS,EQ) \ 247cd620004SJunchao Zhang DEF_UnpackAndOp (Type,BS,EQ,Add, +,OP_BINARY) \ 248cd620004SJunchao Zhang DEF_UnpackAndOp (Type,BS,EQ,Mult,*,OP_BINARY) \ 249cd620004SJunchao Zhang DEF_FetchAndOp (Type,BS,EQ,Add, +,OP_BINARY) \ 250cd620004SJunchao Zhang DEF_ScatterAndOp (Type,BS,EQ,Add, +,OP_BINARY) \ 251cd620004SJunchao Zhang DEF_ScatterAndOp (Type,BS,EQ,Mult,*,OP_BINARY) \ 252cd620004SJunchao Zhang DEF_FetchAndOpLocal(Type,BS,EQ,Add, +,OP_BINARY) \ 253cd620004SJunchao Zhang static void CPPJoin4(PackInit_Add,Type,BS,EQ)(PetscSFLink link) { \ 254eb02082bSJunchao Zhang link->h_UnpackAndAdd = CPPJoin4(UnpackAndAdd, Type,BS,EQ); \ 255eb02082bSJunchao Zhang link->h_UnpackAndMult = CPPJoin4(UnpackAndMult, Type,BS,EQ); \ 256eb02082bSJunchao Zhang link->h_FetchAndAdd = CPPJoin4(FetchAndAdd, Type,BS,EQ); \ 257cd620004SJunchao Zhang link->h_ScatterAndAdd = CPPJoin4(ScatterAndAdd, Type,BS,EQ); \ 258cd620004SJunchao Zhang link->h_ScatterAndMult = CPPJoin4(ScatterAndMult, Type,BS,EQ); \ 259cd620004SJunchao Zhang link->h_FetchAndAddLocal = CPPJoin4(FetchAndAddLocal,Type,BS,EQ); \ 26040e23c03SJunchao Zhang } 26140e23c03SJunchao Zhang 262b23bfdefSJunchao Zhang /* Max, Min ops */ 263b23bfdefSJunchao Zhang #define DEF_Cmp(Type,BS,EQ) \ 264cd620004SJunchao Zhang DEF_UnpackAndOp (Type,BS,EQ,Max,PetscMax,OP_FUNCTION) \ 265cd620004SJunchao Zhang DEF_UnpackAndOp (Type,BS,EQ,Min,PetscMin,OP_FUNCTION) \ 266cd620004SJunchao Zhang DEF_ScatterAndOp(Type,BS,EQ,Max,PetscMax,OP_FUNCTION) \ 267cd620004SJunchao Zhang DEF_ScatterAndOp(Type,BS,EQ,Min,PetscMin,OP_FUNCTION) \ 268cd620004SJunchao Zhang static void CPPJoin4(PackInit_Compare,Type,BS,EQ)(PetscSFLink link) { \ 269eb02082bSJunchao Zhang link->h_UnpackAndMax = CPPJoin4(UnpackAndMax, Type,BS,EQ); \ 270eb02082bSJunchao Zhang link->h_UnpackAndMin = CPPJoin4(UnpackAndMin, Type,BS,EQ); \ 271cd620004SJunchao Zhang link->h_ScatterAndMax = CPPJoin4(ScatterAndMax, Type,BS,EQ); \ 272cd620004SJunchao Zhang link->h_ScatterAndMin = CPPJoin4(ScatterAndMin, Type,BS,EQ); \ 273b23bfdefSJunchao Zhang } 274b23bfdefSJunchao Zhang 275b23bfdefSJunchao Zhang /* Logical ops. 276cd620004SJunchao Zhang The operator in OP_LXOR should be empty but is ||. It is not used. Put here to avoid 27740e23c03SJunchao Zhang the compilation warning "empty macro arguments are undefined in ISO C90" 27840e23c03SJunchao Zhang */ 279b23bfdefSJunchao Zhang #define DEF_Log(Type,BS,EQ) \ 280cd620004SJunchao Zhang DEF_UnpackAndOp (Type,BS,EQ,LAND,&&,OP_BINARY) \ 281cd620004SJunchao Zhang DEF_UnpackAndOp (Type,BS,EQ,LOR, ||,OP_BINARY) \ 282cd620004SJunchao Zhang DEF_UnpackAndOp (Type,BS,EQ,LXOR,||, OP_LXOR) \ 283cd620004SJunchao Zhang DEF_ScatterAndOp(Type,BS,EQ,LAND,&&,OP_BINARY) \ 284cd620004SJunchao Zhang DEF_ScatterAndOp(Type,BS,EQ,LOR, ||,OP_BINARY) \ 285cd620004SJunchao Zhang DEF_ScatterAndOp(Type,BS,EQ,LXOR,||, OP_LXOR) \ 286cd620004SJunchao Zhang static void CPPJoin4(PackInit_Logical,Type,BS,EQ)(PetscSFLink link) { \ 287eb02082bSJunchao Zhang link->h_UnpackAndLAND = CPPJoin4(UnpackAndLAND, Type,BS,EQ); \ 288eb02082bSJunchao Zhang link->h_UnpackAndLOR = CPPJoin4(UnpackAndLOR, Type,BS,EQ); \ 289eb02082bSJunchao Zhang link->h_UnpackAndLXOR = CPPJoin4(UnpackAndLXOR, Type,BS,EQ); \ 290cd620004SJunchao Zhang link->h_ScatterAndLAND = CPPJoin4(ScatterAndLAND,Type,BS,EQ); \ 291cd620004SJunchao Zhang link->h_ScatterAndLOR = CPPJoin4(ScatterAndLOR, Type,BS,EQ); \ 292cd620004SJunchao Zhang link->h_ScatterAndLXOR = CPPJoin4(ScatterAndLXOR,Type,BS,EQ); \ 29340e23c03SJunchao Zhang } 29440e23c03SJunchao Zhang 295b23bfdefSJunchao Zhang /* Bitwise ops */ 296b23bfdefSJunchao Zhang #define DEF_Bit(Type,BS,EQ) \ 297cd620004SJunchao Zhang DEF_UnpackAndOp (Type,BS,EQ,BAND,&,OP_BINARY) \ 298cd620004SJunchao Zhang DEF_UnpackAndOp (Type,BS,EQ,BOR, |,OP_BINARY) \ 299cd620004SJunchao Zhang DEF_UnpackAndOp (Type,BS,EQ,BXOR,^,OP_BINARY) \ 300cd620004SJunchao Zhang DEF_ScatterAndOp(Type,BS,EQ,BAND,&,OP_BINARY) \ 301cd620004SJunchao Zhang DEF_ScatterAndOp(Type,BS,EQ,BOR, |,OP_BINARY) \ 302cd620004SJunchao Zhang DEF_ScatterAndOp(Type,BS,EQ,BXOR,^,OP_BINARY) \ 303cd620004SJunchao Zhang static void CPPJoin4(PackInit_Bitwise,Type,BS,EQ)(PetscSFLink link) { \ 304eb02082bSJunchao Zhang link->h_UnpackAndBAND = CPPJoin4(UnpackAndBAND, Type,BS,EQ); \ 305eb02082bSJunchao Zhang link->h_UnpackAndBOR = CPPJoin4(UnpackAndBOR, Type,BS,EQ); \ 306eb02082bSJunchao Zhang link->h_UnpackAndBXOR = CPPJoin4(UnpackAndBXOR, Type,BS,EQ); \ 307cd620004SJunchao Zhang link->h_ScatterAndBAND = CPPJoin4(ScatterAndBAND,Type,BS,EQ); \ 308cd620004SJunchao Zhang link->h_ScatterAndBOR = CPPJoin4(ScatterAndBOR, Type,BS,EQ); \ 309cd620004SJunchao Zhang link->h_ScatterAndBXOR = CPPJoin4(ScatterAndBXOR,Type,BS,EQ); \ 31040e23c03SJunchao Zhang } 31140e23c03SJunchao Zhang 312cd620004SJunchao Zhang /* Maxloc, Minloc ops */ 313cd620004SJunchao Zhang #define DEF_Xloc(Type,BS,EQ) \ 314cd620004SJunchao Zhang DEF_UnpackAndOp (Type,BS,EQ,Max,>,OP_XLOC) \ 315cd620004SJunchao Zhang DEF_UnpackAndOp (Type,BS,EQ,Min,<,OP_XLOC) \ 316cd620004SJunchao Zhang DEF_ScatterAndOp(Type,BS,EQ,Max,>,OP_XLOC) \ 317cd620004SJunchao Zhang DEF_ScatterAndOp(Type,BS,EQ,Min,<,OP_XLOC) \ 318cd620004SJunchao Zhang static void CPPJoin4(PackInit_Xloc,Type,BS,EQ)(PetscSFLink link) { \ 319cd620004SJunchao Zhang link->h_UnpackAndMaxloc = CPPJoin4(UnpackAndMax, Type,BS,EQ); \ 320cd620004SJunchao Zhang link->h_UnpackAndMinloc = CPPJoin4(UnpackAndMin, Type,BS,EQ); \ 321cd620004SJunchao Zhang link->h_ScatterAndMaxloc = CPPJoin4(ScatterAndMax,Type,BS,EQ); \ 322cd620004SJunchao Zhang link->h_ScatterAndMinloc = CPPJoin4(ScatterAndMin,Type,BS,EQ); \ 32340e23c03SJunchao Zhang } 32440e23c03SJunchao Zhang 325b23bfdefSJunchao Zhang #define DEF_IntegerType(Type,BS,EQ) \ 326b23bfdefSJunchao Zhang DEF_Pack(Type,BS,EQ) \ 327b23bfdefSJunchao Zhang DEF_Add(Type,BS,EQ) \ 328b23bfdefSJunchao Zhang DEF_Cmp(Type,BS,EQ) \ 329b23bfdefSJunchao Zhang DEF_Log(Type,BS,EQ) \ 330b23bfdefSJunchao Zhang DEF_Bit(Type,BS,EQ) \ 331cd620004SJunchao Zhang static void CPPJoin4(PackInit_IntegerType,Type,BS,EQ)(PetscSFLink link) { \ 332b23bfdefSJunchao Zhang CPPJoin4(PackInit_Pack,Type,BS,EQ)(link); \ 333b23bfdefSJunchao Zhang CPPJoin4(PackInit_Add,Type,BS,EQ)(link); \ 334b23bfdefSJunchao Zhang CPPJoin4(PackInit_Compare,Type,BS,EQ)(link); \ 335b23bfdefSJunchao Zhang CPPJoin4(PackInit_Logical,Type,BS,EQ)(link); \ 336b23bfdefSJunchao Zhang CPPJoin4(PackInit_Bitwise,Type,BS,EQ)(link); \ 33740e23c03SJunchao Zhang } 33840e23c03SJunchao Zhang 339b23bfdefSJunchao Zhang #define DEF_RealType(Type,BS,EQ) \ 340b23bfdefSJunchao Zhang DEF_Pack(Type,BS,EQ) \ 341b23bfdefSJunchao Zhang DEF_Add(Type,BS,EQ) \ 342b23bfdefSJunchao Zhang DEF_Cmp(Type,BS,EQ) \ 343cd620004SJunchao Zhang static void CPPJoin4(PackInit_RealType,Type,BS,EQ)(PetscSFLink link) { \ 344b23bfdefSJunchao Zhang CPPJoin4(PackInit_Pack,Type,BS,EQ)(link); \ 345b23bfdefSJunchao Zhang CPPJoin4(PackInit_Add,Type,BS,EQ)(link); \ 346b23bfdefSJunchao Zhang CPPJoin4(PackInit_Compare,Type,BS,EQ)(link); \ 347b23bfdefSJunchao Zhang } 34840e23c03SJunchao Zhang 34940e23c03SJunchao Zhang #if defined(PETSC_HAVE_COMPLEX) 350b23bfdefSJunchao Zhang #define DEF_ComplexType(Type,BS,EQ) \ 351b23bfdefSJunchao Zhang DEF_Pack(Type,BS,EQ) \ 352b23bfdefSJunchao Zhang DEF_Add(Type,BS,EQ) \ 353cd620004SJunchao Zhang static void CPPJoin4(PackInit_ComplexType,Type,BS,EQ)(PetscSFLink link) { \ 354b23bfdefSJunchao Zhang CPPJoin4(PackInit_Pack,Type,BS,EQ)(link); \ 355b23bfdefSJunchao Zhang CPPJoin4(PackInit_Add,Type,BS,EQ)(link); \ 356b23bfdefSJunchao Zhang } 35740e23c03SJunchao Zhang #endif 358b23bfdefSJunchao Zhang 359b23bfdefSJunchao Zhang #define DEF_DumbType(Type,BS,EQ) \ 360b23bfdefSJunchao Zhang DEF_Pack(Type,BS,EQ) \ 361cd620004SJunchao Zhang static void CPPJoin4(PackInit_DumbType,Type,BS,EQ)(PetscSFLink link) { \ 362b23bfdefSJunchao Zhang CPPJoin4(PackInit_Pack,Type,BS,EQ)(link); \ 363b23bfdefSJunchao Zhang } 364b23bfdefSJunchao Zhang 365b23bfdefSJunchao Zhang /* Maxloc, Minloc */ 366cd620004SJunchao Zhang #define DEF_PairType(Type,BS,EQ) \ 367cd620004SJunchao Zhang DEF_Pack(Type,BS,EQ) \ 368cd620004SJunchao Zhang DEF_Xloc(Type,BS,EQ) \ 369cd620004SJunchao Zhang static void CPPJoin4(PackInit_PairType,Type,BS,EQ)(PetscSFLink link) { \ 370cd620004SJunchao Zhang CPPJoin4(PackInit_Pack,Type,BS,EQ)(link); \ 371cd620004SJunchao Zhang CPPJoin4(PackInit_Xloc,Type,BS,EQ)(link); \ 372b23bfdefSJunchao Zhang } 373b23bfdefSJunchao Zhang 374b23bfdefSJunchao Zhang DEF_IntegerType(PetscInt,1,1) /* unit = 1 MPIU_INT */ 375b23bfdefSJunchao Zhang DEF_IntegerType(PetscInt,2,1) /* unit = 2 MPIU_INTs */ 376b23bfdefSJunchao Zhang DEF_IntegerType(PetscInt,4,1) /* unit = 4 MPIU_INTs */ 377b23bfdefSJunchao Zhang DEF_IntegerType(PetscInt,8,1) /* unit = 8 MPIU_INTs */ 378b23bfdefSJunchao Zhang DEF_IntegerType(PetscInt,1,0) /* unit = 1*n MPIU_INTs, n>1 */ 379b23bfdefSJunchao Zhang DEF_IntegerType(PetscInt,2,0) /* unit = 2*n MPIU_INTs, n>1 */ 380b23bfdefSJunchao Zhang DEF_IntegerType(PetscInt,4,0) /* unit = 4*n MPIU_INTs, n>1 */ 381b23bfdefSJunchao Zhang DEF_IntegerType(PetscInt,8,0) /* unit = 8*n MPIU_INTs, n>1. Routines with bigger BS are tried first. */ 382b23bfdefSJunchao Zhang 383b23bfdefSJunchao Zhang #if defined(PETSC_USE_64BIT_INDICES) /* Do not need (though it is OK) to generate redundant functions if PetscInt is int */ 384b23bfdefSJunchao Zhang DEF_IntegerType(int,1,1) 385b23bfdefSJunchao Zhang DEF_IntegerType(int,2,1) 386b23bfdefSJunchao Zhang DEF_IntegerType(int,4,1) 387b23bfdefSJunchao Zhang DEF_IntegerType(int,8,1) 388b23bfdefSJunchao Zhang DEF_IntegerType(int,1,0) 389b23bfdefSJunchao Zhang DEF_IntegerType(int,2,0) 390b23bfdefSJunchao Zhang DEF_IntegerType(int,4,0) 391b23bfdefSJunchao Zhang DEF_IntegerType(int,8,0) 392b23bfdefSJunchao Zhang #endif 393b23bfdefSJunchao Zhang 394b23bfdefSJunchao Zhang /* The typedefs are used to get a typename without space that CPPJoin can handle */ 395b23bfdefSJunchao Zhang typedef signed char SignedChar; 396b23bfdefSJunchao Zhang DEF_IntegerType(SignedChar,1,1) 397b23bfdefSJunchao Zhang DEF_IntegerType(SignedChar,2,1) 398b23bfdefSJunchao Zhang DEF_IntegerType(SignedChar,4,1) 399b23bfdefSJunchao Zhang DEF_IntegerType(SignedChar,8,1) 400b23bfdefSJunchao Zhang DEF_IntegerType(SignedChar,1,0) 401b23bfdefSJunchao Zhang DEF_IntegerType(SignedChar,2,0) 402b23bfdefSJunchao Zhang DEF_IntegerType(SignedChar,4,0) 403b23bfdefSJunchao Zhang DEF_IntegerType(SignedChar,8,0) 404b23bfdefSJunchao Zhang 405b23bfdefSJunchao Zhang typedef unsigned char UnsignedChar; 406b23bfdefSJunchao Zhang DEF_IntegerType(UnsignedChar,1,1) 407b23bfdefSJunchao Zhang DEF_IntegerType(UnsignedChar,2,1) 408b23bfdefSJunchao Zhang DEF_IntegerType(UnsignedChar,4,1) 409b23bfdefSJunchao Zhang DEF_IntegerType(UnsignedChar,8,1) 410b23bfdefSJunchao Zhang DEF_IntegerType(UnsignedChar,1,0) 411b23bfdefSJunchao Zhang DEF_IntegerType(UnsignedChar,2,0) 412b23bfdefSJunchao Zhang DEF_IntegerType(UnsignedChar,4,0) 413b23bfdefSJunchao Zhang DEF_IntegerType(UnsignedChar,8,0) 414b23bfdefSJunchao Zhang 415b23bfdefSJunchao Zhang DEF_RealType(PetscReal,1,1) 416b23bfdefSJunchao Zhang DEF_RealType(PetscReal,2,1) 417b23bfdefSJunchao Zhang DEF_RealType(PetscReal,4,1) 418b23bfdefSJunchao Zhang DEF_RealType(PetscReal,8,1) 419b23bfdefSJunchao Zhang DEF_RealType(PetscReal,1,0) 420b23bfdefSJunchao Zhang DEF_RealType(PetscReal,2,0) 421b23bfdefSJunchao Zhang DEF_RealType(PetscReal,4,0) 422b23bfdefSJunchao Zhang DEF_RealType(PetscReal,8,0) 423b23bfdefSJunchao Zhang 424b23bfdefSJunchao Zhang #if defined(PETSC_HAVE_COMPLEX) 425b23bfdefSJunchao Zhang DEF_ComplexType(PetscComplex,1,1) 426b23bfdefSJunchao Zhang DEF_ComplexType(PetscComplex,2,1) 427b23bfdefSJunchao Zhang DEF_ComplexType(PetscComplex,4,1) 428b23bfdefSJunchao Zhang DEF_ComplexType(PetscComplex,8,1) 429b23bfdefSJunchao Zhang DEF_ComplexType(PetscComplex,1,0) 430b23bfdefSJunchao Zhang DEF_ComplexType(PetscComplex,2,0) 431b23bfdefSJunchao Zhang DEF_ComplexType(PetscComplex,4,0) 432b23bfdefSJunchao Zhang DEF_ComplexType(PetscComplex,8,0) 433b23bfdefSJunchao Zhang #endif 434b23bfdefSJunchao Zhang 435cd620004SJunchao Zhang #define PairType(Type1,Type2) Type1##_##Type2 436cd620004SJunchao Zhang typedef struct {int u; int i;} PairType(int,int); 437cd620004SJunchao Zhang typedef struct {PetscInt u; PetscInt i;} PairType(PetscInt,PetscInt); 438cd620004SJunchao Zhang DEF_PairType(PairType(int,int),1,1) 439cd620004SJunchao Zhang DEF_PairType(PairType(PetscInt,PetscInt),1,1) 440b23bfdefSJunchao Zhang 441b23bfdefSJunchao Zhang /* If we don't know the basic type, we treat it as a stream of chars or ints */ 442b23bfdefSJunchao Zhang DEF_DumbType(char,1,1) 443b23bfdefSJunchao Zhang DEF_DumbType(char,2,1) 444b23bfdefSJunchao Zhang DEF_DumbType(char,4,1) 445b23bfdefSJunchao Zhang DEF_DumbType(char,1,0) 446b23bfdefSJunchao Zhang DEF_DumbType(char,2,0) 447b23bfdefSJunchao Zhang DEF_DumbType(char,4,0) 448b23bfdefSJunchao Zhang 449eb02082bSJunchao Zhang typedef int DumbInt; /* To have a different name than 'int' used above. The name is used to make routine names. */ 450b23bfdefSJunchao Zhang DEF_DumbType(DumbInt,1,1) 451b23bfdefSJunchao Zhang DEF_DumbType(DumbInt,2,1) 452b23bfdefSJunchao Zhang DEF_DumbType(DumbInt,4,1) 453b23bfdefSJunchao Zhang DEF_DumbType(DumbInt,8,1) 454b23bfdefSJunchao Zhang DEF_DumbType(DumbInt,1,0) 455b23bfdefSJunchao Zhang DEF_DumbType(DumbInt,2,0) 456b23bfdefSJunchao Zhang DEF_DumbType(DumbInt,4,0) 457b23bfdefSJunchao Zhang DEF_DumbType(DumbInt,8,0) 45840e23c03SJunchao Zhang 45971438e86SJunchao Zhang PetscErrorCode PetscSFLinkDestroy(PetscSF sf,PetscSFLink link) 46040e23c03SJunchao Zhang { 461cd620004SJunchao Zhang PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 46271438e86SJunchao Zhang PetscInt i,nreqs = (bas->nrootreqs+sf->nleafreqs)*8; 46371438e86SJunchao Zhang 46471438e86SJunchao Zhang PetscFunctionBegin; 46571438e86SJunchao Zhang /* Destroy device-specific fields */ 4669566063dSJacob Faibussowitsch if (link->deviceinited) PetscCall((*link->Destroy)(sf,link)); 46771438e86SJunchao Zhang 46871438e86SJunchao Zhang /* Destroy host related fields */ 4699566063dSJacob Faibussowitsch if (!link->isbuiltin) PetscCallMPI(MPI_Type_free(&link->unit)); 47071438e86SJunchao Zhang if (!link->use_nvshmem) { 47171438e86SJunchao Zhang for (i=0; i<nreqs; i++) { /* Persistent reqs must be freed. */ 4729566063dSJacob Faibussowitsch if (link->reqs[i] != MPI_REQUEST_NULL) PetscCallMPI(MPI_Request_free(&link->reqs[i])); 47371438e86SJunchao Zhang } 4749566063dSJacob Faibussowitsch PetscCall(PetscFree(link->reqs)); 47571438e86SJunchao Zhang for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) { 4769566063dSJacob Faibussowitsch PetscCall(PetscFree(link->rootbuf_alloc[i][PETSC_MEMTYPE_HOST])); 4779566063dSJacob Faibussowitsch PetscCall(PetscFree(link->leafbuf_alloc[i][PETSC_MEMTYPE_HOST])); 47871438e86SJunchao Zhang } 47971438e86SJunchao Zhang } 4809566063dSJacob Faibussowitsch PetscCall(PetscFree(link)); 48171438e86SJunchao Zhang PetscFunctionReturn(0); 48271438e86SJunchao Zhang } 48371438e86SJunchao Zhang 48471438e86SJunchao Zhang PetscErrorCode PetscSFLinkCreate(PetscSF sf,MPI_Datatype unit,PetscMemType rootmtype,const void *rootdata,PetscMemType leafmtype,const void *leafdata,MPI_Op op,PetscSFOperation sfop,PetscSFLink *mylink) 48571438e86SJunchao Zhang { 486cd620004SJunchao Zhang PetscFunctionBegin; 4879566063dSJacob Faibussowitsch PetscCall(PetscSFSetErrorOnUnsupportedOverlap(sf,unit,rootdata,leafdata)); 48871438e86SJunchao Zhang #if defined(PETSC_HAVE_NVSHMEM) 48971438e86SJunchao Zhang { 49071438e86SJunchao Zhang PetscBool use_nvshmem; 4919566063dSJacob Faibussowitsch PetscCall(PetscSFLinkNvshmemCheck(sf,rootmtype,rootdata,leafmtype,leafdata,&use_nvshmem)); 49271438e86SJunchao Zhang if (use_nvshmem) { 4939566063dSJacob Faibussowitsch PetscCall(PetscSFLinkCreate_NVSHMEM(sf,unit,rootmtype,rootdata,leafmtype,leafdata,op,sfop,mylink)); 49471438e86SJunchao Zhang PetscFunctionReturn(0); 495cd620004SJunchao Zhang } 496cd620004SJunchao Zhang } 4977fd2d3dbSJunchao Zhang #endif 4989566063dSJacob Faibussowitsch PetscCall(PetscSFLinkCreate_MPI(sf,unit,rootmtype,rootdata,leafmtype,leafdata,op,sfop,mylink)); 499cd620004SJunchao Zhang PetscFunctionReturn(0); 500cd620004SJunchao Zhang } 501cd620004SJunchao Zhang 502cd620004SJunchao Zhang /* Return root/leaf buffers and MPI requests attached to the link for MPI communication in the given direction. 503cd620004SJunchao Zhang If the sf uses persistent requests and the requests have not been initialized, then initialize them. 504cd620004SJunchao Zhang */ 505cd620004SJunchao Zhang PetscErrorCode PetscSFLinkGetMPIBuffersAndRequests(PetscSF sf,PetscSFLink link,PetscSFDirection direction,void **rootbuf, void **leafbuf,MPI_Request **rootreqs,MPI_Request **leafreqs) 506cd620004SJunchao Zhang { 507cd620004SJunchao Zhang PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 508c87b50c4SJunchao Zhang PetscInt i,j,cnt,nrootranks,ndrootranks,nleafranks,ndleafranks; 509cd620004SJunchao Zhang const PetscInt *rootoffset,*leafoffset; 510cd620004SJunchao Zhang MPI_Aint disp; 511cd620004SJunchao Zhang MPI_Comm comm = PetscObjectComm((PetscObject)sf); 512cd620004SJunchao Zhang MPI_Datatype unit = link->unit; 513cd620004SJunchao Zhang const PetscMemType rootmtype_mpi = link->rootmtype_mpi,leafmtype_mpi = link->leafmtype_mpi; /* Used to select buffers passed to MPI */ 514cd620004SJunchao Zhang const PetscInt rootdirect_mpi = link->rootdirect_mpi,leafdirect_mpi = link->leafdirect_mpi; 515cd620004SJunchao Zhang 516cd620004SJunchao Zhang PetscFunctionBegin; 517cd620004SJunchao Zhang /* Init persistent MPI requests if not yet. Currently only SFBasic uses persistent MPI */ 518cd620004SJunchao Zhang if (sf->persistent) { 519cd620004SJunchao Zhang if (rootreqs && bas->rootbuflen[PETSCSF_REMOTE] && !link->rootreqsinited[direction][rootmtype_mpi][rootdirect_mpi]) { 5209566063dSJacob Faibussowitsch PetscCall(PetscSFGetRootInfo_Basic(sf,&nrootranks,&ndrootranks,NULL,&rootoffset,NULL)); 521cd620004SJunchao Zhang if (direction == PETSCSF_LEAF2ROOT) { 522cd620004SJunchao Zhang for (i=ndrootranks,j=0; i<nrootranks; i++,j++) { 523cd620004SJunchao Zhang disp = (rootoffset[i] - rootoffset[ndrootranks])*link->unitbytes; 524c87b50c4SJunchao Zhang cnt = rootoffset[i+1]-rootoffset[i]; 5259566063dSJacob Faibussowitsch PetscCallMPI(MPIU_Recv_init(link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi]+disp,cnt,unit,bas->iranks[i],link->tag,comm,link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi]+j)); 526cd620004SJunchao Zhang } 527cd620004SJunchao Zhang } else { /* PETSCSF_ROOT2LEAF */ 528cd620004SJunchao Zhang for (i=ndrootranks,j=0; i<nrootranks; i++,j++) { 529cd620004SJunchao Zhang disp = (rootoffset[i] - rootoffset[ndrootranks])*link->unitbytes; 530c87b50c4SJunchao Zhang cnt = rootoffset[i+1]-rootoffset[i]; 5319566063dSJacob Faibussowitsch PetscCallMPI(MPIU_Send_init(link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi]+disp,cnt,unit,bas->iranks[i],link->tag,comm,link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi]+j)); 532cd620004SJunchao Zhang } 533cd620004SJunchao Zhang } 534cd620004SJunchao Zhang link->rootreqsinited[direction][rootmtype_mpi][rootdirect_mpi] = PETSC_TRUE; 535cd620004SJunchao Zhang } 536cd620004SJunchao Zhang 537cd620004SJunchao Zhang if (leafreqs && sf->leafbuflen[PETSCSF_REMOTE] && !link->leafreqsinited[direction][leafmtype_mpi][leafdirect_mpi]) { 5389566063dSJacob Faibussowitsch PetscCall(PetscSFGetLeafInfo_Basic(sf,&nleafranks,&ndleafranks,NULL,&leafoffset,NULL,NULL)); 539cd620004SJunchao Zhang if (direction == PETSCSF_LEAF2ROOT) { 540cd620004SJunchao Zhang for (i=ndleafranks,j=0; i<nleafranks; i++,j++) { 541cd620004SJunchao Zhang disp = (leafoffset[i] - leafoffset[ndleafranks])*link->unitbytes; 542c87b50c4SJunchao Zhang cnt = leafoffset[i+1]-leafoffset[i]; 5439566063dSJacob Faibussowitsch PetscCallMPI(MPIU_Send_init(link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi]+disp,cnt,unit,sf->ranks[i],link->tag,comm,link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi]+j)); 544cd620004SJunchao Zhang } 545cd620004SJunchao Zhang } else { /* PETSCSF_ROOT2LEAF */ 546cd620004SJunchao Zhang for (i=ndleafranks,j=0; i<nleafranks; i++,j++) { 547cd620004SJunchao Zhang disp = (leafoffset[i] - leafoffset[ndleafranks])*link->unitbytes; 548c87b50c4SJunchao Zhang cnt = leafoffset[i+1]-leafoffset[i]; 5499566063dSJacob Faibussowitsch PetscCallMPI(MPIU_Recv_init(link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi]+disp,cnt,unit,sf->ranks[i],link->tag,comm,link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi]+j)); 550cd620004SJunchao Zhang } 551cd620004SJunchao Zhang } 552cd620004SJunchao Zhang link->leafreqsinited[direction][leafmtype_mpi][leafdirect_mpi] = PETSC_TRUE; 553cd620004SJunchao Zhang } 554cd620004SJunchao Zhang } 555cd620004SJunchao Zhang if (rootbuf) *rootbuf = link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi]; 556cd620004SJunchao Zhang if (leafbuf) *leafbuf = link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi]; 557cd620004SJunchao Zhang if (rootreqs) *rootreqs = link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi]; 558cd620004SJunchao Zhang if (leafreqs) *leafreqs = link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi]; 559cd620004SJunchao Zhang PetscFunctionReturn(0); 560cd620004SJunchao Zhang } 561cd620004SJunchao Zhang 562cd620004SJunchao Zhang PetscErrorCode PetscSFLinkGetInUse(PetscSF sf,MPI_Datatype unit,const void *rootdata,const void *leafdata,PetscCopyMode cmode,PetscSFLink *mylink) 563cd620004SJunchao Zhang { 564cd620004SJunchao Zhang PetscSFLink link,*p; 56540e23c03SJunchao Zhang PetscSF_Basic *bas=(PetscSF_Basic*)sf->data; 56640e23c03SJunchao Zhang 56740e23c03SJunchao Zhang PetscFunctionBegin; 56840e23c03SJunchao Zhang /* Look for types in cache */ 56940e23c03SJunchao Zhang for (p=&bas->inuse; (link=*p); p=&link->next) { 57040e23c03SJunchao Zhang PetscBool match; 5719566063dSJacob Faibussowitsch PetscCall(MPIPetsc_Type_compare(unit,link->unit,&match)); 572637e6665SJunchao Zhang if (match && (rootdata == link->rootdata) && (leafdata == link->leafdata)) { 57340e23c03SJunchao Zhang switch (cmode) { 57440e23c03SJunchao Zhang case PETSC_OWN_POINTER: *p = link->next; break; /* Remove from inuse list */ 57540e23c03SJunchao Zhang case PETSC_USE_POINTER: break; 57640e23c03SJunchao Zhang default: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"invalid cmode"); 57740e23c03SJunchao Zhang } 57840e23c03SJunchao Zhang *mylink = link; 57940e23c03SJunchao Zhang PetscFunctionReturn(0); 58040e23c03SJunchao Zhang } 58140e23c03SJunchao Zhang } 58240e23c03SJunchao Zhang SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Could not find pack"); 58340e23c03SJunchao Zhang } 58440e23c03SJunchao Zhang 58571438e86SJunchao Zhang PetscErrorCode PetscSFLinkReclaim(PetscSF sf,PetscSFLink *mylink) 58640e23c03SJunchao Zhang { 58740e23c03SJunchao Zhang PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 58871438e86SJunchao Zhang PetscSFLink link = *mylink; 58940e23c03SJunchao Zhang 59040e23c03SJunchao Zhang PetscFunctionBegin; 59171438e86SJunchao Zhang link->rootdata = NULL; 59271438e86SJunchao Zhang link->leafdata = NULL; 59371438e86SJunchao Zhang link->next = bas->avail; 59471438e86SJunchao Zhang bas->avail = link; 59571438e86SJunchao Zhang *mylink = NULL; 596eb02082bSJunchao Zhang PetscFunctionReturn(0); 597eb02082bSJunchao Zhang } 598eb02082bSJunchao Zhang 5999d1c8addSJunchao Zhang /* Error out on unsupported overlapped communications */ 600cd620004SJunchao Zhang PetscErrorCode PetscSFSetErrorOnUnsupportedOverlap(PetscSF sf,MPI_Datatype unit,const void *rootdata,const void *leafdata) 6019d1c8addSJunchao Zhang { 602cd620004SJunchao Zhang PetscSFLink link,*p; 6039d1c8addSJunchao Zhang PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 6049d1c8addSJunchao Zhang PetscBool match; 6059d1c8addSJunchao Zhang 6069d1c8addSJunchao Zhang PetscFunctionBegin; 607b458e8f1SJose E. Roman if (PetscDefined(USE_DEBUG)) { 60818fb5014SJunchao Zhang /* Look up links in use and error out if there is a match. When both rootdata and leafdata are NULL, ignore 60918fb5014SJunchao Zhang the potential overlapping since this process does not participate in communication. Overlapping is harmless. 61018fb5014SJunchao Zhang */ 611637e6665SJunchao Zhang if (rootdata || leafdata) { 6129d1c8addSJunchao Zhang for (p=&bas->inuse; (link=*p); p=&link->next) { 6139566063dSJacob Faibussowitsch PetscCall(MPIPetsc_Type_compare(unit,link->unit,&match)); 6142c71b3e2SJacob Faibussowitsch PetscCheckFalse(match && (rootdata == link->rootdata) && (leafdata == link->leafdata),PETSC_COMM_SELF,PETSC_ERR_SUP,"Overlapped PetscSF with the same rootdata(%p), leafdata(%p) and data type. Undo the overlapping to avoid the error.",rootdata,leafdata); 6159d1c8addSJunchao Zhang } 61618fb5014SJunchao Zhang } 617b458e8f1SJose E. Roman } 6189d1c8addSJunchao Zhang PetscFunctionReturn(0); 6199d1c8addSJunchao Zhang } 6209d1c8addSJunchao Zhang 62120c24465SJunchao Zhang static PetscErrorCode PetscSFLinkMemcpy_Host(PetscSFLink link,PetscMemType dstmtype,void* dst,PetscMemType srcmtype,const void*src,size_t n) 62220c24465SJunchao Zhang { 62320c24465SJunchao Zhang PetscFunctionBegin; 6249566063dSJacob Faibussowitsch if (n) PetscCall(PetscMemcpy(dst,src,n)); 62520c24465SJunchao Zhang PetscFunctionReturn(0); 62620c24465SJunchao Zhang } 62720c24465SJunchao Zhang 628cd620004SJunchao Zhang PetscErrorCode PetscSFLinkSetUp_Host(PetscSF sf,PetscSFLink link,MPI_Datatype unit) 62940e23c03SJunchao Zhang { 630b23bfdefSJunchao Zhang PetscInt nSignedChar=0,nUnsignedChar=0,nInt=0,nPetscInt=0,nPetscReal=0; 631b23bfdefSJunchao Zhang PetscBool is2Int,is2PetscInt; 63240e23c03SJunchao Zhang PetscMPIInt ni,na,nd,combiner; 63340e23c03SJunchao Zhang #if defined(PETSC_HAVE_COMPLEX) 634b23bfdefSJunchao Zhang PetscInt nPetscComplex=0; 63540e23c03SJunchao Zhang #endif 63640e23c03SJunchao Zhang 63740e23c03SJunchao Zhang PetscFunctionBegin; 6389566063dSJacob Faibussowitsch PetscCall(MPIPetsc_Type_compare_contig(unit,MPI_SIGNED_CHAR, &nSignedChar)); 6399566063dSJacob Faibussowitsch PetscCall(MPIPetsc_Type_compare_contig(unit,MPI_UNSIGNED_CHAR,&nUnsignedChar)); 640b23bfdefSJunchao Zhang /* MPI_CHAR is treated below as a dumb type that does not support reduction according to MPI standard */ 6419566063dSJacob Faibussowitsch PetscCall(MPIPetsc_Type_compare_contig(unit,MPI_INT, &nInt)); 6429566063dSJacob Faibussowitsch PetscCall(MPIPetsc_Type_compare_contig(unit,MPIU_INT, &nPetscInt)); 6439566063dSJacob Faibussowitsch PetscCall(MPIPetsc_Type_compare_contig(unit,MPIU_REAL,&nPetscReal)); 64440e23c03SJunchao Zhang #if defined(PETSC_HAVE_COMPLEX) 6459566063dSJacob Faibussowitsch PetscCall(MPIPetsc_Type_compare_contig(unit,MPIU_COMPLEX,&nPetscComplex)); 64640e23c03SJunchao Zhang #endif 6479566063dSJacob Faibussowitsch PetscCall(MPIPetsc_Type_compare(unit,MPI_2INT,&is2Int)); 6489566063dSJacob Faibussowitsch PetscCall(MPIPetsc_Type_compare(unit,MPIU_2INT,&is2PetscInt)); 649b23bfdefSJunchao Zhang /* TODO: shaell we also handle Fortran MPI_2REAL? */ 6509566063dSJacob Faibussowitsch PetscCallMPI(MPI_Type_get_envelope(unit,&ni,&na,&nd,&combiner)); 6515ad15460SJunchao Zhang link->isbuiltin = (combiner == MPI_COMBINER_NAMED) ? PETSC_TRUE : PETSC_FALSE; /* unit is MPI builtin */ 652b23bfdefSJunchao Zhang link->bs = 1; /* default */ 65340e23c03SJunchao Zhang 654eb02082bSJunchao Zhang if (is2Int) { 655cd620004SJunchao Zhang PackInit_PairType_int_int_1_1(link); 656eb02082bSJunchao Zhang link->bs = 1; 657eb02082bSJunchao Zhang link->unitbytes = 2*sizeof(int); 6585ad15460SJunchao Zhang link->isbuiltin = PETSC_TRUE; /* unit is PETSc builtin */ 659eb02082bSJunchao Zhang link->basicunit = MPI_2INT; 6605ad15460SJunchao Zhang link->unit = MPI_2INT; 661eb02082bSJunchao Zhang } else if (is2PetscInt) { /* TODO: when is2PetscInt and nPetscInt=2, we don't know which path to take. The two paths support different ops. */ 662cd620004SJunchao Zhang PackInit_PairType_PetscInt_PetscInt_1_1(link); 663eb02082bSJunchao Zhang link->bs = 1; 664eb02082bSJunchao Zhang link->unitbytes = 2*sizeof(PetscInt); 665eb02082bSJunchao Zhang link->basicunit = MPIU_2INT; 6665ad15460SJunchao Zhang link->isbuiltin = PETSC_TRUE; /* unit is PETSc builtin */ 6675ad15460SJunchao Zhang link->unit = MPIU_2INT; 668eb02082bSJunchao Zhang } else if (nPetscReal) { 669b23bfdefSJunchao Zhang if (nPetscReal == 8) PackInit_RealType_PetscReal_8_1(link); else if (nPetscReal%8 == 0) PackInit_RealType_PetscReal_8_0(link); 670b23bfdefSJunchao Zhang else if (nPetscReal == 4) PackInit_RealType_PetscReal_4_1(link); else if (nPetscReal%4 == 0) PackInit_RealType_PetscReal_4_0(link); 671b23bfdefSJunchao Zhang else if (nPetscReal == 2) PackInit_RealType_PetscReal_2_1(link); else if (nPetscReal%2 == 0) PackInit_RealType_PetscReal_2_0(link); 672b23bfdefSJunchao Zhang else if (nPetscReal == 1) PackInit_RealType_PetscReal_1_1(link); else if (nPetscReal%1 == 0) PackInit_RealType_PetscReal_1_0(link); 673b23bfdefSJunchao Zhang link->bs = nPetscReal; 674eb02082bSJunchao Zhang link->unitbytes = nPetscReal*sizeof(PetscReal); 67540e23c03SJunchao Zhang link->basicunit = MPIU_REAL; 6765ad15460SJunchao Zhang if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPIU_REAL;} 677b23bfdefSJunchao Zhang } else if (nPetscInt) { 678b23bfdefSJunchao Zhang if (nPetscInt == 8) PackInit_IntegerType_PetscInt_8_1(link); else if (nPetscInt%8 == 0) PackInit_IntegerType_PetscInt_8_0(link); 679b23bfdefSJunchao Zhang else if (nPetscInt == 4) PackInit_IntegerType_PetscInt_4_1(link); else if (nPetscInt%4 == 0) PackInit_IntegerType_PetscInt_4_0(link); 680b23bfdefSJunchao Zhang else if (nPetscInt == 2) PackInit_IntegerType_PetscInt_2_1(link); else if (nPetscInt%2 == 0) PackInit_IntegerType_PetscInt_2_0(link); 681b23bfdefSJunchao Zhang else if (nPetscInt == 1) PackInit_IntegerType_PetscInt_1_1(link); else if (nPetscInt%1 == 0) PackInit_IntegerType_PetscInt_1_0(link); 682b23bfdefSJunchao Zhang link->bs = nPetscInt; 683eb02082bSJunchao Zhang link->unitbytes = nPetscInt*sizeof(PetscInt); 684b23bfdefSJunchao Zhang link->basicunit = MPIU_INT; 6855ad15460SJunchao Zhang if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPIU_INT;} 686b23bfdefSJunchao Zhang #if defined(PETSC_USE_64BIT_INDICES) 687b23bfdefSJunchao Zhang } else if (nInt) { 688b23bfdefSJunchao Zhang if (nInt == 8) PackInit_IntegerType_int_8_1(link); else if (nInt%8 == 0) PackInit_IntegerType_int_8_0(link); 689b23bfdefSJunchao Zhang else if (nInt == 4) PackInit_IntegerType_int_4_1(link); else if (nInt%4 == 0) PackInit_IntegerType_int_4_0(link); 690b23bfdefSJunchao Zhang else if (nInt == 2) PackInit_IntegerType_int_2_1(link); else if (nInt%2 == 0) PackInit_IntegerType_int_2_0(link); 691b23bfdefSJunchao Zhang else if (nInt == 1) PackInit_IntegerType_int_1_1(link); else if (nInt%1 == 0) PackInit_IntegerType_int_1_0(link); 692b23bfdefSJunchao Zhang link->bs = nInt; 693eb02082bSJunchao Zhang link->unitbytes = nInt*sizeof(int); 694b23bfdefSJunchao Zhang link->basicunit = MPI_INT; 6955ad15460SJunchao Zhang if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPI_INT;} 696b23bfdefSJunchao Zhang #endif 697b23bfdefSJunchao Zhang } else if (nSignedChar) { 698b23bfdefSJunchao Zhang if (nSignedChar == 8) PackInit_IntegerType_SignedChar_8_1(link); else if (nSignedChar%8 == 0) PackInit_IntegerType_SignedChar_8_0(link); 699b23bfdefSJunchao Zhang else if (nSignedChar == 4) PackInit_IntegerType_SignedChar_4_1(link); else if (nSignedChar%4 == 0) PackInit_IntegerType_SignedChar_4_0(link); 700b23bfdefSJunchao Zhang else if (nSignedChar == 2) PackInit_IntegerType_SignedChar_2_1(link); else if (nSignedChar%2 == 0) PackInit_IntegerType_SignedChar_2_0(link); 701b23bfdefSJunchao Zhang else if (nSignedChar == 1) PackInit_IntegerType_SignedChar_1_1(link); else if (nSignedChar%1 == 0) PackInit_IntegerType_SignedChar_1_0(link); 702b23bfdefSJunchao Zhang link->bs = nSignedChar; 703eb02082bSJunchao Zhang link->unitbytes = nSignedChar*sizeof(SignedChar); 704b23bfdefSJunchao Zhang link->basicunit = MPI_SIGNED_CHAR; 7055ad15460SJunchao Zhang if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPI_SIGNED_CHAR;} 706b23bfdefSJunchao Zhang } else if (nUnsignedChar) { 707b23bfdefSJunchao Zhang if (nUnsignedChar == 8) PackInit_IntegerType_UnsignedChar_8_1(link); else if (nUnsignedChar%8 == 0) PackInit_IntegerType_UnsignedChar_8_0(link); 708b23bfdefSJunchao Zhang else if (nUnsignedChar == 4) PackInit_IntegerType_UnsignedChar_4_1(link); else if (nUnsignedChar%4 == 0) PackInit_IntegerType_UnsignedChar_4_0(link); 709b23bfdefSJunchao Zhang else if (nUnsignedChar == 2) PackInit_IntegerType_UnsignedChar_2_1(link); else if (nUnsignedChar%2 == 0) PackInit_IntegerType_UnsignedChar_2_0(link); 710b23bfdefSJunchao Zhang else if (nUnsignedChar == 1) PackInit_IntegerType_UnsignedChar_1_1(link); else if (nUnsignedChar%1 == 0) PackInit_IntegerType_UnsignedChar_1_0(link); 711b23bfdefSJunchao Zhang link->bs = nUnsignedChar; 712eb02082bSJunchao Zhang link->unitbytes = nUnsignedChar*sizeof(UnsignedChar); 713b23bfdefSJunchao Zhang link->basicunit = MPI_UNSIGNED_CHAR; 7145ad15460SJunchao Zhang if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPI_UNSIGNED_CHAR;} 71540e23c03SJunchao Zhang #if defined(PETSC_HAVE_COMPLEX) 716b23bfdefSJunchao Zhang } else if (nPetscComplex) { 717b23bfdefSJunchao Zhang if (nPetscComplex == 8) PackInit_ComplexType_PetscComplex_8_1(link); else if (nPetscComplex%8 == 0) PackInit_ComplexType_PetscComplex_8_0(link); 718b23bfdefSJunchao Zhang else if (nPetscComplex == 4) PackInit_ComplexType_PetscComplex_4_1(link); else if (nPetscComplex%4 == 0) PackInit_ComplexType_PetscComplex_4_0(link); 719b23bfdefSJunchao Zhang else if (nPetscComplex == 2) PackInit_ComplexType_PetscComplex_2_1(link); else if (nPetscComplex%2 == 0) PackInit_ComplexType_PetscComplex_2_0(link); 720b23bfdefSJunchao Zhang else if (nPetscComplex == 1) PackInit_ComplexType_PetscComplex_1_1(link); else if (nPetscComplex%1 == 0) PackInit_ComplexType_PetscComplex_1_0(link); 721b23bfdefSJunchao Zhang link->bs = nPetscComplex; 722eb02082bSJunchao Zhang link->unitbytes = nPetscComplex*sizeof(PetscComplex); 72340e23c03SJunchao Zhang link->basicunit = MPIU_COMPLEX; 7245ad15460SJunchao Zhang if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPIU_COMPLEX;} 72540e23c03SJunchao Zhang #endif 72640e23c03SJunchao Zhang } else { 727b23bfdefSJunchao Zhang MPI_Aint lb,nbyte; 7289566063dSJacob Faibussowitsch PetscCallMPI(MPI_Type_get_extent(unit,&lb,&nbyte)); 729*08401ef6SPierre Jolivet PetscCheck(lb == 0,PETSC_COMM_SELF,PETSC_ERR_SUP,"Datatype with nonzero lower bound %ld",(long)lb); 730eb02082bSJunchao Zhang if (nbyte % sizeof(int)) { /* If the type size is not multiple of int */ 731eb02082bSJunchao Zhang if (nbyte == 4) PackInit_DumbType_char_4_1(link); else if (nbyte%4 == 0) PackInit_DumbType_char_4_0(link); 732eb02082bSJunchao Zhang else if (nbyte == 2) PackInit_DumbType_char_2_1(link); else if (nbyte%2 == 0) PackInit_DumbType_char_2_0(link); 733eb02082bSJunchao Zhang else if (nbyte == 1) PackInit_DumbType_char_1_1(link); else if (nbyte%1 == 0) PackInit_DumbType_char_1_0(link); 734eb02082bSJunchao Zhang link->bs = nbyte; 735b23bfdefSJunchao Zhang link->unitbytes = nbyte; 736b23bfdefSJunchao Zhang link->basicunit = MPI_BYTE; 73740e23c03SJunchao Zhang } else { 738eb02082bSJunchao Zhang nInt = nbyte / sizeof(int); 739eb02082bSJunchao Zhang if (nInt == 8) PackInit_DumbType_DumbInt_8_1(link); else if (nInt%8 == 0) PackInit_DumbType_DumbInt_8_0(link); 740eb02082bSJunchao Zhang else if (nInt == 4) PackInit_DumbType_DumbInt_4_1(link); else if (nInt%4 == 0) PackInit_DumbType_DumbInt_4_0(link); 741eb02082bSJunchao Zhang else if (nInt == 2) PackInit_DumbType_DumbInt_2_1(link); else if (nInt%2 == 0) PackInit_DumbType_DumbInt_2_0(link); 742eb02082bSJunchao Zhang else if (nInt == 1) PackInit_DumbType_DumbInt_1_1(link); else if (nInt%1 == 0) PackInit_DumbType_DumbInt_1_0(link); 743eb02082bSJunchao Zhang link->bs = nInt; 744b23bfdefSJunchao Zhang link->unitbytes = nbyte; 74540e23c03SJunchao Zhang link->basicunit = MPI_INT; 74640e23c03SJunchao Zhang } 7475ad15460SJunchao Zhang if (link->isbuiltin) link->unit = unit; 74840e23c03SJunchao Zhang } 749b23bfdefSJunchao Zhang 7509566063dSJacob Faibussowitsch if (!link->isbuiltin) PetscCallMPI(MPI_Type_dup(unit,&link->unit)); 75120c24465SJunchao Zhang 75220c24465SJunchao Zhang link->Memcpy = PetscSFLinkMemcpy_Host; 75340e23c03SJunchao Zhang PetscFunctionReturn(0); 75440e23c03SJunchao Zhang } 75540e23c03SJunchao Zhang 756fcc7397dSJunchao Zhang PetscErrorCode PetscSFLinkGetUnpackAndOp(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*)) 75740e23c03SJunchao Zhang { 75840e23c03SJunchao Zhang PetscFunctionBegin; 75940e23c03SJunchao Zhang *UnpackAndOp = NULL; 76071438e86SJunchao Zhang if (PetscMemTypeHost(mtype)) { 76183df288dSJunchao Zhang if (op == MPI_REPLACE) *UnpackAndOp = link->h_UnpackAndInsert; 762eb02082bSJunchao Zhang else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->h_UnpackAndAdd; 763eb02082bSJunchao Zhang else if (op == MPI_PROD) *UnpackAndOp = link->h_UnpackAndMult; 764eb02082bSJunchao Zhang else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->h_UnpackAndMax; 765eb02082bSJunchao Zhang else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->h_UnpackAndMin; 766eb02082bSJunchao Zhang else if (op == MPI_LAND) *UnpackAndOp = link->h_UnpackAndLAND; 767eb02082bSJunchao Zhang else if (op == MPI_BAND) *UnpackAndOp = link->h_UnpackAndBAND; 768eb02082bSJunchao Zhang else if (op == MPI_LOR) *UnpackAndOp = link->h_UnpackAndLOR; 769eb02082bSJunchao Zhang else if (op == MPI_BOR) *UnpackAndOp = link->h_UnpackAndBOR; 770eb02082bSJunchao Zhang else if (op == MPI_LXOR) *UnpackAndOp = link->h_UnpackAndLXOR; 771eb02082bSJunchao Zhang else if (op == MPI_BXOR) *UnpackAndOp = link->h_UnpackAndBXOR; 772eb02082bSJunchao Zhang else if (op == MPI_MAXLOC) *UnpackAndOp = link->h_UnpackAndMaxloc; 773eb02082bSJunchao Zhang else if (op == MPI_MINLOC) *UnpackAndOp = link->h_UnpackAndMinloc; 774eb02082bSJunchao Zhang } 7757fd2d3dbSJunchao Zhang #if defined(PETSC_HAVE_DEVICE) 77671438e86SJunchao Zhang else if (PetscMemTypeDevice(mtype) && !atomic) { 77783df288dSJunchao Zhang if (op == MPI_REPLACE) *UnpackAndOp = link->d_UnpackAndInsert; 778eb02082bSJunchao Zhang else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->d_UnpackAndAdd; 779eb02082bSJunchao Zhang else if (op == MPI_PROD) *UnpackAndOp = link->d_UnpackAndMult; 780eb02082bSJunchao Zhang else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->d_UnpackAndMax; 781eb02082bSJunchao Zhang else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->d_UnpackAndMin; 782eb02082bSJunchao Zhang else if (op == MPI_LAND) *UnpackAndOp = link->d_UnpackAndLAND; 783eb02082bSJunchao Zhang else if (op == MPI_BAND) *UnpackAndOp = link->d_UnpackAndBAND; 784eb02082bSJunchao Zhang else if (op == MPI_LOR) *UnpackAndOp = link->d_UnpackAndLOR; 785eb02082bSJunchao Zhang else if (op == MPI_BOR) *UnpackAndOp = link->d_UnpackAndBOR; 786eb02082bSJunchao Zhang else if (op == MPI_LXOR) *UnpackAndOp = link->d_UnpackAndLXOR; 787eb02082bSJunchao Zhang else if (op == MPI_BXOR) *UnpackAndOp = link->d_UnpackAndBXOR; 788eb02082bSJunchao Zhang else if (op == MPI_MAXLOC) *UnpackAndOp = link->d_UnpackAndMaxloc; 789eb02082bSJunchao Zhang else if (op == MPI_MINLOC) *UnpackAndOp = link->d_UnpackAndMinloc; 79071438e86SJunchao Zhang } else if (PetscMemTypeDevice(mtype) && atomic) { 79183df288dSJunchao Zhang if (op == MPI_REPLACE) *UnpackAndOp = link->da_UnpackAndInsert; 792eb02082bSJunchao Zhang else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->da_UnpackAndAdd; 793eb02082bSJunchao Zhang else if (op == MPI_PROD) *UnpackAndOp = link->da_UnpackAndMult; 794eb02082bSJunchao Zhang else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->da_UnpackAndMax; 795eb02082bSJunchao Zhang else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->da_UnpackAndMin; 796eb02082bSJunchao Zhang else if (op == MPI_LAND) *UnpackAndOp = link->da_UnpackAndLAND; 797eb02082bSJunchao Zhang else if (op == MPI_BAND) *UnpackAndOp = link->da_UnpackAndBAND; 798eb02082bSJunchao Zhang else if (op == MPI_LOR) *UnpackAndOp = link->da_UnpackAndLOR; 799eb02082bSJunchao Zhang else if (op == MPI_BOR) *UnpackAndOp = link->da_UnpackAndBOR; 800eb02082bSJunchao Zhang else if (op == MPI_LXOR) *UnpackAndOp = link->da_UnpackAndLXOR; 801eb02082bSJunchao Zhang else if (op == MPI_BXOR) *UnpackAndOp = link->da_UnpackAndBXOR; 802eb02082bSJunchao Zhang else if (op == MPI_MAXLOC) *UnpackAndOp = link->da_UnpackAndMaxloc; 803eb02082bSJunchao Zhang else if (op == MPI_MINLOC) *UnpackAndOp = link->da_UnpackAndMinloc; 804eb02082bSJunchao Zhang } 805eb02082bSJunchao Zhang #endif 80640e23c03SJunchao Zhang PetscFunctionReturn(0); 80740e23c03SJunchao Zhang } 80840e23c03SJunchao Zhang 809fcc7397dSJunchao Zhang PetscErrorCode PetscSFLinkGetScatterAndOp(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**ScatterAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*)) 81040e23c03SJunchao Zhang { 81140e23c03SJunchao Zhang PetscFunctionBegin; 812cd620004SJunchao Zhang *ScatterAndOp = NULL; 81371438e86SJunchao Zhang if (PetscMemTypeHost(mtype)) { 81483df288dSJunchao Zhang if (op == MPI_REPLACE) *ScatterAndOp = link->h_ScatterAndInsert; 815cd620004SJunchao Zhang else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->h_ScatterAndAdd; 816cd620004SJunchao Zhang else if (op == MPI_PROD) *ScatterAndOp = link->h_ScatterAndMult; 817cd620004SJunchao Zhang else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->h_ScatterAndMax; 818cd620004SJunchao Zhang else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->h_ScatterAndMin; 819cd620004SJunchao Zhang else if (op == MPI_LAND) *ScatterAndOp = link->h_ScatterAndLAND; 820cd620004SJunchao Zhang else if (op == MPI_BAND) *ScatterAndOp = link->h_ScatterAndBAND; 821cd620004SJunchao Zhang else if (op == MPI_LOR) *ScatterAndOp = link->h_ScatterAndLOR; 822cd620004SJunchao Zhang else if (op == MPI_BOR) *ScatterAndOp = link->h_ScatterAndBOR; 823cd620004SJunchao Zhang else if (op == MPI_LXOR) *ScatterAndOp = link->h_ScatterAndLXOR; 824cd620004SJunchao Zhang else if (op == MPI_BXOR) *ScatterAndOp = link->h_ScatterAndBXOR; 825cd620004SJunchao Zhang else if (op == MPI_MAXLOC) *ScatterAndOp = link->h_ScatterAndMaxloc; 826cd620004SJunchao Zhang else if (op == MPI_MINLOC) *ScatterAndOp = link->h_ScatterAndMinloc; 827eb02082bSJunchao Zhang } 8287fd2d3dbSJunchao Zhang #if defined(PETSC_HAVE_DEVICE) 82971438e86SJunchao Zhang else if (PetscMemTypeDevice(mtype) && !atomic) { 83083df288dSJunchao Zhang if (op == MPI_REPLACE) *ScatterAndOp = link->d_ScatterAndInsert; 831cd620004SJunchao Zhang else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->d_ScatterAndAdd; 832cd620004SJunchao Zhang else if (op == MPI_PROD) *ScatterAndOp = link->d_ScatterAndMult; 833cd620004SJunchao Zhang else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->d_ScatterAndMax; 834cd620004SJunchao Zhang else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->d_ScatterAndMin; 835cd620004SJunchao Zhang else if (op == MPI_LAND) *ScatterAndOp = link->d_ScatterAndLAND; 836cd620004SJunchao Zhang else if (op == MPI_BAND) *ScatterAndOp = link->d_ScatterAndBAND; 837cd620004SJunchao Zhang else if (op == MPI_LOR) *ScatterAndOp = link->d_ScatterAndLOR; 838cd620004SJunchao Zhang else if (op == MPI_BOR) *ScatterAndOp = link->d_ScatterAndBOR; 839cd620004SJunchao Zhang else if (op == MPI_LXOR) *ScatterAndOp = link->d_ScatterAndLXOR; 840cd620004SJunchao Zhang else if (op == MPI_BXOR) *ScatterAndOp = link->d_ScatterAndBXOR; 841cd620004SJunchao Zhang else if (op == MPI_MAXLOC) *ScatterAndOp = link->d_ScatterAndMaxloc; 842cd620004SJunchao Zhang else if (op == MPI_MINLOC) *ScatterAndOp = link->d_ScatterAndMinloc; 84371438e86SJunchao Zhang } else if (PetscMemTypeDevice(mtype) && atomic) { 84483df288dSJunchao Zhang if (op == MPI_REPLACE) *ScatterAndOp = link->da_ScatterAndInsert; 845cd620004SJunchao Zhang else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->da_ScatterAndAdd; 846cd620004SJunchao Zhang else if (op == MPI_PROD) *ScatterAndOp = link->da_ScatterAndMult; 847cd620004SJunchao Zhang else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->da_ScatterAndMax; 848cd620004SJunchao Zhang else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->da_ScatterAndMin; 849cd620004SJunchao Zhang else if (op == MPI_LAND) *ScatterAndOp = link->da_ScatterAndLAND; 850cd620004SJunchao Zhang else if (op == MPI_BAND) *ScatterAndOp = link->da_ScatterAndBAND; 851cd620004SJunchao Zhang else if (op == MPI_LOR) *ScatterAndOp = link->da_ScatterAndLOR; 852cd620004SJunchao Zhang else if (op == MPI_BOR) *ScatterAndOp = link->da_ScatterAndBOR; 853cd620004SJunchao Zhang else if (op == MPI_LXOR) *ScatterAndOp = link->da_ScatterAndLXOR; 854cd620004SJunchao Zhang else if (op == MPI_BXOR) *ScatterAndOp = link->da_ScatterAndBXOR; 855cd620004SJunchao Zhang else if (op == MPI_MAXLOC) *ScatterAndOp = link->da_ScatterAndMaxloc; 856cd620004SJunchao Zhang else if (op == MPI_MINLOC) *ScatterAndOp = link->da_ScatterAndMinloc; 857eb02082bSJunchao Zhang } 858eb02082bSJunchao Zhang #endif 859cd620004SJunchao Zhang PetscFunctionReturn(0); 860cd620004SJunchao Zhang } 861cd620004SJunchao Zhang 862fcc7397dSJunchao Zhang PetscErrorCode PetscSFLinkGetFetchAndOp(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**FetchAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,void*)) 863cd620004SJunchao Zhang { 864cd620004SJunchao Zhang PetscFunctionBegin; 865cd620004SJunchao Zhang *FetchAndOp = NULL; 8662c71b3e2SJacob Faibussowitsch PetscCheckFalse(op != MPI_SUM && op != MPIU_SUM,PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for MPI_Op in FetchAndOp"); 86771438e86SJunchao Zhang if (PetscMemTypeHost(mtype)) *FetchAndOp = link->h_FetchAndAdd; 8687fd2d3dbSJunchao Zhang #if defined(PETSC_HAVE_DEVICE) 86971438e86SJunchao Zhang else if (PetscMemTypeDevice(mtype) && !atomic) *FetchAndOp = link->d_FetchAndAdd; 87071438e86SJunchao Zhang else if (PetscMemTypeDevice(mtype) && atomic) *FetchAndOp = link->da_FetchAndAdd; 871cd620004SJunchao Zhang #endif 872cd620004SJunchao Zhang PetscFunctionReturn(0); 873cd620004SJunchao Zhang } 874cd620004SJunchao Zhang 875fcc7397dSJunchao Zhang PetscErrorCode PetscSFLinkGetFetchAndOpLocal(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**FetchAndOpLocal)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*)) 876cd620004SJunchao Zhang { 877cd620004SJunchao Zhang PetscFunctionBegin; 878cd620004SJunchao Zhang *FetchAndOpLocal = NULL; 8792c71b3e2SJacob Faibussowitsch PetscCheckFalse(op != MPI_SUM && op != MPIU_SUM,PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for MPI_Op in FetchAndOp"); 88071438e86SJunchao Zhang if (PetscMemTypeHost(mtype)) *FetchAndOpLocal = link->h_FetchAndAddLocal; 8817fd2d3dbSJunchao Zhang #if defined(PETSC_HAVE_DEVICE) 88271438e86SJunchao Zhang else if (PetscMemTypeDevice(mtype) && !atomic) *FetchAndOpLocal = link->d_FetchAndAddLocal; 88371438e86SJunchao Zhang else if (PetscMemTypeDevice(mtype) && atomic) *FetchAndOpLocal = link->da_FetchAndAddLocal; 884cd620004SJunchao Zhang #endif 885cd620004SJunchao Zhang PetscFunctionReturn(0); 886cd620004SJunchao Zhang } 887cd620004SJunchao Zhang 8889fbee547SJacob Faibussowitsch static inline PetscErrorCode PetscSFLinkLogFlopsAfterUnpackRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,MPI_Op op) 889cd620004SJunchao Zhang { 890cd620004SJunchao Zhang PetscLogDouble flops; 891cd620004SJunchao Zhang PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 892cd620004SJunchao Zhang 893cd620004SJunchao Zhang PetscFunctionBegin; 89483df288dSJunchao Zhang if (op != MPI_REPLACE && link->basicunit == MPIU_SCALAR) { /* op is a reduction on PetscScalars */ 895cd620004SJunchao Zhang flops = bas->rootbuflen[scope]*link->bs; /* # of roots in buffer x # of scalars in unit */ 8967fd2d3dbSJunchao Zhang #if defined(PETSC_HAVE_DEVICE) 8979566063dSJacob Faibussowitsch if (PetscMemTypeDevice(link->rootmtype)) PetscCall(PetscLogGpuFlops(flops)); else 898cd620004SJunchao Zhang #endif 8999566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(flops)); 900cd620004SJunchao Zhang } 901cd620004SJunchao Zhang PetscFunctionReturn(0); 902cd620004SJunchao Zhang } 903cd620004SJunchao Zhang 9049fbee547SJacob Faibussowitsch static inline PetscErrorCode PetscSFLinkLogFlopsAfterUnpackLeafData(PetscSF sf,PetscSFLink link,PetscSFScope scope,MPI_Op op) 905cd620004SJunchao Zhang { 906cd620004SJunchao Zhang PetscLogDouble flops; 907cd620004SJunchao Zhang 908cd620004SJunchao Zhang PetscFunctionBegin; 90983df288dSJunchao Zhang if (op != MPI_REPLACE && link->basicunit == MPIU_SCALAR) { /* op is a reduction on PetscScalars */ 910cd620004SJunchao Zhang flops = sf->leafbuflen[scope]*link->bs; /* # of roots in buffer x # of scalars in unit */ 9117fd2d3dbSJunchao Zhang #if defined(PETSC_HAVE_DEVICE) 9129566063dSJacob Faibussowitsch if (PetscMemTypeDevice(link->leafmtype)) PetscCall(PetscLogGpuFlops(flops)); else 913cd620004SJunchao Zhang #endif 9149566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(flops)); 915cd620004SJunchao Zhang } 916cd620004SJunchao Zhang PetscFunctionReturn(0); 917cd620004SJunchao Zhang } 918cd620004SJunchao Zhang 919cd620004SJunchao Zhang /* When SF could not find a proper UnpackAndOp() from link, it falls back to MPI_Reduce_local. 9204165533cSJose E. Roman Input Parameters: 921cd620004SJunchao Zhang +sf - The StarForest 922cd620004SJunchao Zhang .link - The link 923cd620004SJunchao Zhang .count - Number of entries to unpack 924cd620004SJunchao Zhang .start - The first index, significent when indices=NULL 925cd620004SJunchao Zhang .indices - Indices of entries in <data>. If NULL, it means indices are contiguous and the first is given in <start> 926cd620004SJunchao Zhang .buf - A contiguous buffer to unpack from 927cd620004SJunchao Zhang -op - Operation after unpack 928cd620004SJunchao Zhang 9294165533cSJose E. Roman Output Parameters: 930cd620004SJunchao Zhang .data - The data to unpack to 931cd620004SJunchao Zhang */ 9329fbee547SJacob Faibussowitsch static inline PetscErrorCode PetscSFLinkUnpackDataWithMPIReduceLocal(PetscSF sf,PetscSFLink link,PetscInt count,PetscInt start,const PetscInt *indices,void *data,const void *buf,MPI_Op op) 933cd620004SJunchao Zhang { 934cd620004SJunchao Zhang PetscFunctionBegin; 935cd620004SJunchao Zhang #if defined(PETSC_HAVE_MPI_REDUCE_LOCAL) 936cd620004SJunchao Zhang { 937cd620004SJunchao Zhang PetscInt i; 938cd620004SJunchao Zhang if (indices) { 939cd620004SJunchao Zhang /* Note we use link->unit instead of link->basicunit. When op can be mapped to MPI_SUM etc, it operates on 940cd620004SJunchao Zhang basic units of a root/leaf element-wisely. Otherwise, it is meant to operate on a whole root/leaf. 941cd620004SJunchao Zhang */ 9429566063dSJacob Faibussowitsch for (i=0; i<count; i++) PetscCallMPI(MPI_Reduce_local((const char*)buf+i*link->unitbytes,(char*)data+indices[i]*link->unitbytes,1,link->unit,op)); 943cd620004SJunchao Zhang } else { 9449566063dSJacob Faibussowitsch PetscCallMPI(MPIU_Reduce_local(buf,(char*)data+start*link->unitbytes,count,link->unit,op)); 945cd620004SJunchao Zhang } 946cd620004SJunchao Zhang } 947b458e8f1SJose E. Roman PetscFunctionReturn(0); 948cd620004SJunchao Zhang #else 949cd620004SJunchao Zhang SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No unpacking reduction operation for this MPI_Op"); 950cd620004SJunchao Zhang #endif 951cd620004SJunchao Zhang } 952cd620004SJunchao Zhang 9539fbee547SJacob Faibussowitsch static inline PetscErrorCode PetscSFLinkScatterDataWithMPIReduceLocal(PetscSF sf,PetscSFLink link,PetscInt count,PetscInt srcStart,const PetscInt *srcIdx,const void *src,PetscInt dstStart,const PetscInt *dstIdx,void *dst,MPI_Op op) 954cd620004SJunchao Zhang { 955cd620004SJunchao Zhang PetscFunctionBegin; 956cd620004SJunchao Zhang #if defined(PETSC_HAVE_MPI_REDUCE_LOCAL) 957cd620004SJunchao Zhang { 958cd620004SJunchao Zhang PetscInt i,disp; 959fcc7397dSJunchao Zhang if (!srcIdx) { 9609566063dSJacob Faibussowitsch PetscCall(PetscSFLinkUnpackDataWithMPIReduceLocal(sf,link,count,dstStart,dstIdx,dst,(const char*)src+srcStart*link->unitbytes,op)); 961fcc7397dSJunchao Zhang } else { 962cd620004SJunchao Zhang for (i=0; i<count; i++) { 963fcc7397dSJunchao Zhang disp = dstIdx? dstIdx[i] : dstStart + i; 9649566063dSJacob Faibussowitsch PetscCallMPI(MPIU_Reduce_local((const char*)src+srcIdx[i]*link->unitbytes,(char*)dst+disp*link->unitbytes,1,link->unit,op)); 965fcc7397dSJunchao Zhang } 966cd620004SJunchao Zhang } 967cd620004SJunchao Zhang } 968b458e8f1SJose E. Roman PetscFunctionReturn(0); 969cd620004SJunchao Zhang #else 970cd620004SJunchao Zhang SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No unpacking reduction operation for this MPI_Op"); 971cd620004SJunchao Zhang #endif 972cd620004SJunchao Zhang } 973cd620004SJunchao Zhang 974cd620004SJunchao Zhang /*============================================================================= 975cd620004SJunchao Zhang Pack/Unpack/Fetch/Scatter routines 976cd620004SJunchao Zhang ============================================================================*/ 977cd620004SJunchao Zhang 978cd620004SJunchao Zhang /* Pack rootdata to rootbuf 9794165533cSJose E. Roman Input Parameters: 980cd620004SJunchao Zhang + sf - The SF this packing works on. 981cd620004SJunchao Zhang . link - It gives the memtype of the roots and also provides root buffer. 982cd620004SJunchao Zhang . scope - PETSCSF_LOCAL or PETSCSF_REMOTE. Note SF has the ability to do local and remote communications separately. 983cd620004SJunchao Zhang - rootdata - Where to read the roots. 984cd620004SJunchao Zhang 985cd620004SJunchao Zhang Notes: 986cd620004SJunchao Zhang When rootdata can be directly used as root buffer, the routine is almost a no-op. After the call, root data is 98771438e86SJunchao Zhang in a place where the underlying MPI is ready to access (use_gpu_aware_mpi or not) 988cd620004SJunchao Zhang */ 98971438e86SJunchao Zhang PetscErrorCode PetscSFLinkPackRootData_Private(PetscSF sf,PetscSFLink link,PetscSFScope scope,const void *rootdata) 990cd620004SJunchao Zhang { 991cd620004SJunchao Zhang const PetscInt *rootindices = NULL; 992cd620004SJunchao Zhang PetscInt count,start; 993fcc7397dSJunchao Zhang PetscErrorCode (*Pack)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*) = NULL; 994cd620004SJunchao Zhang PetscMemType rootmtype = link->rootmtype; 995fcc7397dSJunchao Zhang PetscSFPackOpt opt = NULL; 996fcc7397dSJunchao Zhang 997cd620004SJunchao Zhang PetscFunctionBegin; 9989566063dSJacob Faibussowitsch PetscCall(PetscLogEventBegin(PETSCSF_Pack,sf,0,0,0)); 99971438e86SJunchao Zhang if (!link->rootdirect[scope]) { /* If rootdata works directly as rootbuf, skip packing */ 10009566063dSJacob Faibussowitsch PetscCall(PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,scope,&count,&start,&opt,&rootindices)); 10019566063dSJacob Faibussowitsch PetscCall(PetscSFLinkGetPack(link,rootmtype,&Pack)); 10029566063dSJacob Faibussowitsch PetscCall((*Pack)(link,count,start,opt,rootindices,rootdata,link->rootbuf[scope][rootmtype])); 1003cd620004SJunchao Zhang } 10049566063dSJacob Faibussowitsch PetscCall(PetscLogEventEnd(PETSCSF_Pack,sf,0,0,0)); 1005cd620004SJunchao Zhang PetscFunctionReturn(0); 1006cd620004SJunchao Zhang } 1007cd620004SJunchao Zhang 1008cd620004SJunchao Zhang /* Pack leafdata to leafbuf */ 100971438e86SJunchao Zhang PetscErrorCode PetscSFLinkPackLeafData_Private(PetscSF sf,PetscSFLink link,PetscSFScope scope,const void *leafdata) 1010cd620004SJunchao Zhang { 1011cd620004SJunchao Zhang const PetscInt *leafindices = NULL; 1012cd620004SJunchao Zhang PetscInt count,start; 1013fcc7397dSJunchao Zhang PetscErrorCode (*Pack)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*) = NULL; 1014cd620004SJunchao Zhang PetscMemType leafmtype = link->leafmtype; 1015fcc7397dSJunchao Zhang PetscSFPackOpt opt = NULL; 1016cd620004SJunchao Zhang 1017cd620004SJunchao Zhang PetscFunctionBegin; 10189566063dSJacob Faibussowitsch PetscCall(PetscLogEventBegin(PETSCSF_Pack,sf,0,0,0)); 101971438e86SJunchao Zhang if (!link->leafdirect[scope]) { /* If leafdata works directly as rootbuf, skip packing */ 10209566063dSJacob Faibussowitsch PetscCall(PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,scope,&count,&start,&opt,&leafindices)); 10219566063dSJacob Faibussowitsch PetscCall(PetscSFLinkGetPack(link,leafmtype,&Pack)); 10229566063dSJacob Faibussowitsch PetscCall((*Pack)(link,count,start,opt,leafindices,leafdata,link->leafbuf[scope][leafmtype])); 1023cd620004SJunchao Zhang } 10249566063dSJacob Faibussowitsch PetscCall(PetscLogEventEnd(PETSCSF_Pack,sf,0,0,0)); 1025cd620004SJunchao Zhang PetscFunctionReturn(0); 1026cd620004SJunchao Zhang } 1027cd620004SJunchao Zhang 102871438e86SJunchao Zhang /* Pack rootdata to rootbuf, which are in the same memory space */ 102971438e86SJunchao Zhang PetscErrorCode PetscSFLinkPackRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,const void *rootdata) 103071438e86SJunchao Zhang { 103171438e86SJunchao Zhang PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 103271438e86SJunchao Zhang 103371438e86SJunchao Zhang PetscFunctionBegin; 103471438e86SJunchao Zhang if (scope == PETSCSF_REMOTE) { /* Sync the device if rootdata is not on petsc default stream */ 10359566063dSJacob Faibussowitsch if (PetscMemTypeDevice(link->rootmtype) && link->SyncDevice && sf->unknown_input_stream) PetscCall((*link->SyncDevice)(link)); 10369566063dSJacob Faibussowitsch if (link->PrePack) PetscCall((*link->PrePack)(sf,link,PETSCSF_ROOT2LEAF)); /* Used by SF nvshmem */ 103771438e86SJunchao Zhang } 10389566063dSJacob Faibussowitsch PetscCall(PetscLogEventBegin(PETSCSF_Pack,sf,0,0,0)); 10399566063dSJacob Faibussowitsch if (bas->rootbuflen[scope]) PetscCall(PetscSFLinkPackRootData_Private(sf,link,scope,rootdata)); 10409566063dSJacob Faibussowitsch PetscCall(PetscLogEventEnd(PETSCSF_Pack,sf,0,0,0)); 104171438e86SJunchao Zhang PetscFunctionReturn(0); 104271438e86SJunchao Zhang } 104371438e86SJunchao Zhang /* Pack leafdata to leafbuf, which are in the same memory space */ 104471438e86SJunchao Zhang PetscErrorCode PetscSFLinkPackLeafData(PetscSF sf,PetscSFLink link,PetscSFScope scope,const void *leafdata) 104571438e86SJunchao Zhang { 104671438e86SJunchao Zhang PetscFunctionBegin; 104771438e86SJunchao Zhang if (scope == PETSCSF_REMOTE) { 10489566063dSJacob Faibussowitsch if (PetscMemTypeDevice(link->leafmtype) && link->SyncDevice && sf->unknown_input_stream) PetscCall((*link->SyncDevice)(link)); 10499566063dSJacob Faibussowitsch if (link->PrePack) PetscCall((*link->PrePack)(sf,link,PETSCSF_LEAF2ROOT)); /* Used by SF nvshmem */ 105071438e86SJunchao Zhang } 10519566063dSJacob Faibussowitsch PetscCall(PetscLogEventBegin(PETSCSF_Pack,sf,0,0,0)); 10529566063dSJacob Faibussowitsch if (sf->leafbuflen[scope]) PetscCall(PetscSFLinkPackLeafData_Private(sf,link,scope,leafdata)); 10539566063dSJacob Faibussowitsch PetscCall(PetscLogEventEnd(PETSCSF_Pack,sf,0,0,0)); 105471438e86SJunchao Zhang PetscFunctionReturn(0); 105571438e86SJunchao Zhang } 105671438e86SJunchao Zhang 105771438e86SJunchao Zhang PetscErrorCode PetscSFLinkUnpackRootData_Private(PetscSF sf,PetscSFLink link,PetscSFScope scope,void *rootdata,MPI_Op op) 1058cd620004SJunchao Zhang { 1059cd620004SJunchao Zhang const PetscInt *rootindices = NULL; 1060cd620004SJunchao Zhang PetscInt count,start; 1061cd620004SJunchao Zhang PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 1062fcc7397dSJunchao Zhang PetscErrorCode (*UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*) = NULL; 1063cd620004SJunchao Zhang PetscMemType rootmtype = link->rootmtype; 1064fcc7397dSJunchao Zhang PetscSFPackOpt opt = NULL; 1065cd620004SJunchao Zhang 1066cd620004SJunchao Zhang PetscFunctionBegin; 106771438e86SJunchao Zhang if (!link->rootdirect[scope]) { /* If rootdata works directly as rootbuf, skip unpacking */ 10689566063dSJacob Faibussowitsch PetscCall(PetscSFLinkGetUnpackAndOp(link,rootmtype,op,bas->rootdups[scope],&UnpackAndOp)); 1069cd620004SJunchao Zhang if (UnpackAndOp) { 10709566063dSJacob Faibussowitsch PetscCall(PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,scope,&count,&start,&opt,&rootindices)); 10719566063dSJacob Faibussowitsch PetscCall((*UnpackAndOp)(link,count,start,opt,rootindices,rootdata,link->rootbuf[scope][rootmtype])); 1072cd620004SJunchao Zhang } else { 10739566063dSJacob Faibussowitsch PetscCall(PetscSFLinkGetRootPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,scope,&count,&start,&opt,&rootindices)); 10749566063dSJacob Faibussowitsch PetscCall(PetscSFLinkUnpackDataWithMPIReduceLocal(sf,link,count,start,rootindices,rootdata,link->rootbuf[scope][rootmtype],op)); 1075cd620004SJunchao Zhang } 1076cd620004SJunchao Zhang } 10779566063dSJacob Faibussowitsch PetscCall(PetscSFLinkLogFlopsAfterUnpackRootData(sf,link,scope,op)); 1078cd620004SJunchao Zhang PetscFunctionReturn(0); 1079cd620004SJunchao Zhang } 1080cd620004SJunchao Zhang 108171438e86SJunchao Zhang PetscErrorCode PetscSFLinkUnpackLeafData_Private(PetscSF sf,PetscSFLink link,PetscSFScope scope,void *leafdata,MPI_Op op) 1082cd620004SJunchao Zhang { 1083cd620004SJunchao Zhang const PetscInt *leafindices = NULL; 1084cd620004SJunchao Zhang PetscInt count,start; 1085fcc7397dSJunchao Zhang PetscErrorCode (*UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*) = NULL; 1086cd620004SJunchao Zhang PetscMemType leafmtype = link->leafmtype; 1087fcc7397dSJunchao Zhang PetscSFPackOpt opt = NULL; 1088cd620004SJunchao Zhang 1089cd620004SJunchao Zhang PetscFunctionBegin; 109071438e86SJunchao Zhang if (!link->leafdirect[scope]) { /* If leafdata works directly as rootbuf, skip unpacking */ 10919566063dSJacob Faibussowitsch PetscCall(PetscSFLinkGetUnpackAndOp(link,leafmtype,op,sf->leafdups[scope],&UnpackAndOp)); 1092cd620004SJunchao Zhang if (UnpackAndOp) { 10939566063dSJacob Faibussowitsch PetscCall(PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,scope,&count,&start,&opt,&leafindices)); 10949566063dSJacob Faibussowitsch PetscCall((*UnpackAndOp)(link,count,start,opt,leafindices,leafdata,link->leafbuf[scope][leafmtype])); 1095cd620004SJunchao Zhang } else { 10969566063dSJacob Faibussowitsch PetscCall(PetscSFLinkGetLeafPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,scope,&count,&start,&opt,&leafindices)); 10979566063dSJacob Faibussowitsch PetscCall(PetscSFLinkUnpackDataWithMPIReduceLocal(sf,link,count,start,leafindices,leafdata,link->leafbuf[scope][leafmtype],op)); 1098cd620004SJunchao Zhang } 1099cd620004SJunchao Zhang } 11009566063dSJacob Faibussowitsch PetscCall(PetscSFLinkLogFlopsAfterUnpackLeafData(sf,link,scope,op)); 110171438e86SJunchao Zhang PetscFunctionReturn(0); 110271438e86SJunchao Zhang } 110371438e86SJunchao Zhang /* Unpack rootbuf to rootdata, which are in the same memory space */ 110471438e86SJunchao Zhang PetscErrorCode PetscSFLinkUnpackRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,void *rootdata,MPI_Op op) 110571438e86SJunchao Zhang { 110671438e86SJunchao Zhang PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 110771438e86SJunchao Zhang 110871438e86SJunchao Zhang PetscFunctionBegin; 11099566063dSJacob Faibussowitsch PetscCall(PetscLogEventBegin(PETSCSF_Unpack,sf,0,0,0)); 11109566063dSJacob Faibussowitsch if (bas->rootbuflen[scope]) PetscCall(PetscSFLinkUnpackRootData_Private(sf,link,scope,rootdata,op)); 11119566063dSJacob Faibussowitsch PetscCall(PetscLogEventEnd(PETSCSF_Unpack,sf,0,0,0)); 111271438e86SJunchao Zhang if (scope == PETSCSF_REMOTE) { 11139566063dSJacob Faibussowitsch if (link->PostUnpack) PetscCall((*link->PostUnpack)(sf,link,PETSCSF_LEAF2ROOT)); /* Used by SF nvshmem */ 11149566063dSJacob Faibussowitsch if (PetscMemTypeDevice(link->rootmtype) && link->SyncDevice && sf->unknown_input_stream) PetscCall((*link->SyncDevice)(link)); 111571438e86SJunchao Zhang } 1116cd620004SJunchao Zhang PetscFunctionReturn(0); 1117cd620004SJunchao Zhang } 1118cd620004SJunchao Zhang 111971438e86SJunchao Zhang /* Unpack leafbuf to leafdata for remote (common case) or local (rare case when rootmtype != leafmtype) */ 112071438e86SJunchao Zhang PetscErrorCode PetscSFLinkUnpackLeafData(PetscSF sf,PetscSFLink link,PetscSFScope scope,void *leafdata,MPI_Op op) 112171438e86SJunchao Zhang { 112271438e86SJunchao Zhang PetscFunctionBegin; 11239566063dSJacob Faibussowitsch PetscCall(PetscLogEventBegin(PETSCSF_Unpack,sf,0,0,0)); 11249566063dSJacob Faibussowitsch if (sf->leafbuflen[scope]) PetscCall(PetscSFLinkUnpackLeafData_Private(sf,link,scope,leafdata,op)); 11259566063dSJacob Faibussowitsch PetscCall(PetscLogEventEnd(PETSCSF_Unpack,sf,0,0,0)); 112671438e86SJunchao Zhang if (scope == PETSCSF_REMOTE) { 11279566063dSJacob Faibussowitsch if (link->PostUnpack) PetscCall((*link->PostUnpack)(sf,link,PETSCSF_ROOT2LEAF)); /* Used by SF nvshmem */ 11289566063dSJacob Faibussowitsch if (PetscMemTypeDevice(link->leafmtype) && link->SyncDevice && sf->unknown_input_stream) PetscCall((*link->SyncDevice)(link)); 112971438e86SJunchao Zhang } 113071438e86SJunchao Zhang PetscFunctionReturn(0); 113171438e86SJunchao Zhang } 113271438e86SJunchao Zhang 113371438e86SJunchao Zhang /* FetchAndOp rootdata with rootbuf, it is a kind of Unpack on rootdata, except it also updates rootbuf */ 113471438e86SJunchao Zhang PetscErrorCode PetscSFLinkFetchAndOpRemote(PetscSF sf,PetscSFLink link,void *rootdata,MPI_Op op) 1135cd620004SJunchao Zhang { 1136cd620004SJunchao Zhang const PetscInt *rootindices = NULL; 1137cd620004SJunchao Zhang PetscInt count,start; 1138cd620004SJunchao Zhang PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 1139fcc7397dSJunchao Zhang PetscErrorCode (*FetchAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,void*) = NULL; 1140cd620004SJunchao Zhang PetscMemType rootmtype = link->rootmtype; 1141fcc7397dSJunchao Zhang PetscSFPackOpt opt = NULL; 1142cd620004SJunchao Zhang 1143cd620004SJunchao Zhang PetscFunctionBegin; 11449566063dSJacob Faibussowitsch PetscCall(PetscLogEventBegin(PETSCSF_Unpack,sf,0,0,0)); 114571438e86SJunchao Zhang if (bas->rootbuflen[PETSCSF_REMOTE]) { 1146cd620004SJunchao Zhang /* Do FetchAndOp on rootdata with rootbuf */ 11479566063dSJacob Faibussowitsch PetscCall(PetscSFLinkGetFetchAndOp(link,rootmtype,op,bas->rootdups[PETSCSF_REMOTE],&FetchAndOp)); 11489566063dSJacob Faibussowitsch PetscCall(PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,PETSCSF_REMOTE,&count,&start,&opt,&rootindices)); 11499566063dSJacob Faibussowitsch PetscCall((*FetchAndOp)(link,count,start,opt,rootindices,rootdata,link->rootbuf[PETSCSF_REMOTE][rootmtype])); 1150cd620004SJunchao Zhang } 11519566063dSJacob Faibussowitsch PetscCall(PetscSFLinkLogFlopsAfterUnpackRootData(sf,link,PETSCSF_REMOTE,op)); 11529566063dSJacob Faibussowitsch PetscCall(PetscLogEventEnd(PETSCSF_Unpack,sf,0,0,0)); 1153cd620004SJunchao Zhang PetscFunctionReturn(0); 1154cd620004SJunchao Zhang } 1155cd620004SJunchao Zhang 115671438e86SJunchao Zhang PetscErrorCode PetscSFLinkScatterLocal(PetscSF sf,PetscSFLink link,PetscSFDirection direction,void *rootdata,void *leafdata,MPI_Op op) 1157cd620004SJunchao Zhang { 1158cd620004SJunchao Zhang const PetscInt *rootindices = NULL,*leafindices = NULL; 1159cd620004SJunchao Zhang PetscInt count,rootstart,leafstart; 1160cd620004SJunchao Zhang PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 1161fcc7397dSJunchao Zhang PetscErrorCode (*ScatterAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*) = NULL; 116271438e86SJunchao Zhang PetscMemType rootmtype = link->rootmtype,leafmtype = link->leafmtype,srcmtype,dstmtype; 1163fcc7397dSJunchao Zhang PetscSFPackOpt leafopt = NULL,rootopt = NULL; 116471438e86SJunchao Zhang PetscInt buflen = sf->leafbuflen[PETSCSF_LOCAL]; 116571438e86SJunchao Zhang char *srcbuf = NULL,*dstbuf = NULL; 116671438e86SJunchao Zhang PetscBool dstdups; 1167cd620004SJunchao Zhang 1168362febeeSStefano Zampini PetscFunctionBegin; 116971438e86SJunchao Zhang if (!buflen) PetscFunctionReturn(0); 117071438e86SJunchao Zhang if (rootmtype != leafmtype) { /* The cross memory space local scatter is done by pack, copy and unpack */ 117171438e86SJunchao Zhang if (direction == PETSCSF_ROOT2LEAF) { 11729566063dSJacob Faibussowitsch PetscCall(PetscSFLinkPackRootData(sf,link,PETSCSF_LOCAL,rootdata)); 117371438e86SJunchao Zhang srcmtype = rootmtype; 117471438e86SJunchao Zhang srcbuf = link->rootbuf[PETSCSF_LOCAL][rootmtype]; 117571438e86SJunchao Zhang dstmtype = leafmtype; 117671438e86SJunchao Zhang dstbuf = link->leafbuf[PETSCSF_LOCAL][leafmtype]; 117771438e86SJunchao Zhang } else { 11789566063dSJacob Faibussowitsch PetscCall(PetscSFLinkPackLeafData(sf,link,PETSCSF_LOCAL,leafdata)); 117971438e86SJunchao Zhang srcmtype = leafmtype; 118071438e86SJunchao Zhang srcbuf = link->leafbuf[PETSCSF_LOCAL][leafmtype]; 118171438e86SJunchao Zhang dstmtype = rootmtype; 118271438e86SJunchao Zhang dstbuf = link->rootbuf[PETSCSF_LOCAL][rootmtype]; 118371438e86SJunchao Zhang } 11849566063dSJacob Faibussowitsch PetscCall((*link->Memcpy)(link,dstmtype,dstbuf,srcmtype,srcbuf,buflen*link->unitbytes)); 118571438e86SJunchao Zhang /* If above is a device to host copy, we have to sync the stream before accessing the buffer on host */ 11869566063dSJacob Faibussowitsch if (PetscMemTypeHost(dstmtype)) PetscCall((*link->SyncStream)(link)); 118771438e86SJunchao Zhang if (direction == PETSCSF_ROOT2LEAF) { 11889566063dSJacob Faibussowitsch PetscCall(PetscSFLinkUnpackLeafData(sf,link,PETSCSF_LOCAL,leafdata,op)); 1189cd620004SJunchao Zhang } else { 11909566063dSJacob Faibussowitsch PetscCall(PetscSFLinkUnpackRootData(sf,link,PETSCSF_LOCAL,rootdata,op)); 119171438e86SJunchao Zhang } 119271438e86SJunchao Zhang } else { 119371438e86SJunchao Zhang dstdups = (direction == PETSCSF_ROOT2LEAF) ? sf->leafdups[PETSCSF_LOCAL] : bas->rootdups[PETSCSF_LOCAL]; 119471438e86SJunchao Zhang dstmtype = (direction == PETSCSF_ROOT2LEAF) ? link->leafmtype : link->rootmtype; 11959566063dSJacob Faibussowitsch PetscCall(PetscSFLinkGetScatterAndOp(link,dstmtype,op,dstdups,&ScatterAndOp)); 1196cd620004SJunchao Zhang if (ScatterAndOp) { 11979566063dSJacob Faibussowitsch PetscCall(PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,PETSCSF_LOCAL,&count,&rootstart,&rootopt,&rootindices)); 11989566063dSJacob Faibussowitsch PetscCall(PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,PETSCSF_LOCAL,&count,&leafstart,&leafopt,&leafindices)); 119971438e86SJunchao Zhang if (direction == PETSCSF_ROOT2LEAF) { 12009566063dSJacob Faibussowitsch PetscCall((*ScatterAndOp)(link,count,rootstart,rootopt,rootindices,rootdata,leafstart,leafopt,leafindices,leafdata)); 1201cd620004SJunchao Zhang } else { 12029566063dSJacob Faibussowitsch PetscCall((*ScatterAndOp)(link,count,leafstart,leafopt,leafindices,leafdata,rootstart,rootopt,rootindices,rootdata)); 120371438e86SJunchao Zhang } 1204cd620004SJunchao Zhang } else { 12059566063dSJacob Faibussowitsch PetscCall(PetscSFLinkGetRootPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,PETSCSF_LOCAL,&count,&rootstart,&rootopt,&rootindices)); 12069566063dSJacob Faibussowitsch PetscCall(PetscSFLinkGetLeafPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,PETSCSF_LOCAL,&count,&leafstart,&leafopt,&leafindices)); 120771438e86SJunchao Zhang if (direction == PETSCSF_ROOT2LEAF) { 12089566063dSJacob Faibussowitsch PetscCall(PetscSFLinkScatterDataWithMPIReduceLocal(sf,link,count,rootstart,rootindices,rootdata,leafstart,leafindices,leafdata,op)); 120971438e86SJunchao Zhang } else { 12109566063dSJacob Faibussowitsch PetscCall(PetscSFLinkScatterDataWithMPIReduceLocal(sf,link,count,leafstart,leafindices,leafdata,rootstart,rootindices,rootdata,op)); 1211cd620004SJunchao Zhang } 1212cd620004SJunchao Zhang } 121371438e86SJunchao Zhang } 1214cd620004SJunchao Zhang PetscFunctionReturn(0); 1215cd620004SJunchao Zhang } 1216cd620004SJunchao Zhang 1217cd620004SJunchao Zhang /* Fetch rootdata to leafdata and leafupdate locally */ 1218cd620004SJunchao Zhang PetscErrorCode PetscSFLinkFetchAndOpLocal(PetscSF sf,PetscSFLink link,void *rootdata,const void *leafdata,void *leafupdate,MPI_Op op) 1219cd620004SJunchao Zhang { 1220cd620004SJunchao Zhang const PetscInt *rootindices = NULL,*leafindices = NULL; 1221cd620004SJunchao Zhang PetscInt count,rootstart,leafstart; 1222cd620004SJunchao Zhang PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 1223fcc7397dSJunchao Zhang PetscErrorCode (*FetchAndOpLocal)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*) = NULL; 1224cd620004SJunchao Zhang const PetscMemType rootmtype = link->rootmtype,leafmtype = link->leafmtype; 1225fcc7397dSJunchao Zhang PetscSFPackOpt leafopt = NULL,rootopt = NULL; 1226cd620004SJunchao Zhang 1227cd620004SJunchao Zhang PetscFunctionBegin; 1228cd620004SJunchao Zhang if (!bas->rootbuflen[PETSCSF_LOCAL]) PetscFunctionReturn(0); 1229cd620004SJunchao Zhang if (rootmtype != leafmtype) { 1230cd620004SJunchao Zhang /* The local communication has to go through pack and unpack */ 1231cd620004SJunchao Zhang SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Doing PetscSFFetchAndOp with rootdata and leafdata on opposite side of CPU and GPU"); 1232cd620004SJunchao Zhang } else { 12339566063dSJacob Faibussowitsch PetscCall(PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,PETSCSF_LOCAL,&count,&rootstart,&rootopt,&rootindices)); 12349566063dSJacob Faibussowitsch PetscCall(PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,PETSCSF_LOCAL,&count,&leafstart,&leafopt,&leafindices)); 12359566063dSJacob Faibussowitsch PetscCall(PetscSFLinkGetFetchAndOpLocal(link,rootmtype,op,bas->rootdups[PETSCSF_LOCAL],&FetchAndOpLocal)); 12369566063dSJacob Faibussowitsch PetscCall((*FetchAndOpLocal)(link,count,rootstart,rootopt,rootindices,rootdata,leafstart,leafopt,leafindices,leafdata,leafupdate)); 1237cd620004SJunchao Zhang } 123840e23c03SJunchao Zhang PetscFunctionReturn(0); 123940e23c03SJunchao Zhang } 124040e23c03SJunchao Zhang 124140e23c03SJunchao Zhang /* 1242cd620004SJunchao Zhang Create per-rank pack/unpack optimizations based on indice patterns 124340e23c03SJunchao Zhang 124440e23c03SJunchao Zhang Input Parameters: 1245fcc7397dSJunchao Zhang + n - Number of destination ranks 1246eb02082bSJunchao Zhang . offset - [n+1] For the i-th rank, its associated indices are idx[offset[i], offset[i+1]). offset[0] needs not to be 0. 1247b23bfdefSJunchao Zhang - idx - [*] Array storing indices 124840e23c03SJunchao Zhang 124940e23c03SJunchao Zhang Output Parameters: 1250cd620004SJunchao Zhang + opt - Pack optimizations. NULL if no optimizations. 125140e23c03SJunchao Zhang */ 1252cd620004SJunchao Zhang PetscErrorCode PetscSFCreatePackOpt(PetscInt n,const PetscInt *offset,const PetscInt *idx,PetscSFPackOpt *out) 125340e23c03SJunchao Zhang { 1254fcc7397dSJunchao Zhang PetscInt r,p,start,i,j,k,dx,dy,dz,dydz,m,X,Y; 1255fcc7397dSJunchao Zhang PetscBool optimizable = PETSC_TRUE; 125640e23c03SJunchao Zhang PetscSFPackOpt opt; 125740e23c03SJunchao Zhang 125840e23c03SJunchao Zhang PetscFunctionBegin; 12599566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(1,&opt)); 12609566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(7*n+2,&opt->array)); 1261fcc7397dSJunchao Zhang opt->n = opt->array[0] = n; 1262fcc7397dSJunchao Zhang opt->offset = opt->array + 1; 1263fcc7397dSJunchao Zhang opt->start = opt->array + n + 2; 1264fcc7397dSJunchao Zhang opt->dx = opt->array + 2*n + 2; 1265fcc7397dSJunchao Zhang opt->dy = opt->array + 3*n + 2; 1266fcc7397dSJunchao Zhang opt->dz = opt->array + 4*n + 2; 1267fcc7397dSJunchao Zhang opt->X = opt->array + 5*n + 2; 1268fcc7397dSJunchao Zhang opt->Y = opt->array + 6*n + 2; 1269fcc7397dSJunchao Zhang 1270fcc7397dSJunchao Zhang for (r=0; r<n; r++) { /* For each destination rank */ 1271fcc7397dSJunchao Zhang m = offset[r+1] - offset[r]; /* Total number of indices for this rank. We want to see if m can be factored into dx*dy*dz */ 1272fcc7397dSJunchao Zhang p = offset[r]; 1273fcc7397dSJunchao Zhang start = idx[p]; /* First index for this rank */ 1274fcc7397dSJunchao Zhang p++; 1275fcc7397dSJunchao Zhang 1276fcc7397dSJunchao Zhang /* Search in X dimension */ 1277fcc7397dSJunchao Zhang for (dx=1; dx<m; dx++,p++) { 1278fcc7397dSJunchao Zhang if (start+dx != idx[p]) break; 1279b23bfdefSJunchao Zhang } 1280b23bfdefSJunchao Zhang 1281fcc7397dSJunchao Zhang dydz = m/dx; 1282fcc7397dSJunchao Zhang X = dydz > 1 ? (idx[p]-start) : dx; 1283fcc7397dSJunchao Zhang /* Not optimizable if m is not a multiple of dx, or some unrecognized pattern is found */ 1284fcc7397dSJunchao Zhang if (m%dx || X <= 0) {optimizable = PETSC_FALSE; goto finish;} 1285fcc7397dSJunchao Zhang for (dy=1; dy<dydz; dy++) { /* Search in Y dimension */ 1286fcc7397dSJunchao Zhang for (i=0; i<dx; i++,p++) { 1287fcc7397dSJunchao Zhang if (start+X*dy+i != idx[p]) { 1288fcc7397dSJunchao Zhang if (i) {optimizable = PETSC_FALSE; goto finish;} /* The pattern is violated in the middle of an x-walk */ 1289fcc7397dSJunchao Zhang else goto Z_dimension; 129040e23c03SJunchao Zhang } 129140e23c03SJunchao Zhang } 129240e23c03SJunchao Zhang } 129340e23c03SJunchao Zhang 1294fcc7397dSJunchao Zhang Z_dimension: 1295fcc7397dSJunchao Zhang dz = m/(dx*dy); 1296fcc7397dSJunchao Zhang Y = dz > 1 ? (idx[p]-start)/X : dy; 1297fcc7397dSJunchao Zhang /* Not optimizable if m is not a multiple of dx*dy, or some unrecognized pattern is found */ 1298fcc7397dSJunchao Zhang if (m%(dx*dy) || Y <= 0) {optimizable = PETSC_FALSE; goto finish;} 1299fcc7397dSJunchao Zhang for (k=1; k<dz; k++) { /* Go through Z dimension to see if remaining indices follow the pattern */ 1300fcc7397dSJunchao Zhang for (j=0; j<dy; j++) { 1301fcc7397dSJunchao Zhang for (i=0; i<dx; i++,p++) { 1302fcc7397dSJunchao Zhang if (start+X*Y*k+X*j+i != idx[p]) {optimizable = PETSC_FALSE; goto finish;} 130340e23c03SJunchao Zhang } 130440e23c03SJunchao Zhang } 130540e23c03SJunchao Zhang } 1306fcc7397dSJunchao Zhang opt->start[r] = start; 1307fcc7397dSJunchao Zhang opt->dx[r] = dx; 1308fcc7397dSJunchao Zhang opt->dy[r] = dy; 1309fcc7397dSJunchao Zhang opt->dz[r] = dz; 1310fcc7397dSJunchao Zhang opt->X[r] = X; 1311fcc7397dSJunchao Zhang opt->Y[r] = Y; 131240e23c03SJunchao Zhang } 131340e23c03SJunchao Zhang 1314fcc7397dSJunchao Zhang finish: 1315fcc7397dSJunchao Zhang /* If not optimizable, free arrays to save memory */ 1316fcc7397dSJunchao Zhang if (!n || !optimizable) { 13179566063dSJacob Faibussowitsch PetscCall(PetscFree(opt->array)); 13189566063dSJacob Faibussowitsch PetscCall(PetscFree(opt)); 131940e23c03SJunchao Zhang *out = NULL; 1320fcc7397dSJunchao Zhang } else { 1321fcc7397dSJunchao Zhang opt->offset[0] = 0; 1322fcc7397dSJunchao Zhang for (r=0; r<n; r++) opt->offset[r+1] = opt->offset[r] + opt->dx[r]*opt->dy[r]*opt->dz[r]; 1323fcc7397dSJunchao Zhang *out = opt; 1324fcc7397dSJunchao Zhang } 132540e23c03SJunchao Zhang PetscFunctionReturn(0); 132640e23c03SJunchao Zhang } 132740e23c03SJunchao Zhang 13289fbee547SJacob Faibussowitsch static inline PetscErrorCode PetscSFDestroyPackOpt(PetscSF sf,PetscMemType mtype,PetscSFPackOpt *out) 132940e23c03SJunchao Zhang { 133040e23c03SJunchao Zhang PetscSFPackOpt opt = *out; 133140e23c03SJunchao Zhang 133240e23c03SJunchao Zhang PetscFunctionBegin; 133340e23c03SJunchao Zhang if (opt) { 13349566063dSJacob Faibussowitsch PetscCall(PetscSFFree(sf,mtype,opt->array)); 13359566063dSJacob Faibussowitsch PetscCall(PetscFree(opt)); 133640e23c03SJunchao Zhang *out = NULL; 133740e23c03SJunchao Zhang } 133840e23c03SJunchao Zhang PetscFunctionReturn(0); 133940e23c03SJunchao Zhang } 1340cd620004SJunchao Zhang 1341cd620004SJunchao Zhang PetscErrorCode PetscSFSetUpPackFields(PetscSF sf) 1342cd620004SJunchao Zhang { 1343cd620004SJunchao Zhang PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 1344cd620004SJunchao Zhang PetscInt i,j; 1345cd620004SJunchao Zhang 1346cd620004SJunchao Zhang PetscFunctionBegin; 1347cd620004SJunchao Zhang /* [0] for PETSCSF_LOCAL and [1] for PETSCSF_REMOTE in the following */ 1348cd620004SJunchao Zhang for (i=0; i<2; i++) { /* Set defaults */ 1349cd620004SJunchao Zhang sf->leafstart[i] = 0; 1350cd620004SJunchao Zhang sf->leafcontig[i] = PETSC_TRUE; 1351cd620004SJunchao Zhang sf->leafdups[i] = PETSC_FALSE; 1352cd620004SJunchao Zhang bas->rootstart[i] = 0; 1353cd620004SJunchao Zhang bas->rootcontig[i] = PETSC_TRUE; 1354cd620004SJunchao Zhang bas->rootdups[i] = PETSC_FALSE; 1355cd620004SJunchao Zhang } 1356cd620004SJunchao Zhang 1357cd620004SJunchao Zhang sf->leafbuflen[0] = sf->roffset[sf->ndranks]; 1358cd620004SJunchao Zhang sf->leafbuflen[1] = sf->roffset[sf->nranks] - sf->roffset[sf->ndranks]; 1359cd620004SJunchao Zhang 1360cd620004SJunchao Zhang if (sf->leafbuflen[0]) sf->leafstart[0] = sf->rmine[0]; 1361cd620004SJunchao Zhang if (sf->leafbuflen[1]) sf->leafstart[1] = sf->rmine[sf->roffset[sf->ndranks]]; 1362cd620004SJunchao Zhang 1363cd620004SJunchao Zhang /* Are leaf indices for self and remote contiguous? If yes, it is best for pack/unpack */ 1364cd620004SJunchao Zhang for (i=0; i<sf->roffset[sf->ndranks]; i++) { /* self */ 1365cd620004SJunchao Zhang if (sf->rmine[i] != sf->leafstart[0]+i) {sf->leafcontig[0] = PETSC_FALSE; break;} 1366cd620004SJunchao Zhang } 1367cd620004SJunchao Zhang for (i=sf->roffset[sf->ndranks],j=0; i<sf->roffset[sf->nranks]; i++,j++) { /* remote */ 1368cd620004SJunchao Zhang if (sf->rmine[i] != sf->leafstart[1]+j) {sf->leafcontig[1] = PETSC_FALSE; break;} 1369cd620004SJunchao Zhang } 1370cd620004SJunchao Zhang 1371cd620004SJunchao Zhang /* If not, see if we can have per-rank optimizations by doing index analysis */ 13729566063dSJacob Faibussowitsch if (!sf->leafcontig[0]) PetscCall(PetscSFCreatePackOpt(sf->ndranks, sf->roffset, sf->rmine, &sf->leafpackopt[0])); 13739566063dSJacob Faibussowitsch if (!sf->leafcontig[1]) PetscCall(PetscSFCreatePackOpt(sf->nranks-sf->ndranks, sf->roffset+sf->ndranks, sf->rmine, &sf->leafpackopt[1])); 1374cd620004SJunchao Zhang 1375cd620004SJunchao Zhang /* Are root indices for self and remote contiguous? */ 1376cd620004SJunchao Zhang bas->rootbuflen[0] = bas->ioffset[bas->ndiranks]; 1377cd620004SJunchao Zhang bas->rootbuflen[1] = bas->ioffset[bas->niranks] - bas->ioffset[bas->ndiranks]; 1378cd620004SJunchao Zhang 1379cd620004SJunchao Zhang if (bas->rootbuflen[0]) bas->rootstart[0] = bas->irootloc[0]; 1380cd620004SJunchao Zhang if (bas->rootbuflen[1]) bas->rootstart[1] = bas->irootloc[bas->ioffset[bas->ndiranks]]; 1381cd620004SJunchao Zhang 1382cd620004SJunchao Zhang for (i=0; i<bas->ioffset[bas->ndiranks]; i++) { 1383cd620004SJunchao Zhang if (bas->irootloc[i] != bas->rootstart[0]+i) {bas->rootcontig[0] = PETSC_FALSE; break;} 1384cd620004SJunchao Zhang } 1385cd620004SJunchao Zhang for (i=bas->ioffset[bas->ndiranks],j=0; i<bas->ioffset[bas->niranks]; i++,j++) { 1386cd620004SJunchao Zhang if (bas->irootloc[i] != bas->rootstart[1]+j) {bas->rootcontig[1] = PETSC_FALSE; break;} 1387cd620004SJunchao Zhang } 1388cd620004SJunchao Zhang 13899566063dSJacob Faibussowitsch if (!bas->rootcontig[0]) PetscCall(PetscSFCreatePackOpt(bas->ndiranks, bas->ioffset, bas->irootloc, &bas->rootpackopt[0])); 13909566063dSJacob Faibussowitsch if (!bas->rootcontig[1]) PetscCall(PetscSFCreatePackOpt(bas->niranks-bas->ndiranks, bas->ioffset+bas->ndiranks, bas->irootloc, &bas->rootpackopt[1])); 1391cd620004SJunchao Zhang 1392cd620004SJunchao Zhang /* Check dups in indices so that CUDA unpacking kernels can use cheaper regular instructions instead of atomics when they know there are no data race chances */ 1393013b3241SStefano Zampini if (PetscDefined(HAVE_DEVICE)) { 1394013b3241SStefano Zampini PetscBool ismulti = (sf->multi == sf) ? PETSC_TRUE : PETSC_FALSE; 13959566063dSJacob Faibussowitsch if (!sf->leafcontig[0] && !ismulti) PetscCall(PetscCheckDupsInt(sf->leafbuflen[0], sf->rmine, &sf->leafdups[0])); 13969566063dSJacob Faibussowitsch if (!sf->leafcontig[1] && !ismulti) PetscCall(PetscCheckDupsInt(sf->leafbuflen[1], sf->rmine+sf->roffset[sf->ndranks], &sf->leafdups[1])); 13979566063dSJacob Faibussowitsch if (!bas->rootcontig[0] && !ismulti) PetscCall(PetscCheckDupsInt(bas->rootbuflen[0], bas->irootloc, &bas->rootdups[0])); 13989566063dSJacob Faibussowitsch if (!bas->rootcontig[1] && !ismulti) PetscCall(PetscCheckDupsInt(bas->rootbuflen[1], bas->irootloc+bas->ioffset[bas->ndiranks], &bas->rootdups[1])); 1399013b3241SStefano Zampini } 1400cd620004SJunchao Zhang PetscFunctionReturn(0); 1401cd620004SJunchao Zhang } 1402cd620004SJunchao Zhang 1403cd620004SJunchao Zhang PetscErrorCode PetscSFResetPackFields(PetscSF sf) 1404cd620004SJunchao Zhang { 1405cd620004SJunchao Zhang PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 1406cd620004SJunchao Zhang PetscInt i; 1407cd620004SJunchao Zhang 1408cd620004SJunchao Zhang PetscFunctionBegin; 1409cd620004SJunchao Zhang for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) { 14109566063dSJacob Faibussowitsch PetscCall(PetscSFDestroyPackOpt(sf,PETSC_MEMTYPE_HOST,&sf->leafpackopt[i])); 14119566063dSJacob Faibussowitsch PetscCall(PetscSFDestroyPackOpt(sf,PETSC_MEMTYPE_HOST,&bas->rootpackopt[i])); 14127fd2d3dbSJunchao Zhang #if defined(PETSC_HAVE_DEVICE) 14139566063dSJacob Faibussowitsch PetscCall(PetscSFDestroyPackOpt(sf,PETSC_MEMTYPE_DEVICE,&sf->leafpackopt_d[i])); 14149566063dSJacob Faibussowitsch PetscCall(PetscSFDestroyPackOpt(sf,PETSC_MEMTYPE_DEVICE,&bas->rootpackopt_d[i])); 14157fd2d3dbSJunchao Zhang #endif 1416cd620004SJunchao Zhang } 1417cd620004SJunchao Zhang PetscFunctionReturn(0); 1418cd620004SJunchao Zhang } 1419