120c24465SJunchao Zhang #include "petsc/private/sfimpl.h" 240e23c03SJunchao Zhang #include <../src/vec/is/sf/impls/basic/sfpack.h> 340e23c03SJunchao Zhang #include <../src/vec/is/sf/impls/basic/sfbasic.h> 440e23c03SJunchao Zhang 57fd2d3dbSJunchao Zhang /* This is a C file that contains packing facilities, with dispatches to device if enabled. */ 67fd2d3dbSJunchao Zhang 740e23c03SJunchao Zhang /* 840e23c03SJunchao Zhang * MPI_Reduce_local is not really useful because it can't handle sparse data and it vectorizes "in the wrong direction", 940e23c03SJunchao Zhang * therefore we pack data types manually. This file defines packing routines for the standard data types. 1040e23c03SJunchao Zhang */ 1140e23c03SJunchao Zhang 12cd620004SJunchao Zhang #define CPPJoin4(a,b,c,d) a##_##b##_##c##_##d 1340e23c03SJunchao Zhang 14cd620004SJunchao Zhang /* Operations working like s += t */ 15cd620004SJunchao Zhang #define OP_BINARY(op,s,t) do {(s) = (s) op (t); } while (0) /* binary ops in the middle such as +, *, && etc. */ 16cd620004SJunchao Zhang #define OP_FUNCTION(op,s,t) do {(s) = op((s),(t)); } while (0) /* ops like a function, such as PetscMax, PetscMin */ 17cd620004SJunchao Zhang #define OP_LXOR(op,s,t) do {(s) = (!(s)) != (!(t));} while (0) /* logical exclusive OR */ 18cd620004SJunchao Zhang #define OP_ASSIGN(op,s,t) do {(s) = (t);} while (0) 19cd620004SJunchao Zhang /* Ref MPI MAXLOC */ 20cd620004SJunchao Zhang #define OP_XLOC(op,s,t) \ 21cd620004SJunchao Zhang do { \ 22cd620004SJunchao Zhang if ((s).u == (t).u) (s).i = PetscMin((s).i,(t).i); \ 23cd620004SJunchao Zhang else if (!((s).u op (t).u)) s = t; \ 24cd620004SJunchao Zhang } while (0) 2540e23c03SJunchao Zhang 2640e23c03SJunchao Zhang /* DEF_PackFunc - macro defining a Pack routine 2740e23c03SJunchao Zhang 2840e23c03SJunchao Zhang Arguments of the macro: 29b23bfdefSJunchao Zhang +Type Type of the basic data in an entry, i.e., int, PetscInt, PetscReal etc. It is not the type of an entry. 
30fcc7397dSJunchao Zhang .BS Block size for vectorization. It is a factor of bsz. 31b23bfdefSJunchao Zhang -EQ (bs == BS) ? 1 : 0. EQ is a compile-time const to help compiler optimizations. See below. 3240e23c03SJunchao Zhang 3340e23c03SJunchao Zhang Arguments of the Pack routine: 34cd620004SJunchao Zhang +count Number of indices in idx[]. 35fcc7397dSJunchao Zhang .start When opt and idx are NULL, it means indices are contiguous & start is the first index; otherwise, not used. 36fcc7397dSJunchao Zhang .opt Per-pack optimization plan. NULL means no such plan. 37fcc7397dSJunchao Zhang .idx Indices of entries to packed. 38eb02082bSJunchao Zhang .link Provide a context for the current call, such as link->bs, number of basic types in an entry. Ex. if unit is MPI_2INT, then bs=2 and the basic type is int. 39cd620004SJunchao Zhang .unpacked Address of the unpacked data. The entries will be packed are unpacked[idx[i]],for i in [0,count). 40cd620004SJunchao Zhang -packed Address of the packed data. 4140e23c03SJunchao Zhang */ 42b23bfdefSJunchao Zhang #define DEF_PackFunc(Type,BS,EQ) \ 43fcc7397dSJunchao Zhang static PetscErrorCode CPPJoin4(Pack,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,const void *unpacked,void *packed) \ 44b23bfdefSJunchao Zhang { \ 4540e23c03SJunchao Zhang PetscErrorCode ierr; \ 46b23bfdefSJunchao Zhang const Type *u = (const Type*)unpacked,*u2; \ 47b23bfdefSJunchao Zhang Type *p = (Type*)packed,*p2; \ 48fcc7397dSJunchao Zhang PetscInt i,j,k,X,Y,r,bs=link->bs; \ 49fcc7397dSJunchao Zhang const PetscInt M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */ \ 50b23bfdefSJunchao Zhang const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. 
*/ \ 5140e23c03SJunchao Zhang PetscFunctionBegin; \ 52fcc7397dSJunchao Zhang if (!idx) {ierr = PetscArraycpy(p,u+start*MBS,MBS*count);CHKERRQ(ierr);}/* idx[] are contiguous */ \ 53fcc7397dSJunchao Zhang else if (opt) { /* has optimizations available */ \ 54fcc7397dSJunchao Zhang p2 = p; \ 55fcc7397dSJunchao Zhang for (r=0; r<opt->n; r++) { \ 56fcc7397dSJunchao Zhang u2 = u + opt->start[r]*MBS; \ 57fcc7397dSJunchao Zhang X = opt->X[r]; \ 58fcc7397dSJunchao Zhang Y = opt->Y[r]; \ 59fcc7397dSJunchao Zhang for (k=0; k<opt->dz[r]; k++) \ 60fcc7397dSJunchao Zhang for (j=0; j<opt->dy[r]; j++) { \ 61fcc7397dSJunchao Zhang ierr = PetscArraycpy(p2,u2+(X*Y*k+X*j)*MBS,opt->dx[r]*MBS);CHKERRQ(ierr); \ 62fcc7397dSJunchao Zhang p2 += opt->dx[r]*MBS; \ 63fcc7397dSJunchao Zhang } \ 64fcc7397dSJunchao Zhang } \ 65fcc7397dSJunchao Zhang } else { \ 66b23bfdefSJunchao Zhang for (i=0; i<count; i++) \ 67eb02082bSJunchao Zhang for (j=0; j<M; j++) /* Decent compilers should eliminate this loop when M = const 1 */ \ 68eb02082bSJunchao Zhang for (k=0; k<BS; k++) /* Compiler either unrolls (BS=1) or vectorizes (BS=2,4,8,etc) this loop */ \ 69b23bfdefSJunchao Zhang p[i*MBS+j*BS+k] = u[idx[i]*MBS+j*BS+k]; \ 7040e23c03SJunchao Zhang } \ 7140e23c03SJunchao Zhang PetscFunctionReturn(0); \ 7240e23c03SJunchao Zhang } 7340e23c03SJunchao Zhang 74cd620004SJunchao Zhang /* DEF_Action - macro defining a UnpackAndInsert routine that unpacks data from a contiguous buffer 75cd620004SJunchao Zhang and inserts into a sparse array. 7640e23c03SJunchao Zhang 7740e23c03SJunchao Zhang Arguments: 78b23bfdefSJunchao Zhang .Type Type of the data 7940e23c03SJunchao Zhang .BS Block size for vectorization 80b23bfdefSJunchao Zhang .EQ (bs == BS) ? 1 : 0. EQ is a compile-time const. 8140e23c03SJunchao Zhang 8240e23c03SJunchao Zhang Notes: 8340e23c03SJunchao Zhang This macro is not combined with DEF_ActionAndOp because we want to use memcpy in this macro. 
8440e23c03SJunchao Zhang */ 85cd620004SJunchao Zhang #define DEF_UnpackFunc(Type,BS,EQ) \ 86fcc7397dSJunchao Zhang static PetscErrorCode CPPJoin4(UnpackAndInsert,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,void *unpacked,const void *packed) \ 87b23bfdefSJunchao Zhang { \ 8840e23c03SJunchao Zhang PetscErrorCode ierr; \ 89b23bfdefSJunchao Zhang Type *u = (Type*)unpacked,*u2; \ 90fcc7397dSJunchao Zhang const Type *p = (const Type*)packed; \ 91fcc7397dSJunchao Zhang PetscInt i,j,k,X,Y,r,bs=link->bs; \ 92fcc7397dSJunchao Zhang const PetscInt M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */ \ 93b23bfdefSJunchao Zhang const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */ \ 9440e23c03SJunchao Zhang PetscFunctionBegin; \ 95b23bfdefSJunchao Zhang if (!idx) { \ 96fcc7397dSJunchao Zhang u += start*MBS; \ 97fcc7397dSJunchao Zhang if (u != p) {ierr = PetscArraycpy(u,p,count*MBS);CHKERRQ(ierr);} \ 98fcc7397dSJunchao Zhang } else if (opt) { /* has optimizations available */ \ 99fcc7397dSJunchao Zhang for (r=0; r<opt->n; r++) { \ 100fcc7397dSJunchao Zhang u2 = u + opt->start[r]*MBS; \ 101fcc7397dSJunchao Zhang X = opt->X[r]; \ 102fcc7397dSJunchao Zhang Y = opt->Y[r]; \ 103fcc7397dSJunchao Zhang for (k=0; k<opt->dz[r]; k++) \ 104fcc7397dSJunchao Zhang for (j=0; j<opt->dy[r]; j++) { \ 105fcc7397dSJunchao Zhang ierr = PetscArraycpy(u2+(X*Y*k+X*j)*MBS,p,opt->dx[r]*MBS);CHKERRQ(ierr); \ 106fcc7397dSJunchao Zhang p += opt->dx[r]*MBS; \ 107fcc7397dSJunchao Zhang } \ 108fcc7397dSJunchao Zhang } \ 109fcc7397dSJunchao Zhang } else { \ 110b23bfdefSJunchao Zhang for (i=0; i<count; i++) \ 111b23bfdefSJunchao Zhang for (j=0; j<M; j++) \ 112cd620004SJunchao Zhang for (k=0; k<BS; k++) u[idx[i]*MBS+j*BS+k] = p[i*MBS+j*BS+k]; \ 11340e23c03SJunchao Zhang } \ 11440e23c03SJunchao Zhang PetscFunctionReturn(0); \ 11540e23c03SJunchao Zhang } 11640e23c03SJunchao Zhang 
117cd620004SJunchao Zhang /* DEF_UnpackAndOp - macro defining a UnpackAndOp routine where Op should not be Insert 11840e23c03SJunchao Zhang 11940e23c03SJunchao Zhang Arguments: 120cd620004SJunchao Zhang +Opname Name of the Op, such as Add, Mult, LAND, etc. 121b23bfdefSJunchao Zhang .Type Type of the data 12240e23c03SJunchao Zhang .BS Block size for vectorization 123b23bfdefSJunchao Zhang .EQ (bs == BS) ? 1 : 0. EQ is a compile-time const. 124cd620004SJunchao Zhang .Op Operator for the op, such as +, *, &&, ||, PetscMax, PetscMin, etc. 125cd620004SJunchao Zhang .OpApply Macro defining application of the op. Could be OP_BINARY, OP_FUNCTION, OP_LXOR 12640e23c03SJunchao Zhang */ 127cd620004SJunchao Zhang #define DEF_UnpackAndOp(Type,BS,EQ,Opname,Op,OpApply) \ 128fcc7397dSJunchao Zhang static PetscErrorCode CPPJoin4(UnpackAnd##Opname,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,void *unpacked,const void *packed) \ 129b23bfdefSJunchao Zhang { \ 130cd620004SJunchao Zhang Type *u = (Type*)unpacked,*u2; \ 131fcc7397dSJunchao Zhang const Type *p = (const Type*)packed; \ 132fcc7397dSJunchao Zhang PetscInt i,j,k,X,Y,r,bs=link->bs; \ 133fcc7397dSJunchao Zhang const PetscInt M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */ \ 134b23bfdefSJunchao Zhang const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. 
*/ \ 13540e23c03SJunchao Zhang PetscFunctionBegin; \ 136b23bfdefSJunchao Zhang if (!idx) { \ 137fcc7397dSJunchao Zhang u += start*MBS; \ 138cd620004SJunchao Zhang for (i=0; i<count; i++) \ 139cd620004SJunchao Zhang for (j=0; j<M; j++) \ 140cd620004SJunchao Zhang for (k=0; k<BS; k++) \ 141cd620004SJunchao Zhang OpApply(Op,u[i*MBS+j*BS+k],p[i*MBS+j*BS+k]); \ 142fcc7397dSJunchao Zhang } else if (opt) { /* idx[] has patterns */ \ 143fcc7397dSJunchao Zhang for (r=0; r<opt->n; r++) { \ 144fcc7397dSJunchao Zhang u2 = u + opt->start[r]*MBS; \ 145fcc7397dSJunchao Zhang X = opt->X[r]; \ 146fcc7397dSJunchao Zhang Y = opt->Y[r]; \ 147fcc7397dSJunchao Zhang for (k=0; k<opt->dz[r]; k++) \ 148fcc7397dSJunchao Zhang for (j=0; j<opt->dy[r]; j++) { \ 149fcc7397dSJunchao Zhang for (i=0; i<opt->dx[r]*MBS; i++) OpApply(Op,u2[(X*Y*k+X*j)*MBS+i],p[i]); \ 150fcc7397dSJunchao Zhang p += opt->dx[r]*MBS; \ 151fcc7397dSJunchao Zhang } \ 152fcc7397dSJunchao Zhang } \ 153fcc7397dSJunchao Zhang } else { \ 154cd620004SJunchao Zhang for (i=0; i<count; i++) \ 155cd620004SJunchao Zhang for (j=0; j<M; j++) \ 156cd620004SJunchao Zhang for (k=0; k<BS; k++) \ 157cd620004SJunchao Zhang OpApply(Op,u[idx[i]*MBS+j*BS+k],p[i*MBS+j*BS+k]); \ 158cd620004SJunchao Zhang } \ 159cd620004SJunchao Zhang PetscFunctionReturn(0); \ 160cd620004SJunchao Zhang } 161cd620004SJunchao Zhang 162cd620004SJunchao Zhang #define DEF_FetchAndOp(Type,BS,EQ,Opname,Op,OpApply) \ 163fcc7397dSJunchao Zhang static PetscErrorCode CPPJoin4(FetchAnd##Opname,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,void *unpacked,void *packed) \ 164cd620004SJunchao Zhang { \ 165fcc7397dSJunchao Zhang Type *u = (Type*)unpacked,*p = (Type*)packed,tmp; \ 166fcc7397dSJunchao Zhang PetscInt i,j,k,r,l,bs=link->bs; \ 167fcc7397dSJunchao Zhang const PetscInt M = (EQ) ? 
1 : bs/BS; \ 168fcc7397dSJunchao Zhang const PetscInt MBS = M*BS; \ 169cd620004SJunchao Zhang PetscFunctionBegin; \ 170fcc7397dSJunchao Zhang for (i=0; i<count; i++) { \ 171fcc7397dSJunchao Zhang r = (!idx ? start+i : idx[i])*MBS; \ 172fcc7397dSJunchao Zhang l = i*MBS; \ 173b23bfdefSJunchao Zhang for (j=0; j<M; j++) \ 174b23bfdefSJunchao Zhang for (k=0; k<BS; k++) { \ 175fcc7397dSJunchao Zhang tmp = u[r+j*BS+k]; \ 176fcc7397dSJunchao Zhang OpApply(Op,u[r+j*BS+k],p[l+j*BS+k]); \ 177fcc7397dSJunchao Zhang p[l+j*BS+k] = tmp; \ 178cd620004SJunchao Zhang } \ 179cd620004SJunchao Zhang } \ 180cd620004SJunchao Zhang PetscFunctionReturn(0); \ 181cd620004SJunchao Zhang } 182cd620004SJunchao Zhang 183cd620004SJunchao Zhang #define DEF_ScatterAndOp(Type,BS,EQ,Opname,Op,OpApply) \ 184fcc7397dSJunchao Zhang static PetscErrorCode CPPJoin4(ScatterAnd##Opname,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt srcStart,PetscSFPackOpt srcOpt,const PetscInt *srcIdx,const void *src,PetscInt dstStart,PetscSFPackOpt dstOpt,const PetscInt *dstIdx,void *dst) \ 185cd620004SJunchao Zhang { \ 186fcc7397dSJunchao Zhang PetscErrorCode ierr; \ 187fcc7397dSJunchao Zhang const Type *u = (const Type*)src; \ 188fcc7397dSJunchao Zhang Type *v = (Type*)dst; \ 189fcc7397dSJunchao Zhang PetscInt i,j,k,s,t,X,Y,bs = link->bs; \ 190cd620004SJunchao Zhang const PetscInt M = (EQ) ? 
1 : bs/BS; \ 191cd620004SJunchao Zhang const PetscInt MBS = M*BS; \ 192cd620004SJunchao Zhang PetscFunctionBegin; \ 193fcc7397dSJunchao Zhang if (!srcIdx) { /* src is contiguous */ \ 194fcc7397dSJunchao Zhang u += srcStart*MBS; \ 195fcc7397dSJunchao Zhang ierr = CPPJoin4(UnpackAnd##Opname,Type,BS,EQ)(link,count,dstStart,dstOpt,dstIdx,dst,u);CHKERRQ(ierr); \ 196fcc7397dSJunchao Zhang } else if (srcOpt && !dstIdx) { /* src is 3D, dst is contiguous */ \ 197fcc7397dSJunchao Zhang u += srcOpt->start[0]*MBS; \ 198fcc7397dSJunchao Zhang v += dstStart*MBS; \ 199fcc7397dSJunchao Zhang X = srcOpt->X[0]; Y = srcOpt->Y[0]; \ 200fcc7397dSJunchao Zhang for (k=0; k<srcOpt->dz[0]; k++) \ 201fcc7397dSJunchao Zhang for (j=0; j<srcOpt->dy[0]; j++) { \ 202fcc7397dSJunchao Zhang for (i=0; i<srcOpt->dx[0]*MBS; i++) OpApply(Op,v[i],u[(X*Y*k+X*j)*MBS+i]); \ 203fcc7397dSJunchao Zhang v += srcOpt->dx[0]*MBS; \ 204fcc7397dSJunchao Zhang } \ 205fcc7397dSJunchao Zhang } else { /* all other cases */ \ 206fcc7397dSJunchao Zhang for (i=0; i<count; i++) { \ 207fcc7397dSJunchao Zhang s = (!srcIdx ? srcStart+i : srcIdx[i])*MBS; \ 208fcc7397dSJunchao Zhang t = (!dstIdx ? 
dstStart+i : dstIdx[i])*MBS; \ 209cd620004SJunchao Zhang for (j=0; j<M; j++) \ 210fcc7397dSJunchao Zhang for (k=0; k<BS; k++) OpApply(Op,v[t+j*BS+k],u[s+j*BS+k]); \ 211fcc7397dSJunchao Zhang } \ 212cd620004SJunchao Zhang } \ 213cd620004SJunchao Zhang PetscFunctionReturn(0); \ 214cd620004SJunchao Zhang } 215cd620004SJunchao Zhang 216cd620004SJunchao Zhang #define DEF_FetchAndOpLocal(Type,BS,EQ,Opname,Op,OpApply) \ 217fcc7397dSJunchao Zhang static PetscErrorCode CPPJoin4(FetchAnd##Opname##Local,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt rootstart,PetscSFPackOpt rootopt,const PetscInt *rootidx,void *rootdata,PetscInt leafstart,PetscSFPackOpt leafopt,const PetscInt *leafidx,const void *leafdata,void *leafupdate) \ 218cd620004SJunchao Zhang { \ 219fcc7397dSJunchao Zhang Type *rdata = (Type*)rootdata,*lupdate = (Type*)leafupdate; \ 220fcc7397dSJunchao Zhang const Type *ldata = (const Type*)leafdata; \ 221fcc7397dSJunchao Zhang PetscInt i,j,k,r,l,bs = link->bs; \ 222cd620004SJunchao Zhang const PetscInt M = (EQ) ? 1 : bs/BS; \ 223cd620004SJunchao Zhang const PetscInt MBS = M*BS; \ 224cd620004SJunchao Zhang PetscFunctionBegin; \ 225fcc7397dSJunchao Zhang for (i=0; i<count; i++) { \ 226fcc7397dSJunchao Zhang r = (rootidx ? rootidx[i] : rootstart+i)*MBS; \ 227fcc7397dSJunchao Zhang l = (leafidx ? 
leafidx[i] : leafstart+i)*MBS; \ 228cd620004SJunchao Zhang for (j=0; j<M; j++) \ 229cd620004SJunchao Zhang for (k=0; k<BS; k++) { \ 230fcc7397dSJunchao Zhang lupdate[l+j*BS+k] = rdata[r+j*BS+k]; \ 231fcc7397dSJunchao Zhang OpApply(Op,rdata[r+j*BS+k],ldata[l+j*BS+k]); \ 23240e23c03SJunchao Zhang } \ 23340e23c03SJunchao Zhang } \ 23440e23c03SJunchao Zhang PetscFunctionReturn(0); \ 23540e23c03SJunchao Zhang } 23640e23c03SJunchao Zhang 237b23bfdefSJunchao Zhang /* Pack, Unpack/Fetch ops */ 238b23bfdefSJunchao Zhang #define DEF_Pack(Type,BS,EQ) \ 239b23bfdefSJunchao Zhang DEF_PackFunc(Type,BS,EQ) \ 240cd620004SJunchao Zhang DEF_UnpackFunc(Type,BS,EQ) \ 241cd620004SJunchao Zhang DEF_ScatterAndOp(Type,BS,EQ,Insert,=,OP_ASSIGN) \ 242cd620004SJunchao Zhang static void CPPJoin4(PackInit_Pack,Type,BS,EQ)(PetscSFLink link) { \ 243eb02082bSJunchao Zhang link->h_Pack = CPPJoin4(Pack, Type,BS,EQ); \ 244eb02082bSJunchao Zhang link->h_UnpackAndInsert = CPPJoin4(UnpackAndInsert,Type,BS,EQ); \ 245cd620004SJunchao Zhang link->h_ScatterAndInsert= CPPJoin4(ScatterAndInsert,Type,BS,EQ); \ 24640e23c03SJunchao Zhang } 24740e23c03SJunchao Zhang 248b23bfdefSJunchao Zhang /* Add, Mult ops */ 249b23bfdefSJunchao Zhang #define DEF_Add(Type,BS,EQ) \ 250cd620004SJunchao Zhang DEF_UnpackAndOp (Type,BS,EQ,Add, +,OP_BINARY) \ 251cd620004SJunchao Zhang DEF_UnpackAndOp (Type,BS,EQ,Mult,*,OP_BINARY) \ 252cd620004SJunchao Zhang DEF_FetchAndOp (Type,BS,EQ,Add, +,OP_BINARY) \ 253cd620004SJunchao Zhang DEF_ScatterAndOp (Type,BS,EQ,Add, +,OP_BINARY) \ 254cd620004SJunchao Zhang DEF_ScatterAndOp (Type,BS,EQ,Mult,*,OP_BINARY) \ 255cd620004SJunchao Zhang DEF_FetchAndOpLocal(Type,BS,EQ,Add, +,OP_BINARY) \ 256cd620004SJunchao Zhang static void CPPJoin4(PackInit_Add,Type,BS,EQ)(PetscSFLink link) { \ 257eb02082bSJunchao Zhang link->h_UnpackAndAdd = CPPJoin4(UnpackAndAdd, Type,BS,EQ); \ 258eb02082bSJunchao Zhang link->h_UnpackAndMult = CPPJoin4(UnpackAndMult, Type,BS,EQ); \ 259eb02082bSJunchao Zhang 
link->h_FetchAndAdd = CPPJoin4(FetchAndAdd, Type,BS,EQ); \ 260cd620004SJunchao Zhang link->h_ScatterAndAdd = CPPJoin4(ScatterAndAdd, Type,BS,EQ); \ 261cd620004SJunchao Zhang link->h_ScatterAndMult = CPPJoin4(ScatterAndMult, Type,BS,EQ); \ 262cd620004SJunchao Zhang link->h_FetchAndAddLocal = CPPJoin4(FetchAndAddLocal,Type,BS,EQ); \ 26340e23c03SJunchao Zhang } 26440e23c03SJunchao Zhang 265b23bfdefSJunchao Zhang /* Max, Min ops */ 266b23bfdefSJunchao Zhang #define DEF_Cmp(Type,BS,EQ) \ 267cd620004SJunchao Zhang DEF_UnpackAndOp (Type,BS,EQ,Max,PetscMax,OP_FUNCTION) \ 268cd620004SJunchao Zhang DEF_UnpackAndOp (Type,BS,EQ,Min,PetscMin,OP_FUNCTION) \ 269cd620004SJunchao Zhang DEF_ScatterAndOp(Type,BS,EQ,Max,PetscMax,OP_FUNCTION) \ 270cd620004SJunchao Zhang DEF_ScatterAndOp(Type,BS,EQ,Min,PetscMin,OP_FUNCTION) \ 271cd620004SJunchao Zhang static void CPPJoin4(PackInit_Compare,Type,BS,EQ)(PetscSFLink link) { \ 272eb02082bSJunchao Zhang link->h_UnpackAndMax = CPPJoin4(UnpackAndMax, Type,BS,EQ); \ 273eb02082bSJunchao Zhang link->h_UnpackAndMin = CPPJoin4(UnpackAndMin, Type,BS,EQ); \ 274cd620004SJunchao Zhang link->h_ScatterAndMax = CPPJoin4(ScatterAndMax, Type,BS,EQ); \ 275cd620004SJunchao Zhang link->h_ScatterAndMin = CPPJoin4(ScatterAndMin, Type,BS,EQ); \ 276b23bfdefSJunchao Zhang } 277b23bfdefSJunchao Zhang 278b23bfdefSJunchao Zhang /* Logical ops. 279cd620004SJunchao Zhang The operator in OP_LXOR should be empty but is ||. It is not used. 
Put here to avoid 28040e23c03SJunchao Zhang the compilation warning "empty macro arguments are undefined in ISO C90" 28140e23c03SJunchao Zhang */ 282b23bfdefSJunchao Zhang #define DEF_Log(Type,BS,EQ) \ 283cd620004SJunchao Zhang DEF_UnpackAndOp (Type,BS,EQ,LAND,&&,OP_BINARY) \ 284cd620004SJunchao Zhang DEF_UnpackAndOp (Type,BS,EQ,LOR, ||,OP_BINARY) \ 285cd620004SJunchao Zhang DEF_UnpackAndOp (Type,BS,EQ,LXOR,||, OP_LXOR) \ 286cd620004SJunchao Zhang DEF_ScatterAndOp(Type,BS,EQ,LAND,&&,OP_BINARY) \ 287cd620004SJunchao Zhang DEF_ScatterAndOp(Type,BS,EQ,LOR, ||,OP_BINARY) \ 288cd620004SJunchao Zhang DEF_ScatterAndOp(Type,BS,EQ,LXOR,||, OP_LXOR) \ 289cd620004SJunchao Zhang static void CPPJoin4(PackInit_Logical,Type,BS,EQ)(PetscSFLink link) { \ 290eb02082bSJunchao Zhang link->h_UnpackAndLAND = CPPJoin4(UnpackAndLAND, Type,BS,EQ); \ 291eb02082bSJunchao Zhang link->h_UnpackAndLOR = CPPJoin4(UnpackAndLOR, Type,BS,EQ); \ 292eb02082bSJunchao Zhang link->h_UnpackAndLXOR = CPPJoin4(UnpackAndLXOR, Type,BS,EQ); \ 293cd620004SJunchao Zhang link->h_ScatterAndLAND = CPPJoin4(ScatterAndLAND,Type,BS,EQ); \ 294cd620004SJunchao Zhang link->h_ScatterAndLOR = CPPJoin4(ScatterAndLOR, Type,BS,EQ); \ 295cd620004SJunchao Zhang link->h_ScatterAndLXOR = CPPJoin4(ScatterAndLXOR,Type,BS,EQ); \ 29640e23c03SJunchao Zhang } 29740e23c03SJunchao Zhang 298b23bfdefSJunchao Zhang /* Bitwise ops */ 299b23bfdefSJunchao Zhang #define DEF_Bit(Type,BS,EQ) \ 300cd620004SJunchao Zhang DEF_UnpackAndOp (Type,BS,EQ,BAND,&,OP_BINARY) \ 301cd620004SJunchao Zhang DEF_UnpackAndOp (Type,BS,EQ,BOR, |,OP_BINARY) \ 302cd620004SJunchao Zhang DEF_UnpackAndOp (Type,BS,EQ,BXOR,^,OP_BINARY) \ 303cd620004SJunchao Zhang DEF_ScatterAndOp(Type,BS,EQ,BAND,&,OP_BINARY) \ 304cd620004SJunchao Zhang DEF_ScatterAndOp(Type,BS,EQ,BOR, |,OP_BINARY) \ 305cd620004SJunchao Zhang DEF_ScatterAndOp(Type,BS,EQ,BXOR,^,OP_BINARY) \ 306cd620004SJunchao Zhang static void CPPJoin4(PackInit_Bitwise,Type,BS,EQ)(PetscSFLink link) { \ 307eb02082bSJunchao 
Zhang link->h_UnpackAndBAND = CPPJoin4(UnpackAndBAND, Type,BS,EQ); \ 308eb02082bSJunchao Zhang link->h_UnpackAndBOR = CPPJoin4(UnpackAndBOR, Type,BS,EQ); \ 309eb02082bSJunchao Zhang link->h_UnpackAndBXOR = CPPJoin4(UnpackAndBXOR, Type,BS,EQ); \ 310cd620004SJunchao Zhang link->h_ScatterAndBAND = CPPJoin4(ScatterAndBAND,Type,BS,EQ); \ 311cd620004SJunchao Zhang link->h_ScatterAndBOR = CPPJoin4(ScatterAndBOR, Type,BS,EQ); \ 312cd620004SJunchao Zhang link->h_ScatterAndBXOR = CPPJoin4(ScatterAndBXOR,Type,BS,EQ); \ 31340e23c03SJunchao Zhang } 31440e23c03SJunchao Zhang 315cd620004SJunchao Zhang /* Maxloc, Minloc ops */ 316cd620004SJunchao Zhang #define DEF_Xloc(Type,BS,EQ) \ 317cd620004SJunchao Zhang DEF_UnpackAndOp (Type,BS,EQ,Max,>,OP_XLOC) \ 318cd620004SJunchao Zhang DEF_UnpackAndOp (Type,BS,EQ,Min,<,OP_XLOC) \ 319cd620004SJunchao Zhang DEF_ScatterAndOp(Type,BS,EQ,Max,>,OP_XLOC) \ 320cd620004SJunchao Zhang DEF_ScatterAndOp(Type,BS,EQ,Min,<,OP_XLOC) \ 321cd620004SJunchao Zhang static void CPPJoin4(PackInit_Xloc,Type,BS,EQ)(PetscSFLink link) { \ 322cd620004SJunchao Zhang link->h_UnpackAndMaxloc = CPPJoin4(UnpackAndMax, Type,BS,EQ); \ 323cd620004SJunchao Zhang link->h_UnpackAndMinloc = CPPJoin4(UnpackAndMin, Type,BS,EQ); \ 324cd620004SJunchao Zhang link->h_ScatterAndMaxloc = CPPJoin4(ScatterAndMax,Type,BS,EQ); \ 325cd620004SJunchao Zhang link->h_ScatterAndMinloc = CPPJoin4(ScatterAndMin,Type,BS,EQ); \ 32640e23c03SJunchao Zhang } 32740e23c03SJunchao Zhang 328b23bfdefSJunchao Zhang #define DEF_IntegerType(Type,BS,EQ) \ 329b23bfdefSJunchao Zhang DEF_Pack(Type,BS,EQ) \ 330b23bfdefSJunchao Zhang DEF_Add(Type,BS,EQ) \ 331b23bfdefSJunchao Zhang DEF_Cmp(Type,BS,EQ) \ 332b23bfdefSJunchao Zhang DEF_Log(Type,BS,EQ) \ 333b23bfdefSJunchao Zhang DEF_Bit(Type,BS,EQ) \ 334cd620004SJunchao Zhang static void CPPJoin4(PackInit_IntegerType,Type,BS,EQ)(PetscSFLink link) { \ 335b23bfdefSJunchao Zhang CPPJoin4(PackInit_Pack,Type,BS,EQ)(link); \ 336b23bfdefSJunchao Zhang 
CPPJoin4(PackInit_Add,Type,BS,EQ)(link); \ 337b23bfdefSJunchao Zhang CPPJoin4(PackInit_Compare,Type,BS,EQ)(link); \ 338b23bfdefSJunchao Zhang CPPJoin4(PackInit_Logical,Type,BS,EQ)(link); \ 339b23bfdefSJunchao Zhang CPPJoin4(PackInit_Bitwise,Type,BS,EQ)(link); \ 34040e23c03SJunchao Zhang } 34140e23c03SJunchao Zhang 342b23bfdefSJunchao Zhang #define DEF_RealType(Type,BS,EQ) \ 343b23bfdefSJunchao Zhang DEF_Pack(Type,BS,EQ) \ 344b23bfdefSJunchao Zhang DEF_Add(Type,BS,EQ) \ 345b23bfdefSJunchao Zhang DEF_Cmp(Type,BS,EQ) \ 346cd620004SJunchao Zhang static void CPPJoin4(PackInit_RealType,Type,BS,EQ)(PetscSFLink link) { \ 347b23bfdefSJunchao Zhang CPPJoin4(PackInit_Pack,Type,BS,EQ)(link); \ 348b23bfdefSJunchao Zhang CPPJoin4(PackInit_Add,Type,BS,EQ)(link); \ 349b23bfdefSJunchao Zhang CPPJoin4(PackInit_Compare,Type,BS,EQ)(link); \ 350b23bfdefSJunchao Zhang } 35140e23c03SJunchao Zhang 35240e23c03SJunchao Zhang #if defined(PETSC_HAVE_COMPLEX) 353b23bfdefSJunchao Zhang #define DEF_ComplexType(Type,BS,EQ) \ 354b23bfdefSJunchao Zhang DEF_Pack(Type,BS,EQ) \ 355b23bfdefSJunchao Zhang DEF_Add(Type,BS,EQ) \ 356cd620004SJunchao Zhang static void CPPJoin4(PackInit_ComplexType,Type,BS,EQ)(PetscSFLink link) { \ 357b23bfdefSJunchao Zhang CPPJoin4(PackInit_Pack,Type,BS,EQ)(link); \ 358b23bfdefSJunchao Zhang CPPJoin4(PackInit_Add,Type,BS,EQ)(link); \ 359b23bfdefSJunchao Zhang } 36040e23c03SJunchao Zhang #endif 361b23bfdefSJunchao Zhang 362b23bfdefSJunchao Zhang #define DEF_DumbType(Type,BS,EQ) \ 363b23bfdefSJunchao Zhang DEF_Pack(Type,BS,EQ) \ 364cd620004SJunchao Zhang static void CPPJoin4(PackInit_DumbType,Type,BS,EQ)(PetscSFLink link) { \ 365b23bfdefSJunchao Zhang CPPJoin4(PackInit_Pack,Type,BS,EQ)(link); \ 366b23bfdefSJunchao Zhang } 367b23bfdefSJunchao Zhang 368b23bfdefSJunchao Zhang /* Maxloc, Minloc */ 369cd620004SJunchao Zhang #define DEF_PairType(Type,BS,EQ) \ 370cd620004SJunchao Zhang DEF_Pack(Type,BS,EQ) \ 371cd620004SJunchao Zhang DEF_Xloc(Type,BS,EQ) \ 372cd620004SJunchao Zhang 
static void CPPJoin4(PackInit_PairType,Type,BS,EQ)(PetscSFLink link) { \ 373cd620004SJunchao Zhang CPPJoin4(PackInit_Pack,Type,BS,EQ)(link); \ 374cd620004SJunchao Zhang CPPJoin4(PackInit_Xloc,Type,BS,EQ)(link); \ 375b23bfdefSJunchao Zhang } 376b23bfdefSJunchao Zhang 377b23bfdefSJunchao Zhang DEF_IntegerType(PetscInt,1,1) /* unit = 1 MPIU_INT */ 378b23bfdefSJunchao Zhang DEF_IntegerType(PetscInt,2,1) /* unit = 2 MPIU_INTs */ 379b23bfdefSJunchao Zhang DEF_IntegerType(PetscInt,4,1) /* unit = 4 MPIU_INTs */ 380b23bfdefSJunchao Zhang DEF_IntegerType(PetscInt,8,1) /* unit = 8 MPIU_INTs */ 381b23bfdefSJunchao Zhang DEF_IntegerType(PetscInt,1,0) /* unit = 1*n MPIU_INTs, n>1 */ 382b23bfdefSJunchao Zhang DEF_IntegerType(PetscInt,2,0) /* unit = 2*n MPIU_INTs, n>1 */ 383b23bfdefSJunchao Zhang DEF_IntegerType(PetscInt,4,0) /* unit = 4*n MPIU_INTs, n>1 */ 384b23bfdefSJunchao Zhang DEF_IntegerType(PetscInt,8,0) /* unit = 8*n MPIU_INTs, n>1. Routines with bigger BS are tried first. */ 385b23bfdefSJunchao Zhang 386b23bfdefSJunchao Zhang #if defined(PETSC_USE_64BIT_INDICES) /* Do not need (though it is OK) to generate redundant functions if PetscInt is int */ 387b23bfdefSJunchao Zhang DEF_IntegerType(int,1,1) 388b23bfdefSJunchao Zhang DEF_IntegerType(int,2,1) 389b23bfdefSJunchao Zhang DEF_IntegerType(int,4,1) 390b23bfdefSJunchao Zhang DEF_IntegerType(int,8,1) 391b23bfdefSJunchao Zhang DEF_IntegerType(int,1,0) 392b23bfdefSJunchao Zhang DEF_IntegerType(int,2,0) 393b23bfdefSJunchao Zhang DEF_IntegerType(int,4,0) 394b23bfdefSJunchao Zhang DEF_IntegerType(int,8,0) 395b23bfdefSJunchao Zhang #endif 396b23bfdefSJunchao Zhang 397b23bfdefSJunchao Zhang /* The typedefs are used to get a typename without space that CPPJoin can handle */ 398b23bfdefSJunchao Zhang typedef signed char SignedChar; 399b23bfdefSJunchao Zhang DEF_IntegerType(SignedChar,1,1) 400b23bfdefSJunchao Zhang DEF_IntegerType(SignedChar,2,1) 401b23bfdefSJunchao Zhang DEF_IntegerType(SignedChar,4,1) 402b23bfdefSJunchao Zhang 
/* Remaining SignedChar instantiations (block size 8 with bs == BS, then every block size with bs a multiple of BS) */
DEF_IntegerType(SignedChar,8,1)
DEF_IntegerType(SignedChar,1,0)
DEF_IntegerType(SignedChar,2,0)
DEF_IntegerType(SignedChar,4,0)
DEF_IntegerType(SignedChar,8,0)

typedef unsigned char UnsignedChar; /* A typename without space, so that it can be pasted into routine names */
DEF_IntegerType(UnsignedChar,1,1)
DEF_IntegerType(UnsignedChar,2,1)
DEF_IntegerType(UnsignedChar,4,1)
DEF_IntegerType(UnsignedChar,8,1)
DEF_IntegerType(UnsignedChar,1,0)
DEF_IntegerType(UnsignedChar,2,0)
DEF_IntegerType(UnsignedChar,4,0)
DEF_IntegerType(UnsignedChar,8,0)

DEF_RealType(PetscReal,1,1)
DEF_RealType(PetscReal,2,1)
DEF_RealType(PetscReal,4,1)
DEF_RealType(PetscReal,8,1)
DEF_RealType(PetscReal,1,0)
DEF_RealType(PetscReal,2,0)
DEF_RealType(PetscReal,4,0)
DEF_RealType(PetscReal,8,0)

#if defined(PETSC_HAVE_COMPLEX)
DEF_ComplexType(PetscComplex,1,1)
DEF_ComplexType(PetscComplex,2,1)
DEF_ComplexType(PetscComplex,4,1)
DEF_ComplexType(PetscComplex,8,1)
DEF_ComplexType(PetscComplex,1,0)
DEF_ComplexType(PetscComplex,2,0)
DEF_ComplexType(PetscComplex,4,0)
DEF_ComplexType(PetscComplex,8,0)
#endif

/* Build the single-token name Type1_Type2 of a {value,index} pair struct */
#define PairType(Type1,Type2) Type1##_##Type2
439cd620004SJunchao Zhang typedef struct {int u; int i;} PairType(int,int); 440cd620004SJunchao Zhang typedef struct {PetscInt u; PetscInt i;} PairType(PetscInt,PetscInt); 441cd620004SJunchao Zhang DEF_PairType(PairType(int,int),1,1) 442cd620004SJunchao Zhang DEF_PairType(PairType(PetscInt,PetscInt),1,1) 443b23bfdefSJunchao Zhang 444b23bfdefSJunchao Zhang /* If we don't know the basic type, we treat it as a stream of chars or ints */ 445b23bfdefSJunchao Zhang DEF_DumbType(char,1,1) 446b23bfdefSJunchao Zhang DEF_DumbType(char,2,1) 447b23bfdefSJunchao Zhang DEF_DumbType(char,4,1) 448b23bfdefSJunchao Zhang DEF_DumbType(char,1,0) 449b23bfdefSJunchao Zhang DEF_DumbType(char,2,0) 450b23bfdefSJunchao Zhang DEF_DumbType(char,4,0) 451b23bfdefSJunchao Zhang 452eb02082bSJunchao Zhang typedef int DumbInt; /* To have a different name than 'int' used above. The name is used to make routine names. */ 453b23bfdefSJunchao Zhang DEF_DumbType(DumbInt,1,1) 454b23bfdefSJunchao Zhang DEF_DumbType(DumbInt,2,1) 455b23bfdefSJunchao Zhang DEF_DumbType(DumbInt,4,1) 456b23bfdefSJunchao Zhang DEF_DumbType(DumbInt,8,1) 457b23bfdefSJunchao Zhang DEF_DumbType(DumbInt,1,0) 458b23bfdefSJunchao Zhang DEF_DumbType(DumbInt,2,0) 459b23bfdefSJunchao Zhang DEF_DumbType(DumbInt,4,0) 460b23bfdefSJunchao Zhang DEF_DumbType(DumbInt,8,0) 46140e23c03SJunchao Zhang 46240e23c03SJunchao Zhang #if !defined(PETSC_HAVE_MPI_TYPE_DUP) 4639fbee547SJacob Faibussowitsch static inline int MPI_Type_dup(MPI_Datatype datatype,MPI_Datatype *newtype) 46440e23c03SJunchao Zhang { 46540e23c03SJunchao Zhang int ierr; 46640e23c03SJunchao Zhang ierr = MPI_Type_contiguous(1,datatype,newtype); if (ierr) return ierr; 46740e23c03SJunchao Zhang ierr = MPI_Type_commit(newtype); if (ierr) return ierr; 46840e23c03SJunchao Zhang return MPI_SUCCESS; 46940e23c03SJunchao Zhang } 47040e23c03SJunchao Zhang #endif 47140e23c03SJunchao Zhang 47271438e86SJunchao Zhang PetscErrorCode PetscSFLinkDestroy(PetscSF sf,PetscSFLink link) 47340e23c03SJunchao 
Zhang { 47440e23c03SJunchao Zhang PetscErrorCode ierr; 475cd620004SJunchao Zhang PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 47671438e86SJunchao Zhang PetscInt i,nreqs = (bas->nrootreqs+sf->nleafreqs)*8; 47771438e86SJunchao Zhang 47871438e86SJunchao Zhang PetscFunctionBegin; 47971438e86SJunchao Zhang /* Destroy device-specific fields */ 48071438e86SJunchao Zhang if (link->deviceinited) {ierr = (*link->Destroy)(sf,link);CHKERRQ(ierr);} 48171438e86SJunchao Zhang 48271438e86SJunchao Zhang /* Destroy host related fields */ 48371438e86SJunchao Zhang if (!link->isbuiltin) {ierr = MPI_Type_free(&link->unit);CHKERRMPI(ierr);} 48471438e86SJunchao Zhang if (!link->use_nvshmem) { 48571438e86SJunchao Zhang for (i=0; i<nreqs; i++) { /* Persistent reqs must be freed. */ 48671438e86SJunchao Zhang if (link->reqs[i] != MPI_REQUEST_NULL) {ierr = MPI_Request_free(&link->reqs[i]);CHKERRMPI(ierr);} 48771438e86SJunchao Zhang } 48871438e86SJunchao Zhang ierr = PetscFree(link->reqs);CHKERRQ(ierr); 48971438e86SJunchao Zhang for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) { 49071438e86SJunchao Zhang ierr = PetscFree(link->rootbuf_alloc[i][PETSC_MEMTYPE_HOST]);CHKERRQ(ierr); 49171438e86SJunchao Zhang ierr = PetscFree(link->leafbuf_alloc[i][PETSC_MEMTYPE_HOST]);CHKERRQ(ierr); 49271438e86SJunchao Zhang } 49371438e86SJunchao Zhang } 49471438e86SJunchao Zhang ierr = PetscFree(link);CHKERRQ(ierr); 49571438e86SJunchao Zhang PetscFunctionReturn(0); 49671438e86SJunchao Zhang } 49771438e86SJunchao Zhang 49871438e86SJunchao Zhang PetscErrorCode PetscSFLinkCreate(PetscSF sf,MPI_Datatype unit,PetscMemType rootmtype,const void *rootdata,PetscMemType leafmtype,const void *leafdata,MPI_Op op,PetscSFOperation sfop,PetscSFLink *mylink) 49971438e86SJunchao Zhang { 50071438e86SJunchao Zhang PetscErrorCode ierr; 501cd620004SJunchao Zhang 502cd620004SJunchao Zhang PetscFunctionBegin; 503cd620004SJunchao Zhang ierr = PetscSFSetErrorOnUnsupportedOverlap(sf,unit,rootdata,leafdata);CHKERRQ(ierr); 
50471438e86SJunchao Zhang #if defined(PETSC_HAVE_NVSHMEM) 50571438e86SJunchao Zhang { 50671438e86SJunchao Zhang PetscBool use_nvshmem; 50771438e86SJunchao Zhang ierr = PetscSFLinkNvshmemCheck(sf,rootmtype,rootdata,leafmtype,leafdata,&use_nvshmem);CHKERRQ(ierr); 50871438e86SJunchao Zhang if (use_nvshmem) { 50971438e86SJunchao Zhang ierr = PetscSFLinkCreate_NVSHMEM(sf,unit,rootmtype,rootdata,leafmtype,leafdata,op,sfop,mylink);CHKERRQ(ierr); 51071438e86SJunchao Zhang PetscFunctionReturn(0); 511cd620004SJunchao Zhang } 512cd620004SJunchao Zhang } 5137fd2d3dbSJunchao Zhang #endif 51471438e86SJunchao Zhang ierr = PetscSFLinkCreate_MPI(sf,unit,rootmtype,rootdata,leafmtype,leafdata,op,sfop,mylink);CHKERRQ(ierr); 515cd620004SJunchao Zhang PetscFunctionReturn(0); 516cd620004SJunchao Zhang } 517cd620004SJunchao Zhang 518cd620004SJunchao Zhang /* Return root/leaf buffers and MPI requests attached to the link for MPI communication in the given direction. 519cd620004SJunchao Zhang If the sf uses persistent requests and the requests have not been initialized, then initialize them. 
520cd620004SJunchao Zhang */ 521cd620004SJunchao Zhang PetscErrorCode PetscSFLinkGetMPIBuffersAndRequests(PetscSF sf,PetscSFLink link,PetscSFDirection direction,void **rootbuf, void **leafbuf,MPI_Request **rootreqs,MPI_Request **leafreqs) 522cd620004SJunchao Zhang { 523cd620004SJunchao Zhang PetscErrorCode ierr; 524cd620004SJunchao Zhang PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 525c87b50c4SJunchao Zhang PetscInt i,j,cnt,nrootranks,ndrootranks,nleafranks,ndleafranks; 526cd620004SJunchao Zhang const PetscInt *rootoffset,*leafoffset; 527cd620004SJunchao Zhang MPI_Aint disp; 528cd620004SJunchao Zhang MPI_Comm comm = PetscObjectComm((PetscObject)sf); 529cd620004SJunchao Zhang MPI_Datatype unit = link->unit; 530cd620004SJunchao Zhang const PetscMemType rootmtype_mpi = link->rootmtype_mpi,leafmtype_mpi = link->leafmtype_mpi; /* Used to select buffers passed to MPI */ 531cd620004SJunchao Zhang const PetscInt rootdirect_mpi = link->rootdirect_mpi,leafdirect_mpi = link->leafdirect_mpi; 532cd620004SJunchao Zhang 533cd620004SJunchao Zhang PetscFunctionBegin; 534cd620004SJunchao Zhang /* Init persistent MPI requests if not yet. 
Currently only SFBasic uses persistent MPI */ 535cd620004SJunchao Zhang if (sf->persistent) { 536cd620004SJunchao Zhang if (rootreqs && bas->rootbuflen[PETSCSF_REMOTE] && !link->rootreqsinited[direction][rootmtype_mpi][rootdirect_mpi]) { 537cd620004SJunchao Zhang ierr = PetscSFGetRootInfo_Basic(sf,&nrootranks,&ndrootranks,NULL,&rootoffset,NULL);CHKERRQ(ierr); 538cd620004SJunchao Zhang if (direction == PETSCSF_LEAF2ROOT) { 539cd620004SJunchao Zhang for (i=ndrootranks,j=0; i<nrootranks; i++,j++) { 540cd620004SJunchao Zhang disp = (rootoffset[i] - rootoffset[ndrootranks])*link->unitbytes; 541c87b50c4SJunchao Zhang cnt = rootoffset[i+1]-rootoffset[i]; 542c87b50c4SJunchao Zhang ierr = MPIU_Recv_init(link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi]+disp,cnt,unit,bas->iranks[i],link->tag,comm,link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi]+j);CHKERRMPI(ierr); 543cd620004SJunchao Zhang } 544cd620004SJunchao Zhang } else { /* PETSCSF_ROOT2LEAF */ 545cd620004SJunchao Zhang for (i=ndrootranks,j=0; i<nrootranks; i++,j++) { 546cd620004SJunchao Zhang disp = (rootoffset[i] - rootoffset[ndrootranks])*link->unitbytes; 547c87b50c4SJunchao Zhang cnt = rootoffset[i+1]-rootoffset[i]; 548c87b50c4SJunchao Zhang ierr = MPIU_Send_init(link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi]+disp,cnt,unit,bas->iranks[i],link->tag,comm,link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi]+j);CHKERRMPI(ierr); 549cd620004SJunchao Zhang } 550cd620004SJunchao Zhang } 551cd620004SJunchao Zhang link->rootreqsinited[direction][rootmtype_mpi][rootdirect_mpi] = PETSC_TRUE; 552cd620004SJunchao Zhang } 553cd620004SJunchao Zhang 554cd620004SJunchao Zhang if (leafreqs && sf->leafbuflen[PETSCSF_REMOTE] && !link->leafreqsinited[direction][leafmtype_mpi][leafdirect_mpi]) { 555cd620004SJunchao Zhang ierr = PetscSFGetLeafInfo_Basic(sf,&nleafranks,&ndleafranks,NULL,&leafoffset,NULL,NULL);CHKERRQ(ierr); 556cd620004SJunchao Zhang if (direction == PETSCSF_LEAF2ROOT) { 557cd620004SJunchao Zhang for (i=ndleafranks,j=0; 
i<nleafranks; i++,j++) { 558cd620004SJunchao Zhang disp = (leafoffset[i] - leafoffset[ndleafranks])*link->unitbytes; 559c87b50c4SJunchao Zhang cnt = leafoffset[i+1]-leafoffset[i]; 560c87b50c4SJunchao Zhang ierr = MPIU_Send_init(link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi]+disp,cnt,unit,sf->ranks[i],link->tag,comm,link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi]+j);CHKERRMPI(ierr); 561cd620004SJunchao Zhang } 562cd620004SJunchao Zhang } else { /* PETSCSF_ROOT2LEAF */ 563cd620004SJunchao Zhang for (i=ndleafranks,j=0; i<nleafranks; i++,j++) { 564cd620004SJunchao Zhang disp = (leafoffset[i] - leafoffset[ndleafranks])*link->unitbytes; 565c87b50c4SJunchao Zhang cnt = leafoffset[i+1]-leafoffset[i]; 566c87b50c4SJunchao Zhang ierr = MPIU_Recv_init(link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi]+disp,cnt,unit,sf->ranks[i],link->tag,comm,link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi]+j);CHKERRMPI(ierr); 567cd620004SJunchao Zhang } 568cd620004SJunchao Zhang } 569cd620004SJunchao Zhang link->leafreqsinited[direction][leafmtype_mpi][leafdirect_mpi] = PETSC_TRUE; 570cd620004SJunchao Zhang } 571cd620004SJunchao Zhang } 572cd620004SJunchao Zhang if (rootbuf) *rootbuf = link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi]; 573cd620004SJunchao Zhang if (leafbuf) *leafbuf = link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi]; 574cd620004SJunchao Zhang if (rootreqs) *rootreqs = link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi]; 575cd620004SJunchao Zhang if (leafreqs) *leafreqs = link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi]; 576cd620004SJunchao Zhang PetscFunctionReturn(0); 577cd620004SJunchao Zhang } 578cd620004SJunchao Zhang 579cd620004SJunchao Zhang PetscErrorCode PetscSFLinkGetInUse(PetscSF sf,MPI_Datatype unit,const void *rootdata,const void *leafdata,PetscCopyMode cmode,PetscSFLink *mylink) 580cd620004SJunchao Zhang { 581cd620004SJunchao Zhang PetscErrorCode ierr; 582cd620004SJunchao Zhang PetscSFLink link,*p; 58340e23c03SJunchao Zhang PetscSF_Basic 
*bas=(PetscSF_Basic*)sf->data; 58440e23c03SJunchao Zhang 58540e23c03SJunchao Zhang PetscFunctionBegin; 58640e23c03SJunchao Zhang /* Look for types in cache */ 58740e23c03SJunchao Zhang for (p=&bas->inuse; (link=*p); p=&link->next) { 58840e23c03SJunchao Zhang PetscBool match; 58940e23c03SJunchao Zhang ierr = MPIPetsc_Type_compare(unit,link->unit,&match);CHKERRQ(ierr); 590637e6665SJunchao Zhang if (match && (rootdata == link->rootdata) && (leafdata == link->leafdata)) { 59140e23c03SJunchao Zhang switch (cmode) { 59240e23c03SJunchao Zhang case PETSC_OWN_POINTER: *p = link->next; break; /* Remove from inuse list */ 59340e23c03SJunchao Zhang case PETSC_USE_POINTER: break; 59440e23c03SJunchao Zhang default: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"invalid cmode"); 59540e23c03SJunchao Zhang } 59640e23c03SJunchao Zhang *mylink = link; 59740e23c03SJunchao Zhang PetscFunctionReturn(0); 59840e23c03SJunchao Zhang } 59940e23c03SJunchao Zhang } 60040e23c03SJunchao Zhang SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Could not find pack"); 60140e23c03SJunchao Zhang } 60240e23c03SJunchao Zhang 60371438e86SJunchao Zhang PetscErrorCode PetscSFLinkReclaim(PetscSF sf,PetscSFLink *mylink) 60440e23c03SJunchao Zhang { 60540e23c03SJunchao Zhang PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 60671438e86SJunchao Zhang PetscSFLink link = *mylink; 60740e23c03SJunchao Zhang 60840e23c03SJunchao Zhang PetscFunctionBegin; 60971438e86SJunchao Zhang link->rootdata = NULL; 61071438e86SJunchao Zhang link->leafdata = NULL; 61171438e86SJunchao Zhang link->next = bas->avail; 61271438e86SJunchao Zhang bas->avail = link; 61371438e86SJunchao Zhang *mylink = NULL; 614eb02082bSJunchao Zhang PetscFunctionReturn(0); 615eb02082bSJunchao Zhang } 616eb02082bSJunchao Zhang 6179d1c8addSJunchao Zhang /* Error out on unsupported overlapped communications */ 618cd620004SJunchao Zhang PetscErrorCode PetscSFSetErrorOnUnsupportedOverlap(PetscSF sf,MPI_Datatype unit,const void *rootdata,const void *leafdata) 
6199d1c8addSJunchao Zhang { 6209d1c8addSJunchao Zhang PetscErrorCode ierr; 621cd620004SJunchao Zhang PetscSFLink link,*p; 6229d1c8addSJunchao Zhang PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 6239d1c8addSJunchao Zhang PetscBool match; 6249d1c8addSJunchao Zhang 6259d1c8addSJunchao Zhang PetscFunctionBegin; 626b458e8f1SJose E. Roman if (PetscDefined(USE_DEBUG)) { 62718fb5014SJunchao Zhang /* Look up links in use and error out if there is a match. When both rootdata and leafdata are NULL, ignore 62818fb5014SJunchao Zhang the potential overlapping since this process does not participate in communication. Overlapping is harmless. 62918fb5014SJunchao Zhang */ 630637e6665SJunchao Zhang if (rootdata || leafdata) { 6319d1c8addSJunchao Zhang for (p=&bas->inuse; (link=*p); p=&link->next) { 6329d1c8addSJunchao Zhang ierr = MPIPetsc_Type_compare(unit,link->unit,&match);CHKERRQ(ierr); 633*2c71b3e2SJacob Faibussowitsch PetscCheckFalse(match && (rootdata == link->rootdata) && (leafdata == link->leafdata),PETSC_COMM_SELF,PETSC_ERR_SUP,"Overlapped PetscSF with the same rootdata(%p), leafdata(%p) and data type. Undo the overlapping to avoid the error.",rootdata,leafdata); 6349d1c8addSJunchao Zhang } 63518fb5014SJunchao Zhang } 636b458e8f1SJose E. 
Roman } 6379d1c8addSJunchao Zhang PetscFunctionReturn(0); 6389d1c8addSJunchao Zhang } 6399d1c8addSJunchao Zhang 64020c24465SJunchao Zhang static PetscErrorCode PetscSFLinkMemcpy_Host(PetscSFLink link,PetscMemType dstmtype,void* dst,PetscMemType srcmtype,const void*src,size_t n) 64120c24465SJunchao Zhang { 64220c24465SJunchao Zhang PetscFunctionBegin; 64320c24465SJunchao Zhang if (n) {PetscErrorCode ierr = PetscMemcpy(dst,src,n);CHKERRQ(ierr);} 64420c24465SJunchao Zhang PetscFunctionReturn(0); 64520c24465SJunchao Zhang } 64620c24465SJunchao Zhang 647cd620004SJunchao Zhang PetscErrorCode PetscSFLinkSetUp_Host(PetscSF sf,PetscSFLink link,MPI_Datatype unit) 64840e23c03SJunchao Zhang { 64940e23c03SJunchao Zhang PetscErrorCode ierr; 650b23bfdefSJunchao Zhang PetscInt nSignedChar=0,nUnsignedChar=0,nInt=0,nPetscInt=0,nPetscReal=0; 651b23bfdefSJunchao Zhang PetscBool is2Int,is2PetscInt; 65240e23c03SJunchao Zhang PetscMPIInt ni,na,nd,combiner; 65340e23c03SJunchao Zhang #if defined(PETSC_HAVE_COMPLEX) 654b23bfdefSJunchao Zhang PetscInt nPetscComplex=0; 65540e23c03SJunchao Zhang #endif 65640e23c03SJunchao Zhang 65740e23c03SJunchao Zhang PetscFunctionBegin; 658b23bfdefSJunchao Zhang ierr = MPIPetsc_Type_compare_contig(unit,MPI_SIGNED_CHAR, &nSignedChar);CHKERRQ(ierr); 659b23bfdefSJunchao Zhang ierr = MPIPetsc_Type_compare_contig(unit,MPI_UNSIGNED_CHAR,&nUnsignedChar);CHKERRQ(ierr); 660b23bfdefSJunchao Zhang /* MPI_CHAR is treated below as a dumb type that does not support reduction according to MPI standard */ 661b23bfdefSJunchao Zhang ierr = MPIPetsc_Type_compare_contig(unit,MPI_INT, &nInt);CHKERRQ(ierr); 662b23bfdefSJunchao Zhang ierr = MPIPetsc_Type_compare_contig(unit,MPIU_INT, &nPetscInt);CHKERRQ(ierr); 663b23bfdefSJunchao Zhang ierr = MPIPetsc_Type_compare_contig(unit,MPIU_REAL,&nPetscReal);CHKERRQ(ierr); 66440e23c03SJunchao Zhang #if defined(PETSC_HAVE_COMPLEX) 665b23bfdefSJunchao Zhang ierr = MPIPetsc_Type_compare_contig(unit,MPIU_COMPLEX,&nPetscComplex);CHKERRQ(ierr); 
66640e23c03SJunchao Zhang #endif 66740e23c03SJunchao Zhang ierr = MPIPetsc_Type_compare(unit,MPI_2INT,&is2Int);CHKERRQ(ierr); 66840e23c03SJunchao Zhang ierr = MPIPetsc_Type_compare(unit,MPIU_2INT,&is2PetscInt);CHKERRQ(ierr); 669b23bfdefSJunchao Zhang /* TODO: shaell we also handle Fortran MPI_2REAL? */ 670ffc4695bSBarry Smith ierr = MPI_Type_get_envelope(unit,&ni,&na,&nd,&combiner);CHKERRMPI(ierr); 6715ad15460SJunchao Zhang link->isbuiltin = (combiner == MPI_COMBINER_NAMED) ? PETSC_TRUE : PETSC_FALSE; /* unit is MPI builtin */ 672b23bfdefSJunchao Zhang link->bs = 1; /* default */ 67340e23c03SJunchao Zhang 674eb02082bSJunchao Zhang if (is2Int) { 675cd620004SJunchao Zhang PackInit_PairType_int_int_1_1(link); 676eb02082bSJunchao Zhang link->bs = 1; 677eb02082bSJunchao Zhang link->unitbytes = 2*sizeof(int); 6785ad15460SJunchao Zhang link->isbuiltin = PETSC_TRUE; /* unit is PETSc builtin */ 679eb02082bSJunchao Zhang link->basicunit = MPI_2INT; 6805ad15460SJunchao Zhang link->unit = MPI_2INT; 681eb02082bSJunchao Zhang } else if (is2PetscInt) { /* TODO: when is2PetscInt and nPetscInt=2, we don't know which path to take. The two paths support different ops. 
*/ 682cd620004SJunchao Zhang PackInit_PairType_PetscInt_PetscInt_1_1(link); 683eb02082bSJunchao Zhang link->bs = 1; 684eb02082bSJunchao Zhang link->unitbytes = 2*sizeof(PetscInt); 685eb02082bSJunchao Zhang link->basicunit = MPIU_2INT; 6865ad15460SJunchao Zhang link->isbuiltin = PETSC_TRUE; /* unit is PETSc builtin */ 6875ad15460SJunchao Zhang link->unit = MPIU_2INT; 688eb02082bSJunchao Zhang } else if (nPetscReal) { 689b23bfdefSJunchao Zhang if (nPetscReal == 8) PackInit_RealType_PetscReal_8_1(link); else if (nPetscReal%8 == 0) PackInit_RealType_PetscReal_8_0(link); 690b23bfdefSJunchao Zhang else if (nPetscReal == 4) PackInit_RealType_PetscReal_4_1(link); else if (nPetscReal%4 == 0) PackInit_RealType_PetscReal_4_0(link); 691b23bfdefSJunchao Zhang else if (nPetscReal == 2) PackInit_RealType_PetscReal_2_1(link); else if (nPetscReal%2 == 0) PackInit_RealType_PetscReal_2_0(link); 692b23bfdefSJunchao Zhang else if (nPetscReal == 1) PackInit_RealType_PetscReal_1_1(link); else if (nPetscReal%1 == 0) PackInit_RealType_PetscReal_1_0(link); 693b23bfdefSJunchao Zhang link->bs = nPetscReal; 694eb02082bSJunchao Zhang link->unitbytes = nPetscReal*sizeof(PetscReal); 69540e23c03SJunchao Zhang link->basicunit = MPIU_REAL; 6965ad15460SJunchao Zhang if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPIU_REAL;} 697b23bfdefSJunchao Zhang } else if (nPetscInt) { 698b23bfdefSJunchao Zhang if (nPetscInt == 8) PackInit_IntegerType_PetscInt_8_1(link); else if (nPetscInt%8 == 0) PackInit_IntegerType_PetscInt_8_0(link); 699b23bfdefSJunchao Zhang else if (nPetscInt == 4) PackInit_IntegerType_PetscInt_4_1(link); else if (nPetscInt%4 == 0) PackInit_IntegerType_PetscInt_4_0(link); 700b23bfdefSJunchao Zhang else if (nPetscInt == 2) PackInit_IntegerType_PetscInt_2_1(link); else if (nPetscInt%2 == 0) PackInit_IntegerType_PetscInt_2_0(link); 701b23bfdefSJunchao Zhang else if (nPetscInt == 1) PackInit_IntegerType_PetscInt_1_1(link); else if (nPetscInt%1 == 0) 
PackInit_IntegerType_PetscInt_1_0(link); 702b23bfdefSJunchao Zhang link->bs = nPetscInt; 703eb02082bSJunchao Zhang link->unitbytes = nPetscInt*sizeof(PetscInt); 704b23bfdefSJunchao Zhang link->basicunit = MPIU_INT; 7055ad15460SJunchao Zhang if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPIU_INT;} 706b23bfdefSJunchao Zhang #if defined(PETSC_USE_64BIT_INDICES) 707b23bfdefSJunchao Zhang } else if (nInt) { 708b23bfdefSJunchao Zhang if (nInt == 8) PackInit_IntegerType_int_8_1(link); else if (nInt%8 == 0) PackInit_IntegerType_int_8_0(link); 709b23bfdefSJunchao Zhang else if (nInt == 4) PackInit_IntegerType_int_4_1(link); else if (nInt%4 == 0) PackInit_IntegerType_int_4_0(link); 710b23bfdefSJunchao Zhang else if (nInt == 2) PackInit_IntegerType_int_2_1(link); else if (nInt%2 == 0) PackInit_IntegerType_int_2_0(link); 711b23bfdefSJunchao Zhang else if (nInt == 1) PackInit_IntegerType_int_1_1(link); else if (nInt%1 == 0) PackInit_IntegerType_int_1_0(link); 712b23bfdefSJunchao Zhang link->bs = nInt; 713eb02082bSJunchao Zhang link->unitbytes = nInt*sizeof(int); 714b23bfdefSJunchao Zhang link->basicunit = MPI_INT; 7155ad15460SJunchao Zhang if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPI_INT;} 716b23bfdefSJunchao Zhang #endif 717b23bfdefSJunchao Zhang } else if (nSignedChar) { 718b23bfdefSJunchao Zhang if (nSignedChar == 8) PackInit_IntegerType_SignedChar_8_1(link); else if (nSignedChar%8 == 0) PackInit_IntegerType_SignedChar_8_0(link); 719b23bfdefSJunchao Zhang else if (nSignedChar == 4) PackInit_IntegerType_SignedChar_4_1(link); else if (nSignedChar%4 == 0) PackInit_IntegerType_SignedChar_4_0(link); 720b23bfdefSJunchao Zhang else if (nSignedChar == 2) PackInit_IntegerType_SignedChar_2_1(link); else if (nSignedChar%2 == 0) PackInit_IntegerType_SignedChar_2_0(link); 721b23bfdefSJunchao Zhang else if (nSignedChar == 1) PackInit_IntegerType_SignedChar_1_1(link); else if (nSignedChar%1 == 0) PackInit_IntegerType_SignedChar_1_0(link); 
722b23bfdefSJunchao Zhang link->bs = nSignedChar; 723eb02082bSJunchao Zhang link->unitbytes = nSignedChar*sizeof(SignedChar); 724b23bfdefSJunchao Zhang link->basicunit = MPI_SIGNED_CHAR; 7255ad15460SJunchao Zhang if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPI_SIGNED_CHAR;} 726b23bfdefSJunchao Zhang } else if (nUnsignedChar) { 727b23bfdefSJunchao Zhang if (nUnsignedChar == 8) PackInit_IntegerType_UnsignedChar_8_1(link); else if (nUnsignedChar%8 == 0) PackInit_IntegerType_UnsignedChar_8_0(link); 728b23bfdefSJunchao Zhang else if (nUnsignedChar == 4) PackInit_IntegerType_UnsignedChar_4_1(link); else if (nUnsignedChar%4 == 0) PackInit_IntegerType_UnsignedChar_4_0(link); 729b23bfdefSJunchao Zhang else if (nUnsignedChar == 2) PackInit_IntegerType_UnsignedChar_2_1(link); else if (nUnsignedChar%2 == 0) PackInit_IntegerType_UnsignedChar_2_0(link); 730b23bfdefSJunchao Zhang else if (nUnsignedChar == 1) PackInit_IntegerType_UnsignedChar_1_1(link); else if (nUnsignedChar%1 == 0) PackInit_IntegerType_UnsignedChar_1_0(link); 731b23bfdefSJunchao Zhang link->bs = nUnsignedChar; 732eb02082bSJunchao Zhang link->unitbytes = nUnsignedChar*sizeof(UnsignedChar); 733b23bfdefSJunchao Zhang link->basicunit = MPI_UNSIGNED_CHAR; 7345ad15460SJunchao Zhang if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPI_UNSIGNED_CHAR;} 73540e23c03SJunchao Zhang #if defined(PETSC_HAVE_COMPLEX) 736b23bfdefSJunchao Zhang } else if (nPetscComplex) { 737b23bfdefSJunchao Zhang if (nPetscComplex == 8) PackInit_ComplexType_PetscComplex_8_1(link); else if (nPetscComplex%8 == 0) PackInit_ComplexType_PetscComplex_8_0(link); 738b23bfdefSJunchao Zhang else if (nPetscComplex == 4) PackInit_ComplexType_PetscComplex_4_1(link); else if (nPetscComplex%4 == 0) PackInit_ComplexType_PetscComplex_4_0(link); 739b23bfdefSJunchao Zhang else if (nPetscComplex == 2) PackInit_ComplexType_PetscComplex_2_1(link); else if (nPetscComplex%2 == 0) PackInit_ComplexType_PetscComplex_2_0(link); 
740b23bfdefSJunchao Zhang else if (nPetscComplex == 1) PackInit_ComplexType_PetscComplex_1_1(link); else if (nPetscComplex%1 == 0) PackInit_ComplexType_PetscComplex_1_0(link); 741b23bfdefSJunchao Zhang link->bs = nPetscComplex; 742eb02082bSJunchao Zhang link->unitbytes = nPetscComplex*sizeof(PetscComplex); 74340e23c03SJunchao Zhang link->basicunit = MPIU_COMPLEX; 7445ad15460SJunchao Zhang if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPIU_COMPLEX;} 74540e23c03SJunchao Zhang #endif 74640e23c03SJunchao Zhang } else { 747b23bfdefSJunchao Zhang MPI_Aint lb,nbyte; 748ffc4695bSBarry Smith ierr = MPI_Type_get_extent(unit,&lb,&nbyte);CHKERRMPI(ierr); 749*2c71b3e2SJacob Faibussowitsch PetscCheckFalse(lb != 0,PETSC_COMM_SELF,PETSC_ERR_SUP,"Datatype with nonzero lower bound %ld",(long)lb); 750eb02082bSJunchao Zhang if (nbyte % sizeof(int)) { /* If the type size is not multiple of int */ 751eb02082bSJunchao Zhang if (nbyte == 4) PackInit_DumbType_char_4_1(link); else if (nbyte%4 == 0) PackInit_DumbType_char_4_0(link); 752eb02082bSJunchao Zhang else if (nbyte == 2) PackInit_DumbType_char_2_1(link); else if (nbyte%2 == 0) PackInit_DumbType_char_2_0(link); 753eb02082bSJunchao Zhang else if (nbyte == 1) PackInit_DumbType_char_1_1(link); else if (nbyte%1 == 0) PackInit_DumbType_char_1_0(link); 754eb02082bSJunchao Zhang link->bs = nbyte; 755b23bfdefSJunchao Zhang link->unitbytes = nbyte; 756b23bfdefSJunchao Zhang link->basicunit = MPI_BYTE; 75740e23c03SJunchao Zhang } else { 758eb02082bSJunchao Zhang nInt = nbyte / sizeof(int); 759eb02082bSJunchao Zhang if (nInt == 8) PackInit_DumbType_DumbInt_8_1(link); else if (nInt%8 == 0) PackInit_DumbType_DumbInt_8_0(link); 760eb02082bSJunchao Zhang else if (nInt == 4) PackInit_DumbType_DumbInt_4_1(link); else if (nInt%4 == 0) PackInit_DumbType_DumbInt_4_0(link); 761eb02082bSJunchao Zhang else if (nInt == 2) PackInit_DumbType_DumbInt_2_1(link); else if (nInt%2 == 0) PackInit_DumbType_DumbInt_2_0(link); 762eb02082bSJunchao Zhang 
else if (nInt == 1) PackInit_DumbType_DumbInt_1_1(link); else if (nInt%1 == 0) PackInit_DumbType_DumbInt_1_0(link); 763eb02082bSJunchao Zhang link->bs = nInt; 764b23bfdefSJunchao Zhang link->unitbytes = nbyte; 76540e23c03SJunchao Zhang link->basicunit = MPI_INT; 76640e23c03SJunchao Zhang } 7675ad15460SJunchao Zhang if (link->isbuiltin) link->unit = unit; 76840e23c03SJunchao Zhang } 769b23bfdefSJunchao Zhang 770ffc4695bSBarry Smith if (!link->isbuiltin) {ierr = MPI_Type_dup(unit,&link->unit);CHKERRMPI(ierr);} 77120c24465SJunchao Zhang 77220c24465SJunchao Zhang link->Memcpy = PetscSFLinkMemcpy_Host; 77340e23c03SJunchao Zhang PetscFunctionReturn(0); 77440e23c03SJunchao Zhang } 77540e23c03SJunchao Zhang 776fcc7397dSJunchao Zhang PetscErrorCode PetscSFLinkGetUnpackAndOp(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*)) 77740e23c03SJunchao Zhang { 77840e23c03SJunchao Zhang PetscFunctionBegin; 77940e23c03SJunchao Zhang *UnpackAndOp = NULL; 78071438e86SJunchao Zhang if (PetscMemTypeHost(mtype)) { 78183df288dSJunchao Zhang if (op == MPI_REPLACE) *UnpackAndOp = link->h_UnpackAndInsert; 782eb02082bSJunchao Zhang else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->h_UnpackAndAdd; 783eb02082bSJunchao Zhang else if (op == MPI_PROD) *UnpackAndOp = link->h_UnpackAndMult; 784eb02082bSJunchao Zhang else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->h_UnpackAndMax; 785eb02082bSJunchao Zhang else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->h_UnpackAndMin; 786eb02082bSJunchao Zhang else if (op == MPI_LAND) *UnpackAndOp = link->h_UnpackAndLAND; 787eb02082bSJunchao Zhang else if (op == MPI_BAND) *UnpackAndOp = link->h_UnpackAndBAND; 788eb02082bSJunchao Zhang else if (op == MPI_LOR) *UnpackAndOp = link->h_UnpackAndLOR; 789eb02082bSJunchao Zhang else if (op == MPI_BOR) *UnpackAndOp = link->h_UnpackAndBOR; 790eb02082bSJunchao Zhang 
else if (op == MPI_LXOR) *UnpackAndOp = link->h_UnpackAndLXOR; 791eb02082bSJunchao Zhang else if (op == MPI_BXOR) *UnpackAndOp = link->h_UnpackAndBXOR; 792eb02082bSJunchao Zhang else if (op == MPI_MAXLOC) *UnpackAndOp = link->h_UnpackAndMaxloc; 793eb02082bSJunchao Zhang else if (op == MPI_MINLOC) *UnpackAndOp = link->h_UnpackAndMinloc; 794eb02082bSJunchao Zhang } 7957fd2d3dbSJunchao Zhang #if defined(PETSC_HAVE_DEVICE) 79671438e86SJunchao Zhang else if (PetscMemTypeDevice(mtype) && !atomic) { 79783df288dSJunchao Zhang if (op == MPI_REPLACE) *UnpackAndOp = link->d_UnpackAndInsert; 798eb02082bSJunchao Zhang else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->d_UnpackAndAdd; 799eb02082bSJunchao Zhang else if (op == MPI_PROD) *UnpackAndOp = link->d_UnpackAndMult; 800eb02082bSJunchao Zhang else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->d_UnpackAndMax; 801eb02082bSJunchao Zhang else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->d_UnpackAndMin; 802eb02082bSJunchao Zhang else if (op == MPI_LAND) *UnpackAndOp = link->d_UnpackAndLAND; 803eb02082bSJunchao Zhang else if (op == MPI_BAND) *UnpackAndOp = link->d_UnpackAndBAND; 804eb02082bSJunchao Zhang else if (op == MPI_LOR) *UnpackAndOp = link->d_UnpackAndLOR; 805eb02082bSJunchao Zhang else if (op == MPI_BOR) *UnpackAndOp = link->d_UnpackAndBOR; 806eb02082bSJunchao Zhang else if (op == MPI_LXOR) *UnpackAndOp = link->d_UnpackAndLXOR; 807eb02082bSJunchao Zhang else if (op == MPI_BXOR) *UnpackAndOp = link->d_UnpackAndBXOR; 808eb02082bSJunchao Zhang else if (op == MPI_MAXLOC) *UnpackAndOp = link->d_UnpackAndMaxloc; 809eb02082bSJunchao Zhang else if (op == MPI_MINLOC) *UnpackAndOp = link->d_UnpackAndMinloc; 81071438e86SJunchao Zhang } else if (PetscMemTypeDevice(mtype) && atomic) { 81183df288dSJunchao Zhang if (op == MPI_REPLACE) *UnpackAndOp = link->da_UnpackAndInsert; 812eb02082bSJunchao Zhang else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->da_UnpackAndAdd; 813eb02082bSJunchao 
Zhang else if (op == MPI_PROD) *UnpackAndOp = link->da_UnpackAndMult; 814eb02082bSJunchao Zhang else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->da_UnpackAndMax; 815eb02082bSJunchao Zhang else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->da_UnpackAndMin; 816eb02082bSJunchao Zhang else if (op == MPI_LAND) *UnpackAndOp = link->da_UnpackAndLAND; 817eb02082bSJunchao Zhang else if (op == MPI_BAND) *UnpackAndOp = link->da_UnpackAndBAND; 818eb02082bSJunchao Zhang else if (op == MPI_LOR) *UnpackAndOp = link->da_UnpackAndLOR; 819eb02082bSJunchao Zhang else if (op == MPI_BOR) *UnpackAndOp = link->da_UnpackAndBOR; 820eb02082bSJunchao Zhang else if (op == MPI_LXOR) *UnpackAndOp = link->da_UnpackAndLXOR; 821eb02082bSJunchao Zhang else if (op == MPI_BXOR) *UnpackAndOp = link->da_UnpackAndBXOR; 822eb02082bSJunchao Zhang else if (op == MPI_MAXLOC) *UnpackAndOp = link->da_UnpackAndMaxloc; 823eb02082bSJunchao Zhang else if (op == MPI_MINLOC) *UnpackAndOp = link->da_UnpackAndMinloc; 824eb02082bSJunchao Zhang } 825eb02082bSJunchao Zhang #endif 82640e23c03SJunchao Zhang PetscFunctionReturn(0); 82740e23c03SJunchao Zhang } 82840e23c03SJunchao Zhang 829fcc7397dSJunchao Zhang PetscErrorCode PetscSFLinkGetScatterAndOp(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**ScatterAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*)) 83040e23c03SJunchao Zhang { 83140e23c03SJunchao Zhang PetscFunctionBegin; 832cd620004SJunchao Zhang *ScatterAndOp = NULL; 83371438e86SJunchao Zhang if (PetscMemTypeHost(mtype)) { 83483df288dSJunchao Zhang if (op == MPI_REPLACE) *ScatterAndOp = link->h_ScatterAndInsert; 835cd620004SJunchao Zhang else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->h_ScatterAndAdd; 836cd620004SJunchao Zhang else if (op == MPI_PROD) *ScatterAndOp = link->h_ScatterAndMult; 837cd620004SJunchao Zhang else if (op == MPI_MAX || op == MPIU_MAX) 
*ScatterAndOp = link->h_ScatterAndMax; 838cd620004SJunchao Zhang else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->h_ScatterAndMin; 839cd620004SJunchao Zhang else if (op == MPI_LAND) *ScatterAndOp = link->h_ScatterAndLAND; 840cd620004SJunchao Zhang else if (op == MPI_BAND) *ScatterAndOp = link->h_ScatterAndBAND; 841cd620004SJunchao Zhang else if (op == MPI_LOR) *ScatterAndOp = link->h_ScatterAndLOR; 842cd620004SJunchao Zhang else if (op == MPI_BOR) *ScatterAndOp = link->h_ScatterAndBOR; 843cd620004SJunchao Zhang else if (op == MPI_LXOR) *ScatterAndOp = link->h_ScatterAndLXOR; 844cd620004SJunchao Zhang else if (op == MPI_BXOR) *ScatterAndOp = link->h_ScatterAndBXOR; 845cd620004SJunchao Zhang else if (op == MPI_MAXLOC) *ScatterAndOp = link->h_ScatterAndMaxloc; 846cd620004SJunchao Zhang else if (op == MPI_MINLOC) *ScatterAndOp = link->h_ScatterAndMinloc; 847eb02082bSJunchao Zhang } 8487fd2d3dbSJunchao Zhang #if defined(PETSC_HAVE_DEVICE) 84971438e86SJunchao Zhang else if (PetscMemTypeDevice(mtype) && !atomic) { 85083df288dSJunchao Zhang if (op == MPI_REPLACE) *ScatterAndOp = link->d_ScatterAndInsert; 851cd620004SJunchao Zhang else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->d_ScatterAndAdd; 852cd620004SJunchao Zhang else if (op == MPI_PROD) *ScatterAndOp = link->d_ScatterAndMult; 853cd620004SJunchao Zhang else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->d_ScatterAndMax; 854cd620004SJunchao Zhang else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->d_ScatterAndMin; 855cd620004SJunchao Zhang else if (op == MPI_LAND) *ScatterAndOp = link->d_ScatterAndLAND; 856cd620004SJunchao Zhang else if (op == MPI_BAND) *ScatterAndOp = link->d_ScatterAndBAND; 857cd620004SJunchao Zhang else if (op == MPI_LOR) *ScatterAndOp = link->d_ScatterAndLOR; 858cd620004SJunchao Zhang else if (op == MPI_BOR) *ScatterAndOp = link->d_ScatterAndBOR; 859cd620004SJunchao Zhang else if (op == MPI_LXOR) *ScatterAndOp = link->d_ScatterAndLXOR; 
860cd620004SJunchao Zhang else if (op == MPI_BXOR) *ScatterAndOp = link->d_ScatterAndBXOR; 861cd620004SJunchao Zhang else if (op == MPI_MAXLOC) *ScatterAndOp = link->d_ScatterAndMaxloc; 862cd620004SJunchao Zhang else if (op == MPI_MINLOC) *ScatterAndOp = link->d_ScatterAndMinloc; 86371438e86SJunchao Zhang } else if (PetscMemTypeDevice(mtype) && atomic) { 86483df288dSJunchao Zhang if (op == MPI_REPLACE) *ScatterAndOp = link->da_ScatterAndInsert; 865cd620004SJunchao Zhang else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->da_ScatterAndAdd; 866cd620004SJunchao Zhang else if (op == MPI_PROD) *ScatterAndOp = link->da_ScatterAndMult; 867cd620004SJunchao Zhang else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->da_ScatterAndMax; 868cd620004SJunchao Zhang else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->da_ScatterAndMin; 869cd620004SJunchao Zhang else if (op == MPI_LAND) *ScatterAndOp = link->da_ScatterAndLAND; 870cd620004SJunchao Zhang else if (op == MPI_BAND) *ScatterAndOp = link->da_ScatterAndBAND; 871cd620004SJunchao Zhang else if (op == MPI_LOR) *ScatterAndOp = link->da_ScatterAndLOR; 872cd620004SJunchao Zhang else if (op == MPI_BOR) *ScatterAndOp = link->da_ScatterAndBOR; 873cd620004SJunchao Zhang else if (op == MPI_LXOR) *ScatterAndOp = link->da_ScatterAndLXOR; 874cd620004SJunchao Zhang else if (op == MPI_BXOR) *ScatterAndOp = link->da_ScatterAndBXOR; 875cd620004SJunchao Zhang else if (op == MPI_MAXLOC) *ScatterAndOp = link->da_ScatterAndMaxloc; 876cd620004SJunchao Zhang else if (op == MPI_MINLOC) *ScatterAndOp = link->da_ScatterAndMinloc; 877eb02082bSJunchao Zhang } 878eb02082bSJunchao Zhang #endif 879cd620004SJunchao Zhang PetscFunctionReturn(0); 880cd620004SJunchao Zhang } 881cd620004SJunchao Zhang 882fcc7397dSJunchao Zhang PetscErrorCode PetscSFLinkGetFetchAndOp(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**FetchAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const 
PetscInt*,void*,void*)) 883cd620004SJunchao Zhang { 884cd620004SJunchao Zhang PetscFunctionBegin; 885cd620004SJunchao Zhang *FetchAndOp = NULL; 886*2c71b3e2SJacob Faibussowitsch PetscCheckFalse(op != MPI_SUM && op != MPIU_SUM,PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for MPI_Op in FetchAndOp"); 88771438e86SJunchao Zhang if (PetscMemTypeHost(mtype)) *FetchAndOp = link->h_FetchAndAdd; 8887fd2d3dbSJunchao Zhang #if defined(PETSC_HAVE_DEVICE) 88971438e86SJunchao Zhang else if (PetscMemTypeDevice(mtype) && !atomic) *FetchAndOp = link->d_FetchAndAdd; 89071438e86SJunchao Zhang else if (PetscMemTypeDevice(mtype) && atomic) *FetchAndOp = link->da_FetchAndAdd; 891cd620004SJunchao Zhang #endif 892cd620004SJunchao Zhang PetscFunctionReturn(0); 893cd620004SJunchao Zhang } 894cd620004SJunchao Zhang 895fcc7397dSJunchao Zhang PetscErrorCode PetscSFLinkGetFetchAndOpLocal(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**FetchAndOpLocal)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*)) 896cd620004SJunchao Zhang { 897cd620004SJunchao Zhang PetscFunctionBegin; 898cd620004SJunchao Zhang *FetchAndOpLocal = NULL; 899*2c71b3e2SJacob Faibussowitsch PetscCheckFalse(op != MPI_SUM && op != MPIU_SUM,PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for MPI_Op in FetchAndOp"); 90071438e86SJunchao Zhang if (PetscMemTypeHost(mtype)) *FetchAndOpLocal = link->h_FetchAndAddLocal; 9017fd2d3dbSJunchao Zhang #if defined(PETSC_HAVE_DEVICE) 90271438e86SJunchao Zhang else if (PetscMemTypeDevice(mtype) && !atomic) *FetchAndOpLocal = link->d_FetchAndAddLocal; 90371438e86SJunchao Zhang else if (PetscMemTypeDevice(mtype) && atomic) *FetchAndOpLocal = link->da_FetchAndAddLocal; 904cd620004SJunchao Zhang #endif 905cd620004SJunchao Zhang PetscFunctionReturn(0); 906cd620004SJunchao Zhang } 907cd620004SJunchao Zhang 9089fbee547SJacob Faibussowitsch static inline PetscErrorCode 
PetscSFLinkLogFlopsAfterUnpackRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,MPI_Op op) 909cd620004SJunchao Zhang { 910cd620004SJunchao Zhang PetscErrorCode ierr; 911cd620004SJunchao Zhang PetscLogDouble flops; 912cd620004SJunchao Zhang PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 913cd620004SJunchao Zhang 914cd620004SJunchao Zhang PetscFunctionBegin; 91583df288dSJunchao Zhang if (op != MPI_REPLACE && link->basicunit == MPIU_SCALAR) { /* op is a reduction on PetscScalars */ 916cd620004SJunchao Zhang flops = bas->rootbuflen[scope]*link->bs; /* # of roots in buffer x # of scalars in unit */ 9177fd2d3dbSJunchao Zhang #if defined(PETSC_HAVE_DEVICE) 91871438e86SJunchao Zhang if (PetscMemTypeDevice(link->rootmtype)) {ierr = PetscLogGpuFlops(flops);CHKERRQ(ierr);} else 919cd620004SJunchao Zhang #endif 920cd620004SJunchao Zhang {ierr = PetscLogFlops(flops);CHKERRQ(ierr);} 921cd620004SJunchao Zhang } 922cd620004SJunchao Zhang PetscFunctionReturn(0); 923cd620004SJunchao Zhang } 924cd620004SJunchao Zhang 9259fbee547SJacob Faibussowitsch static inline PetscErrorCode PetscSFLinkLogFlopsAfterUnpackLeafData(PetscSF sf,PetscSFLink link,PetscSFScope scope,MPI_Op op) 926cd620004SJunchao Zhang { 927cd620004SJunchao Zhang PetscLogDouble flops; 928cd620004SJunchao Zhang PetscErrorCode ierr; 929cd620004SJunchao Zhang 930cd620004SJunchao Zhang PetscFunctionBegin; 93183df288dSJunchao Zhang if (op != MPI_REPLACE && link->basicunit == MPIU_SCALAR) { /* op is a reduction on PetscScalars */ 932cd620004SJunchao Zhang flops = sf->leafbuflen[scope]*link->bs; /* # of roots in buffer x # of scalars in unit */ 9337fd2d3dbSJunchao Zhang #if defined(PETSC_HAVE_DEVICE) 93471438e86SJunchao Zhang if (PetscMemTypeDevice(link->leafmtype)) {ierr = PetscLogGpuFlops(flops);CHKERRQ(ierr);} else 935cd620004SJunchao Zhang #endif 936cd620004SJunchao Zhang {ierr = PetscLogFlops(flops);CHKERRQ(ierr);} 937cd620004SJunchao Zhang } 938cd620004SJunchao Zhang PetscFunctionReturn(0); 939cd620004SJunchao Zhang 
} 940cd620004SJunchao Zhang 941cd620004SJunchao Zhang /* When SF could not find a proper UnpackAndOp() from link, it falls back to MPI_Reduce_local. 9424165533cSJose E. Roman Input Parameters: 943cd620004SJunchao Zhang +sf - The StarForest 944cd620004SJunchao Zhang .link - The link 945cd620004SJunchao Zhang .count - Number of entries to unpack 946cd620004SJunchao Zhang .start - The first index, significent when indices=NULL 947cd620004SJunchao Zhang .indices - Indices of entries in <data>. If NULL, it means indices are contiguous and the first is given in <start> 948cd620004SJunchao Zhang .buf - A contiguous buffer to unpack from 949cd620004SJunchao Zhang -op - Operation after unpack 950cd620004SJunchao Zhang 9514165533cSJose E. Roman Output Parameters: 952cd620004SJunchao Zhang .data - The data to unpack to 953cd620004SJunchao Zhang */ 9549fbee547SJacob Faibussowitsch static inline PetscErrorCode PetscSFLinkUnpackDataWithMPIReduceLocal(PetscSF sf,PetscSFLink link,PetscInt count,PetscInt start,const PetscInt *indices,void *data,const void *buf,MPI_Op op) 955cd620004SJunchao Zhang { 956cd620004SJunchao Zhang PetscFunctionBegin; 957cd620004SJunchao Zhang #if defined(PETSC_HAVE_MPI_REDUCE_LOCAL) 958cd620004SJunchao Zhang { 959cd620004SJunchao Zhang PetscErrorCode ierr; 960cd620004SJunchao Zhang PetscInt i; 961cd620004SJunchao Zhang if (indices) { 962cd620004SJunchao Zhang /* Note we use link->unit instead of link->basicunit. When op can be mapped to MPI_SUM etc, it operates on 963cd620004SJunchao Zhang basic units of a root/leaf element-wisely. Otherwise, it is meant to operate on a whole root/leaf. 
964cd620004SJunchao Zhang */ 965ffc4695bSBarry Smith for (i=0; i<count; i++) {ierr = MPI_Reduce_local((const char*)buf+i*link->unitbytes,(char*)data+indices[i]*link->unitbytes,1,link->unit,op);CHKERRMPI(ierr);} 966cd620004SJunchao Zhang } else { 967c87b50c4SJunchao Zhang ierr = MPIU_Reduce_local(buf,(char*)data+start*link->unitbytes,count,link->unit,op);CHKERRMPI(ierr); 968cd620004SJunchao Zhang } 969cd620004SJunchao Zhang } 970b458e8f1SJose E. Roman PetscFunctionReturn(0); 971cd620004SJunchao Zhang #else 972cd620004SJunchao Zhang SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No unpacking reduction operation for this MPI_Op"); 973cd620004SJunchao Zhang #endif 974cd620004SJunchao Zhang } 975cd620004SJunchao Zhang 9769fbee547SJacob Faibussowitsch static inline PetscErrorCode PetscSFLinkScatterDataWithMPIReduceLocal(PetscSF sf,PetscSFLink link,PetscInt count,PetscInt srcStart,const PetscInt *srcIdx,const void *src,PetscInt dstStart,const PetscInt *dstIdx,void *dst,MPI_Op op) 977cd620004SJunchao Zhang { 978cd620004SJunchao Zhang PetscFunctionBegin; 979cd620004SJunchao Zhang #if defined(PETSC_HAVE_MPI_REDUCE_LOCAL) 980cd620004SJunchao Zhang { 981cd620004SJunchao Zhang PetscErrorCode ierr; 982cd620004SJunchao Zhang PetscInt i,disp; 983fcc7397dSJunchao Zhang if (!srcIdx) { 984fcc7397dSJunchao Zhang ierr = PetscSFLinkUnpackDataWithMPIReduceLocal(sf,link,count,dstStart,dstIdx,dst,(const char*)src+srcStart*link->unitbytes,op);CHKERRQ(ierr); 985fcc7397dSJunchao Zhang } else { 986cd620004SJunchao Zhang for (i=0; i<count; i++) { 987fcc7397dSJunchao Zhang disp = dstIdx? dstIdx[i] : dstStart + i; 988c87b50c4SJunchao Zhang ierr = MPIU_Reduce_local((const char*)src+srcIdx[i]*link->unitbytes,(char*)dst+disp*link->unitbytes,1,link->unit,op);CHKERRMPI(ierr); 989fcc7397dSJunchao Zhang } 990cd620004SJunchao Zhang } 991cd620004SJunchao Zhang } 992b458e8f1SJose E. 
Roman PetscFunctionReturn(0); 993cd620004SJunchao Zhang #else 994cd620004SJunchao Zhang SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No unpacking reduction operation for this MPI_Op"); 995cd620004SJunchao Zhang #endif 996cd620004SJunchao Zhang } 997cd620004SJunchao Zhang 998cd620004SJunchao Zhang /*============================================================================= 999cd620004SJunchao Zhang Pack/Unpack/Fetch/Scatter routines 1000cd620004SJunchao Zhang ============================================================================*/ 1001cd620004SJunchao Zhang 1002cd620004SJunchao Zhang /* Pack rootdata to rootbuf 10034165533cSJose E. Roman Input Parameters: 1004cd620004SJunchao Zhang + sf - The SF this packing works on. 1005cd620004SJunchao Zhang . link - It gives the memtype of the roots and also provides root buffer. 1006cd620004SJunchao Zhang . scope - PETSCSF_LOCAL or PETSCSF_REMOTE. Note SF has the ability to do local and remote communications separately. 1007cd620004SJunchao Zhang - rootdata - Where to read the roots. 1008cd620004SJunchao Zhang 1009cd620004SJunchao Zhang Notes: 1010cd620004SJunchao Zhang When rootdata can be directly used as root buffer, the routine is almost a no-op. 
After the call, root data is 101171438e86SJunchao Zhang in a place where the underlying MPI is ready to access (use_gpu_aware_mpi or not) 1012cd620004SJunchao Zhang */ 101371438e86SJunchao Zhang PetscErrorCode PetscSFLinkPackRootData_Private(PetscSF sf,PetscSFLink link,PetscSFScope scope,const void *rootdata) 1014cd620004SJunchao Zhang { 1015cd620004SJunchao Zhang PetscErrorCode ierr; 1016cd620004SJunchao Zhang const PetscInt *rootindices = NULL; 1017cd620004SJunchao Zhang PetscInt count,start; 1018fcc7397dSJunchao Zhang PetscErrorCode (*Pack)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*) = NULL; 1019cd620004SJunchao Zhang PetscMemType rootmtype = link->rootmtype; 1020fcc7397dSJunchao Zhang PetscSFPackOpt opt = NULL; 1021fcc7397dSJunchao Zhang 1022cd620004SJunchao Zhang PetscFunctionBegin; 1023cd620004SJunchao Zhang ierr = PetscLogEventBegin(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr); 102471438e86SJunchao Zhang if (!link->rootdirect[scope]) { /* If rootdata works directly as rootbuf, skip packing */ 1025fcc7397dSJunchao Zhang ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,scope,&count,&start,&opt,&rootindices);CHKERRQ(ierr); 1026cd620004SJunchao Zhang ierr = PetscSFLinkGetPack(link,rootmtype,&Pack);CHKERRQ(ierr); 1027fcc7397dSJunchao Zhang ierr = (*Pack)(link,count,start,opt,rootindices,rootdata,link->rootbuf[scope][rootmtype]);CHKERRQ(ierr); 1028cd620004SJunchao Zhang } 1029cd620004SJunchao Zhang ierr = PetscLogEventEnd(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr); 1030cd620004SJunchao Zhang PetscFunctionReturn(0); 1031cd620004SJunchao Zhang } 1032cd620004SJunchao Zhang 1033cd620004SJunchao Zhang /* Pack leafdata to leafbuf */ 103471438e86SJunchao Zhang PetscErrorCode PetscSFLinkPackLeafData_Private(PetscSF sf,PetscSFLink link,PetscSFScope scope,const void *leafdata) 1035cd620004SJunchao Zhang { 1036cd620004SJunchao Zhang PetscErrorCode ierr; 1037cd620004SJunchao Zhang const PetscInt *leafindices = NULL; 1038cd620004SJunchao Zhang 
PetscInt count,start; 1039fcc7397dSJunchao Zhang PetscErrorCode (*Pack)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*) = NULL; 1040cd620004SJunchao Zhang PetscMemType leafmtype = link->leafmtype; 1041fcc7397dSJunchao Zhang PetscSFPackOpt opt = NULL; 1042cd620004SJunchao Zhang 1043cd620004SJunchao Zhang PetscFunctionBegin; 1044cd620004SJunchao Zhang ierr = PetscLogEventBegin(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr); 104571438e86SJunchao Zhang if (!link->leafdirect[scope]) { /* If leafdata works directly as rootbuf, skip packing */ 1046fcc7397dSJunchao Zhang ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,scope,&count,&start,&opt,&leafindices);CHKERRQ(ierr); 1047cd620004SJunchao Zhang ierr = PetscSFLinkGetPack(link,leafmtype,&Pack);CHKERRQ(ierr); 1048fcc7397dSJunchao Zhang ierr = (*Pack)(link,count,start,opt,leafindices,leafdata,link->leafbuf[scope][leafmtype]);CHKERRQ(ierr); 1049cd620004SJunchao Zhang } 1050cd620004SJunchao Zhang ierr = PetscLogEventEnd(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr); 1051cd620004SJunchao Zhang PetscFunctionReturn(0); 1052cd620004SJunchao Zhang } 1053cd620004SJunchao Zhang 105471438e86SJunchao Zhang /* Pack rootdata to rootbuf, which are in the same memory space */ 105571438e86SJunchao Zhang PetscErrorCode PetscSFLinkPackRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,const void *rootdata) 105671438e86SJunchao Zhang { 105771438e86SJunchao Zhang PetscErrorCode ierr; 105871438e86SJunchao Zhang PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 105971438e86SJunchao Zhang 106071438e86SJunchao Zhang PetscFunctionBegin; 106171438e86SJunchao Zhang if (scope == PETSCSF_REMOTE) { /* Sync the device if rootdata is not on petsc default stream */ 106271438e86SJunchao Zhang if (PetscMemTypeDevice(link->rootmtype) && link->SyncDevice && sf->unknown_input_stream) {ierr = (*link->SyncDevice)(link);CHKERRQ(ierr);} 106371438e86SJunchao Zhang if (link->PrePack) {ierr = 
(*link->PrePack)(sf,link,PETSCSF_ROOT2LEAF);CHKERRQ(ierr);} /* Used by SF nvshmem */ 106471438e86SJunchao Zhang } 106571438e86SJunchao Zhang ierr = PetscLogEventBegin(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr); 106671438e86SJunchao Zhang if (bas->rootbuflen[scope]) {ierr = PetscSFLinkPackRootData_Private(sf,link,scope,rootdata);CHKERRQ(ierr);} 106771438e86SJunchao Zhang ierr = PetscLogEventEnd(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr); 106871438e86SJunchao Zhang PetscFunctionReturn(0); 106971438e86SJunchao Zhang } 107071438e86SJunchao Zhang /* Pack leafdata to leafbuf, which are in the same memory space */ 107171438e86SJunchao Zhang PetscErrorCode PetscSFLinkPackLeafData(PetscSF sf,PetscSFLink link,PetscSFScope scope,const void *leafdata) 107271438e86SJunchao Zhang { 107371438e86SJunchao Zhang PetscErrorCode ierr; 107471438e86SJunchao Zhang 107571438e86SJunchao Zhang PetscFunctionBegin; 107671438e86SJunchao Zhang if (scope == PETSCSF_REMOTE) { 107771438e86SJunchao Zhang if (PetscMemTypeDevice(link->leafmtype) && link->SyncDevice && sf->unknown_input_stream) {ierr = (*link->SyncDevice)(link);CHKERRQ(ierr);} 107871438e86SJunchao Zhang if (link->PrePack) {ierr = (*link->PrePack)(sf,link,PETSCSF_LEAF2ROOT);CHKERRQ(ierr);} /* Used by SF nvshmem */ 107971438e86SJunchao Zhang } 108071438e86SJunchao Zhang ierr = PetscLogEventBegin(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr); 108171438e86SJunchao Zhang if (sf->leafbuflen[scope]) {ierr = PetscSFLinkPackLeafData_Private(sf,link,scope,leafdata);CHKERRQ(ierr);} 108271438e86SJunchao Zhang ierr = PetscLogEventEnd(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr); 108371438e86SJunchao Zhang PetscFunctionReturn(0); 108471438e86SJunchao Zhang } 108571438e86SJunchao Zhang 108671438e86SJunchao Zhang PetscErrorCode PetscSFLinkUnpackRootData_Private(PetscSF sf,PetscSFLink link,PetscSFScope scope,void *rootdata,MPI_Op op) 1087cd620004SJunchao Zhang { 1088cd620004SJunchao Zhang PetscErrorCode ierr; 1089cd620004SJunchao Zhang const PetscInt *rootindices = NULL; 
1090cd620004SJunchao Zhang PetscInt count,start; 1091cd620004SJunchao Zhang PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 1092fcc7397dSJunchao Zhang PetscErrorCode (*UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*) = NULL; 1093cd620004SJunchao Zhang PetscMemType rootmtype = link->rootmtype; 1094fcc7397dSJunchao Zhang PetscSFPackOpt opt = NULL; 1095cd620004SJunchao Zhang 1096cd620004SJunchao Zhang PetscFunctionBegin; 109771438e86SJunchao Zhang if (!link->rootdirect[scope]) { /* If rootdata works directly as rootbuf, skip unpacking */ 1098cd620004SJunchao Zhang ierr = PetscSFLinkGetUnpackAndOp(link,rootmtype,op,bas->rootdups[scope],&UnpackAndOp);CHKERRQ(ierr); 1099cd620004SJunchao Zhang if (UnpackAndOp) { 1100fcc7397dSJunchao Zhang ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,scope,&count,&start,&opt,&rootindices);CHKERRQ(ierr); 1101fcc7397dSJunchao Zhang ierr = (*UnpackAndOp)(link,count,start,opt,rootindices,rootdata,link->rootbuf[scope][rootmtype]);CHKERRQ(ierr); 1102cd620004SJunchao Zhang } else { 1103fcc7397dSJunchao Zhang ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,scope,&count,&start,&opt,&rootindices);CHKERRQ(ierr); 1104cd620004SJunchao Zhang ierr = PetscSFLinkUnpackDataWithMPIReduceLocal(sf,link,count,start,rootindices,rootdata,link->rootbuf[scope][rootmtype],op);CHKERRQ(ierr); 1105cd620004SJunchao Zhang } 1106cd620004SJunchao Zhang } 1107cd620004SJunchao Zhang ierr = PetscSFLinkLogFlopsAfterUnpackRootData(sf,link,scope,op);CHKERRQ(ierr); 1108cd620004SJunchao Zhang PetscFunctionReturn(0); 1109cd620004SJunchao Zhang } 1110cd620004SJunchao Zhang 111171438e86SJunchao Zhang PetscErrorCode PetscSFLinkUnpackLeafData_Private(PetscSF sf,PetscSFLink link,PetscSFScope scope,void *leafdata,MPI_Op op) 1112cd620004SJunchao Zhang { 1113cd620004SJunchao Zhang PetscErrorCode ierr; 1114cd620004SJunchao Zhang const PetscInt *leafindices = NULL; 1115cd620004SJunchao Zhang PetscInt 
count,start; 1116fcc7397dSJunchao Zhang PetscErrorCode (*UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*) = NULL; 1117cd620004SJunchao Zhang PetscMemType leafmtype = link->leafmtype; 1118fcc7397dSJunchao Zhang PetscSFPackOpt opt = NULL; 1119cd620004SJunchao Zhang 1120cd620004SJunchao Zhang PetscFunctionBegin; 112171438e86SJunchao Zhang if (!link->leafdirect[scope]) { /* If leafdata works directly as rootbuf, skip unpacking */ 1122cd620004SJunchao Zhang ierr = PetscSFLinkGetUnpackAndOp(link,leafmtype,op,sf->leafdups[scope],&UnpackAndOp);CHKERRQ(ierr); 1123cd620004SJunchao Zhang if (UnpackAndOp) { 1124fcc7397dSJunchao Zhang ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,scope,&count,&start,&opt,&leafindices);CHKERRQ(ierr); 1125fcc7397dSJunchao Zhang ierr = (*UnpackAndOp)(link,count,start,opt,leafindices,leafdata,link->leafbuf[scope][leafmtype]);CHKERRQ(ierr); 1126cd620004SJunchao Zhang } else { 1127fcc7397dSJunchao Zhang ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,scope,&count,&start,&opt,&leafindices);CHKERRQ(ierr); 1128cd620004SJunchao Zhang ierr = PetscSFLinkUnpackDataWithMPIReduceLocal(sf,link,count,start,leafindices,leafdata,link->leafbuf[scope][leafmtype],op);CHKERRQ(ierr); 1129cd620004SJunchao Zhang } 1130cd620004SJunchao Zhang } 1131cd620004SJunchao Zhang ierr = PetscSFLinkLogFlopsAfterUnpackLeafData(sf,link,scope,op);CHKERRQ(ierr); 113271438e86SJunchao Zhang PetscFunctionReturn(0); 113371438e86SJunchao Zhang } 113471438e86SJunchao Zhang /* Unpack rootbuf to rootdata, which are in the same memory space */ 113571438e86SJunchao Zhang PetscErrorCode PetscSFLinkUnpackRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,void *rootdata,MPI_Op op) 113671438e86SJunchao Zhang { 113771438e86SJunchao Zhang PetscErrorCode ierr; 113871438e86SJunchao Zhang PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 113971438e86SJunchao Zhang 114071438e86SJunchao Zhang PetscFunctionBegin; 
114171438e86SJunchao Zhang ierr = PetscLogEventBegin(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr); 114271438e86SJunchao Zhang if (bas->rootbuflen[scope]) {ierr = PetscSFLinkUnpackRootData_Private(sf,link,scope,rootdata,op);CHKERRQ(ierr);} 1143cd620004SJunchao Zhang ierr = PetscLogEventEnd(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr); 114471438e86SJunchao Zhang if (scope == PETSCSF_REMOTE) { 114571438e86SJunchao Zhang if (link->PostUnpack) {ierr = (*link->PostUnpack)(sf,link,PETSCSF_LEAF2ROOT);CHKERRQ(ierr);} /* Used by SF nvshmem */ 114671438e86SJunchao Zhang if (PetscMemTypeDevice(link->rootmtype) && link->SyncDevice && sf->unknown_input_stream) {ierr = (*link->SyncDevice)(link);CHKERRQ(ierr);} 114771438e86SJunchao Zhang } 1148cd620004SJunchao Zhang PetscFunctionReturn(0); 1149cd620004SJunchao Zhang } 1150cd620004SJunchao Zhang 115171438e86SJunchao Zhang /* Unpack leafbuf to leafdata for remote (common case) or local (rare case when rootmtype != leafmtype) */ 115271438e86SJunchao Zhang PetscErrorCode PetscSFLinkUnpackLeafData(PetscSF sf,PetscSFLink link,PetscSFScope scope,void *leafdata,MPI_Op op) 115371438e86SJunchao Zhang { 115471438e86SJunchao Zhang PetscErrorCode ierr; 115571438e86SJunchao Zhang 115671438e86SJunchao Zhang PetscFunctionBegin; 115771438e86SJunchao Zhang ierr = PetscLogEventBegin(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr); 11581e1ea65dSPierre Jolivet if (sf->leafbuflen[scope]) {ierr = PetscSFLinkUnpackLeafData_Private(sf,link,scope,leafdata,op);CHKERRQ(ierr);} 115971438e86SJunchao Zhang ierr = PetscLogEventEnd(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr); 116071438e86SJunchao Zhang if (scope == PETSCSF_REMOTE) { 116171438e86SJunchao Zhang if (link->PostUnpack) {ierr = (*link->PostUnpack)(sf,link,PETSCSF_ROOT2LEAF);CHKERRQ(ierr);} /* Used by SF nvshmem */ 116271438e86SJunchao Zhang if (PetscMemTypeDevice(link->leafmtype) && link->SyncDevice && sf->unknown_input_stream) {ierr = (*link->SyncDevice)(link);CHKERRQ(ierr);} 116371438e86SJunchao Zhang } 116471438e86SJunchao 
Zhang PetscFunctionReturn(0); 116571438e86SJunchao Zhang } 116671438e86SJunchao Zhang 116771438e86SJunchao Zhang /* FetchAndOp rootdata with rootbuf, it is a kind of Unpack on rootdata, except it also updates rootbuf */ 116871438e86SJunchao Zhang PetscErrorCode PetscSFLinkFetchAndOpRemote(PetscSF sf,PetscSFLink link,void *rootdata,MPI_Op op) 1169cd620004SJunchao Zhang { 1170cd620004SJunchao Zhang PetscErrorCode ierr; 1171cd620004SJunchao Zhang const PetscInt *rootindices = NULL; 1172cd620004SJunchao Zhang PetscInt count,start; 1173cd620004SJunchao Zhang PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 1174fcc7397dSJunchao Zhang PetscErrorCode (*FetchAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,void*) = NULL; 1175cd620004SJunchao Zhang PetscMemType rootmtype = link->rootmtype; 1176fcc7397dSJunchao Zhang PetscSFPackOpt opt = NULL; 1177cd620004SJunchao Zhang 1178cd620004SJunchao Zhang PetscFunctionBegin; 1179cd620004SJunchao Zhang ierr = PetscLogEventBegin(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr); 118071438e86SJunchao Zhang if (bas->rootbuflen[PETSCSF_REMOTE]) { 1181cd620004SJunchao Zhang /* Do FetchAndOp on rootdata with rootbuf */ 118271438e86SJunchao Zhang ierr = PetscSFLinkGetFetchAndOp(link,rootmtype,op,bas->rootdups[PETSCSF_REMOTE],&FetchAndOp);CHKERRQ(ierr); 118371438e86SJunchao Zhang ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,PETSCSF_REMOTE,&count,&start,&opt,&rootindices);CHKERRQ(ierr); 118471438e86SJunchao Zhang ierr = (*FetchAndOp)(link,count,start,opt,rootindices,rootdata,link->rootbuf[PETSCSF_REMOTE][rootmtype]);CHKERRQ(ierr); 1185cd620004SJunchao Zhang } 118671438e86SJunchao Zhang ierr = PetscSFLinkLogFlopsAfterUnpackRootData(sf,link,PETSCSF_REMOTE,op);CHKERRQ(ierr); 1187cd620004SJunchao Zhang ierr = PetscLogEventEnd(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr); 1188cd620004SJunchao Zhang PetscFunctionReturn(0); 1189cd620004SJunchao Zhang } 1190cd620004SJunchao Zhang 119171438e86SJunchao Zhang PetscErrorCode 
PetscSFLinkScatterLocal(PetscSF sf,PetscSFLink link,PetscSFDirection direction,void *rootdata,void *leafdata,MPI_Op op) 1192cd620004SJunchao Zhang { 1193cd620004SJunchao Zhang PetscErrorCode ierr; 1194cd620004SJunchao Zhang const PetscInt *rootindices = NULL,*leafindices = NULL; 1195cd620004SJunchao Zhang PetscInt count,rootstart,leafstart; 1196cd620004SJunchao Zhang PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 1197fcc7397dSJunchao Zhang PetscErrorCode (*ScatterAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*) = NULL; 119871438e86SJunchao Zhang PetscMemType rootmtype = link->rootmtype,leafmtype = link->leafmtype,srcmtype,dstmtype; 1199fcc7397dSJunchao Zhang PetscSFPackOpt leafopt = NULL,rootopt = NULL; 120071438e86SJunchao Zhang PetscInt buflen = sf->leafbuflen[PETSCSF_LOCAL]; 120171438e86SJunchao Zhang char *srcbuf = NULL,*dstbuf = NULL; 120271438e86SJunchao Zhang PetscBool dstdups; 1203cd620004SJunchao Zhang 1204362febeeSStefano Zampini PetscFunctionBegin; 120571438e86SJunchao Zhang if (!buflen) PetscFunctionReturn(0); 120671438e86SJunchao Zhang if (rootmtype != leafmtype) { /* The cross memory space local scatter is done by pack, copy and unpack */ 120771438e86SJunchao Zhang if (direction == PETSCSF_ROOT2LEAF) { 1208cd620004SJunchao Zhang ierr = PetscSFLinkPackRootData(sf,link,PETSCSF_LOCAL,rootdata);CHKERRQ(ierr); 120971438e86SJunchao Zhang srcmtype = rootmtype; 121071438e86SJunchao Zhang srcbuf = link->rootbuf[PETSCSF_LOCAL][rootmtype]; 121171438e86SJunchao Zhang dstmtype = leafmtype; 121271438e86SJunchao Zhang dstbuf = link->leafbuf[PETSCSF_LOCAL][leafmtype]; 121371438e86SJunchao Zhang } else { 121471438e86SJunchao Zhang ierr = PetscSFLinkPackLeafData(sf,link,PETSCSF_LOCAL,leafdata);CHKERRQ(ierr); 121571438e86SJunchao Zhang srcmtype = leafmtype; 121671438e86SJunchao Zhang srcbuf = link->leafbuf[PETSCSF_LOCAL][leafmtype]; 121771438e86SJunchao Zhang dstmtype = rootmtype; 
121871438e86SJunchao Zhang dstbuf = link->rootbuf[PETSCSF_LOCAL][rootmtype]; 121971438e86SJunchao Zhang } 122071438e86SJunchao Zhang ierr = (*link->Memcpy)(link,dstmtype,dstbuf,srcmtype,srcbuf,buflen*link->unitbytes);CHKERRQ(ierr); 122171438e86SJunchao Zhang /* If above is a device to host copy, we have to sync the stream before accessing the buffer on host */ 122271438e86SJunchao Zhang if (PetscMemTypeHost(dstmtype)) {ierr = (*link->SyncStream)(link);CHKERRQ(ierr);} 122371438e86SJunchao Zhang if (direction == PETSCSF_ROOT2LEAF) { 1224cd620004SJunchao Zhang ierr = PetscSFLinkUnpackLeafData(sf,link,PETSCSF_LOCAL,leafdata,op);CHKERRQ(ierr); 1225cd620004SJunchao Zhang } else { 122671438e86SJunchao Zhang ierr = PetscSFLinkUnpackRootData(sf,link,PETSCSF_LOCAL,rootdata,op);CHKERRQ(ierr); 122771438e86SJunchao Zhang } 122871438e86SJunchao Zhang } else { 122971438e86SJunchao Zhang dstdups = (direction == PETSCSF_ROOT2LEAF) ? sf->leafdups[PETSCSF_LOCAL] : bas->rootdups[PETSCSF_LOCAL]; 123071438e86SJunchao Zhang dstmtype = (direction == PETSCSF_ROOT2LEAF) ? 
link->leafmtype : link->rootmtype; 123171438e86SJunchao Zhang ierr = PetscSFLinkGetScatterAndOp(link,dstmtype,op,dstdups,&ScatterAndOp);CHKERRQ(ierr); 1232cd620004SJunchao Zhang if (ScatterAndOp) { 1233fcc7397dSJunchao Zhang ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,PETSCSF_LOCAL,&count,&rootstart,&rootopt,&rootindices);CHKERRQ(ierr); 1234fcc7397dSJunchao Zhang ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,PETSCSF_LOCAL,&count,&leafstart,&leafopt,&leafindices);CHKERRQ(ierr); 123571438e86SJunchao Zhang if (direction == PETSCSF_ROOT2LEAF) { 1236fcc7397dSJunchao Zhang ierr = (*ScatterAndOp)(link,count,rootstart,rootopt,rootindices,rootdata,leafstart,leafopt,leafindices,leafdata);CHKERRQ(ierr); 1237cd620004SJunchao Zhang } else { 1238fcc7397dSJunchao Zhang ierr = (*ScatterAndOp)(link,count,leafstart,leafopt,leafindices,leafdata,rootstart,rootopt,rootindices,rootdata);CHKERRQ(ierr); 123971438e86SJunchao Zhang } 1240cd620004SJunchao Zhang } else { 1241fcc7397dSJunchao Zhang ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,PETSCSF_LOCAL,&count,&rootstart,&rootopt,&rootindices);CHKERRQ(ierr); 1242fcc7397dSJunchao Zhang ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,PETSCSF_LOCAL,&count,&leafstart,&leafopt,&leafindices);CHKERRQ(ierr); 124371438e86SJunchao Zhang if (direction == PETSCSF_ROOT2LEAF) { 124471438e86SJunchao Zhang ierr = PetscSFLinkScatterDataWithMPIReduceLocal(sf,link,count,rootstart,rootindices,rootdata,leafstart,leafindices,leafdata,op);CHKERRQ(ierr); 124571438e86SJunchao Zhang } else { 1246fcc7397dSJunchao Zhang ierr = PetscSFLinkScatterDataWithMPIReduceLocal(sf,link,count,leafstart,leafindices,leafdata,rootstart,rootindices,rootdata,op);CHKERRQ(ierr); 1247cd620004SJunchao Zhang } 1248cd620004SJunchao Zhang } 124971438e86SJunchao Zhang } 1250cd620004SJunchao Zhang PetscFunctionReturn(0); 1251cd620004SJunchao Zhang } 1252cd620004SJunchao Zhang 1253cd620004SJunchao Zhang /* Fetch 
rootdata to leafdata and leafupdate locally */ 1254cd620004SJunchao Zhang PetscErrorCode PetscSFLinkFetchAndOpLocal(PetscSF sf,PetscSFLink link,void *rootdata,const void *leafdata,void *leafupdate,MPI_Op op) 1255cd620004SJunchao Zhang { 1256cd620004SJunchao Zhang PetscErrorCode ierr; 1257cd620004SJunchao Zhang const PetscInt *rootindices = NULL,*leafindices = NULL; 1258cd620004SJunchao Zhang PetscInt count,rootstart,leafstart; 1259cd620004SJunchao Zhang PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 1260fcc7397dSJunchao Zhang PetscErrorCode (*FetchAndOpLocal)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*) = NULL; 1261cd620004SJunchao Zhang const PetscMemType rootmtype = link->rootmtype,leafmtype = link->leafmtype; 1262fcc7397dSJunchao Zhang PetscSFPackOpt leafopt = NULL,rootopt = NULL; 1263cd620004SJunchao Zhang 1264cd620004SJunchao Zhang PetscFunctionBegin; 1265cd620004SJunchao Zhang if (!bas->rootbuflen[PETSCSF_LOCAL]) PetscFunctionReturn(0); 1266cd620004SJunchao Zhang if (rootmtype != leafmtype) { 1267cd620004SJunchao Zhang /* The local communication has to go through pack and unpack */ 1268cd620004SJunchao Zhang SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Doing PetscSFFetchAndOp with rootdata and leafdata on opposite side of CPU and GPU"); 1269cd620004SJunchao Zhang } else { 1270fcc7397dSJunchao Zhang ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,PETSCSF_LOCAL,&count,&rootstart,&rootopt,&rootindices);CHKERRQ(ierr); 1271fcc7397dSJunchao Zhang ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,PETSCSF_LOCAL,&count,&leafstart,&leafopt,&leafindices);CHKERRQ(ierr); 1272cd620004SJunchao Zhang ierr = PetscSFLinkGetFetchAndOpLocal(link,rootmtype,op,bas->rootdups[PETSCSF_LOCAL],&FetchAndOpLocal);CHKERRQ(ierr); 1273fcc7397dSJunchao Zhang ierr = 
(*FetchAndOpLocal)(link,count,rootstart,rootopt,rootindices,rootdata,leafstart,leafopt,leafindices,leafdata,leafupdate);CHKERRQ(ierr); 1274cd620004SJunchao Zhang } 127540e23c03SJunchao Zhang PetscFunctionReturn(0); 127640e23c03SJunchao Zhang } 127740e23c03SJunchao Zhang 127840e23c03SJunchao Zhang /* 1279cd620004SJunchao Zhang Create per-rank pack/unpack optimizations based on indice patterns 128040e23c03SJunchao Zhang 128140e23c03SJunchao Zhang Input Parameters: 1282fcc7397dSJunchao Zhang + n - Number of destination ranks 1283eb02082bSJunchao Zhang . offset - [n+1] For the i-th rank, its associated indices are idx[offset[i], offset[i+1]). offset[0] needs not to be 0. 1284b23bfdefSJunchao Zhang - idx - [*] Array storing indices 128540e23c03SJunchao Zhang 128640e23c03SJunchao Zhang Output Parameters: 1287cd620004SJunchao Zhang + opt - Pack optimizations. NULL if no optimizations. 128840e23c03SJunchao Zhang */ 1289cd620004SJunchao Zhang PetscErrorCode PetscSFCreatePackOpt(PetscInt n,const PetscInt *offset,const PetscInt *idx,PetscSFPackOpt *out) 129040e23c03SJunchao Zhang { 129140e23c03SJunchao Zhang PetscErrorCode ierr; 1292fcc7397dSJunchao Zhang PetscInt r,p,start,i,j,k,dx,dy,dz,dydz,m,X,Y; 1293fcc7397dSJunchao Zhang PetscBool optimizable = PETSC_TRUE; 129440e23c03SJunchao Zhang PetscSFPackOpt opt; 129540e23c03SJunchao Zhang 129640e23c03SJunchao Zhang PetscFunctionBegin; 1297fcc7397dSJunchao Zhang ierr = PetscMalloc1(1,&opt);CHKERRQ(ierr); 1298fcc7397dSJunchao Zhang ierr = PetscMalloc1(7*n+2,&opt->array);CHKERRQ(ierr); 1299fcc7397dSJunchao Zhang opt->n = opt->array[0] = n; 1300fcc7397dSJunchao Zhang opt->offset = opt->array + 1; 1301fcc7397dSJunchao Zhang opt->start = opt->array + n + 2; 1302fcc7397dSJunchao Zhang opt->dx = opt->array + 2*n + 2; 1303fcc7397dSJunchao Zhang opt->dy = opt->array + 3*n + 2; 1304fcc7397dSJunchao Zhang opt->dz = opt->array + 4*n + 2; 1305fcc7397dSJunchao Zhang opt->X = opt->array + 5*n + 2; 1306fcc7397dSJunchao Zhang opt->Y = opt->array + 
6*n + 2; 1307fcc7397dSJunchao Zhang 1308fcc7397dSJunchao Zhang for (r=0; r<n; r++) { /* For each destination rank */ 1309fcc7397dSJunchao Zhang m = offset[r+1] - offset[r]; /* Total number of indices for this rank. We want to see if m can be factored into dx*dy*dz */ 1310fcc7397dSJunchao Zhang p = offset[r]; 1311fcc7397dSJunchao Zhang start = idx[p]; /* First index for this rank */ 1312fcc7397dSJunchao Zhang p++; 1313fcc7397dSJunchao Zhang 1314fcc7397dSJunchao Zhang /* Search in X dimension */ 1315fcc7397dSJunchao Zhang for (dx=1; dx<m; dx++,p++) { 1316fcc7397dSJunchao Zhang if (start+dx != idx[p]) break; 1317b23bfdefSJunchao Zhang } 1318b23bfdefSJunchao Zhang 1319fcc7397dSJunchao Zhang dydz = m/dx; 1320fcc7397dSJunchao Zhang X = dydz > 1 ? (idx[p]-start) : dx; 1321fcc7397dSJunchao Zhang /* Not optimizable if m is not a multiple of dx, or some unrecognized pattern is found */ 1322fcc7397dSJunchao Zhang if (m%dx || X <= 0) {optimizable = PETSC_FALSE; goto finish;} 1323fcc7397dSJunchao Zhang for (dy=1; dy<dydz; dy++) { /* Search in Y dimension */ 1324fcc7397dSJunchao Zhang for (i=0; i<dx; i++,p++) { 1325fcc7397dSJunchao Zhang if (start+X*dy+i != idx[p]) { 1326fcc7397dSJunchao Zhang if (i) {optimizable = PETSC_FALSE; goto finish;} /* The pattern is violated in the middle of an x-walk */ 1327fcc7397dSJunchao Zhang else goto Z_dimension; 132840e23c03SJunchao Zhang } 132940e23c03SJunchao Zhang } 133040e23c03SJunchao Zhang } 133140e23c03SJunchao Zhang 1332fcc7397dSJunchao Zhang Z_dimension: 1333fcc7397dSJunchao Zhang dz = m/(dx*dy); 1334fcc7397dSJunchao Zhang Y = dz > 1 ? 
(idx[p]-start)/X : dy; 1335fcc7397dSJunchao Zhang /* Not optimizable if m is not a multiple of dx*dy, or some unrecognized pattern is found */ 1336fcc7397dSJunchao Zhang if (m%(dx*dy) || Y <= 0) {optimizable = PETSC_FALSE; goto finish;} 1337fcc7397dSJunchao Zhang for (k=1; k<dz; k++) { /* Go through Z dimension to see if remaining indices follow the pattern */ 1338fcc7397dSJunchao Zhang for (j=0; j<dy; j++) { 1339fcc7397dSJunchao Zhang for (i=0; i<dx; i++,p++) { 1340fcc7397dSJunchao Zhang if (start+X*Y*k+X*j+i != idx[p]) {optimizable = PETSC_FALSE; goto finish;} 134140e23c03SJunchao Zhang } 134240e23c03SJunchao Zhang } 134340e23c03SJunchao Zhang } 1344fcc7397dSJunchao Zhang opt->start[r] = start; 1345fcc7397dSJunchao Zhang opt->dx[r] = dx; 1346fcc7397dSJunchao Zhang opt->dy[r] = dy; 1347fcc7397dSJunchao Zhang opt->dz[r] = dz; 1348fcc7397dSJunchao Zhang opt->X[r] = X; 1349fcc7397dSJunchao Zhang opt->Y[r] = Y; 135040e23c03SJunchao Zhang } 135140e23c03SJunchao Zhang 1352fcc7397dSJunchao Zhang finish: 1353fcc7397dSJunchao Zhang /* If not optimizable, free arrays to save memory */ 1354fcc7397dSJunchao Zhang if (!n || !optimizable) { 1355fcc7397dSJunchao Zhang ierr = PetscFree(opt->array);CHKERRQ(ierr); 135640e23c03SJunchao Zhang ierr = PetscFree(opt);CHKERRQ(ierr); 135740e23c03SJunchao Zhang *out = NULL; 1358fcc7397dSJunchao Zhang } else { 1359fcc7397dSJunchao Zhang opt->offset[0] = 0; 1360fcc7397dSJunchao Zhang for (r=0; r<n; r++) opt->offset[r+1] = opt->offset[r] + opt->dx[r]*opt->dy[r]*opt->dz[r]; 1361fcc7397dSJunchao Zhang *out = opt; 1362fcc7397dSJunchao Zhang } 136340e23c03SJunchao Zhang PetscFunctionReturn(0); 136440e23c03SJunchao Zhang } 136540e23c03SJunchao Zhang 13669fbee547SJacob Faibussowitsch static inline PetscErrorCode PetscSFDestroyPackOpt(PetscSF sf,PetscMemType mtype,PetscSFPackOpt *out) 136740e23c03SJunchao Zhang { 136840e23c03SJunchao Zhang PetscErrorCode ierr; 136940e23c03SJunchao Zhang PetscSFPackOpt opt = *out; 137040e23c03SJunchao Zhang 
137140e23c03SJunchao Zhang /* Remainder of PetscSFDestroyPackOpt: a NULL plan is a no-op; otherwise opt->array is released with PetscSFFree using the caller-supplied mtype, the opt struct itself is released with PetscFree, and the caller pointer is reset to NULL */ PetscFunctionBegin; 137240e23c03SJunchao Zhang if (opt) { 137320c24465SJunchao Zhang ierr = PetscSFFree(sf,mtype,opt->array);CHKERRQ(ierr); 137440e23c03SJunchao Zhang ierr = PetscFree(opt);CHKERRQ(ierr); 137540e23c03SJunchao Zhang *out = NULL; 137640e23c03SJunchao Zhang } 137740e23c03SJunchao Zhang PetscFunctionReturn(0); 137840e23c03SJunchao Zhang } 1379cd620004SJunchao Zhang 1380cd620004SJunchao Zhang /* PetscSFSetUpPackFields - Fill in the pack-related fields of sf and of its PetscSF_Basic data (sf->data). For the leaf side it sets leafbuflen, leafstart, leafcontig, leafpackopt and leafdups; for the root side rootbuflen, rootstart, rootcontig, rootpackopt and rootdups. Every field has two entries, one per part. Input: sf, whose roffset, rmine, ndranks, nranks and (through sf->data) ioffset, irootloc, ndiranks, niranks are read. Errors from PetscSFCreatePackOpt and PetscCheckDupsInt go through CHKERRQ; the normal exit is PetscFunctionReturn(0). */ PetscErrorCode PetscSFSetUpPackFields(PetscSF sf) 1381cd620004SJunchao Zhang { 1382cd620004SJunchao Zhang PetscErrorCode ierr; 1383cd620004SJunchao Zhang PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 1384cd620004SJunchao Zhang PetscInt i,j; /* i walks the index arrays; j is the position counted from the beginning of the remote part */ 1385cd620004SJunchao Zhang 1386cd620004SJunchao Zhang PetscFunctionBegin; 1387cd620004SJunchao Zhang /* [0] for PETSCSF_LOCAL and [1] for PETSCSF_REMOTE in the following */ 1388cd620004SJunchao Zhang for (i=0; i<2; i++) { /* Set defaults: start 0, contiguous, no duplicates; the code below overrides them where the index arrays say otherwise */ 1389cd620004SJunchao Zhang sf->leafstart[i] = 0; 1390cd620004SJunchao Zhang sf->leafcontig[i] = PETSC_TRUE; 1391cd620004SJunchao Zhang sf->leafdups[i] = PETSC_FALSE; 1392cd620004SJunchao Zhang bas->rootstart[i] = 0; 1393cd620004SJunchao Zhang bas->rootcontig[i] = PETSC_TRUE; 1394cd620004SJunchao Zhang bas->rootdups[i] = PETSC_FALSE; 1395cd620004SJunchao Zhang } 1396cd620004SJunchao Zhang 1397cd620004SJunchao Zhang /* Leaf buffer lengths taken from roffset: [0] covers the first ndranks ranks, [1] the remaining nranks-ndranks ranks */ sf->leafbuflen[0] = sf->roffset[sf->ndranks]; 1398cd620004SJunchao Zhang sf->leafbuflen[1] = sf->roffset[sf->nranks] - sf->roffset[sf->ndranks]; 1399cd620004SJunchao Zhang 1400cd620004SJunchao Zhang /* The start index of a non-empty part is its first entry in rmine; an empty part keeps the default 0 */ if (sf->leafbuflen[0]) sf->leafstart[0] = sf->rmine[0]; 1401cd620004SJunchao Zhang if (sf->leafbuflen[1]) sf->leafstart[1] = sf->rmine[sf->roffset[sf->ndranks]]; 1402cd620004SJunchao Zhang 1403cd620004SJunchao Zhang /* Are leaf indices for self and remote contiguous? 
If yes, it is best for pack/unpack */ /* A part counts as contiguous when its k-th entry equals start+k; each loop stops at the first mismatch */ 1404cd620004SJunchao Zhang for (i=0; i<sf->roffset[sf->ndranks]; i++) { /* self */ 1405cd620004SJunchao Zhang if (sf->rmine[i] != sf->leafstart[0]+i) {sf->leafcontig[0] = PETSC_FALSE; break;} 1406cd620004SJunchao Zhang } 1407cd620004SJunchao Zhang for (i=sf->roffset[sf->ndranks],j=0; i<sf->roffset[sf->nranks]; i++,j++) { /* remote */ 1408cd620004SJunchao Zhang if (sf->rmine[i] != sf->leafstart[1]+j) {sf->leafcontig[1] = PETSC_FALSE; break;} 1409cd620004SJunchao Zhang } 1410cd620004SJunchao Zhang 1411cd620004SJunchao Zhang /* If not, see if we can have per-rank optimizations by doing index analysis */ /* For the remote part only the offset pointer is advanced by ndranks while rmine is passed unshifted - NOTE(review): this relies on PetscSFCreatePackOpt using the offsets as absolute positions into the index array; confirm against its definition */ 1412cd620004SJunchao Zhang if (!sf->leafcontig[0]) {ierr = PetscSFCreatePackOpt(sf->ndranks, sf->roffset, sf->rmine, &sf->leafpackopt[0]);CHKERRQ(ierr);} 1413cd620004SJunchao Zhang if (!sf->leafcontig[1]) {ierr = PetscSFCreatePackOpt(sf->nranks-sf->ndranks, sf->roffset+sf->ndranks, sf->rmine, &sf->leafpackopt[1]);CHKERRQ(ierr);} 1414cd620004SJunchao Zhang 1415cd620004SJunchao Zhang /* Are root indices for self and remote contiguous? 
*/ /* Same steps as for the leaves, with bas->ioffset, bas->irootloc, bas->ndiranks and bas->niranks taking the roles of sf->roffset, sf->rmine, sf->ndranks and sf->nranks */ 1416cd620004SJunchao Zhang bas->rootbuflen[0] = bas->ioffset[bas->ndiranks]; 1417cd620004SJunchao Zhang bas->rootbuflen[1] = bas->ioffset[bas->niranks] - bas->ioffset[bas->ndiranks]; 1418cd620004SJunchao Zhang 1419cd620004SJunchao Zhang if (bas->rootbuflen[0]) bas->rootstart[0] = bas->irootloc[0]; 1420cd620004SJunchao Zhang if (bas->rootbuflen[1]) bas->rootstart[1] = bas->irootloc[bas->ioffset[bas->ndiranks]]; 1421cd620004SJunchao Zhang 1422cd620004SJunchao Zhang for (i=0; i<bas->ioffset[bas->ndiranks]; i++) { 1423cd620004SJunchao Zhang if (bas->irootloc[i] != bas->rootstart[0]+i) {bas->rootcontig[0] = PETSC_FALSE; break;} 1424cd620004SJunchao Zhang } 1425cd620004SJunchao Zhang for (i=bas->ioffset[bas->ndiranks],j=0; i<bas->ioffset[bas->niranks]; i++,j++) { 1426cd620004SJunchao Zhang if (bas->irootloc[i] != bas->rootstart[1]+j) {bas->rootcontig[1] = PETSC_FALSE; break;} 1427cd620004SJunchao Zhang } 1428cd620004SJunchao Zhang 1429cd620004SJunchao Zhang if (!bas->rootcontig[0]) {ierr = PetscSFCreatePackOpt(bas->ndiranks, bas->ioffset, bas->irootloc, &bas->rootpackopt[0]);CHKERRQ(ierr);} 1430cd620004SJunchao Zhang if (!bas->rootcontig[1]) {ierr = PetscSFCreatePackOpt(bas->niranks-bas->ndiranks, bas->ioffset+bas->ndiranks, bas->irootloc, &bas->rootpackopt[1]);CHKERRQ(ierr);} 1431cd620004SJunchao Zhang 14327fd2d3dbSJunchao Zhang #if defined(PETSC_HAVE_DEVICE) 1433cd620004SJunchao Zhang /* Check dups in indices so that CUDA unpacking kernels can use cheaper regular instructions instead of atomics when they know there are no data race chances */ /* NOTE(review): the PetscDefined(HAVE_DEVICE) test below sits inside #if defined(PETSC_HAVE_DEVICE) and looks redundant - confirm before simplifying */ 1434013b3241SStefano Zampini if (PetscDefined(HAVE_DEVICE)) { 1435013b3241SStefano Zampini PetscBool ismulti = (sf->multi == sf) ? 
PETSC_TRUE : PETSC_FALSE; /* ismulti is true when sf->multi points back at sf itself; every duplicate check below is then skipped, as it is for a contiguous part, and the dups flag keeps its PETSC_FALSE default */ 1436013b3241SStefano Zampini if (!sf->leafcontig[0] && !ismulti) {ierr = PetscCheckDupsInt(sf->leafbuflen[0], sf->rmine, &sf->leafdups[0]);CHKERRQ(ierr);} 1437013b3241SStefano Zampini if (!sf->leafcontig[1] && !ismulti) {ierr = PetscCheckDupsInt(sf->leafbuflen[1], sf->rmine+sf->roffset[sf->ndranks], &sf->leafdups[1]);CHKERRQ(ierr);} 1438013b3241SStefano Zampini if (!bas->rootcontig[0] && !ismulti) {ierr = PetscCheckDupsInt(bas->rootbuflen[0], bas->irootloc, &bas->rootdups[0]);CHKERRQ(ierr);} 1439013b3241SStefano Zampini if (!bas->rootcontig[1] && !ismulti) {ierr = PetscCheckDupsInt(bas->rootbuflen[1], bas->irootloc+bas->ioffset[bas->ndiranks], &bas->rootdups[1]);CHKERRQ(ierr);} 1440013b3241SStefano Zampini } 1441cd620004SJunchao Zhang #endif 1442cd620004SJunchao Zhang PetscFunctionReturn(0); 1443cd620004SJunchao Zhang } 1444cd620004SJunchao Zhang 1445cd620004SJunchao Zhang /* PetscSFResetPackFields - Destroy the pack optimization plans held by sf and by its PetscSF_Basic data (sf->data), for both PETSCSF_LOCAL and PETSCSF_REMOTE. Host plans are leafpackopt[] and rootpackopt[]; device plans leafpackopt_d[] and rootpackopt_d[] are handled only when PETSC_HAVE_DEVICE is defined. Each pointer is handed to PetscSFDestroyPackOpt, which leaves it NULL. Errors go through CHKERRQ; the normal exit is PetscFunctionReturn(0). */ PetscErrorCode PetscSFResetPackFields(PetscSF sf) 1446cd620004SJunchao Zhang { 1447cd620004SJunchao Zhang PetscErrorCode ierr; 1448cd620004SJunchao Zhang PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 1449cd620004SJunchao Zhang PetscInt i; 1450cd620004SJunchao Zhang 1451cd620004SJunchao Zhang PetscFunctionBegin; 1452cd620004SJunchao Zhang for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) { /* Host-side plans are destroyed with PETSC_MEMTYPE_HOST */ 145320c24465SJunchao Zhang ierr = PetscSFDestroyPackOpt(sf,PETSC_MEMTYPE_HOST,&sf->leafpackopt[i]);CHKERRQ(ierr); 145420c24465SJunchao Zhang ierr = PetscSFDestroyPackOpt(sf,PETSC_MEMTYPE_HOST,&bas->rootpackopt[i]);CHKERRQ(ierr); 14557fd2d3dbSJunchao Zhang #if defined(PETSC_HAVE_DEVICE) /* Device-side copies of the plans are destroyed with PETSC_MEMTYPE_DEVICE */ 145620c24465SJunchao Zhang ierr = PetscSFDestroyPackOpt(sf,PETSC_MEMTYPE_DEVICE,&sf->leafpackopt_d[i]);CHKERRQ(ierr); 145720c24465SJunchao Zhang ierr = PetscSFDestroyPackOpt(sf,PETSC_MEMTYPE_DEVICE,&bas->rootpackopt_d[i]);CHKERRQ(ierr); 14587fd2d3dbSJunchao Zhang #endif 1459cd620004SJunchao Zhang } 1460cd620004SJunchao Zhang PetscFunctionReturn(0); 1461cd620004SJunchao Zhang } 1462