120c24465SJunchao Zhang #include "petsc/private/sfimpl.h" 240e23c03SJunchao Zhang #include <../src/vec/is/sf/impls/basic/sfpack.h> 340e23c03SJunchao Zhang #include <../src/vec/is/sf/impls/basic/sfbasic.h> 440e23c03SJunchao Zhang 57fd2d3dbSJunchao Zhang /* This is a C file that contains packing facilities, with dispatches to device if enabled. */ 67fd2d3dbSJunchao Zhang 7eb02082bSJunchao Zhang #if defined(PETSC_HAVE_CUDA) 8eb02082bSJunchao Zhang #include <cuda_runtime.h> 97fd2d3dbSJunchao Zhang #include <petsccublas.h> 10eb02082bSJunchao Zhang #endif 1159af0bd3SScott Kruger #if defined(PETSC_HAVE_HIP) 1259af0bd3SScott Kruger #include <hip/hip_runtime.h> 1359af0bd3SScott Kruger #include <petschipblas.h> 1459af0bd3SScott Kruger #endif 1540e23c03SJunchao Zhang /* 1640e23c03SJunchao Zhang * MPI_Reduce_local is not really useful because it can't handle sparse data and it vectorizes "in the wrong direction", 1740e23c03SJunchao Zhang * therefore we pack data types manually. This file defines packing routines for the standard data types. 1840e23c03SJunchao Zhang */ 1940e23c03SJunchao Zhang 20cd620004SJunchao Zhang #define CPPJoin4(a,b,c,d) a##_##b##_##c##_##d 2140e23c03SJunchao Zhang 22cd620004SJunchao Zhang /* Operations working like s += t */ 23cd620004SJunchao Zhang #define OP_BINARY(op,s,t) do {(s) = (s) op (t); } while (0) /* binary ops in the middle such as +, *, && etc. */ 24cd620004SJunchao Zhang #define OP_FUNCTION(op,s,t) do {(s) = op((s),(t)); } while (0) /* ops like a function, such as PetscMax, PetscMin */ 25cd620004SJunchao Zhang #define OP_LXOR(op,s,t) do {(s) = (!(s)) != (!(t));} while (0) /* logical exclusive OR */ 26cd620004SJunchao Zhang #define OP_ASSIGN(op,s,t) do {(s) = (t);} while (0) 27cd620004SJunchao Zhang /* Ref MPI MAXLOC */ 28cd620004SJunchao Zhang #define OP_XLOC(op,s,t) \ 29cd620004SJunchao Zhang do { \ 30cd620004SJunchao Zhang if ((s).u == (t).u) (s).i = PetscMin((s).i,(t).i); \ 31cd620004SJunchao Zhang else if (!((s).u op (t).u)) s = t; \ 32cd620004SJunchao Zhang } while (0) 3340e23c03SJunchao Zhang 3440e23c03SJunchao Zhang /* DEF_PackFunc - macro defining a Pack routine 3540e23c03SJunchao Zhang 3640e23c03SJunchao Zhang Arguments of the macro: 37b23bfdefSJunchao Zhang +Type Type of the basic data in an entry, i.e., int, PetscInt, PetscReal etc. It is not the type of an entry. 38fcc7397dSJunchao Zhang .BS Block size for vectorization. It is a factor of bsz. 39b23bfdefSJunchao Zhang -EQ (bs == BS) ? 1 : 0. EQ is a compile-time const to help compiler optimizations. See below. 4040e23c03SJunchao Zhang 4140e23c03SJunchao Zhang Arguments of the Pack routine: 42cd620004SJunchao Zhang +count Number of indices in idx[]. 43fcc7397dSJunchao Zhang .start When opt and idx are NULL, it means indices are contiguous & start is the first index; otherwise, not used. 44fcc7397dSJunchao Zhang .opt Per-pack optimization plan. NULL means no such plan. 45fcc7397dSJunchao Zhang .idx Indices of entries to packed. 46eb02082bSJunchao Zhang .link Provide a context for the current call, such as link->bs, number of basic types in an entry. Ex. if unit is MPI_2INT, then bs=2 and the basic type is int. 47cd620004SJunchao Zhang .unpacked Address of the unpacked data. The entries will be packed are unpacked[idx[i]],for i in [0,count). 48cd620004SJunchao Zhang -packed Address of the packed data. 4940e23c03SJunchao Zhang */ 50b23bfdefSJunchao Zhang #define DEF_PackFunc(Type,BS,EQ) \ 51fcc7397dSJunchao Zhang static PetscErrorCode CPPJoin4(Pack,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,const void *unpacked,void *packed) \ 52b23bfdefSJunchao Zhang { \ 5340e23c03SJunchao Zhang PetscErrorCode ierr; \ 54b23bfdefSJunchao Zhang const Type *u = (const Type*)unpacked,*u2; \ 55b23bfdefSJunchao Zhang Type *p = (Type*)packed,*p2; \ 56fcc7397dSJunchao Zhang PetscInt i,j,k,X,Y,r,bs=link->bs; \ 57fcc7397dSJunchao Zhang const PetscInt M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */ \ 58b23bfdefSJunchao Zhang const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */ \ 5940e23c03SJunchao Zhang PetscFunctionBegin; \ 60fcc7397dSJunchao Zhang if (!idx) {ierr = PetscArraycpy(p,u+start*MBS,MBS*count);CHKERRQ(ierr);}/* idx[] are contiguous */ \ 61fcc7397dSJunchao Zhang else if (opt) { /* has optimizations available */ \ 62fcc7397dSJunchao Zhang p2 = p; \ 63fcc7397dSJunchao Zhang for (r=0; r<opt->n; r++) { \ 64fcc7397dSJunchao Zhang u2 = u + opt->start[r]*MBS; \ 65fcc7397dSJunchao Zhang X = opt->X[r]; \ 66fcc7397dSJunchao Zhang Y = opt->Y[r]; \ 67fcc7397dSJunchao Zhang for (k=0; k<opt->dz[r]; k++) \ 68fcc7397dSJunchao Zhang for (j=0; j<opt->dy[r]; j++) { \ 69fcc7397dSJunchao Zhang ierr = PetscArraycpy(p2,u2+(X*Y*k+X*j)*MBS,opt->dx[r]*MBS);CHKERRQ(ierr); \ 70fcc7397dSJunchao Zhang p2 += opt->dx[r]*MBS; \ 71fcc7397dSJunchao Zhang } \ 72fcc7397dSJunchao Zhang } \ 73fcc7397dSJunchao Zhang } else { \ 74b23bfdefSJunchao Zhang for (i=0; i<count; i++) \ 75eb02082bSJunchao Zhang for (j=0; j<M; j++) /* Decent compilers should eliminate this loop when M = const 1 */ \ 76eb02082bSJunchao Zhang for (k=0; k<BS; k++) /* Compiler either unrolls (BS=1) or vectorizes (BS=2,4,8,etc) this loop */ \ 77b23bfdefSJunchao Zhang p[i*MBS+j*BS+k] = u[idx[i]*MBS+j*BS+k]; \ 7840e23c03SJunchao Zhang } \ 7940e23c03SJunchao Zhang PetscFunctionReturn(0); \ 8040e23c03SJunchao Zhang } 8140e23c03SJunchao Zhang 82cd620004SJunchao Zhang /* DEF_Action - macro defining a UnpackAndInsert routine that unpacks data from a contiguous buffer 83cd620004SJunchao Zhang and inserts into a sparse array. 8440e23c03SJunchao Zhang 8540e23c03SJunchao Zhang Arguments: 86b23bfdefSJunchao Zhang .Type Type of the data 8740e23c03SJunchao Zhang .BS Block size for vectorization 88b23bfdefSJunchao Zhang .EQ (bs == BS) ? 1 : 0. EQ is a compile-time const. 8940e23c03SJunchao Zhang 9040e23c03SJunchao Zhang Notes: 9140e23c03SJunchao Zhang This macro is not combined with DEF_ActionAndOp because we want to use memcpy in this macro. 9240e23c03SJunchao Zhang */ 93cd620004SJunchao Zhang #define DEF_UnpackFunc(Type,BS,EQ) \ 94fcc7397dSJunchao Zhang static PetscErrorCode CPPJoin4(UnpackAndInsert,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,void *unpacked,const void *packed) \ 95b23bfdefSJunchao Zhang { \ 9640e23c03SJunchao Zhang PetscErrorCode ierr; \ 97b23bfdefSJunchao Zhang Type *u = (Type*)unpacked,*u2; \ 98fcc7397dSJunchao Zhang const Type *p = (const Type*)packed; \ 99fcc7397dSJunchao Zhang PetscInt i,j,k,X,Y,r,bs=link->bs; \ 100fcc7397dSJunchao Zhang const PetscInt M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */ \ 101b23bfdefSJunchao Zhang const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */ \ 10240e23c03SJunchao Zhang PetscFunctionBegin; \ 103b23bfdefSJunchao Zhang if (!idx) { \ 104fcc7397dSJunchao Zhang u += start*MBS; \ 105fcc7397dSJunchao Zhang if (u != p) {ierr = PetscArraycpy(u,p,count*MBS);CHKERRQ(ierr);} \ 106fcc7397dSJunchao Zhang } else if (opt) { /* has optimizations available */ \ 107fcc7397dSJunchao Zhang for (r=0; r<opt->n; r++) { \ 108fcc7397dSJunchao Zhang u2 = u + opt->start[r]*MBS; \ 109fcc7397dSJunchao Zhang X = opt->X[r]; \ 110fcc7397dSJunchao Zhang Y = opt->Y[r]; \ 111fcc7397dSJunchao Zhang for (k=0; k<opt->dz[r]; k++) \ 112fcc7397dSJunchao Zhang for (j=0; j<opt->dy[r]; j++) { \ 113fcc7397dSJunchao Zhang ierr = PetscArraycpy(u2+(X*Y*k+X*j)*MBS,p,opt->dx[r]*MBS);CHKERRQ(ierr); \ 114fcc7397dSJunchao Zhang p += opt->dx[r]*MBS; \ 115fcc7397dSJunchao Zhang } \ 116fcc7397dSJunchao Zhang } \ 117fcc7397dSJunchao Zhang } else { \ 118b23bfdefSJunchao Zhang for (i=0; i<count; i++) \ 119b23bfdefSJunchao Zhang for (j=0; j<M; j++) \ 120cd620004SJunchao Zhang for (k=0; k<BS; k++) u[idx[i]*MBS+j*BS+k] = p[i*MBS+j*BS+k]; \ 12140e23c03SJunchao Zhang } \ 12240e23c03SJunchao Zhang PetscFunctionReturn(0); \ 12340e23c03SJunchao Zhang } 12440e23c03SJunchao Zhang 125cd620004SJunchao Zhang /* DEF_UnpackAndOp - macro defining a UnpackAndOp routine where Op should not be Insert 12640e23c03SJunchao Zhang 12740e23c03SJunchao Zhang Arguments: 128cd620004SJunchao Zhang +Opname Name of the Op, such as Add, Mult, LAND, etc. 129b23bfdefSJunchao Zhang .Type Type of the data 13040e23c03SJunchao Zhang .BS Block size for vectorization 131b23bfdefSJunchao Zhang .EQ (bs == BS) ? 1 : 0. EQ is a compile-time const. 132cd620004SJunchao Zhang .Op Operator for the op, such as +, *, &&, ||, PetscMax, PetscMin, etc. 133cd620004SJunchao Zhang .OpApply Macro defining application of the op. Could be OP_BINARY, OP_FUNCTION, OP_LXOR 13440e23c03SJunchao Zhang */ 135cd620004SJunchao Zhang #define DEF_UnpackAndOp(Type,BS,EQ,Opname,Op,OpApply) \ 136fcc7397dSJunchao Zhang static PetscErrorCode CPPJoin4(UnpackAnd##Opname,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,void *unpacked,const void *packed) \ 137b23bfdefSJunchao Zhang { \ 138cd620004SJunchao Zhang Type *u = (Type*)unpacked,*u2; \ 139fcc7397dSJunchao Zhang const Type *p = (const Type*)packed; \ 140fcc7397dSJunchao Zhang PetscInt i,j,k,X,Y,r,bs=link->bs; \ 141fcc7397dSJunchao Zhang const PetscInt M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */ \ 142b23bfdefSJunchao Zhang const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */ \ 14340e23c03SJunchao Zhang PetscFunctionBegin; \ 144b23bfdefSJunchao Zhang if (!idx) { \ 145fcc7397dSJunchao Zhang u += start*MBS; \ 146cd620004SJunchao Zhang for (i=0; i<count; i++) \ 147cd620004SJunchao Zhang for (j=0; j<M; j++) \ 148cd620004SJunchao Zhang for (k=0; k<BS; k++) \ 149cd620004SJunchao Zhang OpApply(Op,u[i*MBS+j*BS+k],p[i*MBS+j*BS+k]); \ 150fcc7397dSJunchao Zhang } else if (opt) { /* idx[] has patterns */ \ 151fcc7397dSJunchao Zhang for (r=0; r<opt->n; r++) { \ 152fcc7397dSJunchao Zhang u2 = u + opt->start[r]*MBS; \ 153fcc7397dSJunchao Zhang X = opt->X[r]; \ 154fcc7397dSJunchao Zhang Y = opt->Y[r]; \ 155fcc7397dSJunchao Zhang for (k=0; k<opt->dz[r]; k++) \ 156fcc7397dSJunchao Zhang for (j=0; j<opt->dy[r]; j++) { \ 157fcc7397dSJunchao Zhang for (i=0; i<opt->dx[r]*MBS; i++) OpApply(Op,u2[(X*Y*k+X*j)*MBS+i],p[i]); \ 158fcc7397dSJunchao Zhang p += opt->dx[r]*MBS; \ 159fcc7397dSJunchao Zhang } \ 160fcc7397dSJunchao Zhang } \ 161fcc7397dSJunchao Zhang } else { \ 162cd620004SJunchao Zhang for (i=0; i<count; i++) \ 163cd620004SJunchao Zhang for (j=0; j<M; j++) \ 164cd620004SJunchao Zhang for (k=0; k<BS; k++) \ 165cd620004SJunchao Zhang OpApply(Op,u[idx[i]*MBS+j*BS+k],p[i*MBS+j*BS+k]); \ 166cd620004SJunchao Zhang } \ 167cd620004SJunchao Zhang PetscFunctionReturn(0); \ 168cd620004SJunchao Zhang } 169cd620004SJunchao Zhang 170cd620004SJunchao Zhang #define DEF_FetchAndOp(Type,BS,EQ,Opname,Op,OpApply) \ 171fcc7397dSJunchao Zhang static PetscErrorCode CPPJoin4(FetchAnd##Opname,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,void *unpacked,void *packed) \ 172cd620004SJunchao Zhang { \ 173fcc7397dSJunchao Zhang Type *u = (Type*)unpacked,*p = (Type*)packed,tmp; \ 174fcc7397dSJunchao Zhang PetscInt i,j,k,r,l,bs=link->bs; \ 175fcc7397dSJunchao Zhang const PetscInt M = (EQ) ? 1 : bs/BS; \ 176fcc7397dSJunchao Zhang const PetscInt MBS = M*BS; \ 177cd620004SJunchao Zhang PetscFunctionBegin; \ 178fcc7397dSJunchao Zhang for (i=0; i<count; i++) { \ 179fcc7397dSJunchao Zhang r = (!idx ? start+i : idx[i])*MBS; \ 180fcc7397dSJunchao Zhang l = i*MBS; \ 181b23bfdefSJunchao Zhang for (j=0; j<M; j++) \ 182b23bfdefSJunchao Zhang for (k=0; k<BS; k++) { \ 183fcc7397dSJunchao Zhang tmp = u[r+j*BS+k]; \ 184fcc7397dSJunchao Zhang OpApply(Op,u[r+j*BS+k],p[l+j*BS+k]); \ 185fcc7397dSJunchao Zhang p[l+j*BS+k] = tmp; \ 186cd620004SJunchao Zhang } \ 187cd620004SJunchao Zhang } \ 188cd620004SJunchao Zhang PetscFunctionReturn(0); \ 189cd620004SJunchao Zhang } 190cd620004SJunchao Zhang 191cd620004SJunchao Zhang #define DEF_ScatterAndOp(Type,BS,EQ,Opname,Op,OpApply) \ 192fcc7397dSJunchao Zhang static PetscErrorCode CPPJoin4(ScatterAnd##Opname,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt srcStart,PetscSFPackOpt srcOpt,const PetscInt *srcIdx,const void *src,PetscInt dstStart,PetscSFPackOpt dstOpt,const PetscInt *dstIdx,void *dst) \ 193cd620004SJunchao Zhang { \ 194fcc7397dSJunchao Zhang PetscErrorCode ierr; \ 195fcc7397dSJunchao Zhang const Type *u = (const Type*)src; \ 196fcc7397dSJunchao Zhang Type *v = (Type*)dst; \ 197fcc7397dSJunchao Zhang PetscInt i,j,k,s,t,X,Y,bs = link->bs; \ 198cd620004SJunchao Zhang const PetscInt M = (EQ) ? 1 : bs/BS; \ 199cd620004SJunchao Zhang const PetscInt MBS = M*BS; \ 200cd620004SJunchao Zhang PetscFunctionBegin; \ 201fcc7397dSJunchao Zhang if (!srcIdx) { /* src is contiguous */ \ 202fcc7397dSJunchao Zhang u += srcStart*MBS; \ 203fcc7397dSJunchao Zhang ierr = CPPJoin4(UnpackAnd##Opname,Type,BS,EQ)(link,count,dstStart,dstOpt,dstIdx,dst,u);CHKERRQ(ierr); \ 204fcc7397dSJunchao Zhang } else if (srcOpt && !dstIdx) { /* src is 3D, dst is contiguous */ \ 205fcc7397dSJunchao Zhang u += srcOpt->start[0]*MBS; \ 206fcc7397dSJunchao Zhang v += dstStart*MBS; \ 207fcc7397dSJunchao Zhang X = srcOpt->X[0]; Y = srcOpt->Y[0]; \ 208fcc7397dSJunchao Zhang for (k=0; k<srcOpt->dz[0]; k++) \ 209fcc7397dSJunchao Zhang for (j=0; j<srcOpt->dy[0]; j++) { \ 210fcc7397dSJunchao Zhang for (i=0; i<srcOpt->dx[0]*MBS; i++) OpApply(Op,v[i],u[(X*Y*k+X*j)*MBS+i]); \ 211fcc7397dSJunchao Zhang v += srcOpt->dx[0]*MBS; \ 212fcc7397dSJunchao Zhang } \ 213fcc7397dSJunchao Zhang } else { /* all other cases */ \ 214fcc7397dSJunchao Zhang for (i=0; i<count; i++) { \ 215fcc7397dSJunchao Zhang s = (!srcIdx ? srcStart+i : srcIdx[i])*MBS; \ 216fcc7397dSJunchao Zhang t = (!dstIdx ? dstStart+i : dstIdx[i])*MBS; \ 217cd620004SJunchao Zhang for (j=0; j<M; j++) \ 218fcc7397dSJunchao Zhang for (k=0; k<BS; k++) OpApply(Op,v[t+j*BS+k],u[s+j*BS+k]); \ 219fcc7397dSJunchao Zhang } \ 220cd620004SJunchao Zhang } \ 221cd620004SJunchao Zhang PetscFunctionReturn(0); \ 222cd620004SJunchao Zhang } 223cd620004SJunchao Zhang 224cd620004SJunchao Zhang #define DEF_FetchAndOpLocal(Type,BS,EQ,Opname,Op,OpApply) \ 225fcc7397dSJunchao Zhang static PetscErrorCode CPPJoin4(FetchAnd##Opname##Local,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt rootstart,PetscSFPackOpt rootopt,const PetscInt *rootidx,void *rootdata,PetscInt leafstart,PetscSFPackOpt leafopt,const PetscInt *leafidx,const void *leafdata,void *leafupdate) \ 226cd620004SJunchao Zhang { \ 227fcc7397dSJunchao Zhang Type *rdata = (Type*)rootdata,*lupdate = (Type*)leafupdate; \ 228fcc7397dSJunchao Zhang const Type *ldata = (const Type*)leafdata; \ 229fcc7397dSJunchao Zhang PetscInt i,j,k,r,l,bs = link->bs; \ 230cd620004SJunchao Zhang const PetscInt M = (EQ) ? 1 : bs/BS; \ 231cd620004SJunchao Zhang const PetscInt MBS = M*BS; \ 232cd620004SJunchao Zhang PetscFunctionBegin; \ 233fcc7397dSJunchao Zhang for (i=0; i<count; i++) { \ 234fcc7397dSJunchao Zhang r = (rootidx ? rootidx[i] : rootstart+i)*MBS; \ 235fcc7397dSJunchao Zhang l = (leafidx ? leafidx[i] : leafstart+i)*MBS; \ 236cd620004SJunchao Zhang for (j=0; j<M; j++) \ 237cd620004SJunchao Zhang for (k=0; k<BS; k++) { \ 238fcc7397dSJunchao Zhang lupdate[l+j*BS+k] = rdata[r+j*BS+k]; \ 239fcc7397dSJunchao Zhang OpApply(Op,rdata[r+j*BS+k],ldata[l+j*BS+k]); \ 24040e23c03SJunchao Zhang } \ 24140e23c03SJunchao Zhang } \ 24240e23c03SJunchao Zhang PetscFunctionReturn(0); \ 24340e23c03SJunchao Zhang } 24440e23c03SJunchao Zhang 245b23bfdefSJunchao Zhang /* Pack, Unpack/Fetch ops */ 246b23bfdefSJunchao Zhang #define DEF_Pack(Type,BS,EQ) \ 247b23bfdefSJunchao Zhang DEF_PackFunc(Type,BS,EQ) \ 248cd620004SJunchao Zhang DEF_UnpackFunc(Type,BS,EQ) \ 249cd620004SJunchao Zhang DEF_ScatterAndOp(Type,BS,EQ,Insert,=,OP_ASSIGN) \ 250cd620004SJunchao Zhang static void CPPJoin4(PackInit_Pack,Type,BS,EQ)(PetscSFLink link) { \ 251eb02082bSJunchao Zhang link->h_Pack = CPPJoin4(Pack, Type,BS,EQ); \ 252eb02082bSJunchao Zhang link->h_UnpackAndInsert = CPPJoin4(UnpackAndInsert,Type,BS,EQ); \ 253cd620004SJunchao Zhang link->h_ScatterAndInsert= CPPJoin4(ScatterAndInsert,Type,BS,EQ); \ 25440e23c03SJunchao Zhang } 25540e23c03SJunchao Zhang 256b23bfdefSJunchao Zhang /* Add, Mult ops */ 257b23bfdefSJunchao Zhang #define DEF_Add(Type,BS,EQ) \ 258cd620004SJunchao Zhang DEF_UnpackAndOp (Type,BS,EQ,Add, +,OP_BINARY) \ 259cd620004SJunchao Zhang DEF_UnpackAndOp (Type,BS,EQ,Mult,*,OP_BINARY) \ 260cd620004SJunchao Zhang DEF_FetchAndOp (Type,BS,EQ,Add, +,OP_BINARY) \ 261cd620004SJunchao Zhang DEF_ScatterAndOp (Type,BS,EQ,Add, +,OP_BINARY) \ 262cd620004SJunchao Zhang DEF_ScatterAndOp (Type,BS,EQ,Mult,*,OP_BINARY) \ 263cd620004SJunchao Zhang DEF_FetchAndOpLocal(Type,BS,EQ,Add, +,OP_BINARY) \ 264cd620004SJunchao Zhang static void CPPJoin4(PackInit_Add,Type,BS,EQ)(PetscSFLink link) { \ 265eb02082bSJunchao Zhang link->h_UnpackAndAdd = CPPJoin4(UnpackAndAdd, Type,BS,EQ); \ 266eb02082bSJunchao Zhang link->h_UnpackAndMult = CPPJoin4(UnpackAndMult, Type,BS,EQ); \ 267eb02082bSJunchao Zhang link->h_FetchAndAdd = CPPJoin4(FetchAndAdd, Type,BS,EQ); \ 268cd620004SJunchao Zhang link->h_ScatterAndAdd = CPPJoin4(ScatterAndAdd, Type,BS,EQ); \ 269cd620004SJunchao Zhang link->h_ScatterAndMult = CPPJoin4(ScatterAndMult, Type,BS,EQ); \ 270cd620004SJunchao Zhang link->h_FetchAndAddLocal = CPPJoin4(FetchAndAddLocal,Type,BS,EQ); \ 27140e23c03SJunchao Zhang } 27240e23c03SJunchao Zhang 273b23bfdefSJunchao Zhang /* Max, Min ops */ 274b23bfdefSJunchao Zhang #define DEF_Cmp(Type,BS,EQ) \ 275cd620004SJunchao Zhang DEF_UnpackAndOp (Type,BS,EQ,Max,PetscMax,OP_FUNCTION) \ 276cd620004SJunchao Zhang DEF_UnpackAndOp (Type,BS,EQ,Min,PetscMin,OP_FUNCTION) \ 277cd620004SJunchao Zhang DEF_ScatterAndOp(Type,BS,EQ,Max,PetscMax,OP_FUNCTION) \ 278cd620004SJunchao Zhang DEF_ScatterAndOp(Type,BS,EQ,Min,PetscMin,OP_FUNCTION) \ 279cd620004SJunchao Zhang static void CPPJoin4(PackInit_Compare,Type,BS,EQ)(PetscSFLink link) { \ 280eb02082bSJunchao Zhang link->h_UnpackAndMax = CPPJoin4(UnpackAndMax, Type,BS,EQ); \ 281eb02082bSJunchao Zhang link->h_UnpackAndMin = CPPJoin4(UnpackAndMin, Type,BS,EQ); \ 282cd620004SJunchao Zhang link->h_ScatterAndMax = CPPJoin4(ScatterAndMax, Type,BS,EQ); \ 283cd620004SJunchao Zhang link->h_ScatterAndMin = CPPJoin4(ScatterAndMin, Type,BS,EQ); \ 284b23bfdefSJunchao Zhang } 285b23bfdefSJunchao Zhang 286b23bfdefSJunchao Zhang /* Logical ops. 287cd620004SJunchao Zhang The operator in OP_LXOR should be empty but is ||. It is not used. Put here to avoid 28840e23c03SJunchao Zhang the compilation warning "empty macro arguments are undefined in ISO C90" 28940e23c03SJunchao Zhang */ 290b23bfdefSJunchao Zhang #define DEF_Log(Type,BS,EQ) \ 291cd620004SJunchao Zhang DEF_UnpackAndOp (Type,BS,EQ,LAND,&&,OP_BINARY) \ 292cd620004SJunchao Zhang DEF_UnpackAndOp (Type,BS,EQ,LOR, ||,OP_BINARY) \ 293cd620004SJunchao Zhang DEF_UnpackAndOp (Type,BS,EQ,LXOR,||, OP_LXOR) \ 294cd620004SJunchao Zhang DEF_ScatterAndOp(Type,BS,EQ,LAND,&&,OP_BINARY) \ 295cd620004SJunchao Zhang DEF_ScatterAndOp(Type,BS,EQ,LOR, ||,OP_BINARY) \ 296cd620004SJunchao Zhang DEF_ScatterAndOp(Type,BS,EQ,LXOR,||, OP_LXOR) \ 297cd620004SJunchao Zhang static void CPPJoin4(PackInit_Logical,Type,BS,EQ)(PetscSFLink link) { \ 298eb02082bSJunchao Zhang link->h_UnpackAndLAND = CPPJoin4(UnpackAndLAND, Type,BS,EQ); \ 299eb02082bSJunchao Zhang link->h_UnpackAndLOR = CPPJoin4(UnpackAndLOR, Type,BS,EQ); \ 300eb02082bSJunchao Zhang link->h_UnpackAndLXOR = CPPJoin4(UnpackAndLXOR, Type,BS,EQ); \ 301cd620004SJunchao Zhang link->h_ScatterAndLAND = CPPJoin4(ScatterAndLAND,Type,BS,EQ); \ 302cd620004SJunchao Zhang link->h_ScatterAndLOR = CPPJoin4(ScatterAndLOR, Type,BS,EQ); \ 303cd620004SJunchao Zhang link->h_ScatterAndLXOR = CPPJoin4(ScatterAndLXOR,Type,BS,EQ); \ 30440e23c03SJunchao Zhang } 30540e23c03SJunchao Zhang 306b23bfdefSJunchao Zhang /* Bitwise ops */ 307b23bfdefSJunchao Zhang #define DEF_Bit(Type,BS,EQ) \ 308cd620004SJunchao Zhang DEF_UnpackAndOp (Type,BS,EQ,BAND,&,OP_BINARY) \ 309cd620004SJunchao Zhang DEF_UnpackAndOp (Type,BS,EQ,BOR, |,OP_BINARY) \ 310cd620004SJunchao Zhang DEF_UnpackAndOp (Type,BS,EQ,BXOR,^,OP_BINARY) \ 311cd620004SJunchao Zhang DEF_ScatterAndOp(Type,BS,EQ,BAND,&,OP_BINARY) \ 312cd620004SJunchao Zhang DEF_ScatterAndOp(Type,BS,EQ,BOR, |,OP_BINARY) \ 313cd620004SJunchao Zhang DEF_ScatterAndOp(Type,BS,EQ,BXOR,^,OP_BINARY) \ 314cd620004SJunchao Zhang static void CPPJoin4(PackInit_Bitwise,Type,BS,EQ)(PetscSFLink link) { \ 315eb02082bSJunchao Zhang link->h_UnpackAndBAND = CPPJoin4(UnpackAndBAND, Type,BS,EQ); \ 316eb02082bSJunchao Zhang link->h_UnpackAndBOR = CPPJoin4(UnpackAndBOR, Type,BS,EQ); \ 317eb02082bSJunchao Zhang link->h_UnpackAndBXOR = CPPJoin4(UnpackAndBXOR, Type,BS,EQ); \ 318cd620004SJunchao Zhang link->h_ScatterAndBAND = CPPJoin4(ScatterAndBAND,Type,BS,EQ); \ 319cd620004SJunchao Zhang link->h_ScatterAndBOR = CPPJoin4(ScatterAndBOR, Type,BS,EQ); \ 320cd620004SJunchao Zhang link->h_ScatterAndBXOR = CPPJoin4(ScatterAndBXOR,Type,BS,EQ); \ 32140e23c03SJunchao Zhang } 32240e23c03SJunchao Zhang 323cd620004SJunchao Zhang /* Maxloc, Minloc ops */ 324cd620004SJunchao Zhang #define DEF_Xloc(Type,BS,EQ) \ 325cd620004SJunchao Zhang DEF_UnpackAndOp (Type,BS,EQ,Max,>,OP_XLOC) \ 326cd620004SJunchao Zhang DEF_UnpackAndOp (Type,BS,EQ,Min,<,OP_XLOC) \ 327cd620004SJunchao Zhang DEF_ScatterAndOp(Type,BS,EQ,Max,>,OP_XLOC) \ 328cd620004SJunchao Zhang DEF_ScatterAndOp(Type,BS,EQ,Min,<,OP_XLOC) \ 329cd620004SJunchao Zhang static void CPPJoin4(PackInit_Xloc,Type,BS,EQ)(PetscSFLink link) { \ 330cd620004SJunchao Zhang link->h_UnpackAndMaxloc = CPPJoin4(UnpackAndMax, Type,BS,EQ); \ 331cd620004SJunchao Zhang link->h_UnpackAndMinloc = CPPJoin4(UnpackAndMin, Type,BS,EQ); \ 332cd620004SJunchao Zhang link->h_ScatterAndMaxloc = CPPJoin4(ScatterAndMax,Type,BS,EQ); \ 333cd620004SJunchao Zhang link->h_ScatterAndMinloc = CPPJoin4(ScatterAndMin,Type,BS,EQ); \ 33440e23c03SJunchao Zhang } 33540e23c03SJunchao Zhang 336b23bfdefSJunchao Zhang #define DEF_IntegerType(Type,BS,EQ) \ 337b23bfdefSJunchao Zhang DEF_Pack(Type,BS,EQ) \ 338b23bfdefSJunchao Zhang DEF_Add(Type,BS,EQ) \ 339b23bfdefSJunchao Zhang DEF_Cmp(Type,BS,EQ) \ 340b23bfdefSJunchao Zhang DEF_Log(Type,BS,EQ) \ 341b23bfdefSJunchao Zhang DEF_Bit(Type,BS,EQ) \ 342cd620004SJunchao Zhang static void CPPJoin4(PackInit_IntegerType,Type,BS,EQ)(PetscSFLink link) { \ 343b23bfdefSJunchao Zhang CPPJoin4(PackInit_Pack,Type,BS,EQ)(link); \ 344b23bfdefSJunchao Zhang CPPJoin4(PackInit_Add,Type,BS,EQ)(link); \ 345b23bfdefSJunchao Zhang CPPJoin4(PackInit_Compare,Type,BS,EQ)(link); \ 346b23bfdefSJunchao Zhang CPPJoin4(PackInit_Logical,Type,BS,EQ)(link); \ 347b23bfdefSJunchao Zhang CPPJoin4(PackInit_Bitwise,Type,BS,EQ)(link); \ 34840e23c03SJunchao Zhang } 34940e23c03SJunchao Zhang 350b23bfdefSJunchao Zhang #define DEF_RealType(Type,BS,EQ) \ 351b23bfdefSJunchao Zhang DEF_Pack(Type,BS,EQ) \ 352b23bfdefSJunchao Zhang DEF_Add(Type,BS,EQ) \ 353b23bfdefSJunchao Zhang DEF_Cmp(Type,BS,EQ) \ 354cd620004SJunchao Zhang static void CPPJoin4(PackInit_RealType,Type,BS,EQ)(PetscSFLink link) { \ 355b23bfdefSJunchao Zhang CPPJoin4(PackInit_Pack,Type,BS,EQ)(link); \ 356b23bfdefSJunchao Zhang CPPJoin4(PackInit_Add,Type,BS,EQ)(link); \ 357b23bfdefSJunchao Zhang CPPJoin4(PackInit_Compare,Type,BS,EQ)(link); \ 358b23bfdefSJunchao Zhang } 35940e23c03SJunchao Zhang 36040e23c03SJunchao Zhang #if defined(PETSC_HAVE_COMPLEX) 361b23bfdefSJunchao Zhang #define DEF_ComplexType(Type,BS,EQ) \ 362b23bfdefSJunchao Zhang DEF_Pack(Type,BS,EQ) \ 363b23bfdefSJunchao Zhang DEF_Add(Type,BS,EQ) \ 364cd620004SJunchao Zhang static void CPPJoin4(PackInit_ComplexType,Type,BS,EQ)(PetscSFLink link) { \ 365b23bfdefSJunchao Zhang CPPJoin4(PackInit_Pack,Type,BS,EQ)(link); \ 366b23bfdefSJunchao Zhang CPPJoin4(PackInit_Add,Type,BS,EQ)(link); \ 367b23bfdefSJunchao Zhang } 36840e23c03SJunchao Zhang #endif 369b23bfdefSJunchao Zhang 370b23bfdefSJunchao Zhang #define DEF_DumbType(Type,BS,EQ) \ 371b23bfdefSJunchao Zhang DEF_Pack(Type,BS,EQ) \ 372cd620004SJunchao Zhang static void CPPJoin4(PackInit_DumbType,Type,BS,EQ)(PetscSFLink link) { \ 373b23bfdefSJunchao Zhang CPPJoin4(PackInit_Pack,Type,BS,EQ)(link); \ 374b23bfdefSJunchao Zhang } 375b23bfdefSJunchao Zhang 376b23bfdefSJunchao Zhang /* Maxloc, Minloc */ 377cd620004SJunchao Zhang #define DEF_PairType(Type,BS,EQ) \ 378cd620004SJunchao Zhang DEF_Pack(Type,BS,EQ) \ 379cd620004SJunchao Zhang DEF_Xloc(Type,BS,EQ) \ 380cd620004SJunchao Zhang static void CPPJoin4(PackInit_PairType,Type,BS,EQ)(PetscSFLink link) { \ 381cd620004SJunchao Zhang CPPJoin4(PackInit_Pack,Type,BS,EQ)(link); \ 382cd620004SJunchao Zhang CPPJoin4(PackInit_Xloc,Type,BS,EQ)(link); \ 383b23bfdefSJunchao Zhang } 384b23bfdefSJunchao Zhang 385b23bfdefSJunchao Zhang DEF_IntegerType(PetscInt,1,1) /* unit = 1 MPIU_INT */ 386b23bfdefSJunchao Zhang DEF_IntegerType(PetscInt,2,1) /* unit = 2 MPIU_INTs */ 387b23bfdefSJunchao Zhang DEF_IntegerType(PetscInt,4,1) /* unit = 4 MPIU_INTs */ 388b23bfdefSJunchao Zhang DEF_IntegerType(PetscInt,8,1) /* unit = 8 MPIU_INTs */ 389b23bfdefSJunchao Zhang DEF_IntegerType(PetscInt,1,0) /* unit = 1*n MPIU_INTs, n>1 */ 390b23bfdefSJunchao Zhang DEF_IntegerType(PetscInt,2,0) /* unit = 2*n MPIU_INTs, n>1 */ 391b23bfdefSJunchao Zhang DEF_IntegerType(PetscInt,4,0) /* unit = 4*n MPIU_INTs, n>1 */ 392b23bfdefSJunchao Zhang DEF_IntegerType(PetscInt,8,0) /* unit = 8*n MPIU_INTs, n>1. Routines with bigger BS are tried first. */ 393b23bfdefSJunchao Zhang 394b23bfdefSJunchao Zhang #if defined(PETSC_USE_64BIT_INDICES) /* Do not need (though it is OK) to generate redundant functions if PetscInt is int */ 395b23bfdefSJunchao Zhang DEF_IntegerType(int,1,1) 396b23bfdefSJunchao Zhang DEF_IntegerType(int,2,1) 397b23bfdefSJunchao Zhang DEF_IntegerType(int,4,1) 398b23bfdefSJunchao Zhang DEF_IntegerType(int,8,1) 399b23bfdefSJunchao Zhang DEF_IntegerType(int,1,0) 400b23bfdefSJunchao Zhang DEF_IntegerType(int,2,0) 401b23bfdefSJunchao Zhang DEF_IntegerType(int,4,0) 402b23bfdefSJunchao Zhang DEF_IntegerType(int,8,0) 403b23bfdefSJunchao Zhang #endif 404b23bfdefSJunchao Zhang 405b23bfdefSJunchao Zhang /* The typedefs are used to get a typename without space that CPPJoin can handle */ 406b23bfdefSJunchao Zhang typedef signed char SignedChar; 407b23bfdefSJunchao Zhang DEF_IntegerType(SignedChar,1,1) 408b23bfdefSJunchao Zhang DEF_IntegerType(SignedChar,2,1) 409b23bfdefSJunchao Zhang DEF_IntegerType(SignedChar,4,1) 410b23bfdefSJunchao Zhang DEF_IntegerType(SignedChar,8,1) 411b23bfdefSJunchao Zhang DEF_IntegerType(SignedChar,1,0) 412b23bfdefSJunchao Zhang DEF_IntegerType(SignedChar,2,0) 413b23bfdefSJunchao Zhang DEF_IntegerType(SignedChar,4,0) 414b23bfdefSJunchao Zhang DEF_IntegerType(SignedChar,8,0) 415b23bfdefSJunchao Zhang 416b23bfdefSJunchao Zhang typedef unsigned char UnsignedChar; 417b23bfdefSJunchao Zhang DEF_IntegerType(UnsignedChar,1,1) 418b23bfdefSJunchao Zhang DEF_IntegerType(UnsignedChar,2,1) 419b23bfdefSJunchao Zhang DEF_IntegerType(UnsignedChar,4,1) 420b23bfdefSJunchao Zhang DEF_IntegerType(UnsignedChar,8,1) 421b23bfdefSJunchao Zhang DEF_IntegerType(UnsignedChar,1,0) 422b23bfdefSJunchao Zhang DEF_IntegerType(UnsignedChar,2,0) 423b23bfdefSJunchao Zhang DEF_IntegerType(UnsignedChar,4,0) 424b23bfdefSJunchao Zhang DEF_IntegerType(UnsignedChar,8,0) 425b23bfdefSJunchao Zhang 426b23bfdefSJunchao Zhang DEF_RealType(PetscReal,1,1) 427b23bfdefSJunchao Zhang DEF_RealType(PetscReal,2,1) 428b23bfdefSJunchao Zhang DEF_RealType(PetscReal,4,1) 429b23bfdefSJunchao Zhang DEF_RealType(PetscReal,8,1) 430b23bfdefSJunchao Zhang DEF_RealType(PetscReal,1,0) 431b23bfdefSJunchao Zhang DEF_RealType(PetscReal,2,0) 432b23bfdefSJunchao Zhang DEF_RealType(PetscReal,4,0) 433b23bfdefSJunchao Zhang DEF_RealType(PetscReal,8,0) 434b23bfdefSJunchao Zhang 435b23bfdefSJunchao Zhang #if defined(PETSC_HAVE_COMPLEX) 436b23bfdefSJunchao Zhang DEF_ComplexType(PetscComplex,1,1) 437b23bfdefSJunchao Zhang DEF_ComplexType(PetscComplex,2,1) 438b23bfdefSJunchao Zhang DEF_ComplexType(PetscComplex,4,1) 439b23bfdefSJunchao Zhang DEF_ComplexType(PetscComplex,8,1) 440b23bfdefSJunchao Zhang DEF_ComplexType(PetscComplex,1,0) 441b23bfdefSJunchao Zhang DEF_ComplexType(PetscComplex,2,0) 442b23bfdefSJunchao Zhang DEF_ComplexType(PetscComplex,4,0) 443b23bfdefSJunchao Zhang DEF_ComplexType(PetscComplex,8,0) 444b23bfdefSJunchao Zhang #endif 445b23bfdefSJunchao Zhang 446cd620004SJunchao Zhang #define PairType(Type1,Type2) Type1##_##Type2 447cd620004SJunchao Zhang typedef struct {int u; int i;} PairType(int,int); 448cd620004SJunchao Zhang typedef struct {PetscInt u; PetscInt i;} PairType(PetscInt,PetscInt); 449cd620004SJunchao Zhang DEF_PairType(PairType(int,int),1,1) 450cd620004SJunchao Zhang DEF_PairType(PairType(PetscInt,PetscInt),1,1) 451b23bfdefSJunchao Zhang 452b23bfdefSJunchao Zhang /* If we don't know the basic type, we treat it as a stream of chars or ints */ 453b23bfdefSJunchao Zhang DEF_DumbType(char,1,1) 454b23bfdefSJunchao Zhang DEF_DumbType(char,2,1) 455b23bfdefSJunchao Zhang DEF_DumbType(char,4,1) 456b23bfdefSJunchao Zhang DEF_DumbType(char,1,0) 457b23bfdefSJunchao Zhang DEF_DumbType(char,2,0) 458b23bfdefSJunchao Zhang DEF_DumbType(char,4,0) 459b23bfdefSJunchao Zhang 460eb02082bSJunchao Zhang typedef int DumbInt; /* To have a different name than 'int' used above. The name is used to make routine names. */ 461b23bfdefSJunchao Zhang DEF_DumbType(DumbInt,1,1) 462b23bfdefSJunchao Zhang DEF_DumbType(DumbInt,2,1) 463b23bfdefSJunchao Zhang DEF_DumbType(DumbInt,4,1) 464b23bfdefSJunchao Zhang DEF_DumbType(DumbInt,8,1) 465b23bfdefSJunchao Zhang DEF_DumbType(DumbInt,1,0) 466b23bfdefSJunchao Zhang DEF_DumbType(DumbInt,2,0) 467b23bfdefSJunchao Zhang DEF_DumbType(DumbInt,4,0) 468b23bfdefSJunchao Zhang DEF_DumbType(DumbInt,8,0) 46940e23c03SJunchao Zhang 47040e23c03SJunchao Zhang #if !defined(PETSC_HAVE_MPI_TYPE_DUP) 47140e23c03SJunchao Zhang PETSC_STATIC_INLINE int MPI_Type_dup(MPI_Datatype datatype,MPI_Datatype *newtype) 47240e23c03SJunchao Zhang { 47340e23c03SJunchao Zhang int ierr; 47440e23c03SJunchao Zhang ierr = MPI_Type_contiguous(1,datatype,newtype); if (ierr) return ierr; 47540e23c03SJunchao Zhang ierr = MPI_Type_commit(newtype); if (ierr) return ierr; 47640e23c03SJunchao Zhang return MPI_SUCCESS; 47740e23c03SJunchao Zhang } 47840e23c03SJunchao Zhang #endif 47940e23c03SJunchao Zhang 480*71438e86SJunchao Zhang PetscErrorCode PetscSFLinkDestroy(PetscSF sf,PetscSFLink link) 48140e23c03SJunchao Zhang { 48240e23c03SJunchao Zhang PetscErrorCode ierr; 483cd620004SJunchao Zhang PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 484*71438e86SJunchao Zhang PetscInt i,nreqs = (bas->nrootreqs+sf->nleafreqs)*8; 485*71438e86SJunchao Zhang 486*71438e86SJunchao Zhang PetscFunctionBegin; 487*71438e86SJunchao Zhang /* Destroy device-specific fields */ 488*71438e86SJunchao Zhang if (link->deviceinited) {ierr = (*link->Destroy)(sf,link);CHKERRQ(ierr);} 489*71438e86SJunchao Zhang 490*71438e86SJunchao Zhang /* Destroy host related fields */ 491*71438e86SJunchao Zhang if (!link->isbuiltin) {ierr = MPI_Type_free(&link->unit);CHKERRMPI(ierr);} 492*71438e86SJunchao Zhang if (!link->use_nvshmem) { 493*71438e86SJunchao Zhang for (i=0; i<nreqs; i++) { /* Persistent reqs must be freed. */ 494*71438e86SJunchao Zhang if (link->reqs[i] != MPI_REQUEST_NULL) {ierr = MPI_Request_free(&link->reqs[i]);CHKERRMPI(ierr);} 495*71438e86SJunchao Zhang } 496*71438e86SJunchao Zhang ierr = PetscFree(link->reqs);CHKERRQ(ierr); 497*71438e86SJunchao Zhang for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) { 498*71438e86SJunchao Zhang ierr = PetscFree(link->rootbuf_alloc[i][PETSC_MEMTYPE_HOST]);CHKERRQ(ierr); 499*71438e86SJunchao Zhang ierr = PetscFree(link->leafbuf_alloc[i][PETSC_MEMTYPE_HOST]);CHKERRQ(ierr); 500*71438e86SJunchao Zhang } 501*71438e86SJunchao Zhang } 502*71438e86SJunchao Zhang ierr = PetscFree(link);CHKERRQ(ierr); 503*71438e86SJunchao Zhang PetscFunctionReturn(0); 504*71438e86SJunchao Zhang } 505*71438e86SJunchao Zhang 506*71438e86SJunchao Zhang PetscErrorCode PetscSFLinkCreate(PetscSF sf,MPI_Datatype unit,PetscMemType rootmtype,const void *rootdata,PetscMemType leafmtype,const void *leafdata,MPI_Op op,PetscSFOperation sfop,PetscSFLink *mylink) 507*71438e86SJunchao Zhang { 508*71438e86SJunchao Zhang PetscErrorCode ierr; 509cd620004SJunchao Zhang 510cd620004SJunchao Zhang PetscFunctionBegin; 511cd620004SJunchao Zhang ierr = PetscSFSetErrorOnUnsupportedOverlap(sf,unit,rootdata,leafdata);CHKERRQ(ierr); 512*71438e86SJunchao Zhang #if defined(PETSC_HAVE_NVSHMEM) 513*71438e86SJunchao Zhang { 514*71438e86SJunchao Zhang PetscBool use_nvshmem; 515*71438e86SJunchao Zhang ierr = PetscSFLinkNvshmemCheck(sf,rootmtype,rootdata,leafmtype,leafdata,&use_nvshmem);CHKERRQ(ierr); 516*71438e86SJunchao Zhang if (use_nvshmem) { 517*71438e86SJunchao Zhang ierr = PetscSFLinkCreate_NVSHMEM(sf,unit,rootmtype,rootdata,leafmtype,leafdata,op,sfop,mylink);CHKERRQ(ierr); 518*71438e86SJunchao Zhang PetscFunctionReturn(0); 519cd620004SJunchao Zhang } 520cd620004SJunchao Zhang } 5217fd2d3dbSJunchao Zhang #endif 522*71438e86SJunchao Zhang ierr = PetscSFLinkCreate_MPI(sf,unit,rootmtype,rootdata,leafmtype,leafdata,op,sfop,mylink);CHKERRQ(ierr); 523cd620004SJunchao Zhang PetscFunctionReturn(0); 524cd620004SJunchao Zhang } 525cd620004SJunchao Zhang 526cd620004SJunchao Zhang /* Return root/leaf buffers and MPI requests attached to the link for MPI communication in the given direction. 527cd620004SJunchao Zhang If the sf uses persistent requests and the requests have not been initialized, then initialize them. 528cd620004SJunchao Zhang */ 529cd620004SJunchao Zhang PetscErrorCode PetscSFLinkGetMPIBuffersAndRequests(PetscSF sf,PetscSFLink link,PetscSFDirection direction,void **rootbuf, void **leafbuf,MPI_Request **rootreqs,MPI_Request **leafreqs) 530cd620004SJunchao Zhang { 531cd620004SJunchao Zhang PetscErrorCode ierr; 532cd620004SJunchao Zhang PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 533cd620004SJunchao Zhang PetscInt i,j,nrootranks,ndrootranks,nleafranks,ndleafranks; 534cd620004SJunchao Zhang const PetscInt *rootoffset,*leafoffset; 535cd620004SJunchao Zhang PetscMPIInt n; 536cd620004SJunchao Zhang MPI_Aint disp; 537cd620004SJunchao Zhang MPI_Comm comm = PetscObjectComm((PetscObject)sf); 538cd620004SJunchao Zhang MPI_Datatype unit = link->unit; 539cd620004SJunchao Zhang const PetscMemType rootmtype_mpi = link->rootmtype_mpi,leafmtype_mpi = link->leafmtype_mpi; /* Used to select buffers passed to MPI */ 540cd620004SJunchao Zhang const PetscInt rootdirect_mpi = link->rootdirect_mpi,leafdirect_mpi = link->leafdirect_mpi; 541cd620004SJunchao Zhang 542cd620004SJunchao Zhang PetscFunctionBegin; 543cd620004SJunchao Zhang /* Init persistent MPI requests if not yet. Currently only SFBasic uses persistent MPI */ 544cd620004SJunchao Zhang if (sf->persistent) { 545cd620004SJunchao Zhang if (rootreqs && bas->rootbuflen[PETSCSF_REMOTE] && !link->rootreqsinited[direction][rootmtype_mpi][rootdirect_mpi]) { 546cd620004SJunchao Zhang ierr = PetscSFGetRootInfo_Basic(sf,&nrootranks,&ndrootranks,NULL,&rootoffset,NULL);CHKERRQ(ierr); 547cd620004SJunchao Zhang if (direction == PETSCSF_LEAF2ROOT) { 548cd620004SJunchao Zhang for (i=ndrootranks,j=0; i<nrootranks; i++,j++) { 549cd620004SJunchao Zhang disp = (rootoffset[i] - rootoffset[ndrootranks])*link->unitbytes; 550cd620004SJunchao Zhang ierr = PetscMPIIntCast(rootoffset[i+1]-rootoffset[i],&n);CHKERRQ(ierr); 551ffc4695bSBarry Smith ierr = MPI_Recv_init(link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi]+disp,n,unit,bas->iranks[i],link->tag,comm,link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi]+j);CHKERRMPI(ierr); 552cd620004SJunchao Zhang } 553cd620004SJunchao Zhang } else { /* PETSCSF_ROOT2LEAF */ 554cd620004SJunchao Zhang for (i=ndrootranks,j=0; i<nrootranks; i++,j++) { 555cd620004SJunchao Zhang disp = (rootoffset[i] - rootoffset[ndrootranks])*link->unitbytes; 556cd620004SJunchao Zhang ierr = PetscMPIIntCast(rootoffset[i+1]-rootoffset[i],&n);CHKERRQ(ierr); 557ffc4695bSBarry Smith ierr = MPI_Send_init(link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi]+disp,n,unit,bas->iranks[i],link->tag,comm,link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi]+j);CHKERRMPI(ierr); 558cd620004SJunchao Zhang } 559cd620004SJunchao Zhang } 560cd620004SJunchao Zhang link->rootreqsinited[direction][rootmtype_mpi][rootdirect_mpi] = PETSC_TRUE; 561cd620004SJunchao Zhang } 562cd620004SJunchao Zhang 563cd620004SJunchao Zhang if (leafreqs && sf->leafbuflen[PETSCSF_REMOTE] && !link->leafreqsinited[direction][leafmtype_mpi][leafdirect_mpi]) { 564cd620004SJunchao Zhang ierr = PetscSFGetLeafInfo_Basic(sf,&nleafranks,&ndleafranks,NULL,&leafoffset,NULL,NULL);CHKERRQ(ierr); 565cd620004SJunchao Zhang if (direction == PETSCSF_LEAF2ROOT) { 566cd620004SJunchao Zhang for (i=ndleafranks,j=0; i<nleafranks; i++,j++) { 567cd620004SJunchao Zhang disp = (leafoffset[i] - leafoffset[ndleafranks])*link->unitbytes; 568cd620004SJunchao Zhang ierr = PetscMPIIntCast(leafoffset[i+1]-leafoffset[i],&n);CHKERRQ(ierr); 569ffc4695bSBarry Smith ierr = MPI_Send_init(link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi]+disp,n,unit,sf->ranks[i],link->tag,comm,link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi]+j);CHKERRMPI(ierr); 570cd620004SJunchao Zhang } 571cd620004SJunchao Zhang } else { /* PETSCSF_ROOT2LEAF */ 572cd620004SJunchao Zhang for (i=ndleafranks,j=0; i<nleafranks; i++,j++) { 573cd620004SJunchao Zhang disp = (leafoffset[i] - leafoffset[ndleafranks])*link->unitbytes; 574cd620004SJunchao Zhang ierr = PetscMPIIntCast(leafoffset[i+1]-leafoffset[i],&n);CHKERRQ(ierr); 575ffc4695bSBarry Smith ierr = MPI_Recv_init(link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi]+disp,n,unit,sf->ranks[i],link->tag,comm,link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi]+j);CHKERRMPI(ierr); 576cd620004SJunchao Zhang } 577cd620004SJunchao Zhang } 578cd620004SJunchao Zhang link->leafreqsinited[direction][leafmtype_mpi][leafdirect_mpi] = PETSC_TRUE; 579cd620004SJunchao Zhang } 580cd620004SJunchao Zhang } 581cd620004SJunchao Zhang if (rootbuf) *rootbuf = link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi]; 582cd620004SJunchao Zhang if (leafbuf) *leafbuf = link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi]; 583cd620004SJunchao Zhang if (rootreqs) *rootreqs = link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi]; 584cd620004SJunchao Zhang if (leafreqs) *leafreqs = link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi]; 585cd620004SJunchao Zhang PetscFunctionReturn(0); 586cd620004SJunchao Zhang } 587cd620004SJunchao Zhang 588cd620004SJunchao Zhang 589cd620004SJunchao Zhang PetscErrorCode PetscSFLinkGetInUse(PetscSF sf,MPI_Datatype unit,const void *rootdata,const void *leafdata,PetscCopyMode cmode,PetscSFLink *mylink) 590cd620004SJunchao Zhang { 591cd620004SJunchao Zhang PetscErrorCode ierr; 592cd620004SJunchao Zhang PetscSFLink link,*p; 59340e23c03SJunchao Zhang PetscSF_Basic *bas=(PetscSF_Basic*)sf->data; 59440e23c03SJunchao Zhang 59540e23c03SJunchao Zhang PetscFunctionBegin; 59640e23c03SJunchao Zhang /* Look for types in cache */ 59740e23c03SJunchao Zhang for (p=&bas->inuse; (link=*p); p=&link->next) { 59840e23c03SJunchao Zhang PetscBool match; 59940e23c03SJunchao Zhang ierr = MPIPetsc_Type_compare(unit,link->unit,&match);CHKERRQ(ierr); 600637e6665SJunchao Zhang if (match && (rootdata == link->rootdata) && (leafdata == link->leafdata)) { 60140e23c03SJunchao Zhang switch (cmode) { 60240e23c03SJunchao Zhang case PETSC_OWN_POINTER: *p = link->next; break; /* Remove from inuse list */ 60340e23c03SJunchao Zhang case PETSC_USE_POINTER: break; 60440e23c03SJunchao Zhang default: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"invalid cmode"); 60540e23c03SJunchao Zhang } 60640e23c03SJunchao Zhang *mylink = link; 60740e23c03SJunchao Zhang PetscFunctionReturn(0); 60840e23c03SJunchao Zhang } 60940e23c03SJunchao Zhang } 61040e23c03SJunchao Zhang SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Could not find pack"); 61140e23c03SJunchao Zhang } 61240e23c03SJunchao Zhang 613*71438e86SJunchao Zhang PetscErrorCode PetscSFLinkReclaim(PetscSF sf,PetscSFLink *mylink) 61440e23c03SJunchao Zhang { 61540e23c03SJunchao Zhang PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 616*71438e86SJunchao Zhang PetscSFLink link = *mylink; 61740e23c03SJunchao Zhang 61840e23c03SJunchao Zhang PetscFunctionBegin; 619*71438e86SJunchao Zhang link->rootdata = NULL; 620*71438e86SJunchao Zhang link->leafdata = NULL; 621*71438e86SJunchao Zhang link->next = bas->avail; 622*71438e86SJunchao Zhang bas->avail = link; 623*71438e86SJunchao Zhang *mylink = NULL; 624eb02082bSJunchao Zhang PetscFunctionReturn(0); 625eb02082bSJunchao Zhang } 626eb02082bSJunchao Zhang 6279d1c8addSJunchao Zhang /* Error out on unsupported overlapped communications */ 628cd620004SJunchao Zhang PetscErrorCode PetscSFSetErrorOnUnsupportedOverlap(PetscSF sf,MPI_Datatype unit,const void *rootdata,const void *leafdata) 6299d1c8addSJunchao Zhang { 6309d1c8addSJunchao Zhang PetscErrorCode ierr; 631cd620004SJunchao Zhang PetscSFLink link,*p; 6329d1c8addSJunchao Zhang PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 6339d1c8addSJunchao Zhang PetscBool match; 6349d1c8addSJunchao Zhang 6359d1c8addSJunchao Zhang PetscFunctionBegin; 636b458e8f1SJose E. Roman if (PetscDefined(USE_DEBUG)) { 63718fb5014SJunchao Zhang /* Look up links in use and error out if there is a match. When both rootdata and leafdata are NULL, ignore 63818fb5014SJunchao Zhang the potential overlapping since this process does not participate in communication. Overlapping is harmless. 63918fb5014SJunchao Zhang */ 640637e6665SJunchao Zhang if (rootdata || leafdata) { 6419d1c8addSJunchao Zhang for (p=&bas->inuse; (link=*p); p=&link->next) { 6429d1c8addSJunchao Zhang ierr = MPIPetsc_Type_compare(unit,link->unit,&match);CHKERRQ(ierr); 643cd620004SJunchao Zhang if (match && (rootdata == link->rootdata) && (leafdata == link->leafdata)) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_SUP,"Overlapped PetscSF with the same rootdata(%p), leafdata(%p) and data type. Undo the overlapping to avoid the error.",rootdata,leafdata); 6449d1c8addSJunchao Zhang } 64518fb5014SJunchao Zhang } 646b458e8f1SJose E. Roman } 6479d1c8addSJunchao Zhang PetscFunctionReturn(0); 6489d1c8addSJunchao Zhang } 6499d1c8addSJunchao Zhang 65020c24465SJunchao Zhang static PetscErrorCode PetscSFLinkMemcpy_Host(PetscSFLink link,PetscMemType dstmtype,void* dst,PetscMemType srcmtype,const void*src,size_t n) 65120c24465SJunchao Zhang { 65220c24465SJunchao Zhang PetscFunctionBegin; 65320c24465SJunchao Zhang if (n) {PetscErrorCode ierr = PetscMemcpy(dst,src,n);CHKERRQ(ierr);} 65420c24465SJunchao Zhang PetscFunctionReturn(0); 65520c24465SJunchao Zhang } 65620c24465SJunchao Zhang 65720c24465SJunchao Zhang 658cd620004SJunchao Zhang PetscErrorCode PetscSFLinkSetUp_Host(PetscSF sf,PetscSFLink link,MPI_Datatype unit) 65940e23c03SJunchao Zhang { 66040e23c03SJunchao Zhang PetscErrorCode ierr; 661b23bfdefSJunchao Zhang PetscInt nSignedChar=0,nUnsignedChar=0,nInt=0,nPetscInt=0,nPetscReal=0; 662b23bfdefSJunchao Zhang PetscBool is2Int,is2PetscInt; 66340e23c03SJunchao Zhang PetscMPIInt ni,na,nd,combiner; 66440e23c03SJunchao Zhang #if defined(PETSC_HAVE_COMPLEX) 665b23bfdefSJunchao Zhang PetscInt nPetscComplex=0; 66640e23c03SJunchao Zhang #endif 66740e23c03SJunchao Zhang 66840e23c03SJunchao Zhang PetscFunctionBegin; 669b23bfdefSJunchao Zhang ierr = MPIPetsc_Type_compare_contig(unit,MPI_SIGNED_CHAR, &nSignedChar);CHKERRQ(ierr); 670b23bfdefSJunchao Zhang ierr = MPIPetsc_Type_compare_contig(unit,MPI_UNSIGNED_CHAR,&nUnsignedChar);CHKERRQ(ierr); 671b23bfdefSJunchao Zhang /* MPI_CHAR is treated below as a dumb type that does not support reduction according to MPI standard */ 672b23bfdefSJunchao Zhang ierr = MPIPetsc_Type_compare_contig(unit,MPI_INT, &nInt);CHKERRQ(ierr); 673b23bfdefSJunchao Zhang ierr = MPIPetsc_Type_compare_contig(unit,MPIU_INT, &nPetscInt);CHKERRQ(ierr); 674b23bfdefSJunchao Zhang ierr = MPIPetsc_Type_compare_contig(unit,MPIU_REAL,&nPetscReal);CHKERRQ(ierr); 67540e23c03SJunchao Zhang #if defined(PETSC_HAVE_COMPLEX) 676b23bfdefSJunchao Zhang ierr = MPIPetsc_Type_compare_contig(unit,MPIU_COMPLEX,&nPetscComplex);CHKERRQ(ierr); 67740e23c03SJunchao Zhang #endif 67840e23c03SJunchao Zhang ierr = MPIPetsc_Type_compare(unit,MPI_2INT,&is2Int);CHKERRQ(ierr); 67940e23c03SJunchao Zhang ierr = MPIPetsc_Type_compare(unit,MPIU_2INT,&is2PetscInt);CHKERRQ(ierr); 680b23bfdefSJunchao Zhang /* TODO: shaell we also handle Fortran MPI_2REAL? */ 681ffc4695bSBarry Smith ierr = MPI_Type_get_envelope(unit,&ni,&na,&nd,&combiner);CHKERRMPI(ierr); 6825ad15460SJunchao Zhang link->isbuiltin = (combiner == MPI_COMBINER_NAMED) ? PETSC_TRUE : PETSC_FALSE; /* unit is MPI builtin */ 683b23bfdefSJunchao Zhang link->bs = 1; /* default */ 68440e23c03SJunchao Zhang 685eb02082bSJunchao Zhang if (is2Int) { 686cd620004SJunchao Zhang PackInit_PairType_int_int_1_1(link); 687eb02082bSJunchao Zhang link->bs = 1; 688eb02082bSJunchao Zhang link->unitbytes = 2*sizeof(int); 6895ad15460SJunchao Zhang link->isbuiltin = PETSC_TRUE; /* unit is PETSc builtin */ 690eb02082bSJunchao Zhang link->basicunit = MPI_2INT; 6915ad15460SJunchao Zhang link->unit = MPI_2INT; 692eb02082bSJunchao Zhang } else if (is2PetscInt) { /* TODO: when is2PetscInt and nPetscInt=2, we don't know which path to take. The two paths support different ops. */ 693cd620004SJunchao Zhang PackInit_PairType_PetscInt_PetscInt_1_1(link); 694eb02082bSJunchao Zhang link->bs = 1; 695eb02082bSJunchao Zhang link->unitbytes = 2*sizeof(PetscInt); 696eb02082bSJunchao Zhang link->basicunit = MPIU_2INT; 6975ad15460SJunchao Zhang link->isbuiltin = PETSC_TRUE; /* unit is PETSc builtin */ 6985ad15460SJunchao Zhang link->unit = MPIU_2INT; 699eb02082bSJunchao Zhang } else if (nPetscReal) { 700b23bfdefSJunchao Zhang if (nPetscReal == 8) PackInit_RealType_PetscReal_8_1(link); else if (nPetscReal%8 == 0) PackInit_RealType_PetscReal_8_0(link); 701b23bfdefSJunchao Zhang else if (nPetscReal == 4) PackInit_RealType_PetscReal_4_1(link); else if (nPetscReal%4 == 0) PackInit_RealType_PetscReal_4_0(link); 702b23bfdefSJunchao Zhang else if (nPetscReal == 2) PackInit_RealType_PetscReal_2_1(link); else if (nPetscReal%2 == 0) PackInit_RealType_PetscReal_2_0(link); 703b23bfdefSJunchao Zhang else if (nPetscReal == 1) PackInit_RealType_PetscReal_1_1(link); else if (nPetscReal%1 == 0) PackInit_RealType_PetscReal_1_0(link); 704b23bfdefSJunchao Zhang link->bs = nPetscReal; 705eb02082bSJunchao Zhang link->unitbytes = nPetscReal*sizeof(PetscReal); 70640e23c03SJunchao Zhang link->basicunit = MPIU_REAL; 7075ad15460SJunchao Zhang if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPIU_REAL;} 708b23bfdefSJunchao Zhang } else if (nPetscInt) { 709b23bfdefSJunchao Zhang if (nPetscInt == 8) PackInit_IntegerType_PetscInt_8_1(link); else if (nPetscInt%8 == 0) PackInit_IntegerType_PetscInt_8_0(link); 710b23bfdefSJunchao Zhang else if (nPetscInt == 4) PackInit_IntegerType_PetscInt_4_1(link); else if (nPetscInt%4 == 0) PackInit_IntegerType_PetscInt_4_0(link); 711b23bfdefSJunchao Zhang else if (nPetscInt == 2) PackInit_IntegerType_PetscInt_2_1(link); else if (nPetscInt%2 == 0) PackInit_IntegerType_PetscInt_2_0(link); 712b23bfdefSJunchao Zhang else if (nPetscInt == 1) PackInit_IntegerType_PetscInt_1_1(link); else if (nPetscInt%1 == 0) PackInit_IntegerType_PetscInt_1_0(link); 713b23bfdefSJunchao Zhang link->bs = nPetscInt; 714eb02082bSJunchao Zhang link->unitbytes = nPetscInt*sizeof(PetscInt); 715b23bfdefSJunchao Zhang link->basicunit = MPIU_INT; 7165ad15460SJunchao Zhang if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPIU_INT;} 717b23bfdefSJunchao Zhang #if defined(PETSC_USE_64BIT_INDICES) 718b23bfdefSJunchao Zhang } else if (nInt) { 719b23bfdefSJunchao Zhang if (nInt == 8) PackInit_IntegerType_int_8_1(link); else if (nInt%8 == 0) PackInit_IntegerType_int_8_0(link); 720b23bfdefSJunchao Zhang else if (nInt == 4) PackInit_IntegerType_int_4_1(link); else if (nInt%4 == 0) PackInit_IntegerType_int_4_0(link); 721b23bfdefSJunchao Zhang else if (nInt == 2) PackInit_IntegerType_int_2_1(link); else if (nInt%2 == 0) PackInit_IntegerType_int_2_0(link); 722b23bfdefSJunchao Zhang else if (nInt == 1) PackInit_IntegerType_int_1_1(link); else if (nInt%1 == 0) PackInit_IntegerType_int_1_0(link); 723b23bfdefSJunchao Zhang link->bs = nInt; 724eb02082bSJunchao Zhang link->unitbytes = nInt*sizeof(int); 725b23bfdefSJunchao Zhang link->basicunit = MPI_INT; 7265ad15460SJunchao Zhang if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPI_INT;} 727b23bfdefSJunchao Zhang #endif 728b23bfdefSJunchao Zhang } else if (nSignedChar) { 729b23bfdefSJunchao Zhang if (nSignedChar == 8) PackInit_IntegerType_SignedChar_8_1(link); else if (nSignedChar%8 == 0) PackInit_IntegerType_SignedChar_8_0(link); 730b23bfdefSJunchao Zhang else if (nSignedChar == 4) PackInit_IntegerType_SignedChar_4_1(link); else if (nSignedChar%4 == 0) PackInit_IntegerType_SignedChar_4_0(link); 731b23bfdefSJunchao Zhang else if (nSignedChar == 2) PackInit_IntegerType_SignedChar_2_1(link); else if (nSignedChar%2 == 0) PackInit_IntegerType_SignedChar_2_0(link); 732b23bfdefSJunchao Zhang else if (nSignedChar == 1) PackInit_IntegerType_SignedChar_1_1(link); else if (nSignedChar%1 == 0) PackInit_IntegerType_SignedChar_1_0(link); 733b23bfdefSJunchao Zhang link->bs = nSignedChar; 734eb02082bSJunchao Zhang link->unitbytes = nSignedChar*sizeof(SignedChar); 735b23bfdefSJunchao Zhang link->basicunit = MPI_SIGNED_CHAR; 7365ad15460SJunchao Zhang if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPI_SIGNED_CHAR;} 737b23bfdefSJunchao Zhang } else if (nUnsignedChar) { 738b23bfdefSJunchao Zhang if (nUnsignedChar == 8) PackInit_IntegerType_UnsignedChar_8_1(link); else if (nUnsignedChar%8 == 0) PackInit_IntegerType_UnsignedChar_8_0(link); 739b23bfdefSJunchao Zhang else if (nUnsignedChar == 4) PackInit_IntegerType_UnsignedChar_4_1(link); else if (nUnsignedChar%4 == 0) PackInit_IntegerType_UnsignedChar_4_0(link); 740b23bfdefSJunchao Zhang else if (nUnsignedChar == 2) PackInit_IntegerType_UnsignedChar_2_1(link); else if (nUnsignedChar%2 == 0) PackInit_IntegerType_UnsignedChar_2_0(link); 741b23bfdefSJunchao Zhang else if (nUnsignedChar == 1) PackInit_IntegerType_UnsignedChar_1_1(link); else if (nUnsignedChar%1 == 0) PackInit_IntegerType_UnsignedChar_1_0(link); 742b23bfdefSJunchao Zhang link->bs = nUnsignedChar; 743eb02082bSJunchao Zhang link->unitbytes = nUnsignedChar*sizeof(UnsignedChar); 744b23bfdefSJunchao Zhang link->basicunit = MPI_UNSIGNED_CHAR; 7455ad15460SJunchao Zhang if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPI_UNSIGNED_CHAR;} 74640e23c03SJunchao Zhang #if defined(PETSC_HAVE_COMPLEX) 747b23bfdefSJunchao Zhang } else if (nPetscComplex) { 748b23bfdefSJunchao Zhang if (nPetscComplex == 8) PackInit_ComplexType_PetscComplex_8_1(link); else if (nPetscComplex%8 == 0) PackInit_ComplexType_PetscComplex_8_0(link); 749b23bfdefSJunchao Zhang else if (nPetscComplex == 4) PackInit_ComplexType_PetscComplex_4_1(link); else if (nPetscComplex%4 == 0) PackInit_ComplexType_PetscComplex_4_0(link); 750b23bfdefSJunchao Zhang else if (nPetscComplex == 2) PackInit_ComplexType_PetscComplex_2_1(link); else if (nPetscComplex%2 == 0) PackInit_ComplexType_PetscComplex_2_0(link); 751b23bfdefSJunchao Zhang else if (nPetscComplex == 1) PackInit_ComplexType_PetscComplex_1_1(link); else if (nPetscComplex%1 == 0) PackInit_ComplexType_PetscComplex_1_0(link); 752b23bfdefSJunchao Zhang link->bs = nPetscComplex; 753eb02082bSJunchao Zhang link->unitbytes = nPetscComplex*sizeof(PetscComplex); 75440e23c03SJunchao Zhang link->basicunit = MPIU_COMPLEX; 7555ad15460SJunchao Zhang if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPIU_COMPLEX;} 75640e23c03SJunchao Zhang #endif 75740e23c03SJunchao Zhang } else { 758b23bfdefSJunchao Zhang MPI_Aint lb,nbyte; 759ffc4695bSBarry Smith ierr = MPI_Type_get_extent(unit,&lb,&nbyte);CHKERRMPI(ierr); 76040e23c03SJunchao Zhang if (lb != 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Datatype with nonzero lower bound %ld\n",(long)lb); 761eb02082bSJunchao Zhang if (nbyte % sizeof(int)) { /* If the type size is not multiple of int */ 762eb02082bSJunchao Zhang if (nbyte == 4) PackInit_DumbType_char_4_1(link); else if (nbyte%4 == 0) PackInit_DumbType_char_4_0(link); 763eb02082bSJunchao Zhang else if (nbyte == 2) PackInit_DumbType_char_2_1(link); else if (nbyte%2 == 0) PackInit_DumbType_char_2_0(link); 764eb02082bSJunchao Zhang else if (nbyte == 1) PackInit_DumbType_char_1_1(link); else if (nbyte%1 == 0) PackInit_DumbType_char_1_0(link); 765eb02082bSJunchao Zhang link->bs = nbyte; 766b23bfdefSJunchao Zhang link->unitbytes = nbyte; 767b23bfdefSJunchao Zhang link->basicunit = MPI_BYTE; 76840e23c03SJunchao Zhang } else { 769eb02082bSJunchao Zhang nInt = nbyte / sizeof(int); 770eb02082bSJunchao Zhang if (nInt == 8) PackInit_DumbType_DumbInt_8_1(link); else if (nInt%8 == 0) PackInit_DumbType_DumbInt_8_0(link); 771eb02082bSJunchao Zhang else if (nInt == 4) PackInit_DumbType_DumbInt_4_1(link); else if (nInt%4 == 0) PackInit_DumbType_DumbInt_4_0(link); 772eb02082bSJunchao Zhang else if (nInt == 2) PackInit_DumbType_DumbInt_2_1(link); else if (nInt%2 == 0) PackInit_DumbType_DumbInt_2_0(link); 773eb02082bSJunchao Zhang else if (nInt == 1) PackInit_DumbType_DumbInt_1_1(link); else if (nInt%1 == 0) PackInit_DumbType_DumbInt_1_0(link); 774eb02082bSJunchao Zhang link->bs = nInt; 775b23bfdefSJunchao Zhang link->unitbytes = nbyte; 77640e23c03SJunchao Zhang link->basicunit = MPI_INT; 77740e23c03SJunchao Zhang } 7785ad15460SJunchao Zhang if (link->isbuiltin) link->unit = unit; 77940e23c03SJunchao Zhang } 780b23bfdefSJunchao Zhang 781ffc4695bSBarry Smith if (!link->isbuiltin) {ierr = MPI_Type_dup(unit,&link->unit);CHKERRMPI(ierr);} 78220c24465SJunchao Zhang 78320c24465SJunchao Zhang link->Memcpy = PetscSFLinkMemcpy_Host; 78440e23c03SJunchao Zhang PetscFunctionReturn(0); 78540e23c03SJunchao Zhang } 78640e23c03SJunchao Zhang 787fcc7397dSJunchao Zhang PetscErrorCode PetscSFLinkGetUnpackAndOp(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*)) 78840e23c03SJunchao Zhang { 78940e23c03SJunchao Zhang PetscFunctionBegin; 79040e23c03SJunchao Zhang *UnpackAndOp = NULL; 791*71438e86SJunchao Zhang if (PetscMemTypeHost(mtype)) { 79283df288dSJunchao Zhang if (op == MPI_REPLACE) *UnpackAndOp = link->h_UnpackAndInsert; 793eb02082bSJunchao Zhang else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->h_UnpackAndAdd; 794eb02082bSJunchao Zhang else if (op == MPI_PROD) *UnpackAndOp = link->h_UnpackAndMult; 795eb02082bSJunchao Zhang else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->h_UnpackAndMax; 796eb02082bSJunchao Zhang else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->h_UnpackAndMin; 797eb02082bSJunchao Zhang else if (op == MPI_LAND) *UnpackAndOp = link->h_UnpackAndLAND; 798eb02082bSJunchao Zhang else if (op == MPI_BAND) *UnpackAndOp = link->h_UnpackAndBAND; 799eb02082bSJunchao Zhang else if (op == MPI_LOR) *UnpackAndOp = link->h_UnpackAndLOR; 800eb02082bSJunchao Zhang else if (op == MPI_BOR) *UnpackAndOp = link->h_UnpackAndBOR; 801eb02082bSJunchao Zhang else if (op == MPI_LXOR) *UnpackAndOp = link->h_UnpackAndLXOR; 802eb02082bSJunchao Zhang else if (op == MPI_BXOR) *UnpackAndOp = link->h_UnpackAndBXOR; 803eb02082bSJunchao Zhang else if (op == MPI_MAXLOC) *UnpackAndOp = link->h_UnpackAndMaxloc; 804eb02082bSJunchao Zhang else if (op == MPI_MINLOC) *UnpackAndOp = link->h_UnpackAndMinloc; 805eb02082bSJunchao Zhang } 8067fd2d3dbSJunchao Zhang #if defined(PETSC_HAVE_DEVICE) 807*71438e86SJunchao Zhang else if (PetscMemTypeDevice(mtype) && !atomic) { 80883df288dSJunchao Zhang if (op == MPI_REPLACE) *UnpackAndOp = link->d_UnpackAndInsert; 809eb02082bSJunchao Zhang else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->d_UnpackAndAdd; 810eb02082bSJunchao Zhang else if (op == MPI_PROD) *UnpackAndOp = link->d_UnpackAndMult; 811eb02082bSJunchao Zhang else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->d_UnpackAndMax; 812eb02082bSJunchao Zhang else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->d_UnpackAndMin; 813eb02082bSJunchao Zhang else if (op == MPI_LAND) *UnpackAndOp = link->d_UnpackAndLAND; 814eb02082bSJunchao Zhang else if (op == MPI_BAND) *UnpackAndOp = link->d_UnpackAndBAND; 815eb02082bSJunchao Zhang else if (op == MPI_LOR) *UnpackAndOp = link->d_UnpackAndLOR; 816eb02082bSJunchao Zhang else if (op == MPI_BOR) *UnpackAndOp = link->d_UnpackAndBOR; 817eb02082bSJunchao Zhang else if (op == MPI_LXOR) *UnpackAndOp = link->d_UnpackAndLXOR; 818eb02082bSJunchao Zhang else if (op == MPI_BXOR) *UnpackAndOp = link->d_UnpackAndBXOR; 819eb02082bSJunchao Zhang else if (op == MPI_MAXLOC) *UnpackAndOp = link->d_UnpackAndMaxloc; 820eb02082bSJunchao Zhang else if (op == MPI_MINLOC) *UnpackAndOp = link->d_UnpackAndMinloc; 821*71438e86SJunchao Zhang } else if (PetscMemTypeDevice(mtype) && atomic) { 82283df288dSJunchao Zhang if (op == MPI_REPLACE) *UnpackAndOp = link->da_UnpackAndInsert; 823eb02082bSJunchao Zhang else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->da_UnpackAndAdd; 824eb02082bSJunchao Zhang else if (op == MPI_PROD) *UnpackAndOp = link->da_UnpackAndMult; 825eb02082bSJunchao Zhang else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->da_UnpackAndMax; 826eb02082bSJunchao Zhang else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->da_UnpackAndMin; 827eb02082bSJunchao Zhang else if (op == MPI_LAND) *UnpackAndOp = link->da_UnpackAndLAND; 828eb02082bSJunchao Zhang else if (op == MPI_BAND) *UnpackAndOp = link->da_UnpackAndBAND; 829eb02082bSJunchao Zhang else if (op == MPI_LOR) *UnpackAndOp = link->da_UnpackAndLOR; 830eb02082bSJunchao Zhang else if (op == MPI_BOR) *UnpackAndOp = link->da_UnpackAndBOR; 831eb02082bSJunchao Zhang else if (op == MPI_LXOR) *UnpackAndOp = link->da_UnpackAndLXOR; 832eb02082bSJunchao Zhang else if (op == MPI_BXOR) *UnpackAndOp = link->da_UnpackAndBXOR; 833eb02082bSJunchao Zhang else if (op == MPI_MAXLOC) *UnpackAndOp = link->da_UnpackAndMaxloc; 834eb02082bSJunchao Zhang else if (op == MPI_MINLOC) *UnpackAndOp = link->da_UnpackAndMinloc; 835eb02082bSJunchao Zhang } 836eb02082bSJunchao Zhang #endif 83740e23c03SJunchao Zhang PetscFunctionReturn(0); 83840e23c03SJunchao Zhang } 83940e23c03SJunchao Zhang 840fcc7397dSJunchao Zhang PetscErrorCode PetscSFLinkGetScatterAndOp(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**ScatterAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*)) 84140e23c03SJunchao Zhang { 84240e23c03SJunchao Zhang PetscFunctionBegin; 843cd620004SJunchao Zhang *ScatterAndOp = NULL; 844*71438e86SJunchao Zhang if (PetscMemTypeHost(mtype)) { 84583df288dSJunchao Zhang if (op == MPI_REPLACE) *ScatterAndOp = link->h_ScatterAndInsert; 846cd620004SJunchao Zhang else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->h_ScatterAndAdd; 847cd620004SJunchao Zhang else if (op == MPI_PROD) *ScatterAndOp = link->h_ScatterAndMult; 848cd620004SJunchao Zhang else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->h_ScatterAndMax; 849cd620004SJunchao Zhang else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->h_ScatterAndMin; 850cd620004SJunchao Zhang else if (op == MPI_LAND) *ScatterAndOp = link->h_ScatterAndLAND; 851cd620004SJunchao Zhang else if (op == MPI_BAND) *ScatterAndOp = link->h_ScatterAndBAND; 852cd620004SJunchao Zhang else if (op == MPI_LOR) *ScatterAndOp = link->h_ScatterAndLOR; 853cd620004SJunchao Zhang else if (op == MPI_BOR) *ScatterAndOp = link->h_ScatterAndBOR; 854cd620004SJunchao Zhang else if (op == MPI_LXOR) *ScatterAndOp = link->h_ScatterAndLXOR; 855cd620004SJunchao Zhang else if (op == MPI_BXOR) *ScatterAndOp = link->h_ScatterAndBXOR; 856cd620004SJunchao Zhang else if (op == MPI_MAXLOC) *ScatterAndOp = link->h_ScatterAndMaxloc; 857cd620004SJunchao Zhang else if (op == MPI_MINLOC) *ScatterAndOp = link->h_ScatterAndMinloc; 858eb02082bSJunchao Zhang } 8597fd2d3dbSJunchao Zhang #if defined(PETSC_HAVE_DEVICE) 860*71438e86SJunchao Zhang else if (PetscMemTypeDevice(mtype) && !atomic) { 86183df288dSJunchao Zhang if (op == MPI_REPLACE) *ScatterAndOp = link->d_ScatterAndInsert; 862cd620004SJunchao Zhang else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->d_ScatterAndAdd; 863cd620004SJunchao Zhang else if (op == MPI_PROD) *ScatterAndOp = link->d_ScatterAndMult; 864cd620004SJunchao Zhang else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->d_ScatterAndMax; 865cd620004SJunchao Zhang else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->d_ScatterAndMin; 866cd620004SJunchao Zhang else if (op == MPI_LAND) *ScatterAndOp = link->d_ScatterAndLAND; 867cd620004SJunchao Zhang else if (op == MPI_BAND) *ScatterAndOp = link->d_ScatterAndBAND; 868cd620004SJunchao Zhang else if (op == MPI_LOR) *ScatterAndOp = link->d_ScatterAndLOR; 869cd620004SJunchao Zhang else if (op == MPI_BOR) *ScatterAndOp = link->d_ScatterAndBOR; 870cd620004SJunchao Zhang else if (op == MPI_LXOR) *ScatterAndOp = link->d_ScatterAndLXOR; 871cd620004SJunchao Zhang else if (op == MPI_BXOR) *ScatterAndOp = link->d_ScatterAndBXOR; 872cd620004SJunchao Zhang else if (op == MPI_MAXLOC) *ScatterAndOp = link->d_ScatterAndMaxloc; 873cd620004SJunchao Zhang else if (op == MPI_MINLOC) *ScatterAndOp = link->d_ScatterAndMinloc; 874*71438e86SJunchao Zhang } else if (PetscMemTypeDevice(mtype) && atomic) { 87583df288dSJunchao Zhang if (op == MPI_REPLACE) *ScatterAndOp = link->da_ScatterAndInsert; 876cd620004SJunchao Zhang else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->da_ScatterAndAdd; 877cd620004SJunchao Zhang else if (op == MPI_PROD) *ScatterAndOp = link->da_ScatterAndMult; 878cd620004SJunchao Zhang else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->da_ScatterAndMax; 879cd620004SJunchao Zhang else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->da_ScatterAndMin; 880cd620004SJunchao Zhang else if (op == MPI_LAND) *ScatterAndOp = link->da_ScatterAndLAND; 881cd620004SJunchao Zhang else if (op == MPI_BAND) *ScatterAndOp = link->da_ScatterAndBAND; 882cd620004SJunchao Zhang else if (op == MPI_LOR) *ScatterAndOp = link->da_ScatterAndLOR; 883cd620004SJunchao Zhang else if (op == MPI_BOR) *ScatterAndOp = link->da_ScatterAndBOR; 884cd620004SJunchao Zhang else if (op == MPI_LXOR) *ScatterAndOp = link->da_ScatterAndLXOR; 885cd620004SJunchao Zhang else if (op == MPI_BXOR) *ScatterAndOp = link->da_ScatterAndBXOR; 886cd620004SJunchao Zhang else if (op == MPI_MAXLOC) *ScatterAndOp = link->da_ScatterAndMaxloc; 887cd620004SJunchao Zhang else if (op == MPI_MINLOC) *ScatterAndOp = link->da_ScatterAndMinloc; 888eb02082bSJunchao Zhang } 889eb02082bSJunchao Zhang #endif 890cd620004SJunchao Zhang PetscFunctionReturn(0); 891cd620004SJunchao Zhang } 892cd620004SJunchao Zhang 893fcc7397dSJunchao Zhang PetscErrorCode PetscSFLinkGetFetchAndOp(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**FetchAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,void*)) 894cd620004SJunchao Zhang { 895cd620004SJunchao Zhang PetscFunctionBegin; 896cd620004SJunchao Zhang *FetchAndOp = NULL; 897cd620004SJunchao Zhang if (op != MPI_SUM && op != MPIU_SUM) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for MPI_Op in FetchAndOp"); 898*71438e86SJunchao Zhang if (PetscMemTypeHost(mtype)) *FetchAndOp = link->h_FetchAndAdd; 8997fd2d3dbSJunchao Zhang #if defined(PETSC_HAVE_DEVICE) 900*71438e86SJunchao Zhang else if (PetscMemTypeDevice(mtype) && !atomic) *FetchAndOp = link->d_FetchAndAdd; 901*71438e86SJunchao Zhang else if (PetscMemTypeDevice(mtype) && atomic) *FetchAndOp = link->da_FetchAndAdd; 902cd620004SJunchao Zhang #endif 903cd620004SJunchao Zhang PetscFunctionReturn(0); 904cd620004SJunchao Zhang } 905cd620004SJunchao Zhang 906fcc7397dSJunchao Zhang PetscErrorCode PetscSFLinkGetFetchAndOpLocal(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**FetchAndOpLocal)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*)) 907cd620004SJunchao Zhang { 908cd620004SJunchao Zhang PetscFunctionBegin; 909cd620004SJunchao Zhang *FetchAndOpLocal = NULL; 910cd620004SJunchao Zhang if (op != MPI_SUM && op != MPIU_SUM) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for MPI_Op in FetchAndOp"); 911*71438e86SJunchao Zhang if (PetscMemTypeHost(mtype)) *FetchAndOpLocal = link->h_FetchAndAddLocal; 9127fd2d3dbSJunchao Zhang #if defined(PETSC_HAVE_DEVICE) 913*71438e86SJunchao Zhang else if (PetscMemTypeDevice(mtype) && !atomic) *FetchAndOpLocal = link->d_FetchAndAddLocal; 914*71438e86SJunchao Zhang else if (PetscMemTypeDevice(mtype) && atomic) *FetchAndOpLocal = link->da_FetchAndAddLocal; 915cd620004SJunchao Zhang #endif 916cd620004SJunchao Zhang PetscFunctionReturn(0); 917cd620004SJunchao Zhang } 918cd620004SJunchao Zhang 919cd620004SJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkLogFlopsAfterUnpackRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,MPI_Op op) 920cd620004SJunchao Zhang { 921cd620004SJunchao Zhang PetscErrorCode ierr; 922cd620004SJunchao Zhang PetscLogDouble flops; 923cd620004SJunchao Zhang PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 924cd620004SJunchao Zhang 925cd620004SJunchao Zhang PetscFunctionBegin; 92683df288dSJunchao Zhang if (op != MPI_REPLACE && link->basicunit == MPIU_SCALAR) { /* op is a reduction on PetscScalars */ 927cd620004SJunchao Zhang flops = bas->rootbuflen[scope]*link->bs; /* # of roots in buffer x # of scalars in unit */ 9287fd2d3dbSJunchao Zhang #if defined(PETSC_HAVE_DEVICE) 929*71438e86SJunchao Zhang if (PetscMemTypeDevice(link->rootmtype)) {ierr = PetscLogGpuFlops(flops);CHKERRQ(ierr);} else 930cd620004SJunchao Zhang #endif 931cd620004SJunchao Zhang {ierr = PetscLogFlops(flops);CHKERRQ(ierr);} 932cd620004SJunchao Zhang } 933cd620004SJunchao Zhang PetscFunctionReturn(0); 934cd620004SJunchao Zhang } 935cd620004SJunchao Zhang 936cd620004SJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkLogFlopsAfterUnpackLeafData(PetscSF sf,PetscSFLink link,PetscSFScope scope,MPI_Op op) 937cd620004SJunchao Zhang { 938cd620004SJunchao Zhang PetscLogDouble flops; 939cd620004SJunchao Zhang PetscErrorCode ierr; 940cd620004SJunchao Zhang 941cd620004SJunchao Zhang PetscFunctionBegin; 94283df288dSJunchao Zhang if (op != MPI_REPLACE && link->basicunit == MPIU_SCALAR) { /* op is a reduction on PetscScalars */ 943cd620004SJunchao Zhang flops = sf->leafbuflen[scope]*link->bs; /* # of roots in buffer x # of scalars in unit */ 9447fd2d3dbSJunchao Zhang #if defined(PETSC_HAVE_DEVICE) 945*71438e86SJunchao Zhang if (PetscMemTypeDevice(link->leafmtype)) {ierr = PetscLogGpuFlops(flops);CHKERRQ(ierr);} else 946cd620004SJunchao Zhang #endif 947cd620004SJunchao Zhang {ierr = PetscLogFlops(flops);CHKERRQ(ierr);} 948cd620004SJunchao Zhang } 949cd620004SJunchao Zhang PetscFunctionReturn(0); 950cd620004SJunchao Zhang } 951cd620004SJunchao Zhang 952cd620004SJunchao Zhang /* When SF could not find a proper UnpackAndOp() from link, it falls back to MPI_Reduce_local. 953cd620004SJunchao Zhang Input Arguments: 954cd620004SJunchao Zhang +sf - The StarForest 955cd620004SJunchao Zhang .link - The link 956cd620004SJunchao Zhang .count - Number of entries to unpack 957cd620004SJunchao Zhang .start - The first index, significent when indices=NULL 958cd620004SJunchao Zhang .indices - Indices of entries in <data>. If NULL, it means indices are contiguous and the first is given in <start> 959cd620004SJunchao Zhang .buf - A contiguous buffer to unpack from 960cd620004SJunchao Zhang -op - Operation after unpack 961cd620004SJunchao Zhang 962cd620004SJunchao Zhang Output Arguments: 963cd620004SJunchao Zhang .data - The data to unpack to 964cd620004SJunchao Zhang */ 965cd620004SJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkUnpackDataWithMPIReduceLocal(PetscSF sf,PetscSFLink link,PetscInt count,PetscInt start,const PetscInt *indices,void *data,const void *buf,MPI_Op op) 966cd620004SJunchao Zhang { 967cd620004SJunchao Zhang PetscFunctionBegin; 968cd620004SJunchao Zhang #if defined(PETSC_HAVE_MPI_REDUCE_LOCAL) 969cd620004SJunchao Zhang { 970cd620004SJunchao Zhang PetscErrorCode ierr; 971cd620004SJunchao Zhang PetscInt i; 972cd620004SJunchao Zhang PetscMPIInt n; 973cd620004SJunchao Zhang if (indices) { 974cd620004SJunchao Zhang /* Note we use link->unit instead of link->basicunit. When op can be mapped to MPI_SUM etc, it operates on 975cd620004SJunchao Zhang basic units of a root/leaf element-wisely. Otherwise, it is meant to operate on a whole root/leaf. 976cd620004SJunchao Zhang */ 977ffc4695bSBarry Smith for (i=0; i<count; i++) {ierr = MPI_Reduce_local((const char*)buf+i*link->unitbytes,(char*)data+indices[i]*link->unitbytes,1,link->unit,op);CHKERRMPI(ierr);} 978cd620004SJunchao Zhang } else { 979cd620004SJunchao Zhang ierr = PetscMPIIntCast(count,&n);CHKERRQ(ierr); 980ffc4695bSBarry Smith ierr = MPI_Reduce_local(buf,(char*)data+start*link->unitbytes,n,link->unit,op);CHKERRMPI(ierr); 981cd620004SJunchao Zhang } 982cd620004SJunchao Zhang } 983b458e8f1SJose E. Roman PetscFunctionReturn(0); 984cd620004SJunchao Zhang #else 985cd620004SJunchao Zhang SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No unpacking reduction operation for this MPI_Op"); 986cd620004SJunchao Zhang #endif 987cd620004SJunchao Zhang } 988cd620004SJunchao Zhang 989fcc7397dSJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkScatterDataWithMPIReduceLocal(PetscSF sf,PetscSFLink link,PetscInt count,PetscInt srcStart,const PetscInt *srcIdx,const void *src,PetscInt dstStart,const PetscInt *dstIdx,void *dst,MPI_Op op) 990cd620004SJunchao Zhang { 991cd620004SJunchao Zhang PetscFunctionBegin; 992cd620004SJunchao Zhang #if defined(PETSC_HAVE_MPI_REDUCE_LOCAL) 993cd620004SJunchao Zhang { 994cd620004SJunchao Zhang PetscErrorCode ierr; 995cd620004SJunchao Zhang PetscInt i,disp; 996fcc7397dSJunchao Zhang if (!srcIdx) { 997fcc7397dSJunchao Zhang ierr = PetscSFLinkUnpackDataWithMPIReduceLocal(sf,link,count,dstStart,dstIdx,dst,(const char*)src+srcStart*link->unitbytes,op);CHKERRQ(ierr); 998fcc7397dSJunchao Zhang } else { 999cd620004SJunchao Zhang for (i=0; i<count; i++) { 1000fcc7397dSJunchao Zhang disp = dstIdx? dstIdx[i] : dstStart + i; 1001ffc4695bSBarry Smith ierr = MPI_Reduce_local((const char*)src+srcIdx[i]*link->unitbytes,(char*)dst+disp*link->unitbytes,1,link->unit,op);CHKERRMPI(ierr); 1002fcc7397dSJunchao Zhang } 1003cd620004SJunchao Zhang } 1004cd620004SJunchao Zhang } 1005b458e8f1SJose E. Roman PetscFunctionReturn(0); 1006cd620004SJunchao Zhang #else 1007cd620004SJunchao Zhang SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No unpacking reduction operation for this MPI_Op"); 1008cd620004SJunchao Zhang #endif 1009cd620004SJunchao Zhang } 1010cd620004SJunchao Zhang 1011cd620004SJunchao Zhang /*============================================================================= 1012cd620004SJunchao Zhang Pack/Unpack/Fetch/Scatter routines 1013cd620004SJunchao Zhang ============================================================================*/ 1014cd620004SJunchao Zhang 1015cd620004SJunchao Zhang /* Pack rootdata to rootbuf 1016cd620004SJunchao Zhang Input Arguments: 1017cd620004SJunchao Zhang + sf - The SF this packing works on. 1018cd620004SJunchao Zhang . link - It gives the memtype of the roots and also provides root buffer. 1019cd620004SJunchao Zhang . scope - PETSCSF_LOCAL or PETSCSF_REMOTE. Note SF has the ability to do local and remote communications separately. 1020cd620004SJunchao Zhang - rootdata - Where to read the roots. 1021cd620004SJunchao Zhang 1022cd620004SJunchao Zhang Notes: 1023cd620004SJunchao Zhang When rootdata can be directly used as root buffer, the routine is almost a no-op. After the call, root data is 1024*71438e86SJunchao Zhang in a place where the underlying MPI is ready to access (use_gpu_aware_mpi or not) 1025cd620004SJunchao Zhang */ 1026*71438e86SJunchao Zhang PetscErrorCode PetscSFLinkPackRootData_Private(PetscSF sf,PetscSFLink link,PetscSFScope scope,const void *rootdata) 1027cd620004SJunchao Zhang { 1028cd620004SJunchao Zhang PetscErrorCode ierr; 1029cd620004SJunchao Zhang const PetscInt *rootindices = NULL; 1030cd620004SJunchao Zhang PetscInt count,start; 1031fcc7397dSJunchao Zhang PetscErrorCode (*Pack)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*) = NULL; 1032cd620004SJunchao Zhang PetscMemType rootmtype = link->rootmtype; 1033fcc7397dSJunchao Zhang PetscSFPackOpt opt = NULL; 1034fcc7397dSJunchao Zhang 1035cd620004SJunchao Zhang PetscFunctionBegin; 1036cd620004SJunchao Zhang ierr = PetscLogEventBegin(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr); 1037*71438e86SJunchao Zhang if (!link->rootdirect[scope]) { /* If rootdata works directly as rootbuf, skip packing */ 1038fcc7397dSJunchao Zhang ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,scope,&count,&start,&opt,&rootindices);CHKERRQ(ierr); 1039cd620004SJunchao Zhang ierr = PetscSFLinkGetPack(link,rootmtype,&Pack);CHKERRQ(ierr); 1040fcc7397dSJunchao Zhang ierr = (*Pack)(link,count,start,opt,rootindices,rootdata,link->rootbuf[scope][rootmtype]);CHKERRQ(ierr); 1041cd620004SJunchao Zhang } 1042cd620004SJunchao Zhang ierr = PetscLogEventEnd(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr); 1043cd620004SJunchao Zhang PetscFunctionReturn(0); 1044cd620004SJunchao Zhang } 1045cd620004SJunchao Zhang 1046cd620004SJunchao Zhang /* Pack leafdata to leafbuf */ 1047*71438e86SJunchao Zhang PetscErrorCode PetscSFLinkPackLeafData_Private(PetscSF sf,PetscSFLink link,PetscSFScope scope,const void *leafdata) 1048cd620004SJunchao Zhang { 1049cd620004SJunchao Zhang PetscErrorCode ierr; 1050cd620004SJunchao Zhang const PetscInt *leafindices = NULL; 1051cd620004SJunchao Zhang PetscInt count,start; 1052fcc7397dSJunchao Zhang PetscErrorCode (*Pack)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*) = NULL; 1053cd620004SJunchao Zhang PetscMemType leafmtype = link->leafmtype; 1054fcc7397dSJunchao Zhang PetscSFPackOpt opt = NULL; 1055cd620004SJunchao Zhang 1056cd620004SJunchao Zhang PetscFunctionBegin; 1057cd620004SJunchao Zhang ierr = PetscLogEventBegin(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr); 1058*71438e86SJunchao Zhang if (!link->leafdirect[scope]) { /* If leafdata works directly as rootbuf, skip packing */ 1059fcc7397dSJunchao Zhang ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,scope,&count,&start,&opt,&leafindices);CHKERRQ(ierr); 1060cd620004SJunchao Zhang ierr = PetscSFLinkGetPack(link,leafmtype,&Pack);CHKERRQ(ierr); 1061fcc7397dSJunchao Zhang ierr = (*Pack)(link,count,start,opt,leafindices,leafdata,link->leafbuf[scope][leafmtype]);CHKERRQ(ierr); 1062cd620004SJunchao Zhang } 1063cd620004SJunchao Zhang ierr = PetscLogEventEnd(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr); 1064cd620004SJunchao Zhang PetscFunctionReturn(0); 1065cd620004SJunchao Zhang } 1066cd620004SJunchao Zhang 1067*71438e86SJunchao Zhang /* Pack rootdata to rootbuf, which are in the same memory space */ 1068*71438e86SJunchao Zhang PetscErrorCode PetscSFLinkPackRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,const void *rootdata) 1069*71438e86SJunchao Zhang { 1070*71438e86SJunchao Zhang PetscErrorCode ierr; 1071*71438e86SJunchao Zhang PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 1072*71438e86SJunchao Zhang 1073*71438e86SJunchao Zhang PetscFunctionBegin; 1074*71438e86SJunchao Zhang if (scope == PETSCSF_REMOTE) { /* Sync the device if rootdata is not on petsc default stream */ 1075*71438e86SJunchao Zhang if (PetscMemTypeDevice(link->rootmtype) && link->SyncDevice && sf->unknown_input_stream) {ierr = (*link->SyncDevice)(link);CHKERRQ(ierr);} 1076*71438e86SJunchao Zhang if (link->PrePack) {ierr = (*link->PrePack)(sf,link,PETSCSF_ROOT2LEAF);CHKERRQ(ierr);} /* Used by SF nvshmem */ 1077*71438e86SJunchao Zhang } 1078*71438e86SJunchao Zhang ierr = PetscLogEventBegin(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr); 1079*71438e86SJunchao Zhang if (bas->rootbuflen[scope]) {ierr = PetscSFLinkPackRootData_Private(sf,link,scope,rootdata);CHKERRQ(ierr);} 1080*71438e86SJunchao Zhang ierr = PetscLogEventEnd(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr); 1081*71438e86SJunchao Zhang PetscFunctionReturn(0); 1082*71438e86SJunchao Zhang } 1083*71438e86SJunchao Zhang /* Pack leafdata to leafbuf, which are in the same memory space */ 1084*71438e86SJunchao Zhang PetscErrorCode PetscSFLinkPackLeafData(PetscSF sf,PetscSFLink link,PetscSFScope scope,const void *leafdata) 1085*71438e86SJunchao Zhang { 1086*71438e86SJunchao Zhang PetscErrorCode ierr; 1087*71438e86SJunchao Zhang 1088*71438e86SJunchao Zhang PetscFunctionBegin; 1089*71438e86SJunchao Zhang if (scope == PETSCSF_REMOTE) { 1090*71438e86SJunchao Zhang if (PetscMemTypeDevice(link->leafmtype) && link->SyncDevice && sf->unknown_input_stream) {ierr = (*link->SyncDevice)(link);CHKERRQ(ierr);} 1091*71438e86SJunchao Zhang if (link->PrePack) {ierr = (*link->PrePack)(sf,link,PETSCSF_LEAF2ROOT);CHKERRQ(ierr);} /* Used by SF nvshmem */ 1092*71438e86SJunchao Zhang } 1093*71438e86SJunchao Zhang ierr = PetscLogEventBegin(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr); 1094*71438e86SJunchao Zhang if (sf->leafbuflen[scope]) {ierr = PetscSFLinkPackLeafData_Private(sf,link,scope,leafdata);CHKERRQ(ierr);} 1095*71438e86SJunchao Zhang ierr = PetscLogEventEnd(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr); 1096*71438e86SJunchao Zhang PetscFunctionReturn(0); 1097*71438e86SJunchao Zhang } 1098*71438e86SJunchao Zhang 1099*71438e86SJunchao Zhang PetscErrorCode PetscSFLinkUnpackRootData_Private(PetscSF sf,PetscSFLink link,PetscSFScope scope,void *rootdata,MPI_Op op) 1100cd620004SJunchao Zhang { 1101cd620004SJunchao Zhang PetscErrorCode ierr; 1102cd620004SJunchao Zhang const PetscInt *rootindices = NULL; 1103cd620004SJunchao Zhang PetscInt count,start; 1104cd620004SJunchao Zhang PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 1105fcc7397dSJunchao Zhang PetscErrorCode (*UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*) = NULL; 1106cd620004SJunchao Zhang PetscMemType rootmtype = link->rootmtype; 1107fcc7397dSJunchao Zhang PetscSFPackOpt opt = NULL; 1108cd620004SJunchao Zhang 1109cd620004SJunchao Zhang PetscFunctionBegin; 1110*71438e86SJunchao Zhang if (!link->rootdirect[scope]) { /* If rootdata works directly as rootbuf, skip unpacking */ 1111cd620004SJunchao Zhang ierr = PetscSFLinkGetUnpackAndOp(link,rootmtype,op,bas->rootdups[scope],&UnpackAndOp);CHKERRQ(ierr); 1112cd620004SJunchao Zhang if (UnpackAndOp) { 1113fcc7397dSJunchao Zhang ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,scope,&count,&start,&opt,&rootindices);CHKERRQ(ierr); 1114fcc7397dSJunchao Zhang ierr = (*UnpackAndOp)(link,count,start,opt,rootindices,rootdata,link->rootbuf[scope][rootmtype]);CHKERRQ(ierr); 1115cd620004SJunchao Zhang } else { 1116fcc7397dSJunchao Zhang ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,scope,&count,&start,&opt,&rootindices);CHKERRQ(ierr); 1117cd620004SJunchao Zhang ierr = PetscSFLinkUnpackDataWithMPIReduceLocal(sf,link,count,start,rootindices,rootdata,link->rootbuf[scope][rootmtype],op);CHKERRQ(ierr); 1118cd620004SJunchao Zhang } 1119cd620004SJunchao Zhang } 1120cd620004SJunchao Zhang ierr = PetscSFLinkLogFlopsAfterUnpackRootData(sf,link,scope,op);CHKERRQ(ierr); 1121cd620004SJunchao Zhang PetscFunctionReturn(0); 1122cd620004SJunchao Zhang } 1123cd620004SJunchao Zhang 1124*71438e86SJunchao Zhang PetscErrorCode PetscSFLinkUnpackLeafData_Private(PetscSF sf,PetscSFLink link,PetscSFScope scope,void *leafdata,MPI_Op op) 1125cd620004SJunchao Zhang { 1126cd620004SJunchao Zhang PetscErrorCode ierr; 1127cd620004SJunchao Zhang const PetscInt *leafindices = NULL; 1128cd620004SJunchao Zhang PetscInt count,start; 1129fcc7397dSJunchao Zhang PetscErrorCode (*UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*) = NULL; 1130cd620004SJunchao Zhang PetscMemType leafmtype = link->leafmtype; 1131fcc7397dSJunchao Zhang PetscSFPackOpt opt = NULL; 1132cd620004SJunchao Zhang 1133cd620004SJunchao Zhang PetscFunctionBegin; 1134*71438e86SJunchao Zhang if (!link->leafdirect[scope]) { /* If leafdata works directly as rootbuf, skip unpacking */ 1135cd620004SJunchao Zhang ierr = PetscSFLinkGetUnpackAndOp(link,leafmtype,op,sf->leafdups[scope],&UnpackAndOp);CHKERRQ(ierr); 1136cd620004SJunchao Zhang if (UnpackAndOp) { 1137fcc7397dSJunchao Zhang ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,scope,&count,&start,&opt,&leafindices);CHKERRQ(ierr); 1138fcc7397dSJunchao Zhang ierr = (*UnpackAndOp)(link,count,start,opt,leafindices,leafdata,link->leafbuf[scope][leafmtype]);CHKERRQ(ierr); 1139cd620004SJunchao Zhang } else { 1140fcc7397dSJunchao Zhang ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,scope,&count,&start,&opt,&leafindices);CHKERRQ(ierr); 1141cd620004SJunchao Zhang ierr = PetscSFLinkUnpackDataWithMPIReduceLocal(sf,link,count,start,leafindices,leafdata,link->leafbuf[scope][leafmtype],op);CHKERRQ(ierr); 1142cd620004SJunchao Zhang } 1143cd620004SJunchao Zhang } 1144cd620004SJunchao Zhang ierr = PetscSFLinkLogFlopsAfterUnpackLeafData(sf,link,scope,op);CHKERRQ(ierr); 1145*71438e86SJunchao Zhang PetscFunctionReturn(0); 1146*71438e86SJunchao Zhang } 1147*71438e86SJunchao Zhang /* Unpack rootbuf to rootdata, which are in the same memory space */ 1148*71438e86SJunchao Zhang PetscErrorCode PetscSFLinkUnpackRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,void *rootdata,MPI_Op op) 1149*71438e86SJunchao Zhang { 1150*71438e86SJunchao Zhang PetscErrorCode ierr; 1151*71438e86SJunchao Zhang PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 1152*71438e86SJunchao Zhang 1153*71438e86SJunchao Zhang PetscFunctionBegin; 1154*71438e86SJunchao Zhang ierr = PetscLogEventBegin(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr); 1155*71438e86SJunchao Zhang if (bas->rootbuflen[scope]) {ierr = PetscSFLinkUnpackRootData_Private(sf,link,scope,rootdata,op);CHKERRQ(ierr);} 1156cd620004SJunchao Zhang ierr = PetscLogEventEnd(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr); 1157*71438e86SJunchao Zhang if (scope == PETSCSF_REMOTE) { 1158*71438e86SJunchao Zhang if (link->PostUnpack) {ierr = (*link->PostUnpack)(sf,link,PETSCSF_LEAF2ROOT);CHKERRQ(ierr);} /* Used by SF nvshmem */ 1159*71438e86SJunchao Zhang if (PetscMemTypeDevice(link->rootmtype) && link->SyncDevice && sf->unknown_input_stream) {ierr = (*link->SyncDevice)(link);CHKERRQ(ierr);} 1160*71438e86SJunchao Zhang } 1161cd620004SJunchao Zhang PetscFunctionReturn(0); 1162cd620004SJunchao Zhang } 1163cd620004SJunchao Zhang 1164*71438e86SJunchao Zhang /* Unpack leafbuf to leafdata for remote (common case) or local (rare case when rootmtype != leafmtype) */ 1165*71438e86SJunchao Zhang PetscErrorCode PetscSFLinkUnpackLeafData(PetscSF sf,PetscSFLink link,PetscSFScope scope,void *leafdata,MPI_Op op) 1166*71438e86SJunchao Zhang { 1167*71438e86SJunchao Zhang PetscErrorCode ierr; 1168*71438e86SJunchao Zhang 1169*71438e86SJunchao Zhang PetscFunctionBegin; 1170*71438e86SJunchao Zhang ierr = PetscLogEventBegin(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr); 1171*71438e86SJunchao Zhang if (sf->leafbuflen[scope]) {ierr = PetscSFLinkUnpackLeafData_Private(sf,link,scope,leafdata,op);} 1172*71438e86SJunchao Zhang ierr = PetscLogEventEnd(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr); 1173*71438e86SJunchao Zhang if (scope == PETSCSF_REMOTE) { 1174*71438e86SJunchao Zhang if (link->PostUnpack) {ierr = (*link->PostUnpack)(sf,link,PETSCSF_ROOT2LEAF);CHKERRQ(ierr);} /* Used by SF nvshmem */ 1175*71438e86SJunchao Zhang if (PetscMemTypeDevice(link->leafmtype) && link->SyncDevice && sf->unknown_input_stream) {ierr = (*link->SyncDevice)(link);CHKERRQ(ierr);} 1176*71438e86SJunchao Zhang } 1177*71438e86SJunchao Zhang PetscFunctionReturn(0); 1178*71438e86SJunchao Zhang } 1179*71438e86SJunchao Zhang 1180*71438e86SJunchao Zhang /* FetchAndOp rootdata with rootbuf, it is a kind of Unpack on rootdata, except it also updates rootbuf */ 1181*71438e86SJunchao Zhang PetscErrorCode PetscSFLinkFetchAndOpRemote(PetscSF sf,PetscSFLink link,void *rootdata,MPI_Op op) 1182cd620004SJunchao Zhang { 1183cd620004SJunchao Zhang PetscErrorCode ierr; 1184cd620004SJunchao Zhang const PetscInt *rootindices = NULL; 1185cd620004SJunchao Zhang PetscInt count,start; 1186cd620004SJunchao Zhang PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 1187fcc7397dSJunchao Zhang PetscErrorCode (*FetchAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,void*) = NULL; 1188cd620004SJunchao Zhang PetscMemType rootmtype = link->rootmtype; 1189fcc7397dSJunchao Zhang PetscSFPackOpt opt = NULL; 1190cd620004SJunchao Zhang 1191cd620004SJunchao Zhang PetscFunctionBegin; 1192cd620004SJunchao Zhang ierr = PetscLogEventBegin(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr); 1193*71438e86SJunchao Zhang if (bas->rootbuflen[PETSCSF_REMOTE]) { 1194cd620004SJunchao Zhang /* Do FetchAndOp on rootdata with rootbuf */ 1195*71438e86SJunchao Zhang ierr = PetscSFLinkGetFetchAndOp(link,rootmtype,op,bas->rootdups[PETSCSF_REMOTE],&FetchAndOp);CHKERRQ(ierr); 1196*71438e86SJunchao Zhang ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,PETSCSF_REMOTE,&count,&start,&opt,&rootindices);CHKERRQ(ierr); 1197*71438e86SJunchao Zhang ierr = (*FetchAndOp)(link,count,start,opt,rootindices,rootdata,link->rootbuf[PETSCSF_REMOTE][rootmtype]);CHKERRQ(ierr); 1198cd620004SJunchao Zhang } 1199*71438e86SJunchao Zhang ierr = PetscSFLinkLogFlopsAfterUnpackRootData(sf,link,PETSCSF_REMOTE,op);CHKERRQ(ierr); 1200cd620004SJunchao Zhang ierr = PetscLogEventEnd(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr); 1201cd620004SJunchao Zhang PetscFunctionReturn(0); 1202cd620004SJunchao Zhang } 1203cd620004SJunchao Zhang 1204*71438e86SJunchao Zhang PetscErrorCode PetscSFLinkScatterLocal(PetscSF sf,PetscSFLink link,PetscSFDirection direction,void *rootdata,void *leafdata,MPI_Op op) 1205cd620004SJunchao Zhang { 1206cd620004SJunchao Zhang PetscErrorCode ierr; 1207cd620004SJunchao Zhang const PetscInt *rootindices = NULL,*leafindices = NULL; 1208cd620004SJunchao Zhang PetscInt count,rootstart,leafstart; 1209cd620004SJunchao Zhang PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 1210fcc7397dSJunchao Zhang PetscErrorCode (*ScatterAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*) = NULL; 1211*71438e86SJunchao Zhang PetscMemType rootmtype = link->rootmtype,leafmtype = link->leafmtype,srcmtype,dstmtype; 1212fcc7397dSJunchao Zhang PetscSFPackOpt leafopt = NULL,rootopt = NULL; 1213*71438e86SJunchao Zhang PetscInt buflen = sf->leafbuflen[PETSCSF_LOCAL]; 1214*71438e86SJunchao Zhang char *srcbuf = NULL,*dstbuf = NULL; 1215*71438e86SJunchao Zhang PetscBool dstdups; 1216cd620004SJunchao Zhang 1217*71438e86SJunchao Zhang if (!buflen) PetscFunctionReturn(0); 1218*71438e86SJunchao Zhang if (rootmtype != leafmtype) { /* The cross memory space local scatter is done by pack, copy and unpack */ 1219*71438e86SJunchao Zhang if (direction == PETSCSF_ROOT2LEAF) { 1220cd620004SJunchao Zhang ierr = PetscSFLinkPackRootData(sf,link,PETSCSF_LOCAL,rootdata);CHKERRQ(ierr); 1221*71438e86SJunchao Zhang srcmtype = rootmtype; 1222*71438e86SJunchao Zhang srcbuf = link->rootbuf[PETSCSF_LOCAL][rootmtype]; 1223*71438e86SJunchao Zhang dstmtype = leafmtype; 1224*71438e86SJunchao Zhang dstbuf = link->leafbuf[PETSCSF_LOCAL][leafmtype]; 1225*71438e86SJunchao Zhang } else { 1226*71438e86SJunchao Zhang ierr = PetscSFLinkPackLeafData(sf,link,PETSCSF_LOCAL,leafdata);CHKERRQ(ierr); 1227*71438e86SJunchao Zhang srcmtype = leafmtype; 1228*71438e86SJunchao Zhang srcbuf = link->leafbuf[PETSCSF_LOCAL][leafmtype]; 1229*71438e86SJunchao Zhang dstmtype = rootmtype; 1230*71438e86SJunchao Zhang dstbuf = link->rootbuf[PETSCSF_LOCAL][rootmtype]; 1231*71438e86SJunchao Zhang } 1232*71438e86SJunchao Zhang ierr = (*link->Memcpy)(link,dstmtype,dstbuf,srcmtype,srcbuf,buflen*link->unitbytes);CHKERRQ(ierr); 1233*71438e86SJunchao Zhang /* If above is a device to host copy, we have to sync the stream before accessing the buffer on host */ 1234*71438e86SJunchao Zhang if (PetscMemTypeHost(dstmtype)) {ierr = (*link->SyncStream)(link);CHKERRQ(ierr);} 1235*71438e86SJunchao Zhang if (direction == PETSCSF_ROOT2LEAF) { 1236cd620004SJunchao Zhang ierr = PetscSFLinkUnpackLeafData(sf,link,PETSCSF_LOCAL,leafdata,op);CHKERRQ(ierr); 1237cd620004SJunchao Zhang } else { 1238*71438e86SJunchao Zhang ierr = PetscSFLinkUnpackRootData(sf,link,PETSCSF_LOCAL,rootdata,op);CHKERRQ(ierr); 1239*71438e86SJunchao Zhang } 1240*71438e86SJunchao Zhang } else { 1241*71438e86SJunchao Zhang dstdups = (direction == PETSCSF_ROOT2LEAF) ? sf->leafdups[PETSCSF_LOCAL] : bas->rootdups[PETSCSF_LOCAL]; 1242*71438e86SJunchao Zhang dstmtype = (direction == PETSCSF_ROOT2LEAF) ? link->leafmtype : link->rootmtype; 1243*71438e86SJunchao Zhang ierr = PetscSFLinkGetScatterAndOp(link,dstmtype,op,dstdups,&ScatterAndOp);CHKERRQ(ierr); 1244cd620004SJunchao Zhang if (ScatterAndOp) { 1245fcc7397dSJunchao Zhang ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,PETSCSF_LOCAL,&count,&rootstart,&rootopt,&rootindices);CHKERRQ(ierr); 1246fcc7397dSJunchao Zhang ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,PETSCSF_LOCAL,&count,&leafstart,&leafopt,&leafindices);CHKERRQ(ierr); 1247*71438e86SJunchao Zhang if (direction == PETSCSF_ROOT2LEAF) { 1248fcc7397dSJunchao Zhang ierr = (*ScatterAndOp)(link,count,rootstart,rootopt,rootindices,rootdata,leafstart,leafopt,leafindices,leafdata);CHKERRQ(ierr); 1249cd620004SJunchao Zhang } else { 1250fcc7397dSJunchao Zhang ierr = (*ScatterAndOp)(link,count,leafstart,leafopt,leafindices,leafdata,rootstart,rootopt,rootindices,rootdata);CHKERRQ(ierr); 1251*71438e86SJunchao Zhang } 1252cd620004SJunchao Zhang } else { 1253fcc7397dSJunchao Zhang ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,PETSCSF_LOCAL,&count,&rootstart,&rootopt,&rootindices);CHKERRQ(ierr); 1254fcc7397dSJunchao Zhang ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,PETSCSF_LOCAL,&count,&leafstart,&leafopt,&leafindices);CHKERRQ(ierr); 1255*71438e86SJunchao Zhang if (direction == PETSCSF_ROOT2LEAF) { 1256*71438e86SJunchao Zhang ierr = PetscSFLinkScatterDataWithMPIReduceLocal(sf,link,count,rootstart,rootindices,rootdata,leafstart,leafindices,leafdata,op);CHKERRQ(ierr); 1257*71438e86SJunchao Zhang } else { 1258fcc7397dSJunchao Zhang ierr = PetscSFLinkScatterDataWithMPIReduceLocal(sf,link,count,leafstart,leafindices,leafdata,rootstart,rootindices,rootdata,op);CHKERRQ(ierr); 1259cd620004SJunchao Zhang } 1260cd620004SJunchao Zhang } 1261*71438e86SJunchao Zhang } 1262cd620004SJunchao Zhang PetscFunctionReturn(0); 1263cd620004SJunchao Zhang } 1264cd620004SJunchao Zhang 1265cd620004SJunchao Zhang /* Fetch rootdata to leafdata and leafupdate locally */ 1266cd620004SJunchao Zhang PetscErrorCode PetscSFLinkFetchAndOpLocal(PetscSF sf,PetscSFLink link,void *rootdata,const void *leafdata,void *leafupdate,MPI_Op op) 1267cd620004SJunchao Zhang { 1268cd620004SJunchao Zhang PetscErrorCode ierr; 1269cd620004SJunchao Zhang const PetscInt *rootindices = NULL,*leafindices = NULL; 1270cd620004SJunchao Zhang PetscInt count,rootstart,leafstart; 1271cd620004SJunchao Zhang PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 1272fcc7397dSJunchao Zhang PetscErrorCode (*FetchAndOpLocal)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*) = NULL; 1273cd620004SJunchao Zhang const PetscMemType rootmtype = link->rootmtype,leafmtype = link->leafmtype; 1274fcc7397dSJunchao Zhang PetscSFPackOpt leafopt = NULL,rootopt = NULL; 1275cd620004SJunchao Zhang 1276cd620004SJunchao Zhang PetscFunctionBegin; 1277cd620004SJunchao Zhang if (!bas->rootbuflen[PETSCSF_LOCAL]) PetscFunctionReturn(0); 1278cd620004SJunchao Zhang if (rootmtype != leafmtype) { 1279cd620004SJunchao Zhang /* The local communication has to go through pack and unpack */ 1280cd620004SJunchao Zhang SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Doing PetscSFFetchAndOp with rootdata and leafdata on opposite side of CPU and GPU"); 1281cd620004SJunchao Zhang } else { 1282fcc7397dSJunchao Zhang ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,PETSCSF_LOCAL,&count,&rootstart,&rootopt,&rootindices);CHKERRQ(ierr); 1283fcc7397dSJunchao Zhang ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,PETSCSF_LOCAL,&count,&leafstart,&leafopt,&leafindices);CHKERRQ(ierr); 1284cd620004SJunchao Zhang ierr = PetscSFLinkGetFetchAndOpLocal(link,rootmtype,op,bas->rootdups[PETSCSF_LOCAL],&FetchAndOpLocal);CHKERRQ(ierr); 1285fcc7397dSJunchao Zhang ierr = (*FetchAndOpLocal)(link,count,rootstart,rootopt,rootindices,rootdata,leafstart,leafopt,leafindices,leafdata,leafupdate);CHKERRQ(ierr); 1286cd620004SJunchao Zhang } 128740e23c03SJunchao Zhang PetscFunctionReturn(0); 128840e23c03SJunchao Zhang } 128940e23c03SJunchao Zhang 129040e23c03SJunchao Zhang /* 1291cd620004SJunchao Zhang Create per-rank pack/unpack optimizations based on indice patterns 129240e23c03SJunchao Zhang 129340e23c03SJunchao Zhang Input Parameters: 1294fcc7397dSJunchao Zhang + n - Number of destination ranks 1295eb02082bSJunchao Zhang . offset - [n+1] For the i-th rank, its associated indices are idx[offset[i], offset[i+1]). offset[0] needs not to be 0. 1296b23bfdefSJunchao Zhang - idx - [*] Array storing indices 129740e23c03SJunchao Zhang 129840e23c03SJunchao Zhang Output Parameters: 1299cd620004SJunchao Zhang + opt - Pack optimizations. NULL if no optimizations. 130040e23c03SJunchao Zhang */ 1301cd620004SJunchao Zhang PetscErrorCode PetscSFCreatePackOpt(PetscInt n,const PetscInt *offset,const PetscInt *idx,PetscSFPackOpt *out) 130240e23c03SJunchao Zhang { 130340e23c03SJunchao Zhang PetscErrorCode ierr; 1304fcc7397dSJunchao Zhang PetscInt r,p,start,i,j,k,dx,dy,dz,dydz,m,X,Y; 1305fcc7397dSJunchao Zhang PetscBool optimizable = PETSC_TRUE; 130640e23c03SJunchao Zhang PetscSFPackOpt opt; 130740e23c03SJunchao Zhang 130840e23c03SJunchao Zhang PetscFunctionBegin; 1309fcc7397dSJunchao Zhang ierr = PetscMalloc1(1,&opt);CHKERRQ(ierr); 1310fcc7397dSJunchao Zhang ierr = PetscMalloc1(7*n+2,&opt->array);CHKERRQ(ierr); 1311fcc7397dSJunchao Zhang opt->n = opt->array[0] = n; 1312fcc7397dSJunchao Zhang opt->offset = opt->array + 1; 1313fcc7397dSJunchao Zhang opt->start = opt->array + n + 2; 1314fcc7397dSJunchao Zhang opt->dx = opt->array + 2*n + 2; 1315fcc7397dSJunchao Zhang opt->dy = opt->array + 3*n + 2; 1316fcc7397dSJunchao Zhang opt->dz = opt->array + 4*n + 2; 1317fcc7397dSJunchao Zhang opt->X = opt->array + 5*n + 2; 1318fcc7397dSJunchao Zhang opt->Y = opt->array + 6*n + 2; 1319fcc7397dSJunchao Zhang 1320fcc7397dSJunchao Zhang for (r=0; r<n; r++) { /* For each destination rank */ 1321fcc7397dSJunchao Zhang m = offset[r+1] - offset[r]; /* Total number of indices for this rank. We want to see if m can be factored into dx*dy*dz */ 1322fcc7397dSJunchao Zhang p = offset[r]; 1323fcc7397dSJunchao Zhang start = idx[p]; /* First index for this rank */ 1324fcc7397dSJunchao Zhang p++; 1325fcc7397dSJunchao Zhang 1326fcc7397dSJunchao Zhang /* Search in X dimension */ 1327fcc7397dSJunchao Zhang for (dx=1; dx<m; dx++,p++) { 1328fcc7397dSJunchao Zhang if (start+dx != idx[p]) break; 1329b23bfdefSJunchao Zhang } 1330b23bfdefSJunchao Zhang 1331fcc7397dSJunchao Zhang dydz = m/dx; 1332fcc7397dSJunchao Zhang X = dydz > 1 ? (idx[p]-start) : dx; 1333fcc7397dSJunchao Zhang /* Not optimizable if m is not a multiple of dx, or some unrecognized pattern is found */ 1334fcc7397dSJunchao Zhang if (m%dx || X <= 0) {optimizable = PETSC_FALSE; goto finish;} 1335fcc7397dSJunchao Zhang for (dy=1; dy<dydz; dy++) { /* Search in Y dimension */ 1336fcc7397dSJunchao Zhang for (i=0; i<dx; i++,p++) { 1337fcc7397dSJunchao Zhang if (start+X*dy+i != idx[p]) { 1338fcc7397dSJunchao Zhang if (i) {optimizable = PETSC_FALSE; goto finish;} /* The pattern is violated in the middle of an x-walk */ 1339fcc7397dSJunchao Zhang else goto Z_dimension; 134040e23c03SJunchao Zhang } 134140e23c03SJunchao Zhang } 134240e23c03SJunchao Zhang } 134340e23c03SJunchao Zhang 1344fcc7397dSJunchao Zhang Z_dimension: 1345fcc7397dSJunchao Zhang dz = m/(dx*dy); 1346fcc7397dSJunchao Zhang Y = dz > 1 ? (idx[p]-start)/X : dy; 1347fcc7397dSJunchao Zhang /* Not optimizable if m is not a multiple of dx*dy, or some unrecognized pattern is found */ 1348fcc7397dSJunchao Zhang if (m%(dx*dy) || Y <= 0) {optimizable = PETSC_FALSE; goto finish;} 1349fcc7397dSJunchao Zhang for (k=1; k<dz; k++) { /* Go through Z dimension to see if remaining indices follow the pattern */ 1350fcc7397dSJunchao Zhang for (j=0; j<dy; j++) { 1351fcc7397dSJunchao Zhang for (i=0; i<dx; i++,p++) { 1352fcc7397dSJunchao Zhang if (start+X*Y*k+X*j+i != idx[p]) {optimizable = PETSC_FALSE; goto finish;} 135340e23c03SJunchao Zhang } 135440e23c03SJunchao Zhang } 135540e23c03SJunchao Zhang } 1356fcc7397dSJunchao Zhang opt->start[r] = start; 1357fcc7397dSJunchao Zhang opt->dx[r] = dx; 1358fcc7397dSJunchao Zhang opt->dy[r] = dy; 1359fcc7397dSJunchao Zhang opt->dz[r] = dz; 1360fcc7397dSJunchao Zhang opt->X[r] = X; 1361fcc7397dSJunchao Zhang opt->Y[r] = Y; 136240e23c03SJunchao Zhang } 136340e23c03SJunchao Zhang 1364fcc7397dSJunchao Zhang finish: 1365fcc7397dSJunchao Zhang /* If not optimizable, free arrays to save memory */ 1366fcc7397dSJunchao Zhang if (!n || !optimizable) { 1367fcc7397dSJunchao Zhang ierr = PetscFree(opt->array);CHKERRQ(ierr); 136840e23c03SJunchao Zhang ierr = PetscFree(opt);CHKERRQ(ierr); 136940e23c03SJunchao Zhang *out = NULL; 1370fcc7397dSJunchao Zhang } else { 1371fcc7397dSJunchao Zhang opt->offset[0] = 0; 1372fcc7397dSJunchao Zhang for (r=0; r<n; r++) opt->offset[r+1] = opt->offset[r] + opt->dx[r]*opt->dy[r]*opt->dz[r]; 1373fcc7397dSJunchao Zhang *out = opt; 1374fcc7397dSJunchao Zhang } 137540e23c03SJunchao Zhang PetscFunctionReturn(0); 137640e23c03SJunchao Zhang } 137740e23c03SJunchao Zhang 137820c24465SJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFDestroyPackOpt(PetscSF sf,PetscMemType mtype,PetscSFPackOpt *out) 137940e23c03SJunchao Zhang { 138040e23c03SJunchao Zhang PetscErrorCode ierr; 138140e23c03SJunchao Zhang PetscSFPackOpt opt = *out; 138240e23c03SJunchao Zhang 138340e23c03SJunchao Zhang PetscFunctionBegin; 138440e23c03SJunchao Zhang if (opt) { 138520c24465SJunchao Zhang ierr = PetscSFFree(sf,mtype,opt->array);CHKERRQ(ierr); 138640e23c03SJunchao Zhang ierr = PetscFree(opt);CHKERRQ(ierr); 138740e23c03SJunchao Zhang *out = NULL; 138840e23c03SJunchao Zhang } 138940e23c03SJunchao Zhang PetscFunctionReturn(0); 139040e23c03SJunchao Zhang } 1391cd620004SJunchao Zhang 1392cd620004SJunchao Zhang PetscErrorCode PetscSFSetUpPackFields(PetscSF sf) 1393cd620004SJunchao Zhang { 1394cd620004SJunchao Zhang PetscErrorCode ierr; 1395cd620004SJunchao Zhang PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 1396cd620004SJunchao Zhang PetscInt i,j; 1397cd620004SJunchao Zhang 1398cd620004SJunchao Zhang PetscFunctionBegin; 1399cd620004SJunchao Zhang /* [0] for PETSCSF_LOCAL and [1] for PETSCSF_REMOTE in the following */ 1400cd620004SJunchao Zhang for (i=0; i<2; i++) { /* Set defaults */ 1401cd620004SJunchao Zhang sf->leafstart[i] = 0; 1402cd620004SJunchao Zhang sf->leafcontig[i] = PETSC_TRUE; 1403cd620004SJunchao Zhang sf->leafdups[i] = PETSC_FALSE; 1404cd620004SJunchao Zhang bas->rootstart[i] = 0; 1405cd620004SJunchao Zhang bas->rootcontig[i] = PETSC_TRUE; 1406cd620004SJunchao Zhang bas->rootdups[i] = PETSC_FALSE; 1407cd620004SJunchao Zhang } 1408cd620004SJunchao Zhang 1409cd620004SJunchao Zhang sf->leafbuflen[0] = sf->roffset[sf->ndranks]; 1410cd620004SJunchao Zhang sf->leafbuflen[1] = sf->roffset[sf->nranks] - sf->roffset[sf->ndranks]; 1411cd620004SJunchao Zhang 1412cd620004SJunchao Zhang if (sf->leafbuflen[0]) sf->leafstart[0] = sf->rmine[0]; 1413cd620004SJunchao Zhang if (sf->leafbuflen[1]) sf->leafstart[1] = sf->rmine[sf->roffset[sf->ndranks]]; 1414cd620004SJunchao Zhang 1415cd620004SJunchao Zhang /* Are leaf indices for self and remote contiguous? If yes, it is best for pack/unpack */ 1416cd620004SJunchao Zhang for (i=0; i<sf->roffset[sf->ndranks]; i++) { /* self */ 1417cd620004SJunchao Zhang if (sf->rmine[i] != sf->leafstart[0]+i) {sf->leafcontig[0] = PETSC_FALSE; break;} 1418cd620004SJunchao Zhang } 1419cd620004SJunchao Zhang for (i=sf->roffset[sf->ndranks],j=0; i<sf->roffset[sf->nranks]; i++,j++) { /* remote */ 1420cd620004SJunchao Zhang if (sf->rmine[i] != sf->leafstart[1]+j) {sf->leafcontig[1] = PETSC_FALSE; break;} 1421cd620004SJunchao Zhang } 1422cd620004SJunchao Zhang 1423cd620004SJunchao Zhang /* If not, see if we can have per-rank optimizations by doing index analysis */ 1424cd620004SJunchao Zhang if (!sf->leafcontig[0]) {ierr = PetscSFCreatePackOpt(sf->ndranks, sf->roffset, sf->rmine, &sf->leafpackopt[0]);CHKERRQ(ierr);} 1425cd620004SJunchao Zhang if (!sf->leafcontig[1]) {ierr = PetscSFCreatePackOpt(sf->nranks-sf->ndranks, sf->roffset+sf->ndranks, sf->rmine, &sf->leafpackopt[1]);CHKERRQ(ierr);} 1426cd620004SJunchao Zhang 1427cd620004SJunchao Zhang /* Are root indices for self and remote contiguous? */ 1428cd620004SJunchao Zhang bas->rootbuflen[0] = bas->ioffset[bas->ndiranks]; 1429cd620004SJunchao Zhang bas->rootbuflen[1] = bas->ioffset[bas->niranks] - bas->ioffset[bas->ndiranks]; 1430cd620004SJunchao Zhang 1431cd620004SJunchao Zhang if (bas->rootbuflen[0]) bas->rootstart[0] = bas->irootloc[0]; 1432cd620004SJunchao Zhang if (bas->rootbuflen[1]) bas->rootstart[1] = bas->irootloc[bas->ioffset[bas->ndiranks]]; 1433cd620004SJunchao Zhang 1434cd620004SJunchao Zhang for (i=0; i<bas->ioffset[bas->ndiranks]; i++) { 1435cd620004SJunchao Zhang if (bas->irootloc[i] != bas->rootstart[0]+i) {bas->rootcontig[0] = PETSC_FALSE; break;} 1436cd620004SJunchao Zhang } 1437cd620004SJunchao Zhang for (i=bas->ioffset[bas->ndiranks],j=0; i<bas->ioffset[bas->niranks]; i++,j++) { 1438cd620004SJunchao Zhang if (bas->irootloc[i] != bas->rootstart[1]+j) {bas->rootcontig[1] = PETSC_FALSE; break;} 1439cd620004SJunchao Zhang } 1440cd620004SJunchao Zhang 1441cd620004SJunchao Zhang if (!bas->rootcontig[0]) {ierr = PetscSFCreatePackOpt(bas->ndiranks, bas->ioffset, bas->irootloc, &bas->rootpackopt[0]);CHKERRQ(ierr);} 1442cd620004SJunchao Zhang if (!bas->rootcontig[1]) {ierr = PetscSFCreatePackOpt(bas->niranks-bas->ndiranks, bas->ioffset+bas->ndiranks, bas->irootloc, &bas->rootpackopt[1]);CHKERRQ(ierr);} 1443cd620004SJunchao Zhang 14447fd2d3dbSJunchao Zhang #if defined(PETSC_HAVE_DEVICE) 1445cd620004SJunchao Zhang /* Check dups in indices so that CUDA unpacking kernels can use cheaper regular instructions instead of atomics when they know there are no data race chances */ 1446013b3241SStefano Zampini if (PetscDefined(HAVE_DEVICE)) { 1447013b3241SStefano Zampini PetscBool ismulti = (sf->multi == sf) ? PETSC_TRUE : PETSC_FALSE; 1448013b3241SStefano Zampini if (!sf->leafcontig[0] && !ismulti) {ierr = PetscCheckDupsInt(sf->leafbuflen[0], sf->rmine, &sf->leafdups[0]);CHKERRQ(ierr);} 1449013b3241SStefano Zampini if (!sf->leafcontig[1] && !ismulti) {ierr = PetscCheckDupsInt(sf->leafbuflen[1], sf->rmine+sf->roffset[sf->ndranks], &sf->leafdups[1]);CHKERRQ(ierr);} 1450013b3241SStefano Zampini if (!bas->rootcontig[0] && !ismulti) {ierr = PetscCheckDupsInt(bas->rootbuflen[0], bas->irootloc, &bas->rootdups[0]);CHKERRQ(ierr);} 1451013b3241SStefano Zampini if (!bas->rootcontig[1] && !ismulti) {ierr = PetscCheckDupsInt(bas->rootbuflen[1], bas->irootloc+bas->ioffset[bas->ndiranks], &bas->rootdups[1]);CHKERRQ(ierr);} 1452013b3241SStefano Zampini } 1453cd620004SJunchao Zhang #endif 1454cd620004SJunchao Zhang PetscFunctionReturn(0); 1455cd620004SJunchao Zhang } 1456cd620004SJunchao Zhang 1457cd620004SJunchao Zhang PetscErrorCode PetscSFResetPackFields(PetscSF sf) 1458cd620004SJunchao Zhang { 1459cd620004SJunchao Zhang PetscErrorCode ierr; 1460cd620004SJunchao Zhang PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 1461cd620004SJunchao Zhang PetscInt i; 1462cd620004SJunchao Zhang 1463cd620004SJunchao Zhang PetscFunctionBegin; 1464cd620004SJunchao Zhang for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) { 146520c24465SJunchao Zhang ierr = PetscSFDestroyPackOpt(sf,PETSC_MEMTYPE_HOST,&sf->leafpackopt[i]);CHKERRQ(ierr); 146620c24465SJunchao Zhang ierr = PetscSFDestroyPackOpt(sf,PETSC_MEMTYPE_HOST,&bas->rootpackopt[i]);CHKERRQ(ierr); 14677fd2d3dbSJunchao Zhang #if defined(PETSC_HAVE_DEVICE) 146820c24465SJunchao Zhang ierr = PetscSFDestroyPackOpt(sf,PETSC_MEMTYPE_DEVICE,&sf->leafpackopt_d[i]);CHKERRQ(ierr); 146920c24465SJunchao Zhang ierr = PetscSFDestroyPackOpt(sf,PETSC_MEMTYPE_DEVICE,&bas->rootpackopt_d[i]);CHKERRQ(ierr); 14707fd2d3dbSJunchao Zhang #endif 1471cd620004SJunchao Zhang } 1472cd620004SJunchao Zhang PetscFunctionReturn(0); 1473cd620004SJunchao Zhang } 1474