140e23c03SJunchao Zhang #if !defined(__SFBASIC_H) 240e23c03SJunchao Zhang #define __SFBASIC_H 340e23c03SJunchao Zhang 440e23c03SJunchao Zhang #include <../src/vec/is/sf/impls/basic/sfpack.h> 540e23c03SJunchao Zhang 640e23c03SJunchao Zhang typedef enum {PETSCSF_LEAF2ROOT_REDUCE=0, PETSCSF_ROOT2LEAF_BCAST=1} PetscSFDirection; 740e23c03SJunchao Zhang 840e23c03SJunchao Zhang #define SFBASICHEADER \ 940e23c03SJunchao Zhang PetscMPIInt niranks; /* Number of incoming ranks (ranks accessing my roots) */ \ 1040e23c03SJunchao Zhang PetscMPIInt ndiranks; /* Number of incoming ranks (ranks accessing my roots) in distinguished set */ \ 1140e23c03SJunchao Zhang PetscMPIInt *iranks; /* Array of ranks that reference my roots */ \ 1240e23c03SJunchao Zhang PetscInt itotal; /* Total number of graph edges referencing my roots */ \ 1340e23c03SJunchao Zhang PetscInt *ioffset; /* Array of length niranks+1 holding offset in irootloc[] for each rank */ \ 1440e23c03SJunchao Zhang PetscInt *irootloc; /* Incoming roots referenced by ranks starting at ioffset[rank] */ \ 15*eb02082bSJunchao Zhang PetscInt *irootloc_d; /* irootloc in device memory when needed */ \ 1640e23c03SJunchao Zhang PetscSFPackOpt rootpackopt; /* Optimization plans to (un)pack roots based on patterns in irootloc[]. NULL for no plans */ \ 17b23bfdefSJunchao Zhang PetscSFPackOpt selfrootpackopt; /* Optimization plans to (un)pack roots connected to local leaves */ \ 1840e23c03SJunchao Zhang PetscSFPack avail; /* One or more entries per MPI Datatype, lazily constructed */ \ 19*eb02082bSJunchao Zhang PetscSFPack inuse; /* Buffers being used for transactions that have not yet completed */ \ 20*eb02082bSJunchao Zhang PetscBool selfrootdups; /* Indices of roots in irootloc[0,ioffset[ndiranks]) have dups, implying theads working ... */ \ 21*eb02082bSJunchao Zhang /* ... on these roots in parallel may have data race. */ \ 22*eb02082bSJunchao Zhang PetscBool remoterootdups /* Indices of roots in irootloc[ioffset[ndiranks],ioffset[niranks]) have dups */ 2340e23c03SJunchao Zhang 2440e23c03SJunchao Zhang typedef struct { 2540e23c03SJunchao Zhang SFBASICHEADER; 2640e23c03SJunchao Zhang } PetscSF_Basic; 2740e23c03SJunchao Zhang 2840e23c03SJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFGetRootInfo_Basic(PetscSF sf,PetscInt *nrootranks,PetscInt *ndrootranks,const PetscMPIInt **rootranks,const PetscInt **rootoffset,const PetscInt **rootloc) 2940e23c03SJunchao Zhang { 3040e23c03SJunchao Zhang PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 3140e23c03SJunchao Zhang 3240e23c03SJunchao Zhang PetscFunctionBegin; 3340e23c03SJunchao Zhang if (nrootranks) *nrootranks = bas->niranks; 3440e23c03SJunchao Zhang if (ndrootranks) *ndrootranks = bas->ndiranks; 3540e23c03SJunchao Zhang if (rootranks) *rootranks = bas->iranks; 3640e23c03SJunchao Zhang if (rootoffset) *rootoffset = bas->ioffset; 3740e23c03SJunchao Zhang if (rootloc) *rootloc = bas->irootloc; 3840e23c03SJunchao Zhang PetscFunctionReturn(0); 3940e23c03SJunchao Zhang } 4040e23c03SJunchao Zhang 4140e23c03SJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFGetLeafInfo_Basic(PetscSF sf,PetscInt *nleafranks,PetscInt *ndleafranks,const PetscMPIInt **leafranks,const PetscInt **leafoffset,const PetscInt **leafloc,const PetscInt **leafrremote) 4240e23c03SJunchao Zhang { 4340e23c03SJunchao Zhang PetscFunctionBegin; 4440e23c03SJunchao Zhang if (nleafranks) *nleafranks = sf->nranks; 4540e23c03SJunchao Zhang if (ndleafranks) *ndleafranks = sf->ndranks; 4640e23c03SJunchao Zhang if (leafranks) *leafranks = sf->ranks; 4740e23c03SJunchao Zhang if (leafoffset) *leafoffset = sf->roffset; 4840e23c03SJunchao Zhang if (leafloc) *leafloc = sf->rmine; 4940e23c03SJunchao Zhang if (leafrremote) *leafrremote = sf->rremote; 5040e23c03SJunchao Zhang PetscFunctionReturn(0); 5140e23c03SJunchao Zhang } 5240e23c03SJunchao Zhang 53*eb02082bSJunchao Zhang /* Get root locations either on Host or Device */ 54*eb02082bSJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFGetRootIndicesWithMemType_Basic(PetscSF sf,PetscMemType mtype, const PetscInt **rootloc) 55b23bfdefSJunchao Zhang { 56b23bfdefSJunchao Zhang PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 57b23bfdefSJunchao Zhang PetscFunctionBegin; 58*eb02082bSJunchao Zhang if (rootloc) { 59*eb02082bSJunchao Zhang if (mtype == PETSC_MEMTYPE_HOST) *rootloc = bas->irootloc; 60*eb02082bSJunchao Zhang #if defined(PETSC_HAVE_CUDA) 61*eb02082bSJunchao Zhang else if (mtype == PETSC_MEMTYPE_DEVICE) { 62*eb02082bSJunchao Zhang if (!bas->irootloc_d) { 63*eb02082bSJunchao Zhang cudaError_t err; 64*eb02082bSJunchao Zhang size_t size = bas->ioffset[bas->niranks]*sizeof(PetscInt); 65*eb02082bSJunchao Zhang err = cudaMalloc((void **)&bas->irootloc_d,size);CHKERRCUDA(err); 66*eb02082bSJunchao Zhang err = cudaMemcpy(bas->irootloc_d,bas->irootloc,size,cudaMemcpyHostToDevice);CHKERRCUDA(err); 67*eb02082bSJunchao Zhang } 68*eb02082bSJunchao Zhang *rootloc = bas->irootloc_d; 69*eb02082bSJunchao Zhang } 70*eb02082bSJunchao Zhang #endif 71*eb02082bSJunchao Zhang else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Wrong PetscMemType in getting rootloc %d",(int)mtype); 72*eb02082bSJunchao Zhang } 73b23bfdefSJunchao Zhang PetscFunctionReturn(0); 74b23bfdefSJunchao Zhang } 75b23bfdefSJunchao Zhang 76b23bfdefSJunchao Zhang /* Get leaf locations either on Host (CPU) or Device (GPU) */ 77*eb02082bSJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFGetLeafIndicesWithMemType_Basic(PetscSF sf,PetscMemType mtype, const PetscInt **leafloc) 78b23bfdefSJunchao Zhang { 79b23bfdefSJunchao Zhang PetscFunctionBegin; 80*eb02082bSJunchao Zhang if (leafloc) { 81*eb02082bSJunchao Zhang if (mtype == PETSC_MEMTYPE_HOST) *leafloc = sf->rmine; 82*eb02082bSJunchao Zhang #if defined(PETSC_HAVE_CUDA) 83*eb02082bSJunchao Zhang else if (mtype == PETSC_MEMTYPE_DEVICE) { 84*eb02082bSJunchao Zhang if (!sf->rmine_d) { 85*eb02082bSJunchao Zhang cudaError_t err; 86*eb02082bSJunchao Zhang size_t size = sf->roffset[sf->nranks]*sizeof(PetscInt); 87*eb02082bSJunchao Zhang err = cudaMalloc((void **)&sf->rmine_d,size);CHKERRCUDA(err); 88*eb02082bSJunchao Zhang err = cudaMemcpy(sf->rmine_d,sf->rmine,size,cudaMemcpyHostToDevice);CHKERRCUDA(err); 89*eb02082bSJunchao Zhang } 90*eb02082bSJunchao Zhang *leafloc = sf->rmine_d; 91*eb02082bSJunchao Zhang } 92*eb02082bSJunchao Zhang #endif 93*eb02082bSJunchao Zhang else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Wrong PetscMemType in getting leafloc %d",(int)mtype); 94*eb02082bSJunchao Zhang } 95b23bfdefSJunchao Zhang PetscFunctionReturn(0); 96b23bfdefSJunchao Zhang } 97b23bfdefSJunchao Zhang 98*eb02082bSJunchao Zhang /* A convenience struct to provide info to the following (un)packing routines so that we can treat packings to self and to remote in one loop. */ 99*eb02082bSJunchao Zhang typedef struct _n_PackInfo { 100b23bfdefSJunchao Zhang PetscInt count; /* Number of entries to pack, unpack etc. */ 101*eb02082bSJunchao Zhang const PetscInt *idx; /* Indices of the entries. NULL for contiguous indices [0,count-1) */ 102b23bfdefSJunchao Zhang PetscSFPackOpt opt; /* Pack optimizations */ 103b23bfdefSJunchao Zhang char *buf; /* The contiguous buffer where we pack to or unpack from */ 104*eb02082bSJunchao Zhang PetscBool atomic; /* Whether the unpack routine needs to use atomics */ 105b23bfdefSJunchao Zhang } PackInfo; 106b23bfdefSJunchao Zhang 107*eb02082bSJunchao Zhang /* Utility routine to pack selected entries of rootdata into root buffer. 108*eb02082bSJunchao Zhang Input Arguments: 109*eb02082bSJunchao Zhang + sf - The SF this packing works on. 110*eb02082bSJunchao Zhang . link - The PetscSFPack, which gives the memtype of the roots and also provides root buffer. 111*eb02082bSJunchao Zhang . rootloc - Indices of the roots, only meaningful if the root space is sparse 112*eb02082bSJunchao Zhang . rootdata - Where to read the roots. 113*eb02082bSJunchao Zhang - sparse - Is the root space sparse (for SFBasic, SFNeighbor) or dense (for SFAllgatherv etc) 114*eb02082bSJunchao Zhang */ 115*eb02082bSJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFPackRootData(PetscSF sf,PetscSFPack link,const PetscInt *rootloc,const void *rootdata,PetscBool sparse) 116b23bfdefSJunchao Zhang { 117b23bfdefSJunchao Zhang PetscErrorCode ierr; 118b23bfdefSJunchao Zhang PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 119b23bfdefSJunchao Zhang PetscInt i; 120*eb02082bSJunchao Zhang PetscErrorCode (*Pack)(PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,const void*,void*); 121*eb02082bSJunchao Zhang PackInfo p[2]; 122b23bfdefSJunchao Zhang 123b23bfdefSJunchao Zhang PetscFunctionBegin; 124*eb02082bSJunchao Zhang if (sparse) { 125*eb02082bSJunchao Zhang /* For SFBasic and SFNeighbor, whose root space is sparse and have separate buffers for self and remote. */ 126*eb02082bSJunchao Zhang p[0].count = bas->ioffset[bas->ndiranks]; p[1].count = bas->ioffset[bas->niranks]-bas->ioffset[bas->ndiranks]; 127*eb02082bSJunchao Zhang p[0].idx = rootloc; p[1].idx = rootloc+bas->ioffset[bas->ndiranks]; 128*eb02082bSJunchao Zhang p[0].opt = bas->selfrootpackopt; p[1].opt = bas->rootpackopt; 129*eb02082bSJunchao Zhang p[0].buf = link->selfbuf[link->rootmtype]; p[1].buf = link->rootbuf[link->rootmtype]; 130*eb02082bSJunchao Zhang p[0].atomic = PETSC_FALSE; p[1].atomic = PETSC_FALSE; 131*eb02082bSJunchao Zhang } else { 132*eb02082bSJunchao Zhang /* For SFAllgatherv etc collectives, which have a dense root space and do not differentiate self/remote communication. */ 133*eb02082bSJunchao Zhang p[0].count = sf->nroots; 134*eb02082bSJunchao Zhang p[0].idx = NULL; 135*eb02082bSJunchao Zhang p[0].opt = NULL; 136*eb02082bSJunchao Zhang p[0].buf = link->rootbuf[link->rootmtype]; 137*eb02082bSJunchao Zhang p[0].atomic = PETSC_FALSE; 138*eb02082bSJunchao Zhang p[1].count = 0; 139*eb02082bSJunchao Zhang } 140*eb02082bSJunchao Zhang 141*eb02082bSJunchao Zhang ierr = PetscSFPackGetPack(link,link->rootmtype,&Pack);CHKERRQ(ierr); 142b23bfdefSJunchao Zhang /* Only do packing when count != 0 so that we can avoid invoking CUDA kernels on GPU. */ 143*eb02082bSJunchao Zhang for (i=0; i<2; i++) {if (p[i].count) {ierr = (*Pack)(p[i].count,p[i].idx,link,p[i].opt,rootdata,p[i].buf);CHKERRQ(ierr);}} 144*eb02082bSJunchao Zhang #if defined(PETSC_HAVE_CUDA) 145*eb02082bSJunchao Zhang /* Let's assume MPI does not do cudaDeviceSynchronize() at entrance, which I do not know. Then we have to sync the packing kernel to make rootbuf & selfbuf ready for MPI & self to self copy */ 146*eb02082bSJunchao Zhang if (link->rootmtype == PETSC_MEMTYPE_DEVICE && (p[0].count || p[1].count)) {cudaError_t err = cudaStreamSynchronize(link->stream);CHKERRCUDA(err);} 147*eb02082bSJunchao Zhang #endif 148b23bfdefSJunchao Zhang PetscFunctionReturn(0); 149b23bfdefSJunchao Zhang } 150b23bfdefSJunchao Zhang 151b23bfdefSJunchao Zhang /* Utility routine to pack selected entries of leafdata into leaf buffer */ 152*eb02082bSJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFPackLeafData(PetscSF sf,PetscSFPack link,const PetscInt *leafloc,const void *leafdata,PetscBool sparse) 153b23bfdefSJunchao Zhang { 154b23bfdefSJunchao Zhang PetscErrorCode ierr; 155b23bfdefSJunchao Zhang PetscInt i; 156*eb02082bSJunchao Zhang PetscErrorCode (*Pack)(PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,const void*,void*); 157*eb02082bSJunchao Zhang PackInfo p[2]; 158b23bfdefSJunchao Zhang 159b23bfdefSJunchao Zhang PetscFunctionBegin; 160*eb02082bSJunchao Zhang if (sparse) { 161*eb02082bSJunchao Zhang p[0].count = sf->roffset[sf->ndranks]; p[1].count = sf->roffset[sf->nranks]-sf->roffset[sf->ndranks]; 162*eb02082bSJunchao Zhang p[0].idx = leafloc; p[1].idx = leafloc+sf->roffset[sf->ndranks]; 163*eb02082bSJunchao Zhang p[0].opt = sf->selfleafpackopt; p[1].opt = sf->leafpackopt; 164*eb02082bSJunchao Zhang p[0].buf = link->selfbuf[link->leafmtype]; p[1].buf = link->leafbuf[link->leafmtype]; 165*eb02082bSJunchao Zhang p[0].atomic = PETSC_FALSE; p[1].atomic = PETSC_FALSE; 166*eb02082bSJunchao Zhang } else { 167*eb02082bSJunchao Zhang p[0].count = sf->nleaves; 168*eb02082bSJunchao Zhang p[0].idx = NULL; 169*eb02082bSJunchao Zhang p[0].opt = NULL; 170*eb02082bSJunchao Zhang p[0].buf = link->leafbuf[link->leafmtype]; 171*eb02082bSJunchao Zhang p[0].atomic = PETSC_FALSE; 172*eb02082bSJunchao Zhang p[1].count = 0; 173*eb02082bSJunchao Zhang } 174*eb02082bSJunchao Zhang 175*eb02082bSJunchao Zhang ierr = PetscSFPackGetPack(link,link->leafmtype,&Pack);CHKERRQ(ierr); 176*eb02082bSJunchao Zhang for (i=0; i<2; i++) {if (p[i].count) {ierr = (*Pack)(p[i].count,p[i].idx,link,p[i].opt,leafdata,p[i].buf);CHKERRQ(ierr);}} 177*eb02082bSJunchao Zhang #if defined(PETSC_HAVE_CUDA) 178*eb02082bSJunchao Zhang if (link->leafmtype == PETSC_MEMTYPE_DEVICE && (p[0].count || p[1].count)) {cudaError_t err = cudaStreamSynchronize(link->stream);CHKERRCUDA(err);} 179*eb02082bSJunchao Zhang #endif 180b23bfdefSJunchao Zhang PetscFunctionReturn(0); 181b23bfdefSJunchao Zhang } 182b23bfdefSJunchao Zhang 183b23bfdefSJunchao Zhang /* Utility routine to unpack data from root buffer and Op it into selected entries of rootdata */ 184*eb02082bSJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFUnpackAndOpRootData(PetscSF sf,PetscSFPack link,const PetscInt *rootloc,void *rootdata,MPI_Op op,PetscBool sparse) 185b23bfdefSJunchao Zhang { 186b23bfdefSJunchao Zhang PetscErrorCode ierr; 187b23bfdefSJunchao Zhang PetscInt i; 188b23bfdefSJunchao Zhang PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 189*eb02082bSJunchao Zhang PetscErrorCode (*UnpackAndOp)(PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,void*,const void*); 190*eb02082bSJunchao Zhang PackInfo p[2]; 191b23bfdefSJunchao Zhang 192b23bfdefSJunchao Zhang PetscFunctionBegin; 193*eb02082bSJunchao Zhang if (sparse) { 194*eb02082bSJunchao Zhang p[0].count = bas->ioffset[bas->ndiranks]; p[1].count = bas->ioffset[bas->niranks]-bas->ioffset[bas->ndiranks]; 195*eb02082bSJunchao Zhang p[0].idx = rootloc; p[1].idx = rootloc+bas->ioffset[bas->ndiranks]; 196*eb02082bSJunchao Zhang p[0].opt = bas->selfrootpackopt; p[1].opt = bas->rootpackopt; 197*eb02082bSJunchao Zhang p[0].buf = link->selfbuf[link->rootmtype]; p[1].buf = link->rootbuf[link->rootmtype]; 198*eb02082bSJunchao Zhang p[0].atomic = bas->selfrootdups; p[1].atomic = bas->remoterootdups; 199*eb02082bSJunchao Zhang } else { 200*eb02082bSJunchao Zhang p[0].count = sf->nroots; 201*eb02082bSJunchao Zhang p[0].idx = NULL; 202*eb02082bSJunchao Zhang p[0].opt = NULL; 203*eb02082bSJunchao Zhang p[0].buf = link->rootbuf[link->rootmtype]; 204*eb02082bSJunchao Zhang p[0].atomic = PETSC_FALSE; 205*eb02082bSJunchao Zhang p[1].count = 0; 206*eb02082bSJunchao Zhang } 207*eb02082bSJunchao Zhang 208*eb02082bSJunchao Zhang for (i=0; i<2; i++) { 209*eb02082bSJunchao Zhang if (p[i].count) { 210*eb02082bSJunchao Zhang ierr = PetscSFPackGetUnpackAndOp(link,link->rootmtype,op,p[i].atomic,&UnpackAndOp);CHKERRQ(ierr); 211*eb02082bSJunchao Zhang if (UnpackAndOp) {ierr = (*UnpackAndOp)(p[i].count,p[i].idx,link,p[i].opt,rootdata,p[i].buf);CHKERRQ(ierr);} 212*eb02082bSJunchao Zhang #if defined(PETSC_HAVE_MPI_REDUCE_LOCAL) 213*eb02082bSJunchao Zhang else { 214*eb02082bSJunchao Zhang PetscInt j; 215*eb02082bSJunchao Zhang PetscMPIInt n; 216*eb02082bSJunchao Zhang if (p[i].idx) { 217*eb02082bSJunchao Zhang /* Note if done on GPU, this can be very slow due to the huge number of kernel calls. The op is likely user-defined. We must 218*eb02082bSJunchao Zhang use link->unit (instead of link->basicunit) as the datatype and 1 (instead of link->bs) as the count in MPI_Reduce_local. 219*eb02082bSJunchao Zhang */ 220*eb02082bSJunchao Zhang for (j=0; j<p[i].count; j++) {ierr = MPI_Reduce_local(p[i].buf+j*link->unitbytes,(char *)rootdata+p[i].idx[j]*link->unitbytes,1,link->unit,op);CHKERRQ(ierr);} 221*eb02082bSJunchao Zhang } else { 222*eb02082bSJunchao Zhang ierr = PetscMPIIntCast(p[i].count,&n);CHKERRQ(ierr); 223*eb02082bSJunchao Zhang ierr = MPI_Reduce_local(p[i].buf,rootdata,n,link->unit,op);CHKERRQ(ierr); 224*eb02082bSJunchao Zhang } 225*eb02082bSJunchao Zhang } 226*eb02082bSJunchao Zhang #else 227*eb02082bSJunchao Zhang else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No unpacking reduction operation for this MPI_Op"); 228*eb02082bSJunchao Zhang #endif 229*eb02082bSJunchao Zhang } 230*eb02082bSJunchao Zhang } 231*eb02082bSJunchao Zhang #if defined(PETSC_HAVE_CUDA) 232*eb02082bSJunchao Zhang if (link->rootmtype == PETSC_MEMTYPE_DEVICE && (p[0].count || p[1].count)) {cudaError_t err = cudaStreamSynchronize(link->stream);CHKERRCUDA(err);} 233*eb02082bSJunchao Zhang #endif 234b23bfdefSJunchao Zhang PetscFunctionReturn(0); 235b23bfdefSJunchao Zhang } 236b23bfdefSJunchao Zhang 237*eb02082bSJunchao Zhang /* Utility routine to unpack data from leaf buffer and Op it into selected entries of leafdata */ 238*eb02082bSJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFUnpackAndOpLeafData(PetscSF sf,PetscSFPack link,const PetscInt *leafloc,void *leafdata,MPI_Op op,PetscBool sparse) 239*eb02082bSJunchao Zhang { 240*eb02082bSJunchao Zhang PetscErrorCode ierr; 241*eb02082bSJunchao Zhang PetscInt i; 242*eb02082bSJunchao Zhang PetscErrorCode (*UnpackAndOp)(PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,void*,const void*); 243*eb02082bSJunchao Zhang PackInfo p[2]; 244b23bfdefSJunchao Zhang 245*eb02082bSJunchao Zhang PetscFunctionBegin; 246*eb02082bSJunchao Zhang if (sparse) { 247*eb02082bSJunchao Zhang p[0].count = sf->roffset[sf->ndranks]; p[1].count = sf->roffset[sf->nranks]-sf->roffset[sf->ndranks]; 248*eb02082bSJunchao Zhang p[0].idx = leafloc; p[1].idx = leafloc+sf->roffset[sf->ndranks]; 249*eb02082bSJunchao Zhang p[0].opt = sf->selfleafpackopt; p[1].opt = sf->leafpackopt; 250*eb02082bSJunchao Zhang p[0].buf = link->selfbuf[link->leafmtype]; p[1].buf = link->leafbuf[link->leafmtype]; 251*eb02082bSJunchao Zhang p[0].atomic = sf->selfleafdups; p[1].atomic = sf->remoteleafdups; 252*eb02082bSJunchao Zhang } else { 253*eb02082bSJunchao Zhang p[0].count = sf->nleaves; 254*eb02082bSJunchao Zhang p[0].idx = NULL; 255*eb02082bSJunchao Zhang p[0].opt = NULL; 256*eb02082bSJunchao Zhang p[0].buf = link->leafbuf[link->leafmtype]; 257*eb02082bSJunchao Zhang p[0].atomic = PETSC_FALSE; 258*eb02082bSJunchao Zhang p[1].count = 0; 259*eb02082bSJunchao Zhang } 260*eb02082bSJunchao Zhang 261*eb02082bSJunchao Zhang for (i=0; i<2; i++) { 262*eb02082bSJunchao Zhang if (p[i].count) { 263*eb02082bSJunchao Zhang ierr = PetscSFPackGetUnpackAndOp(link,link->leafmtype,op,p[i].atomic,&UnpackAndOp);CHKERRQ(ierr); 264*eb02082bSJunchao Zhang if (UnpackAndOp) {ierr = (*UnpackAndOp)(p[i].count,p[i].idx,link,p[i].opt,leafdata,p[i].buf);CHKERRQ(ierr);} 265*eb02082bSJunchao Zhang #if defined(PETSC_HAVE_MPI_REDUCE_LOCAL) 266*eb02082bSJunchao Zhang else { 267*eb02082bSJunchao Zhang PetscInt j; 268*eb02082bSJunchao Zhang PetscMPIInt n; 269*eb02082bSJunchao Zhang if (p[i].idx) { 270*eb02082bSJunchao Zhang for (j=0; j<p[i].count; j++) {ierr = MPI_Reduce_local(p[i].buf+j*link->unitbytes,(char *)leafdata+p[i].idx[j]*link->unitbytes,1,link->unit,op);CHKERRQ(ierr);} 271*eb02082bSJunchao Zhang } else { 272*eb02082bSJunchao Zhang ierr = PetscMPIIntCast(p[i].count,&n);CHKERRQ(ierr); 273*eb02082bSJunchao Zhang ierr = MPI_Reduce_local(p[i].buf,leafdata,n,link->unit,op);CHKERRQ(ierr); 274*eb02082bSJunchao Zhang } 275*eb02082bSJunchao Zhang } 276*eb02082bSJunchao Zhang #else 277*eb02082bSJunchao Zhang else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No unpacking reduction operation for this MPI_Op"); 278*eb02082bSJunchao Zhang #endif 279*eb02082bSJunchao Zhang } 280*eb02082bSJunchao Zhang } 281*eb02082bSJunchao Zhang #if defined(PETSC_HAVE_CUDA) 282*eb02082bSJunchao Zhang if (link->leafmtype == PETSC_MEMTYPE_DEVICE && (p[0].count || p[1].count)) {cudaError_t err = cudaStreamSynchronize(link->stream);CHKERRCUDA(err);} 283*eb02082bSJunchao Zhang #endif 284*eb02082bSJunchao Zhang PetscFunctionReturn(0); 285*eb02082bSJunchao Zhang } 286*eb02082bSJunchao Zhang 287*eb02082bSJunchao Zhang /* Utility routine to fetch and Op selected entries of rootdata */ 288*eb02082bSJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFFetchAndOpRootData(PetscSF sf,PetscSFPack link,const PetscInt *rootloc,void *rootdata,MPI_Op op,PetscBool sparse) 289*eb02082bSJunchao Zhang { 290*eb02082bSJunchao Zhang PetscErrorCode ierr; 291*eb02082bSJunchao Zhang PetscInt i; 292*eb02082bSJunchao Zhang PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 293*eb02082bSJunchao Zhang PetscErrorCode (*FetchAndOp)(PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,void*,void*); 294*eb02082bSJunchao Zhang PackInfo p[2]; 295*eb02082bSJunchao Zhang 296*eb02082bSJunchao Zhang PetscFunctionBegin; 297*eb02082bSJunchao Zhang if (sparse) { 298*eb02082bSJunchao Zhang /* For SFBasic and SFNeighbor, whose root space is sparse and have separate buffers for self and remote. */ 299*eb02082bSJunchao Zhang p[0].count = bas->ioffset[bas->ndiranks]; p[1].count = bas->ioffset[bas->niranks]-bas->ioffset[bas->ndiranks]; 300*eb02082bSJunchao Zhang p[0].idx = rootloc; p[1].idx = rootloc+bas->ioffset[bas->ndiranks]; 301*eb02082bSJunchao Zhang p[0].opt = bas->selfrootpackopt; p[1].opt = bas->rootpackopt; 302*eb02082bSJunchao Zhang p[0].buf = link->selfbuf[link->rootmtype]; p[1].buf = link->rootbuf[link->rootmtype]; 303*eb02082bSJunchao Zhang p[0].atomic = bas->selfrootdups; p[1].atomic = bas->remoterootdups; 304*eb02082bSJunchao Zhang } else { 305*eb02082bSJunchao Zhang p[0].count = sf->nroots; 306*eb02082bSJunchao Zhang p[0].idx = NULL; 307*eb02082bSJunchao Zhang p[0].opt = NULL; 308*eb02082bSJunchao Zhang p[0].buf = link->rootbuf[link->rootmtype]; 309*eb02082bSJunchao Zhang p[0].atomic = PETSC_FALSE; 310*eb02082bSJunchao Zhang p[1].count = 0; 311*eb02082bSJunchao Zhang } 312*eb02082bSJunchao Zhang 313*eb02082bSJunchao Zhang for (i=0; i<2; i++) { 314*eb02082bSJunchao Zhang if (p[i].count) { 315*eb02082bSJunchao Zhang ierr = PetscSFPackGetFetchAndOp(link,link->rootmtype,op,p[i].atomic,&FetchAndOp);CHKERRQ(ierr); 316*eb02082bSJunchao Zhang ierr = (*FetchAndOp)(p[i].count,p[i].idx,link,p[i].opt,rootdata,p[i].buf);CHKERRQ(ierr); 317*eb02082bSJunchao Zhang } 318*eb02082bSJunchao Zhang } 319*eb02082bSJunchao Zhang #if defined(PETSC_HAVE_CUDA) 320*eb02082bSJunchao Zhang if (link->rootmtype == PETSC_MEMTYPE_DEVICE && (p[0].count || p[1].count)) {cudaError_t err = cudaStreamSynchronize(link->stream);CHKERRCUDA(err);} 321*eb02082bSJunchao Zhang #endif 322*eb02082bSJunchao Zhang PetscFunctionReturn(0); 323*eb02082bSJunchao Zhang } 324*eb02082bSJunchao Zhang 325*eb02082bSJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFPackSetupOptimizations_Basic(PetscSF sf) 326b23bfdefSJunchao Zhang { 327b23bfdefSJunchao Zhang PetscErrorCode ierr; 328b23bfdefSJunchao Zhang PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 329b23bfdefSJunchao Zhang 330b23bfdefSJunchao Zhang PetscFunctionBegin; 331*eb02082bSJunchao Zhang ierr = PetscSFPackOptCreate(sf->ndranks, sf->roffset, sf->rmine, &sf->selfleafpackopt);CHKERRQ(ierr); 332*eb02082bSJunchao Zhang ierr = PetscSFPackOptCreate(sf->nranks-sf->ndranks, sf->roffset+sf->ndranks, sf->rmine, &sf->leafpackopt);CHKERRQ(ierr); 333*eb02082bSJunchao Zhang ierr = PetscSFPackOptCreate(bas->ndiranks, bas->ioffset, bas->irootloc,&bas->selfrootpackopt);CHKERRQ(ierr); 334*eb02082bSJunchao Zhang ierr = PetscSFPackOptCreate(bas->niranks-bas->ndiranks,bas->ioffset+bas->ndiranks,bas->irootloc,&bas->rootpackopt);CHKERRQ(ierr); 335*eb02082bSJunchao Zhang 336*eb02082bSJunchao Zhang #if defined(PETSC_HAVE_CUDA) 337*eb02082bSJunchao Zhang /* Check duplicates in irootloc[] so CUDA packing kernels can use cheaper regular operations 338*eb02082bSJunchao Zhang instead of atomics to unpack data on leaves/roots, when they know there is not data race. 339*eb02082bSJunchao Zhang */ 340*eb02082bSJunchao Zhang ierr = PetscCheckDupsInt(sf->roffset[sf->ndranks], sf->rmine, &sf->selfleafdups);CHKERRQ(ierr); 341*eb02082bSJunchao Zhang ierr = PetscCheckDupsInt(sf->roffset[sf->nranks]-sf->roffset[sf->ndranks], sf->rmine+sf->roffset[sf->ndranks], &sf->remoteleafdups);CHKERRQ(ierr); 342*eb02082bSJunchao Zhang ierr = PetscCheckDupsInt(bas->ioffset[bas->ndiranks], bas->irootloc, &bas->selfrootdups);CHKERRQ(ierr); 343*eb02082bSJunchao Zhang ierr = PetscCheckDupsInt(bas->ioffset[bas->niranks]-bas->ioffset[bas->ndiranks],bas->irootloc+bas->ioffset[bas->ndiranks],&bas->remoterootdups);CHKERRQ(ierr); 344*eb02082bSJunchao Zhang #endif 345b23bfdefSJunchao Zhang PetscFunctionReturn(0); 346b23bfdefSJunchao Zhang } 347b23bfdefSJunchao Zhang 348*eb02082bSJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFPackDestroyOptimizations_Basic(PetscSF sf) 349b23bfdefSJunchao Zhang { 350b23bfdefSJunchao Zhang PetscErrorCode ierr; 351b23bfdefSJunchao Zhang PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 352b23bfdefSJunchao Zhang 353b23bfdefSJunchao Zhang PetscFunctionBegin; 354*eb02082bSJunchao Zhang ierr = PetscSFPackOptDestory(&sf->leafpackopt);CHKERRQ(ierr); 355*eb02082bSJunchao Zhang ierr = PetscSFPackOptDestory(&sf->selfleafpackopt);CHKERRQ(ierr); 356*eb02082bSJunchao Zhang ierr = PetscSFPackOptDestory(&bas->rootpackopt);CHKERRQ(ierr); 357*eb02082bSJunchao Zhang ierr = PetscSFPackOptDestory(&bas->selfrootpackopt);CHKERRQ(ierr); 358*eb02082bSJunchao Zhang #if defined(PETSC_HAVE_CUDA) 359*eb02082bSJunchao Zhang sf->selfleafdups = PETSC_TRUE; 360*eb02082bSJunchao Zhang sf->remoteleafdups = PETSC_TRUE; 361*eb02082bSJunchao Zhang bas->selfrootdups = PETSC_TRUE; /* The default is assuming there are dups so that atomics are used. */ 362*eb02082bSJunchao Zhang bas->remoterootdups = PETSC_TRUE; 363*eb02082bSJunchao Zhang #endif 364b23bfdefSJunchao Zhang PetscFunctionReturn(0); 365b23bfdefSJunchao Zhang } 366b23bfdefSJunchao Zhang 367*eb02082bSJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFPackWaitall_Basic(PetscSFPack link,PetscSFDirection direction) 36840e23c03SJunchao Zhang { 36940e23c03SJunchao Zhang PetscErrorCode ierr; 37040e23c03SJunchao Zhang 37140e23c03SJunchao Zhang PetscFunctionBegin; 372*eb02082bSJunchao Zhang ierr = MPI_Waitall(link->nrootreqs,link->rootreqs[direction][link->rootmtype],MPI_STATUSES_IGNORE);CHKERRQ(ierr); 373*eb02082bSJunchao Zhang ierr = MPI_Waitall(link->nleafreqs,link->leafreqs[direction][link->leafmtype],MPI_STATUSES_IGNORE);CHKERRQ(ierr); 37440e23c03SJunchao Zhang PetscFunctionReturn(0); 37540e23c03SJunchao Zhang } 37640e23c03SJunchao Zhang 37740e23c03SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFSetUp_Basic(PetscSF); 37840e23c03SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFView_Basic(PetscSF,PetscViewer); 37940e23c03SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFReset_Basic(PetscSF); 38040e23c03SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFDestroy_Basic(PetscSF); 381*eb02082bSJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFBcastAndOpEnd_Basic (PetscSF,MPI_Datatype,PetscMemType,const void*,PetscMemType, void*, MPI_Op); 382*eb02082bSJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFReduceEnd_Basic (PetscSF,MPI_Datatype,PetscMemType,const void*,PetscMemType, void*, MPI_Op); 383*eb02082bSJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFFetchAndOpBegin_Basic(PetscSF,MPI_Datatype,PetscMemType, void*,PetscMemType,const void*,void*,MPI_Op); 384f659e5c7SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFCreateEmbeddedSF_Basic(PetscSF,PetscInt,const PetscInt*,PetscSF*); 385f659e5c7SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFCreateEmbeddedLeafSF_Basic(PetscSF,PetscInt,const PetscInt*,PetscSF*); 38640e23c03SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFGetLeafRanks_Basic(PetscSF,PetscInt*,const PetscMPIInt**,const PetscInt**,const PetscInt**); 387*eb02082bSJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFPackGet_Basic_Common(PetscSF,MPI_Datatype,PetscMemType,const void*,PetscMemType,const void*,PetscInt,PetscInt,PetscSFPack*); 38840e23c03SJunchao Zhang #endif 389