140e23c03SJunchao Zhang #if !defined(__SFBASIC_H) 240e23c03SJunchao Zhang #define __SFBASIC_H 340e23c03SJunchao Zhang 440e23c03SJunchao Zhang #include <../src/vec/is/sf/impls/basic/sfpack.h> 540e23c03SJunchao Zhang 640e23c03SJunchao Zhang typedef enum {PETSCSF_LEAF2ROOT_REDUCE=0, PETSCSF_ROOT2LEAF_BCAST=1} PetscSFDirection; 740e23c03SJunchao Zhang 840e23c03SJunchao Zhang typedef struct _n_PetscSFPack_Basic *PetscSFPack_Basic; 940e23c03SJunchao Zhang 10*b23bfdefSJunchao Zhang 11*b23bfdefSJunchao Zhang /* Why do we want to double MPI requests? 12*b23bfdefSJunchao Zhang Note each PetscSFPack link supports either leaf2root or root2leaf communication, but not simultaneously both. 13*b23bfdefSJunchao Zhang We use persistent MPI requests in SFBasic. By doubling the requests, the communications in both direction can 14*b23bfdefSJunchao Zhang shared rootbuf and leafbuf. SFNeighbor etc do not need this since MPI does not support persistent requests for 15*b23bfdefSJunchao Zhang collectives yet. But once MPI adds this feature, SFNeighbor etc can also benefit from this design. 16*b23bfdefSJunchao Zhang */ 1740e23c03SJunchao Zhang #define SPPACKBASICHEADER \ 1840e23c03SJunchao Zhang SFPACKHEADER; \ 1940e23c03SJunchao Zhang PetscMPIInt half; /* Number of MPI_Requests used for either leaf2root or root2leaf communication */ \ 2040e23c03SJunchao Zhang MPI_Request *requests /* [2*half] requests arranged in this order: leaf2root root/leaf reqs, root2leaf root/leaf reqs */ 2140e23c03SJunchao Zhang 2240e23c03SJunchao Zhang struct _n_PetscSFPack_Basic { 2340e23c03SJunchao Zhang SPPACKBASICHEADER; 2440e23c03SJunchao Zhang PetscBool initialized[2]; /* Is the communcation pattern in each direction initialized? [0] for leaf2root, [1] for root2leaf */ 2540e23c03SJunchao Zhang }; 2640e23c03SJunchao Zhang 2740e23c03SJunchao Zhang #define SFBASICHEADER \ 2840e23c03SJunchao Zhang PetscMPIInt niranks; /* Number of incoming ranks (ranks accessing my roots) */ \ 2940e23c03SJunchao Zhang PetscMPIInt ndiranks; /* Number of incoming ranks (ranks accessing my roots) in distinguished set */ \ 3040e23c03SJunchao Zhang PetscMPIInt *iranks; /* Array of ranks that reference my roots */ \ 3140e23c03SJunchao Zhang PetscInt itotal; /* Total number of graph edges referencing my roots */ \ 3240e23c03SJunchao Zhang PetscInt *ioffset; /* Array of length niranks+1 holding offset in irootloc[] for each rank */ \ 3340e23c03SJunchao Zhang PetscInt *irootloc; /* Incoming roots referenced by ranks starting at ioffset[rank] */ \ 3440e23c03SJunchao Zhang PetscSFPackOpt rootpackopt; /* Optimization plans to (un)pack roots based on patterns in irootloc[]. NULL for no plans */ \ 35*b23bfdefSJunchao Zhang PetscSFPackOpt selfrootpackopt; /* Optimization plans to (un)pack roots connected to local leaves */ \ 3640e23c03SJunchao Zhang PetscSFPack avail; /* One or more entries per MPI Datatype, lazily constructed */ \ 3740e23c03SJunchao Zhang PetscSFPack inuse /* Buffers being used for transactions that have not yet completed */ 3840e23c03SJunchao Zhang 3940e23c03SJunchao Zhang typedef struct { 4040e23c03SJunchao Zhang SFBASICHEADER; 4140e23c03SJunchao Zhang } PetscSF_Basic; 4240e23c03SJunchao Zhang 4340e23c03SJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFGetRootInfo_Basic(PetscSF sf,PetscInt *nrootranks,PetscInt *ndrootranks,const PetscMPIInt **rootranks,const PetscInt **rootoffset,const PetscInt **rootloc) 4440e23c03SJunchao Zhang { 4540e23c03SJunchao Zhang PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 4640e23c03SJunchao Zhang 4740e23c03SJunchao Zhang PetscFunctionBegin; 4840e23c03SJunchao Zhang if (nrootranks) *nrootranks = bas->niranks; 4940e23c03SJunchao Zhang if (ndrootranks) *ndrootranks = bas->ndiranks; 5040e23c03SJunchao Zhang if (rootranks) *rootranks = bas->iranks; 5140e23c03SJunchao Zhang if (rootoffset) *rootoffset = bas->ioffset; 5240e23c03SJunchao Zhang if (rootloc) *rootloc = bas->irootloc; 5340e23c03SJunchao Zhang PetscFunctionReturn(0); 5440e23c03SJunchao Zhang } 5540e23c03SJunchao Zhang 5640e23c03SJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFGetLeafInfo_Basic(PetscSF sf,PetscInt *nleafranks,PetscInt *ndleafranks,const PetscMPIInt **leafranks,const PetscInt **leafoffset,const PetscInt **leafloc,const PetscInt **leafrremote) 5740e23c03SJunchao Zhang { 5840e23c03SJunchao Zhang PetscFunctionBegin; 5940e23c03SJunchao Zhang if (nleafranks) *nleafranks = sf->nranks; 6040e23c03SJunchao Zhang if (ndleafranks) *ndleafranks = sf->ndranks; 6140e23c03SJunchao Zhang if (leafranks) *leafranks = sf->ranks; 6240e23c03SJunchao Zhang if (leafoffset) *leafoffset = sf->roffset; 6340e23c03SJunchao Zhang if (leafloc) *leafloc = sf->rmine; 6440e23c03SJunchao Zhang if (leafrremote) *leafrremote = sf->rremote; 6540e23c03SJunchao Zhang PetscFunctionReturn(0); 6640e23c03SJunchao Zhang } 6740e23c03SJunchao Zhang 68*b23bfdefSJunchao Zhang /* Get root locations either on Host (CPU) or Device (GPU) */ 69*b23bfdefSJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFGetRootIndicesAtPlace_Basic(PetscSF sf,PetscBool isdevice, const PetscInt **rootloc) 70*b23bfdefSJunchao Zhang { 71*b23bfdefSJunchao Zhang PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 72*b23bfdefSJunchao Zhang PetscFunctionBegin; 73*b23bfdefSJunchao Zhang if (rootloc) *rootloc = bas->irootloc; 74*b23bfdefSJunchao Zhang PetscFunctionReturn(0); 75*b23bfdefSJunchao Zhang } 76*b23bfdefSJunchao Zhang 77*b23bfdefSJunchao Zhang /* Get leaf locations either on Host (CPU) or Device (GPU) */ 78*b23bfdefSJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFGetLeafIndicesAtPlace_Basic(PetscSF sf,PetscBool isdevice, const PetscInt **leafloc) 79*b23bfdefSJunchao Zhang { 80*b23bfdefSJunchao Zhang PetscFunctionBegin; 81*b23bfdefSJunchao Zhang if (leafloc) *leafloc = sf->rmine; 82*b23bfdefSJunchao Zhang PetscFunctionReturn(0); 83*b23bfdefSJunchao Zhang } 84*b23bfdefSJunchao Zhang 85*b23bfdefSJunchao Zhang typedef struct { 86*b23bfdefSJunchao Zhang PetscInt count; /* Number of entries to pack, unpack etc. */ 87*b23bfdefSJunchao Zhang PetscInt offset; /* Offset of the first entry */ 88*b23bfdefSJunchao Zhang PetscSFPackOpt opt; /* Pack optimizations */ 89*b23bfdefSJunchao Zhang char *buf; /* The contiguous buffer where we pack to or unpack from */ 90*b23bfdefSJunchao Zhang } PackInfo; 91*b23bfdefSJunchao Zhang 92*b23bfdefSJunchao Zhang /* Utility routine to pack selected entries of rootdata into root buffer */ 93*b23bfdefSJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFPackRootData(PetscSF sf,PetscSFPack link,PetscInt nrootranks,PetscInt ndrootranks,const PetscInt *rootoffset,const PetscInt *rootloc,const void *rootdata) 94*b23bfdefSJunchao Zhang { 95*b23bfdefSJunchao Zhang PetscErrorCode ierr; 96*b23bfdefSJunchao Zhang PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 97*b23bfdefSJunchao Zhang PetscInt i; 98*b23bfdefSJunchao Zhang PackInfo pinfo[2] = {{rootoffset[ndrootranks], 0, bas->selfrootpackopt, link->selfbuf}, {rootoffset[nrootranks]-rootoffset[ndrootranks], rootoffset[ndrootranks], bas->rootpackopt, link->rootbuf}}; 99*b23bfdefSJunchao Zhang 100*b23bfdefSJunchao Zhang PetscFunctionBegin; 101*b23bfdefSJunchao Zhang /* Only do packing when count != 0 so that we can avoid invoking CUDA kernels on GPU. */ 102*b23bfdefSJunchao Zhang for (i=0; i<2; i++) {if (pinfo[i].count) {ierr = (*link->Pack)(pinfo[i].count,rootloc+pinfo[i].offset,link->bs,pinfo[i].opt,rootdata,pinfo[i].buf);CHKERRQ(ierr);}} 103*b23bfdefSJunchao Zhang PetscFunctionReturn(0); 104*b23bfdefSJunchao Zhang } 105*b23bfdefSJunchao Zhang 106*b23bfdefSJunchao Zhang /* Utility routine to pack selected entries of leafdata into leaf buffer */ 107*b23bfdefSJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFPackLeafData(PetscSF sf,PetscSFPack link,PetscInt nleafranks,PetscInt ndleafranks,const PetscInt *leafoffset,const PetscInt *leafloc,const void *leafdata) 108*b23bfdefSJunchao Zhang { 109*b23bfdefSJunchao Zhang PetscErrorCode ierr; 110*b23bfdefSJunchao Zhang PetscInt i; 111*b23bfdefSJunchao Zhang PackInfo pinfo[2] = {{leafoffset[ndleafranks], 0, sf->selfleafpackopt, link->selfbuf}, {leafoffset[nleafranks]-leafoffset[ndleafranks], leafoffset[ndleafranks], sf->leafpackopt, link->leafbuf}}; 112*b23bfdefSJunchao Zhang 113*b23bfdefSJunchao Zhang PetscFunctionBegin; 114*b23bfdefSJunchao Zhang for (i=0; i<2; i++) {if (pinfo[i].count) {ierr = (*link->Pack)(pinfo[i].count,leafloc+pinfo[i].offset,link->bs,pinfo[i].opt,leafdata,pinfo[i].buf);CHKERRQ(ierr);}} 115*b23bfdefSJunchao Zhang PetscFunctionReturn(0); 116*b23bfdefSJunchao Zhang } 117*b23bfdefSJunchao Zhang 118*b23bfdefSJunchao Zhang /* Utility routine to unpack data from root buffer and Op it into selected entries of rootdata */ 119*b23bfdefSJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFUnpackAndOpRootData(PetscSF sf,PetscSFPack link,PetscInt nrootranks,PetscInt ndrootranks,const PetscInt *rootoffset,const PetscInt *rootloc,void *rootdata,MPI_Op op) 120*b23bfdefSJunchao Zhang { 121*b23bfdefSJunchao Zhang PetscErrorCode ierr; 122*b23bfdefSJunchao Zhang PetscInt i,j; 123*b23bfdefSJunchao Zhang PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 124*b23bfdefSJunchao Zhang PetscErrorCode (*UnpackAndOp)(PetscInt,const PetscInt*,PetscInt,PetscSFPackOpt,void*,const void*); 125*b23bfdefSJunchao Zhang PackInfo pinfo[2] = {{rootoffset[ndrootranks], 0, bas->selfrootpackopt, link->selfbuf}, {rootoffset[nrootranks]-rootoffset[ndrootranks], rootoffset[ndrootranks], bas->rootpackopt, link->rootbuf}}; 126*b23bfdefSJunchao Zhang 127*b23bfdefSJunchao Zhang PetscFunctionBegin; 128*b23bfdefSJunchao Zhang ierr = PetscSFPackGetUnpackAndOp(sf,(PetscSFPack)link,op,&UnpackAndOp);CHKERRQ(ierr); 129*b23bfdefSJunchao Zhang for (i=0; i<2; i++) { 130*b23bfdefSJunchao Zhang if (UnpackAndOp && pinfo[i].count) {ierr = (*UnpackAndOp)(pinfo[i].count,rootloc+pinfo[i].offset,link->bs,pinfo[i].opt,rootdata,pinfo[i].buf);CHKERRQ(ierr);} 131*b23bfdefSJunchao Zhang else {for (j=0; j<pinfo[i].count; j++) {ierr = MPI_Reduce_local(pinfo[i].buf+j*link->unitbytes,(char *)rootdata+(rootloc[pinfo[i].offset+j])*link->unitbytes,1,link->unit,op);CHKERRQ(ierr);}} 132*b23bfdefSJunchao Zhang } 133*b23bfdefSJunchao Zhang PetscFunctionReturn(0); 134*b23bfdefSJunchao Zhang } 135*b23bfdefSJunchao Zhang 136*b23bfdefSJunchao Zhang /* Utility routine to unpack data from leaf buffer and Op it into selected entries of leafdata */ 137*b23bfdefSJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFUnpackAndOpLeafData(PetscSF sf,PetscSFPack link,PetscInt nleafranks,PetscInt ndleafranks,const PetscInt *leafoffset,const PetscInt *leafloc, void *leafdata,MPI_Op op) 138*b23bfdefSJunchao Zhang { 139*b23bfdefSJunchao Zhang PetscErrorCode ierr; 140*b23bfdefSJunchao Zhang PetscInt i,j; 141*b23bfdefSJunchao Zhang PetscErrorCode (*UnpackAndOp)(PetscInt,const PetscInt*,PetscInt,PetscSFPackOpt,void*,const void*); 142*b23bfdefSJunchao Zhang PackInfo pinfo[2] = {{leafoffset[ndleafranks], 0, sf->selfleafpackopt, link->selfbuf}, {leafoffset[nleafranks]-leafoffset[ndleafranks], leafoffset[ndleafranks], sf->leafpackopt, link->leafbuf}}; 143*b23bfdefSJunchao Zhang 144*b23bfdefSJunchao Zhang PetscFunctionBegin; 145*b23bfdefSJunchao Zhang ierr = PetscSFPackGetUnpackAndOp(sf,(PetscSFPack)link,op,&UnpackAndOp);CHKERRQ(ierr); 146*b23bfdefSJunchao Zhang for (i=0; i<2; i++) { 147*b23bfdefSJunchao Zhang if (UnpackAndOp && pinfo[i].count) {ierr = (*UnpackAndOp)(pinfo[i].count,leafloc+pinfo[i].offset,link->bs,pinfo[i].opt,leafdata,pinfo[i].buf);CHKERRQ(ierr);} 148*b23bfdefSJunchao Zhang else {for (j=0; j<pinfo[i].count; j++) {ierr = MPI_Reduce_local(pinfo[i].buf+j*link->unitbytes,(char *)leafdata+(leafloc[pinfo[i].offset+j])*link->unitbytes,1,link->unit,op);CHKERRQ(ierr);}} 149*b23bfdefSJunchao Zhang } 150*b23bfdefSJunchao Zhang PetscFunctionReturn(0); 151*b23bfdefSJunchao Zhang } 152*b23bfdefSJunchao Zhang 153*b23bfdefSJunchao Zhang /* Utility routine to fetch and Op selected entries of rootdata */ 154*b23bfdefSJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFFetchAndOpRootData(PetscSF sf,PetscSFPack link,PetscInt nrootranks,PetscInt ndrootranks,const PetscInt *rootoffset,const PetscInt *rootloc,void *rootdata,MPI_Op op) 155*b23bfdefSJunchao Zhang { 156*b23bfdefSJunchao Zhang PetscErrorCode ierr; 157*b23bfdefSJunchao Zhang PetscInt i; 158*b23bfdefSJunchao Zhang PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 159*b23bfdefSJunchao Zhang PetscErrorCode (*FetchAndOp)(PetscInt,const PetscInt*,PetscInt,PetscSFPackOpt,void*,void*); 160*b23bfdefSJunchao Zhang PackInfo pinfo[2] = {{rootoffset[ndrootranks], 0, bas->selfrootpackopt, link->selfbuf}, {rootoffset[nrootranks]-rootoffset[ndrootranks], rootoffset[ndrootranks], bas->rootpackopt, link->rootbuf}}; 161*b23bfdefSJunchao Zhang 162*b23bfdefSJunchao Zhang PetscFunctionBegin; 163*b23bfdefSJunchao Zhang ierr = PetscSFPackGetFetchAndOp(sf,(PetscSFPack)link,op,&FetchAndOp);CHKERRQ(ierr); 164*b23bfdefSJunchao Zhang for (i=0; i<2; i++) {if (pinfo[i].count) {ierr = (*FetchAndOp)(pinfo[i].count,rootloc+pinfo[i].offset,link->bs,pinfo[i].opt,rootdata,pinfo[i].buf);CHKERRQ(ierr);}} 165*b23bfdefSJunchao Zhang PetscFunctionReturn(0); 166*b23bfdefSJunchao Zhang } 167*b23bfdefSJunchao Zhang 168*b23bfdefSJunchao Zhang 169*b23bfdefSJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFPackSetupOptimization_Basic(PetscSF sf) 170*b23bfdefSJunchao Zhang { 171*b23bfdefSJunchao Zhang PetscErrorCode ierr; 172*b23bfdefSJunchao Zhang PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 173*b23bfdefSJunchao Zhang 174*b23bfdefSJunchao Zhang PetscFunctionBegin; 175*b23bfdefSJunchao Zhang ierr = PetscSFPackSetupOptimization(sf->ndranks, sf->roffset, sf->rmine, &sf->selfleafpackopt);CHKERRQ(ierr); 176*b23bfdefSJunchao Zhang ierr = PetscSFPackSetupOptimization(sf->nranks-sf->ndranks, sf->roffset+sf->ndranks, sf->rmine, &sf->leafpackopt);CHKERRQ(ierr); 177*b23bfdefSJunchao Zhang ierr = PetscSFPackSetupOptimization(bas->ndiranks, bas->ioffset, bas->irootloc,&bas->selfrootpackopt);CHKERRQ(ierr); 178*b23bfdefSJunchao Zhang ierr = PetscSFPackSetupOptimization(bas->niranks-bas->ndiranks,bas->ioffset+bas->ndiranks,bas->irootloc,&bas->rootpackopt);CHKERRQ(ierr); 179*b23bfdefSJunchao Zhang PetscFunctionReturn(0); 180*b23bfdefSJunchao Zhang } 181*b23bfdefSJunchao Zhang 182*b23bfdefSJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFPackDestroyOptimization_Basic(PetscSF sf) 183*b23bfdefSJunchao Zhang { 184*b23bfdefSJunchao Zhang PetscErrorCode ierr; 185*b23bfdefSJunchao Zhang PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 186*b23bfdefSJunchao Zhang 187*b23bfdefSJunchao Zhang PetscFunctionBegin; 188*b23bfdefSJunchao Zhang ierr = PetscSFPackDestoryOptimization(&sf->leafpackopt);CHKERRQ(ierr); 189*b23bfdefSJunchao Zhang ierr = PetscSFPackDestoryOptimization(&sf->selfleafpackopt);CHKERRQ(ierr); 190*b23bfdefSJunchao Zhang ierr = PetscSFPackDestoryOptimization(&bas->rootpackopt);CHKERRQ(ierr); 191*b23bfdefSJunchao Zhang ierr = PetscSFPackDestoryOptimization(&bas->selfrootpackopt);CHKERRQ(ierr); 192*b23bfdefSJunchao Zhang PetscFunctionReturn(0); 193*b23bfdefSJunchao Zhang } 194*b23bfdefSJunchao Zhang 19540e23c03SJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFPackWaitall_Basic(PetscSFPack_Basic link,PetscSFDirection direction) 19640e23c03SJunchao Zhang { 19740e23c03SJunchao Zhang PetscErrorCode ierr; 19840e23c03SJunchao Zhang MPI_Request *requests = (direction == PETSCSF_LEAF2ROOT_REDUCE) ? link->requests : link->requests + link->half; 19940e23c03SJunchao Zhang 20040e23c03SJunchao Zhang PetscFunctionBegin; 20140e23c03SJunchao Zhang ierr = MPI_Waitall(link->half,requests,MPI_STATUSES_IGNORE);CHKERRQ(ierr); 20240e23c03SJunchao Zhang PetscFunctionReturn(0); 20340e23c03SJunchao Zhang } 20440e23c03SJunchao Zhang 20540e23c03SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFSetUp_Basic(PetscSF); 20640e23c03SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFView_Basic(PetscSF,PetscViewer); 20740e23c03SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFReset_Basic(PetscSF); 20840e23c03SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFDestroy_Basic(PetscSF); 20940e23c03SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFBcastAndOpEnd_Basic(PetscSF,MPI_Datatype,const void*,void*,MPI_Op); 21040e23c03SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFReduceEnd_Basic(PetscSF,MPI_Datatype,const void*,void*,MPI_Op); 21140e23c03SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFFetchAndOpBegin_Basic(PetscSF,MPI_Datatype,void*,const void*,void*,MPI_Op); 212f659e5c7SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFCreateEmbeddedSF_Basic(PetscSF,PetscInt,const PetscInt*,PetscSF*); 213f659e5c7SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFCreateEmbeddedLeafSF_Basic(PetscSF,PetscInt,const PetscInt*,PetscSF*); 21440e23c03SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFGetLeafRanks_Basic(PetscSF,PetscInt*,const PetscMPIInt**,const PetscInt**,const PetscInt**); 215*b23bfdefSJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFPackGet_Basic_Common(PetscSF,MPI_Datatype,const void*,const void*,PetscInt,PetscSFPack_Basic*); 21640e23c03SJunchao Zhang #endif 217