xref: /petsc/src/vec/is/sf/impls/basic/sfbasic.h (revision eb02082b21b627ae704d1b64176a7a4b8db6ed4b)
140e23c03SJunchao Zhang #if !defined(__SFBASIC_H)
240e23c03SJunchao Zhang #define __SFBASIC_H
340e23c03SJunchao Zhang 
440e23c03SJunchao Zhang #include <../src/vec/is/sf/impls/basic/sfpack.h>
540e23c03SJunchao Zhang 
640e23c03SJunchao Zhang typedef enum {PETSCSF_LEAF2ROOT_REDUCE=0, PETSCSF_ROOT2LEAF_BCAST=1} PetscSFDirection;
740e23c03SJunchao Zhang 
840e23c03SJunchao Zhang #define SFBASICHEADER \
940e23c03SJunchao Zhang   PetscMPIInt      niranks;         /* Number of incoming ranks (ranks accessing my roots) */                                      \
1040e23c03SJunchao Zhang   PetscMPIInt      ndiranks;        /* Number of incoming ranks (ranks accessing my roots) in distinguished set */                 \
1140e23c03SJunchao Zhang   PetscMPIInt      *iranks;         /* Array of ranks that reference my roots */                                                   \
1240e23c03SJunchao Zhang   PetscInt         itotal;          /* Total number of graph edges referencing my roots */                                         \
1340e23c03SJunchao Zhang   PetscInt         *ioffset;        /* Array of length niranks+1 holding offset in irootloc[] for each rank */                     \
1440e23c03SJunchao Zhang   PetscInt         *irootloc;       /* Incoming roots referenced by ranks starting at ioffset[rank] */                             \
15*eb02082bSJunchao Zhang   PetscInt         *irootloc_d;     /* irootloc in device memory when needed */                                                    \
1640e23c03SJunchao Zhang   PetscSFPackOpt   rootpackopt;     /* Optimization plans to (un)pack roots based on patterns in irootloc[]. NULL for no plans */  \
17b23bfdefSJunchao Zhang   PetscSFPackOpt   selfrootpackopt; /* Optimization plans to (un)pack roots connected to local leaves */                           \
1840e23c03SJunchao Zhang   PetscSFPack      avail;           /* One or more entries per MPI Datatype, lazily constructed */                                 \
19*eb02082bSJunchao Zhang   PetscSFPack      inuse;           /* Buffers being used for transactions that have not yet completed */                          \
20*eb02082bSJunchao Zhang   PetscBool        selfrootdups;    /* Indices of roots in irootloc[0,ioffset[ndiranks]) have dups, implying theads working ... */ \
21*eb02082bSJunchao Zhang                                     /* ... on these roots in parallel may have data race. */                                       \
22*eb02082bSJunchao Zhang   PetscBool        remoterootdups   /* Indices of roots in irootloc[ioffset[ndiranks],ioffset[niranks]) have dups */
2340e23c03SJunchao Zhang 
2440e23c03SJunchao Zhang typedef struct {
2540e23c03SJunchao Zhang   SFBASICHEADER;
2640e23c03SJunchao Zhang } PetscSF_Basic;
2740e23c03SJunchao Zhang 
2840e23c03SJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFGetRootInfo_Basic(PetscSF sf,PetscInt *nrootranks,PetscInt *ndrootranks,const PetscMPIInt **rootranks,const PetscInt **rootoffset,const PetscInt **rootloc)
2940e23c03SJunchao Zhang {
3040e23c03SJunchao Zhang   PetscSF_Basic *bas = (PetscSF_Basic*)sf->data;
3140e23c03SJunchao Zhang 
3240e23c03SJunchao Zhang   PetscFunctionBegin;
3340e23c03SJunchao Zhang   if (nrootranks)  *nrootranks  = bas->niranks;
3440e23c03SJunchao Zhang   if (ndrootranks) *ndrootranks = bas->ndiranks;
3540e23c03SJunchao Zhang   if (rootranks)   *rootranks   = bas->iranks;
3640e23c03SJunchao Zhang   if (rootoffset)  *rootoffset  = bas->ioffset;
3740e23c03SJunchao Zhang   if (rootloc)     *rootloc     = bas->irootloc;
3840e23c03SJunchao Zhang   PetscFunctionReturn(0);
3940e23c03SJunchao Zhang }
4040e23c03SJunchao Zhang 
4140e23c03SJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFGetLeafInfo_Basic(PetscSF sf,PetscInt *nleafranks,PetscInt *ndleafranks,const PetscMPIInt **leafranks,const PetscInt **leafoffset,const PetscInt **leafloc,const PetscInt **leafrremote)
4240e23c03SJunchao Zhang {
4340e23c03SJunchao Zhang   PetscFunctionBegin;
4440e23c03SJunchao Zhang   if (nleafranks)  *nleafranks  = sf->nranks;
4540e23c03SJunchao Zhang   if (ndleafranks) *ndleafranks = sf->ndranks;
4640e23c03SJunchao Zhang   if (leafranks)   *leafranks   = sf->ranks;
4740e23c03SJunchao Zhang   if (leafoffset)  *leafoffset  = sf->roffset;
4840e23c03SJunchao Zhang   if (leafloc)     *leafloc     = sf->rmine;
4940e23c03SJunchao Zhang   if (leafrremote) *leafrremote = sf->rremote;
5040e23c03SJunchao Zhang   PetscFunctionReturn(0);
5140e23c03SJunchao Zhang }
5240e23c03SJunchao Zhang 
53*eb02082bSJunchao Zhang /* Get root locations either on Host or Device */
54*eb02082bSJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFGetRootIndicesWithMemType_Basic(PetscSF sf,PetscMemType mtype, const PetscInt **rootloc)
55b23bfdefSJunchao Zhang {
56b23bfdefSJunchao Zhang   PetscSF_Basic *bas = (PetscSF_Basic*)sf->data;
57b23bfdefSJunchao Zhang   PetscFunctionBegin;
58*eb02082bSJunchao Zhang   if (rootloc) {
59*eb02082bSJunchao Zhang     if (mtype == PETSC_MEMTYPE_HOST) *rootloc = bas->irootloc;
60*eb02082bSJunchao Zhang #if defined(PETSC_HAVE_CUDA)
61*eb02082bSJunchao Zhang     else if (mtype == PETSC_MEMTYPE_DEVICE) {
62*eb02082bSJunchao Zhang       if (!bas->irootloc_d) {
63*eb02082bSJunchao Zhang         cudaError_t err;
64*eb02082bSJunchao Zhang         size_t      size = bas->ioffset[bas->niranks]*sizeof(PetscInt);
65*eb02082bSJunchao Zhang         err = cudaMalloc((void **)&bas->irootloc_d,size);CHKERRCUDA(err);
66*eb02082bSJunchao Zhang         err = cudaMemcpy(bas->irootloc_d,bas->irootloc,size,cudaMemcpyHostToDevice);CHKERRCUDA(err);
67*eb02082bSJunchao Zhang       }
68*eb02082bSJunchao Zhang       *rootloc = bas->irootloc_d;
69*eb02082bSJunchao Zhang     }
70*eb02082bSJunchao Zhang #endif
71*eb02082bSJunchao Zhang     else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Wrong PetscMemType in getting rootloc %d",(int)mtype);
72*eb02082bSJunchao Zhang   }
73b23bfdefSJunchao Zhang   PetscFunctionReturn(0);
74b23bfdefSJunchao Zhang }
75b23bfdefSJunchao Zhang 
76b23bfdefSJunchao Zhang /* Get leaf locations either on Host (CPU) or Device (GPU) */
77*eb02082bSJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFGetLeafIndicesWithMemType_Basic(PetscSF sf,PetscMemType mtype, const PetscInt **leafloc)
78b23bfdefSJunchao Zhang {
79b23bfdefSJunchao Zhang   PetscFunctionBegin;
80*eb02082bSJunchao Zhang   if (leafloc) {
81*eb02082bSJunchao Zhang     if (mtype == PETSC_MEMTYPE_HOST) *leafloc = sf->rmine;
82*eb02082bSJunchao Zhang #if defined(PETSC_HAVE_CUDA)
83*eb02082bSJunchao Zhang     else if (mtype == PETSC_MEMTYPE_DEVICE) {
84*eb02082bSJunchao Zhang       if (!sf->rmine_d) {
85*eb02082bSJunchao Zhang         cudaError_t err;
86*eb02082bSJunchao Zhang         size_t      size = sf->roffset[sf->nranks]*sizeof(PetscInt);
87*eb02082bSJunchao Zhang         err = cudaMalloc((void **)&sf->rmine_d,size);CHKERRCUDA(err);
88*eb02082bSJunchao Zhang         err = cudaMemcpy(sf->rmine_d,sf->rmine,size,cudaMemcpyHostToDevice);CHKERRCUDA(err);
89*eb02082bSJunchao Zhang       }
90*eb02082bSJunchao Zhang       *leafloc = sf->rmine_d;
91*eb02082bSJunchao Zhang     }
92*eb02082bSJunchao Zhang #endif
93*eb02082bSJunchao Zhang     else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Wrong PetscMemType in getting leafloc %d",(int)mtype);
94*eb02082bSJunchao Zhang   }
95b23bfdefSJunchao Zhang   PetscFunctionReturn(0);
96b23bfdefSJunchao Zhang }
97b23bfdefSJunchao Zhang 
98*eb02082bSJunchao Zhang /* A convenience struct to provide info to the following (un)packing routines so that we can treat packings to self and to remote in one loop. */
99*eb02082bSJunchao Zhang typedef struct _n_PackInfo {
100b23bfdefSJunchao Zhang   PetscInt       count;  /* Number of entries to pack, unpack etc. */
101*eb02082bSJunchao Zhang   const PetscInt *idx;   /* Indices of the entries. NULL for contiguous indices [0,count-1) */
102b23bfdefSJunchao Zhang   PetscSFPackOpt opt;    /* Pack optimizations */
103b23bfdefSJunchao Zhang   char           *buf;   /* The contiguous buffer where we pack to or unpack from */
104*eb02082bSJunchao Zhang   PetscBool      atomic; /* Whether the unpack routine needs to use atomics */
105b23bfdefSJunchao Zhang } PackInfo;
106b23bfdefSJunchao Zhang 
107*eb02082bSJunchao Zhang /* Utility routine to pack selected entries of rootdata into root buffer.
108*eb02082bSJunchao Zhang   Input Arguments:
109*eb02082bSJunchao Zhang   + sf       - The SF this packing works on.
110*eb02082bSJunchao Zhang   . link     - The PetscSFPack, which gives the memtype of the roots and also provides root buffer.
111*eb02082bSJunchao Zhang   . rootloc  - Indices of the roots, only meaningful if the root space is sparse
112*eb02082bSJunchao Zhang   . rootdata - Where to read the roots.
113*eb02082bSJunchao Zhang   - sparse   - Is the root space sparse (for SFBasic, SFNeighbor)  or dense (for SFAllgatherv etc)
114*eb02082bSJunchao Zhang  */
115*eb02082bSJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFPackRootData(PetscSF sf,PetscSFPack link,const PetscInt *rootloc,const void *rootdata,PetscBool sparse)
116b23bfdefSJunchao Zhang {
117b23bfdefSJunchao Zhang   PetscErrorCode ierr;
118b23bfdefSJunchao Zhang   PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
119b23bfdefSJunchao Zhang   PetscInt       i;
120*eb02082bSJunchao Zhang   PetscErrorCode (*Pack)(PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,const void*,void*);
121*eb02082bSJunchao Zhang   PackInfo       p[2];
122b23bfdefSJunchao Zhang 
123b23bfdefSJunchao Zhang   PetscFunctionBegin;
124*eb02082bSJunchao Zhang   if (sparse) {
125*eb02082bSJunchao Zhang     /* For SFBasic and SFNeighbor, whose root space is sparse and have separate buffers for self and remote. */
126*eb02082bSJunchao Zhang     p[0].count  = bas->ioffset[bas->ndiranks];    p[1].count  = bas->ioffset[bas->niranks]-bas->ioffset[bas->ndiranks];
127*eb02082bSJunchao Zhang     p[0].idx    = rootloc;                        p[1].idx    = rootloc+bas->ioffset[bas->ndiranks];
128*eb02082bSJunchao Zhang     p[0].opt    = bas->selfrootpackopt;           p[1].opt    = bas->rootpackopt;
129*eb02082bSJunchao Zhang     p[0].buf    = link->selfbuf[link->rootmtype]; p[1].buf    = link->rootbuf[link->rootmtype];
130*eb02082bSJunchao Zhang     p[0].atomic = PETSC_FALSE;                    p[1].atomic = PETSC_FALSE;
131*eb02082bSJunchao Zhang   } else {
132*eb02082bSJunchao Zhang     /* For SFAllgatherv etc collectives, which have a dense root space and do not differentiate self/remote communication. */
133*eb02082bSJunchao Zhang     p[0].count  = sf->nroots;
134*eb02082bSJunchao Zhang     p[0].idx    = NULL;
135*eb02082bSJunchao Zhang     p[0].opt    = NULL;
136*eb02082bSJunchao Zhang     p[0].buf    = link->rootbuf[link->rootmtype];
137*eb02082bSJunchao Zhang     p[0].atomic = PETSC_FALSE;
138*eb02082bSJunchao Zhang     p[1].count  = 0;
139*eb02082bSJunchao Zhang   }
140*eb02082bSJunchao Zhang 
141*eb02082bSJunchao Zhang   ierr = PetscSFPackGetPack(link,link->rootmtype,&Pack);CHKERRQ(ierr);
142b23bfdefSJunchao Zhang   /* Only do packing when count != 0 so that we can avoid invoking CUDA kernels on GPU. */
143*eb02082bSJunchao Zhang   for (i=0; i<2; i++) {if (p[i].count) {ierr = (*Pack)(p[i].count,p[i].idx,link,p[i].opt,rootdata,p[i].buf);CHKERRQ(ierr);}}
144*eb02082bSJunchao Zhang #if defined(PETSC_HAVE_CUDA)
145*eb02082bSJunchao Zhang   /* Let's assume MPI does not do cudaDeviceSynchronize() at entrance, which I do not know. Then we have to sync the packing kernel to make rootbuf & selfbuf ready for MPI & self to self copy */
146*eb02082bSJunchao Zhang   if (link->rootmtype == PETSC_MEMTYPE_DEVICE && (p[0].count || p[1].count)) {cudaError_t err = cudaStreamSynchronize(link->stream);CHKERRCUDA(err);}
147*eb02082bSJunchao Zhang #endif
148b23bfdefSJunchao Zhang   PetscFunctionReturn(0);
149b23bfdefSJunchao Zhang }
150b23bfdefSJunchao Zhang 
151b23bfdefSJunchao Zhang /* Utility routine to pack selected entries of leafdata into leaf buffer */
152*eb02082bSJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFPackLeafData(PetscSF sf,PetscSFPack link,const PetscInt *leafloc,const void *leafdata,PetscBool sparse)
153b23bfdefSJunchao Zhang {
154b23bfdefSJunchao Zhang   PetscErrorCode ierr;
155b23bfdefSJunchao Zhang   PetscInt       i;
156*eb02082bSJunchao Zhang   PetscErrorCode (*Pack)(PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,const void*,void*);
157*eb02082bSJunchao Zhang   PackInfo       p[2];
158b23bfdefSJunchao Zhang 
159b23bfdefSJunchao Zhang   PetscFunctionBegin;
160*eb02082bSJunchao Zhang   if (sparse) {
161*eb02082bSJunchao Zhang     p[0].count  = sf->roffset[sf->ndranks];       p[1].count  = sf->roffset[sf->nranks]-sf->roffset[sf->ndranks];
162*eb02082bSJunchao Zhang     p[0].idx    = leafloc;                        p[1].idx    = leafloc+sf->roffset[sf->ndranks];
163*eb02082bSJunchao Zhang     p[0].opt    = sf->selfleafpackopt;            p[1].opt    = sf->leafpackopt;
164*eb02082bSJunchao Zhang     p[0].buf    = link->selfbuf[link->leafmtype]; p[1].buf    = link->leafbuf[link->leafmtype];
165*eb02082bSJunchao Zhang     p[0].atomic = PETSC_FALSE;                    p[1].atomic = PETSC_FALSE;
166*eb02082bSJunchao Zhang   } else {
167*eb02082bSJunchao Zhang     p[0].count  = sf->nleaves;
168*eb02082bSJunchao Zhang     p[0].idx    = NULL;
169*eb02082bSJunchao Zhang     p[0].opt    = NULL;
170*eb02082bSJunchao Zhang     p[0].buf    = link->leafbuf[link->leafmtype];
171*eb02082bSJunchao Zhang     p[0].atomic = PETSC_FALSE;
172*eb02082bSJunchao Zhang     p[1].count  = 0;
173*eb02082bSJunchao Zhang   }
174*eb02082bSJunchao Zhang 
175*eb02082bSJunchao Zhang   ierr = PetscSFPackGetPack(link,link->leafmtype,&Pack);CHKERRQ(ierr);
176*eb02082bSJunchao Zhang   for (i=0; i<2; i++) {if (p[i].count) {ierr = (*Pack)(p[i].count,p[i].idx,link,p[i].opt,leafdata,p[i].buf);CHKERRQ(ierr);}}
177*eb02082bSJunchao Zhang #if defined(PETSC_HAVE_CUDA)
178*eb02082bSJunchao Zhang   if (link->leafmtype == PETSC_MEMTYPE_DEVICE && (p[0].count || p[1].count)) {cudaError_t err = cudaStreamSynchronize(link->stream);CHKERRCUDA(err);}
179*eb02082bSJunchao Zhang #endif
180b23bfdefSJunchao Zhang   PetscFunctionReturn(0);
181b23bfdefSJunchao Zhang }
182b23bfdefSJunchao Zhang 
183b23bfdefSJunchao Zhang /* Utility routine to unpack data from root buffer and Op it into selected entries of rootdata */
184*eb02082bSJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFUnpackAndOpRootData(PetscSF sf,PetscSFPack link,const PetscInt *rootloc,void *rootdata,MPI_Op op,PetscBool sparse)
185b23bfdefSJunchao Zhang {
186b23bfdefSJunchao Zhang   PetscErrorCode ierr;
187b23bfdefSJunchao Zhang   PetscInt       i;
188b23bfdefSJunchao Zhang   PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
189*eb02082bSJunchao Zhang   PetscErrorCode (*UnpackAndOp)(PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,void*,const void*);
190*eb02082bSJunchao Zhang   PackInfo       p[2];
191b23bfdefSJunchao Zhang 
192b23bfdefSJunchao Zhang   PetscFunctionBegin;
193*eb02082bSJunchao Zhang   if (sparse) {
194*eb02082bSJunchao Zhang     p[0].count  = bas->ioffset[bas->ndiranks];    p[1].count  = bas->ioffset[bas->niranks]-bas->ioffset[bas->ndiranks];
195*eb02082bSJunchao Zhang     p[0].idx    = rootloc;                        p[1].idx    = rootloc+bas->ioffset[bas->ndiranks];
196*eb02082bSJunchao Zhang     p[0].opt    = bas->selfrootpackopt;           p[1].opt    = bas->rootpackopt;
197*eb02082bSJunchao Zhang     p[0].buf    = link->selfbuf[link->rootmtype]; p[1].buf    = link->rootbuf[link->rootmtype];
198*eb02082bSJunchao Zhang     p[0].atomic = bas->selfrootdups;              p[1].atomic = bas->remoterootdups;
199*eb02082bSJunchao Zhang   } else {
200*eb02082bSJunchao Zhang     p[0].count  = sf->nroots;
201*eb02082bSJunchao Zhang     p[0].idx    = NULL;
202*eb02082bSJunchao Zhang     p[0].opt    = NULL;
203*eb02082bSJunchao Zhang     p[0].buf    = link->rootbuf[link->rootmtype];
204*eb02082bSJunchao Zhang     p[0].atomic = PETSC_FALSE;
205*eb02082bSJunchao Zhang     p[1].count  = 0;
206*eb02082bSJunchao Zhang   }
207*eb02082bSJunchao Zhang 
208*eb02082bSJunchao Zhang   for (i=0; i<2; i++) {
209*eb02082bSJunchao Zhang     if (p[i].count) {
210*eb02082bSJunchao Zhang       ierr = PetscSFPackGetUnpackAndOp(link,link->rootmtype,op,p[i].atomic,&UnpackAndOp);CHKERRQ(ierr);
211*eb02082bSJunchao Zhang       if (UnpackAndOp) {ierr = (*UnpackAndOp)(p[i].count,p[i].idx,link,p[i].opt,rootdata,p[i].buf);CHKERRQ(ierr);}
212*eb02082bSJunchao Zhang #if defined(PETSC_HAVE_MPI_REDUCE_LOCAL)
213*eb02082bSJunchao Zhang       else {
214*eb02082bSJunchao Zhang         PetscInt    j;
215*eb02082bSJunchao Zhang         PetscMPIInt n;
216*eb02082bSJunchao Zhang         if (p[i].idx) {
217*eb02082bSJunchao Zhang           /* Note if done on GPU, this can be very slow due to the huge number of kernel calls. The op is likely user-defined. We must
218*eb02082bSJunchao Zhang              use link->unit (instead of link->basicunit) as the datatype and 1 (instead of link->bs) as the count in MPI_Reduce_local.
219*eb02082bSJunchao Zhang            */
220*eb02082bSJunchao Zhang           for (j=0; j<p[i].count; j++) {ierr = MPI_Reduce_local(p[i].buf+j*link->unitbytes,(char *)rootdata+p[i].idx[j]*link->unitbytes,1,link->unit,op);CHKERRQ(ierr);}
221*eb02082bSJunchao Zhang         } else {
222*eb02082bSJunchao Zhang           ierr = PetscMPIIntCast(p[i].count,&n);CHKERRQ(ierr);
223*eb02082bSJunchao Zhang           ierr = MPI_Reduce_local(p[i].buf,rootdata,n,link->unit,op);CHKERRQ(ierr);
224*eb02082bSJunchao Zhang         }
225*eb02082bSJunchao Zhang       }
226*eb02082bSJunchao Zhang #else
227*eb02082bSJunchao Zhang     else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No unpacking reduction operation for this MPI_Op");
228*eb02082bSJunchao Zhang #endif
229*eb02082bSJunchao Zhang     }
230*eb02082bSJunchao Zhang   }
231*eb02082bSJunchao Zhang #if defined(PETSC_HAVE_CUDA)
232*eb02082bSJunchao Zhang   if (link->rootmtype == PETSC_MEMTYPE_DEVICE && (p[0].count || p[1].count)) {cudaError_t err = cudaStreamSynchronize(link->stream);CHKERRCUDA(err);}
233*eb02082bSJunchao Zhang #endif
234b23bfdefSJunchao Zhang   PetscFunctionReturn(0);
235b23bfdefSJunchao Zhang }
236b23bfdefSJunchao Zhang 
237*eb02082bSJunchao Zhang /* Utility routine to unpack data from leaf buffer and Op it into selected entries of leafdata */
238*eb02082bSJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFUnpackAndOpLeafData(PetscSF sf,PetscSFPack link,const PetscInt *leafloc,void *leafdata,MPI_Op op,PetscBool sparse)
239*eb02082bSJunchao Zhang {
240*eb02082bSJunchao Zhang   PetscErrorCode ierr;
241*eb02082bSJunchao Zhang   PetscInt       i;
242*eb02082bSJunchao Zhang   PetscErrorCode (*UnpackAndOp)(PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,void*,const void*);
243*eb02082bSJunchao Zhang   PackInfo       p[2];
244b23bfdefSJunchao Zhang 
245*eb02082bSJunchao Zhang   PetscFunctionBegin;
246*eb02082bSJunchao Zhang   if (sparse) {
247*eb02082bSJunchao Zhang     p[0].count  = sf->roffset[sf->ndranks];       p[1].count  = sf->roffset[sf->nranks]-sf->roffset[sf->ndranks];
248*eb02082bSJunchao Zhang     p[0].idx    = leafloc;                        p[1].idx    = leafloc+sf->roffset[sf->ndranks];
249*eb02082bSJunchao Zhang     p[0].opt    = sf->selfleafpackopt;            p[1].opt    = sf->leafpackopt;
250*eb02082bSJunchao Zhang     p[0].buf    = link->selfbuf[link->leafmtype]; p[1].buf    = link->leafbuf[link->leafmtype];
251*eb02082bSJunchao Zhang     p[0].atomic = sf->selfleafdups;               p[1].atomic = sf->remoteleafdups;
252*eb02082bSJunchao Zhang   } else {
253*eb02082bSJunchao Zhang     p[0].count  = sf->nleaves;
254*eb02082bSJunchao Zhang     p[0].idx    = NULL;
255*eb02082bSJunchao Zhang     p[0].opt    = NULL;
256*eb02082bSJunchao Zhang     p[0].buf    = link->leafbuf[link->leafmtype];
257*eb02082bSJunchao Zhang     p[0].atomic = PETSC_FALSE;
258*eb02082bSJunchao Zhang     p[1].count  = 0;
259*eb02082bSJunchao Zhang   }
260*eb02082bSJunchao Zhang 
261*eb02082bSJunchao Zhang   for (i=0; i<2; i++) {
262*eb02082bSJunchao Zhang     if (p[i].count) {
263*eb02082bSJunchao Zhang       ierr = PetscSFPackGetUnpackAndOp(link,link->leafmtype,op,p[i].atomic,&UnpackAndOp);CHKERRQ(ierr);
264*eb02082bSJunchao Zhang       if (UnpackAndOp) {ierr = (*UnpackAndOp)(p[i].count,p[i].idx,link,p[i].opt,leafdata,p[i].buf);CHKERRQ(ierr);}
265*eb02082bSJunchao Zhang #if defined(PETSC_HAVE_MPI_REDUCE_LOCAL)
266*eb02082bSJunchao Zhang       else {
267*eb02082bSJunchao Zhang         PetscInt    j;
268*eb02082bSJunchao Zhang         PetscMPIInt n;
269*eb02082bSJunchao Zhang         if (p[i].idx) {
270*eb02082bSJunchao Zhang           for (j=0; j<p[i].count; j++) {ierr = MPI_Reduce_local(p[i].buf+j*link->unitbytes,(char *)leafdata+p[i].idx[j]*link->unitbytes,1,link->unit,op);CHKERRQ(ierr);}
271*eb02082bSJunchao Zhang         } else {
272*eb02082bSJunchao Zhang           ierr = PetscMPIIntCast(p[i].count,&n);CHKERRQ(ierr);
273*eb02082bSJunchao Zhang           ierr = MPI_Reduce_local(p[i].buf,leafdata,n,link->unit,op);CHKERRQ(ierr);
274*eb02082bSJunchao Zhang         }
275*eb02082bSJunchao Zhang       }
276*eb02082bSJunchao Zhang #else
277*eb02082bSJunchao Zhang     else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No unpacking reduction operation for this MPI_Op");
278*eb02082bSJunchao Zhang #endif
279*eb02082bSJunchao Zhang     }
280*eb02082bSJunchao Zhang   }
281*eb02082bSJunchao Zhang #if defined(PETSC_HAVE_CUDA)
282*eb02082bSJunchao Zhang   if (link->leafmtype == PETSC_MEMTYPE_DEVICE && (p[0].count || p[1].count)) {cudaError_t err = cudaStreamSynchronize(link->stream);CHKERRCUDA(err);}
283*eb02082bSJunchao Zhang #endif
284*eb02082bSJunchao Zhang   PetscFunctionReturn(0);
285*eb02082bSJunchao Zhang }
286*eb02082bSJunchao Zhang 
287*eb02082bSJunchao Zhang /* Utility routine to fetch and Op selected entries of rootdata */
288*eb02082bSJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFFetchAndOpRootData(PetscSF sf,PetscSFPack link,const PetscInt *rootloc,void *rootdata,MPI_Op op,PetscBool sparse)
289*eb02082bSJunchao Zhang {
290*eb02082bSJunchao Zhang   PetscErrorCode ierr;
291*eb02082bSJunchao Zhang   PetscInt       i;
292*eb02082bSJunchao Zhang   PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
293*eb02082bSJunchao Zhang   PetscErrorCode (*FetchAndOp)(PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,void*,void*);
294*eb02082bSJunchao Zhang   PackInfo       p[2];
295*eb02082bSJunchao Zhang 
296*eb02082bSJunchao Zhang   PetscFunctionBegin;
297*eb02082bSJunchao Zhang   if (sparse) {
298*eb02082bSJunchao Zhang     /* For SFBasic and SFNeighbor, whose root space is sparse and have separate buffers for self and remote. */
299*eb02082bSJunchao Zhang     p[0].count  = bas->ioffset[bas->ndiranks];    p[1].count  = bas->ioffset[bas->niranks]-bas->ioffset[bas->ndiranks];
300*eb02082bSJunchao Zhang     p[0].idx    = rootloc;                        p[1].idx    = rootloc+bas->ioffset[bas->ndiranks];
301*eb02082bSJunchao Zhang     p[0].opt    = bas->selfrootpackopt;           p[1].opt    = bas->rootpackopt;
302*eb02082bSJunchao Zhang     p[0].buf    = link->selfbuf[link->rootmtype]; p[1].buf    = link->rootbuf[link->rootmtype];
303*eb02082bSJunchao Zhang     p[0].atomic = bas->selfrootdups;              p[1].atomic = bas->remoterootdups;
304*eb02082bSJunchao Zhang   } else {
305*eb02082bSJunchao Zhang     p[0].count  = sf->nroots;
306*eb02082bSJunchao Zhang     p[0].idx    = NULL;
307*eb02082bSJunchao Zhang     p[0].opt    = NULL;
308*eb02082bSJunchao Zhang     p[0].buf    = link->rootbuf[link->rootmtype];
309*eb02082bSJunchao Zhang     p[0].atomic = PETSC_FALSE;
310*eb02082bSJunchao Zhang     p[1].count  = 0;
311*eb02082bSJunchao Zhang   }
312*eb02082bSJunchao Zhang 
313*eb02082bSJunchao Zhang   for (i=0; i<2; i++) {
314*eb02082bSJunchao Zhang     if (p[i].count) {
315*eb02082bSJunchao Zhang       ierr = PetscSFPackGetFetchAndOp(link,link->rootmtype,op,p[i].atomic,&FetchAndOp);CHKERRQ(ierr);
316*eb02082bSJunchao Zhang       ierr = (*FetchAndOp)(p[i].count,p[i].idx,link,p[i].opt,rootdata,p[i].buf);CHKERRQ(ierr);
317*eb02082bSJunchao Zhang     }
318*eb02082bSJunchao Zhang   }
319*eb02082bSJunchao Zhang #if defined(PETSC_HAVE_CUDA)
320*eb02082bSJunchao Zhang   if (link->rootmtype == PETSC_MEMTYPE_DEVICE && (p[0].count || p[1].count)) {cudaError_t err = cudaStreamSynchronize(link->stream);CHKERRCUDA(err);}
321*eb02082bSJunchao Zhang #endif
322*eb02082bSJunchao Zhang   PetscFunctionReturn(0);
323*eb02082bSJunchao Zhang }
324*eb02082bSJunchao Zhang 
325*eb02082bSJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFPackSetupOptimizations_Basic(PetscSF sf)
326b23bfdefSJunchao Zhang {
327b23bfdefSJunchao Zhang   PetscErrorCode ierr;
328b23bfdefSJunchao Zhang   PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
329b23bfdefSJunchao Zhang 
330b23bfdefSJunchao Zhang   PetscFunctionBegin;
331*eb02082bSJunchao Zhang   ierr = PetscSFPackOptCreate(sf->ndranks,               sf->roffset,               sf->rmine,    &sf->selfleafpackopt);CHKERRQ(ierr);
332*eb02082bSJunchao Zhang   ierr = PetscSFPackOptCreate(sf->nranks-sf->ndranks,    sf->roffset+sf->ndranks,   sf->rmine,    &sf->leafpackopt);CHKERRQ(ierr);
333*eb02082bSJunchao Zhang   ierr = PetscSFPackOptCreate(bas->ndiranks,             bas->ioffset,              bas->irootloc,&bas->selfrootpackopt);CHKERRQ(ierr);
334*eb02082bSJunchao Zhang   ierr = PetscSFPackOptCreate(bas->niranks-bas->ndiranks,bas->ioffset+bas->ndiranks,bas->irootloc,&bas->rootpackopt);CHKERRQ(ierr);
335*eb02082bSJunchao Zhang 
336*eb02082bSJunchao Zhang #if defined(PETSC_HAVE_CUDA)
337*eb02082bSJunchao Zhang   /* Check duplicates in irootloc[] so CUDA packing kernels can use cheaper regular operations
338*eb02082bSJunchao Zhang      instead of atomics to unpack data on leaves/roots, when they know there is not data race.
339*eb02082bSJunchao Zhang    */
340*eb02082bSJunchao Zhang   ierr = PetscCheckDupsInt(sf->roffset[sf->ndranks],                              sf->rmine,                                &sf->selfleafdups);CHKERRQ(ierr);
341*eb02082bSJunchao Zhang   ierr = PetscCheckDupsInt(sf->roffset[sf->nranks]-sf->roffset[sf->ndranks],      sf->rmine+sf->roffset[sf->ndranks],       &sf->remoteleafdups);CHKERRQ(ierr);
342*eb02082bSJunchao Zhang   ierr = PetscCheckDupsInt(bas->ioffset[bas->ndiranks],                           bas->irootloc,                            &bas->selfrootdups);CHKERRQ(ierr);
343*eb02082bSJunchao Zhang   ierr = PetscCheckDupsInt(bas->ioffset[bas->niranks]-bas->ioffset[bas->ndiranks],bas->irootloc+bas->ioffset[bas->ndiranks],&bas->remoterootdups);CHKERRQ(ierr);
344*eb02082bSJunchao Zhang #endif
345b23bfdefSJunchao Zhang   PetscFunctionReturn(0);
346b23bfdefSJunchao Zhang }
347b23bfdefSJunchao Zhang 
348*eb02082bSJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFPackDestroyOptimizations_Basic(PetscSF sf)
349b23bfdefSJunchao Zhang {
350b23bfdefSJunchao Zhang   PetscErrorCode ierr;
351b23bfdefSJunchao Zhang   PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
352b23bfdefSJunchao Zhang 
353b23bfdefSJunchao Zhang   PetscFunctionBegin;
354*eb02082bSJunchao Zhang   ierr = PetscSFPackOptDestory(&sf->leafpackopt);CHKERRQ(ierr);
355*eb02082bSJunchao Zhang   ierr = PetscSFPackOptDestory(&sf->selfleafpackopt);CHKERRQ(ierr);
356*eb02082bSJunchao Zhang   ierr = PetscSFPackOptDestory(&bas->rootpackopt);CHKERRQ(ierr);
357*eb02082bSJunchao Zhang   ierr = PetscSFPackOptDestory(&bas->selfrootpackopt);CHKERRQ(ierr);
358*eb02082bSJunchao Zhang #if defined(PETSC_HAVE_CUDA)
359*eb02082bSJunchao Zhang   sf->selfleafdups    = PETSC_TRUE;
360*eb02082bSJunchao Zhang   sf->remoteleafdups  = PETSC_TRUE;
361*eb02082bSJunchao Zhang   bas->selfrootdups   = PETSC_TRUE; /* The default is assuming there are dups so that atomics are used. */
362*eb02082bSJunchao Zhang   bas->remoterootdups = PETSC_TRUE;
363*eb02082bSJunchao Zhang #endif
364b23bfdefSJunchao Zhang   PetscFunctionReturn(0);
365b23bfdefSJunchao Zhang }
366b23bfdefSJunchao Zhang 
367*eb02082bSJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFPackWaitall_Basic(PetscSFPack link,PetscSFDirection direction)
36840e23c03SJunchao Zhang {
36940e23c03SJunchao Zhang   PetscErrorCode ierr;
37040e23c03SJunchao Zhang 
37140e23c03SJunchao Zhang   PetscFunctionBegin;
372*eb02082bSJunchao Zhang   ierr = MPI_Waitall(link->nrootreqs,link->rootreqs[direction][link->rootmtype],MPI_STATUSES_IGNORE);CHKERRQ(ierr);
373*eb02082bSJunchao Zhang   ierr = MPI_Waitall(link->nleafreqs,link->leafreqs[direction][link->leafmtype],MPI_STATUSES_IGNORE);CHKERRQ(ierr);
37440e23c03SJunchao Zhang   PetscFunctionReturn(0);
37540e23c03SJunchao Zhang }
37640e23c03SJunchao Zhang 
37740e23c03SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFSetUp_Basic(PetscSF);
37840e23c03SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFView_Basic(PetscSF,PetscViewer);
37940e23c03SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFReset_Basic(PetscSF);
38040e23c03SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFDestroy_Basic(PetscSF);
381*eb02082bSJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFBcastAndOpEnd_Basic  (PetscSF,MPI_Datatype,PetscMemType,const void*,PetscMemType,      void*,      MPI_Op);
382*eb02082bSJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFReduceEnd_Basic      (PetscSF,MPI_Datatype,PetscMemType,const void*,PetscMemType,      void*,      MPI_Op);
383*eb02082bSJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFFetchAndOpBegin_Basic(PetscSF,MPI_Datatype,PetscMemType,      void*,PetscMemType,const void*,void*,MPI_Op);
384f659e5c7SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFCreateEmbeddedSF_Basic(PetscSF,PetscInt,const PetscInt*,PetscSF*);
385f659e5c7SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFCreateEmbeddedLeafSF_Basic(PetscSF,PetscInt,const PetscInt*,PetscSF*);
38640e23c03SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFGetLeafRanks_Basic(PetscSF,PetscInt*,const PetscMPIInt**,const PetscInt**,const PetscInt**);
387*eb02082bSJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFPackGet_Basic_Common(PetscSF,MPI_Datatype,PetscMemType,const void*,PetscMemType,const void*,PetscInt,PetscInt,PetscSFPack*);
38840e23c03SJunchao Zhang #endif
389