140e23c03SJunchao Zhang 240e23c03SJunchao Zhang #include <../src/vec/is/sf/impls/basic/sfpack.h> 340e23c03SJunchao Zhang #include <../src/vec/is/sf/impls/basic/sfbasic.h> 440e23c03SJunchao Zhang 5eb02082bSJunchao Zhang #if defined(PETSC_HAVE_CUDA) 6eb02082bSJunchao Zhang #include <cuda_runtime.h> 7eb02082bSJunchao Zhang #endif 840e23c03SJunchao Zhang /* 940e23c03SJunchao Zhang * MPI_Reduce_local is not really useful because it can't handle sparse data and it vectorizes "in the wrong direction", 1040e23c03SJunchao Zhang * therefore we pack data types manually. This file defines packing routines for the standard data types. 1140e23c03SJunchao Zhang */ 1240e23c03SJunchao Zhang 13cd620004SJunchao Zhang #define CPPJoin4(a,b,c,d) a##_##b##_##c##_##d 1440e23c03SJunchao Zhang 15cd620004SJunchao Zhang /* Operations working like s += t */ 16cd620004SJunchao Zhang #define OP_BINARY(op,s,t) do {(s) = (s) op (t); } while(0) /* binary ops in the middle such as +, *, && etc. */ 17cd620004SJunchao Zhang #define OP_FUNCTION(op,s,t) do {(s) = op((s),(t)); } while(0) /* ops like a function, such as PetscMax, PetscMin */ 18cd620004SJunchao Zhang #define OP_LXOR(op,s,t) do {(s) = (!(s)) != (!(t));} while(0) /* logical exclusive OR */ 19cd620004SJunchao Zhang #define OP_ASSIGN(op,s,t) do {(s) = (t);} while(0) 20cd620004SJunchao Zhang /* Ref MPI MAXLOC */ 21cd620004SJunchao Zhang #define OP_XLOC(op,s,t) \ 22cd620004SJunchao Zhang do { \ 23cd620004SJunchao Zhang if ((s).u == (t).u) (s).i = PetscMin((s).i,(t).i); \ 24cd620004SJunchao Zhang else if (!((s).u op (t).u)) s = t; \ 25cd620004SJunchao Zhang } while(0) 2640e23c03SJunchao Zhang 2740e23c03SJunchao Zhang /* DEF_PackFunc - macro defining a Pack routine 2840e23c03SJunchao Zhang 2940e23c03SJunchao Zhang Arguments of the macro: 30b23bfdefSJunchao Zhang +Type Type of the basic data in an entry, i.e., int, PetscInt, PetscReal etc. It is not the type of an entry. 31b23bfdefSJunchao Zhang .BS Block size for vectorization. It is a factor of bs. 32b23bfdefSJunchao Zhang -EQ (bs == BS) ? 1 : 0. EQ is a compile-time const to help compiler optimizations. See below. 3340e23c03SJunchao Zhang 3440e23c03SJunchao Zhang Arguments of the Pack routine: 35cd620004SJunchao Zhang +count Number of indices in idx[]. 36cd620004SJunchao Zhang .start If indices are contiguous, it is the first index; otherwise, not used. 37cd620004SJunchao Zhang .idx Indices of entries to packed. NULL means contiguous indices, that is [start,start+count). 38eb02082bSJunchao Zhang .link Provide a context for the current call, such as link->bs, number of basic types in an entry. Ex. if unit is MPI_2INT, then bs=2 and the basic type is int. 39cd620004SJunchao Zhang .opt Per-remote pack optimization plan. NULL means no such plan. 40cd620004SJunchao Zhang .unpacked Address of the unpacked data. The entries will be packed are unpacked[idx[i]],for i in [0,count). 41cd620004SJunchao Zhang -packed Address of the packed data. 4240e23c03SJunchao Zhang */ 43b23bfdefSJunchao Zhang #define DEF_PackFunc(Type,BS,EQ) \ 44cd620004SJunchao Zhang static PetscErrorCode CPPJoin4(Pack,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,const PetscInt *idx,PetscSFPackOpt opt,const void *unpacked,void *packed) \ 45b23bfdefSJunchao Zhang { \ 4640e23c03SJunchao Zhang PetscErrorCode ierr; \ 47b23bfdefSJunchao Zhang const Type *u = (const Type*)unpacked,*u2; \ 48b23bfdefSJunchao Zhang Type *p = (Type*)packed,*p2; \ 49eb02082bSJunchao Zhang PetscInt i,j,k,l,r,step,bs=link->bs; \ 50b23bfdefSJunchao Zhang const PetscInt *idx2,M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */ \ 51b23bfdefSJunchao Zhang const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */ \ 5240e23c03SJunchao Zhang PetscFunctionBegin; \ 53cd620004SJunchao Zhang if (!idx) {ierr = PetscArraycpy(p,u+start*bs,MBS*count);CHKERRQ(ierr);} /* Indices are contiguous */ \ 54b23bfdefSJunchao Zhang else if (!opt) { /* No optimizations available */ \ 55b23bfdefSJunchao Zhang for (i=0; i<count; i++) \ 56eb02082bSJunchao Zhang for (j=0; j<M; j++) /* Decent compilers should eliminate this loop when M = const 1 */ \ 57eb02082bSJunchao Zhang for (k=0; k<BS; k++) /* Compiler either unrolls (BS=1) or vectorizes (BS=2,4,8,etc) this loop */ \ 58b23bfdefSJunchao Zhang p[i*MBS+j*BS+k] = u[idx[i]*MBS+j*BS+k]; \ 59b23bfdefSJunchao Zhang } else { \ 60b23bfdefSJunchao Zhang for (r=0; r<opt->n; r++) { \ 61b23bfdefSJunchao Zhang p2 = p + opt->offset[r]*MBS; \ 62b23bfdefSJunchao Zhang if (opt->type[r] == PETSCSF_PACKOPT_NONE) { \ 63b23bfdefSJunchao Zhang idx2 = idx + opt->offset[r]; \ 64b23bfdefSJunchao Zhang for (i=0; i<opt->offset[r+1]-opt->offset[r]; i++) \ 65b23bfdefSJunchao Zhang for (j=0; j<M; j++) \ 66b23bfdefSJunchao Zhang for (k=0; k<BS; k++) \ 67b23bfdefSJunchao Zhang p2[i*MBS+j*BS+k] = u[idx2[i]*MBS+j*BS+k]; \ 68b23bfdefSJunchao Zhang } else if (opt->type[r] == PETSCSF_PACKOPT_MULTICOPY) { \ 6940e23c03SJunchao Zhang for (i=opt->copy_offset[r]; i<opt->copy_offset[r+1]; i++) { \ 70b23bfdefSJunchao Zhang u2 = u + idx[opt->copy_start[i]]*MBS; \ 71b23bfdefSJunchao Zhang l = opt->copy_length[i]*MBS; /* length in basic type such as MPI_INT */ \ 72b23bfdefSJunchao Zhang ierr = PetscArraycpy(p2,u2,l);CHKERRQ(ierr); \ 73b23bfdefSJunchao Zhang p2 += l; \ 7440e23c03SJunchao Zhang } \ 75b23bfdefSJunchao Zhang } else if (opt->type[r] == PETSCSF_PACKOPT_STRIDE) { \ 76b23bfdefSJunchao Zhang u2 = u + idx[opt->offset[r]]*MBS; \ 7740e23c03SJunchao Zhang step = opt->stride_step[r]; \ 7840e23c03SJunchao Zhang for (i=0; i<opt->stride_n[r]; i++) \ 79b23bfdefSJunchao Zhang for (j=0; j<MBS; j++) p2[i*MBS+j] = u2[i*step*MBS+j]; \ 80b23bfdefSJunchao Zhang } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Unknown SFPack optimzation type %D",opt->type[r]); \ 8140e23c03SJunchao Zhang } \ 8240e23c03SJunchao Zhang } \ 8340e23c03SJunchao Zhang PetscFunctionReturn(0); \ 8440e23c03SJunchao Zhang } 8540e23c03SJunchao Zhang 86cd620004SJunchao Zhang /* DEF_Action - macro defining a UnpackAndInsert routine that unpacks data from a contiguous buffer 87cd620004SJunchao Zhang and inserts into a sparse array. 8840e23c03SJunchao Zhang 8940e23c03SJunchao Zhang Arguments: 90b23bfdefSJunchao Zhang .Type Type of the data 9140e23c03SJunchao Zhang .BS Block size for vectorization 92b23bfdefSJunchao Zhang .EQ (bs == BS) ? 1 : 0. EQ is a compile-time const. 9340e23c03SJunchao Zhang 9440e23c03SJunchao Zhang Notes: 9540e23c03SJunchao Zhang This macro is not combined with DEF_ActionAndOp because we want to use memcpy in this macro. 9640e23c03SJunchao Zhang */ 97cd620004SJunchao Zhang #define DEF_UnpackFunc(Type,BS,EQ) \ 98cd620004SJunchao Zhang static PetscErrorCode CPPJoin4(UnpackAndInsert,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,const PetscInt *idx,PetscSFPackOpt opt,void *unpacked,const void *packed) \ 99b23bfdefSJunchao Zhang { \ 10040e23c03SJunchao Zhang PetscErrorCode ierr; \ 101b23bfdefSJunchao Zhang Type *u = (Type*)unpacked,*u2; \ 102cd620004SJunchao Zhang const Type *p = (const Type*)packed,*p2; \ 103eb02082bSJunchao Zhang PetscInt i,j,k,l,r,step,bs=link->bs; \ 104b23bfdefSJunchao Zhang const PetscInt *idx2,M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */ \ 105b23bfdefSJunchao Zhang const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */ \ 10640e23c03SJunchao Zhang PetscFunctionBegin; \ 107b23bfdefSJunchao Zhang if (!idx) { \ 108cd620004SJunchao Zhang u2 = u + start*bs; \ 109cd620004SJunchao Zhang if (u2 != p) {ierr = PetscArraycpy(u2,p,count*MBS);CHKERRQ(ierr);} \ 110b23bfdefSJunchao Zhang } else if (!opt) { /* No optimizations available */ \ 111b23bfdefSJunchao Zhang for (i=0; i<count; i++) \ 112b23bfdefSJunchao Zhang for (j=0; j<M; j++) \ 113cd620004SJunchao Zhang for (k=0; k<BS; k++) u[idx[i]*MBS+j*BS+k] = p[i*MBS+j*BS+k]; \ 114b23bfdefSJunchao Zhang } else { \ 115b23bfdefSJunchao Zhang for (r=0; r<opt->n; r++) { \ 116b23bfdefSJunchao Zhang p2 = p + opt->offset[r]*MBS; \ 117b23bfdefSJunchao Zhang if (opt->type[r] == PETSCSF_PACKOPT_NONE) { \ 118b23bfdefSJunchao Zhang idx2 = idx + opt->offset[r]; \ 119b23bfdefSJunchao Zhang for (i=0; i<opt->offset[r+1]-opt->offset[r]; i++) \ 120b23bfdefSJunchao Zhang for (j=0; j<M; j++) \ 121cd620004SJunchao Zhang for (k=0; k<BS; k++) u[idx2[i]*MBS+j*BS+k] = p2[i*MBS+j*BS+k]; \ 122b23bfdefSJunchao Zhang } else if (opt->type[r] == PETSCSF_PACKOPT_MULTICOPY) { \ 12340e23c03SJunchao Zhang for (i=opt->copy_offset[r]; i<opt->copy_offset[r+1]; i++) { /* i-th piece */ \ 124b23bfdefSJunchao Zhang u2 = u + idx[opt->copy_start[i]]*MBS; \ 125b23bfdefSJunchao Zhang l = opt->copy_length[i]*MBS; \ 126b23bfdefSJunchao Zhang ierr = PetscArraycpy(u2,p2,l);CHKERRQ(ierr); \ 127b23bfdefSJunchao Zhang p2 += l; \ 12840e23c03SJunchao Zhang } \ 129b23bfdefSJunchao Zhang } else if (opt->type[r] == PETSCSF_PACKOPT_STRIDE) { \ 130b23bfdefSJunchao Zhang u2 = u + idx[opt->offset[r]]*MBS; \ 13140e23c03SJunchao Zhang step = opt->stride_step[r]; \ 13240e23c03SJunchao Zhang for (i=0; i<opt->stride_n[r]; i++) \ 133b23bfdefSJunchao Zhang for (j=0; j<M; j++) \ 134cd620004SJunchao Zhang for (k=0; k<BS; k++) u2[i*step*MBS+j*BS+k] = p2[i*MBS+j*BS+k]; \ 135b23bfdefSJunchao Zhang } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Unknown SFPack optimzation type %D",opt->type[r]); \ 13640e23c03SJunchao Zhang } \ 13740e23c03SJunchao Zhang } \ 13840e23c03SJunchao Zhang PetscFunctionReturn(0); \ 13940e23c03SJunchao Zhang } 14040e23c03SJunchao Zhang 141cd620004SJunchao Zhang /* DEF_UnpackAndOp - macro defining a UnpackAndOp routine where Op should not be Insert 14240e23c03SJunchao Zhang 14340e23c03SJunchao Zhang Arguments: 144cd620004SJunchao Zhang +Opname Name of the Op, such as Add, Mult, LAND, etc. 145b23bfdefSJunchao Zhang .Type Type of the data 14640e23c03SJunchao Zhang .BS Block size for vectorization 147b23bfdefSJunchao Zhang .EQ (bs == BS) ? 1 : 0. EQ is a compile-time const. 148cd620004SJunchao Zhang .Op Operator for the op, such as +, *, &&, ||, PetscMax, PetscMin, etc. 149cd620004SJunchao Zhang .OpApply Macro defining application of the op. Could be OP_BINARY, OP_FUNCTION, OP_LXOR 15040e23c03SJunchao Zhang */ 151cd620004SJunchao Zhang #define DEF_UnpackAndOp(Type,BS,EQ,Opname,Op,OpApply) \ 152cd620004SJunchao Zhang static PetscErrorCode CPPJoin4(UnpackAnd##Opname,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,const PetscInt *idx,PetscSFPackOpt opt,void *unpacked,const void *packed) \ 153b23bfdefSJunchao Zhang { \ 154cd620004SJunchao Zhang Type *u = (Type*)unpacked,*u2; \ 155cd620004SJunchao Zhang const Type *p = (const Type*)packed,*p2; \ 156eb02082bSJunchao Zhang PetscInt i,j,k,l,r,step,bs=link->bs; \ 157b23bfdefSJunchao Zhang const PetscInt *idx2,M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */ \ 158b23bfdefSJunchao Zhang const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */ \ 15940e23c03SJunchao Zhang PetscFunctionBegin; \ 160b23bfdefSJunchao Zhang if (!idx) { \ 161cd620004SJunchao Zhang u += start*bs; \ 162cd620004SJunchao Zhang for (i=0; i<count; i++) \ 163cd620004SJunchao Zhang for (j=0; j<M; j++) \ 164cd620004SJunchao Zhang for (k=0; k<BS; k++) \ 165cd620004SJunchao Zhang OpApply(Op,u[i*MBS+j*BS+k],p[i*MBS+j*BS+k]); \ 166cd620004SJunchao Zhang } else if (!opt) { /* No optimizations available */ \ 167cd620004SJunchao Zhang for (i=0; i<count; i++) \ 168cd620004SJunchao Zhang for (j=0; j<M; j++) \ 169cd620004SJunchao Zhang for (k=0; k<BS; k++) \ 170cd620004SJunchao Zhang OpApply(Op,u[idx[i]*MBS+j*BS+k],p[i*MBS+j*BS+k]); \ 171cd620004SJunchao Zhang } else { \ 172cd620004SJunchao Zhang for (r=0; r<opt->n; r++) { \ 173cd620004SJunchao Zhang p2 = p + opt->offset[r]*MBS; \ 174cd620004SJunchao Zhang if (opt->type[r] == PETSCSF_PACKOPT_NONE) { \ 175cd620004SJunchao Zhang idx2 = idx + opt->offset[r]; \ 176cd620004SJunchao Zhang for (i=0; i<opt->offset[r+1]-opt->offset[r]; i++) \ 177cd620004SJunchao Zhang for (j=0; j<M; j++) \ 178cd620004SJunchao Zhang for (k=0; k<BS; k++) \ 179cd620004SJunchao Zhang OpApply(Op,u[idx2[i]*MBS+j*BS+k],p2[i*MBS+j*BS+k]); \ 180cd620004SJunchao Zhang } else if (opt->type[r] == PETSCSF_PACKOPT_MULTICOPY) { \ 181cd620004SJunchao Zhang for (i=opt->copy_offset[r]; i<opt->copy_offset[r+1]; i++) { /* i-th piece */ \ 182cd620004SJunchao Zhang u2 = u + idx[opt->copy_start[i]]*MBS; \ 183cd620004SJunchao Zhang l = opt->copy_length[i]*MBS; \ 184cd620004SJunchao Zhang for (j=0; j<l; j++) OpApply(Op,u2[j],p2[j]); \ 185cd620004SJunchao Zhang p2 += l; \ 186cd620004SJunchao Zhang } \ 187cd620004SJunchao Zhang } else if (opt->type[r] == PETSCSF_PACKOPT_STRIDE) { \ 188cd620004SJunchao Zhang u2 = u + idx[opt->offset[r]]*MBS; \ 189cd620004SJunchao Zhang step = opt->stride_step[r]; \ 190cd620004SJunchao Zhang for (i=0; i<opt->stride_n[r]; i++) \ 191cd620004SJunchao Zhang for (j=0; j<M; j++) \ 192cd620004SJunchao Zhang for (k=0; k<BS; k++) \ 193cd620004SJunchao Zhang OpApply(Op,u2[i*step*MBS+j*BS+k],p2[i*MBS+j*BS+k]); \ 194cd620004SJunchao Zhang } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Unknown SFPack optimzation type %D",opt->type[r]); \ 195cd620004SJunchao Zhang } \ 196cd620004SJunchao Zhang } \ 197cd620004SJunchao Zhang PetscFunctionReturn(0); \ 198cd620004SJunchao Zhang } 199cd620004SJunchao Zhang 200cd620004SJunchao Zhang #define DEF_FetchAndOp(Type,BS,EQ,Opname,Op,OpApply) \ 201cd620004SJunchao Zhang static PetscErrorCode CPPJoin4(FetchAnd##Opname,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,const PetscInt *idx,PetscSFPackOpt opt,void *unpacked,void *packed) \ 202cd620004SJunchao Zhang { \ 203cd620004SJunchao Zhang Type *u = (Type*)unpacked,*p = (Type*)packed,t; \ 204cd620004SJunchao Zhang PetscInt i,j,k,bs=link->bs; \ 205cd620004SJunchao Zhang const PetscInt M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */ \ 206cd620004SJunchao Zhang const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */ \ 207cd620004SJunchao Zhang PetscFunctionBegin; \ 208cd620004SJunchao Zhang if (!idx) { \ 209cd620004SJunchao Zhang u += start*bs; \ 210b23bfdefSJunchao Zhang for (i=0; i<count; i++) \ 211b23bfdefSJunchao Zhang for (j=0; j<M; j++) \ 212b23bfdefSJunchao Zhang for (k=0; k<BS; k++) { \ 213b23bfdefSJunchao Zhang t = u[i*MBS+j*BS+k]; \ 214cd620004SJunchao Zhang OpApply(Op,u[i*MBS+j*BS+k],p[i*MBS+j*BS+k]); \ 215cd620004SJunchao Zhang p[i*MBS+j*BS+k] = t; \ 21640e23c03SJunchao Zhang } \ 217cd620004SJunchao Zhang } else { \ 218b23bfdefSJunchao Zhang for (i=0; i<count; i++) \ 219b23bfdefSJunchao Zhang for (j=0; j<M; j++) \ 220b23bfdefSJunchao Zhang for (k=0; k<BS; k++) { \ 221b23bfdefSJunchao Zhang t = u[idx[i]*MBS+j*BS+k]; \ 222cd620004SJunchao Zhang OpApply(Op,u[idx[i]*MBS+j*BS+k],p[i*MBS+j*BS+k]); \ 223cd620004SJunchao Zhang p[i*MBS+j*BS+k] = t; \ 224cd620004SJunchao Zhang } \ 225cd620004SJunchao Zhang } \ 226cd620004SJunchao Zhang PetscFunctionReturn(0); \ 227cd620004SJunchao Zhang } 228cd620004SJunchao Zhang 229cd620004SJunchao Zhang #define DEF_ScatterAndOp(Type,BS,EQ,Opname,Op,OpApply) \ 230cd620004SJunchao Zhang static PetscErrorCode CPPJoin4(ScatterAnd##Opname,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt startx,const PetscInt *idx,const void *xdata,PetscInt starty,const PetscInt *idy,void *ydata) \ 231cd620004SJunchao Zhang { \ 232cd620004SJunchao Zhang const Type *x = (const Type*)xdata; \ 233cd620004SJunchao Zhang Type *y = (Type*)ydata; \ 234cd620004SJunchao Zhang PetscInt i,j,k,bs = link->bs; \ 235cd620004SJunchao Zhang const PetscInt M = (EQ) ? 1 : bs/BS; \ 236cd620004SJunchao Zhang const PetscInt MBS = M*BS; \ 237cd620004SJunchao Zhang PetscFunctionBegin; \ 238cd620004SJunchao Zhang if (!idx) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Petsc should use UnpackAndOp instead of ScatterAndOp");\ 239cd620004SJunchao Zhang if (!idy) { \ 240cd620004SJunchao Zhang y += starty*bs; \ 241cd620004SJunchao Zhang for (i=0; i<count; i++) \ 242cd620004SJunchao Zhang for (j=0; j<M; j++) \ 243cd620004SJunchao Zhang for (k=0; k<BS; k++) \ 244cd620004SJunchao Zhang OpApply(Op,y[i*MBS+j*BS+k],x[idx[i]*MBS+j*BS+k]); \ 245cd620004SJunchao Zhang } else { \ 246cd620004SJunchao Zhang for (i=0; i<count; i++) \ 247cd620004SJunchao Zhang for (j=0; j<M; j++) \ 248cd620004SJunchao Zhang for (k=0; k<BS; k++) \ 249cd620004SJunchao Zhang OpApply(Op,y[idy[i]*MBS+j*BS+k],x[idx[i]*MBS+j*BS+k]); \ 250cd620004SJunchao Zhang } \ 251cd620004SJunchao Zhang PetscFunctionReturn(0); \ 252cd620004SJunchao Zhang } 253cd620004SJunchao Zhang 254cd620004SJunchao Zhang #define DEF_FetchAndOpLocal(Type,BS,EQ,Opname,Op,OpApply) \ 255cd620004SJunchao Zhang static PetscErrorCode CPPJoin4(FetchAnd##Opname##Local,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt rootstart,const PetscInt *rootindices,void *rootdata,PetscInt leafstart,const PetscInt *leafindices,const void *leafdata,void *leafupdate) \ 256cd620004SJunchao Zhang { \ 257cd620004SJunchao Zhang Type *x = (Type*)rootdata,*y2 = (Type*)leafupdate; \ 258cd620004SJunchao Zhang const Type *y = (const Type*)leafdata; \ 259cd620004SJunchao Zhang PetscInt i,j,k,bs = link->bs; \ 260cd620004SJunchao Zhang const PetscInt M = (EQ) ? 1 : bs/BS; \ 261cd620004SJunchao Zhang const PetscInt MBS = M*BS; \ 262cd620004SJunchao Zhang PetscFunctionBegin; \ 263cd620004SJunchao Zhang if (!rootindices) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Petsc should use FetchAndOp instead of FetchAndOpLocal");\ 264cd620004SJunchao Zhang if (!leafindices) { \ 265cd620004SJunchao Zhang y += leafstart*bs; y2 += leafstart*bs; \ 266cd620004SJunchao Zhang for (i=0; i<count; i++) \ 267cd620004SJunchao Zhang for (j=0; j<M; j++) \ 268cd620004SJunchao Zhang for (k=0; k<BS; k++) { \ 269cd620004SJunchao Zhang y2[i*MBS+j*BS+k] = x[rootindices[i]*MBS+j*BS+k]; \ 270cd620004SJunchao Zhang OpApply(Op,x[rootindices[i]*MBS+j*BS+k],y[i*MBS+j*BS+k]); \ 27140e23c03SJunchao Zhang } \ 272b23bfdefSJunchao Zhang } else { \ 273cd620004SJunchao Zhang for (i=0; i<count; i++) \ 274b23bfdefSJunchao Zhang for (j=0; j<M; j++) \ 275b23bfdefSJunchao Zhang for (k=0; k<BS; k++) { \ 276cd620004SJunchao Zhang y2[leafindices[i]*MBS+j*BS+k] = x[rootindices[i]*MBS+j*BS+k]; \ 277cd620004SJunchao Zhang OpApply(Op,x[rootindices[i]*MBS+j*BS+k],y[leafindices[i]*MBS+j*BS+k]); \ 27840e23c03SJunchao Zhang } \ 27940e23c03SJunchao Zhang } \ 28040e23c03SJunchao Zhang PetscFunctionReturn(0); \ 28140e23c03SJunchao Zhang } 28240e23c03SJunchao Zhang 28340e23c03SJunchao Zhang 284b23bfdefSJunchao Zhang /* Pack, Unpack/Fetch ops */ 285b23bfdefSJunchao Zhang #define DEF_Pack(Type,BS,EQ) \ 286b23bfdefSJunchao Zhang DEF_PackFunc(Type,BS,EQ) \ 287cd620004SJunchao Zhang DEF_UnpackFunc(Type,BS,EQ) \ 288cd620004SJunchao Zhang DEF_ScatterAndOp(Type,BS,EQ,Insert,=,OP_ASSIGN) \ 289cd620004SJunchao Zhang static void CPPJoin4(PackInit_Pack,Type,BS,EQ)(PetscSFLink link) { \ 290eb02082bSJunchao Zhang link->h_Pack = CPPJoin4(Pack, Type,BS,EQ); \ 291eb02082bSJunchao Zhang link->h_UnpackAndInsert = CPPJoin4(UnpackAndInsert,Type,BS,EQ); \ 292cd620004SJunchao Zhang link->h_ScatterAndInsert= CPPJoin4(ScatterAndInsert,Type,BS,EQ); \ 29340e23c03SJunchao Zhang } 29440e23c03SJunchao Zhang 295b23bfdefSJunchao Zhang /* Add, Mult ops */ 296b23bfdefSJunchao Zhang #define DEF_Add(Type,BS,EQ) \ 297cd620004SJunchao Zhang DEF_UnpackAndOp (Type,BS,EQ,Add, +,OP_BINARY) \ 298cd620004SJunchao Zhang DEF_UnpackAndOp (Type,BS,EQ,Mult,*,OP_BINARY) \ 299cd620004SJunchao Zhang DEF_FetchAndOp (Type,BS,EQ,Add, +,OP_BINARY) \ 300cd620004SJunchao Zhang DEF_ScatterAndOp (Type,BS,EQ,Add, +,OP_BINARY) \ 301cd620004SJunchao Zhang DEF_ScatterAndOp (Type,BS,EQ,Mult,*,OP_BINARY) \ 302cd620004SJunchao Zhang DEF_FetchAndOpLocal(Type,BS,EQ,Add, +,OP_BINARY) \ 303cd620004SJunchao Zhang static void CPPJoin4(PackInit_Add,Type,BS,EQ)(PetscSFLink link) { \ 304eb02082bSJunchao Zhang link->h_UnpackAndAdd = CPPJoin4(UnpackAndAdd, Type,BS,EQ); \ 305eb02082bSJunchao Zhang link->h_UnpackAndMult = CPPJoin4(UnpackAndMult, Type,BS,EQ); \ 306eb02082bSJunchao Zhang link->h_FetchAndAdd = CPPJoin4(FetchAndAdd, Type,BS,EQ); \ 307cd620004SJunchao Zhang link->h_ScatterAndAdd = CPPJoin4(ScatterAndAdd, Type,BS,EQ); \ 308cd620004SJunchao Zhang link->h_ScatterAndMult = CPPJoin4(ScatterAndMult, Type,BS,EQ); \ 309cd620004SJunchao Zhang link->h_FetchAndAddLocal = CPPJoin4(FetchAndAddLocal,Type,BS,EQ); \ 31040e23c03SJunchao Zhang } 31140e23c03SJunchao Zhang 312b23bfdefSJunchao Zhang /* Max, Min ops */ 313b23bfdefSJunchao Zhang #define DEF_Cmp(Type,BS,EQ) \ 314cd620004SJunchao Zhang DEF_UnpackAndOp (Type,BS,EQ,Max,PetscMax,OP_FUNCTION) \ 315cd620004SJunchao Zhang DEF_UnpackAndOp (Type,BS,EQ,Min,PetscMin,OP_FUNCTION) \ 316cd620004SJunchao Zhang DEF_ScatterAndOp(Type,BS,EQ,Max,PetscMax,OP_FUNCTION) \ 317cd620004SJunchao Zhang DEF_ScatterAndOp(Type,BS,EQ,Min,PetscMin,OP_FUNCTION) \ 318cd620004SJunchao Zhang static void CPPJoin4(PackInit_Compare,Type,BS,EQ)(PetscSFLink link) { \ 319eb02082bSJunchao Zhang link->h_UnpackAndMax = CPPJoin4(UnpackAndMax, Type,BS,EQ); \ 320eb02082bSJunchao Zhang link->h_UnpackAndMin = CPPJoin4(UnpackAndMin, Type,BS,EQ); \ 321cd620004SJunchao Zhang link->h_ScatterAndMax = CPPJoin4(ScatterAndMax, Type,BS,EQ); \ 322cd620004SJunchao Zhang link->h_ScatterAndMin = CPPJoin4(ScatterAndMin, Type,BS,EQ); \ 323b23bfdefSJunchao Zhang } 324b23bfdefSJunchao Zhang 325b23bfdefSJunchao Zhang /* Logical ops. 326cd620004SJunchao Zhang The operator in OP_LXOR should be empty but is ||. It is not used. Put here to avoid 32740e23c03SJunchao Zhang the compilation warning "empty macro arguments are undefined in ISO C90" 32840e23c03SJunchao Zhang */ 329b23bfdefSJunchao Zhang #define DEF_Log(Type,BS,EQ) \ 330cd620004SJunchao Zhang DEF_UnpackAndOp (Type,BS,EQ,LAND,&&,OP_BINARY) \ 331cd620004SJunchao Zhang DEF_UnpackAndOp (Type,BS,EQ,LOR, ||,OP_BINARY) \ 332cd620004SJunchao Zhang DEF_UnpackAndOp (Type,BS,EQ,LXOR,||, OP_LXOR) \ 333cd620004SJunchao Zhang DEF_ScatterAndOp(Type,BS,EQ,LAND,&&,OP_BINARY) \ 334cd620004SJunchao Zhang DEF_ScatterAndOp(Type,BS,EQ,LOR, ||,OP_BINARY) \ 335cd620004SJunchao Zhang DEF_ScatterAndOp(Type,BS,EQ,LXOR,||, OP_LXOR) \ 336cd620004SJunchao Zhang static void CPPJoin4(PackInit_Logical,Type,BS,EQ)(PetscSFLink link) { \ 337eb02082bSJunchao Zhang link->h_UnpackAndLAND = CPPJoin4(UnpackAndLAND, Type,BS,EQ); \ 338eb02082bSJunchao Zhang link->h_UnpackAndLOR = CPPJoin4(UnpackAndLOR, Type,BS,EQ); \ 339eb02082bSJunchao Zhang link->h_UnpackAndLXOR = CPPJoin4(UnpackAndLXOR, Type,BS,EQ); \ 340cd620004SJunchao Zhang link->h_ScatterAndLAND = CPPJoin4(ScatterAndLAND,Type,BS,EQ); \ 341cd620004SJunchao Zhang link->h_ScatterAndLOR = CPPJoin4(ScatterAndLOR, Type,BS,EQ); \ 342cd620004SJunchao Zhang link->h_ScatterAndLXOR = CPPJoin4(ScatterAndLXOR,Type,BS,EQ); \ 34340e23c03SJunchao Zhang } 34440e23c03SJunchao Zhang 345b23bfdefSJunchao Zhang /* Bitwise ops */ 346b23bfdefSJunchao Zhang #define DEF_Bit(Type,BS,EQ) \ 347cd620004SJunchao Zhang DEF_UnpackAndOp (Type,BS,EQ,BAND,&,OP_BINARY) \ 348cd620004SJunchao Zhang DEF_UnpackAndOp (Type,BS,EQ,BOR, |,OP_BINARY) \ 349cd620004SJunchao Zhang DEF_UnpackAndOp (Type,BS,EQ,BXOR,^,OP_BINARY) \ 350cd620004SJunchao Zhang DEF_ScatterAndOp(Type,BS,EQ,BAND,&,OP_BINARY) \ 351cd620004SJunchao Zhang DEF_ScatterAndOp(Type,BS,EQ,BOR, |,OP_BINARY) \ 352cd620004SJunchao Zhang DEF_ScatterAndOp(Type,BS,EQ,BXOR,^,OP_BINARY) \ 353cd620004SJunchao Zhang static void CPPJoin4(PackInit_Bitwise,Type,BS,EQ)(PetscSFLink link) { \ 354eb02082bSJunchao Zhang link->h_UnpackAndBAND = CPPJoin4(UnpackAndBAND, Type,BS,EQ); \ 355eb02082bSJunchao Zhang link->h_UnpackAndBOR = CPPJoin4(UnpackAndBOR, Type,BS,EQ); \ 356eb02082bSJunchao Zhang link->h_UnpackAndBXOR = CPPJoin4(UnpackAndBXOR, Type,BS,EQ); \ 357cd620004SJunchao Zhang link->h_ScatterAndBAND = CPPJoin4(ScatterAndBAND,Type,BS,EQ); \ 358cd620004SJunchao Zhang link->h_ScatterAndBOR = CPPJoin4(ScatterAndBOR, Type,BS,EQ); \ 359cd620004SJunchao Zhang link->h_ScatterAndBXOR = CPPJoin4(ScatterAndBXOR,Type,BS,EQ); \ 36040e23c03SJunchao Zhang } 36140e23c03SJunchao Zhang 362cd620004SJunchao Zhang /* Maxloc, Minloc ops */ 363cd620004SJunchao Zhang #define DEF_Xloc(Type,BS,EQ) \ 364cd620004SJunchao Zhang DEF_UnpackAndOp (Type,BS,EQ,Max,>,OP_XLOC) \ 365cd620004SJunchao Zhang DEF_UnpackAndOp (Type,BS,EQ,Min,<,OP_XLOC) \ 366cd620004SJunchao Zhang DEF_ScatterAndOp(Type,BS,EQ,Max,>,OP_XLOC) \ 367cd620004SJunchao Zhang DEF_ScatterAndOp(Type,BS,EQ,Min,<,OP_XLOC) \ 368cd620004SJunchao Zhang static void CPPJoin4(PackInit_Xloc,Type,BS,EQ)(PetscSFLink link) { \ 369cd620004SJunchao Zhang link->h_UnpackAndMaxloc = CPPJoin4(UnpackAndMax, Type,BS,EQ); \ 370cd620004SJunchao Zhang link->h_UnpackAndMinloc = CPPJoin4(UnpackAndMin, Type,BS,EQ); \ 371cd620004SJunchao Zhang link->h_ScatterAndMaxloc = CPPJoin4(ScatterAndMax,Type,BS,EQ); \ 372cd620004SJunchao Zhang link->h_ScatterAndMinloc = CPPJoin4(ScatterAndMin,Type,BS,EQ); \ 37340e23c03SJunchao Zhang } 37440e23c03SJunchao Zhang 375b23bfdefSJunchao Zhang #define DEF_IntegerType(Type,BS,EQ) \ 376b23bfdefSJunchao Zhang DEF_Pack(Type,BS,EQ) \ 377b23bfdefSJunchao Zhang DEF_Add(Type,BS,EQ) \ 378b23bfdefSJunchao Zhang DEF_Cmp(Type,BS,EQ) \ 379b23bfdefSJunchao Zhang DEF_Log(Type,BS,EQ) \ 380b23bfdefSJunchao Zhang DEF_Bit(Type,BS,EQ) \ 381cd620004SJunchao Zhang static void CPPJoin4(PackInit_IntegerType,Type,BS,EQ)(PetscSFLink link) { \ 382b23bfdefSJunchao Zhang CPPJoin4(PackInit_Pack,Type,BS,EQ)(link); \ 383b23bfdefSJunchao Zhang CPPJoin4(PackInit_Add,Type,BS,EQ)(link); \ 384b23bfdefSJunchao Zhang CPPJoin4(PackInit_Compare,Type,BS,EQ)(link); \ 385b23bfdefSJunchao Zhang CPPJoin4(PackInit_Logical,Type,BS,EQ)(link); \ 386b23bfdefSJunchao Zhang CPPJoin4(PackInit_Bitwise,Type,BS,EQ)(link); \ 38740e23c03SJunchao Zhang } 38840e23c03SJunchao Zhang 389b23bfdefSJunchao Zhang #define DEF_RealType(Type,BS,EQ) \ 390b23bfdefSJunchao Zhang DEF_Pack(Type,BS,EQ) \ 391b23bfdefSJunchao Zhang DEF_Add(Type,BS,EQ) \ 392b23bfdefSJunchao Zhang DEF_Cmp(Type,BS,EQ) \ 393cd620004SJunchao Zhang static void CPPJoin4(PackInit_RealType,Type,BS,EQ)(PetscSFLink link) { \ 394b23bfdefSJunchao Zhang CPPJoin4(PackInit_Pack,Type,BS,EQ)(link); \ 395b23bfdefSJunchao Zhang CPPJoin4(PackInit_Add,Type,BS,EQ)(link); \ 396b23bfdefSJunchao Zhang CPPJoin4(PackInit_Compare,Type,BS,EQ)(link); \ 397b23bfdefSJunchao Zhang } 39840e23c03SJunchao Zhang 39940e23c03SJunchao Zhang #if defined(PETSC_HAVE_COMPLEX) 400b23bfdefSJunchao Zhang #define DEF_ComplexType(Type,BS,EQ) \ 401b23bfdefSJunchao Zhang DEF_Pack(Type,BS,EQ) \ 402b23bfdefSJunchao Zhang DEF_Add(Type,BS,EQ) \ 403cd620004SJunchao Zhang static void CPPJoin4(PackInit_ComplexType,Type,BS,EQ)(PetscSFLink link) { \ 404b23bfdefSJunchao Zhang CPPJoin4(PackInit_Pack,Type,BS,EQ)(link); \ 405b23bfdefSJunchao Zhang CPPJoin4(PackInit_Add,Type,BS,EQ)(link); \ 406b23bfdefSJunchao Zhang } 40740e23c03SJunchao Zhang #endif 408b23bfdefSJunchao Zhang 409b23bfdefSJunchao Zhang #define DEF_DumbType(Type,BS,EQ) \ 410b23bfdefSJunchao Zhang DEF_Pack(Type,BS,EQ) \ 411cd620004SJunchao Zhang static void CPPJoin4(PackInit_DumbType,Type,BS,EQ)(PetscSFLink link) { \ 412b23bfdefSJunchao Zhang CPPJoin4(PackInit_Pack,Type,BS,EQ)(link); \ 413b23bfdefSJunchao Zhang } 414b23bfdefSJunchao Zhang 415b23bfdefSJunchao Zhang /* Maxloc, Minloc */ 416cd620004SJunchao Zhang #define DEF_PairType(Type,BS,EQ) \ 417cd620004SJunchao Zhang DEF_Pack(Type,BS,EQ) \ 418cd620004SJunchao Zhang DEF_Xloc(Type,BS,EQ) \ 419cd620004SJunchao Zhang static void CPPJoin4(PackInit_PairType,Type,BS,EQ)(PetscSFLink link) { \ 420cd620004SJunchao Zhang CPPJoin4(PackInit_Pack,Type,BS,EQ)(link); \ 421cd620004SJunchao Zhang CPPJoin4(PackInit_Xloc,Type,BS,EQ)(link); \ 422b23bfdefSJunchao Zhang } 423b23bfdefSJunchao Zhang 424b23bfdefSJunchao Zhang DEF_IntegerType(PetscInt,1,1) /* unit = 1 MPIU_INT */ 425b23bfdefSJunchao Zhang DEF_IntegerType(PetscInt,2,1) /* unit = 2 MPIU_INTs */ 426b23bfdefSJunchao Zhang DEF_IntegerType(PetscInt,4,1) /* unit = 4 MPIU_INTs */ 427b23bfdefSJunchao Zhang DEF_IntegerType(PetscInt,8,1) /* unit = 8 MPIU_INTs */ 428b23bfdefSJunchao Zhang DEF_IntegerType(PetscInt,1,0) /* unit = 1*n MPIU_INTs, n>1 */ 429b23bfdefSJunchao Zhang DEF_IntegerType(PetscInt,2,0) /* unit = 2*n MPIU_INTs, n>1 */ 430b23bfdefSJunchao Zhang DEF_IntegerType(PetscInt,4,0) /* unit = 4*n MPIU_INTs, n>1 */ 431b23bfdefSJunchao Zhang DEF_IntegerType(PetscInt,8,0) /* unit = 8*n MPIU_INTs, n>1. Routines with bigger BS are tried first. */ 432b23bfdefSJunchao Zhang 433b23bfdefSJunchao Zhang #if defined(PETSC_USE_64BIT_INDICES) /* Do not need (though it is OK) to generate redundant functions if PetscInt is int */ 434b23bfdefSJunchao Zhang DEF_IntegerType(int,1,1) 435b23bfdefSJunchao Zhang DEF_IntegerType(int,2,1) 436b23bfdefSJunchao Zhang DEF_IntegerType(int,4,1) 437b23bfdefSJunchao Zhang DEF_IntegerType(int,8,1) 438b23bfdefSJunchao Zhang DEF_IntegerType(int,1,0) 439b23bfdefSJunchao Zhang DEF_IntegerType(int,2,0) 440b23bfdefSJunchao Zhang DEF_IntegerType(int,4,0) 441b23bfdefSJunchao Zhang DEF_IntegerType(int,8,0) 442b23bfdefSJunchao Zhang #endif 443b23bfdefSJunchao Zhang 444b23bfdefSJunchao Zhang /* The typedefs are used to get a typename without space that CPPJoin can handle */ 445b23bfdefSJunchao Zhang typedef signed char SignedChar; 446b23bfdefSJunchao Zhang DEF_IntegerType(SignedChar,1,1) 447b23bfdefSJunchao Zhang DEF_IntegerType(SignedChar,2,1) 448b23bfdefSJunchao Zhang DEF_IntegerType(SignedChar,4,1) 449b23bfdefSJunchao Zhang DEF_IntegerType(SignedChar,8,1) 450b23bfdefSJunchao Zhang DEF_IntegerType(SignedChar,1,0) 451b23bfdefSJunchao Zhang DEF_IntegerType(SignedChar,2,0) 452b23bfdefSJunchao Zhang DEF_IntegerType(SignedChar,4,0) 453b23bfdefSJunchao Zhang DEF_IntegerType(SignedChar,8,0) 454b23bfdefSJunchao Zhang 455b23bfdefSJunchao Zhang typedef unsigned char UnsignedChar; 456b23bfdefSJunchao Zhang DEF_IntegerType(UnsignedChar,1,1) 457b23bfdefSJunchao Zhang DEF_IntegerType(UnsignedChar,2,1) 458b23bfdefSJunchao Zhang DEF_IntegerType(UnsignedChar,4,1) 459b23bfdefSJunchao Zhang DEF_IntegerType(UnsignedChar,8,1) 460b23bfdefSJunchao Zhang DEF_IntegerType(UnsignedChar,1,0) 461b23bfdefSJunchao Zhang DEF_IntegerType(UnsignedChar,2,0) 462b23bfdefSJunchao Zhang DEF_IntegerType(UnsignedChar,4,0) 463b23bfdefSJunchao Zhang DEF_IntegerType(UnsignedChar,8,0) 464b23bfdefSJunchao Zhang 465b23bfdefSJunchao Zhang DEF_RealType(PetscReal,1,1) 466b23bfdefSJunchao Zhang DEF_RealType(PetscReal,2,1) 467b23bfdefSJunchao Zhang DEF_RealType(PetscReal,4,1) 468b23bfdefSJunchao Zhang DEF_RealType(PetscReal,8,1) 469b23bfdefSJunchao Zhang DEF_RealType(PetscReal,1,0) 470b23bfdefSJunchao Zhang DEF_RealType(PetscReal,2,0) 471b23bfdefSJunchao Zhang DEF_RealType(PetscReal,4,0) 472b23bfdefSJunchao Zhang DEF_RealType(PetscReal,8,0) 473b23bfdefSJunchao Zhang 474b23bfdefSJunchao Zhang #if defined(PETSC_HAVE_COMPLEX) 475b23bfdefSJunchao Zhang DEF_ComplexType(PetscComplex,1,1) 476b23bfdefSJunchao Zhang DEF_ComplexType(PetscComplex,2,1) 477b23bfdefSJunchao Zhang DEF_ComplexType(PetscComplex,4,1) 478b23bfdefSJunchao Zhang DEF_ComplexType(PetscComplex,8,1) 479b23bfdefSJunchao Zhang DEF_ComplexType(PetscComplex,1,0) 480b23bfdefSJunchao Zhang DEF_ComplexType(PetscComplex,2,0) 481b23bfdefSJunchao Zhang DEF_ComplexType(PetscComplex,4,0) 482b23bfdefSJunchao Zhang DEF_ComplexType(PetscComplex,8,0) 483b23bfdefSJunchao Zhang #endif 484b23bfdefSJunchao Zhang 485cd620004SJunchao Zhang #define PairType(Type1,Type2) Type1##_##Type2 486cd620004SJunchao Zhang typedef struct {int u; int i;} PairType(int,int); 487cd620004SJunchao Zhang typedef struct {PetscInt u; PetscInt i;} PairType(PetscInt,PetscInt); 488cd620004SJunchao Zhang DEF_PairType(PairType(int,int),1,1) 489cd620004SJunchao Zhang DEF_PairType(PairType(PetscInt,PetscInt),1,1) 490b23bfdefSJunchao Zhang 491b23bfdefSJunchao Zhang /* If we don't know the basic type, we treat it as a stream of chars or ints */ 492b23bfdefSJunchao Zhang DEF_DumbType(char,1,1) 493b23bfdefSJunchao Zhang DEF_DumbType(char,2,1) 494b23bfdefSJunchao Zhang DEF_DumbType(char,4,1) 495b23bfdefSJunchao Zhang DEF_DumbType(char,1,0) 496b23bfdefSJunchao Zhang DEF_DumbType(char,2,0) 497b23bfdefSJunchao Zhang DEF_DumbType(char,4,0) 498b23bfdefSJunchao Zhang 499eb02082bSJunchao Zhang typedef int DumbInt; /* To have a different name than 'int' used above. The name is used to make routine names. */ 500b23bfdefSJunchao Zhang DEF_DumbType(DumbInt,1,1) 501b23bfdefSJunchao Zhang DEF_DumbType(DumbInt,2,1) 502b23bfdefSJunchao Zhang DEF_DumbType(DumbInt,4,1) 503b23bfdefSJunchao Zhang DEF_DumbType(DumbInt,8,1) 504b23bfdefSJunchao Zhang DEF_DumbType(DumbInt,1,0) 505b23bfdefSJunchao Zhang DEF_DumbType(DumbInt,2,0) 506b23bfdefSJunchao Zhang DEF_DumbType(DumbInt,4,0) 507b23bfdefSJunchao Zhang DEF_DumbType(DumbInt,8,0) 50840e23c03SJunchao Zhang 50940e23c03SJunchao Zhang #if !defined(PETSC_HAVE_MPI_TYPE_DUP) 51040e23c03SJunchao Zhang PETSC_STATIC_INLINE int MPI_Type_dup(MPI_Datatype datatype,MPI_Datatype *newtype) 51140e23c03SJunchao Zhang { 51240e23c03SJunchao Zhang int ierr; 51340e23c03SJunchao Zhang ierr = MPI_Type_contiguous(1,datatype,newtype); if (ierr) return ierr; 51440e23c03SJunchao Zhang ierr = MPI_Type_commit(newtype); if (ierr) return ierr; 51540e23c03SJunchao Zhang return MPI_SUCCESS; 51640e23c03SJunchao Zhang } 51740e23c03SJunchao Zhang #endif 51840e23c03SJunchao Zhang 519cd620004SJunchao Zhang /* 520cd620004SJunchao Zhang The routine Creates a communication link for the given operation. It first looks up its link cache. If 521cd620004SJunchao Zhang there is a free & suitable one, it uses it. Otherwise it creates a new one. 522cd620004SJunchao Zhang 523cd620004SJunchao Zhang A link contains buffers and MPI requests for send/recv. It also contains pack/unpack routines to pack/unpack 524cd620004SJunchao Zhang root/leafdata to/from these buffers. Buffers are allocated at our discretion. When we find root/leafata 525cd620004SJunchao Zhang can be directly passed to MPI, we won't allocate them. Even we allocate buffers, we only allocate 526cd620004SJunchao Zhang those that are needed by the given `sfop` and `op`, in other words, we do lazy memory-allocation. 527cd620004SJunchao Zhang 528cd620004SJunchao Zhang The routine also allocates buffers on CPU when one does not use gpu-aware MPI but data is on GPU. 529cd620004SJunchao Zhang 530cd620004SJunchao Zhang In SFBasic, MPI requests are persistent. They are init'ed until we try to get requests from a link. 531cd620004SJunchao Zhang 532cd620004SJunchao Zhang The routine is shared by SFBasic and SFNeighbor based on the fact they all deal with sparse graphs and 533cd620004SJunchao Zhang need pack/unpack data. 534cd620004SJunchao Zhang */ 535cd620004SJunchao Zhang PetscErrorCode PetscSFLinkCreate(PetscSF sf,MPI_Datatype unit,PetscMemType rootmtype,const void *rootdata,PetscMemType leafmtype,const void *leafdata,MPI_Op op,PetscSFOperation sfop,PetscSFLink *mylink) 53640e23c03SJunchao Zhang { 53740e23c03SJunchao Zhang PetscErrorCode ierr; 538cd620004SJunchao Zhang PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 539cd620004SJunchao Zhang PetscInt i,j,k,nrootreqs,nleafreqs,nreqs; 540cd620004SJunchao Zhang PetscSFLink *p,link; 541cd620004SJunchao Zhang PetscSFDirection direction; 542cd620004SJunchao Zhang MPI_Request *reqs = NULL; 543cd620004SJunchao Zhang PetscBool match,rootdirect[2],leafdirect[2]; 544cd620004SJunchao Zhang PetscMemType rootmtype_mpi,leafmtype_mpi; /* mtypes seen by MPI */ 545cd620004SJunchao Zhang PetscInt rootdirect_mpi,leafdirect_mpi; /* root/leafdirect seen by MPI*/ 546cd620004SJunchao Zhang 547cd620004SJunchao Zhang PetscFunctionBegin; 548cd620004SJunchao Zhang ierr = PetscSFSetErrorOnUnsupportedOverlap(sf,unit,rootdata,leafdata);CHKERRQ(ierr); 549cd620004SJunchao Zhang 550cd620004SJunchao Zhang /* Can we directly use root/leafdirect with the given sf, sfop and op? */ 551cd620004SJunchao Zhang for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) { 552cd620004SJunchao Zhang if (sfop == PETSCSF_BCAST) { 553cd620004SJunchao Zhang rootdirect[i] = bas->rootcontig[i]; /* Pack roots */ 554cd620004SJunchao Zhang leafdirect[i] = (sf->leafcontig[i] && op == MPIU_REPLACE) ? PETSC_TRUE : PETSC_FALSE; /* Unpack leaves */ 555cd620004SJunchao Zhang } else if (sfop == PETSCSF_REDUCE) { 556cd620004SJunchao Zhang leafdirect[i] = sf->leafcontig[i]; /* Pack leaves */ 557cd620004SJunchao Zhang rootdirect[i] = (bas->rootcontig[i] && op == MPIU_REPLACE) ? PETSC_TRUE : PETSC_FALSE; /* Unpack roots */ 558cd620004SJunchao Zhang } else { /* PETSCSF_FETCH */ 559cd620004SJunchao Zhang rootdirect[i] = PETSC_FALSE; /* FETCH always need a separate rootbuf */ 560cd620004SJunchao Zhang leafdirect[i] = PETSC_FALSE; /* We also force allocating a separate leafbuf so that leafdata and leafupdate can share mpi requests */ 561cd620004SJunchao Zhang } 562cd620004SJunchao Zhang } 563cd620004SJunchao Zhang 564cd620004SJunchao Zhang if (use_gpu_aware_mpi) { 565cd620004SJunchao Zhang rootmtype_mpi = rootmtype; 566cd620004SJunchao Zhang leafmtype_mpi = leafmtype; 567cd620004SJunchao Zhang } else { 568cd620004SJunchao Zhang rootmtype_mpi = leafmtype_mpi = PETSC_MEMTYPE_HOST; 569cd620004SJunchao Zhang } 570cd620004SJunchao Zhang /* Will root/leafdata be directly accessed by MPI? Without use_gpu_aware_mpi, device data is bufferred on host and then passed to MPI */ 571cd620004SJunchao Zhang rootdirect_mpi = rootdirect[PETSCSF_REMOTE] && (rootmtype_mpi == rootmtype)? 1 : 0; 572cd620004SJunchao Zhang leafdirect_mpi = leafdirect[PETSCSF_REMOTE] && (leafmtype_mpi == leafmtype)? 1 : 0; 573cd620004SJunchao Zhang 574cd620004SJunchao Zhang direction = (sfop == PETSCSF_BCAST)? PETSCSF_ROOT2LEAF : PETSCSF_LEAF2ROOT; 575cd620004SJunchao Zhang nrootreqs = bas->nrootreqs; 576cd620004SJunchao Zhang nleafreqs = sf->nleafreqs; 577cd620004SJunchao Zhang 578cd620004SJunchao Zhang /* Look for free links in cache */ 579cd620004SJunchao Zhang for (p=&bas->avail; (link=*p); p=&link->next) { 580cd620004SJunchao Zhang ierr = MPIPetsc_Type_compare(unit,link->unit,&match);CHKERRQ(ierr); 581cd620004SJunchao Zhang if (match) { 582cd620004SJunchao Zhang /* If root/leafdata will be directly passed to MPI, test if the data used to initialized the MPI requests matches with current. 583cd620004SJunchao Zhang If not, free old requests. New requests will be lazily init'ed until one calls PetscSFLinkGetMPIBuffersAndRequests(). 584cd620004SJunchao Zhang */ 585cd620004SJunchao Zhang if (rootdirect_mpi && sf->persistent && link->rootreqsinited[direction][rootmtype][1] && link->rootdatadirect[direction][rootmtype] != rootdata) { 586cd620004SJunchao Zhang reqs = link->rootreqs[direction][rootmtype][1]; /* Here, rootmtype = rootmtype_mpi */ 587cd620004SJunchao Zhang for (i=0; i<nrootreqs; i++) {if (reqs[i] != MPI_REQUEST_NULL) {ierr = MPI_Request_free(&reqs[i]);CHKERRQ(ierr);}} 588cd620004SJunchao Zhang link->rootreqsinited[direction][rootmtype][1] = PETSC_FALSE; 589cd620004SJunchao Zhang } 590cd620004SJunchao Zhang if (leafdirect_mpi && sf->persistent && link->leafreqsinited[direction][leafmtype][1] && link->leafdatadirect[direction][leafmtype] != leafdata) { 591cd620004SJunchao Zhang reqs = link->leafreqs[direction][leafmtype][1]; 592cd620004SJunchao Zhang for (i=0; i<nleafreqs; i++) {if (reqs[i] != MPI_REQUEST_NULL) {ierr = MPI_Request_free(&reqs[i]);CHKERRQ(ierr);}} 593cd620004SJunchao Zhang link->leafreqsinited[direction][leafmtype][1] = PETSC_FALSE; 594cd620004SJunchao Zhang } 595cd620004SJunchao Zhang *p = link->next; /* Remove from available list */ 596cd620004SJunchao Zhang goto found; 597cd620004SJunchao Zhang } 598cd620004SJunchao Zhang } 599cd620004SJunchao Zhang 600cd620004SJunchao Zhang ierr = PetscNew(&link);CHKERRQ(ierr); 601cd620004SJunchao Zhang ierr = PetscSFLinkSetUp_Host(sf,link,unit);CHKERRQ(ierr); 602cd620004SJunchao Zhang ierr = PetscCommGetNewTag(PetscObjectComm((PetscObject)sf),&link->tag);CHKERRQ(ierr); /* One tag per link */ 603cd620004SJunchao Zhang 604cd620004SJunchao Zhang nreqs = (nrootreqs+nleafreqs)*8; 605cd620004SJunchao Zhang ierr = PetscMalloc1(nreqs,&link->reqs);CHKERRQ(ierr); 606cd620004SJunchao Zhang for (i=0; i<nreqs; i++) link->reqs[i] = MPI_REQUEST_NULL; /* Initialized to NULL so that we know which need to be freed in Destroy */ 607cd620004SJunchao Zhang 608cd620004SJunchao Zhang for (i=0; i<2; i++) { /* Two communication directions */ 609cd620004SJunchao Zhang for (j=0; j<2; j++) { /* Two memory types */ 610cd620004SJunchao Zhang for (k=0; k<2; k++) { /* root/leafdirect 0 or 1 */ 611cd620004SJunchao Zhang link->rootreqs[i][j][k] = link->reqs + nrootreqs*(4*i+2*j+k); 612cd620004SJunchao Zhang link->leafreqs[i][j][k] = link->reqs + nrootreqs*8 + nleafreqs*(4*i+2*j+k); 613cd620004SJunchao Zhang } 614cd620004SJunchao Zhang } 615cd620004SJunchao Zhang } 616cd620004SJunchao Zhang 617cd620004SJunchao Zhang found: 618cd620004SJunchao Zhang if ((rootmtype == PETSC_MEMTYPE_DEVICE || leafmtype == PETSC_MEMTYPE_DEVICE) && !link->deviceinited) {ierr = PetscSFLinkSetUp_Device(sf,link,unit);CHKERRQ(ierr);} 619cd620004SJunchao Zhang 620cd620004SJunchao Zhang /* Allocate buffers along root/leafdata */ 621cd620004SJunchao Zhang for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) { 622cd620004SJunchao Zhang /* For local communication, buffers are only needed when roots and leaves have different mtypes */ 623cd620004SJunchao Zhang if (i == PETSCSF_LOCAL && rootmtype == leafmtype) continue; 624cd620004SJunchao Zhang if (bas->rootbuflen[i]) { 625cd620004SJunchao Zhang if (rootdirect[i]) { /* Aha, we disguise rootdata as rootbuf */ 626cd620004SJunchao Zhang link->rootbuf[i][rootmtype] = (char*)rootdata + bas->rootstart[i]*link->unitbytes; 627cd620004SJunchao Zhang } else { /* Have to have a separate rootbuf */ 628cd620004SJunchao Zhang if (!link->rootbuf_alloc[i][rootmtype]) { 629cd620004SJunchao Zhang ierr = PetscMallocWithMemType(rootmtype,bas->rootbuflen[i]*link->unitbytes,(void**)&link->rootbuf_alloc[i][rootmtype]);CHKERRQ(ierr); 630cd620004SJunchao Zhang } 631cd620004SJunchao Zhang link->rootbuf[i][rootmtype] = link->rootbuf_alloc[i][rootmtype]; 632cd620004SJunchao Zhang } 633cd620004SJunchao Zhang } 634cd620004SJunchao Zhang 635cd620004SJunchao Zhang if (sf->leafbuflen[i]) { 636cd620004SJunchao Zhang if (leafdirect[i]) { 637cd620004SJunchao Zhang link->leafbuf[i][leafmtype] = (char*)leafdata + sf->leafstart[i]*link->unitbytes; 638cd620004SJunchao Zhang } else { 639cd620004SJunchao Zhang if (!link->leafbuf_alloc[i][leafmtype]) { 640cd620004SJunchao Zhang ierr = PetscMallocWithMemType(leafmtype,sf->leafbuflen[i]*link->unitbytes,(void**)&link->leafbuf_alloc[i][leafmtype]);CHKERRQ(ierr); 641cd620004SJunchao Zhang } 642cd620004SJunchao Zhang link->leafbuf[i][leafmtype] = link->leafbuf_alloc[i][leafmtype]; 643cd620004SJunchao Zhang } 644cd620004SJunchao Zhang } 645cd620004SJunchao Zhang } 646cd620004SJunchao Zhang 647cd620004SJunchao Zhang /* Allocate buffers on host for buffering data on device in cast not use_gpu_aware_mpi */ 648cd620004SJunchao Zhang if (rootmtype == PETSC_MEMTYPE_DEVICE && rootmtype_mpi == PETSC_MEMTYPE_HOST) { 649cd620004SJunchao Zhang if(!link->rootbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]) { 650cd620004SJunchao Zhang ierr = PetscMalloc(bas->rootbuflen[PETSCSF_REMOTE]*link->unitbytes,&link->rootbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]);CHKERRQ(ierr); 651cd620004SJunchao Zhang } 652cd620004SJunchao Zhang link->rootbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST] = link->rootbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]; 653cd620004SJunchao Zhang } 654cd620004SJunchao Zhang if (leafmtype == PETSC_MEMTYPE_DEVICE && leafmtype_mpi == PETSC_MEMTYPE_HOST) { 655cd620004SJunchao Zhang if (!link->leafbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]) { 656cd620004SJunchao Zhang ierr = PetscMalloc(sf->leafbuflen[PETSCSF_REMOTE]*link->unitbytes,&link->leafbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]);CHKERRQ(ierr); 657cd620004SJunchao Zhang } 658cd620004SJunchao Zhang link->leafbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST] = link->leafbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]; 659cd620004SJunchao Zhang } 660cd620004SJunchao Zhang 661cd620004SJunchao Zhang /* Set `current` state of the link. They may change between different SF invocations with the same link */ 662cd620004SJunchao Zhang if (sf->persistent) { /* If data is directly passed to MPI and inits MPI requests, record the data for comparison on future invocations */ 663cd620004SJunchao Zhang if (rootdirect_mpi) link->rootdatadirect[direction][rootmtype] = rootdata; 664cd620004SJunchao Zhang if (leafdirect_mpi) link->leafdatadirect[direction][leafmtype] = leafdata; 665cd620004SJunchao Zhang } 666cd620004SJunchao Zhang 667cd620004SJunchao Zhang link->rootdata = rootdata; /* root/leafdata are keys to look up links in PetscSFXxxEnd */ 668cd620004SJunchao Zhang link->leafdata = leafdata; 669cd620004SJunchao Zhang for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) { 670cd620004SJunchao Zhang link->rootdirect[i] = rootdirect[i]; 671cd620004SJunchao Zhang link->leafdirect[i] = leafdirect[i]; 672cd620004SJunchao Zhang } 673cd620004SJunchao Zhang link->rootdirect_mpi = rootdirect_mpi; 674cd620004SJunchao Zhang link->leafdirect_mpi = leafdirect_mpi; 675cd620004SJunchao Zhang link->rootmtype = rootmtype; 676cd620004SJunchao Zhang link->leafmtype = leafmtype; 677cd620004SJunchao Zhang link->rootmtype_mpi = rootmtype_mpi; 678cd620004SJunchao Zhang link->leafmtype_mpi = leafmtype_mpi; 679cd620004SJunchao Zhang 680cd620004SJunchao Zhang link->next = bas->inuse; 681cd620004SJunchao Zhang bas->inuse = link; 682cd620004SJunchao Zhang *mylink = link; 683cd620004SJunchao Zhang PetscFunctionReturn(0); 684cd620004SJunchao Zhang } 685cd620004SJunchao Zhang 686cd620004SJunchao Zhang /* Return root/leaf buffers and MPI requests attached to the link for MPI communication in the given direction. 687cd620004SJunchao Zhang If the sf uses persistent requests and the requests have not been initialized, then initialize them. 688cd620004SJunchao Zhang */ 689cd620004SJunchao Zhang PetscErrorCode PetscSFLinkGetMPIBuffersAndRequests(PetscSF sf,PetscSFLink link,PetscSFDirection direction,void **rootbuf, void **leafbuf,MPI_Request **rootreqs,MPI_Request **leafreqs) 690cd620004SJunchao Zhang { 691cd620004SJunchao Zhang PetscErrorCode ierr; 692cd620004SJunchao Zhang PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 693cd620004SJunchao Zhang PetscInt i,j,nrootranks,ndrootranks,nleafranks,ndleafranks; 694cd620004SJunchao Zhang const PetscInt *rootoffset,*leafoffset; 695cd620004SJunchao Zhang PetscMPIInt n; 696cd620004SJunchao Zhang MPI_Aint disp; 697cd620004SJunchao Zhang MPI_Comm comm = PetscObjectComm((PetscObject)sf); 698cd620004SJunchao Zhang MPI_Datatype unit = link->unit; 699cd620004SJunchao Zhang const PetscMemType rootmtype_mpi = link->rootmtype_mpi,leafmtype_mpi = link->leafmtype_mpi; /* Used to select buffers passed to MPI */ 700cd620004SJunchao Zhang const PetscInt rootdirect_mpi = link->rootdirect_mpi,leafdirect_mpi = link->leafdirect_mpi; 701cd620004SJunchao Zhang 702cd620004SJunchao Zhang PetscFunctionBegin; 703cd620004SJunchao Zhang /* Init persistent MPI requests if not yet. Currently only SFBasic uses persistent MPI */ 704cd620004SJunchao Zhang if (sf->persistent) { 705cd620004SJunchao Zhang if (rootreqs && bas->rootbuflen[PETSCSF_REMOTE] && !link->rootreqsinited[direction][rootmtype_mpi][rootdirect_mpi]) { 706cd620004SJunchao Zhang ierr = PetscSFGetRootInfo_Basic(sf,&nrootranks,&ndrootranks,NULL,&rootoffset,NULL);CHKERRQ(ierr); 707cd620004SJunchao Zhang if (direction == PETSCSF_LEAF2ROOT) { 708cd620004SJunchao Zhang for (i=ndrootranks,j=0; i<nrootranks; i++,j++) { 709cd620004SJunchao Zhang disp = (rootoffset[i] - rootoffset[ndrootranks])*link->unitbytes; 710cd620004SJunchao Zhang ierr = PetscMPIIntCast(rootoffset[i+1]-rootoffset[i],&n);CHKERRQ(ierr); 711cd620004SJunchao Zhang ierr = MPI_Recv_init(link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi]+disp,n,unit,bas->iranks[i],link->tag,comm,link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi]+j);CHKERRQ(ierr); 712cd620004SJunchao Zhang } 713cd620004SJunchao Zhang } else { /* PETSCSF_ROOT2LEAF */ 714cd620004SJunchao Zhang for (i=ndrootranks,j=0; i<nrootranks; i++,j++) { 715cd620004SJunchao Zhang disp = (rootoffset[i] - rootoffset[ndrootranks])*link->unitbytes; 716cd620004SJunchao Zhang ierr = PetscMPIIntCast(rootoffset[i+1]-rootoffset[i],&n);CHKERRQ(ierr); 717cd620004SJunchao Zhang ierr = MPI_Send_init(link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi]+disp,n,unit,bas->iranks[i],link->tag,comm,link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi]+j);CHKERRQ(ierr); 718cd620004SJunchao Zhang } 719cd620004SJunchao Zhang } 720cd620004SJunchao Zhang link->rootreqsinited[direction][rootmtype_mpi][rootdirect_mpi] = PETSC_TRUE; 721cd620004SJunchao Zhang } 722cd620004SJunchao Zhang 723cd620004SJunchao Zhang if (leafreqs && sf->leafbuflen[PETSCSF_REMOTE] && !link->leafreqsinited[direction][leafmtype_mpi][leafdirect_mpi]) { 724cd620004SJunchao Zhang ierr = PetscSFGetLeafInfo_Basic(sf,&nleafranks,&ndleafranks,NULL,&leafoffset,NULL,NULL);CHKERRQ(ierr); 725cd620004SJunchao Zhang if (direction == PETSCSF_LEAF2ROOT) { 726cd620004SJunchao Zhang for (i=ndleafranks,j=0; i<nleafranks; i++,j++) { 727cd620004SJunchao Zhang disp = (leafoffset[i] - leafoffset[ndleafranks])*link->unitbytes; 728cd620004SJunchao Zhang ierr = PetscMPIIntCast(leafoffset[i+1]-leafoffset[i],&n);CHKERRQ(ierr); 729cd620004SJunchao Zhang ierr = MPI_Send_init(link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi]+disp,n,unit,sf->ranks[i],link->tag,comm,link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi]+j);CHKERRQ(ierr); 730cd620004SJunchao Zhang } 731cd620004SJunchao Zhang } else { /* PETSCSF_ROOT2LEAF */ 732cd620004SJunchao Zhang for (i=ndleafranks,j=0; i<nleafranks; i++,j++) { 733cd620004SJunchao Zhang disp = (leafoffset[i] - leafoffset[ndleafranks])*link->unitbytes; 734cd620004SJunchao Zhang ierr = PetscMPIIntCast(leafoffset[i+1]-leafoffset[i],&n);CHKERRQ(ierr); 735cd620004SJunchao Zhang ierr = MPI_Recv_init(link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi]+disp,n,unit,sf->ranks[i],link->tag,comm,link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi]+j);CHKERRQ(ierr); 736cd620004SJunchao Zhang } 737cd620004SJunchao Zhang } 738cd620004SJunchao Zhang link->leafreqsinited[direction][leafmtype_mpi][leafdirect_mpi] = PETSC_TRUE; 739cd620004SJunchao Zhang } 740cd620004SJunchao Zhang } 741cd620004SJunchao Zhang if (rootbuf) *rootbuf = link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi]; 742cd620004SJunchao Zhang if (leafbuf) *leafbuf = link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi]; 743cd620004SJunchao Zhang if (rootreqs) *rootreqs = link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi]; 744cd620004SJunchao Zhang if (leafreqs) *leafreqs = link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi]; 745cd620004SJunchao Zhang PetscFunctionReturn(0); 746cd620004SJunchao Zhang } 747cd620004SJunchao Zhang 748cd620004SJunchao Zhang 749cd620004SJunchao Zhang PetscErrorCode PetscSFLinkGetInUse(PetscSF sf,MPI_Datatype unit,const void *rootdata,const void *leafdata,PetscCopyMode cmode,PetscSFLink *mylink) 750cd620004SJunchao Zhang { 751cd620004SJunchao Zhang PetscErrorCode ierr; 752cd620004SJunchao Zhang PetscSFLink link,*p; 75340e23c03SJunchao Zhang PetscSF_Basic *bas=(PetscSF_Basic*)sf->data; 75440e23c03SJunchao Zhang 75540e23c03SJunchao Zhang PetscFunctionBegin; 75640e23c03SJunchao Zhang /* Look for types in cache */ 75740e23c03SJunchao Zhang for (p=&bas->inuse; (link=*p); p=&link->next) { 75840e23c03SJunchao Zhang PetscBool match; 75940e23c03SJunchao Zhang ierr = MPIPetsc_Type_compare(unit,link->unit,&match);CHKERRQ(ierr); 760637e6665SJunchao Zhang if (match && (rootdata == link->rootdata) && (leafdata == link->leafdata)) { 76140e23c03SJunchao Zhang switch (cmode) { 76240e23c03SJunchao Zhang case PETSC_OWN_POINTER: *p = link->next; break; /* Remove from inuse list */ 76340e23c03SJunchao Zhang case PETSC_USE_POINTER: break; 76440e23c03SJunchao Zhang default: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"invalid cmode"); 76540e23c03SJunchao Zhang } 76640e23c03SJunchao Zhang *mylink = link; 76740e23c03SJunchao Zhang PetscFunctionReturn(0); 76840e23c03SJunchao Zhang } 76940e23c03SJunchao Zhang } 77040e23c03SJunchao Zhang SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Could not find pack"); 77140e23c03SJunchao Zhang PetscFunctionReturn(0); 77240e23c03SJunchao Zhang } 77340e23c03SJunchao Zhang 774cd620004SJunchao Zhang PetscErrorCode PetscSFLinkReclaim(PetscSF sf,PetscSFLink *link) 77540e23c03SJunchao Zhang { 77640e23c03SJunchao Zhang PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 77740e23c03SJunchao Zhang 77840e23c03SJunchao Zhang PetscFunctionBegin; 779637e6665SJunchao Zhang (*link)->rootdata = NULL; 780637e6665SJunchao Zhang (*link)->leafdata = NULL; 78140e23c03SJunchao Zhang (*link)->next = bas->avail; 78240e23c03SJunchao Zhang bas->avail = *link; 78340e23c03SJunchao Zhang *link = NULL; 78440e23c03SJunchao Zhang PetscFunctionReturn(0); 78540e23c03SJunchao Zhang } 78640e23c03SJunchao Zhang 787cd620004SJunchao Zhang /* Destroy all links chained in 'avail' */ 788cd620004SJunchao Zhang PetscErrorCode PetscSFLinkDestroy(PetscSF sf,PetscSFLink *avail) 789eb02082bSJunchao Zhang { 790eb02082bSJunchao Zhang PetscErrorCode ierr; 791cd620004SJunchao Zhang PetscSFLink link = *avail,next; 792cd620004SJunchao Zhang PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 793cd620004SJunchao Zhang PetscInt i,nreqs = (bas->nrootreqs+sf->nleafreqs)*8; 794eb02082bSJunchao Zhang 795eb02082bSJunchao Zhang PetscFunctionBegin; 796eb02082bSJunchao Zhang for (; link; link=next) { 797eb02082bSJunchao Zhang next = link->next; 798eb02082bSJunchao Zhang if (!link->isbuiltin) {ierr = MPI_Type_free(&link->unit);CHKERRQ(ierr);} 799cd620004SJunchao Zhang for (i=0; i<nreqs; i++) { /* Persistent reqs must be freed. */ 800eb02082bSJunchao Zhang if (link->reqs[i] != MPI_REQUEST_NULL) {ierr = MPI_Request_free(&link->reqs[i]);CHKERRQ(ierr);} 801eb02082bSJunchao Zhang } 802eb02082bSJunchao Zhang ierr = PetscFree(link->reqs);CHKERRQ(ierr); 803cd620004SJunchao Zhang for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) { 80451ccb202SJunchao Zhang #if defined(PETSC_HAVE_CUDA) 805cd620004SJunchao Zhang ierr = PetscFreeWithMemType(PETSC_MEMTYPE_DEVICE,link->rootbuf_alloc[i][PETSC_MEMTYPE_DEVICE]);CHKERRQ(ierr); 806cd620004SJunchao Zhang ierr = PetscFreeWithMemType(PETSC_MEMTYPE_DEVICE,link->leafbuf_alloc[i][PETSC_MEMTYPE_DEVICE]);CHKERRQ(ierr); 807eb02082bSJunchao Zhang if (link->stream) {cudaError_t err = cudaStreamDestroy(link->stream);CHKERRCUDA(err); link->stream = NULL;} 808eb02082bSJunchao Zhang #endif 809cd620004SJunchao Zhang ierr = PetscFree(link->rootbuf_alloc[i][PETSC_MEMTYPE_HOST]);CHKERRQ(ierr); 810cd620004SJunchao Zhang ierr = PetscFree(link->leafbuf_alloc[i][PETSC_MEMTYPE_HOST]);CHKERRQ(ierr); 811cd620004SJunchao Zhang } 812eb02082bSJunchao Zhang ierr = PetscFree(link);CHKERRQ(ierr); 813eb02082bSJunchao Zhang } 814eb02082bSJunchao Zhang *avail = NULL; 815eb02082bSJunchao Zhang PetscFunctionReturn(0); 816eb02082bSJunchao Zhang } 817eb02082bSJunchao Zhang 818cd620004SJunchao Zhang #if defined(PETSC_USE_DEBUG) 8199d1c8addSJunchao Zhang /* Error out on unsupported overlapped communications */ 820cd620004SJunchao Zhang PetscErrorCode PetscSFSetErrorOnUnsupportedOverlap(PetscSF sf,MPI_Datatype unit,const void *rootdata,const void *leafdata) 8219d1c8addSJunchao Zhang { 8229d1c8addSJunchao Zhang PetscErrorCode ierr; 823cd620004SJunchao Zhang PetscSFLink link,*p; 8249d1c8addSJunchao Zhang PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 8259d1c8addSJunchao Zhang PetscBool match; 8269d1c8addSJunchao Zhang 8279d1c8addSJunchao Zhang PetscFunctionBegin; 82818fb5014SJunchao Zhang /* Look up links in use and error out if there is a match. When both rootdata and leafdata are NULL, ignore 82918fb5014SJunchao Zhang the potential overlapping since this process does not participate in communication. Overlapping is harmless. 83018fb5014SJunchao Zhang */ 831637e6665SJunchao Zhang if (rootdata || leafdata) { 8329d1c8addSJunchao Zhang for (p=&bas->inuse; (link=*p); p=&link->next) { 8339d1c8addSJunchao Zhang ierr = MPIPetsc_Type_compare(unit,link->unit,&match);CHKERRQ(ierr); 834cd620004SJunchao Zhang if (match && (rootdata == link->rootdata) && (leafdata == link->leafdata)) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_SUP,"Overlapped PetscSF with the same rootdata(%p), leafdata(%p) and data type. Undo the overlapping to avoid the error.",rootdata,leafdata); 8359d1c8addSJunchao Zhang } 83618fb5014SJunchao Zhang } 8379d1c8addSJunchao Zhang PetscFunctionReturn(0); 8389d1c8addSJunchao Zhang } 839cd620004SJunchao Zhang #endif 8409d1c8addSJunchao Zhang 841cd620004SJunchao Zhang PetscErrorCode PetscSFLinkSetUp_Host(PetscSF sf,PetscSFLink link,MPI_Datatype unit) 84240e23c03SJunchao Zhang { 84340e23c03SJunchao Zhang PetscErrorCode ierr; 844b23bfdefSJunchao Zhang PetscInt nSignedChar=0,nUnsignedChar=0,nInt=0,nPetscInt=0,nPetscReal=0; 845b23bfdefSJunchao Zhang PetscBool is2Int,is2PetscInt; 84640e23c03SJunchao Zhang PetscMPIInt ni,na,nd,combiner; 84740e23c03SJunchao Zhang #if defined(PETSC_HAVE_COMPLEX) 848b23bfdefSJunchao Zhang PetscInt nPetscComplex=0; 84940e23c03SJunchao Zhang #endif 85040e23c03SJunchao Zhang 85140e23c03SJunchao Zhang PetscFunctionBegin; 852b23bfdefSJunchao Zhang ierr = MPIPetsc_Type_compare_contig(unit,MPI_SIGNED_CHAR, &nSignedChar);CHKERRQ(ierr); 853b23bfdefSJunchao Zhang ierr = MPIPetsc_Type_compare_contig(unit,MPI_UNSIGNED_CHAR,&nUnsignedChar);CHKERRQ(ierr); 854b23bfdefSJunchao Zhang /* MPI_CHAR is treated below as a dumb type that does not support reduction according to MPI standard */ 855b23bfdefSJunchao Zhang ierr = MPIPetsc_Type_compare_contig(unit,MPI_INT, &nInt);CHKERRQ(ierr); 856b23bfdefSJunchao Zhang ierr = MPIPetsc_Type_compare_contig(unit,MPIU_INT, &nPetscInt);CHKERRQ(ierr); 857b23bfdefSJunchao Zhang ierr = MPIPetsc_Type_compare_contig(unit,MPIU_REAL,&nPetscReal);CHKERRQ(ierr); 85840e23c03SJunchao Zhang #if defined(PETSC_HAVE_COMPLEX) 859b23bfdefSJunchao Zhang ierr = MPIPetsc_Type_compare_contig(unit,MPIU_COMPLEX,&nPetscComplex);CHKERRQ(ierr); 86040e23c03SJunchao Zhang #endif 86140e23c03SJunchao Zhang ierr = MPIPetsc_Type_compare(unit,MPI_2INT,&is2Int);CHKERRQ(ierr); 86240e23c03SJunchao Zhang ierr = MPIPetsc_Type_compare(unit,MPIU_2INT,&is2PetscInt);CHKERRQ(ierr); 863b23bfdefSJunchao Zhang /* TODO: shaell we also handle Fortran MPI_2REAL? */ 86440e23c03SJunchao Zhang ierr = MPI_Type_get_envelope(unit,&ni,&na,&nd,&combiner);CHKERRQ(ierr); 8655ad15460SJunchao Zhang link->isbuiltin = (combiner == MPI_COMBINER_NAMED) ? PETSC_TRUE : PETSC_FALSE; /* unit is MPI builtin */ 866b23bfdefSJunchao Zhang link->bs = 1; /* default */ 86740e23c03SJunchao Zhang 868eb02082bSJunchao Zhang if (is2Int) { 869cd620004SJunchao Zhang PackInit_PairType_int_int_1_1(link); 870eb02082bSJunchao Zhang link->bs = 1; 871eb02082bSJunchao Zhang link->unitbytes = 2*sizeof(int); 8725ad15460SJunchao Zhang link->isbuiltin = PETSC_TRUE; /* unit is PETSc builtin */ 873eb02082bSJunchao Zhang link->basicunit = MPI_2INT; 8745ad15460SJunchao Zhang link->unit = MPI_2INT; 875eb02082bSJunchao Zhang } else if (is2PetscInt) { /* TODO: when is2PetscInt and nPetscInt=2, we don't know which path to take. The two paths support different ops. */ 876cd620004SJunchao Zhang PackInit_PairType_PetscInt_PetscInt_1_1(link); 877eb02082bSJunchao Zhang link->bs = 1; 878eb02082bSJunchao Zhang link->unitbytes = 2*sizeof(PetscInt); 879eb02082bSJunchao Zhang link->basicunit = MPIU_2INT; 8805ad15460SJunchao Zhang link->isbuiltin = PETSC_TRUE; /* unit is PETSc builtin */ 8815ad15460SJunchao Zhang link->unit = MPIU_2INT; 882eb02082bSJunchao Zhang } else if (nPetscReal) { 883b23bfdefSJunchao Zhang if (nPetscReal == 8) PackInit_RealType_PetscReal_8_1(link); else if (nPetscReal%8 == 0) PackInit_RealType_PetscReal_8_0(link); 884b23bfdefSJunchao Zhang else if (nPetscReal == 4) PackInit_RealType_PetscReal_4_1(link); else if (nPetscReal%4 == 0) PackInit_RealType_PetscReal_4_0(link); 885b23bfdefSJunchao Zhang else if (nPetscReal == 2) PackInit_RealType_PetscReal_2_1(link); else if (nPetscReal%2 == 0) PackInit_RealType_PetscReal_2_0(link); 886b23bfdefSJunchao Zhang else if (nPetscReal == 1) PackInit_RealType_PetscReal_1_1(link); else if (nPetscReal%1 == 0) PackInit_RealType_PetscReal_1_0(link); 887b23bfdefSJunchao Zhang link->bs = nPetscReal; 888eb02082bSJunchao Zhang link->unitbytes = nPetscReal*sizeof(PetscReal); 88940e23c03SJunchao Zhang link->basicunit = MPIU_REAL; 8905ad15460SJunchao Zhang if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPIU_REAL;} 891b23bfdefSJunchao Zhang } else if (nPetscInt) { 892b23bfdefSJunchao Zhang if (nPetscInt == 8) PackInit_IntegerType_PetscInt_8_1(link); else if (nPetscInt%8 == 0) PackInit_IntegerType_PetscInt_8_0(link); 893b23bfdefSJunchao Zhang else if (nPetscInt == 4) PackInit_IntegerType_PetscInt_4_1(link); else if (nPetscInt%4 == 0) PackInit_IntegerType_PetscInt_4_0(link); 894b23bfdefSJunchao Zhang else if (nPetscInt == 2) PackInit_IntegerType_PetscInt_2_1(link); else if (nPetscInt%2 == 0) PackInit_IntegerType_PetscInt_2_0(link); 895b23bfdefSJunchao Zhang else if (nPetscInt == 1) PackInit_IntegerType_PetscInt_1_1(link); else if (nPetscInt%1 == 0) PackInit_IntegerType_PetscInt_1_0(link); 896b23bfdefSJunchao Zhang link->bs = nPetscInt; 897eb02082bSJunchao Zhang link->unitbytes = nPetscInt*sizeof(PetscInt); 898b23bfdefSJunchao Zhang link->basicunit = MPIU_INT; 8995ad15460SJunchao Zhang if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPIU_INT;} 900b23bfdefSJunchao Zhang #if defined(PETSC_USE_64BIT_INDICES) 901b23bfdefSJunchao Zhang } else if (nInt) { 902b23bfdefSJunchao Zhang if (nInt == 8) PackInit_IntegerType_int_8_1(link); else if (nInt%8 == 0) PackInit_IntegerType_int_8_0(link); 903b23bfdefSJunchao Zhang else if (nInt == 4) PackInit_IntegerType_int_4_1(link); else if (nInt%4 == 0) PackInit_IntegerType_int_4_0(link); 904b23bfdefSJunchao Zhang else if (nInt == 2) PackInit_IntegerType_int_2_1(link); else if (nInt%2 == 0) PackInit_IntegerType_int_2_0(link); 905b23bfdefSJunchao Zhang else if (nInt == 1) PackInit_IntegerType_int_1_1(link); else if (nInt%1 == 0) PackInit_IntegerType_int_1_0(link); 906b23bfdefSJunchao Zhang link->bs = nInt; 907eb02082bSJunchao Zhang link->unitbytes = nInt*sizeof(int); 908b23bfdefSJunchao Zhang link->basicunit = MPI_INT; 9095ad15460SJunchao Zhang if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPI_INT;} 910b23bfdefSJunchao Zhang #endif 911b23bfdefSJunchao Zhang } else if (nSignedChar) { 912b23bfdefSJunchao Zhang if (nSignedChar == 8) PackInit_IntegerType_SignedChar_8_1(link); else if (nSignedChar%8 == 0) PackInit_IntegerType_SignedChar_8_0(link); 913b23bfdefSJunchao Zhang else if (nSignedChar == 4) PackInit_IntegerType_SignedChar_4_1(link); else if (nSignedChar%4 == 0) PackInit_IntegerType_SignedChar_4_0(link); 914b23bfdefSJunchao Zhang else if (nSignedChar == 2) PackInit_IntegerType_SignedChar_2_1(link); else if (nSignedChar%2 == 0) PackInit_IntegerType_SignedChar_2_0(link); 915b23bfdefSJunchao Zhang else if (nSignedChar == 1) PackInit_IntegerType_SignedChar_1_1(link); else if (nSignedChar%1 == 0) PackInit_IntegerType_SignedChar_1_0(link); 916b23bfdefSJunchao Zhang link->bs = nSignedChar; 917eb02082bSJunchao Zhang link->unitbytes = nSignedChar*sizeof(SignedChar); 918b23bfdefSJunchao Zhang link->basicunit = MPI_SIGNED_CHAR; 9195ad15460SJunchao Zhang if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPI_SIGNED_CHAR;} 920b23bfdefSJunchao Zhang } else if (nUnsignedChar) { 921b23bfdefSJunchao Zhang if (nUnsignedChar == 8) PackInit_IntegerType_UnsignedChar_8_1(link); else if (nUnsignedChar%8 == 0) PackInit_IntegerType_UnsignedChar_8_0(link); 922b23bfdefSJunchao Zhang else if (nUnsignedChar == 4) PackInit_IntegerType_UnsignedChar_4_1(link); else if (nUnsignedChar%4 == 0) PackInit_IntegerType_UnsignedChar_4_0(link); 923b23bfdefSJunchao Zhang else if (nUnsignedChar == 2) PackInit_IntegerType_UnsignedChar_2_1(link); else if (nUnsignedChar%2 == 0) PackInit_IntegerType_UnsignedChar_2_0(link); 924b23bfdefSJunchao Zhang else if (nUnsignedChar == 1) PackInit_IntegerType_UnsignedChar_1_1(link); else if (nUnsignedChar%1 == 0) PackInit_IntegerType_UnsignedChar_1_0(link); 925b23bfdefSJunchao Zhang link->bs = nUnsignedChar; 926eb02082bSJunchao Zhang link->unitbytes = nUnsignedChar*sizeof(UnsignedChar); 927b23bfdefSJunchao Zhang link->basicunit = MPI_UNSIGNED_CHAR; 9285ad15460SJunchao Zhang if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPI_UNSIGNED_CHAR;} 92940e23c03SJunchao Zhang #if defined(PETSC_HAVE_COMPLEX) 930b23bfdefSJunchao Zhang } else if (nPetscComplex) { 931b23bfdefSJunchao Zhang if (nPetscComplex == 8) PackInit_ComplexType_PetscComplex_8_1(link); else if (nPetscComplex%8 == 0) PackInit_ComplexType_PetscComplex_8_0(link); 932b23bfdefSJunchao Zhang else if (nPetscComplex == 4) PackInit_ComplexType_PetscComplex_4_1(link); else if (nPetscComplex%4 == 0) PackInit_ComplexType_PetscComplex_4_0(link); 933b23bfdefSJunchao Zhang else if (nPetscComplex == 2) PackInit_ComplexType_PetscComplex_2_1(link); else if (nPetscComplex%2 == 0) PackInit_ComplexType_PetscComplex_2_0(link); 934b23bfdefSJunchao Zhang else if (nPetscComplex == 1) PackInit_ComplexType_PetscComplex_1_1(link); else if (nPetscComplex%1 == 0) PackInit_ComplexType_PetscComplex_1_0(link); 935b23bfdefSJunchao Zhang link->bs = nPetscComplex; 936eb02082bSJunchao Zhang link->unitbytes = nPetscComplex*sizeof(PetscComplex); 93740e23c03SJunchao Zhang link->basicunit = MPIU_COMPLEX; 9385ad15460SJunchao Zhang if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPIU_COMPLEX;} 93940e23c03SJunchao Zhang #endif 94040e23c03SJunchao Zhang } else { 941b23bfdefSJunchao Zhang MPI_Aint lb,nbyte; 942b23bfdefSJunchao Zhang ierr = MPI_Type_get_extent(unit,&lb,&nbyte);CHKERRQ(ierr); 94340e23c03SJunchao Zhang if (lb != 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Datatype with nonzero lower bound %ld\n",(long)lb); 944eb02082bSJunchao Zhang if (nbyte % sizeof(int)) { /* If the type size is not multiple of int */ 945eb02082bSJunchao Zhang if (nbyte == 4) PackInit_DumbType_char_4_1(link); else if (nbyte%4 == 0) PackInit_DumbType_char_4_0(link); 946eb02082bSJunchao Zhang else if (nbyte == 2) PackInit_DumbType_char_2_1(link); else if (nbyte%2 == 0) PackInit_DumbType_char_2_0(link); 947eb02082bSJunchao Zhang else if (nbyte == 1) PackInit_DumbType_char_1_1(link); else if (nbyte%1 == 0) PackInit_DumbType_char_1_0(link); 948eb02082bSJunchao Zhang link->bs = nbyte; 949b23bfdefSJunchao Zhang link->unitbytes = nbyte; 950b23bfdefSJunchao Zhang link->basicunit = MPI_BYTE; 95140e23c03SJunchao Zhang } else { 952eb02082bSJunchao Zhang nInt = nbyte / sizeof(int); 953eb02082bSJunchao Zhang if (nInt == 8) PackInit_DumbType_DumbInt_8_1(link); else if (nInt%8 == 0) PackInit_DumbType_DumbInt_8_0(link); 954eb02082bSJunchao Zhang else if (nInt == 4) PackInit_DumbType_DumbInt_4_1(link); else if (nInt%4 == 0) PackInit_DumbType_DumbInt_4_0(link); 955eb02082bSJunchao Zhang else if (nInt == 2) PackInit_DumbType_DumbInt_2_1(link); else if (nInt%2 == 0) PackInit_DumbType_DumbInt_2_0(link); 956eb02082bSJunchao Zhang else if (nInt == 1) PackInit_DumbType_DumbInt_1_1(link); else if (nInt%1 == 0) PackInit_DumbType_DumbInt_1_0(link); 957eb02082bSJunchao Zhang link->bs = nInt; 958b23bfdefSJunchao Zhang link->unitbytes = nbyte; 95940e23c03SJunchao Zhang link->basicunit = MPI_INT; 96040e23c03SJunchao Zhang } 9615ad15460SJunchao Zhang if (link->isbuiltin) link->unit = unit; 96240e23c03SJunchao Zhang } 963b23bfdefSJunchao Zhang 9645ad15460SJunchao Zhang if (!link->isbuiltin) {ierr = MPI_Type_dup(unit,&link->unit);CHKERRQ(ierr);} 96540e23c03SJunchao Zhang PetscFunctionReturn(0); 96640e23c03SJunchao Zhang } 96740e23c03SJunchao Zhang 968cd620004SJunchao Zhang PetscErrorCode PetscSFLinkGetUnpackAndOp(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*)) 96940e23c03SJunchao Zhang { 97040e23c03SJunchao Zhang PetscFunctionBegin; 97140e23c03SJunchao Zhang *UnpackAndOp = NULL; 972eb02082bSJunchao Zhang if (mtype == PETSC_MEMTYPE_HOST) { 973eb02082bSJunchao Zhang if (op == MPIU_REPLACE) *UnpackAndOp = link->h_UnpackAndInsert; 974eb02082bSJunchao Zhang else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->h_UnpackAndAdd; 975eb02082bSJunchao Zhang else if (op == MPI_PROD) *UnpackAndOp = link->h_UnpackAndMult; 976eb02082bSJunchao Zhang else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->h_UnpackAndMax; 977eb02082bSJunchao Zhang else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->h_UnpackAndMin; 978eb02082bSJunchao Zhang else if (op == MPI_LAND) *UnpackAndOp = link->h_UnpackAndLAND; 979eb02082bSJunchao Zhang else if (op == MPI_BAND) *UnpackAndOp = link->h_UnpackAndBAND; 980eb02082bSJunchao Zhang else if (op == MPI_LOR) *UnpackAndOp = link->h_UnpackAndLOR; 981eb02082bSJunchao Zhang else if (op == MPI_BOR) *UnpackAndOp = link->h_UnpackAndBOR; 982eb02082bSJunchao Zhang else if (op == MPI_LXOR) *UnpackAndOp = link->h_UnpackAndLXOR; 983eb02082bSJunchao Zhang else if (op == MPI_BXOR) *UnpackAndOp = link->h_UnpackAndBXOR; 984eb02082bSJunchao Zhang else if (op == MPI_MAXLOC) *UnpackAndOp = link->h_UnpackAndMaxloc; 985eb02082bSJunchao Zhang else if (op == MPI_MINLOC) *UnpackAndOp = link->h_UnpackAndMinloc; 986eb02082bSJunchao Zhang } 987eb02082bSJunchao Zhang #if defined(PETSC_HAVE_CUDA) 988eb02082bSJunchao Zhang else if (mtype == PETSC_MEMTYPE_DEVICE && !atomic) { 989eb02082bSJunchao Zhang if (op == MPIU_REPLACE) *UnpackAndOp = link->d_UnpackAndInsert; 990eb02082bSJunchao Zhang else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->d_UnpackAndAdd; 991eb02082bSJunchao Zhang else if (op == MPI_PROD) *UnpackAndOp = link->d_UnpackAndMult; 992eb02082bSJunchao Zhang else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->d_UnpackAndMax; 993eb02082bSJunchao Zhang else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->d_UnpackAndMin; 994eb02082bSJunchao Zhang else if (op == MPI_LAND) *UnpackAndOp = link->d_UnpackAndLAND; 995eb02082bSJunchao Zhang else if (op == MPI_BAND) *UnpackAndOp = link->d_UnpackAndBAND; 996eb02082bSJunchao Zhang else if (op == MPI_LOR) *UnpackAndOp = link->d_UnpackAndLOR; 997eb02082bSJunchao Zhang else if (op == MPI_BOR) *UnpackAndOp = link->d_UnpackAndBOR; 998eb02082bSJunchao Zhang else if (op == MPI_LXOR) *UnpackAndOp = link->d_UnpackAndLXOR; 999eb02082bSJunchao Zhang else if (op == MPI_BXOR) *UnpackAndOp = link->d_UnpackAndBXOR; 1000eb02082bSJunchao Zhang else if (op == MPI_MAXLOC) *UnpackAndOp = link->d_UnpackAndMaxloc; 1001eb02082bSJunchao Zhang else if (op == MPI_MINLOC) *UnpackAndOp = link->d_UnpackAndMinloc; 1002eb02082bSJunchao Zhang } else if (mtype == PETSC_MEMTYPE_DEVICE && atomic) { 1003eb02082bSJunchao Zhang if (op == MPIU_REPLACE) *UnpackAndOp = link->da_UnpackAndInsert; 1004eb02082bSJunchao Zhang else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->da_UnpackAndAdd; 1005eb02082bSJunchao Zhang else if (op == MPI_PROD) *UnpackAndOp = link->da_UnpackAndMult; 1006eb02082bSJunchao Zhang else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->da_UnpackAndMax; 1007eb02082bSJunchao Zhang else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->da_UnpackAndMin; 1008eb02082bSJunchao Zhang else if (op == MPI_LAND) *UnpackAndOp = link->da_UnpackAndLAND; 1009eb02082bSJunchao Zhang else if (op == MPI_BAND) *UnpackAndOp = link->da_UnpackAndBAND; 1010eb02082bSJunchao Zhang else if (op == MPI_LOR) *UnpackAndOp = link->da_UnpackAndLOR; 1011eb02082bSJunchao Zhang else if (op == MPI_BOR) *UnpackAndOp = link->da_UnpackAndBOR; 1012eb02082bSJunchao Zhang else if (op == MPI_LXOR) *UnpackAndOp = link->da_UnpackAndLXOR; 1013eb02082bSJunchao Zhang else if (op == MPI_BXOR) *UnpackAndOp = link->da_UnpackAndBXOR; 1014eb02082bSJunchao Zhang else if (op == MPI_MAXLOC) *UnpackAndOp = link->da_UnpackAndMaxloc; 1015eb02082bSJunchao Zhang else if (op == MPI_MINLOC) *UnpackAndOp = link->da_UnpackAndMinloc; 1016eb02082bSJunchao Zhang } 1017eb02082bSJunchao Zhang #endif 101840e23c03SJunchao Zhang PetscFunctionReturn(0); 101940e23c03SJunchao Zhang } 102040e23c03SJunchao Zhang 1021cd620004SJunchao Zhang PetscErrorCode PetscSFLinkGetScatterAndOp(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**ScatterAndOp)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*)) 102240e23c03SJunchao Zhang { 102340e23c03SJunchao Zhang PetscFunctionBegin; 1024cd620004SJunchao Zhang *ScatterAndOp = NULL; 1025eb02082bSJunchao Zhang if (mtype == PETSC_MEMTYPE_HOST) { 1026cd620004SJunchao Zhang if (op == MPIU_REPLACE) *ScatterAndOp = link->h_ScatterAndInsert; 1027cd620004SJunchao Zhang else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->h_ScatterAndAdd; 1028cd620004SJunchao Zhang else if (op == MPI_PROD) *ScatterAndOp = link->h_ScatterAndMult; 1029cd620004SJunchao Zhang else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->h_ScatterAndMax; 1030cd620004SJunchao Zhang else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->h_ScatterAndMin; 1031cd620004SJunchao Zhang else if (op == MPI_LAND) *ScatterAndOp = link->h_ScatterAndLAND; 1032cd620004SJunchao Zhang else if (op == MPI_BAND) *ScatterAndOp = link->h_ScatterAndBAND; 1033cd620004SJunchao Zhang else if (op == MPI_LOR) *ScatterAndOp = link->h_ScatterAndLOR; 1034cd620004SJunchao Zhang else if (op == MPI_BOR) *ScatterAndOp = link->h_ScatterAndBOR; 1035cd620004SJunchao Zhang else if (op == MPI_LXOR) *ScatterAndOp = link->h_ScatterAndLXOR; 1036cd620004SJunchao Zhang else if (op == MPI_BXOR) *ScatterAndOp = link->h_ScatterAndBXOR; 1037cd620004SJunchao Zhang else if (op == MPI_MAXLOC) *ScatterAndOp = link->h_ScatterAndMaxloc; 1038cd620004SJunchao Zhang else if (op == MPI_MINLOC) *ScatterAndOp = link->h_ScatterAndMinloc; 1039eb02082bSJunchao Zhang } 1040eb02082bSJunchao Zhang #if defined(PETSC_HAVE_CUDA) 1041eb02082bSJunchao Zhang else if (mtype == PETSC_MEMTYPE_DEVICE && !atomic) { 1042cd620004SJunchao Zhang if (op == MPIU_REPLACE) *ScatterAndOp = link->d_ScatterAndInsert; 1043cd620004SJunchao Zhang else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->d_ScatterAndAdd; 1044cd620004SJunchao Zhang else if (op == MPI_PROD) *ScatterAndOp = link->d_ScatterAndMult; 1045cd620004SJunchao Zhang else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->d_ScatterAndMax; 1046cd620004SJunchao Zhang else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->d_ScatterAndMin; 1047cd620004SJunchao Zhang else if (op == MPI_LAND) *ScatterAndOp = link->d_ScatterAndLAND; 1048cd620004SJunchao Zhang else if (op == MPI_BAND) *ScatterAndOp = link->d_ScatterAndBAND; 1049cd620004SJunchao Zhang else if (op == MPI_LOR) *ScatterAndOp = link->d_ScatterAndLOR; 1050cd620004SJunchao Zhang else if (op == MPI_BOR) *ScatterAndOp = link->d_ScatterAndBOR; 1051cd620004SJunchao Zhang else if (op == MPI_LXOR) *ScatterAndOp = link->d_ScatterAndLXOR; 1052cd620004SJunchao Zhang else if (op == MPI_BXOR) *ScatterAndOp = link->d_ScatterAndBXOR; 1053cd620004SJunchao Zhang else if (op == MPI_MAXLOC) *ScatterAndOp = link->d_ScatterAndMaxloc; 1054cd620004SJunchao Zhang else if (op == MPI_MINLOC) *ScatterAndOp = link->d_ScatterAndMinloc; 1055eb02082bSJunchao Zhang } else if (mtype == PETSC_MEMTYPE_DEVICE && atomic) { 1056cd620004SJunchao Zhang if (op == MPIU_REPLACE) *ScatterAndOp = link->da_ScatterAndInsert; 1057cd620004SJunchao Zhang else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->da_ScatterAndAdd; 1058cd620004SJunchao Zhang else if (op == MPI_PROD) *ScatterAndOp = link->da_ScatterAndMult; 1059cd620004SJunchao Zhang else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->da_ScatterAndMax; 1060cd620004SJunchao Zhang else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->da_ScatterAndMin; 1061cd620004SJunchao Zhang else if (op == MPI_LAND) *ScatterAndOp = link->da_ScatterAndLAND; 1062cd620004SJunchao Zhang else if (op == MPI_BAND) *ScatterAndOp = link->da_ScatterAndBAND; 1063cd620004SJunchao Zhang else if (op == MPI_LOR) *ScatterAndOp = link->da_ScatterAndLOR; 1064cd620004SJunchao Zhang else if (op == MPI_BOR) *ScatterAndOp = link->da_ScatterAndBOR; 1065cd620004SJunchao Zhang else if (op == MPI_LXOR) *ScatterAndOp = link->da_ScatterAndLXOR; 1066cd620004SJunchao Zhang else if (op == MPI_BXOR) *ScatterAndOp = link->da_ScatterAndBXOR; 1067cd620004SJunchao Zhang else if (op == MPI_MAXLOC) *ScatterAndOp = link->da_ScatterAndMaxloc; 1068cd620004SJunchao Zhang else if (op == MPI_MINLOC) *ScatterAndOp = link->da_ScatterAndMinloc; 1069eb02082bSJunchao Zhang } 1070eb02082bSJunchao Zhang #endif 1071cd620004SJunchao Zhang PetscFunctionReturn(0); 1072cd620004SJunchao Zhang } 1073cd620004SJunchao Zhang 1074cd620004SJunchao Zhang PetscErrorCode PetscSFLinkGetFetchAndOp(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**FetchAndOp)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,void*)) 1075cd620004SJunchao Zhang { 1076cd620004SJunchao Zhang PetscFunctionBegin; 1077cd620004SJunchao Zhang *FetchAndOp = NULL; 1078cd620004SJunchao Zhang if (op != MPI_SUM && op != MPIU_SUM) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for MPI_Op in FetchAndOp"); 1079cd620004SJunchao Zhang if (mtype == PETSC_MEMTYPE_HOST) *FetchAndOp = link->h_FetchAndAdd; 1080cd620004SJunchao Zhang #if defined(PETSC_HAVE_CUDA) 1081cd620004SJunchao Zhang else if (mtype == PETSC_MEMTYPE_DEVICE && !atomic) *FetchAndOp = link->d_FetchAndAdd; 1082cd620004SJunchao Zhang else if (mtype == PETSC_MEMTYPE_DEVICE && atomic) *FetchAndOp = link->da_FetchAndAdd; 1083cd620004SJunchao Zhang #endif 1084cd620004SJunchao Zhang PetscFunctionReturn(0); 1085cd620004SJunchao Zhang } 1086cd620004SJunchao Zhang 1087cd620004SJunchao Zhang PetscErrorCode PetscSFLinkGetFetchAndOpLocal(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**FetchAndOpLocal)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,void*,PetscInt,const PetscInt*,const void*,void*)) 1088cd620004SJunchao Zhang { 1089cd620004SJunchao Zhang PetscFunctionBegin; 1090cd620004SJunchao Zhang *FetchAndOpLocal = NULL; 1091cd620004SJunchao Zhang if (op != MPI_SUM && op != MPIU_SUM) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for MPI_Op in FetchAndOp"); 1092cd620004SJunchao Zhang if (mtype == PETSC_MEMTYPE_HOST) *FetchAndOpLocal = link->h_FetchAndAddLocal; 1093cd620004SJunchao Zhang #if defined(PETSC_HAVE_CUDA) 1094cd620004SJunchao Zhang else if (mtype == PETSC_MEMTYPE_DEVICE && !atomic) *FetchAndOpLocal = link->d_FetchAndAddLocal; 1095cd620004SJunchao Zhang else if (mtype == PETSC_MEMTYPE_DEVICE && atomic) *FetchAndOpLocal = link->da_FetchAndAddLocal; 1096cd620004SJunchao Zhang #endif 1097cd620004SJunchao Zhang PetscFunctionReturn(0); 1098cd620004SJunchao Zhang } 1099cd620004SJunchao Zhang 1100cd620004SJunchao Zhang /*============================================================================= 1101cd620004SJunchao Zhang A set of helper routines for Pack/Unpack/Scatter on GPUs 1102cd620004SJunchao Zhang ============================================================================*/ 1103cd620004SJunchao Zhang #if defined(PETSC_HAVE_CUDA) 1104cd620004SJunchao Zhang /* If SF does not know which stream root/leafdata is being computed on, it has to sync the device to 1105cd620004SJunchao Zhang make sure the data is ready for packing. 1106cd620004SJunchao Zhang */ 1107cd620004SJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkSyncDeviceBeforePackData(PetscSF sf,PetscSFLink link) 1108cd620004SJunchao Zhang { 1109cd620004SJunchao Zhang PetscFunctionBegin; 1110cd620004SJunchao Zhang if (sf->use_default_stream) PetscFunctionReturn(0); 1111cd620004SJunchao Zhang if (link->rootmtype == PETSC_MEMTYPE_DEVICE || link->leafmtype == PETSC_MEMTYPE_DEVICE) { 1112cd620004SJunchao Zhang cudaError_t cerr = cudaDeviceSynchronize();CHKERRCUDA(cerr); 1113cd620004SJunchao Zhang } 1114cd620004SJunchao Zhang PetscFunctionReturn(0); 1115cd620004SJunchao Zhang } 1116cd620004SJunchao Zhang 1117cd620004SJunchao Zhang /* PetscSFLinkSyncStreamAfterPackXxxData routines make sure root/leafbuf for the remote is ready for MPI */ 1118cd620004SJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkSyncStreamAfterPackRootData(PetscSF sf,PetscSFLink link) 1119cd620004SJunchao Zhang { 1120cd620004SJunchao Zhang PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 1121cd620004SJunchao Zhang 1122cd620004SJunchao Zhang PetscFunctionBegin; 1123b85e67b7SJunchao Zhang /* Do nothing if we use stream aware mpi || has nothing for remote */ 1124b85e67b7SJunchao Zhang if (sf->use_stream_aware_mpi || link->rootmtype != PETSC_MEMTYPE_DEVICE || !bas->rootbuflen[PETSCSF_REMOTE]) PetscFunctionReturn(0); 1125b85e67b7SJunchao Zhang /* If we called a packing kernel || we async-copied rootdata from device to host || No cudaDeviceSynchronize was called (since default stream is assumed) */ 1126b85e67b7SJunchao Zhang if (!link->rootdirect[PETSCSF_REMOTE] || !use_gpu_aware_mpi || sf->use_default_stream) { 1127cd620004SJunchao Zhang cudaError_t cerr = cudaStreamSynchronize(link->stream);CHKERRCUDA(cerr); 1128cd620004SJunchao Zhang } 1129cd620004SJunchao Zhang PetscFunctionReturn(0); 1130cd620004SJunchao Zhang } 1131cd620004SJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkSyncStreamAfterPackLeafData(PetscSF sf,PetscSFLink link) 1132cd620004SJunchao Zhang { 1133cd620004SJunchao Zhang PetscFunctionBegin; 1134b85e67b7SJunchao Zhang /* See comments above */ 1135b85e67b7SJunchao Zhang if (sf->use_stream_aware_mpi || link->leafmtype != PETSC_MEMTYPE_DEVICE || !sf->leafbuflen[PETSCSF_REMOTE]) PetscFunctionReturn(0); 1136b85e67b7SJunchao Zhang if (!link->leafdirect[PETSCSF_REMOTE] || !use_gpu_aware_mpi || sf->use_default_stream) { 1137cd620004SJunchao Zhang cudaError_t cerr = cudaStreamSynchronize(link->stream);CHKERRCUDA(cerr); 1138cd620004SJunchao Zhang } 1139cd620004SJunchao Zhang PetscFunctionReturn(0); 1140cd620004SJunchao Zhang } 1141cd620004SJunchao Zhang 1142cd620004SJunchao Zhang /* PetscSFLinkSyncStreamAfterUnpackXxx routines make sure root/leafdata (local & remote) is ready to use for SF callers, when SF 1143cd620004SJunchao Zhang does not know which stream the callers will use. 1144cd620004SJunchao Zhang */ 1145cd620004SJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkSyncStreamAfterUnpackRootData(PetscSF sf,PetscSFLink link) 1146cd620004SJunchao Zhang { 1147cd620004SJunchao Zhang PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 1148cd620004SJunchao Zhang 1149cd620004SJunchao Zhang PetscFunctionBegin; 1150b85e67b7SJunchao Zhang /* Do nothing if we are expected to put rootdata on default stream */ 1151b85e67b7SJunchao Zhang if (sf->use_default_stream || link->rootmtype != PETSC_MEMTYPE_DEVICE) PetscFunctionReturn(0); 1152b85e67b7SJunchao Zhang /* If we have something from local, then we called a scatter kernel (on link->stream), then we must sync it; 1153b85e67b7SJunchao Zhang If we have something from remote and we called unpack kernel, then we must also sycn it. 1154b85e67b7SJunchao Zhang */ 1155b85e67b7SJunchao Zhang if (bas->rootbuflen[PETSCSF_LOCAL] || (bas->rootbuflen[PETSCSF_REMOTE] && !link->rootdirect[PETSCSF_REMOTE])) { 1156cd620004SJunchao Zhang cudaError_t cerr = cudaStreamSynchronize(link->stream);CHKERRCUDA(cerr); 1157cd620004SJunchao Zhang } 1158cd620004SJunchao Zhang PetscFunctionReturn(0); 1159cd620004SJunchao Zhang } 1160cd620004SJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkSyncStreamAfterUnpackLeafData(PetscSF sf,PetscSFLink link) 1161cd620004SJunchao Zhang { 1162cd620004SJunchao Zhang PetscFunctionBegin; 1163b85e67b7SJunchao Zhang /* See comments above */ 1164b85e67b7SJunchao Zhang if (sf->use_default_stream || link->leafmtype != PETSC_MEMTYPE_DEVICE) PetscFunctionReturn(0); 1165b85e67b7SJunchao Zhang if (sf->leafbuflen[PETSCSF_LOCAL] || (sf->leafbuflen[PETSCSF_REMOTE] && !link->leafdirect[PETSCSF_REMOTE])) { 1166cd620004SJunchao Zhang cudaError_t cerr = cudaStreamSynchronize(link->stream);CHKERRCUDA(cerr); 1167cd620004SJunchao Zhang } 1168cd620004SJunchao Zhang PetscFunctionReturn(0); 1169cd620004SJunchao Zhang } 1170cd620004SJunchao Zhang 1171cd620004SJunchao Zhang /* PetscSFLinkCopyXxxxBufferInCaseNotUseGpuAwareMPI routines are simple: if not use_gpu_aware_mpi, we need 1172cd620004SJunchao Zhang to copy the buffer from GPU to CPU before MPI calls, and from CPU to GPU after MPI calls. 1173cd620004SJunchao Zhang */ 1174cd620004SJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(PetscSF sf,PetscSFLink link,PetscBool device2host) 1175cd620004SJunchao Zhang { 1176cd620004SJunchao Zhang PetscErrorCode ierr; 1177cd620004SJunchao Zhang cudaError_t cerr; 1178cd620004SJunchao Zhang PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 1179cd620004SJunchao Zhang 1180cd620004SJunchao Zhang PetscFunctionBegin; 1181cd620004SJunchao Zhang if (link->rootmtype == PETSC_MEMTYPE_DEVICE && (link->rootmtype_mpi != link->rootmtype) && bas->rootbuflen[PETSCSF_REMOTE]) { 1182cd620004SJunchao Zhang void *h_buf = link->rootbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]; 1183cd620004SJunchao Zhang void *d_buf = link->rootbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_DEVICE]; 1184cd620004SJunchao Zhang size_t count = bas->rootbuflen[PETSCSF_REMOTE]*link->unitbytes; 1185cd620004SJunchao Zhang if (device2host) { 1186cd620004SJunchao Zhang cerr = cudaMemcpyAsync(h_buf,d_buf,count,cudaMemcpyDeviceToHost,link->stream);CHKERRCUDA(cerr); 1187cd620004SJunchao Zhang ierr = PetscLogGpuToCpu(count);CHKERRQ(ierr); 1188cd620004SJunchao Zhang } else { 1189cd620004SJunchao Zhang cerr = cudaMemcpyAsync(d_buf,h_buf,count,cudaMemcpyHostToDevice,link->stream);CHKERRCUDA(cerr); 1190cd620004SJunchao Zhang ierr = PetscLogCpuToGpu(count);CHKERRQ(ierr); 1191cd620004SJunchao Zhang } 1192cd620004SJunchao Zhang } 1193cd620004SJunchao Zhang PetscFunctionReturn(0); 1194cd620004SJunchao Zhang } 1195cd620004SJunchao Zhang 1196cd620004SJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(PetscSF sf,PetscSFLink link,PetscBool device2host) 1197cd620004SJunchao Zhang { 1198cd620004SJunchao Zhang PetscErrorCode ierr; 1199cd620004SJunchao Zhang cudaError_t cerr; 1200cd620004SJunchao Zhang 1201cd620004SJunchao Zhang PetscFunctionBegin; 1202cd620004SJunchao Zhang if (link->leafmtype == PETSC_MEMTYPE_DEVICE && (link->leafmtype_mpi != link->leafmtype) && sf->leafbuflen[PETSCSF_REMOTE]) { 1203cd620004SJunchao Zhang void *h_buf = link->leafbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]; 1204cd620004SJunchao Zhang void *d_buf = link->leafbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_DEVICE]; 1205cd620004SJunchao Zhang size_t count = sf->leafbuflen[PETSCSF_REMOTE]*link->unitbytes; 1206cd620004SJunchao Zhang if (device2host) { 1207cd620004SJunchao Zhang cerr = cudaMemcpyAsync(h_buf,d_buf,count,cudaMemcpyDeviceToHost,link->stream);CHKERRCUDA(cerr); 1208cd620004SJunchao Zhang ierr = PetscLogGpuToCpu(count);CHKERRQ(ierr); 1209cd620004SJunchao Zhang } else { 1210cd620004SJunchao Zhang cerr = cudaMemcpyAsync(d_buf,h_buf,count,cudaMemcpyHostToDevice,link->stream);CHKERRCUDA(cerr); 1211cd620004SJunchao Zhang ierr = PetscLogCpuToGpu(count);CHKERRQ(ierr); 1212cd620004SJunchao Zhang } 1213cd620004SJunchao Zhang } 1214cd620004SJunchao Zhang PetscFunctionReturn(0); 1215cd620004SJunchao Zhang } 1216cd620004SJunchao Zhang #else 1217cd620004SJunchao Zhang 1218cd620004SJunchao Zhang #define PetscSFLinkSyncDeviceBeforePackData(a,b) 0 1219cd620004SJunchao Zhang #define PetscSFLinkSyncStreamAfterPackRootData(a,b) 0 1220cd620004SJunchao Zhang #define PetscSFLinkSyncStreamAfterPackLeafData(a,b) 0 1221cd620004SJunchao Zhang #define PetscSFLinkSyncStreamAfterUnpackRootData(a,b) 0 1222cd620004SJunchao Zhang #define PetscSFLinkSyncStreamAfterUnpackLeafData(a,b) 0 1223cd620004SJunchao Zhang #define PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(a,b,c) 0 1224cd620004SJunchao Zhang #define PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(a,b,c) 0 1225cd620004SJunchao Zhang 1226cd620004SJunchao Zhang #endif 1227cd620004SJunchao Zhang 1228cd620004SJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkLogFlopsAfterUnpackRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,MPI_Op op) 1229cd620004SJunchao Zhang { 1230cd620004SJunchao Zhang PetscErrorCode ierr; 1231cd620004SJunchao Zhang PetscLogDouble flops; 1232cd620004SJunchao Zhang PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 1233cd620004SJunchao Zhang 1234cd620004SJunchao Zhang 1235cd620004SJunchao Zhang PetscFunctionBegin; 1236cd620004SJunchao Zhang if (op != MPIU_REPLACE && link->basicunit == MPIU_SCALAR) { /* op is a reduction on PetscScalars */ 1237cd620004SJunchao Zhang flops = bas->rootbuflen[scope]*link->bs; /* # of roots in buffer x # of scalars in unit */ 1238cd620004SJunchao Zhang #if defined(PETSC_HAVE_CUDA) 1239cd620004SJunchao Zhang if (link->rootmtype == PETSC_MEMTYPE_DEVICE) {ierr = PetscLogGpuFlops(flops);CHKERRQ(ierr);} else 1240cd620004SJunchao Zhang #endif 1241cd620004SJunchao Zhang {ierr = PetscLogFlops(flops);CHKERRQ(ierr);} 1242cd620004SJunchao Zhang } 1243cd620004SJunchao Zhang PetscFunctionReturn(0); 1244cd620004SJunchao Zhang } 1245cd620004SJunchao Zhang 1246cd620004SJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkLogFlopsAfterUnpackLeafData(PetscSF sf,PetscSFLink link,PetscSFScope scope,MPI_Op op) 1247cd620004SJunchao Zhang { 1248cd620004SJunchao Zhang PetscLogDouble flops; 1249cd620004SJunchao Zhang PetscErrorCode ierr; 1250cd620004SJunchao Zhang 1251cd620004SJunchao Zhang PetscFunctionBegin; 1252cd620004SJunchao Zhang if (op != MPIU_REPLACE && link->basicunit == MPIU_SCALAR) { /* op is a reduction on PetscScalars */ 1253cd620004SJunchao Zhang flops = sf->leafbuflen[scope]*link->bs; /* # of roots in buffer x # of scalars in unit */ 1254cd620004SJunchao Zhang #if defined(PETSC_HAVE_CUDA) 1255cd620004SJunchao Zhang if (link->leafmtype == PETSC_MEMTYPE_DEVICE) {ierr = PetscLogGpuFlops(flops);CHKERRQ(ierr);} else 1256cd620004SJunchao Zhang #endif 1257cd620004SJunchao Zhang {ierr = PetscLogFlops(flops);CHKERRQ(ierr);} 1258cd620004SJunchao Zhang } 1259cd620004SJunchao Zhang PetscFunctionReturn(0); 1260cd620004SJunchao Zhang } 1261cd620004SJunchao Zhang 1262cd620004SJunchao Zhang /* When SF could not find a proper UnpackAndOp() from link, it falls back to MPI_Reduce_local. 1263cd620004SJunchao Zhang Input Arguments: 1264cd620004SJunchao Zhang +sf - The StarForest 1265cd620004SJunchao Zhang .link - The link 1266cd620004SJunchao Zhang .count - Number of entries to unpack 1267cd620004SJunchao Zhang .start - The first index, significent when indices=NULL 1268cd620004SJunchao Zhang .indices - Indices of entries in <data>. If NULL, it means indices are contiguous and the first is given in <start> 1269cd620004SJunchao Zhang .buf - A contiguous buffer to unpack from 1270cd620004SJunchao Zhang -op - Operation after unpack 1271cd620004SJunchao Zhang 1272cd620004SJunchao Zhang Output Arguments: 1273cd620004SJunchao Zhang .data - The data to unpack to 1274cd620004SJunchao Zhang */ 1275cd620004SJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkUnpackDataWithMPIReduceLocal(PetscSF sf,PetscSFLink link,PetscInt count,PetscInt start,const PetscInt *indices,void *data,const void *buf,MPI_Op op) 1276cd620004SJunchao Zhang { 1277cd620004SJunchao Zhang PetscFunctionBegin; 1278cd620004SJunchao Zhang #if defined(PETSC_HAVE_MPI_REDUCE_LOCAL) 1279cd620004SJunchao Zhang { 1280cd620004SJunchao Zhang PetscErrorCode ierr; 1281cd620004SJunchao Zhang PetscInt i; 1282cd620004SJunchao Zhang PetscMPIInt n; 1283cd620004SJunchao Zhang if (indices) { 1284cd620004SJunchao Zhang /* Note we use link->unit instead of link->basicunit. When op can be mapped to MPI_SUM etc, it operates on 1285cd620004SJunchao Zhang basic units of a root/leaf element-wisely. Otherwise, it is meant to operate on a whole root/leaf. 1286cd620004SJunchao Zhang */ 1287cd620004SJunchao Zhang for (i=0; i<count; i++) {ierr = MPI_Reduce_local((const char*)buf+i*link->unitbytes,(char*)data+indices[i]*link->unitbytes,1,link->unit,op);CHKERRQ(ierr);} 1288cd620004SJunchao Zhang } else { 1289cd620004SJunchao Zhang ierr = PetscMPIIntCast(count,&n);CHKERRQ(ierr); 1290cd620004SJunchao Zhang ierr = MPI_Reduce_local(buf,(char*)data+start*link->unitbytes,n,link->unit,op);CHKERRQ(ierr); 1291cd620004SJunchao Zhang } 1292cd620004SJunchao Zhang } 1293cd620004SJunchao Zhang #else 1294cd620004SJunchao Zhang SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No unpacking reduction operation for this MPI_Op"); 1295cd620004SJunchao Zhang #endif 1296cd620004SJunchao Zhang PetscFunctionReturn(0); 1297cd620004SJunchao Zhang } 1298cd620004SJunchao Zhang 1299cd620004SJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkScatterDataWithMPIReduceLocal(PetscSF sf,PetscSFLink link,PetscInt count,const PetscInt *idx,const void *xdata,PetscInt starty,const PetscInt *idy,void *ydata,MPI_Op op) 1300cd620004SJunchao Zhang { 1301cd620004SJunchao Zhang PetscFunctionBegin; 1302cd620004SJunchao Zhang #if defined(PETSC_HAVE_MPI_REDUCE_LOCAL) 1303cd620004SJunchao Zhang { 1304cd620004SJunchao Zhang PetscErrorCode ierr; 1305cd620004SJunchao Zhang PetscInt i,disp; 1306cd620004SJunchao Zhang for (i=0; i<count; i++) { 1307cd620004SJunchao Zhang disp = idy? idy[i] : starty + i; 1308cd620004SJunchao Zhang ierr = MPI_Reduce_local((const char*)xdata+idx[i]*link->unitbytes,(char*)ydata+disp*link->unitbytes,1,link->unit,op);CHKERRQ(ierr); 1309cd620004SJunchao Zhang } 1310cd620004SJunchao Zhang } 1311cd620004SJunchao Zhang #else 1312cd620004SJunchao Zhang SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No unpacking reduction operation for this MPI_Op"); 1313cd620004SJunchao Zhang #endif 1314cd620004SJunchao Zhang PetscFunctionReturn(0); 1315cd620004SJunchao Zhang } 1316cd620004SJunchao Zhang 1317cd620004SJunchao Zhang /*============================================================================= 1318cd620004SJunchao Zhang Pack/Unpack/Fetch/Scatter routines 1319cd620004SJunchao Zhang ============================================================================*/ 1320cd620004SJunchao Zhang 1321cd620004SJunchao Zhang /* Pack rootdata to rootbuf 1322cd620004SJunchao Zhang Input Arguments: 1323cd620004SJunchao Zhang + sf - The SF this packing works on. 1324cd620004SJunchao Zhang . link - It gives the memtype of the roots and also provides root buffer. 1325cd620004SJunchao Zhang . scope - PETSCSF_LOCAL or PETSCSF_REMOTE. Note SF has the ability to do local and remote communications separately. 1326cd620004SJunchao Zhang - rootdata - Where to read the roots. 1327cd620004SJunchao Zhang 1328cd620004SJunchao Zhang Notes: 1329cd620004SJunchao Zhang When rootdata can be directly used as root buffer, the routine is almost a no-op. After the call, root data is 1330cd620004SJunchao Zhang in a place where the underlying MPI is ready can access (use_gpu_aware_mpi or not) 1331cd620004SJunchao Zhang */ 1332cd620004SJunchao Zhang PetscErrorCode PetscSFLinkPackRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,const void *rootdata) 1333cd620004SJunchao Zhang { 1334cd620004SJunchao Zhang PetscErrorCode ierr; 1335cd620004SJunchao Zhang PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 1336cd620004SJunchao Zhang const PetscInt *rootindices = NULL; 1337cd620004SJunchao Zhang PetscInt count,start; 1338cd620004SJunchao Zhang PetscErrorCode (*Pack)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,const void*,void*) = NULL; 1339cd620004SJunchao Zhang PetscMemType rootmtype = link->rootmtype; 1340cd620004SJunchao Zhang 1341cd620004SJunchao Zhang PetscFunctionBegin; 1342cd620004SJunchao Zhang ierr = PetscLogEventBegin(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr); 1343cd620004SJunchao Zhang if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkSyncDeviceBeforePackData(sf,link);CHKERRQ(ierr);} 1344cd620004SJunchao Zhang if (!link->rootdirect[scope] && bas->rootbuflen[scope]) { /* If rootdata works directly as rootbuf, skip packing */ 1345cd620004SJunchao Zhang ierr = PetscSFLinkGetRootIndices(sf,link,rootmtype,scope,&count,&start,&rootindices);CHKERRQ(ierr); 1346cd620004SJunchao Zhang ierr = PetscSFLinkGetPack(link,rootmtype,&Pack);CHKERRQ(ierr); 1347cd620004SJunchao Zhang ierr = (*Pack)(link,count,start,rootindices,bas->rootpackopt[scope],rootdata,link->rootbuf[scope][rootmtype]);CHKERRQ(ierr); 1348cd620004SJunchao Zhang } 1349cd620004SJunchao Zhang if (scope == PETSCSF_REMOTE) { 1350cd620004SJunchao Zhang ierr = PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_TRUE/*device2host*/);CHKERRQ(ierr); 1351cd620004SJunchao Zhang ierr = PetscSFLinkSyncStreamAfterPackRootData(sf,link);CHKERRQ(ierr); 1352cd620004SJunchao Zhang } 1353cd620004SJunchao Zhang ierr = PetscLogEventEnd(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr); 1354cd620004SJunchao Zhang PetscFunctionReturn(0); 1355cd620004SJunchao Zhang } 1356cd620004SJunchao Zhang 1357cd620004SJunchao Zhang /* Pack leafdata to leafbuf */ 1358cd620004SJunchao Zhang PetscErrorCode PetscSFLinkPackLeafData(PetscSF sf,PetscSFLink link,PetscSFScope scope,const void *leafdata) 1359cd620004SJunchao Zhang { 1360cd620004SJunchao Zhang PetscErrorCode ierr; 1361cd620004SJunchao Zhang const PetscInt *leafindices = NULL; 1362cd620004SJunchao Zhang PetscInt count,start; 1363cd620004SJunchao Zhang PetscErrorCode (*Pack)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,const void*,void*) = NULL; 1364cd620004SJunchao Zhang PetscMemType leafmtype = link->leafmtype; 1365cd620004SJunchao Zhang 1366cd620004SJunchao Zhang PetscFunctionBegin; 1367cd620004SJunchao Zhang ierr = PetscLogEventBegin(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr); 1368cd620004SJunchao Zhang if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkSyncDeviceBeforePackData(sf,link);CHKERRQ(ierr);} 1369cd620004SJunchao Zhang if (!link->leafdirect[scope] && sf->leafbuflen[scope]) { /* If leafdata works directly as rootbuf, skip packing */ 1370cd620004SJunchao Zhang ierr = PetscSFLinkGetLeafIndices(sf,link,leafmtype,scope,&count,&start,&leafindices);CHKERRQ(ierr); 1371cd620004SJunchao Zhang ierr = PetscSFLinkGetPack(link,leafmtype,&Pack);CHKERRQ(ierr); 1372cd620004SJunchao Zhang ierr = (*Pack)(link,count,start,leafindices,sf->leafpackopt[scope],leafdata,link->leafbuf[scope][leafmtype]);CHKERRQ(ierr); 1373cd620004SJunchao Zhang } 1374cd620004SJunchao Zhang if (scope == PETSCSF_REMOTE) { 1375cd620004SJunchao Zhang ierr = PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_TRUE/*device2host*/);CHKERRQ(ierr); 1376cd620004SJunchao Zhang ierr = PetscSFLinkSyncStreamAfterPackLeafData(sf,link);CHKERRQ(ierr); 1377cd620004SJunchao Zhang } 1378cd620004SJunchao Zhang ierr = PetscLogEventEnd(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr); 1379cd620004SJunchao Zhang PetscFunctionReturn(0); 1380cd620004SJunchao Zhang } 1381cd620004SJunchao Zhang 1382cd620004SJunchao Zhang /* Unpack rootbuf to rootdata */ 1383cd620004SJunchao Zhang PetscErrorCode PetscSFLinkUnpackRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,void *rootdata,MPI_Op op) 1384cd620004SJunchao Zhang { 1385cd620004SJunchao Zhang PetscErrorCode ierr; 1386cd620004SJunchao Zhang const PetscInt *rootindices = NULL; 1387cd620004SJunchao Zhang PetscInt count,start; 1388cd620004SJunchao Zhang PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 1389cd620004SJunchao Zhang PetscErrorCode (*UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*) = NULL; 1390cd620004SJunchao Zhang PetscMemType rootmtype = link->rootmtype; 1391cd620004SJunchao Zhang 1392cd620004SJunchao Zhang PetscFunctionBegin; 1393cd620004SJunchao Zhang ierr = PetscLogEventBegin(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr); 1394cd620004SJunchao Zhang if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_FALSE);CHKERRQ(ierr);} 1395cd620004SJunchao Zhang if (!link->rootdirect[scope] && bas->rootbuflen[scope]) { /* If rootdata works directly as rootbuf, skip unpacking */ 1396cd620004SJunchao Zhang ierr = PetscSFLinkGetUnpackAndOp(link,rootmtype,op,bas->rootdups[scope],&UnpackAndOp);CHKERRQ(ierr); 1397cd620004SJunchao Zhang if (UnpackAndOp) { 1398cd620004SJunchao Zhang ierr = PetscSFLinkGetRootIndices(sf,link,rootmtype,scope,&count,&start,&rootindices);CHKERRQ(ierr); 1399cd620004SJunchao Zhang ierr = (*UnpackAndOp)(link,count,start,rootindices,bas->rootpackopt[scope],rootdata,link->rootbuf[scope][rootmtype]);CHKERRQ(ierr); 1400cd620004SJunchao Zhang } else { 1401cd620004SJunchao Zhang ierr = PetscSFLinkGetRootIndices(sf,link,PETSC_MEMTYPE_HOST,scope,&count,&start,&rootindices);CHKERRQ(ierr); 1402cd620004SJunchao Zhang ierr = PetscSFLinkUnpackDataWithMPIReduceLocal(sf,link,count,start,rootindices,rootdata,link->rootbuf[scope][rootmtype],op);CHKERRQ(ierr); 1403cd620004SJunchao Zhang } 1404cd620004SJunchao Zhang } 1405cd620004SJunchao Zhang if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkSyncStreamAfterUnpackRootData(sf,link);CHKERRQ(ierr);} 1406cd620004SJunchao Zhang ierr = PetscSFLinkLogFlopsAfterUnpackRootData(sf,link,scope,op);CHKERRQ(ierr); 1407cd620004SJunchao Zhang ierr = PetscLogEventEnd(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr); 1408cd620004SJunchao Zhang PetscFunctionReturn(0); 1409cd620004SJunchao Zhang } 1410cd620004SJunchao Zhang 1411cd620004SJunchao Zhang /* Unpack leafbuf to leafdata */ 1412cd620004SJunchao Zhang PetscErrorCode PetscSFLinkUnpackLeafData(PetscSF sf,PetscSFLink link,PetscSFScope scope,void *leafdata,MPI_Op op) 1413cd620004SJunchao Zhang { 1414cd620004SJunchao Zhang PetscErrorCode ierr; 1415cd620004SJunchao Zhang const PetscInt *leafindices = NULL; 1416cd620004SJunchao Zhang PetscInt count,start; 1417cd620004SJunchao Zhang PetscErrorCode (*UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*) = NULL; 1418cd620004SJunchao Zhang PetscMemType leafmtype = link->leafmtype; 1419cd620004SJunchao Zhang 1420cd620004SJunchao Zhang PetscFunctionBegin; 1421cd620004SJunchao Zhang ierr = PetscLogEventBegin(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr); 1422cd620004SJunchao Zhang if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_FALSE);CHKERRQ(ierr);} 1423cd620004SJunchao Zhang if (!link->leafdirect[scope] && sf->leafbuflen[scope]) { /* If leafdata works directly as rootbuf, skip unpacking */ 1424cd620004SJunchao Zhang ierr = PetscSFLinkGetUnpackAndOp(link,leafmtype,op,sf->leafdups[scope],&UnpackAndOp);CHKERRQ(ierr); 1425cd620004SJunchao Zhang if (UnpackAndOp) { 1426cd620004SJunchao Zhang ierr = PetscSFLinkGetLeafIndices(sf,link,leafmtype,scope,&count,&start,&leafindices);CHKERRQ(ierr); 1427cd620004SJunchao Zhang ierr = (*UnpackAndOp)(link,count,start,leafindices,sf->leafpackopt[scope],leafdata,link->leafbuf[scope][leafmtype]);CHKERRQ(ierr); 1428cd620004SJunchao Zhang } else { 1429cd620004SJunchao Zhang ierr = PetscSFLinkGetLeafIndices(sf,link,PETSC_MEMTYPE_HOST,scope,&count,&start,&leafindices);CHKERRQ(ierr); 1430cd620004SJunchao Zhang ierr = PetscSFLinkUnpackDataWithMPIReduceLocal(sf,link,count,start,leafindices,leafdata,link->leafbuf[scope][leafmtype],op);CHKERRQ(ierr); 1431cd620004SJunchao Zhang } 1432cd620004SJunchao Zhang } 1433cd620004SJunchao Zhang if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkSyncStreamAfterUnpackLeafData(sf,link);CHKERRQ(ierr);} 1434cd620004SJunchao Zhang ierr = PetscSFLinkLogFlopsAfterUnpackLeafData(sf,link,scope,op);CHKERRQ(ierr); 1435cd620004SJunchao Zhang ierr = PetscLogEventEnd(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr); 1436cd620004SJunchao Zhang PetscFunctionReturn(0); 1437cd620004SJunchao Zhang } 1438cd620004SJunchao Zhang 1439cd620004SJunchao Zhang /* FetchAndOp rootdata with rootbuf */ 1440cd620004SJunchao Zhang PetscErrorCode PetscSFLinkFetchRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,void *rootdata,MPI_Op op) 1441cd620004SJunchao Zhang { 1442cd620004SJunchao Zhang PetscErrorCode ierr; 1443cd620004SJunchao Zhang const PetscInt *rootindices = NULL; 1444cd620004SJunchao Zhang PetscInt count,start; 1445cd620004SJunchao Zhang PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 1446cd620004SJunchao Zhang PetscErrorCode (*FetchAndOp)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,void*) = NULL; 1447cd620004SJunchao Zhang PetscMemType rootmtype = link->rootmtype; 1448cd620004SJunchao Zhang 1449cd620004SJunchao Zhang PetscFunctionBegin; 1450cd620004SJunchao Zhang ierr = PetscLogEventBegin(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr); 1451cd620004SJunchao Zhang if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_FALSE);CHKERRQ(ierr);} 1452cd620004SJunchao Zhang if (bas->rootbuflen[scope]) { 1453cd620004SJunchao Zhang /* Do FetchAndOp on rootdata with rootbuf */ 1454cd620004SJunchao Zhang ierr = PetscSFLinkGetFetchAndOp(link,rootmtype,op,bas->rootdups[scope],&FetchAndOp);CHKERRQ(ierr); 1455cd620004SJunchao Zhang ierr = PetscSFLinkGetRootIndices(sf,link,rootmtype,scope,&count,&start,&rootindices);CHKERRQ(ierr); 1456cd620004SJunchao Zhang ierr = (*FetchAndOp)(link,count,start,rootindices,bas->rootpackopt[scope],rootdata,link->rootbuf[scope][rootmtype]);CHKERRQ(ierr); 1457cd620004SJunchao Zhang } 1458cd620004SJunchao Zhang if (scope == PETSCSF_REMOTE) { 1459cd620004SJunchao Zhang ierr = PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_TRUE);CHKERRQ(ierr); 1460cd620004SJunchao Zhang ierr = PetscSFLinkSyncStreamAfterUnpackRootData(sf,link);CHKERRQ(ierr); 1461cd620004SJunchao Zhang } 1462cd620004SJunchao Zhang ierr = PetscSFLinkLogFlopsAfterUnpackRootData(sf,link,scope,op);CHKERRQ(ierr); 1463cd620004SJunchao Zhang ierr = PetscLogEventEnd(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr); 1464cd620004SJunchao Zhang PetscFunctionReturn(0); 1465cd620004SJunchao Zhang } 1466cd620004SJunchao Zhang 1467cd620004SJunchao Zhang /* Bcast rootdata to leafdata locally (i.e., only for local communication - PETSCSF_LOCAL) */ 1468cd620004SJunchao Zhang PetscErrorCode PetscSFLinkBcastAndOpLocal(PetscSF sf,PetscSFLink link,const void *rootdata,void *leafdata,MPI_Op op) 1469cd620004SJunchao Zhang { 1470cd620004SJunchao Zhang PetscErrorCode ierr; 1471cd620004SJunchao Zhang const PetscInt *rootindices = NULL,*leafindices = NULL; 1472cd620004SJunchao Zhang PetscInt count,rootstart,leafstart; 1473cd620004SJunchao Zhang PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 1474cd620004SJunchao Zhang PetscErrorCode (*UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*) = NULL; 1475cd620004SJunchao Zhang PetscErrorCode (*ScatterAndOp)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*) = NULL; 1476cd620004SJunchao Zhang const PetscMemType rootmtype = link->rootmtype,leafmtype = link->leafmtype; 1477cd620004SJunchao Zhang 1478cd620004SJunchao Zhang PetscFunctionBegin; 1479cd620004SJunchao Zhang if (!bas->rootbuflen[PETSCSF_LOCAL]) PetscFunctionReturn(0); 1480cd620004SJunchao Zhang if (rootmtype != leafmtype) { /* Uncommon case */ 1481cd620004SJunchao Zhang /* The local communication has to go through pack and unpack */ 1482cd620004SJunchao Zhang ierr = PetscSFLinkPackRootData(sf,link,PETSCSF_LOCAL,rootdata);CHKERRQ(ierr); 1483*f01131f0SJunchao Zhang ierr = PetscSFLinkMemcpy(sf,link,leafmtype,link->leafbuf[PETSCSF_LOCAL][leafmtype],rootmtype,link->rootbuf[PETSCSF_LOCAL][rootmtype],sf->leafbuflen[PETSCSF_LOCAL]*link->unitbytes);CHKERRQ(ierr); 1484cd620004SJunchao Zhang ierr = PetscSFLinkUnpackLeafData(sf,link,PETSCSF_LOCAL,leafdata,op);CHKERRQ(ierr); 1485cd620004SJunchao Zhang } else { 1486cd620004SJunchao Zhang if (bas->rootcontig[PETSCSF_LOCAL]) { /* If root indices are contiguous, Scatter becomes Unpack */ 1487cd620004SJunchao Zhang ierr = PetscSFLinkGetUnpackAndOp(link,leafmtype,op,sf->leafdups[PETSCSF_LOCAL],&UnpackAndOp);CHKERRQ(ierr); 1488cd620004SJunchao Zhang rootdata = (const char*)rootdata + bas->rootstart[PETSCSF_LOCAL]*link->unitbytes; /* Make rootdata point to start of the buffer */ 1489cd620004SJunchao Zhang if (UnpackAndOp) { 1490cd620004SJunchao Zhang ierr = PetscSFLinkGetLeafIndices(sf,link,leafmtype,PETSCSF_LOCAL,&count,&leafstart,&leafindices);CHKERRQ(ierr); 1491cd620004SJunchao Zhang ierr = (*UnpackAndOp)(link,count,leafstart,leafindices,sf->leafpackopt[PETSCSF_LOCAL],leafdata,rootdata);CHKERRQ(ierr); 1492cd620004SJunchao Zhang } else { 1493cd620004SJunchao Zhang ierr = PetscSFLinkGetLeafIndices(sf,link,PETSC_MEMTYPE_HOST,PETSCSF_LOCAL,&count,&leafstart,&leafindices);CHKERRQ(ierr); 1494cd620004SJunchao Zhang ierr = PetscSFLinkUnpackDataWithMPIReduceLocal(sf,link,count,leafstart,leafindices,leafdata,rootdata,op);CHKERRQ(ierr); 1495cd620004SJunchao Zhang } 1496cd620004SJunchao Zhang } else { /* ScatterAndOp */ 1497cd620004SJunchao Zhang ierr = PetscSFLinkGetScatterAndOp(link,leafmtype,op,sf->leafdups[PETSCSF_LOCAL],&ScatterAndOp);CHKERRQ(ierr); 1498cd620004SJunchao Zhang if (ScatterAndOp) { 1499cd620004SJunchao Zhang ierr = PetscSFLinkGetRootIndices(sf,link,rootmtype,PETSCSF_LOCAL,&count,&rootstart,&rootindices);CHKERRQ(ierr); 1500cd620004SJunchao Zhang ierr = PetscSFLinkGetLeafIndices(sf,link,leafmtype,PETSCSF_LOCAL,&count,&leafstart,&leafindices);CHKERRQ(ierr); 1501cd620004SJunchao Zhang ierr = (*ScatterAndOp)(link,count,rootstart,rootindices,rootdata,leafstart,leafindices,leafdata);CHKERRQ(ierr); 1502cd620004SJunchao Zhang } else { 1503cd620004SJunchao Zhang ierr = PetscSFLinkGetRootIndices(sf,link,PETSC_MEMTYPE_HOST,PETSCSF_LOCAL,&count,NULL,&rootindices);CHKERRQ(ierr); 1504cd620004SJunchao Zhang ierr = PetscSFLinkGetLeafIndices(sf,link,PETSC_MEMTYPE_HOST,PETSCSF_LOCAL,NULL,&leafstart,&leafindices);CHKERRQ(ierr); 1505cd620004SJunchao Zhang ierr = PetscSFLinkScatterDataWithMPIReduceLocal(sf,link,count,rootindices,rootdata,leafstart,leafindices,leafdata,op);CHKERRQ(ierr); 1506cd620004SJunchao Zhang } 1507cd620004SJunchao Zhang } 1508cd620004SJunchao Zhang } 1509cd620004SJunchao Zhang PetscFunctionReturn(0); 1510cd620004SJunchao Zhang } 1511cd620004SJunchao Zhang 1512cd620004SJunchao Zhang /* Reduce leafdata to rootdata locally */ 1513cd620004SJunchao Zhang PetscErrorCode PetscSFLinkReduceLocal(PetscSF sf,PetscSFLink link,const void *leafdata,void *rootdata,MPI_Op op) 1514cd620004SJunchao Zhang { 1515cd620004SJunchao Zhang PetscErrorCode ierr; 1516cd620004SJunchao Zhang const PetscInt *rootindices = NULL,*leafindices = NULL; 1517cd620004SJunchao Zhang PetscInt count,rootstart,leafstart; 1518cd620004SJunchao Zhang PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 1519cd620004SJunchao Zhang PetscErrorCode (*UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*) = NULL; 1520cd620004SJunchao Zhang PetscErrorCode (*ScatterAndOp)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*) = NULL; 1521cd620004SJunchao Zhang const PetscMemType rootmtype = link->rootmtype,leafmtype = link->leafmtype; 1522cd620004SJunchao Zhang 1523cd620004SJunchao Zhang PetscFunctionBegin; 1524cd620004SJunchao Zhang if (!sf->leafbuflen[PETSCSF_LOCAL]) PetscFunctionReturn(0); 1525cd620004SJunchao Zhang if (rootmtype != leafmtype) { 1526cd620004SJunchao Zhang /* The local communication has to go through pack and unpack */ 1527cd620004SJunchao Zhang ierr = PetscSFLinkPackLeafData(sf,link,PETSCSF_LOCAL,leafdata);CHKERRQ(ierr); 1528*f01131f0SJunchao Zhang ierr = PetscSFLinkMemcpy(sf,link,rootmtype,link->rootbuf[PETSCSF_LOCAL][rootmtype],leafmtype,link->leafbuf[PETSCSF_LOCAL][leafmtype],bas->rootbuflen[PETSCSF_LOCAL]*link->unitbytes);CHKERRQ(ierr); 1529cd620004SJunchao Zhang ierr = PetscSFLinkUnpackRootData(sf,link,PETSCSF_LOCAL,rootdata,op);CHKERRQ(ierr); 1530cd620004SJunchao Zhang } else { 1531cd620004SJunchao Zhang if (sf->leafcontig[PETSCSF_LOCAL]) { 1532cd620004SJunchao Zhang /* If leaf indices are contiguous, Scatter becomes Unpack */ 1533cd620004SJunchao Zhang ierr = PetscSFLinkGetUnpackAndOp(link,rootmtype,op,bas->rootdups[PETSCSF_LOCAL],&UnpackAndOp);CHKERRQ(ierr); 1534cd620004SJunchao Zhang leafdata = (const char*)leafdata + sf->leafstart[PETSCSF_LOCAL]*link->unitbytes; /* Make leafdata point to start of the buffer */ 1535cd620004SJunchao Zhang if (UnpackAndOp) { 1536cd620004SJunchao Zhang ierr = PetscSFLinkGetRootIndices(sf,link,rootmtype,PETSCSF_LOCAL,&count,&rootstart,&rootindices);CHKERRQ(ierr); 1537cd620004SJunchao Zhang ierr = (*UnpackAndOp)(link,count,rootstart,rootindices,bas->rootpackopt[PETSCSF_LOCAL],rootdata,leafdata);CHKERRQ(ierr); 1538cd620004SJunchao Zhang } else { 1539cd620004SJunchao Zhang ierr = PetscSFLinkGetRootIndices(sf,link,PETSC_MEMTYPE_HOST,PETSCSF_LOCAL,&count,&rootstart,&rootindices);CHKERRQ(ierr); 1540cd620004SJunchao Zhang ierr = PetscSFLinkUnpackDataWithMPIReduceLocal(sf,link,count,rootstart,rootindices,rootdata,leafdata,op);CHKERRQ(ierr); 1541cd620004SJunchao Zhang } 1542cd620004SJunchao Zhang } else { 1543cd620004SJunchao Zhang ierr = PetscSFLinkGetScatterAndOp(link,rootmtype,op,bas->rootdups[PETSCSF_LOCAL],&ScatterAndOp);CHKERRQ(ierr); 1544cd620004SJunchao Zhang if (ScatterAndOp) { 1545cd620004SJunchao Zhang ierr = PetscSFLinkGetRootIndices(sf,link,rootmtype,PETSCSF_LOCAL,&count,&rootstart,&rootindices);CHKERRQ(ierr); 1546cd620004SJunchao Zhang ierr = PetscSFLinkGetLeafIndices(sf,link,leafmtype,PETSCSF_LOCAL,NULL,&leafstart,&leafindices);CHKERRQ(ierr); 1547cd620004SJunchao Zhang ierr = (*ScatterAndOp)(link,count,leafstart,leafindices,leafdata,rootstart,rootindices,rootdata);CHKERRQ(ierr); 1548cd620004SJunchao Zhang } else { 1549cd620004SJunchao Zhang ierr = PetscSFLinkGetRootIndices(sf,link,PETSC_MEMTYPE_HOST,PETSCSF_LOCAL,&count,&rootstart,&rootindices);CHKERRQ(ierr); 1550cd620004SJunchao Zhang ierr = PetscSFLinkGetLeafIndices(sf,link,PETSC_MEMTYPE_HOST,PETSCSF_LOCAL,NULL,NULL,&leafindices);CHKERRQ(ierr); 1551cd620004SJunchao Zhang ierr = PetscSFLinkScatterDataWithMPIReduceLocal(sf,link,count,leafindices,leafdata,rootstart,rootindices,rootdata,op);CHKERRQ(ierr); 1552cd620004SJunchao Zhang } 1553cd620004SJunchao Zhang } 1554cd620004SJunchao Zhang } 1555cd620004SJunchao Zhang PetscFunctionReturn(0); 1556cd620004SJunchao Zhang } 1557cd620004SJunchao Zhang 1558cd620004SJunchao Zhang /* Fetch rootdata to leafdata and leafupdate locally */ 1559cd620004SJunchao Zhang PetscErrorCode PetscSFLinkFetchAndOpLocal(PetscSF sf,PetscSFLink link,void *rootdata,const void *leafdata,void *leafupdate,MPI_Op op) 1560cd620004SJunchao Zhang { 1561cd620004SJunchao Zhang PetscErrorCode ierr; 1562cd620004SJunchao Zhang const PetscInt *rootindices = NULL,*leafindices = NULL; 1563cd620004SJunchao Zhang PetscInt count,rootstart,leafstart; 1564cd620004SJunchao Zhang PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 1565cd620004SJunchao Zhang PetscErrorCode (*FetchAndOpLocal)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,void*,PetscInt,const PetscInt*,const void*,void*) = NULL; 1566cd620004SJunchao Zhang const PetscMemType rootmtype = link->rootmtype,leafmtype = link->leafmtype; 1567cd620004SJunchao Zhang 1568cd620004SJunchao Zhang PetscFunctionBegin; 1569cd620004SJunchao Zhang if (!bas->rootbuflen[PETSCSF_LOCAL]) PetscFunctionReturn(0); 1570cd620004SJunchao Zhang if (rootmtype != leafmtype) { 1571cd620004SJunchao Zhang /* The local communication has to go through pack and unpack */ 1572cd620004SJunchao Zhang SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Doing PetscSFFetchAndOp with rootdata and leafdata on opposite side of CPU and GPU"); 1573cd620004SJunchao Zhang } else { 1574cd620004SJunchao Zhang ierr = PetscSFLinkGetRootIndices(sf,link,rootmtype,PETSCSF_LOCAL,&count,&rootstart,&rootindices);CHKERRQ(ierr); 1575cd620004SJunchao Zhang ierr = PetscSFLinkGetLeafIndices(sf,link,leafmtype,PETSCSF_LOCAL,NULL,&leafstart,&leafindices);CHKERRQ(ierr); 1576cd620004SJunchao Zhang ierr = PetscSFLinkGetFetchAndOpLocal(link,rootmtype,op,bas->rootdups[PETSCSF_LOCAL],&FetchAndOpLocal);CHKERRQ(ierr); 1577cd620004SJunchao Zhang ierr = (*FetchAndOpLocal)(link,count,rootstart,rootindices,rootdata,leafstart,leafindices,leafdata,leafupdate);CHKERRQ(ierr); 1578cd620004SJunchao Zhang } 157940e23c03SJunchao Zhang PetscFunctionReturn(0); 158040e23c03SJunchao Zhang } 158140e23c03SJunchao Zhang 158240e23c03SJunchao Zhang /* 1583cd620004SJunchao Zhang Create per-rank pack/unpack optimizations based on indice patterns 158440e23c03SJunchao Zhang 158540e23c03SJunchao Zhang Input Parameters: 1586b23bfdefSJunchao Zhang + n - Number of target ranks 1587eb02082bSJunchao Zhang . offset - [n+1] For the i-th rank, its associated indices are idx[offset[i], offset[i+1]). offset[0] needs not to be 0. 1588b23bfdefSJunchao Zhang - idx - [*] Array storing indices 158940e23c03SJunchao Zhang 159040e23c03SJunchao Zhang Output Parameters: 1591cd620004SJunchao Zhang + opt - Pack optimizations. NULL if no optimizations. 159240e23c03SJunchao Zhang */ 1593cd620004SJunchao Zhang PetscErrorCode PetscSFCreatePackOpt(PetscInt n,const PetscInt *offset,const PetscInt *idx,PetscSFPackOpt *out) 159440e23c03SJunchao Zhang { 159540e23c03SJunchao Zhang PetscErrorCode ierr; 159640e23c03SJunchao Zhang PetscInt i,j,k,n_copies,tot_copies = 0,step; 1597b23bfdefSJunchao Zhang PetscBool strided,optimized = PETSC_FALSE; 159840e23c03SJunchao Zhang PetscSFPackOpt opt; 159940e23c03SJunchao Zhang 160040e23c03SJunchao Zhang PetscFunctionBegin; 1601b23bfdefSJunchao Zhang if (!n) { 1602b23bfdefSJunchao Zhang *out = NULL; 1603b23bfdefSJunchao Zhang PetscFunctionReturn(0); 1604b23bfdefSJunchao Zhang } 1605b23bfdefSJunchao Zhang 160640e23c03SJunchao Zhang ierr = PetscCalloc1(1,&opt);CHKERRQ(ierr); 1607b23bfdefSJunchao Zhang ierr = PetscCalloc3(n,&opt->type,n+1,&opt->offset,n+1,&opt->copy_offset);CHKERRQ(ierr); 1608b23bfdefSJunchao Zhang ierr = PetscArraycpy(opt->offset,offset,n+1);CHKERRQ(ierr); 1609cd620004SJunchao Zhang /* Make opt->offset[] zero-based. If one calls this routine with non-zero offset[0], one should use packing routine in this way, Pack(count,idx+offset[0],packopt,...) */ 1610cd620004SJunchao Zhang if (offset[0]) {for (i=0; i<n+1; i++) opt->offset[i] -= offset[0];} 1611b23bfdefSJunchao Zhang 1612b23bfdefSJunchao Zhang opt->n = n; 161340e23c03SJunchao Zhang 1614fb61b9e4SStefano Zampini /* Check if the indices are piece-wise contiguous (if yes, we can optimize a packing with multiple memcpy's ) */ 161540e23c03SJunchao Zhang for (i=0; i<n; i++) { /* for each target processor */ 161640e23c03SJunchao Zhang /* Scan indices to count n_copies -- the number of contiguous pieces for i-th target */ 161740e23c03SJunchao Zhang n_copies = 1; 161840e23c03SJunchao Zhang for (j=offset[i]; j<offset[i+1]-1; j++) { 161940e23c03SJunchao Zhang if (idx[j]+1 != idx[j+1]) n_copies++; 162040e23c03SJunchao Zhang } 162140e23c03SJunchao Zhang /* If the average length (in no. of indices) of contiguous pieces is long enough, say >=32, 162240e23c03SJunchao Zhang then it is worth using memcpy for this target. 32 is an arbitrarily chosen number. 162340e23c03SJunchao Zhang */ 162440e23c03SJunchao Zhang if ((offset[i+1]-offset[i])/n_copies >= 32) { 1625b23bfdefSJunchao Zhang opt->type[i] = PETSCSF_PACKOPT_MULTICOPY; 1626b23bfdefSJunchao Zhang optimized = PETSC_TRUE; 162740e23c03SJunchao Zhang tot_copies += n_copies; 162840e23c03SJunchao Zhang } 162940e23c03SJunchao Zhang } 163040e23c03SJunchao Zhang 163140e23c03SJunchao Zhang /* Setup memcpy plan for each contiguous piece */ 163240e23c03SJunchao Zhang k = 0; /* k-th copy */ 1633b23bfdefSJunchao Zhang ierr = PetscMalloc4(tot_copies,&opt->copy_start,tot_copies,&opt->copy_length,n,&opt->stride_step,n,&opt->stride_n);CHKERRQ(ierr); 1634b23bfdefSJunchao Zhang for (i=0; i<n; i++) { /* for each target processor */ 1635b23bfdefSJunchao Zhang if (opt->type[i] == PETSCSF_PACKOPT_MULTICOPY) { 163640e23c03SJunchao Zhang n_copies = 1; 1637b23bfdefSJunchao Zhang opt->copy_start[k] = offset[i] - offset[0]; 163840e23c03SJunchao Zhang for (j=offset[i]; j<offset[i+1]-1; j++) { 163940e23c03SJunchao Zhang if (idx[j]+1 != idx[j+1]) { /* meet end of a copy (and next copy must exist) */ 164040e23c03SJunchao Zhang n_copies++; 1641b23bfdefSJunchao Zhang opt->copy_start[k+1] = j-offset[0]+1; 1642b23bfdefSJunchao Zhang opt->copy_length[k] = opt->copy_start[k+1] - opt->copy_start[k]; 164340e23c03SJunchao Zhang k++; 164440e23c03SJunchao Zhang } 164540e23c03SJunchao Zhang } 164640e23c03SJunchao Zhang /* Set copy length of the last copy for this target */ 1647b23bfdefSJunchao Zhang opt->copy_length[k] = j-offset[0]+1 - opt->copy_start[k]; 164840e23c03SJunchao Zhang k++; 164940e23c03SJunchao Zhang } 1650b23bfdefSJunchao Zhang /* Set offset for next target. When opt->type[i]=PETSCSF_PACKOPT_NONE, copy_offsets[i]=copy_offsets[i+1] */ 165140e23c03SJunchao Zhang opt->copy_offset[i+1] = k; 165240e23c03SJunchao Zhang } 165340e23c03SJunchao Zhang 165440e23c03SJunchao Zhang /* Last chance! If the indices do not have long contiguous pieces, are they strided? */ 165540e23c03SJunchao Zhang for (i=0; i<n; i++) { /* for each remote */ 1656b23bfdefSJunchao Zhang if (opt->type[i]==PETSCSF_PACKOPT_NONE && (offset[i+1] - offset[i]) >= 16) { /* few indices (<16) are not worth striding */ 165740e23c03SJunchao Zhang strided = PETSC_TRUE; 165840e23c03SJunchao Zhang step = idx[offset[i]+1] - idx[offset[i]]; 165940e23c03SJunchao Zhang for (j=offset[i]; j<offset[i+1]-1; j++) { 166040e23c03SJunchao Zhang if (idx[j]+step != idx[j+1]) { strided = PETSC_FALSE; break; } 166140e23c03SJunchao Zhang } 166240e23c03SJunchao Zhang if (strided) { 1663b23bfdefSJunchao Zhang opt->type[i] = PETSCSF_PACKOPT_STRIDE; 166440e23c03SJunchao Zhang opt->stride_step[i] = step; 166540e23c03SJunchao Zhang opt->stride_n[i] = offset[i+1] - offset[i]; 1666b23bfdefSJunchao Zhang optimized = PETSC_TRUE; 166740e23c03SJunchao Zhang } 166840e23c03SJunchao Zhang } 166940e23c03SJunchao Zhang } 1670b23bfdefSJunchao Zhang /* If no rank gets optimized, free arrays to save memory */ 1671b23bfdefSJunchao Zhang if (!optimized) { 1672b23bfdefSJunchao Zhang ierr = PetscFree3(opt->type,opt->offset,opt->copy_offset);CHKERRQ(ierr); 1673b23bfdefSJunchao Zhang ierr = PetscFree4(opt->copy_start,opt->copy_length,opt->stride_step,opt->stride_n);CHKERRQ(ierr); 167440e23c03SJunchao Zhang ierr = PetscFree(opt);CHKERRQ(ierr); 167540e23c03SJunchao Zhang *out = NULL; 167640e23c03SJunchao Zhang } else *out = opt; 167740e23c03SJunchao Zhang PetscFunctionReturn(0); 167840e23c03SJunchao Zhang } 167940e23c03SJunchao Zhang 1680cd620004SJunchao Zhang PetscErrorCode PetscSFDestroyPackOpt(PetscSFPackOpt *out) 168140e23c03SJunchao Zhang { 168240e23c03SJunchao Zhang PetscErrorCode ierr; 168340e23c03SJunchao Zhang PetscSFPackOpt opt = *out; 168440e23c03SJunchao Zhang 168540e23c03SJunchao Zhang PetscFunctionBegin; 168640e23c03SJunchao Zhang if (opt) { 1687b23bfdefSJunchao Zhang ierr = PetscFree3(opt->type,opt->offset,opt->copy_offset);CHKERRQ(ierr); 1688b23bfdefSJunchao Zhang ierr = PetscFree4(opt->copy_start,opt->copy_length,opt->stride_step,opt->stride_n);CHKERRQ(ierr); 168940e23c03SJunchao Zhang ierr = PetscFree(opt);CHKERRQ(ierr); 169040e23c03SJunchao Zhang *out = NULL; 169140e23c03SJunchao Zhang } 169240e23c03SJunchao Zhang PetscFunctionReturn(0); 169340e23c03SJunchao Zhang } 1694cd620004SJunchao Zhang 1695cd620004SJunchao Zhang PetscErrorCode PetscSFSetUpPackFields(PetscSF sf) 1696cd620004SJunchao Zhang { 1697cd620004SJunchao Zhang PetscErrorCode ierr; 1698cd620004SJunchao Zhang PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 1699cd620004SJunchao Zhang PetscInt i,j; 1700cd620004SJunchao Zhang 1701cd620004SJunchao Zhang PetscFunctionBegin; 1702cd620004SJunchao Zhang /* [0] for PETSCSF_LOCAL and [1] for PETSCSF_REMOTE in the following */ 1703cd620004SJunchao Zhang for (i=0; i<2; i++) { /* Set defaults */ 1704cd620004SJunchao Zhang sf->leafstart[i] = 0; 1705cd620004SJunchao Zhang sf->leafcontig[i] = PETSC_TRUE; 1706cd620004SJunchao Zhang sf->leafdups[i] = PETSC_FALSE; 1707cd620004SJunchao Zhang bas->rootstart[i] = 0; 1708cd620004SJunchao Zhang bas->rootcontig[i] = PETSC_TRUE; 1709cd620004SJunchao Zhang bas->rootdups[i] = PETSC_FALSE; 1710cd620004SJunchao Zhang } 1711cd620004SJunchao Zhang 1712cd620004SJunchao Zhang sf->leafbuflen[0] = sf->roffset[sf->ndranks]; 1713cd620004SJunchao Zhang sf->leafbuflen[1] = sf->roffset[sf->nranks] - sf->roffset[sf->ndranks]; 1714cd620004SJunchao Zhang 1715cd620004SJunchao Zhang if (sf->leafbuflen[0]) sf->leafstart[0] = sf->rmine[0]; 1716cd620004SJunchao Zhang if (sf->leafbuflen[1]) sf->leafstart[1] = sf->rmine[sf->roffset[sf->ndranks]]; 1717cd620004SJunchao Zhang 1718cd620004SJunchao Zhang /* Are leaf indices for self and remote contiguous? If yes, it is best for pack/unpack */ 1719cd620004SJunchao Zhang for (i=0; i<sf->roffset[sf->ndranks]; i++) { /* self */ 1720cd620004SJunchao Zhang if (sf->rmine[i] != sf->leafstart[0]+i) {sf->leafcontig[0] = PETSC_FALSE; break;} 1721cd620004SJunchao Zhang } 1722cd620004SJunchao Zhang for (i=sf->roffset[sf->ndranks],j=0; i<sf->roffset[sf->nranks]; i++,j++) { /* remote */ 1723cd620004SJunchao Zhang if (sf->rmine[i] != sf->leafstart[1]+j) {sf->leafcontig[1] = PETSC_FALSE; break;} 1724cd620004SJunchao Zhang } 1725cd620004SJunchao Zhang 1726cd620004SJunchao Zhang /* If not, see if we can have per-rank optimizations by doing index analysis */ 1727cd620004SJunchao Zhang if (!sf->leafcontig[0]) {ierr = PetscSFCreatePackOpt(sf->ndranks, sf->roffset, sf->rmine, &sf->leafpackopt[0]);CHKERRQ(ierr);} 1728cd620004SJunchao Zhang if (!sf->leafcontig[1]) {ierr = PetscSFCreatePackOpt(sf->nranks-sf->ndranks, sf->roffset+sf->ndranks, sf->rmine, &sf->leafpackopt[1]);CHKERRQ(ierr);} 1729cd620004SJunchao Zhang 1730cd620004SJunchao Zhang /* Are root indices for self and remote contiguous? */ 1731cd620004SJunchao Zhang bas->rootbuflen[0] = bas->ioffset[bas->ndiranks]; 1732cd620004SJunchao Zhang bas->rootbuflen[1] = bas->ioffset[bas->niranks] - bas->ioffset[bas->ndiranks]; 1733cd620004SJunchao Zhang 1734cd620004SJunchao Zhang if (bas->rootbuflen[0]) bas->rootstart[0] = bas->irootloc[0]; 1735cd620004SJunchao Zhang if (bas->rootbuflen[1]) bas->rootstart[1] = bas->irootloc[bas->ioffset[bas->ndiranks]]; 1736cd620004SJunchao Zhang 1737cd620004SJunchao Zhang for (i=0; i<bas->ioffset[bas->ndiranks]; i++) { 1738cd620004SJunchao Zhang if (bas->irootloc[i] != bas->rootstart[0]+i) {bas->rootcontig[0] = PETSC_FALSE; break;} 1739cd620004SJunchao Zhang } 1740cd620004SJunchao Zhang for (i=bas->ioffset[bas->ndiranks],j=0; i<bas->ioffset[bas->niranks]; i++,j++) { 1741cd620004SJunchao Zhang if (bas->irootloc[i] != bas->rootstart[1]+j) {bas->rootcontig[1] = PETSC_FALSE; break;} 1742cd620004SJunchao Zhang } 1743cd620004SJunchao Zhang 1744cd620004SJunchao Zhang if (!bas->rootcontig[0]) {ierr = PetscSFCreatePackOpt(bas->ndiranks, bas->ioffset, bas->irootloc, &bas->rootpackopt[0]);CHKERRQ(ierr);} 1745cd620004SJunchao Zhang if (!bas->rootcontig[1]) {ierr = PetscSFCreatePackOpt(bas->niranks-bas->ndiranks, bas->ioffset+bas->ndiranks, bas->irootloc, &bas->rootpackopt[1]);CHKERRQ(ierr);} 1746cd620004SJunchao Zhang 1747cd620004SJunchao Zhang #if defined(PETSC_HAVE_CUDA) 1748cd620004SJunchao Zhang /* Check dups in indices so that CUDA unpacking kernels can use cheaper regular instructions instead of atomics when they know there are no data race chances */ 1749cd620004SJunchao Zhang if (!sf->leafcontig[0]) {ierr = PetscCheckDupsInt(sf->leafbuflen[0], sf->rmine, &sf->leafdups[0]);CHKERRQ(ierr);} 1750cd620004SJunchao Zhang if (!sf->leafcontig[1]) {ierr = PetscCheckDupsInt(sf->leafbuflen[1], sf->rmine+sf->roffset[sf->ndranks], &sf->leafdups[1]);CHKERRQ(ierr);} 1751cd620004SJunchao Zhang if (!bas->rootcontig[0]) {ierr = PetscCheckDupsInt(bas->rootbuflen[0], bas->irootloc, &bas->rootdups[0]);CHKERRQ(ierr);} 1752cd620004SJunchao Zhang if (!bas->rootcontig[1]) {ierr = PetscCheckDupsInt(bas->rootbuflen[1], bas->irootloc+bas->ioffset[bas->ndiranks], &bas->rootdups[1]);CHKERRQ(ierr);} 1753cd620004SJunchao Zhang #endif 1754cd620004SJunchao Zhang 1755cd620004SJunchao Zhang PetscFunctionReturn(0); 1756cd620004SJunchao Zhang } 1757cd620004SJunchao Zhang 1758cd620004SJunchao Zhang PetscErrorCode PetscSFResetPackFields(PetscSF sf) 1759cd620004SJunchao Zhang { 1760cd620004SJunchao Zhang PetscErrorCode ierr; 1761cd620004SJunchao Zhang PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 1762cd620004SJunchao Zhang PetscInt i; 1763cd620004SJunchao Zhang 1764cd620004SJunchao Zhang PetscFunctionBegin; 1765cd620004SJunchao Zhang for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) { 1766cd620004SJunchao Zhang ierr = PetscSFDestroyPackOpt(&sf->leafpackopt[i]);CHKERRQ(ierr); 1767cd620004SJunchao Zhang ierr = PetscSFDestroyPackOpt(&bas->rootpackopt[i]);CHKERRQ(ierr); 1768cd620004SJunchao Zhang } 1769cd620004SJunchao Zhang PetscFunctionReturn(0); 1770cd620004SJunchao Zhang } 1771