120c24465SJunchao Zhang #include "petsc/private/sfimpl.h" 240e23c03SJunchao Zhang #include <../src/vec/is/sf/impls/basic/sfpack.h> 340e23c03SJunchao Zhang #include <../src/vec/is/sf/impls/basic/sfbasic.h> 440e23c03SJunchao Zhang 57fd2d3dbSJunchao Zhang /* This is a C file that contains packing facilities, with dispatches to device if enabled. */ 67fd2d3dbSJunchao Zhang 740e23c03SJunchao Zhang /* 840e23c03SJunchao Zhang * MPI_Reduce_local is not really useful because it can't handle sparse data and it vectorizes "in the wrong direction", 940e23c03SJunchao Zhang * therefore we pack data types manually. This file defines packing routines for the standard data types. 1040e23c03SJunchao Zhang */ 1140e23c03SJunchao Zhang 12cd620004SJunchao Zhang #define CPPJoin4(a, b, c, d) a##_##b##_##c##_##d 1340e23c03SJunchao Zhang 14cd620004SJunchao Zhang /* Operations working like s += t */ 15*9371c9d4SSatish Balay #define OP_BINARY(op, s, t) \ 16*9371c9d4SSatish Balay do { (s) = (s)op(t); } while (0) /* binary ops in the middle such as +, *, && etc. */ 17*9371c9d4SSatish Balay #define OP_FUNCTION(op, s, t) \ 18*9371c9d4SSatish Balay do { (s) = op((s), (t)); } while (0) /* ops like a function, such as PetscMax, PetscMin */ 19*9371c9d4SSatish Balay #define OP_LXOR(op, s, t) \ 20*9371c9d4SSatish Balay do { (s) = (!(s)) != (!(t)); } while (0) /* logical exclusive OR */ 21*9371c9d4SSatish Balay #define OP_ASSIGN(op, s, t) \ 22*9371c9d4SSatish Balay do { (s) = (t); } while (0) 23cd620004SJunchao Zhang /* Ref MPI MAXLOC */ 24cd620004SJunchao Zhang #define OP_XLOC(op, s, t) \ 25cd620004SJunchao Zhang do { \ 26cd620004SJunchao Zhang if ((s).u == (t).u) (s).i = PetscMin((s).i, (t).i); \ 27cd620004SJunchao Zhang else if (!((s).u op(t).u)) s = t; \ 28cd620004SJunchao Zhang } while (0) 2940e23c03SJunchao Zhang 3040e23c03SJunchao Zhang /* DEF_PackFunc - macro defining a Pack routine 3140e23c03SJunchao Zhang 3240e23c03SJunchao Zhang Arguments of the macro: 33b23bfdefSJunchao Zhang +Type Type of the basic data in an entry, i.e., int, PetscInt, PetscReal etc. It is not the type of an entry. 34fcc7397dSJunchao Zhang .BS Block size for vectorization. It is a factor of bsz. 35b23bfdefSJunchao Zhang -EQ (bs == BS) ? 1 : 0. EQ is a compile-time const to help compiler optimizations. See below. 3640e23c03SJunchao Zhang 3740e23c03SJunchao Zhang Arguments of the Pack routine: 38cd620004SJunchao Zhang +count Number of indices in idx[]. 39fcc7397dSJunchao Zhang .start When opt and idx are NULL, it means indices are contiguous & start is the first index; otherwise, not used. 40fcc7397dSJunchao Zhang .opt Per-pack optimization plan. NULL means no such plan. 41fcc7397dSJunchao Zhang .idx Indices of entries to packed. 42eb02082bSJunchao Zhang .link Provide a context for the current call, such as link->bs, number of basic types in an entry. Ex. if unit is MPI_2INT, then bs=2 and the basic type is int. 43cd620004SJunchao Zhang .unpacked Address of the unpacked data. The entries will be packed are unpacked[idx[i]],for i in [0,count). 44cd620004SJunchao Zhang -packed Address of the packed data. 4540e23c03SJunchao Zhang */ 46b23bfdefSJunchao Zhang #define DEF_PackFunc(Type, BS, EQ) \ 47*9371c9d4SSatish Balay static PetscErrorCode CPPJoin4(Pack, Type, BS, EQ)(PetscSFLink link, PetscInt count, PetscInt start, PetscSFPackOpt opt, const PetscInt *idx, const void *unpacked, void *packed) { \ 48b23bfdefSJunchao Zhang const Type *u = (const Type *)unpacked, *u2; \ 49b23bfdefSJunchao Zhang Type *p = (Type *)packed, *p2; \ 50fcc7397dSJunchao Zhang PetscInt i, j, k, X, Y, r, bs = link->bs; \ 51fcc7397dSJunchao Zhang const PetscInt M = (EQ) ? 1 : bs / BS; /* If EQ, then M=1 enables compiler's const-propagation */ \ 52b23bfdefSJunchao Zhang const PetscInt MBS = M * BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */ \ 5340e23c03SJunchao Zhang PetscFunctionBegin; \ 549566063dSJacob Faibussowitsch if (!idx) PetscCall(PetscArraycpy(p, u + start * MBS, MBS * count)); /* idx[] are contiguous */ \ 55*9371c9d4SSatish Balay else if (opt) { /* has optimizations available */ p2 = p; \ 56fcc7397dSJunchao Zhang for (r = 0; r < opt->n; r++) { \ 57fcc7397dSJunchao Zhang u2 = u + opt->start[r] * MBS; \ 58fcc7397dSJunchao Zhang X = opt->X[r]; \ 59fcc7397dSJunchao Zhang Y = opt->Y[r]; \ 60fcc7397dSJunchao Zhang for (k = 0; k < opt->dz[r]; k++) \ 61fcc7397dSJunchao Zhang for (j = 0; j < opt->dy[r]; j++) { \ 629566063dSJacob Faibussowitsch PetscCall(PetscArraycpy(p2, u2 + (X * Y * k + X * j) * MBS, opt->dx[r] * MBS)); \ 63fcc7397dSJunchao Zhang p2 += opt->dx[r] * MBS; \ 64fcc7397dSJunchao Zhang } \ 65fcc7397dSJunchao Zhang } \ 66fcc7397dSJunchao Zhang } else { \ 67b23bfdefSJunchao Zhang for (i = 0; i < count; i++) \ 68eb02082bSJunchao Zhang for (j = 0; j < M; j++) /* Decent compilers should eliminate this loop when M = const 1 */ \ 69eb02082bSJunchao Zhang for (k = 0; k < BS; k++) /* Compiler either unrolls (BS=1) or vectorizes (BS=2,4,8,etc) this loop */ \ 70b23bfdefSJunchao Zhang p[i * MBS + j * BS + k] = u[idx[i] * MBS + j * BS + k]; \ 7140e23c03SJunchao Zhang } \ 7240e23c03SJunchao Zhang PetscFunctionReturn(0); \ 7340e23c03SJunchao Zhang } 7440e23c03SJunchao Zhang 75cd620004SJunchao Zhang /* DEF_Action - macro defining a UnpackAndInsert routine that unpacks data from a contiguous buffer 76cd620004SJunchao Zhang and inserts into a sparse array. 7740e23c03SJunchao Zhang 7840e23c03SJunchao Zhang Arguments: 79b23bfdefSJunchao Zhang .Type Type of the data 8040e23c03SJunchao Zhang .BS Block size for vectorization 81b23bfdefSJunchao Zhang .EQ (bs == BS) ? 1 : 0. EQ is a compile-time const. 8240e23c03SJunchao Zhang 8340e23c03SJunchao Zhang Notes: 8440e23c03SJunchao Zhang This macro is not combined with DEF_ActionAndOp because we want to use memcpy in this macro. 8540e23c03SJunchao Zhang */ 86cd620004SJunchao Zhang #define DEF_UnpackFunc(Type, BS, EQ) \ 87*9371c9d4SSatish Balay static PetscErrorCode CPPJoin4(UnpackAndInsert, Type, BS, EQ)(PetscSFLink link, PetscInt count, PetscInt start, PetscSFPackOpt opt, const PetscInt *idx, void *unpacked, const void *packed) { \ 88b23bfdefSJunchao Zhang Type *u = (Type *)unpacked, *u2; \ 89fcc7397dSJunchao Zhang const Type *p = (const Type *)packed; \ 90fcc7397dSJunchao Zhang PetscInt i, j, k, X, Y, r, bs = link->bs; \ 91fcc7397dSJunchao Zhang const PetscInt M = (EQ) ? 1 : bs / BS; /* If EQ, then M=1 enables compiler's const-propagation */ \ 92b23bfdefSJunchao Zhang const PetscInt MBS = M * BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */ \ 9340e23c03SJunchao Zhang PetscFunctionBegin; \ 94b23bfdefSJunchao Zhang if (!idx) { \ 95fcc7397dSJunchao Zhang u += start * MBS; \ 969566063dSJacob Faibussowitsch if (u != p) PetscCall(PetscArraycpy(u, p, count *MBS)); \ 97fcc7397dSJunchao Zhang } else if (opt) { /* has optimizations available */ \ 98fcc7397dSJunchao Zhang for (r = 0; r < opt->n; r++) { \ 99fcc7397dSJunchao Zhang u2 = u + opt->start[r] * MBS; \ 100fcc7397dSJunchao Zhang X = opt->X[r]; \ 101fcc7397dSJunchao Zhang Y = opt->Y[r]; \ 102fcc7397dSJunchao Zhang for (k = 0; k < opt->dz[r]; k++) \ 103fcc7397dSJunchao Zhang for (j = 0; j < opt->dy[r]; j++) { \ 1049566063dSJacob Faibussowitsch PetscCall(PetscArraycpy(u2 + (X * Y * k + X * j) * MBS, p, opt->dx[r] * MBS)); \ 105fcc7397dSJunchao Zhang p += opt->dx[r] * MBS; \ 106fcc7397dSJunchao Zhang } \ 107fcc7397dSJunchao Zhang } \ 108fcc7397dSJunchao Zhang } else { \ 109b23bfdefSJunchao Zhang for (i = 0; i < count; i++) \ 110b23bfdefSJunchao Zhang for (j = 0; j < M; j++) \ 111cd620004SJunchao Zhang for (k = 0; k < BS; k++) u[idx[i] * MBS + j * BS + k] = p[i * MBS + j * BS + k]; \ 11240e23c03SJunchao Zhang } \ 11340e23c03SJunchao Zhang PetscFunctionReturn(0); \ 11440e23c03SJunchao Zhang } 11540e23c03SJunchao Zhang 116cd620004SJunchao Zhang /* DEF_UnpackAndOp - macro defining a UnpackAndOp routine where Op should not be Insert 11740e23c03SJunchao Zhang 11840e23c03SJunchao Zhang Arguments: 119cd620004SJunchao Zhang +Opname Name of the Op, such as Add, Mult, LAND, etc. 120b23bfdefSJunchao Zhang .Type Type of the data 12140e23c03SJunchao Zhang .BS Block size for vectorization 122b23bfdefSJunchao Zhang .EQ (bs == BS) ? 1 : 0. EQ is a compile-time const. 123cd620004SJunchao Zhang .Op Operator for the op, such as +, *, &&, ||, PetscMax, PetscMin, etc. 124cd620004SJunchao Zhang .OpApply Macro defining application of the op. Could be OP_BINARY, OP_FUNCTION, OP_LXOR 12540e23c03SJunchao Zhang */ 126cd620004SJunchao Zhang #define DEF_UnpackAndOp(Type, BS, EQ, Opname, Op, OpApply) \ 127*9371c9d4SSatish Balay static PetscErrorCode CPPJoin4(UnpackAnd##Opname, Type, BS, EQ)(PetscSFLink link, PetscInt count, PetscInt start, PetscSFPackOpt opt, const PetscInt *idx, void *unpacked, const void *packed) { \ 128cd620004SJunchao Zhang Type *u = (Type *)unpacked, *u2; \ 129fcc7397dSJunchao Zhang const Type *p = (const Type *)packed; \ 130fcc7397dSJunchao Zhang PetscInt i, j, k, X, Y, r, bs = link->bs; \ 131fcc7397dSJunchao Zhang const PetscInt M = (EQ) ? 1 : bs / BS; /* If EQ, then M=1 enables compiler's const-propagation */ \ 132b23bfdefSJunchao Zhang const PetscInt MBS = M * BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */ \ 13340e23c03SJunchao Zhang PetscFunctionBegin; \ 134b23bfdefSJunchao Zhang if (!idx) { \ 135fcc7397dSJunchao Zhang u += start * MBS; \ 136cd620004SJunchao Zhang for (i = 0; i < count; i++) \ 137cd620004SJunchao Zhang for (j = 0; j < M; j++) \ 138*9371c9d4SSatish Balay for (k = 0; k < BS; k++) OpApply(Op, u[i * MBS + j * BS + k], p[i * MBS + j * BS + k]); \ 139fcc7397dSJunchao Zhang } else if (opt) { /* idx[] has patterns */ \ 140fcc7397dSJunchao Zhang for (r = 0; r < opt->n; r++) { \ 141fcc7397dSJunchao Zhang u2 = u + opt->start[r] * MBS; \ 142fcc7397dSJunchao Zhang X = opt->X[r]; \ 143fcc7397dSJunchao Zhang Y = opt->Y[r]; \ 144fcc7397dSJunchao Zhang for (k = 0; k < opt->dz[r]; k++) \ 145fcc7397dSJunchao Zhang for (j = 0; j < opt->dy[r]; j++) { \ 146fcc7397dSJunchao Zhang for (i = 0; i < opt->dx[r] * MBS; i++) OpApply(Op, u2[(X * Y * k + X * j) * MBS + i], p[i]); \ 147fcc7397dSJunchao Zhang p += opt->dx[r] * MBS; \ 148fcc7397dSJunchao Zhang } \ 149fcc7397dSJunchao Zhang } \ 150fcc7397dSJunchao Zhang } else { \ 151cd620004SJunchao Zhang for (i = 0; i < count; i++) \ 152cd620004SJunchao Zhang for (j = 0; j < M; j++) \ 153*9371c9d4SSatish Balay for (k = 0; k < BS; k++) OpApply(Op, u[idx[i] * MBS + j * BS + k], p[i * MBS + j * BS + k]); \ 154cd620004SJunchao Zhang } \ 155cd620004SJunchao Zhang PetscFunctionReturn(0); \ 156cd620004SJunchao Zhang } 157cd620004SJunchao Zhang 158cd620004SJunchao Zhang #define DEF_FetchAndOp(Type, BS, EQ, Opname, Op, OpApply) \ 159*9371c9d4SSatish Balay static PetscErrorCode CPPJoin4(FetchAnd##Opname, Type, BS, EQ)(PetscSFLink link, PetscInt count, PetscInt start, PetscSFPackOpt opt, const PetscInt *idx, void *unpacked, void *packed) { \ 160fcc7397dSJunchao Zhang Type *u = (Type *)unpacked, *p = (Type *)packed, tmp; \ 161fcc7397dSJunchao Zhang PetscInt i, j, k, r, l, bs = link->bs; \ 162fcc7397dSJunchao Zhang const PetscInt M = (EQ) ? 1 : bs / BS; \ 163fcc7397dSJunchao Zhang const PetscInt MBS = M * BS; \ 164cd620004SJunchao Zhang PetscFunctionBegin; \ 165fcc7397dSJunchao Zhang for (i = 0; i < count; i++) { \ 166fcc7397dSJunchao Zhang r = (!idx ? start + i : idx[i]) * MBS; \ 167fcc7397dSJunchao Zhang l = i * MBS; \ 168b23bfdefSJunchao Zhang for (j = 0; j < M; j++) \ 169b23bfdefSJunchao Zhang for (k = 0; k < BS; k++) { \ 170fcc7397dSJunchao Zhang tmp = u[r + j * BS + k]; \ 171fcc7397dSJunchao Zhang OpApply(Op, u[r + j * BS + k], p[l + j * BS + k]); \ 172fcc7397dSJunchao Zhang p[l + j * BS + k] = tmp; \ 173cd620004SJunchao Zhang } \ 174cd620004SJunchao Zhang } \ 175cd620004SJunchao Zhang PetscFunctionReturn(0); \ 176cd620004SJunchao Zhang } 177cd620004SJunchao Zhang 178cd620004SJunchao Zhang #define DEF_ScatterAndOp(Type, BS, EQ, Opname, Op, OpApply) \ 179*9371c9d4SSatish Balay static PetscErrorCode CPPJoin4(ScatterAnd##Opname, Type, BS, EQ)(PetscSFLink link, PetscInt count, PetscInt srcStart, PetscSFPackOpt srcOpt, const PetscInt *srcIdx, const void *src, PetscInt dstStart, PetscSFPackOpt dstOpt, const PetscInt *dstIdx, void *dst) { \ 180fcc7397dSJunchao Zhang const Type *u = (const Type *)src; \ 181fcc7397dSJunchao Zhang Type *v = (Type *)dst; \ 182fcc7397dSJunchao Zhang PetscInt i, j, k, s, t, X, Y, bs = link->bs; \ 183cd620004SJunchao Zhang const PetscInt M = (EQ) ? 1 : bs / BS; \ 184cd620004SJunchao Zhang const PetscInt MBS = M * BS; \ 185cd620004SJunchao Zhang PetscFunctionBegin; \ 186fcc7397dSJunchao Zhang if (!srcIdx) { /* src is contiguous */ \ 187fcc7397dSJunchao Zhang u += srcStart * MBS; \ 1889566063dSJacob Faibussowitsch PetscCall(CPPJoin4(UnpackAnd##Opname, Type, BS, EQ)(link, count, dstStart, dstOpt, dstIdx, dst, u)); \ 189fcc7397dSJunchao Zhang } else if (srcOpt && !dstIdx) { /* src is 3D, dst is contiguous */ \ 190fcc7397dSJunchao Zhang u += srcOpt->start[0] * MBS; \ 191fcc7397dSJunchao Zhang v += dstStart * MBS; \ 192*9371c9d4SSatish Balay X = srcOpt->X[0]; \ 193*9371c9d4SSatish Balay Y = srcOpt->Y[0]; \ 194fcc7397dSJunchao Zhang for (k = 0; k < srcOpt->dz[0]; k++) \ 195fcc7397dSJunchao Zhang for (j = 0; j < srcOpt->dy[0]; j++) { \ 196fcc7397dSJunchao Zhang for (i = 0; i < srcOpt->dx[0] * MBS; i++) OpApply(Op, v[i], u[(X * Y * k + X * j) * MBS + i]); \ 197fcc7397dSJunchao Zhang v += srcOpt->dx[0] * MBS; \ 198fcc7397dSJunchao Zhang } \ 199fcc7397dSJunchao Zhang } else { /* all other cases */ \ 200fcc7397dSJunchao Zhang for (i = 0; i < count; i++) { \ 201fcc7397dSJunchao Zhang s = (!srcIdx ? srcStart + i : srcIdx[i]) * MBS; \ 202fcc7397dSJunchao Zhang t = (!dstIdx ? dstStart + i : dstIdx[i]) * MBS; \ 203cd620004SJunchao Zhang for (j = 0; j < M; j++) \ 204fcc7397dSJunchao Zhang for (k = 0; k < BS; k++) OpApply(Op, v[t + j * BS + k], u[s + j * BS + k]); \ 205fcc7397dSJunchao Zhang } \ 206cd620004SJunchao Zhang } \ 207cd620004SJunchao Zhang PetscFunctionReturn(0); \ 208cd620004SJunchao Zhang } 209cd620004SJunchao Zhang 210cd620004SJunchao Zhang #define DEF_FetchAndOpLocal(Type, BS, EQ, Opname, Op, OpApply) \ 211*9371c9d4SSatish Balay static PetscErrorCode CPPJoin4(FetchAnd##Opname##Local, Type, BS, EQ)(PetscSFLink link, PetscInt count, PetscInt rootstart, PetscSFPackOpt rootopt, const PetscInt *rootidx, void *rootdata, PetscInt leafstart, PetscSFPackOpt leafopt, const PetscInt *leafidx, const void *leafdata, void *leafupdate) { \ 212fcc7397dSJunchao Zhang Type *rdata = (Type *)rootdata, *lupdate = (Type *)leafupdate; \ 213fcc7397dSJunchao Zhang const Type *ldata = (const Type *)leafdata; \ 214fcc7397dSJunchao Zhang PetscInt i, j, k, r, l, bs = link->bs; \ 215cd620004SJunchao Zhang const PetscInt M = (EQ) ? 1 : bs / BS; \ 216cd620004SJunchao Zhang const PetscInt MBS = M * BS; \ 217cd620004SJunchao Zhang PetscFunctionBegin; \ 218fcc7397dSJunchao Zhang for (i = 0; i < count; i++) { \ 219fcc7397dSJunchao Zhang r = (rootidx ? rootidx[i] : rootstart + i) * MBS; \ 220fcc7397dSJunchao Zhang l = (leafidx ? leafidx[i] : leafstart + i) * MBS; \ 221cd620004SJunchao Zhang for (j = 0; j < M; j++) \ 222cd620004SJunchao Zhang for (k = 0; k < BS; k++) { \ 223fcc7397dSJunchao Zhang lupdate[l + j * BS + k] = rdata[r + j * BS + k]; \ 224fcc7397dSJunchao Zhang OpApply(Op, rdata[r + j * BS + k], ldata[l + j * BS + k]); \ 22540e23c03SJunchao Zhang } \ 22640e23c03SJunchao Zhang } \ 22740e23c03SJunchao Zhang PetscFunctionReturn(0); \ 22840e23c03SJunchao Zhang } 22940e23c03SJunchao Zhang 230b23bfdefSJunchao Zhang /* Pack, Unpack/Fetch ops */ 231b23bfdefSJunchao Zhang #define DEF_Pack(Type, BS, EQ) \ 232*9371c9d4SSatish Balay DEF_PackFunc(Type, BS, EQ) DEF_UnpackFunc(Type, BS, EQ) DEF_ScatterAndOp(Type, BS, EQ, Insert, =, OP_ASSIGN) static void CPPJoin4(PackInit_Pack, Type, BS, EQ)(PetscSFLink link) { \ 233eb02082bSJunchao Zhang link->h_Pack = CPPJoin4(Pack, Type, BS, EQ); \ 234eb02082bSJunchao Zhang link->h_UnpackAndInsert = CPPJoin4(UnpackAndInsert, Type, BS, EQ); \ 235cd620004SJunchao Zhang link->h_ScatterAndInsert = CPPJoin4(ScatterAndInsert, Type, BS, EQ); \ 23640e23c03SJunchao Zhang } 23740e23c03SJunchao Zhang 238b23bfdefSJunchao Zhang /* Add, Mult ops */ 239b23bfdefSJunchao Zhang #define DEF_Add(Type, BS, EQ) \ 240*9371c9d4SSatish Balay DEF_UnpackAndOp(Type, BS, EQ, Add, +, OP_BINARY) DEF_UnpackAndOp(Type, BS, EQ, Mult, *, OP_BINARY) DEF_FetchAndOp(Type, BS, EQ, Add, +, OP_BINARY) DEF_ScatterAndOp(Type, BS, EQ, Add, +, OP_BINARY) DEF_ScatterAndOp(Type, BS, EQ, Mult, *, OP_BINARY) DEF_FetchAndOpLocal(Type, BS, EQ, Add, +, OP_BINARY) static void CPPJoin4(PackInit_Add, Type, BS, EQ)(PetscSFLink link) { \ 241eb02082bSJunchao Zhang link->h_UnpackAndAdd = CPPJoin4(UnpackAndAdd, Type, BS, EQ); \ 242eb02082bSJunchao Zhang link->h_UnpackAndMult = CPPJoin4(UnpackAndMult, Type, BS, EQ); \ 243eb02082bSJunchao Zhang link->h_FetchAndAdd = CPPJoin4(FetchAndAdd, Type, BS, EQ); \ 244cd620004SJunchao Zhang link->h_ScatterAndAdd = CPPJoin4(ScatterAndAdd, Type, BS, EQ); \ 245cd620004SJunchao Zhang link->h_ScatterAndMult = CPPJoin4(ScatterAndMult, Type, BS, EQ); \ 246cd620004SJunchao Zhang link->h_FetchAndAddLocal = CPPJoin4(FetchAndAddLocal, Type, BS, EQ); \ 24740e23c03SJunchao Zhang } 24840e23c03SJunchao Zhang 249b23bfdefSJunchao Zhang /* Max, Min ops */ 250b23bfdefSJunchao Zhang #define DEF_Cmp(Type, BS, EQ) \ 251*9371c9d4SSatish Balay DEF_UnpackAndOp(Type, BS, EQ, Max, PetscMax, OP_FUNCTION) DEF_UnpackAndOp(Type, BS, EQ, Min, PetscMin, OP_FUNCTION) DEF_ScatterAndOp(Type, BS, EQ, Max, PetscMax, OP_FUNCTION) DEF_ScatterAndOp(Type, BS, EQ, Min, PetscMin, OP_FUNCTION) static void CPPJoin4(PackInit_Compare, Type, BS, EQ)(PetscSFLink link) { \ 252eb02082bSJunchao Zhang link->h_UnpackAndMax = CPPJoin4(UnpackAndMax, Type, BS, EQ); \ 253eb02082bSJunchao Zhang link->h_UnpackAndMin = CPPJoin4(UnpackAndMin, Type, BS, EQ); \ 254cd620004SJunchao Zhang link->h_ScatterAndMax = CPPJoin4(ScatterAndMax, Type, BS, EQ); \ 255cd620004SJunchao Zhang link->h_ScatterAndMin = CPPJoin4(ScatterAndMin, Type, BS, EQ); \ 256b23bfdefSJunchao Zhang } 257b23bfdefSJunchao Zhang 258b23bfdefSJunchao Zhang /* Logical ops. 259cd620004SJunchao Zhang The operator in OP_LXOR should be empty but is ||. It is not used. Put here to avoid 26040e23c03SJunchao Zhang the compilation warning "empty macro arguments are undefined in ISO C90" 26140e23c03SJunchao Zhang */ 262b23bfdefSJunchao Zhang #define DEF_Log(Type, BS, EQ) \ 263*9371c9d4SSatish Balay DEF_UnpackAndOp(Type, BS, EQ, LAND, &&, OP_BINARY) DEF_UnpackAndOp(Type, BS, EQ, LOR, ||, OP_BINARY) DEF_UnpackAndOp(Type, BS, EQ, LXOR, ||, OP_LXOR) DEF_ScatterAndOp(Type, BS, EQ, LAND, &&, OP_BINARY) DEF_ScatterAndOp(Type, BS, EQ, LOR, ||, OP_BINARY) DEF_ScatterAndOp(Type, BS, EQ, LXOR, ||, OP_LXOR) static void CPPJoin4(PackInit_Logical, Type, BS, EQ)(PetscSFLink link) { \ 264eb02082bSJunchao Zhang link->h_UnpackAndLAND = CPPJoin4(UnpackAndLAND, Type, BS, EQ); \ 265eb02082bSJunchao Zhang link->h_UnpackAndLOR = CPPJoin4(UnpackAndLOR, Type, BS, EQ); \ 266eb02082bSJunchao Zhang link->h_UnpackAndLXOR = CPPJoin4(UnpackAndLXOR, Type, BS, EQ); \ 267cd620004SJunchao Zhang link->h_ScatterAndLAND = CPPJoin4(ScatterAndLAND, Type, BS, EQ); \ 268cd620004SJunchao Zhang link->h_ScatterAndLOR = CPPJoin4(ScatterAndLOR, Type, BS, EQ); \ 269cd620004SJunchao Zhang link->h_ScatterAndLXOR = CPPJoin4(ScatterAndLXOR, Type, BS, EQ); \ 27040e23c03SJunchao Zhang } 27140e23c03SJunchao Zhang 272b23bfdefSJunchao Zhang /* Bitwise ops */ 273b23bfdefSJunchao Zhang #define DEF_Bit(Type, BS, EQ) \ 274*9371c9d4SSatish Balay DEF_UnpackAndOp(Type, BS, EQ, BAND, &, OP_BINARY) DEF_UnpackAndOp(Type, BS, EQ, BOR, |, OP_BINARY) DEF_UnpackAndOp(Type, BS, EQ, BXOR, ^, OP_BINARY) DEF_ScatterAndOp(Type, BS, EQ, BAND, &, OP_BINARY) DEF_ScatterAndOp(Type, BS, EQ, BOR, |, OP_BINARY) DEF_ScatterAndOp(Type, BS, EQ, BXOR, ^, OP_BINARY) static void CPPJoin4(PackInit_Bitwise, Type, BS, EQ)(PetscSFLink link) { \ 275eb02082bSJunchao Zhang link->h_UnpackAndBAND = CPPJoin4(UnpackAndBAND, Type, BS, EQ); \ 276eb02082bSJunchao Zhang link->h_UnpackAndBOR = CPPJoin4(UnpackAndBOR, Type, BS, EQ); \ 277eb02082bSJunchao Zhang link->h_UnpackAndBXOR = CPPJoin4(UnpackAndBXOR, Type, BS, EQ); \ 278cd620004SJunchao Zhang link->h_ScatterAndBAND = CPPJoin4(ScatterAndBAND, Type, BS, EQ); \ 279cd620004SJunchao Zhang link->h_ScatterAndBOR = CPPJoin4(ScatterAndBOR, Type, BS, EQ); \ 280cd620004SJunchao Zhang link->h_ScatterAndBXOR = CPPJoin4(ScatterAndBXOR, Type, BS, EQ); \ 28140e23c03SJunchao Zhang } 28240e23c03SJunchao Zhang 283cd620004SJunchao Zhang /* Maxloc, Minloc ops */ 284cd620004SJunchao Zhang #define DEF_Xloc(Type, BS, EQ) \ 285*9371c9d4SSatish Balay DEF_UnpackAndOp(Type, BS, EQ, Max, >, OP_XLOC) DEF_UnpackAndOp(Type, BS, EQ, Min, <, OP_XLOC) DEF_ScatterAndOp(Type, BS, EQ, Max, >, OP_XLOC) DEF_ScatterAndOp(Type, BS, EQ, Min, <, OP_XLOC) static void CPPJoin4(PackInit_Xloc, Type, BS, EQ)(PetscSFLink link) { \ 286cd620004SJunchao Zhang link->h_UnpackAndMaxloc = CPPJoin4(UnpackAndMax, Type, BS, EQ); \ 287cd620004SJunchao Zhang link->h_UnpackAndMinloc = CPPJoin4(UnpackAndMin, Type, BS, EQ); \ 288cd620004SJunchao Zhang link->h_ScatterAndMaxloc = CPPJoin4(ScatterAndMax, Type, BS, EQ); \ 289cd620004SJunchao Zhang link->h_ScatterAndMinloc = CPPJoin4(ScatterAndMin, Type, BS, EQ); \ 29040e23c03SJunchao Zhang } 29140e23c03SJunchao Zhang 292b23bfdefSJunchao Zhang #define DEF_IntegerType(Type, BS, EQ) \ 293*9371c9d4SSatish Balay DEF_Pack(Type, BS, EQ) DEF_Add(Type, BS, EQ) DEF_Cmp(Type, BS, EQ) DEF_Log(Type, BS, EQ) DEF_Bit(Type, BS, EQ) static void CPPJoin4(PackInit_IntegerType, Type, BS, EQ)(PetscSFLink link) { \ 294b23bfdefSJunchao Zhang CPPJoin4(PackInit_Pack, Type, BS, EQ)(link); \ 295b23bfdefSJunchao Zhang CPPJoin4(PackInit_Add, Type, BS, EQ)(link); \ 296b23bfdefSJunchao Zhang CPPJoin4(PackInit_Compare, Type, BS, EQ)(link); \ 297b23bfdefSJunchao Zhang CPPJoin4(PackInit_Logical, Type, BS, EQ)(link); \ 298b23bfdefSJunchao Zhang CPPJoin4(PackInit_Bitwise, Type, BS, EQ)(link); \ 29940e23c03SJunchao Zhang } 30040e23c03SJunchao Zhang 301b23bfdefSJunchao Zhang #define DEF_RealType(Type, BS, EQ) \ 302*9371c9d4SSatish Balay DEF_Pack(Type, BS, EQ) DEF_Add(Type, BS, EQ) DEF_Cmp(Type, BS, EQ) static void CPPJoin4(PackInit_RealType, Type, BS, EQ)(PetscSFLink link) { \ 303b23bfdefSJunchao Zhang CPPJoin4(PackInit_Pack, Type, BS, EQ)(link); \ 304b23bfdefSJunchao Zhang CPPJoin4(PackInit_Add, Type, BS, EQ)(link); \ 305b23bfdefSJunchao Zhang CPPJoin4(PackInit_Compare, Type, BS, EQ)(link); \ 306b23bfdefSJunchao Zhang } 30740e23c03SJunchao Zhang 30840e23c03SJunchao Zhang #if defined(PETSC_HAVE_COMPLEX) 309b23bfdefSJunchao Zhang #define DEF_ComplexType(Type, BS, EQ) \ 310*9371c9d4SSatish Balay DEF_Pack(Type, BS, EQ) DEF_Add(Type, BS, EQ) static void CPPJoin4(PackInit_ComplexType, Type, BS, EQ)(PetscSFLink link) { \ 311b23bfdefSJunchao Zhang CPPJoin4(PackInit_Pack, Type, BS, EQ)(link); \ 312b23bfdefSJunchao Zhang CPPJoin4(PackInit_Add, Type, BS, EQ)(link); \ 313b23bfdefSJunchao Zhang } 31440e23c03SJunchao Zhang #endif 315b23bfdefSJunchao Zhang 316b23bfdefSJunchao Zhang #define DEF_DumbType(Type, BS, EQ) \ 317*9371c9d4SSatish Balay DEF_Pack(Type, BS, EQ) static void CPPJoin4(PackInit_DumbType, Type, BS, EQ)(PetscSFLink link) { \ 318b23bfdefSJunchao Zhang CPPJoin4(PackInit_Pack, Type, BS, EQ)(link); \ 319b23bfdefSJunchao Zhang } 320b23bfdefSJunchao Zhang 321b23bfdefSJunchao Zhang /* Maxloc, Minloc */ 322cd620004SJunchao Zhang #define DEF_PairType(Type, BS, EQ) \ 323*9371c9d4SSatish Balay DEF_Pack(Type, BS, EQ) DEF_Xloc(Type, BS, EQ) static void CPPJoin4(PackInit_PairType, Type, BS, EQ)(PetscSFLink link) { \ 324cd620004SJunchao Zhang CPPJoin4(PackInit_Pack, Type, BS, EQ)(link); \ 325cd620004SJunchao Zhang CPPJoin4(PackInit_Xloc, Type, BS, EQ)(link); \ 326b23bfdefSJunchao Zhang } 327b23bfdefSJunchao Zhang 328b23bfdefSJunchao Zhang DEF_IntegerType(PetscInt, 1, 1) /* unit = 1 MPIU_INT */ 329b23bfdefSJunchao Zhang DEF_IntegerType(PetscInt, 2, 1) /* unit = 2 MPIU_INTs */ 330b23bfdefSJunchao Zhang DEF_IntegerType(PetscInt, 4, 1) /* unit = 4 MPIU_INTs */ 331b23bfdefSJunchao Zhang DEF_IntegerType(PetscInt, 8, 1) /* unit = 8 MPIU_INTs */ 332b23bfdefSJunchao Zhang DEF_IntegerType(PetscInt, 1, 0) /* unit = 1*n MPIU_INTs, n>1 */ 333b23bfdefSJunchao Zhang DEF_IntegerType(PetscInt, 2, 0) /* unit = 2*n MPIU_INTs, n>1 */ 334b23bfdefSJunchao Zhang DEF_IntegerType(PetscInt, 4, 0) /* unit = 4*n MPIU_INTs, n>1 */ 335b23bfdefSJunchao Zhang DEF_IntegerType(PetscInt, 8, 0) /* unit = 8*n MPIU_INTs, n>1. Routines with bigger BS are tried first. */ 336b23bfdefSJunchao Zhang 337b23bfdefSJunchao Zhang #if defined(PETSC_USE_64BIT_INDICES) /* Do not need (though it is OK) to generate redundant functions if PetscInt is int */ 338*9371c9d4SSatish Balay DEF_IntegerType(int, 1, 1) DEF_IntegerType(int, 2, 1) DEF_IntegerType(int, 4, 1) DEF_IntegerType(int, 8, 1) DEF_IntegerType(int, 1, 0) DEF_IntegerType(int, 2, 0) DEF_IntegerType(int, 4, 0) DEF_IntegerType(int, 8, 0) 339b23bfdefSJunchao Zhang #endif 340b23bfdefSJunchao Zhang 341b23bfdefSJunchao Zhang /* The typedefs are used to get a typename without space that CPPJoin can handle */ 342b23bfdefSJunchao Zhang typedef signed char SignedChar; 343*9371c9d4SSatish Balay DEF_IntegerType(SignedChar, 1, 1) DEF_IntegerType(SignedChar, 2, 1) DEF_IntegerType(SignedChar, 4, 1) DEF_IntegerType(SignedChar, 8, 1) DEF_IntegerType(SignedChar, 1, 0) DEF_IntegerType(SignedChar, 2, 0) DEF_IntegerType(SignedChar, 4, 0) DEF_IntegerType(SignedChar, 8, 0) 344b23bfdefSJunchao Zhang 345b23bfdefSJunchao Zhang typedef unsigned char UnsignedChar; 346*9371c9d4SSatish Balay DEF_IntegerType(UnsignedChar, 1, 1) DEF_IntegerType(UnsignedChar, 2, 1) DEF_IntegerType(UnsignedChar, 4, 1) DEF_IntegerType(UnsignedChar, 8, 1) DEF_IntegerType(UnsignedChar, 1, 0) DEF_IntegerType(UnsignedChar, 2, 0) DEF_IntegerType(UnsignedChar, 4, 0) DEF_IntegerType(UnsignedChar, 8, 0) 347b23bfdefSJunchao Zhang 348*9371c9d4SSatish Balay DEF_RealType(PetscReal, 1, 1) DEF_RealType(PetscReal, 2, 1) DEF_RealType(PetscReal, 4, 1) DEF_RealType(PetscReal, 8, 1) DEF_RealType(PetscReal, 1, 0) DEF_RealType(PetscReal, 2, 0) DEF_RealType(PetscReal, 4, 0) DEF_RealType(PetscReal, 8, 0) 349b23bfdefSJunchao Zhang #if defined(PETSC_HAVE_COMPLEX) 350*9371c9d4SSatish Balay DEF_ComplexType(PetscComplex, 1, 1) DEF_ComplexType(PetscComplex, 2, 1) DEF_ComplexType(PetscComplex, 4, 1) DEF_ComplexType(PetscComplex, 8, 1) DEF_ComplexType(PetscComplex, 1, 0) DEF_ComplexType(PetscComplex, 2, 0) DEF_ComplexType(PetscComplex, 4, 0) DEF_ComplexType(PetscComplex, 8, 0) 351b23bfdefSJunchao Zhang #endif 352b23bfdefSJunchao Zhang 353cd620004SJunchao Zhang #define PairType(Type1, Type2) Type1##_##Type2 354*9371c9d4SSatish Balay typedef struct { 355*9371c9d4SSatish Balay int u; 356*9371c9d4SSatish Balay int i; 357*9371c9d4SSatish Balay } PairType(int, int); 358*9371c9d4SSatish Balay typedef struct { 359*9371c9d4SSatish Balay PetscInt u; 360*9371c9d4SSatish Balay PetscInt i; 361*9371c9d4SSatish Balay } PairType(PetscInt, PetscInt); 362*9371c9d4SSatish Balay DEF_PairType(PairType(int, int), 1, 1) DEF_PairType(PairType(PetscInt, PetscInt), 1, 1) 363b23bfdefSJunchao Zhang 364b23bfdefSJunchao Zhang /* If we don't know the basic type, we treat it as a stream of chars or ints */ 365*9371c9d4SSatish Balay DEF_DumbType(char, 1, 1) DEF_DumbType(char, 2, 1) DEF_DumbType(char, 4, 1) DEF_DumbType(char, 1, 0) DEF_DumbType(char, 2, 0) DEF_DumbType(char, 4, 0) 366b23bfdefSJunchao Zhang 367eb02082bSJunchao Zhang typedef int DumbInt; /* To have a different name than 'int' used above. The name is used to make routine names. */ 368*9371c9d4SSatish Balay DEF_DumbType(DumbInt, 1, 1) DEF_DumbType(DumbInt, 2, 1) DEF_DumbType(DumbInt, 4, 1) DEF_DumbType(DumbInt, 8, 1) DEF_DumbType(DumbInt, 1, 0) DEF_DumbType(DumbInt, 2, 0) DEF_DumbType(DumbInt, 4, 0) DEF_DumbType(DumbInt, 8, 0) 36940e23c03SJunchao Zhang 370*9371c9d4SSatish Balay PetscErrorCode PetscSFLinkDestroy(PetscSF sf, PetscSFLink link) { 371cd620004SJunchao Zhang PetscSF_Basic *bas = (PetscSF_Basic *)sf->data; 37271438e86SJunchao Zhang PetscInt i, nreqs = (bas->nrootreqs + sf->nleafreqs) * 8; 37371438e86SJunchao Zhang 37471438e86SJunchao Zhang PetscFunctionBegin; 37571438e86SJunchao Zhang /* Destroy device-specific fields */ 3769566063dSJacob Faibussowitsch if (link->deviceinited) PetscCall((*link->Destroy)(sf, link)); 37771438e86SJunchao Zhang 37871438e86SJunchao Zhang /* Destroy host related fields */ 3799566063dSJacob Faibussowitsch if (!link->isbuiltin) PetscCallMPI(MPI_Type_free(&link->unit)); 38071438e86SJunchao Zhang if (!link->use_nvshmem) { 38171438e86SJunchao Zhang for (i = 0; i < nreqs; i++) { /* Persistent reqs must be freed. */ 3829566063dSJacob Faibussowitsch if (link->reqs[i] != MPI_REQUEST_NULL) PetscCallMPI(MPI_Request_free(&link->reqs[i])); 38371438e86SJunchao Zhang } 3849566063dSJacob Faibussowitsch PetscCall(PetscFree(link->reqs)); 38571438e86SJunchao Zhang for (i = PETSCSF_LOCAL; i <= PETSCSF_REMOTE; i++) { 3869566063dSJacob Faibussowitsch PetscCall(PetscFree(link->rootbuf_alloc[i][PETSC_MEMTYPE_HOST])); 3879566063dSJacob Faibussowitsch PetscCall(PetscFree(link->leafbuf_alloc[i][PETSC_MEMTYPE_HOST])); 38871438e86SJunchao Zhang } 38971438e86SJunchao Zhang } 3909566063dSJacob Faibussowitsch PetscCall(PetscFree(link)); 39171438e86SJunchao Zhang PetscFunctionReturn(0); 39271438e86SJunchao Zhang } 39371438e86SJunchao Zhang 394*9371c9d4SSatish Balay PetscErrorCode PetscSFLinkCreate(PetscSF sf, MPI_Datatype unit, PetscMemType rootmtype, const void *rootdata, PetscMemType leafmtype, const void *leafdata, MPI_Op op, PetscSFOperation sfop, PetscSFLink *mylink) { 395cd620004SJunchao Zhang PetscFunctionBegin; 3969566063dSJacob Faibussowitsch PetscCall(PetscSFSetErrorOnUnsupportedOverlap(sf, unit, rootdata, leafdata)); 39771438e86SJunchao Zhang #if defined(PETSC_HAVE_NVSHMEM) 39871438e86SJunchao Zhang { 39971438e86SJunchao Zhang PetscBool use_nvshmem; 4009566063dSJacob Faibussowitsch PetscCall(PetscSFLinkNvshmemCheck(sf, rootmtype, rootdata, leafmtype, leafdata, &use_nvshmem)); 40171438e86SJunchao Zhang if (use_nvshmem) { 4029566063dSJacob Faibussowitsch PetscCall(PetscSFLinkCreate_NVSHMEM(sf, unit, rootmtype, rootdata, leafmtype, leafdata, op, sfop, mylink)); 40371438e86SJunchao Zhang PetscFunctionReturn(0); 404cd620004SJunchao Zhang } 405cd620004SJunchao Zhang } 4067fd2d3dbSJunchao Zhang #endif 4079566063dSJacob Faibussowitsch PetscCall(PetscSFLinkCreate_MPI(sf, unit, rootmtype, rootdata, leafmtype, leafdata, op, sfop, mylink)); 408cd620004SJunchao Zhang PetscFunctionReturn(0); 409cd620004SJunchao Zhang } 410cd620004SJunchao Zhang 411cd620004SJunchao Zhang /* Return root/leaf buffers and MPI requests attached to the link for MPI communication in the given direction. 412cd620004SJunchao Zhang If the sf uses persistent requests and the requests have not been initialized, then initialize them. 413cd620004SJunchao Zhang */ 414*9371c9d4SSatish Balay PetscErrorCode PetscSFLinkGetMPIBuffersAndRequests(PetscSF sf, PetscSFLink link, PetscSFDirection direction, void **rootbuf, void **leafbuf, MPI_Request **rootreqs, MPI_Request **leafreqs) { 415cd620004SJunchao Zhang PetscSF_Basic *bas = (PetscSF_Basic *)sf->data; 416c87b50c4SJunchao Zhang PetscInt i, j, cnt, nrootranks, ndrootranks, nleafranks, ndleafranks; 417cd620004SJunchao Zhang const PetscInt *rootoffset, *leafoffset; 418cd620004SJunchao Zhang MPI_Aint disp; 419cd620004SJunchao Zhang MPI_Comm comm = PetscObjectComm((PetscObject)sf); 420cd620004SJunchao Zhang MPI_Datatype unit = link->unit; 421cd620004SJunchao Zhang const PetscMemType rootmtype_mpi = link->rootmtype_mpi, leafmtype_mpi = link->leafmtype_mpi; /* Used to select buffers passed to MPI */ 422cd620004SJunchao Zhang const PetscInt rootdirect_mpi = link->rootdirect_mpi, leafdirect_mpi = link->leafdirect_mpi; 423cd620004SJunchao Zhang 424cd620004SJunchao Zhang PetscFunctionBegin; 425cd620004SJunchao Zhang /* Init persistent MPI requests if not yet. Currently only SFBasic uses persistent MPI */ 426cd620004SJunchao Zhang if (sf->persistent) { 427cd620004SJunchao Zhang if (rootreqs && bas->rootbuflen[PETSCSF_REMOTE] && !link->rootreqsinited[direction][rootmtype_mpi][rootdirect_mpi]) { 4289566063dSJacob Faibussowitsch PetscCall(PetscSFGetRootInfo_Basic(sf, &nrootranks, &ndrootranks, NULL, &rootoffset, NULL)); 429cd620004SJunchao Zhang if (direction == PETSCSF_LEAF2ROOT) { 430cd620004SJunchao Zhang for (i = ndrootranks, j = 0; i < nrootranks; i++, j++) { 431cd620004SJunchao Zhang disp = (rootoffset[i] - rootoffset[ndrootranks]) * link->unitbytes; 432c87b50c4SJunchao Zhang cnt = rootoffset[i + 1] - rootoffset[i]; 4339566063dSJacob Faibussowitsch PetscCallMPI(MPIU_Recv_init(link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi] + disp, cnt, unit, bas->iranks[i], link->tag, comm, link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi] + j)); 434cd620004SJunchao Zhang } 435cd620004SJunchao Zhang } else { /* PETSCSF_ROOT2LEAF */ 436cd620004SJunchao Zhang for (i = ndrootranks, j = 0; i < nrootranks; i++, j++) { 437cd620004SJunchao Zhang disp = (rootoffset[i] - rootoffset[ndrootranks]) * link->unitbytes; 438c87b50c4SJunchao Zhang cnt = rootoffset[i + 1] - rootoffset[i]; 4399566063dSJacob Faibussowitsch PetscCallMPI(MPIU_Send_init(link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi] + disp, cnt, unit, bas->iranks[i], link->tag, comm, link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi] + j)); 440cd620004SJunchao Zhang } 441cd620004SJunchao Zhang } 442cd620004SJunchao Zhang link->rootreqsinited[direction][rootmtype_mpi][rootdirect_mpi] = PETSC_TRUE; 443cd620004SJunchao Zhang } 444cd620004SJunchao Zhang 445cd620004SJunchao Zhang if (leafreqs && sf->leafbuflen[PETSCSF_REMOTE] && !link->leafreqsinited[direction][leafmtype_mpi][leafdirect_mpi]) { 4469566063dSJacob Faibussowitsch PetscCall(PetscSFGetLeafInfo_Basic(sf, &nleafranks, &ndleafranks, NULL, &leafoffset, NULL, NULL)); 447cd620004SJunchao Zhang if (direction == PETSCSF_LEAF2ROOT) { 448cd620004SJunchao Zhang for (i = ndleafranks, j = 0; i < nleafranks; i++, j++) { 449cd620004SJunchao Zhang disp = (leafoffset[i] - leafoffset[ndleafranks]) * link->unitbytes; 450c87b50c4SJunchao Zhang cnt = leafoffset[i + 1] - leafoffset[i]; 4519566063dSJacob Faibussowitsch PetscCallMPI(MPIU_Send_init(link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi] + disp, cnt, unit, sf->ranks[i], link->tag, comm, link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi] + j)); 452cd620004SJunchao Zhang } 453cd620004SJunchao Zhang } else { /* PETSCSF_ROOT2LEAF */ 454cd620004SJunchao Zhang for (i = ndleafranks, j = 0; i < nleafranks; i++, j++) { 455cd620004SJunchao Zhang disp = (leafoffset[i] - leafoffset[ndleafranks]) * link->unitbytes; 456c87b50c4SJunchao Zhang cnt = leafoffset[i + 1] - leafoffset[i]; 4579566063dSJacob Faibussowitsch PetscCallMPI(MPIU_Recv_init(link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi] + disp, cnt, unit, sf->ranks[i], link->tag, comm, link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi] + j)); 458cd620004SJunchao Zhang } 459cd620004SJunchao Zhang } 460cd620004SJunchao Zhang link->leafreqsinited[direction][leafmtype_mpi][leafdirect_mpi] = PETSC_TRUE; 461cd620004SJunchao Zhang } 462cd620004SJunchao Zhang } 463cd620004SJunchao Zhang if (rootbuf) *rootbuf = link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi]; 464cd620004SJunchao Zhang if (leafbuf) *leafbuf = link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi]; 465cd620004SJunchao Zhang if (rootreqs) *rootreqs = link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi]; 466cd620004SJunchao Zhang if (leafreqs) *leafreqs = link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi]; 467cd620004SJunchao Zhang PetscFunctionReturn(0); 468cd620004SJunchao Zhang } 469cd620004SJunchao Zhang 470*9371c9d4SSatish Balay PetscErrorCode PetscSFLinkGetInUse(PetscSF sf, MPI_Datatype unit, const void *rootdata, const void *leafdata, PetscCopyMode cmode, PetscSFLink *mylink) { 471cd620004SJunchao Zhang PetscSFLink link, *p; 47240e23c03SJunchao Zhang PetscSF_Basic *bas = (PetscSF_Basic *)sf->data; 47340e23c03SJunchao Zhang 47440e23c03SJunchao Zhang PetscFunctionBegin; 47540e23c03SJunchao Zhang /* Look for types in cache */ 47640e23c03SJunchao Zhang for (p = &bas->inuse; (link = *p); p = &link->next) { 47740e23c03SJunchao Zhang PetscBool match; 4789566063dSJacob Faibussowitsch PetscCall(MPIPetsc_Type_compare(unit, link->unit, &match)); 479637e6665SJunchao Zhang if (match && (rootdata == link->rootdata) && (leafdata == link->leafdata)) { 48040e23c03SJunchao Zhang switch (cmode) { 48140e23c03SJunchao Zhang case PETSC_OWN_POINTER: *p = link->next; break; /* Remove from inuse list */ 48240e23c03SJunchao Zhang case PETSC_USE_POINTER: break; 48340e23c03SJunchao Zhang default: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "invalid cmode"); 48440e23c03SJunchao Zhang } 48540e23c03SJunchao Zhang *mylink = link; 48640e23c03SJunchao Zhang PetscFunctionReturn(0); 48740e23c03SJunchao Zhang } 48840e23c03SJunchao Zhang } 48940e23c03SJunchao Zhang SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Could not find pack"); 49040e23c03SJunchao Zhang } 49140e23c03SJunchao Zhang 492*9371c9d4SSatish Balay PetscErrorCode PetscSFLinkReclaim(PetscSF sf, PetscSFLink *mylink) { 49340e23c03SJunchao Zhang PetscSF_Basic *bas = (PetscSF_Basic *)sf->data; 49471438e86SJunchao Zhang PetscSFLink link = *mylink; 49540e23c03SJunchao Zhang 49640e23c03SJunchao Zhang PetscFunctionBegin; 49771438e86SJunchao Zhang link->rootdata = NULL; 49871438e86SJunchao Zhang link->leafdata = NULL; 49971438e86SJunchao Zhang link->next = bas->avail; 50071438e86SJunchao Zhang bas->avail = link; 50171438e86SJunchao Zhang *mylink = NULL; 502eb02082bSJunchao Zhang PetscFunctionReturn(0); 503eb02082bSJunchao Zhang } 504eb02082bSJunchao Zhang 5059d1c8addSJunchao Zhang /* Error out on unsupported overlapped communications */ 506*9371c9d4SSatish Balay PetscErrorCode PetscSFSetErrorOnUnsupportedOverlap(PetscSF sf, MPI_Datatype unit, const void *rootdata, const void *leafdata) { 507cd620004SJunchao Zhang PetscSFLink link, *p; 5089d1c8addSJunchao Zhang PetscSF_Basic *bas = (PetscSF_Basic *)sf->data; 5099d1c8addSJunchao Zhang PetscBool match; 5109d1c8addSJunchao Zhang 5119d1c8addSJunchao Zhang PetscFunctionBegin; 512b458e8f1SJose E. Roman if (PetscDefined(USE_DEBUG)) { 51318fb5014SJunchao Zhang /* Look up links in use and error out if there is a match. When both rootdata and leafdata are NULL, ignore 51418fb5014SJunchao Zhang the potential overlapping since this process does not participate in communication. Overlapping is harmless. 51518fb5014SJunchao Zhang */ 516637e6665SJunchao Zhang if (rootdata || leafdata) { 5179d1c8addSJunchao Zhang for (p = &bas->inuse; (link = *p); p = &link->next) { 5189566063dSJacob Faibussowitsch PetscCall(MPIPetsc_Type_compare(unit, link->unit, &match)); 519c9cc58a2SBarry Smith PetscCheck(!match || rootdata != link->rootdata || leafdata != link->leafdata, PETSC_COMM_SELF, PETSC_ERR_SUP, "Overlapped PetscSF with the same rootdata(%p), leafdata(%p) and data type. Undo the overlapping to avoid the error.", rootdata, leafdata); 5209d1c8addSJunchao Zhang } 52118fb5014SJunchao Zhang } 522b458e8f1SJose E. Roman } 5239d1c8addSJunchao Zhang PetscFunctionReturn(0); 5249d1c8addSJunchao Zhang } 5259d1c8addSJunchao Zhang 526*9371c9d4SSatish Balay static PetscErrorCode PetscSFLinkMemcpy_Host(PetscSFLink link, PetscMemType dstmtype, void *dst, PetscMemType srcmtype, const void *src, size_t n) { 52720c24465SJunchao Zhang PetscFunctionBegin; 5289566063dSJacob Faibussowitsch if (n) PetscCall(PetscMemcpy(dst, src, n)); 52920c24465SJunchao Zhang PetscFunctionReturn(0); 53020c24465SJunchao Zhang } 53120c24465SJunchao Zhang 532*9371c9d4SSatish Balay PetscErrorCode PetscSFLinkSetUp_Host(PetscSF sf, PetscSFLink link, MPI_Datatype unit) { 533b23bfdefSJunchao Zhang PetscInt nSignedChar = 0, nUnsignedChar = 0, nInt = 0, nPetscInt = 0, nPetscReal = 0; 534b23bfdefSJunchao Zhang PetscBool is2Int, is2PetscInt; 53540e23c03SJunchao Zhang PetscMPIInt ni, na, nd, combiner; 53640e23c03SJunchao Zhang #if defined(PETSC_HAVE_COMPLEX) 537b23bfdefSJunchao Zhang PetscInt nPetscComplex = 0; 53840e23c03SJunchao Zhang #endif 53940e23c03SJunchao Zhang 54040e23c03SJunchao Zhang PetscFunctionBegin; 5419566063dSJacob Faibussowitsch PetscCall(MPIPetsc_Type_compare_contig(unit, MPI_SIGNED_CHAR, &nSignedChar)); 5429566063dSJacob Faibussowitsch PetscCall(MPIPetsc_Type_compare_contig(unit, MPI_UNSIGNED_CHAR, &nUnsignedChar)); 543b23bfdefSJunchao Zhang /* MPI_CHAR is treated below as a dumb type that does not support reduction according to MPI standard */ 5449566063dSJacob Faibussowitsch PetscCall(MPIPetsc_Type_compare_contig(unit, MPI_INT, &nInt)); 5459566063dSJacob Faibussowitsch PetscCall(MPIPetsc_Type_compare_contig(unit, MPIU_INT, &nPetscInt)); 5469566063dSJacob Faibussowitsch PetscCall(MPIPetsc_Type_compare_contig(unit, MPIU_REAL, &nPetscReal)); 54740e23c03SJunchao Zhang #if defined(PETSC_HAVE_COMPLEX) 5489566063dSJacob Faibussowitsch PetscCall(MPIPetsc_Type_compare_contig(unit, MPIU_COMPLEX, &nPetscComplex)); 54940e23c03SJunchao Zhang #endif 5509566063dSJacob Faibussowitsch PetscCall(MPIPetsc_Type_compare(unit, MPI_2INT, &is2Int)); 5519566063dSJacob Faibussowitsch PetscCall(MPIPetsc_Type_compare(unit, MPIU_2INT, &is2PetscInt)); 552b23bfdefSJunchao Zhang /* TODO: shaell we also handle Fortran MPI_2REAL? */ 5539566063dSJacob Faibussowitsch PetscCallMPI(MPI_Type_get_envelope(unit, &ni, &na, &nd, &combiner)); 5545ad15460SJunchao Zhang link->isbuiltin = (combiner == MPI_COMBINER_NAMED) ? PETSC_TRUE : PETSC_FALSE; /* unit is MPI builtin */ 555b23bfdefSJunchao Zhang link->bs = 1; /* default */ 55640e23c03SJunchao Zhang 557eb02082bSJunchao Zhang if (is2Int) { 558cd620004SJunchao Zhang PackInit_PairType_int_int_1_1(link); 559eb02082bSJunchao Zhang link->bs = 1; 560eb02082bSJunchao Zhang link->unitbytes = 2 * sizeof(int); 5615ad15460SJunchao Zhang link->isbuiltin = PETSC_TRUE; /* unit is PETSc builtin */ 562eb02082bSJunchao Zhang link->basicunit = MPI_2INT; 5635ad15460SJunchao Zhang link->unit = MPI_2INT; 564eb02082bSJunchao Zhang } else if (is2PetscInt) { /* TODO: when is2PetscInt and nPetscInt=2, we don't know which path to take. The two paths support different ops. */ 565cd620004SJunchao Zhang PackInit_PairType_PetscInt_PetscInt_1_1(link); 566eb02082bSJunchao Zhang link->bs = 1; 567eb02082bSJunchao Zhang link->unitbytes = 2 * sizeof(PetscInt); 568eb02082bSJunchao Zhang link->basicunit = MPIU_2INT; 5695ad15460SJunchao Zhang link->isbuiltin = PETSC_TRUE; /* unit is PETSc builtin */ 5705ad15460SJunchao Zhang link->unit = MPIU_2INT; 571eb02082bSJunchao Zhang } else if (nPetscReal) { 572*9371c9d4SSatish Balay if (nPetscReal == 8) PackInit_RealType_PetscReal_8_1(link); 573*9371c9d4SSatish Balay else if (nPetscReal % 8 == 0) PackInit_RealType_PetscReal_8_0(link); 574*9371c9d4SSatish Balay else if (nPetscReal == 4) PackInit_RealType_PetscReal_4_1(link); 575*9371c9d4SSatish Balay else if (nPetscReal % 4 == 0) PackInit_RealType_PetscReal_4_0(link); 576*9371c9d4SSatish Balay else if (nPetscReal == 2) PackInit_RealType_PetscReal_2_1(link); 577*9371c9d4SSatish Balay else if (nPetscReal % 2 == 0) PackInit_RealType_PetscReal_2_0(link); 578*9371c9d4SSatish Balay else if (nPetscReal == 1) PackInit_RealType_PetscReal_1_1(link); 579*9371c9d4SSatish Balay else if (nPetscReal % 1 == 0) PackInit_RealType_PetscReal_1_0(link); 580b23bfdefSJunchao Zhang link->bs = nPetscReal; 581eb02082bSJunchao Zhang link->unitbytes = nPetscReal * sizeof(PetscReal); 58240e23c03SJunchao Zhang link->basicunit = MPIU_REAL; 583*9371c9d4SSatish Balay if (link->bs == 1) { 584*9371c9d4SSatish Balay link->isbuiltin = PETSC_TRUE; 585*9371c9d4SSatish Balay link->unit = MPIU_REAL; 586*9371c9d4SSatish Balay } 587b23bfdefSJunchao Zhang } else if (nPetscInt) { 588*9371c9d4SSatish Balay if (nPetscInt == 8) PackInit_IntegerType_PetscInt_8_1(link); 589*9371c9d4SSatish Balay else if (nPetscInt % 8 == 0) PackInit_IntegerType_PetscInt_8_0(link); 590*9371c9d4SSatish Balay else if (nPetscInt == 4) PackInit_IntegerType_PetscInt_4_1(link); 591*9371c9d4SSatish Balay else if (nPetscInt % 4 == 0) PackInit_IntegerType_PetscInt_4_0(link); 592*9371c9d4SSatish Balay else if (nPetscInt == 2) PackInit_IntegerType_PetscInt_2_1(link); 593*9371c9d4SSatish Balay else if (nPetscInt % 2 == 0) PackInit_IntegerType_PetscInt_2_0(link); 594*9371c9d4SSatish Balay else if (nPetscInt == 1) PackInit_IntegerType_PetscInt_1_1(link); 595*9371c9d4SSatish Balay else if (nPetscInt % 1 == 0) PackInit_IntegerType_PetscInt_1_0(link); 596b23bfdefSJunchao Zhang link->bs = nPetscInt; 597eb02082bSJunchao Zhang link->unitbytes = nPetscInt * sizeof(PetscInt); 598b23bfdefSJunchao Zhang link->basicunit = MPIU_INT; 599*9371c9d4SSatish Balay if (link->bs == 1) { 600*9371c9d4SSatish Balay link->isbuiltin = PETSC_TRUE; 601*9371c9d4SSatish Balay link->unit = MPIU_INT; 602*9371c9d4SSatish Balay } 603b23bfdefSJunchao Zhang #if defined(PETSC_USE_64BIT_INDICES) 604b23bfdefSJunchao Zhang } else if (nInt) { 605*9371c9d4SSatish Balay if (nInt == 8) PackInit_IntegerType_int_8_1(link); 606*9371c9d4SSatish Balay else if (nInt % 8 == 0) PackInit_IntegerType_int_8_0(link); 607*9371c9d4SSatish Balay else if (nInt == 4) PackInit_IntegerType_int_4_1(link); 608*9371c9d4SSatish Balay else if (nInt % 4 == 0) PackInit_IntegerType_int_4_0(link); 609*9371c9d4SSatish Balay else if (nInt == 2) PackInit_IntegerType_int_2_1(link); 610*9371c9d4SSatish Balay else if (nInt % 2 == 0) PackInit_IntegerType_int_2_0(link); 611*9371c9d4SSatish Balay else if (nInt == 1) PackInit_IntegerType_int_1_1(link); 612*9371c9d4SSatish Balay else if (nInt % 1 == 0) PackInit_IntegerType_int_1_0(link); 613b23bfdefSJunchao Zhang link->bs = nInt; 614eb02082bSJunchao Zhang link->unitbytes = nInt * sizeof(int); 615b23bfdefSJunchao Zhang link->basicunit = MPI_INT; 616*9371c9d4SSatish Balay if (link->bs == 1) { 617*9371c9d4SSatish Balay link->isbuiltin = PETSC_TRUE; 618*9371c9d4SSatish Balay link->unit = MPI_INT; 619*9371c9d4SSatish Balay } 620b23bfdefSJunchao Zhang #endif 621b23bfdefSJunchao Zhang } else if (nSignedChar) { 622*9371c9d4SSatish Balay if (nSignedChar == 8) PackInit_IntegerType_SignedChar_8_1(link); 623*9371c9d4SSatish Balay else if (nSignedChar % 8 == 0) PackInit_IntegerType_SignedChar_8_0(link); 624*9371c9d4SSatish Balay else if (nSignedChar == 4) PackInit_IntegerType_SignedChar_4_1(link); 625*9371c9d4SSatish Balay else if (nSignedChar % 4 == 0) PackInit_IntegerType_SignedChar_4_0(link); 626*9371c9d4SSatish Balay else if (nSignedChar == 2) PackInit_IntegerType_SignedChar_2_1(link); 627*9371c9d4SSatish Balay else if (nSignedChar % 2 == 0) PackInit_IntegerType_SignedChar_2_0(link); 628*9371c9d4SSatish Balay else if (nSignedChar == 1) PackInit_IntegerType_SignedChar_1_1(link); 629*9371c9d4SSatish Balay else if (nSignedChar % 1 == 0) PackInit_IntegerType_SignedChar_1_0(link); 630b23bfdefSJunchao Zhang link->bs = nSignedChar; 631eb02082bSJunchao Zhang link->unitbytes = nSignedChar * sizeof(SignedChar); 632b23bfdefSJunchao Zhang link->basicunit = MPI_SIGNED_CHAR; 633*9371c9d4SSatish Balay if (link->bs == 1) { 634*9371c9d4SSatish Balay link->isbuiltin = PETSC_TRUE; 635*9371c9d4SSatish Balay link->unit = MPI_SIGNED_CHAR; 636*9371c9d4SSatish Balay } 637b23bfdefSJunchao Zhang } else if (nUnsignedChar) { 638*9371c9d4SSatish Balay if (nUnsignedChar == 8) PackInit_IntegerType_UnsignedChar_8_1(link); 639*9371c9d4SSatish Balay else if (nUnsignedChar % 8 == 0) PackInit_IntegerType_UnsignedChar_8_0(link); 640*9371c9d4SSatish Balay else if (nUnsignedChar == 4) PackInit_IntegerType_UnsignedChar_4_1(link); 641*9371c9d4SSatish Balay else if (nUnsignedChar % 4 == 0) PackInit_IntegerType_UnsignedChar_4_0(link); 642*9371c9d4SSatish Balay else if (nUnsignedChar == 2) PackInit_IntegerType_UnsignedChar_2_1(link); 643*9371c9d4SSatish Balay else if (nUnsignedChar % 2 == 0) PackInit_IntegerType_UnsignedChar_2_0(link); 644*9371c9d4SSatish Balay else if (nUnsignedChar == 1) PackInit_IntegerType_UnsignedChar_1_1(link); 645*9371c9d4SSatish Balay else if (nUnsignedChar % 1 == 0) PackInit_IntegerType_UnsignedChar_1_0(link); 646b23bfdefSJunchao Zhang link->bs = nUnsignedChar; 647eb02082bSJunchao Zhang link->unitbytes = nUnsignedChar * sizeof(UnsignedChar); 648b23bfdefSJunchao Zhang link->basicunit = MPI_UNSIGNED_CHAR; 649*9371c9d4SSatish Balay if (link->bs == 1) { 650*9371c9d4SSatish Balay link->isbuiltin = PETSC_TRUE; 651*9371c9d4SSatish Balay link->unit = MPI_UNSIGNED_CHAR; 652*9371c9d4SSatish Balay } 65340e23c03SJunchao Zhang #if defined(PETSC_HAVE_COMPLEX) 654b23bfdefSJunchao Zhang } else if (nPetscComplex) { 655*9371c9d4SSatish Balay if (nPetscComplex == 8) PackInit_ComplexType_PetscComplex_8_1(link); 656*9371c9d4SSatish Balay else if (nPetscComplex % 8 == 0) PackInit_ComplexType_PetscComplex_8_0(link); 657*9371c9d4SSatish Balay else if (nPetscComplex == 4) PackInit_ComplexType_PetscComplex_4_1(link); 658*9371c9d4SSatish Balay else if (nPetscComplex % 4 == 0) PackInit_ComplexType_PetscComplex_4_0(link); 659*9371c9d4SSatish Balay else if (nPetscComplex == 2) PackInit_ComplexType_PetscComplex_2_1(link); 660*9371c9d4SSatish Balay else if (nPetscComplex % 2 == 0) PackInit_ComplexType_PetscComplex_2_0(link); 661*9371c9d4SSatish Balay else if (nPetscComplex == 1) PackInit_ComplexType_PetscComplex_1_1(link); 662*9371c9d4SSatish Balay else if (nPetscComplex % 1 == 0) PackInit_ComplexType_PetscComplex_1_0(link); 663b23bfdefSJunchao Zhang link->bs = nPetscComplex; 664eb02082bSJunchao Zhang link->unitbytes = nPetscComplex * sizeof(PetscComplex); 66540e23c03SJunchao Zhang link->basicunit = MPIU_COMPLEX; 666*9371c9d4SSatish Balay if (link->bs == 1) { 667*9371c9d4SSatish Balay link->isbuiltin = PETSC_TRUE; 668*9371c9d4SSatish Balay link->unit = MPIU_COMPLEX; 669*9371c9d4SSatish Balay } 67040e23c03SJunchao Zhang #endif 67140e23c03SJunchao Zhang } else { 672b23bfdefSJunchao Zhang MPI_Aint lb, nbyte; 6739566063dSJacob Faibussowitsch PetscCallMPI(MPI_Type_get_extent(unit, &lb, &nbyte)); 67408401ef6SPierre Jolivet PetscCheck(lb == 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "Datatype with nonzero lower bound %ld", (long)lb); 675eb02082bSJunchao Zhang if (nbyte % sizeof(int)) { /* If the type size is not multiple of int */ 676*9371c9d4SSatish Balay if (nbyte == 4) PackInit_DumbType_char_4_1(link); 677*9371c9d4SSatish Balay else if (nbyte % 4 == 0) PackInit_DumbType_char_4_0(link); 678*9371c9d4SSatish Balay else if (nbyte == 2) PackInit_DumbType_char_2_1(link); 679*9371c9d4SSatish Balay else if (nbyte % 2 == 0) PackInit_DumbType_char_2_0(link); 680*9371c9d4SSatish Balay else if (nbyte == 1) PackInit_DumbType_char_1_1(link); 681*9371c9d4SSatish Balay else if (nbyte % 1 == 0) PackInit_DumbType_char_1_0(link); 682eb02082bSJunchao Zhang link->bs = nbyte; 683b23bfdefSJunchao Zhang link->unitbytes = nbyte; 684b23bfdefSJunchao Zhang link->basicunit = MPI_BYTE; 68540e23c03SJunchao Zhang } else { 686eb02082bSJunchao Zhang nInt = nbyte / sizeof(int); 687*9371c9d4SSatish Balay if (nInt == 8) PackInit_DumbType_DumbInt_8_1(link); 688*9371c9d4SSatish Balay else if (nInt % 8 == 0) PackInit_DumbType_DumbInt_8_0(link); 689*9371c9d4SSatish Balay else if (nInt == 4) PackInit_DumbType_DumbInt_4_1(link); 690*9371c9d4SSatish Balay else if (nInt % 4 == 0) PackInit_DumbType_DumbInt_4_0(link); 691*9371c9d4SSatish Balay else if (nInt == 2) PackInit_DumbType_DumbInt_2_1(link); 692*9371c9d4SSatish Balay else if (nInt % 2 == 0) PackInit_DumbType_DumbInt_2_0(link); 693*9371c9d4SSatish Balay else if (nInt == 1) PackInit_DumbType_DumbInt_1_1(link); 694*9371c9d4SSatish Balay else if (nInt % 1 == 0) PackInit_DumbType_DumbInt_1_0(link); 695eb02082bSJunchao Zhang link->bs = nInt; 696b23bfdefSJunchao Zhang link->unitbytes = nbyte; 69740e23c03SJunchao Zhang link->basicunit = MPI_INT; 69840e23c03SJunchao Zhang } 6995ad15460SJunchao Zhang if (link->isbuiltin) link->unit = unit; 70040e23c03SJunchao Zhang } 701b23bfdefSJunchao Zhang 7029566063dSJacob Faibussowitsch if (!link->isbuiltin) PetscCallMPI(MPI_Type_dup(unit, &link->unit)); 70320c24465SJunchao Zhang 70420c24465SJunchao Zhang link->Memcpy = PetscSFLinkMemcpy_Host; 70540e23c03SJunchao Zhang PetscFunctionReturn(0); 70640e23c03SJunchao Zhang } 70740e23c03SJunchao Zhang 708*9371c9d4SSatish Balay PetscErrorCode PetscSFLinkGetUnpackAndOp(PetscSFLink link, PetscMemType mtype, MPI_Op op, PetscBool atomic, PetscErrorCode (**UnpackAndOp)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *)) { 70940e23c03SJunchao Zhang PetscFunctionBegin; 71040e23c03SJunchao Zhang *UnpackAndOp = NULL; 71171438e86SJunchao Zhang if (PetscMemTypeHost(mtype)) { 71283df288dSJunchao Zhang if (op == MPI_REPLACE) *UnpackAndOp = link->h_UnpackAndInsert; 713eb02082bSJunchao Zhang else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->h_UnpackAndAdd; 714eb02082bSJunchao Zhang else if (op == MPI_PROD) *UnpackAndOp = link->h_UnpackAndMult; 715eb02082bSJunchao Zhang else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->h_UnpackAndMax; 716eb02082bSJunchao Zhang else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->h_UnpackAndMin; 717eb02082bSJunchao Zhang else if (op == MPI_LAND) *UnpackAndOp = link->h_UnpackAndLAND; 718eb02082bSJunchao Zhang else if (op == MPI_BAND) *UnpackAndOp = link->h_UnpackAndBAND; 719eb02082bSJunchao Zhang else if (op == MPI_LOR) *UnpackAndOp = link->h_UnpackAndLOR; 720eb02082bSJunchao Zhang else if (op == MPI_BOR) *UnpackAndOp = link->h_UnpackAndBOR; 721eb02082bSJunchao Zhang else if (op == MPI_LXOR) *UnpackAndOp = link->h_UnpackAndLXOR; 722eb02082bSJunchao Zhang else if (op == MPI_BXOR) *UnpackAndOp = link->h_UnpackAndBXOR; 723eb02082bSJunchao Zhang else if (op == MPI_MAXLOC) *UnpackAndOp = link->h_UnpackAndMaxloc; 724eb02082bSJunchao Zhang else if (op == MPI_MINLOC) *UnpackAndOp = link->h_UnpackAndMinloc; 725eb02082bSJunchao Zhang } 7267fd2d3dbSJunchao Zhang #if defined(PETSC_HAVE_DEVICE) 72771438e86SJunchao Zhang else if (PetscMemTypeDevice(mtype) && !atomic) { 72883df288dSJunchao Zhang if (op == MPI_REPLACE) *UnpackAndOp = link->d_UnpackAndInsert; 729eb02082bSJunchao Zhang else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->d_UnpackAndAdd; 730eb02082bSJunchao Zhang else if (op == MPI_PROD) *UnpackAndOp = link->d_UnpackAndMult; 731eb02082bSJunchao Zhang else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->d_UnpackAndMax; 732eb02082bSJunchao Zhang else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->d_UnpackAndMin; 733eb02082bSJunchao Zhang else if (op == MPI_LAND) *UnpackAndOp = link->d_UnpackAndLAND; 734eb02082bSJunchao Zhang else if (op == MPI_BAND) *UnpackAndOp = link->d_UnpackAndBAND; 735eb02082bSJunchao Zhang else if (op == MPI_LOR) *UnpackAndOp = link->d_UnpackAndLOR; 736eb02082bSJunchao Zhang else if (op == MPI_BOR) *UnpackAndOp = link->d_UnpackAndBOR; 737eb02082bSJunchao Zhang else if (op == MPI_LXOR) *UnpackAndOp = link->d_UnpackAndLXOR; 738eb02082bSJunchao Zhang else if (op == MPI_BXOR) *UnpackAndOp = link->d_UnpackAndBXOR; 739eb02082bSJunchao Zhang else if (op == MPI_MAXLOC) *UnpackAndOp = link->d_UnpackAndMaxloc; 740eb02082bSJunchao Zhang else if (op == MPI_MINLOC) *UnpackAndOp = link->d_UnpackAndMinloc; 74171438e86SJunchao Zhang } else if (PetscMemTypeDevice(mtype) && atomic) { 74283df288dSJunchao Zhang if (op == MPI_REPLACE) *UnpackAndOp = link->da_UnpackAndInsert; 743eb02082bSJunchao Zhang else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->da_UnpackAndAdd; 744eb02082bSJunchao Zhang else if (op == MPI_PROD) *UnpackAndOp = link->da_UnpackAndMult; 745eb02082bSJunchao Zhang else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->da_UnpackAndMax; 746eb02082bSJunchao Zhang else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->da_UnpackAndMin; 747eb02082bSJunchao Zhang else if (op == MPI_LAND) *UnpackAndOp = link->da_UnpackAndLAND; 748eb02082bSJunchao Zhang else if (op == MPI_BAND) *UnpackAndOp = link->da_UnpackAndBAND; 749eb02082bSJunchao Zhang else if (op == MPI_LOR) *UnpackAndOp = link->da_UnpackAndLOR; 750eb02082bSJunchao Zhang else if (op == MPI_BOR) *UnpackAndOp = link->da_UnpackAndBOR; 751eb02082bSJunchao Zhang else if (op == MPI_LXOR) *UnpackAndOp = link->da_UnpackAndLXOR; 752eb02082bSJunchao Zhang else if (op == MPI_BXOR) *UnpackAndOp = link->da_UnpackAndBXOR; 753eb02082bSJunchao Zhang else if (op == MPI_MAXLOC) *UnpackAndOp = link->da_UnpackAndMaxloc; 754eb02082bSJunchao Zhang else if (op == MPI_MINLOC) *UnpackAndOp = link->da_UnpackAndMinloc; 755eb02082bSJunchao Zhang } 756eb02082bSJunchao Zhang #endif 75740e23c03SJunchao Zhang PetscFunctionReturn(0); 75840e23c03SJunchao Zhang } 75940e23c03SJunchao Zhang 760*9371c9d4SSatish Balay PetscErrorCode PetscSFLinkGetScatterAndOp(PetscSFLink link, PetscMemType mtype, MPI_Op op, PetscBool atomic, PetscErrorCode (**ScatterAndOp)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *)) { 76140e23c03SJunchao Zhang PetscFunctionBegin; 762cd620004SJunchao Zhang *ScatterAndOp = NULL; 76371438e86SJunchao Zhang if (PetscMemTypeHost(mtype)) { 76483df288dSJunchao Zhang if (op == MPI_REPLACE) *ScatterAndOp = link->h_ScatterAndInsert; 765cd620004SJunchao Zhang else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->h_ScatterAndAdd; 766cd620004SJunchao Zhang else if (op == MPI_PROD) *ScatterAndOp = link->h_ScatterAndMult; 767cd620004SJunchao Zhang else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->h_ScatterAndMax; 768cd620004SJunchao Zhang else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->h_ScatterAndMin; 769cd620004SJunchao Zhang else if (op == MPI_LAND) *ScatterAndOp = link->h_ScatterAndLAND; 770cd620004SJunchao Zhang else if (op == MPI_BAND) *ScatterAndOp = link->h_ScatterAndBAND; 771cd620004SJunchao Zhang else if (op == MPI_LOR) *ScatterAndOp = link->h_ScatterAndLOR; 772cd620004SJunchao Zhang else if (op == MPI_BOR) *ScatterAndOp = link->h_ScatterAndBOR; 773cd620004SJunchao Zhang else if (op == MPI_LXOR) *ScatterAndOp = link->h_ScatterAndLXOR; 774cd620004SJunchao Zhang else if (op == MPI_BXOR) *ScatterAndOp = link->h_ScatterAndBXOR; 775cd620004SJunchao Zhang else if (op == MPI_MAXLOC) *ScatterAndOp = link->h_ScatterAndMaxloc; 776cd620004SJunchao Zhang else if (op == MPI_MINLOC) *ScatterAndOp = link->h_ScatterAndMinloc; 777eb02082bSJunchao Zhang } 7787fd2d3dbSJunchao Zhang #if defined(PETSC_HAVE_DEVICE) 77971438e86SJunchao Zhang else if (PetscMemTypeDevice(mtype) && !atomic) { 78083df288dSJunchao Zhang if (op == MPI_REPLACE) *ScatterAndOp = link->d_ScatterAndInsert; 781cd620004SJunchao Zhang else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->d_ScatterAndAdd; 782cd620004SJunchao Zhang else if (op == MPI_PROD) *ScatterAndOp = link->d_ScatterAndMult; 783cd620004SJunchao Zhang else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->d_ScatterAndMax; 784cd620004SJunchao Zhang else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->d_ScatterAndMin; 785cd620004SJunchao Zhang else if (op == MPI_LAND) *ScatterAndOp = link->d_ScatterAndLAND; 786cd620004SJunchao Zhang else if (op == MPI_BAND) *ScatterAndOp = link->d_ScatterAndBAND; 787cd620004SJunchao Zhang else if (op == MPI_LOR) *ScatterAndOp = link->d_ScatterAndLOR; 788cd620004SJunchao Zhang else if (op == MPI_BOR) *ScatterAndOp = link->d_ScatterAndBOR; 789cd620004SJunchao Zhang else if (op == MPI_LXOR) *ScatterAndOp = link->d_ScatterAndLXOR; 790cd620004SJunchao Zhang else if (op == MPI_BXOR) *ScatterAndOp = link->d_ScatterAndBXOR; 791cd620004SJunchao Zhang else if (op == MPI_MAXLOC) *ScatterAndOp = link->d_ScatterAndMaxloc; 792cd620004SJunchao Zhang else if (op == MPI_MINLOC) *ScatterAndOp = link->d_ScatterAndMinloc; 79371438e86SJunchao Zhang } else if (PetscMemTypeDevice(mtype) && atomic) { 79483df288dSJunchao Zhang if (op == MPI_REPLACE) *ScatterAndOp = link->da_ScatterAndInsert; 795cd620004SJunchao Zhang else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->da_ScatterAndAdd; 796cd620004SJunchao Zhang else if (op == MPI_PROD) *ScatterAndOp = link->da_ScatterAndMult; 797cd620004SJunchao Zhang else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->da_ScatterAndMax; 798cd620004SJunchao Zhang else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->da_ScatterAndMin; 799cd620004SJunchao Zhang else if (op == MPI_LAND) *ScatterAndOp = link->da_ScatterAndLAND; 800cd620004SJunchao Zhang else if (op == MPI_BAND) *ScatterAndOp = link->da_ScatterAndBAND; 801cd620004SJunchao Zhang else if (op == MPI_LOR) *ScatterAndOp = link->da_ScatterAndLOR; 802cd620004SJunchao Zhang else if (op == MPI_BOR) *ScatterAndOp = link->da_ScatterAndBOR; 803cd620004SJunchao Zhang else if (op == MPI_LXOR) *ScatterAndOp = link->da_ScatterAndLXOR; 804cd620004SJunchao Zhang else if (op == MPI_BXOR) *ScatterAndOp = link->da_ScatterAndBXOR; 805cd620004SJunchao Zhang else if (op == MPI_MAXLOC) *ScatterAndOp = link->da_ScatterAndMaxloc; 806cd620004SJunchao Zhang else if (op == MPI_MINLOC) *ScatterAndOp = link->da_ScatterAndMinloc; 807eb02082bSJunchao Zhang } 808eb02082bSJunchao Zhang #endif 809cd620004SJunchao Zhang PetscFunctionReturn(0); 810cd620004SJunchao Zhang } 811cd620004SJunchao Zhang 812*9371c9d4SSatish Balay PetscErrorCode PetscSFLinkGetFetchAndOp(PetscSFLink link, PetscMemType mtype, MPI_Op op, PetscBool atomic, PetscErrorCode (**FetchAndOp)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, void *)) { 813cd620004SJunchao Zhang PetscFunctionBegin; 814cd620004SJunchao Zhang *FetchAndOp = NULL; 815c9cc58a2SBarry Smith PetscCheck(op == MPI_SUM || op == MPIU_SUM, PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for MPI_Op in FetchAndOp"); 81671438e86SJunchao Zhang if (PetscMemTypeHost(mtype)) *FetchAndOp = link->h_FetchAndAdd; 8177fd2d3dbSJunchao Zhang #if defined(PETSC_HAVE_DEVICE) 81871438e86SJunchao Zhang else if (PetscMemTypeDevice(mtype) && !atomic) *FetchAndOp = link->d_FetchAndAdd; 81971438e86SJunchao Zhang else if (PetscMemTypeDevice(mtype) && atomic) *FetchAndOp = link->da_FetchAndAdd; 820cd620004SJunchao Zhang #endif 821cd620004SJunchao Zhang PetscFunctionReturn(0); 822cd620004SJunchao Zhang } 823cd620004SJunchao Zhang 824*9371c9d4SSatish Balay PetscErrorCode PetscSFLinkGetFetchAndOpLocal(PetscSFLink link, PetscMemType mtype, MPI_Op op, PetscBool atomic, PetscErrorCode (**FetchAndOpLocal)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, void *)) { 825cd620004SJunchao Zhang PetscFunctionBegin; 826cd620004SJunchao Zhang *FetchAndOpLocal = NULL; 827c9cc58a2SBarry Smith PetscCheck(op == MPI_SUM || op == MPIU_SUM, PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for MPI_Op in FetchAndOp"); 82871438e86SJunchao Zhang if (PetscMemTypeHost(mtype)) *FetchAndOpLocal = link->h_FetchAndAddLocal; 8297fd2d3dbSJunchao Zhang #if defined(PETSC_HAVE_DEVICE) 83071438e86SJunchao Zhang else if (PetscMemTypeDevice(mtype) && !atomic) *FetchAndOpLocal = link->d_FetchAndAddLocal; 83171438e86SJunchao Zhang else if (PetscMemTypeDevice(mtype) && atomic) *FetchAndOpLocal = link->da_FetchAndAddLocal; 832cd620004SJunchao Zhang #endif 833cd620004SJunchao Zhang PetscFunctionReturn(0); 834cd620004SJunchao Zhang } 835cd620004SJunchao Zhang 836*9371c9d4SSatish Balay static inline PetscErrorCode PetscSFLinkLogFlopsAfterUnpackRootData(PetscSF sf, PetscSFLink link, PetscSFScope scope, MPI_Op op) { 837cd620004SJunchao Zhang PetscLogDouble flops; 838cd620004SJunchao Zhang PetscSF_Basic *bas = (PetscSF_Basic *)sf->data; 839cd620004SJunchao Zhang 840cd620004SJunchao Zhang PetscFunctionBegin; 84183df288dSJunchao Zhang if (op != MPI_REPLACE && link->basicunit == MPIU_SCALAR) { /* op is a reduction on PetscScalars */ 842cd620004SJunchao Zhang flops = bas->rootbuflen[scope] * link->bs; /* # of roots in buffer x # of scalars in unit */ 8437fd2d3dbSJunchao Zhang #if defined(PETSC_HAVE_DEVICE) 844*9371c9d4SSatish Balay if (PetscMemTypeDevice(link->rootmtype)) PetscCall(PetscLogGpuFlops(flops)); 845*9371c9d4SSatish Balay else 846cd620004SJunchao Zhang #endif 8479566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(flops)); 848cd620004SJunchao Zhang } 849cd620004SJunchao Zhang PetscFunctionReturn(0); 850cd620004SJunchao Zhang } 851cd620004SJunchao Zhang 852*9371c9d4SSatish Balay static inline PetscErrorCode PetscSFLinkLogFlopsAfterUnpackLeafData(PetscSF sf, PetscSFLink link, PetscSFScope scope, MPI_Op op) { 853cd620004SJunchao Zhang PetscLogDouble flops; 854cd620004SJunchao Zhang 855cd620004SJunchao Zhang PetscFunctionBegin; 85683df288dSJunchao Zhang if (op != MPI_REPLACE && link->basicunit == MPIU_SCALAR) { /* op is a reduction on PetscScalars */ 857cd620004SJunchao Zhang flops = sf->leafbuflen[scope] * link->bs; /* # of roots in buffer x # of scalars in unit */ 8587fd2d3dbSJunchao Zhang #if defined(PETSC_HAVE_DEVICE) 859*9371c9d4SSatish Balay if (PetscMemTypeDevice(link->leafmtype)) PetscCall(PetscLogGpuFlops(flops)); 860*9371c9d4SSatish Balay else 861cd620004SJunchao Zhang #endif 8629566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(flops)); 863cd620004SJunchao Zhang } 864cd620004SJunchao Zhang PetscFunctionReturn(0); 865cd620004SJunchao Zhang } 866cd620004SJunchao Zhang 867cd620004SJunchao Zhang /* When SF could not find a proper UnpackAndOp() from link, it falls back to MPI_Reduce_local. 8684165533cSJose E. Roman Input Parameters: 869cd620004SJunchao Zhang +sf - The StarForest 870cd620004SJunchao Zhang .link - The link 871cd620004SJunchao Zhang .count - Number of entries to unpack 872cd620004SJunchao Zhang .start - The first index, significent when indices=NULL 873cd620004SJunchao Zhang .indices - Indices of entries in <data>. If NULL, it means indices are contiguous and the first is given in <start> 874cd620004SJunchao Zhang .buf - A contiguous buffer to unpack from 875cd620004SJunchao Zhang -op - Operation after unpack 876cd620004SJunchao Zhang 8774165533cSJose E. Roman Output Parameters: 878cd620004SJunchao Zhang .data - The data to unpack to 879cd620004SJunchao Zhang */ 880*9371c9d4SSatish Balay static inline PetscErrorCode PetscSFLinkUnpackDataWithMPIReduceLocal(PetscSF sf, PetscSFLink link, PetscInt count, PetscInt start, const PetscInt *indices, void *data, const void *buf, MPI_Op op) { 881cd620004SJunchao Zhang PetscFunctionBegin; 882cd620004SJunchao Zhang #if defined(PETSC_HAVE_MPI_REDUCE_LOCAL) 883cd620004SJunchao Zhang { 884cd620004SJunchao Zhang PetscInt i; 885cd620004SJunchao Zhang if (indices) { 886cd620004SJunchao Zhang /* Note we use link->unit instead of link->basicunit. When op can be mapped to MPI_SUM etc, it operates on 887cd620004SJunchao Zhang basic units of a root/leaf element-wisely. Otherwise, it is meant to operate on a whole root/leaf. 888cd620004SJunchao Zhang */ 8899566063dSJacob Faibussowitsch for (i = 0; i < count; i++) PetscCallMPI(MPI_Reduce_local((const char *)buf + i * link->unitbytes, (char *)data + indices[i] * link->unitbytes, 1, link->unit, op)); 890cd620004SJunchao Zhang } else { 8919566063dSJacob Faibussowitsch PetscCallMPI(MPIU_Reduce_local(buf, (char *)data + start * link->unitbytes, count, link->unit, op)); 892cd620004SJunchao Zhang } 893cd620004SJunchao Zhang } 894b458e8f1SJose E. Roman PetscFunctionReturn(0); 895cd620004SJunchao Zhang #else 896cd620004SJunchao Zhang SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No unpacking reduction operation for this MPI_Op"); 897cd620004SJunchao Zhang #endif 898cd620004SJunchao Zhang } 899cd620004SJunchao Zhang 900*9371c9d4SSatish Balay static inline PetscErrorCode PetscSFLinkScatterDataWithMPIReduceLocal(PetscSF sf, PetscSFLink link, PetscInt count, PetscInt srcStart, const PetscInt *srcIdx, const void *src, PetscInt dstStart, const PetscInt *dstIdx, void *dst, MPI_Op op) { 901cd620004SJunchao Zhang PetscFunctionBegin; 902cd620004SJunchao Zhang #if defined(PETSC_HAVE_MPI_REDUCE_LOCAL) 903cd620004SJunchao Zhang { 904cd620004SJunchao Zhang PetscInt i, disp; 905fcc7397dSJunchao Zhang if (!srcIdx) { 9069566063dSJacob Faibussowitsch PetscCall(PetscSFLinkUnpackDataWithMPIReduceLocal(sf, link, count, dstStart, dstIdx, dst, (const char *)src + srcStart * link->unitbytes, op)); 907fcc7397dSJunchao Zhang } else { 908cd620004SJunchao Zhang for (i = 0; i < count; i++) { 909fcc7397dSJunchao Zhang disp = dstIdx ? dstIdx[i] : dstStart + i; 9109566063dSJacob Faibussowitsch PetscCallMPI(MPIU_Reduce_local((const char *)src + srcIdx[i] * link->unitbytes, (char *)dst + disp * link->unitbytes, 1, link->unit, op)); 911fcc7397dSJunchao Zhang } 912cd620004SJunchao Zhang } 913cd620004SJunchao Zhang } 914b458e8f1SJose E. Roman PetscFunctionReturn(0); 915cd620004SJunchao Zhang #else 916cd620004SJunchao Zhang SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No unpacking reduction operation for this MPI_Op"); 917cd620004SJunchao Zhang #endif 918cd620004SJunchao Zhang } 919cd620004SJunchao Zhang 920cd620004SJunchao Zhang /*============================================================================= 921cd620004SJunchao Zhang Pack/Unpack/Fetch/Scatter routines 922cd620004SJunchao Zhang ============================================================================*/ 923cd620004SJunchao Zhang 924cd620004SJunchao Zhang /* Pack rootdata to rootbuf 9254165533cSJose E. Roman Input Parameters: 926cd620004SJunchao Zhang + sf - The SF this packing works on. 927cd620004SJunchao Zhang . link - It gives the memtype of the roots and also provides root buffer. 928cd620004SJunchao Zhang . scope - PETSCSF_LOCAL or PETSCSF_REMOTE. Note SF has the ability to do local and remote communications separately. 929cd620004SJunchao Zhang - rootdata - Where to read the roots. 930cd620004SJunchao Zhang 931cd620004SJunchao Zhang Notes: 932cd620004SJunchao Zhang When rootdata can be directly used as root buffer, the routine is almost a no-op. After the call, root data is 93371438e86SJunchao Zhang in a place where the underlying MPI is ready to access (use_gpu_aware_mpi or not) 934cd620004SJunchao Zhang */ 935*9371c9d4SSatish Balay PetscErrorCode PetscSFLinkPackRootData_Private(PetscSF sf, PetscSFLink link, PetscSFScope scope, const void *rootdata) { 936cd620004SJunchao Zhang const PetscInt *rootindices = NULL; 937cd620004SJunchao Zhang PetscInt count, start; 938fcc7397dSJunchao Zhang PetscErrorCode (*Pack)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, void *) = NULL; 939cd620004SJunchao Zhang PetscMemType rootmtype = link->rootmtype; 940fcc7397dSJunchao Zhang PetscSFPackOpt opt = NULL; 941fcc7397dSJunchao Zhang 942cd620004SJunchao Zhang PetscFunctionBegin; 9439566063dSJacob Faibussowitsch PetscCall(PetscLogEventBegin(PETSCSF_Pack, sf, 0, 0, 0)); 94471438e86SJunchao Zhang if (!link->rootdirect[scope]) { /* If rootdata works directly as rootbuf, skip packing */ 9459566063dSJacob Faibussowitsch PetscCall(PetscSFLinkGetRootPackOptAndIndices(sf, link, rootmtype, scope, &count, &start, &opt, &rootindices)); 9469566063dSJacob Faibussowitsch PetscCall(PetscSFLinkGetPack(link, rootmtype, &Pack)); 9479566063dSJacob Faibussowitsch PetscCall((*Pack)(link, count, start, opt, rootindices, rootdata, link->rootbuf[scope][rootmtype])); 948cd620004SJunchao Zhang } 9499566063dSJacob Faibussowitsch PetscCall(PetscLogEventEnd(PETSCSF_Pack, sf, 0, 0, 0)); 950cd620004SJunchao Zhang PetscFunctionReturn(0); 951cd620004SJunchao Zhang } 952cd620004SJunchao Zhang 953cd620004SJunchao Zhang /* Pack leafdata to leafbuf */ 954*9371c9d4SSatish Balay PetscErrorCode PetscSFLinkPackLeafData_Private(PetscSF sf, PetscSFLink link, PetscSFScope scope, const void *leafdata) { 955cd620004SJunchao Zhang const PetscInt *leafindices = NULL; 956cd620004SJunchao Zhang PetscInt count, start; 957fcc7397dSJunchao Zhang PetscErrorCode (*Pack)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, void *) = NULL; 958cd620004SJunchao Zhang PetscMemType leafmtype = link->leafmtype; 959fcc7397dSJunchao Zhang PetscSFPackOpt opt = NULL; 960cd620004SJunchao Zhang 961cd620004SJunchao Zhang PetscFunctionBegin; 9629566063dSJacob Faibussowitsch PetscCall(PetscLogEventBegin(PETSCSF_Pack, sf, 0, 0, 0)); 96371438e86SJunchao Zhang if (!link->leafdirect[scope]) { /* If leafdata works directly as rootbuf, skip packing */ 9649566063dSJacob Faibussowitsch PetscCall(PetscSFLinkGetLeafPackOptAndIndices(sf, link, leafmtype, scope, &count, &start, &opt, &leafindices)); 9659566063dSJacob Faibussowitsch PetscCall(PetscSFLinkGetPack(link, leafmtype, &Pack)); 9669566063dSJacob Faibussowitsch PetscCall((*Pack)(link, count, start, opt, leafindices, leafdata, link->leafbuf[scope][leafmtype])); 967cd620004SJunchao Zhang } 9689566063dSJacob Faibussowitsch PetscCall(PetscLogEventEnd(PETSCSF_Pack, sf, 0, 0, 0)); 969cd620004SJunchao Zhang PetscFunctionReturn(0); 970cd620004SJunchao Zhang } 971cd620004SJunchao Zhang 97271438e86SJunchao Zhang /* Pack rootdata to rootbuf, which are in the same memory space */ 973*9371c9d4SSatish Balay PetscErrorCode PetscSFLinkPackRootData(PetscSF sf, PetscSFLink link, PetscSFScope scope, const void *rootdata) { 97471438e86SJunchao Zhang PetscSF_Basic *bas = (PetscSF_Basic *)sf->data; 97571438e86SJunchao Zhang 97671438e86SJunchao Zhang PetscFunctionBegin; 97771438e86SJunchao Zhang if (scope == PETSCSF_REMOTE) { /* Sync the device if rootdata is not on petsc default stream */ 9789566063dSJacob Faibussowitsch if (PetscMemTypeDevice(link->rootmtype) && link->SyncDevice && sf->unknown_input_stream) PetscCall((*link->SyncDevice)(link)); 9799566063dSJacob Faibussowitsch if (link->PrePack) PetscCall((*link->PrePack)(sf, link, PETSCSF_ROOT2LEAF)); /* Used by SF nvshmem */ 98071438e86SJunchao Zhang } 9819566063dSJacob Faibussowitsch PetscCall(PetscLogEventBegin(PETSCSF_Pack, sf, 0, 0, 0)); 9829566063dSJacob Faibussowitsch if (bas->rootbuflen[scope]) PetscCall(PetscSFLinkPackRootData_Private(sf, link, scope, rootdata)); 9839566063dSJacob Faibussowitsch PetscCall(PetscLogEventEnd(PETSCSF_Pack, sf, 0, 0, 0)); 98471438e86SJunchao Zhang PetscFunctionReturn(0); 98571438e86SJunchao Zhang } 98671438e86SJunchao Zhang /* Pack leafdata to leafbuf, which are in the same memory space */ 987*9371c9d4SSatish Balay PetscErrorCode PetscSFLinkPackLeafData(PetscSF sf, PetscSFLink link, PetscSFScope scope, const void *leafdata) { 98871438e86SJunchao Zhang PetscFunctionBegin; 98971438e86SJunchao Zhang if (scope == PETSCSF_REMOTE) { 9909566063dSJacob Faibussowitsch if (PetscMemTypeDevice(link->leafmtype) && link->SyncDevice && sf->unknown_input_stream) PetscCall((*link->SyncDevice)(link)); 9919566063dSJacob Faibussowitsch if (link->PrePack) PetscCall((*link->PrePack)(sf, link, PETSCSF_LEAF2ROOT)); /* Used by SF nvshmem */ 99271438e86SJunchao Zhang } 9939566063dSJacob Faibussowitsch PetscCall(PetscLogEventBegin(PETSCSF_Pack, sf, 0, 0, 0)); 9949566063dSJacob Faibussowitsch if (sf->leafbuflen[scope]) PetscCall(PetscSFLinkPackLeafData_Private(sf, link, scope, leafdata)); 9959566063dSJacob Faibussowitsch PetscCall(PetscLogEventEnd(PETSCSF_Pack, sf, 0, 0, 0)); 99671438e86SJunchao Zhang PetscFunctionReturn(0); 99771438e86SJunchao Zhang } 99871438e86SJunchao Zhang 999*9371c9d4SSatish Balay PetscErrorCode PetscSFLinkUnpackRootData_Private(PetscSF sf, PetscSFLink link, PetscSFScope scope, void *rootdata, MPI_Op op) { 1000cd620004SJunchao Zhang const PetscInt *rootindices = NULL; 1001cd620004SJunchao Zhang PetscInt count, start; 1002cd620004SJunchao Zhang PetscSF_Basic *bas = (PetscSF_Basic *)sf->data; 1003fcc7397dSJunchao Zhang PetscErrorCode (*UnpackAndOp)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *) = NULL; 1004cd620004SJunchao Zhang PetscMemType rootmtype = link->rootmtype; 1005fcc7397dSJunchao Zhang PetscSFPackOpt opt = NULL; 1006cd620004SJunchao Zhang 1007cd620004SJunchao Zhang PetscFunctionBegin; 100871438e86SJunchao Zhang if (!link->rootdirect[scope]) { /* If rootdata works directly as rootbuf, skip unpacking */ 10099566063dSJacob Faibussowitsch PetscCall(PetscSFLinkGetUnpackAndOp(link, rootmtype, op, bas->rootdups[scope], &UnpackAndOp)); 1010cd620004SJunchao Zhang if (UnpackAndOp) { 10119566063dSJacob Faibussowitsch PetscCall(PetscSFLinkGetRootPackOptAndIndices(sf, link, rootmtype, scope, &count, &start, &opt, &rootindices)); 10129566063dSJacob Faibussowitsch PetscCall((*UnpackAndOp)(link, count, start, opt, rootindices, rootdata, link->rootbuf[scope][rootmtype])); 1013cd620004SJunchao Zhang } else { 10149566063dSJacob Faibussowitsch PetscCall(PetscSFLinkGetRootPackOptAndIndices(sf, link, PETSC_MEMTYPE_HOST, scope, &count, &start, &opt, &rootindices)); 10159566063dSJacob Faibussowitsch PetscCall(PetscSFLinkUnpackDataWithMPIReduceLocal(sf, link, count, start, rootindices, rootdata, link->rootbuf[scope][rootmtype], op)); 1016cd620004SJunchao Zhang } 1017cd620004SJunchao Zhang } 10189566063dSJacob Faibussowitsch PetscCall(PetscSFLinkLogFlopsAfterUnpackRootData(sf, link, scope, op)); 1019cd620004SJunchao Zhang PetscFunctionReturn(0); 1020cd620004SJunchao Zhang } 1021cd620004SJunchao Zhang 1022*9371c9d4SSatish Balay PetscErrorCode PetscSFLinkUnpackLeafData_Private(PetscSF sf, PetscSFLink link, PetscSFScope scope, void *leafdata, MPI_Op op) { 1023cd620004SJunchao Zhang const PetscInt *leafindices = NULL; 1024cd620004SJunchao Zhang PetscInt count, start; 1025fcc7397dSJunchao Zhang PetscErrorCode (*UnpackAndOp)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *) = NULL; 1026cd620004SJunchao Zhang PetscMemType leafmtype = link->leafmtype; 1027fcc7397dSJunchao Zhang PetscSFPackOpt opt = NULL; 1028cd620004SJunchao Zhang 1029cd620004SJunchao Zhang PetscFunctionBegin; 103071438e86SJunchao Zhang if (!link->leafdirect[scope]) { /* If leafdata works directly as rootbuf, skip unpacking */ 10319566063dSJacob Faibussowitsch PetscCall(PetscSFLinkGetUnpackAndOp(link, leafmtype, op, sf->leafdups[scope], &UnpackAndOp)); 1032cd620004SJunchao Zhang if (UnpackAndOp) { 10339566063dSJacob Faibussowitsch PetscCall(PetscSFLinkGetLeafPackOptAndIndices(sf, link, leafmtype, scope, &count, &start, &opt, &leafindices)); 10349566063dSJacob Faibussowitsch PetscCall((*UnpackAndOp)(link, count, start, opt, leafindices, leafdata, link->leafbuf[scope][leafmtype])); 1035cd620004SJunchao Zhang } else { 10369566063dSJacob Faibussowitsch PetscCall(PetscSFLinkGetLeafPackOptAndIndices(sf, link, PETSC_MEMTYPE_HOST, scope, &count, &start, &opt, &leafindices)); 10379566063dSJacob Faibussowitsch PetscCall(PetscSFLinkUnpackDataWithMPIReduceLocal(sf, link, count, start, leafindices, leafdata, link->leafbuf[scope][leafmtype], op)); 1038cd620004SJunchao Zhang } 1039cd620004SJunchao Zhang } 10409566063dSJacob Faibussowitsch PetscCall(PetscSFLinkLogFlopsAfterUnpackLeafData(sf, link, scope, op)); 104171438e86SJunchao Zhang PetscFunctionReturn(0); 104271438e86SJunchao Zhang } 104371438e86SJunchao Zhang /* Unpack rootbuf to rootdata, which are in the same memory space */ 1044*9371c9d4SSatish Balay PetscErrorCode PetscSFLinkUnpackRootData(PetscSF sf, PetscSFLink link, PetscSFScope scope, void *rootdata, MPI_Op op) { 104571438e86SJunchao Zhang PetscSF_Basic *bas = (PetscSF_Basic *)sf->data; 104671438e86SJunchao Zhang 104771438e86SJunchao Zhang PetscFunctionBegin; 10489566063dSJacob Faibussowitsch PetscCall(PetscLogEventBegin(PETSCSF_Unpack, sf, 0, 0, 0)); 10499566063dSJacob Faibussowitsch if (bas->rootbuflen[scope]) PetscCall(PetscSFLinkUnpackRootData_Private(sf, link, scope, rootdata, op)); 10509566063dSJacob Faibussowitsch PetscCall(PetscLogEventEnd(PETSCSF_Unpack, sf, 0, 0, 0)); 105171438e86SJunchao Zhang if (scope == PETSCSF_REMOTE) { 10529566063dSJacob Faibussowitsch if (link->PostUnpack) PetscCall((*link->PostUnpack)(sf, link, PETSCSF_LEAF2ROOT)); /* Used by SF nvshmem */ 10539566063dSJacob Faibussowitsch if (PetscMemTypeDevice(link->rootmtype) && link->SyncDevice && sf->unknown_input_stream) PetscCall((*link->SyncDevice)(link)); 105471438e86SJunchao Zhang } 1055cd620004SJunchao Zhang PetscFunctionReturn(0); 1056cd620004SJunchao Zhang } 1057cd620004SJunchao Zhang 105871438e86SJunchao Zhang /* Unpack leafbuf to leafdata for remote (common case) or local (rare case when rootmtype != leafmtype) */ 1059*9371c9d4SSatish Balay PetscErrorCode PetscSFLinkUnpackLeafData(PetscSF sf, PetscSFLink link, PetscSFScope scope, void *leafdata, MPI_Op op) { 106071438e86SJunchao Zhang PetscFunctionBegin; 10619566063dSJacob Faibussowitsch PetscCall(PetscLogEventBegin(PETSCSF_Unpack, sf, 0, 0, 0)); 10629566063dSJacob Faibussowitsch if (sf->leafbuflen[scope]) PetscCall(PetscSFLinkUnpackLeafData_Private(sf, link, scope, leafdata, op)); 10639566063dSJacob Faibussowitsch PetscCall(PetscLogEventEnd(PETSCSF_Unpack, sf, 0, 0, 0)); 106471438e86SJunchao Zhang if (scope == PETSCSF_REMOTE) { 10659566063dSJacob Faibussowitsch if (link->PostUnpack) PetscCall((*link->PostUnpack)(sf, link, PETSCSF_ROOT2LEAF)); /* Used by SF nvshmem */ 10669566063dSJacob Faibussowitsch if (PetscMemTypeDevice(link->leafmtype) && link->SyncDevice && sf->unknown_input_stream) PetscCall((*link->SyncDevice)(link)); 106771438e86SJunchao Zhang } 106871438e86SJunchao Zhang PetscFunctionReturn(0); 106971438e86SJunchao Zhang } 107071438e86SJunchao Zhang 107171438e86SJunchao Zhang /* FetchAndOp rootdata with rootbuf, it is a kind of Unpack on rootdata, except it also updates rootbuf */ 1072*9371c9d4SSatish Balay PetscErrorCode PetscSFLinkFetchAndOpRemote(PetscSF sf, PetscSFLink link, void *rootdata, MPI_Op op) { 1073cd620004SJunchao Zhang const PetscInt *rootindices = NULL; 1074cd620004SJunchao Zhang PetscInt count, start; 1075cd620004SJunchao Zhang PetscSF_Basic *bas = (PetscSF_Basic *)sf->data; 1076fcc7397dSJunchao Zhang PetscErrorCode (*FetchAndOp)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, void *) = NULL; 1077cd620004SJunchao Zhang PetscMemType rootmtype = link->rootmtype; 1078fcc7397dSJunchao Zhang PetscSFPackOpt opt = NULL; 1079cd620004SJunchao Zhang 1080cd620004SJunchao Zhang PetscFunctionBegin; 10819566063dSJacob Faibussowitsch PetscCall(PetscLogEventBegin(PETSCSF_Unpack, sf, 0, 0, 0)); 108271438e86SJunchao Zhang if (bas->rootbuflen[PETSCSF_REMOTE]) { 1083cd620004SJunchao Zhang /* Do FetchAndOp on rootdata with rootbuf */ 10849566063dSJacob Faibussowitsch PetscCall(PetscSFLinkGetFetchAndOp(link, rootmtype, op, bas->rootdups[PETSCSF_REMOTE], &FetchAndOp)); 10859566063dSJacob Faibussowitsch PetscCall(PetscSFLinkGetRootPackOptAndIndices(sf, link, rootmtype, PETSCSF_REMOTE, &count, &start, &opt, &rootindices)); 10869566063dSJacob Faibussowitsch PetscCall((*FetchAndOp)(link, count, start, opt, rootindices, rootdata, link->rootbuf[PETSCSF_REMOTE][rootmtype])); 1087cd620004SJunchao Zhang } 10889566063dSJacob Faibussowitsch PetscCall(PetscSFLinkLogFlopsAfterUnpackRootData(sf, link, PETSCSF_REMOTE, op)); 10899566063dSJacob Faibussowitsch PetscCall(PetscLogEventEnd(PETSCSF_Unpack, sf, 0, 0, 0)); 1090cd620004SJunchao Zhang PetscFunctionReturn(0); 1091cd620004SJunchao Zhang } 1092cd620004SJunchao Zhang 1093*9371c9d4SSatish Balay PetscErrorCode PetscSFLinkScatterLocal(PetscSF sf, PetscSFLink link, PetscSFDirection direction, void *rootdata, void *leafdata, MPI_Op op) { 1094cd620004SJunchao Zhang const PetscInt *rootindices = NULL, *leafindices = NULL; 1095cd620004SJunchao Zhang PetscInt count, rootstart, leafstart; 1096cd620004SJunchao Zhang PetscSF_Basic *bas = (PetscSF_Basic *)sf->data; 1097fcc7397dSJunchao Zhang PetscErrorCode (*ScatterAndOp)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *) = NULL; 109871438e86SJunchao Zhang PetscMemType rootmtype = link->rootmtype, leafmtype = link->leafmtype, srcmtype, dstmtype; 1099fcc7397dSJunchao Zhang PetscSFPackOpt leafopt = NULL, rootopt = NULL; 110071438e86SJunchao Zhang PetscInt buflen = sf->leafbuflen[PETSCSF_LOCAL]; 110171438e86SJunchao Zhang char *srcbuf = NULL, *dstbuf = NULL; 110271438e86SJunchao Zhang PetscBool dstdups; 1103cd620004SJunchao Zhang 1104362febeeSStefano Zampini PetscFunctionBegin; 110571438e86SJunchao Zhang if (!buflen) PetscFunctionReturn(0); 110671438e86SJunchao Zhang if (rootmtype != leafmtype) { /* The cross memory space local scatter is done by pack, copy and unpack */ 110771438e86SJunchao Zhang if (direction == PETSCSF_ROOT2LEAF) { 11089566063dSJacob Faibussowitsch PetscCall(PetscSFLinkPackRootData(sf, link, PETSCSF_LOCAL, rootdata)); 110971438e86SJunchao Zhang srcmtype = rootmtype; 111071438e86SJunchao Zhang srcbuf = link->rootbuf[PETSCSF_LOCAL][rootmtype]; 111171438e86SJunchao Zhang dstmtype = leafmtype; 111271438e86SJunchao Zhang dstbuf = link->leafbuf[PETSCSF_LOCAL][leafmtype]; 111371438e86SJunchao Zhang } else { 11149566063dSJacob Faibussowitsch PetscCall(PetscSFLinkPackLeafData(sf, link, PETSCSF_LOCAL, leafdata)); 111571438e86SJunchao Zhang srcmtype = leafmtype; 111671438e86SJunchao Zhang srcbuf = link->leafbuf[PETSCSF_LOCAL][leafmtype]; 111771438e86SJunchao Zhang dstmtype = rootmtype; 111871438e86SJunchao Zhang dstbuf = link->rootbuf[PETSCSF_LOCAL][rootmtype]; 111971438e86SJunchao Zhang } 11209566063dSJacob Faibussowitsch PetscCall((*link->Memcpy)(link, dstmtype, dstbuf, srcmtype, srcbuf, buflen * link->unitbytes)); 112171438e86SJunchao Zhang /* If above is a device to host copy, we have to sync the stream before accessing the buffer on host */ 11229566063dSJacob Faibussowitsch if (PetscMemTypeHost(dstmtype)) PetscCall((*link->SyncStream)(link)); 112371438e86SJunchao Zhang if (direction == PETSCSF_ROOT2LEAF) { 11249566063dSJacob Faibussowitsch PetscCall(PetscSFLinkUnpackLeafData(sf, link, PETSCSF_LOCAL, leafdata, op)); 1125cd620004SJunchao Zhang } else { 11269566063dSJacob Faibussowitsch PetscCall(PetscSFLinkUnpackRootData(sf, link, PETSCSF_LOCAL, rootdata, op)); 112771438e86SJunchao Zhang } 112871438e86SJunchao Zhang } else { 112971438e86SJunchao Zhang dstdups = (direction == PETSCSF_ROOT2LEAF) ? sf->leafdups[PETSCSF_LOCAL] : bas->rootdups[PETSCSF_LOCAL]; 113071438e86SJunchao Zhang dstmtype = (direction == PETSCSF_ROOT2LEAF) ? link->leafmtype : link->rootmtype; 11319566063dSJacob Faibussowitsch PetscCall(PetscSFLinkGetScatterAndOp(link, dstmtype, op, dstdups, &ScatterAndOp)); 1132cd620004SJunchao Zhang if (ScatterAndOp) { 11339566063dSJacob Faibussowitsch PetscCall(PetscSFLinkGetRootPackOptAndIndices(sf, link, rootmtype, PETSCSF_LOCAL, &count, &rootstart, &rootopt, &rootindices)); 11349566063dSJacob Faibussowitsch PetscCall(PetscSFLinkGetLeafPackOptAndIndices(sf, link, leafmtype, PETSCSF_LOCAL, &count, &leafstart, &leafopt, &leafindices)); 113571438e86SJunchao Zhang if (direction == PETSCSF_ROOT2LEAF) { 11369566063dSJacob Faibussowitsch PetscCall((*ScatterAndOp)(link, count, rootstart, rootopt, rootindices, rootdata, leafstart, leafopt, leafindices, leafdata)); 1137cd620004SJunchao Zhang } else { 11389566063dSJacob Faibussowitsch PetscCall((*ScatterAndOp)(link, count, leafstart, leafopt, leafindices, leafdata, rootstart, rootopt, rootindices, rootdata)); 113971438e86SJunchao Zhang } 1140cd620004SJunchao Zhang } else { 11419566063dSJacob Faibussowitsch PetscCall(PetscSFLinkGetRootPackOptAndIndices(sf, link, PETSC_MEMTYPE_HOST, PETSCSF_LOCAL, &count, &rootstart, &rootopt, &rootindices)); 11429566063dSJacob Faibussowitsch PetscCall(PetscSFLinkGetLeafPackOptAndIndices(sf, link, PETSC_MEMTYPE_HOST, PETSCSF_LOCAL, &count, &leafstart, &leafopt, &leafindices)); 114371438e86SJunchao Zhang if (direction == PETSCSF_ROOT2LEAF) { 11449566063dSJacob Faibussowitsch PetscCall(PetscSFLinkScatterDataWithMPIReduceLocal(sf, link, count, rootstart, rootindices, rootdata, leafstart, leafindices, leafdata, op)); 114571438e86SJunchao Zhang } else { 11469566063dSJacob Faibussowitsch PetscCall(PetscSFLinkScatterDataWithMPIReduceLocal(sf, link, count, leafstart, leafindices, leafdata, rootstart, rootindices, rootdata, op)); 1147cd620004SJunchao Zhang } 1148cd620004SJunchao Zhang } 114971438e86SJunchao Zhang } 1150cd620004SJunchao Zhang PetscFunctionReturn(0); 1151cd620004SJunchao Zhang } 1152cd620004SJunchao Zhang 1153cd620004SJunchao Zhang /* Fetch rootdata to leafdata and leafupdate locally */ 1154*9371c9d4SSatish Balay PetscErrorCode PetscSFLinkFetchAndOpLocal(PetscSF sf, PetscSFLink link, void *rootdata, const void *leafdata, void *leafupdate, MPI_Op op) { 1155cd620004SJunchao Zhang const PetscInt *rootindices = NULL, *leafindices = NULL; 1156cd620004SJunchao Zhang PetscInt count, rootstart, leafstart; 1157cd620004SJunchao Zhang PetscSF_Basic *bas = (PetscSF_Basic *)sf->data; 1158fcc7397dSJunchao Zhang PetscErrorCode (*FetchAndOpLocal)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, void *) = NULL; 1159cd620004SJunchao Zhang const PetscMemType rootmtype = link->rootmtype, leafmtype = link->leafmtype; 1160fcc7397dSJunchao Zhang PetscSFPackOpt leafopt = NULL, rootopt = NULL; 1161cd620004SJunchao Zhang 1162cd620004SJunchao Zhang PetscFunctionBegin; 1163cd620004SJunchao Zhang if (!bas->rootbuflen[PETSCSF_LOCAL]) PetscFunctionReturn(0); 1164cd620004SJunchao Zhang if (rootmtype != leafmtype) { 1165cd620004SJunchao Zhang /* The local communication has to go through pack and unpack */ 1166cd620004SJunchao Zhang SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Doing PetscSFFetchAndOp with rootdata and leafdata on opposite side of CPU and GPU"); 1167cd620004SJunchao Zhang } else { 11689566063dSJacob Faibussowitsch PetscCall(PetscSFLinkGetRootPackOptAndIndices(sf, link, rootmtype, PETSCSF_LOCAL, &count, &rootstart, &rootopt, &rootindices)); 11699566063dSJacob Faibussowitsch PetscCall(PetscSFLinkGetLeafPackOptAndIndices(sf, link, leafmtype, PETSCSF_LOCAL, &count, &leafstart, &leafopt, &leafindices)); 11709566063dSJacob Faibussowitsch PetscCall(PetscSFLinkGetFetchAndOpLocal(link, rootmtype, op, bas->rootdups[PETSCSF_LOCAL], &FetchAndOpLocal)); 11719566063dSJacob Faibussowitsch PetscCall((*FetchAndOpLocal)(link, count, rootstart, rootopt, rootindices, rootdata, leafstart, leafopt, leafindices, leafdata, leafupdate)); 1172cd620004SJunchao Zhang } 117340e23c03SJunchao Zhang PetscFunctionReturn(0); 117440e23c03SJunchao Zhang } 117540e23c03SJunchao Zhang 117640e23c03SJunchao Zhang /* 1177cd620004SJunchao Zhang Create per-rank pack/unpack optimizations based on indice patterns 117840e23c03SJunchao Zhang 117940e23c03SJunchao Zhang Input Parameters: 1180fcc7397dSJunchao Zhang + n - Number of destination ranks 1181eb02082bSJunchao Zhang . offset - [n+1] For the i-th rank, its associated indices are idx[offset[i], offset[i+1]). offset[0] needs not to be 0. 1182b23bfdefSJunchao Zhang - idx - [*] Array storing indices 118340e23c03SJunchao Zhang 118440e23c03SJunchao Zhang Output Parameters: 1185cd620004SJunchao Zhang + opt - Pack optimizations. NULL if no optimizations. 118640e23c03SJunchao Zhang */ 1187*9371c9d4SSatish Balay PetscErrorCode PetscSFCreatePackOpt(PetscInt n, const PetscInt *offset, const PetscInt *idx, PetscSFPackOpt *out) { 1188fcc7397dSJunchao Zhang PetscInt r, p, start, i, j, k, dx, dy, dz, dydz, m, X, Y; 1189fcc7397dSJunchao Zhang PetscBool optimizable = PETSC_TRUE; 119040e23c03SJunchao Zhang PetscSFPackOpt opt; 119140e23c03SJunchao Zhang 119240e23c03SJunchao Zhang PetscFunctionBegin; 11939566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(1, &opt)); 11949566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(7 * n + 2, &opt->array)); 1195fcc7397dSJunchao Zhang opt->n = opt->array[0] = n; 1196fcc7397dSJunchao Zhang opt->offset = opt->array + 1; 1197fcc7397dSJunchao Zhang opt->start = opt->array + n + 2; 1198fcc7397dSJunchao Zhang opt->dx = opt->array + 2 * n + 2; 1199fcc7397dSJunchao Zhang opt->dy = opt->array + 3 * n + 2; 1200fcc7397dSJunchao Zhang opt->dz = opt->array + 4 * n + 2; 1201fcc7397dSJunchao Zhang opt->X = opt->array + 5 * n + 2; 1202fcc7397dSJunchao Zhang opt->Y = opt->array + 6 * n + 2; 1203fcc7397dSJunchao Zhang 1204fcc7397dSJunchao Zhang for (r = 0; r < n; r++) { /* For each destination rank */ 1205fcc7397dSJunchao Zhang m = offset[r + 1] - offset[r]; /* Total number of indices for this rank. We want to see if m can be factored into dx*dy*dz */ 1206fcc7397dSJunchao Zhang p = offset[r]; 1207fcc7397dSJunchao Zhang start = idx[p]; /* First index for this rank */ 1208fcc7397dSJunchao Zhang p++; 1209fcc7397dSJunchao Zhang 1210fcc7397dSJunchao Zhang /* Search in X dimension */ 1211fcc7397dSJunchao Zhang for (dx = 1; dx < m; dx++, p++) { 1212fcc7397dSJunchao Zhang if (start + dx != idx[p]) break; 1213b23bfdefSJunchao Zhang } 1214b23bfdefSJunchao Zhang 1215fcc7397dSJunchao Zhang dydz = m / dx; 1216fcc7397dSJunchao Zhang X = dydz > 1 ? (idx[p] - start) : dx; 1217fcc7397dSJunchao Zhang /* Not optimizable if m is not a multiple of dx, or some unrecognized pattern is found */ 1218*9371c9d4SSatish Balay if (m % dx || X <= 0) { 1219*9371c9d4SSatish Balay optimizable = PETSC_FALSE; 1220*9371c9d4SSatish Balay goto finish; 1221*9371c9d4SSatish Balay } 1222fcc7397dSJunchao Zhang for (dy = 1; dy < dydz; dy++) { /* Search in Y dimension */ 1223fcc7397dSJunchao Zhang for (i = 0; i < dx; i++, p++) { 1224fcc7397dSJunchao Zhang if (start + X * dy + i != idx[p]) { 1225*9371c9d4SSatish Balay if (i) { 1226*9371c9d4SSatish Balay optimizable = PETSC_FALSE; 1227*9371c9d4SSatish Balay goto finish; 1228*9371c9d4SSatish Balay } /* The pattern is violated in the middle of an x-walk */ 1229*9371c9d4SSatish Balay else 1230*9371c9d4SSatish Balay goto Z_dimension; 123140e23c03SJunchao Zhang } 123240e23c03SJunchao Zhang } 123340e23c03SJunchao Zhang } 123440e23c03SJunchao Zhang 1235fcc7397dSJunchao Zhang Z_dimension: 1236fcc7397dSJunchao Zhang dz = m / (dx * dy); 1237fcc7397dSJunchao Zhang Y = dz > 1 ? (idx[p] - start) / X : dy; 1238fcc7397dSJunchao Zhang /* Not optimizable if m is not a multiple of dx*dy, or some unrecognized pattern is found */ 1239*9371c9d4SSatish Balay if (m % (dx * dy) || Y <= 0) { 1240*9371c9d4SSatish Balay optimizable = PETSC_FALSE; 1241*9371c9d4SSatish Balay goto finish; 1242*9371c9d4SSatish Balay } 1243fcc7397dSJunchao Zhang for (k = 1; k < dz; k++) { /* Go through Z dimension to see if remaining indices follow the pattern */ 1244fcc7397dSJunchao Zhang for (j = 0; j < dy; j++) { 1245fcc7397dSJunchao Zhang for (i = 0; i < dx; i++, p++) { 1246*9371c9d4SSatish Balay if (start + X * Y * k + X * j + i != idx[p]) { 1247*9371c9d4SSatish Balay optimizable = PETSC_FALSE; 1248*9371c9d4SSatish Balay goto finish; 1249*9371c9d4SSatish Balay } 125040e23c03SJunchao Zhang } 125140e23c03SJunchao Zhang } 125240e23c03SJunchao Zhang } 1253fcc7397dSJunchao Zhang opt->start[r] = start; 1254fcc7397dSJunchao Zhang opt->dx[r] = dx; 1255fcc7397dSJunchao Zhang opt->dy[r] = dy; 1256fcc7397dSJunchao Zhang opt->dz[r] = dz; 1257fcc7397dSJunchao Zhang opt->X[r] = X; 1258fcc7397dSJunchao Zhang opt->Y[r] = Y; 125940e23c03SJunchao Zhang } 126040e23c03SJunchao Zhang 1261fcc7397dSJunchao Zhang finish: 1262fcc7397dSJunchao Zhang /* If not optimizable, free arrays to save memory */ 1263fcc7397dSJunchao Zhang if (!n || !optimizable) { 12649566063dSJacob Faibussowitsch PetscCall(PetscFree(opt->array)); 12659566063dSJacob Faibussowitsch PetscCall(PetscFree(opt)); 126640e23c03SJunchao Zhang *out = NULL; 1267fcc7397dSJunchao Zhang } else { 1268fcc7397dSJunchao Zhang opt->offset[0] = 0; 1269fcc7397dSJunchao Zhang for (r = 0; r < n; r++) opt->offset[r + 1] = opt->offset[r] + opt->dx[r] * opt->dy[r] * opt->dz[r]; 1270fcc7397dSJunchao Zhang *out = opt; 1271fcc7397dSJunchao Zhang } 127240e23c03SJunchao Zhang PetscFunctionReturn(0); 127340e23c03SJunchao Zhang } 127440e23c03SJunchao Zhang 1275*9371c9d4SSatish Balay static inline PetscErrorCode PetscSFDestroyPackOpt(PetscSF sf, PetscMemType mtype, PetscSFPackOpt *out) { 127640e23c03SJunchao Zhang PetscSFPackOpt opt = *out; 127740e23c03SJunchao Zhang 127840e23c03SJunchao Zhang PetscFunctionBegin; 127940e23c03SJunchao Zhang if (opt) { 12809566063dSJacob Faibussowitsch PetscCall(PetscSFFree(sf, mtype, opt->array)); 12819566063dSJacob Faibussowitsch PetscCall(PetscFree(opt)); 128240e23c03SJunchao Zhang *out = NULL; 128340e23c03SJunchao Zhang } 128440e23c03SJunchao Zhang PetscFunctionReturn(0); 128540e23c03SJunchao Zhang } 1286cd620004SJunchao Zhang 1287*9371c9d4SSatish Balay PetscErrorCode PetscSFSetUpPackFields(PetscSF sf) { 1288cd620004SJunchao Zhang PetscSF_Basic *bas = (PetscSF_Basic *)sf->data; 1289cd620004SJunchao Zhang PetscInt i, j; 1290cd620004SJunchao Zhang 1291cd620004SJunchao Zhang PetscFunctionBegin; 1292cd620004SJunchao Zhang /* [0] for PETSCSF_LOCAL and [1] for PETSCSF_REMOTE in the following */ 1293cd620004SJunchao Zhang for (i = 0; i < 2; i++) { /* Set defaults */ 1294cd620004SJunchao Zhang sf->leafstart[i] = 0; 1295cd620004SJunchao Zhang sf->leafcontig[i] = PETSC_TRUE; 1296cd620004SJunchao Zhang sf->leafdups[i] = PETSC_FALSE; 1297cd620004SJunchao Zhang bas->rootstart[i] = 0; 1298cd620004SJunchao Zhang bas->rootcontig[i] = PETSC_TRUE; 1299cd620004SJunchao Zhang bas->rootdups[i] = PETSC_FALSE; 1300cd620004SJunchao Zhang } 1301cd620004SJunchao Zhang 1302cd620004SJunchao Zhang sf->leafbuflen[0] = sf->roffset[sf->ndranks]; 1303cd620004SJunchao Zhang sf->leafbuflen[1] = sf->roffset[sf->nranks] - sf->roffset[sf->ndranks]; 1304cd620004SJunchao Zhang 1305cd620004SJunchao Zhang if (sf->leafbuflen[0]) sf->leafstart[0] = sf->rmine[0]; 1306cd620004SJunchao Zhang if (sf->leafbuflen[1]) sf->leafstart[1] = sf->rmine[sf->roffset[sf->ndranks]]; 1307cd620004SJunchao Zhang 1308cd620004SJunchao Zhang /* Are leaf indices for self and remote contiguous? If yes, it is best for pack/unpack */ 1309cd620004SJunchao Zhang for (i = 0; i < sf->roffset[sf->ndranks]; i++) { /* self */ 1310*9371c9d4SSatish Balay if (sf->rmine[i] != sf->leafstart[0] + i) { 1311*9371c9d4SSatish Balay sf->leafcontig[0] = PETSC_FALSE; 1312*9371c9d4SSatish Balay break; 1313*9371c9d4SSatish Balay } 1314cd620004SJunchao Zhang } 1315cd620004SJunchao Zhang for (i = sf->roffset[sf->ndranks], j = 0; i < sf->roffset[sf->nranks]; i++, j++) { /* remote */ 1316*9371c9d4SSatish Balay if (sf->rmine[i] != sf->leafstart[1] + j) { 1317*9371c9d4SSatish Balay sf->leafcontig[1] = PETSC_FALSE; 1318*9371c9d4SSatish Balay break; 1319*9371c9d4SSatish Balay } 1320cd620004SJunchao Zhang } 1321cd620004SJunchao Zhang 1322cd620004SJunchao Zhang /* If not, see if we can have per-rank optimizations by doing index analysis */ 13239566063dSJacob Faibussowitsch if (!sf->leafcontig[0]) PetscCall(PetscSFCreatePackOpt(sf->ndranks, sf->roffset, sf->rmine, &sf->leafpackopt[0])); 13249566063dSJacob Faibussowitsch if (!sf->leafcontig[1]) PetscCall(PetscSFCreatePackOpt(sf->nranks - sf->ndranks, sf->roffset + sf->ndranks, sf->rmine, &sf->leafpackopt[1])); 1325cd620004SJunchao Zhang 1326cd620004SJunchao Zhang /* Are root indices for self and remote contiguous? */ 1327cd620004SJunchao Zhang bas->rootbuflen[0] = bas->ioffset[bas->ndiranks]; 1328cd620004SJunchao Zhang bas->rootbuflen[1] = bas->ioffset[bas->niranks] - bas->ioffset[bas->ndiranks]; 1329cd620004SJunchao Zhang 1330cd620004SJunchao Zhang if (bas->rootbuflen[0]) bas->rootstart[0] = bas->irootloc[0]; 1331cd620004SJunchao Zhang if (bas->rootbuflen[1]) bas->rootstart[1] = bas->irootloc[bas->ioffset[bas->ndiranks]]; 1332cd620004SJunchao Zhang 1333cd620004SJunchao Zhang for (i = 0; i < bas->ioffset[bas->ndiranks]; i++) { 1334*9371c9d4SSatish Balay if (bas->irootloc[i] != bas->rootstart[0] + i) { 1335*9371c9d4SSatish Balay bas->rootcontig[0] = PETSC_FALSE; 1336*9371c9d4SSatish Balay break; 1337*9371c9d4SSatish Balay } 1338cd620004SJunchao Zhang } 1339cd620004SJunchao Zhang for (i = bas->ioffset[bas->ndiranks], j = 0; i < bas->ioffset[bas->niranks]; i++, j++) { 1340*9371c9d4SSatish Balay if (bas->irootloc[i] != bas->rootstart[1] + j) { 1341*9371c9d4SSatish Balay bas->rootcontig[1] = PETSC_FALSE; 1342*9371c9d4SSatish Balay break; 1343*9371c9d4SSatish Balay } 1344cd620004SJunchao Zhang } 1345cd620004SJunchao Zhang 13469566063dSJacob Faibussowitsch if (!bas->rootcontig[0]) PetscCall(PetscSFCreatePackOpt(bas->ndiranks, bas->ioffset, bas->irootloc, &bas->rootpackopt[0])); 13479566063dSJacob Faibussowitsch if (!bas->rootcontig[1]) PetscCall(PetscSFCreatePackOpt(bas->niranks - bas->ndiranks, bas->ioffset + bas->ndiranks, bas->irootloc, &bas->rootpackopt[1])); 1348cd620004SJunchao Zhang 1349cd620004SJunchao Zhang /* Check dups in indices so that CUDA unpacking kernels can use cheaper regular instructions instead of atomics when they know there are no data race chances */ 1350013b3241SStefano Zampini if (PetscDefined(HAVE_DEVICE)) { 1351013b3241SStefano Zampini PetscBool ismulti = (sf->multi == sf) ? PETSC_TRUE : PETSC_FALSE; 13529566063dSJacob Faibussowitsch if (!sf->leafcontig[0] && !ismulti) PetscCall(PetscCheckDupsInt(sf->leafbuflen[0], sf->rmine, &sf->leafdups[0])); 13539566063dSJacob Faibussowitsch if (!sf->leafcontig[1] && !ismulti) PetscCall(PetscCheckDupsInt(sf->leafbuflen[1], sf->rmine + sf->roffset[sf->ndranks], &sf->leafdups[1])); 13549566063dSJacob Faibussowitsch if (!bas->rootcontig[0] && !ismulti) PetscCall(PetscCheckDupsInt(bas->rootbuflen[0], bas->irootloc, &bas->rootdups[0])); 13559566063dSJacob Faibussowitsch if (!bas->rootcontig[1] && !ismulti) PetscCall(PetscCheckDupsInt(bas->rootbuflen[1], bas->irootloc + bas->ioffset[bas->ndiranks], &bas->rootdups[1])); 1356013b3241SStefano Zampini } 1357cd620004SJunchao Zhang PetscFunctionReturn(0); 1358cd620004SJunchao Zhang } 1359cd620004SJunchao Zhang 1360*9371c9d4SSatish Balay PetscErrorCode PetscSFResetPackFields(PetscSF sf) { 1361cd620004SJunchao Zhang PetscSF_Basic *bas = (PetscSF_Basic *)sf->data; 1362cd620004SJunchao Zhang PetscInt i; 1363cd620004SJunchao Zhang 1364cd620004SJunchao Zhang PetscFunctionBegin; 1365cd620004SJunchao Zhang for (i = PETSCSF_LOCAL; i <= PETSCSF_REMOTE; i++) { 13669566063dSJacob Faibussowitsch PetscCall(PetscSFDestroyPackOpt(sf, PETSC_MEMTYPE_HOST, &sf->leafpackopt[i])); 13679566063dSJacob Faibussowitsch PetscCall(PetscSFDestroyPackOpt(sf, PETSC_MEMTYPE_HOST, &bas->rootpackopt[i])); 13687fd2d3dbSJunchao Zhang #if defined(PETSC_HAVE_DEVICE) 13699566063dSJacob Faibussowitsch PetscCall(PetscSFDestroyPackOpt(sf, PETSC_MEMTYPE_DEVICE, &sf->leafpackopt_d[i])); 13709566063dSJacob Faibussowitsch PetscCall(PetscSFDestroyPackOpt(sf, PETSC_MEMTYPE_DEVICE, &bas->rootpackopt_d[i])); 13717fd2d3dbSJunchao Zhang #endif 1372cd620004SJunchao Zhang } 1373cd620004SJunchao Zhang PetscFunctionReturn(0); 1374cd620004SJunchao Zhang } 1375