1914b7a73SJunchao Zhang #include <../src/vec/is/sf/impls/basic/sfpack.h> 2914b7a73SJunchao Zhang 3*524fe776SJunchao Zhang #include <petsc_kokkos.hpp> 4914b7a73SJunchao Zhang 5914b7a73SJunchao Zhang using DeviceExecutionSpace = Kokkos::DefaultExecutionSpace; 6914b7a73SJunchao Zhang using DeviceMemorySpace = typename DeviceExecutionSpace::memory_space; 7914b7a73SJunchao Zhang using HostMemorySpace = Kokkos::HostSpace; 8914b7a73SJunchao Zhang 9914b7a73SJunchao Zhang typedef Kokkos::View<char *, DeviceMemorySpace> deviceBuffer_t; 10914b7a73SJunchao Zhang typedef Kokkos::View<char *, HostMemorySpace> HostBuffer_t; 11914b7a73SJunchao Zhang 12914b7a73SJunchao Zhang typedef Kokkos::View<const char *, DeviceMemorySpace> deviceConstBuffer_t; 13914b7a73SJunchao Zhang typedef Kokkos::View<const char *, HostMemorySpace> HostConstBuffer_t; 14914b7a73SJunchao Zhang 15914b7a73SJunchao Zhang /*====================================================================================*/ 16914b7a73SJunchao Zhang /* Regular operations */ 17914b7a73SJunchao Zhang /*====================================================================================*/ 189371c9d4SSatish Balay template <typename Type> 199371c9d4SSatish Balay struct Insert { 20d71ae5a4SJacob Faibussowitsch KOKKOS_INLINE_FUNCTION Type operator()(Type &x, Type y) const 21d71ae5a4SJacob Faibussowitsch { 229371c9d4SSatish Balay Type old = x; 239371c9d4SSatish Balay x = y; 249371c9d4SSatish Balay return old; 259371c9d4SSatish Balay } 269371c9d4SSatish Balay }; 279371c9d4SSatish Balay template <typename Type> 289371c9d4SSatish Balay struct Add { 29d71ae5a4SJacob Faibussowitsch KOKKOS_INLINE_FUNCTION Type operator()(Type &x, Type y) const 30d71ae5a4SJacob Faibussowitsch { 319371c9d4SSatish Balay Type old = x; 329371c9d4SSatish Balay x += y; 339371c9d4SSatish Balay return old; 349371c9d4SSatish Balay } 359371c9d4SSatish Balay }; 369371c9d4SSatish Balay template <typename Type> 379371c9d4SSatish Balay struct Mult { 38d71ae5a4SJacob Faibussowitsch KOKKOS_INLINE_FUNCTION Type operator()(Type &x, Type y) const 39d71ae5a4SJacob Faibussowitsch { 409371c9d4SSatish Balay Type old = x; 419371c9d4SSatish Balay x *= y; 429371c9d4SSatish Balay return old; 439371c9d4SSatish Balay } 449371c9d4SSatish Balay }; 459371c9d4SSatish Balay template <typename Type> 469371c9d4SSatish Balay struct Min { 47d71ae5a4SJacob Faibussowitsch KOKKOS_INLINE_FUNCTION Type operator()(Type &x, Type y) const 48d71ae5a4SJacob Faibussowitsch { 499371c9d4SSatish Balay Type old = x; 509371c9d4SSatish Balay x = PetscMin(x, y); 519371c9d4SSatish Balay return old; 529371c9d4SSatish Balay } 539371c9d4SSatish Balay }; 549371c9d4SSatish Balay template <typename Type> 559371c9d4SSatish Balay struct Max { 56d71ae5a4SJacob Faibussowitsch KOKKOS_INLINE_FUNCTION Type operator()(Type &x, Type y) const 57d71ae5a4SJacob Faibussowitsch { 589371c9d4SSatish Balay Type old = x; 599371c9d4SSatish Balay x = PetscMax(x, y); 609371c9d4SSatish Balay return old; 619371c9d4SSatish Balay } 629371c9d4SSatish Balay }; 639371c9d4SSatish Balay template <typename Type> 649371c9d4SSatish Balay struct LAND { 65d71ae5a4SJacob Faibussowitsch KOKKOS_INLINE_FUNCTION Type operator()(Type &x, Type y) const 66d71ae5a4SJacob Faibussowitsch { 679371c9d4SSatish Balay Type old = x; 689371c9d4SSatish Balay x = x && y; 699371c9d4SSatish Balay return old; 709371c9d4SSatish Balay } 719371c9d4SSatish Balay }; 729371c9d4SSatish Balay template <typename Type> 739371c9d4SSatish Balay struct LOR { 74d71ae5a4SJacob Faibussowitsch KOKKOS_INLINE_FUNCTION Type operator()(Type &x, Type y) const 75d71ae5a4SJacob Faibussowitsch { 769371c9d4SSatish Balay Type old = x; 779371c9d4SSatish Balay x = x || y; 789371c9d4SSatish Balay return old; 799371c9d4SSatish Balay } 809371c9d4SSatish Balay }; 819371c9d4SSatish Balay template <typename Type> 829371c9d4SSatish Balay struct LXOR { 83d71ae5a4SJacob Faibussowitsch KOKKOS_INLINE_FUNCTION Type operator()(Type &x, Type y) const 84d71ae5a4SJacob Faibussowitsch { 859371c9d4SSatish Balay Type old = x; 869371c9d4SSatish Balay x = !x != !y; 879371c9d4SSatish Balay return old; 889371c9d4SSatish Balay } 899371c9d4SSatish Balay }; 909371c9d4SSatish Balay template <typename Type> 919371c9d4SSatish Balay struct BAND { 92d71ae5a4SJacob Faibussowitsch KOKKOS_INLINE_FUNCTION Type operator()(Type &x, Type y) const 93d71ae5a4SJacob Faibussowitsch { 949371c9d4SSatish Balay Type old = x; 959371c9d4SSatish Balay x = x & y; 969371c9d4SSatish Balay return old; 979371c9d4SSatish Balay } 989371c9d4SSatish Balay }; 999371c9d4SSatish Balay template <typename Type> 1009371c9d4SSatish Balay struct BOR { 101d71ae5a4SJacob Faibussowitsch KOKKOS_INLINE_FUNCTION Type operator()(Type &x, Type y) const 102d71ae5a4SJacob Faibussowitsch { 1039371c9d4SSatish Balay Type old = x; 1049371c9d4SSatish Balay x = x | y; 1059371c9d4SSatish Balay return old; 1069371c9d4SSatish Balay } 1079371c9d4SSatish Balay }; 1089371c9d4SSatish Balay template <typename Type> 1099371c9d4SSatish Balay struct BXOR { 110d71ae5a4SJacob Faibussowitsch KOKKOS_INLINE_FUNCTION Type operator()(Type &x, Type y) const 111d71ae5a4SJacob Faibussowitsch { 1129371c9d4SSatish Balay Type old = x; 1139371c9d4SSatish Balay x = x ^ y; 1149371c9d4SSatish Balay return old; 1159371c9d4SSatish Balay } 1169371c9d4SSatish Balay }; 1179371c9d4SSatish Balay template <typename PairType> 1189371c9d4SSatish Balay struct Minloc { 119d71ae5a4SJacob Faibussowitsch KOKKOS_INLINE_FUNCTION PairType operator()(PairType &x, PairType y) const 120d71ae5a4SJacob Faibussowitsch { 121914b7a73SJunchao Zhang PairType old = x; 122914b7a73SJunchao Zhang if (y.first < x.first) x = y; 123914b7a73SJunchao Zhang else if (y.first == x.first) x.second = PetscMin(x.second, y.second); 124914b7a73SJunchao Zhang return old; 125914b7a73SJunchao Zhang } 126914b7a73SJunchao Zhang }; 1279371c9d4SSatish Balay template <typename PairType> 1289371c9d4SSatish Balay struct Maxloc { 129d71ae5a4SJacob Faibussowitsch KOKKOS_INLINE_FUNCTION PairType operator()(PairType &x, PairType y) const 130d71ae5a4SJacob Faibussowitsch { 131914b7a73SJunchao Zhang PairType old = x; 132914b7a73SJunchao Zhang if (y.first > x.first) x = y; 133914b7a73SJunchao Zhang else if (y.first == x.first) x.second = PetscMin(x.second, y.second); /* See MPI MAXLOC */ 134914b7a73SJunchao Zhang return old; 135914b7a73SJunchao Zhang } 136914b7a73SJunchao Zhang }; 137914b7a73SJunchao Zhang 138914b7a73SJunchao Zhang /*====================================================================================*/ 139914b7a73SJunchao Zhang /* Atomic operations */ 140914b7a73SJunchao Zhang /*====================================================================================*/ 1419371c9d4SSatish Balay template <typename Type> 1429371c9d4SSatish Balay struct AtomicInsert { 1439371c9d4SSatish Balay KOKKOS_INLINE_FUNCTION void operator()(Type &x, Type y) const { Kokkos::atomic_assign(&x, y); } 1449371c9d4SSatish Balay }; 1459371c9d4SSatish Balay template <typename Type> 1469371c9d4SSatish Balay struct AtomicAdd { 1479371c9d4SSatish Balay KOKKOS_INLINE_FUNCTION void operator()(Type &x, Type y) const { Kokkos::atomic_add(&x, y); } 1489371c9d4SSatish Balay }; 1499371c9d4SSatish Balay template <typename Type> 1509371c9d4SSatish Balay struct AtomicBAND { 1519371c9d4SSatish Balay KOKKOS_INLINE_FUNCTION void operator()(Type &x, Type y) const { Kokkos::atomic_and(&x, y); } 1529371c9d4SSatish Balay }; 1539371c9d4SSatish Balay template <typename Type> 1549371c9d4SSatish Balay struct AtomicBOR { 1559371c9d4SSatish Balay KOKKOS_INLINE_FUNCTION void operator()(Type &x, Type y) const { Kokkos::atomic_or(&x, y); } 1569371c9d4SSatish Balay }; 1579371c9d4SSatish Balay template <typename Type> 1589371c9d4SSatish Balay struct AtomicBXOR { 1599371c9d4SSatish Balay KOKKOS_INLINE_FUNCTION void operator()(Type &x, Type y) const { Kokkos::atomic_fetch_xor(&x, y); } 1609371c9d4SSatish Balay }; 1619371c9d4SSatish Balay template <typename Type> 1629371c9d4SSatish Balay struct AtomicLAND { 163d71ae5a4SJacob Faibussowitsch KOKKOS_INLINE_FUNCTION void operator()(Type &x, Type y) const 164d71ae5a4SJacob Faibussowitsch { 1659371c9d4SSatish Balay const Type zero = 0, one = ~0; 1669371c9d4SSatish Balay Kokkos::atomic_and(&x, y ? one : zero); 1679371c9d4SSatish Balay } 1689371c9d4SSatish Balay }; 1699371c9d4SSatish Balay template <typename Type> 1709371c9d4SSatish Balay struct AtomicLOR { 171d71ae5a4SJacob Faibussowitsch KOKKOS_INLINE_FUNCTION void operator()(Type &x, Type y) const 172d71ae5a4SJacob Faibussowitsch { 1739371c9d4SSatish Balay const Type zero = 0, one = 1; 1749371c9d4SSatish Balay Kokkos::atomic_or(&x, y ? one : zero); 1759371c9d4SSatish Balay } 1769371c9d4SSatish Balay }; 1779371c9d4SSatish Balay template <typename Type> 1789371c9d4SSatish Balay struct AtomicMult { 1799371c9d4SSatish Balay KOKKOS_INLINE_FUNCTION void operator()(Type &x, Type y) const { Kokkos::atomic_fetch_mul(&x, y); } 1809371c9d4SSatish Balay }; 1819371c9d4SSatish Balay template <typename Type> 1829371c9d4SSatish Balay struct AtomicMin { 1839371c9d4SSatish Balay KOKKOS_INLINE_FUNCTION void operator()(Type &x, Type y) const { Kokkos::atomic_fetch_min(&x, y); } 1849371c9d4SSatish Balay }; 1859371c9d4SSatish Balay template <typename Type> 1869371c9d4SSatish Balay struct AtomicMax { 1879371c9d4SSatish Balay KOKKOS_INLINE_FUNCTION void operator()(Type &x, Type y) const { Kokkos::atomic_fetch_max(&x, y); } 1889371c9d4SSatish Balay }; 189914b7a73SJunchao Zhang /* TODO: struct AtomicLXOR */ 1909371c9d4SSatish Balay template <typename Type> 1919371c9d4SSatish Balay struct AtomicFetchAdd { 1929371c9d4SSatish Balay KOKKOS_INLINE_FUNCTION Type operator()(Type &x, Type y) const { return Kokkos::atomic_fetch_add(&x, y); } 1939371c9d4SSatish Balay }; 194914b7a73SJunchao Zhang 195914b7a73SJunchao Zhang /* Map a thread id to an index in root/leaf space through a series of 3D subdomains. See PetscSFPackOpt. */ 196d71ae5a4SJacob Faibussowitsch static KOKKOS_INLINE_FUNCTION PetscInt MapTidToIndex(const PetscInt *opt, PetscInt tid) 197d71ae5a4SJacob Faibussowitsch { 198914b7a73SJunchao Zhang PetscInt i, j, k, m, n, r; 199914b7a73SJunchao Zhang const PetscInt *offset, *start, *dx, *dy, *X, *Y; 200914b7a73SJunchao Zhang 201914b7a73SJunchao Zhang n = opt[0]; 202914b7a73SJunchao Zhang offset = opt + 1; 203914b7a73SJunchao Zhang start = opt + n + 2; 204914b7a73SJunchao Zhang dx = opt + 2 * n + 2; 205914b7a73SJunchao Zhang dy = opt + 3 * n + 2; 206914b7a73SJunchao Zhang X = opt + 5 * n + 2; 207914b7a73SJunchao Zhang Y = opt + 6 * n + 2; 2089371c9d4SSatish Balay for (r = 0; r < n; r++) { 2099371c9d4SSatish Balay if (tid < offset[r + 1]) break; 2109371c9d4SSatish Balay } 211914b7a73SJunchao Zhang m = (tid - offset[r]); 212914b7a73SJunchao Zhang k = m / (dx[r] * dy[r]); 213914b7a73SJunchao Zhang j = (m - k * dx[r] * dy[r]) / dx[r]; 214914b7a73SJunchao Zhang i = m - k * dx[r] * dy[r] - j * dx[r]; 215914b7a73SJunchao Zhang 216914b7a73SJunchao Zhang return (start[r] + k * X[r] * Y[r] + j * X[r] + i); 217914b7a73SJunchao Zhang } 218914b7a73SJunchao Zhang 219914b7a73SJunchao Zhang /*====================================================================================*/ 220914b7a73SJunchao Zhang /* Wrappers for Pack/Unpack/Scatter kernels. Function pointers are stored in 'link' */ 221914b7a73SJunchao Zhang /*====================================================================================*/ 222914b7a73SJunchao Zhang 223914b7a73SJunchao Zhang /* Suppose user calls PetscSFReduce(sf,unit,...) and <unit> is an MPI data type made of 16 PetscReals, then 224914b7a73SJunchao Zhang <Type> is PetscReal, which is the primitive type we operate on. 225914b7a73SJunchao Zhang <bs> is 16, which says <unit> contains 16 primitive types. 226914b7a73SJunchao Zhang <BS> is 8, which is the maximal SIMD width we will try to vectorize operations on <unit>. 227914b7a73SJunchao Zhang <EQ> is 0, which is (bs == BS ? 1 : 0) 228914b7a73SJunchao Zhang 229914b7a73SJunchao Zhang If instead, <unit> has 8 PetscReals, then bs=8, BS=8, EQ=1, rendering MBS below to a compile time constant. 230914b7a73SJunchao Zhang For the common case in VecScatter, bs=1, BS=1, EQ=1, MBS=1, the inner for-loops below will be totally unrolled. 231914b7a73SJunchao Zhang */ 232914b7a73SJunchao Zhang template <typename Type, PetscInt BS, PetscInt EQ> 233d71ae5a4SJacob Faibussowitsch static PetscErrorCode Pack(PetscSFLink link, PetscInt count, PetscInt start, PetscSFPackOpt opt, const PetscInt *idx, const void *data_, void *buf_) 234d71ae5a4SJacob Faibussowitsch { 235914b7a73SJunchao Zhang const PetscInt *iopt = opt ? opt->array : NULL; 236914b7a73SJunchao Zhang const PetscInt M = EQ ? 1 : link->bs / BS, MBS = M * BS; /* If EQ, then MBS will be a compile-time const */ 237914b7a73SJunchao Zhang const Type *data = static_cast<const Type *>(data_); 238914b7a73SJunchao Zhang Type *buf = static_cast<Type *>(buf_); 239*524fe776SJunchao Zhang DeviceExecutionSpace &exec = PetscGetKokkosExecutionSpace(); 240914b7a73SJunchao Zhang 241914b7a73SJunchao Zhang PetscFunctionBegin; 2429371c9d4SSatish Balay Kokkos::parallel_for( 2439371c9d4SSatish Balay Kokkos::RangePolicy<DeviceExecutionSpace>(exec, 0, count), KOKKOS_LAMBDA(PetscInt tid) { 244914b7a73SJunchao Zhang /* iopt != NULL ==> idx == NULL, i.e., the indices have patterns but not contiguous; 245914b7a73SJunchao Zhang iopt == NULL && idx == NULL ==> the indices are contiguous; 246914b7a73SJunchao Zhang */ 247914b7a73SJunchao Zhang PetscInt t = (iopt ? MapTidToIndex(iopt, tid) : (idx ? idx[tid] : start + tid)) * MBS; 248914b7a73SJunchao Zhang PetscInt s = tid * MBS; 249914b7a73SJunchao Zhang for (int i = 0; i < MBS; i++) buf[s + i] = data[t + i]; 250914b7a73SJunchao Zhang }); 2513ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 252914b7a73SJunchao Zhang } 253914b7a73SJunchao Zhang 254914b7a73SJunchao Zhang template <typename Type, class Op, PetscInt BS, PetscInt EQ> 255d71ae5a4SJacob Faibussowitsch static PetscErrorCode UnpackAndOp(PetscSFLink link, PetscInt count, PetscInt start, PetscSFPackOpt opt, const PetscInt *idx, void *data_, const void *buf_) 256d71ae5a4SJacob Faibussowitsch { 257914b7a73SJunchao Zhang Op op; 258914b7a73SJunchao Zhang const PetscInt *iopt = opt ? opt->array : NULL; 259914b7a73SJunchao Zhang const PetscInt M = EQ ? 1 : link->bs / BS, MBS = M * BS; 260914b7a73SJunchao Zhang Type *data = static_cast<Type *>(data_); 261914b7a73SJunchao Zhang const Type *buf = static_cast<const Type *>(buf_); 262*524fe776SJunchao Zhang DeviceExecutionSpace &exec = PetscGetKokkosExecutionSpace(); 263914b7a73SJunchao Zhang 264914b7a73SJunchao Zhang PetscFunctionBegin; 2659371c9d4SSatish Balay Kokkos::parallel_for( 2669371c9d4SSatish Balay Kokkos::RangePolicy<DeviceExecutionSpace>(exec, 0, count), KOKKOS_LAMBDA(PetscInt tid) { 267914b7a73SJunchao Zhang PetscInt t = (iopt ? MapTidToIndex(iopt, tid) : (idx ? idx[tid] : start + tid)) * MBS; 268914b7a73SJunchao Zhang PetscInt s = tid * MBS; 269914b7a73SJunchao Zhang for (int i = 0; i < MBS; i++) op(data[t + i], buf[s + i]); 270914b7a73SJunchao Zhang }); 2713ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 272914b7a73SJunchao Zhang } 273914b7a73SJunchao Zhang 274914b7a73SJunchao Zhang template <typename Type, class Op, PetscInt BS, PetscInt EQ> 275d71ae5a4SJacob Faibussowitsch static PetscErrorCode FetchAndOp(PetscSFLink link, PetscInt count, PetscInt start, PetscSFPackOpt opt, const PetscInt *idx, void *data, void *buf) 276d71ae5a4SJacob Faibussowitsch { 277914b7a73SJunchao Zhang Op op; 278914b7a73SJunchao Zhang const PetscInt *ropt = opt ? opt->array : NULL; 279914b7a73SJunchao Zhang const PetscInt M = EQ ? 1 : link->bs / BS, MBS = M * BS; 280914b7a73SJunchao Zhang Type *rootdata = static_cast<Type *>(data), *leafbuf = static_cast<Type *>(buf); 281*524fe776SJunchao Zhang DeviceExecutionSpace &exec = PetscGetKokkosExecutionSpace(); 282914b7a73SJunchao Zhang 283914b7a73SJunchao Zhang PetscFunctionBegin; 2849371c9d4SSatish Balay Kokkos::parallel_for( 2859371c9d4SSatish Balay Kokkos::RangePolicy<DeviceExecutionSpace>(exec, 0, count), KOKKOS_LAMBDA(PetscInt tid) { 286914b7a73SJunchao Zhang PetscInt r = (ropt ? MapTidToIndex(ropt, tid) : (idx ? idx[tid] : start + tid)) * MBS; 287914b7a73SJunchao Zhang PetscInt l = tid * MBS; 288914b7a73SJunchao Zhang for (int i = 0; i < MBS; i++) leafbuf[l + i] = op(rootdata[r + i], leafbuf[l + i]); 289914b7a73SJunchao Zhang }); 2903ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 291914b7a73SJunchao Zhang } 292914b7a73SJunchao Zhang 293914b7a73SJunchao Zhang template <typename Type, class Op, PetscInt BS, PetscInt EQ> 294d71ae5a4SJacob Faibussowitsch static PetscErrorCode ScatterAndOp(PetscSFLink link, PetscInt count, PetscInt srcStart, PetscSFPackOpt srcOpt, const PetscInt *srcIdx, const void *src_, PetscInt dstStart, PetscSFPackOpt dstOpt, const PetscInt *dstIdx, void *dst_) 295d71ae5a4SJacob Faibussowitsch { 296914b7a73SJunchao Zhang PetscInt srcx = 0, srcy = 0, srcX = 0, srcY = 0, dstx = 0, dsty = 0, dstX = 0, dstY = 0; 297914b7a73SJunchao Zhang const PetscInt M = (EQ) ? 1 : link->bs / BS, MBS = M * BS; 298914b7a73SJunchao Zhang const Type *src = static_cast<const Type *>(src_); 299914b7a73SJunchao Zhang Type *dst = static_cast<Type *>(dst_); 300*524fe776SJunchao Zhang DeviceExecutionSpace &exec = PetscGetKokkosExecutionSpace(); 301914b7a73SJunchao Zhang 302914b7a73SJunchao Zhang PetscFunctionBegin; 303914b7a73SJunchao Zhang /* The 3D shape of source subdomain may be different than that of the destination, which makes it difficult to use CUDA 3D grid and block */ 3049371c9d4SSatish Balay if (srcOpt) { 3059371c9d4SSatish Balay srcx = srcOpt->dx[0]; 3069371c9d4SSatish Balay srcy = srcOpt->dy[0]; 3079371c9d4SSatish Balay srcX = srcOpt->X[0]; 3089371c9d4SSatish Balay srcY = srcOpt->Y[0]; 3099371c9d4SSatish Balay srcStart = srcOpt->start[0]; 3109371c9d4SSatish Balay srcIdx = NULL; 3119371c9d4SSatish Balay } else if (!srcIdx) { 3129371c9d4SSatish Balay srcx = srcX = count; 3139371c9d4SSatish Balay srcy = srcY = 1; 3149371c9d4SSatish Balay } 315914b7a73SJunchao Zhang 3169371c9d4SSatish Balay if (dstOpt) { 3179371c9d4SSatish Balay dstx = dstOpt->dx[0]; 3189371c9d4SSatish Balay dsty = dstOpt->dy[0]; 3199371c9d4SSatish Balay dstX = dstOpt->X[0]; 3209371c9d4SSatish Balay dstY = dstOpt->Y[0]; 3219371c9d4SSatish Balay dstStart = dstOpt->start[0]; 3229371c9d4SSatish Balay dstIdx = NULL; 3239371c9d4SSatish Balay } else if (!dstIdx) { 3249371c9d4SSatish Balay dstx = dstX = count; 3259371c9d4SSatish Balay dsty = dstY = 1; 3269371c9d4SSatish Balay } 327914b7a73SJunchao Zhang 3289371c9d4SSatish Balay Kokkos::parallel_for( 3299371c9d4SSatish Balay Kokkos::RangePolicy<DeviceExecutionSpace>(exec, 0, count), KOKKOS_LAMBDA(PetscInt tid) { 330914b7a73SJunchao Zhang PetscInt i, j, k, s, t; 331914b7a73SJunchao Zhang Op op; 332914b7a73SJunchao Zhang if (!srcIdx) { /* src is in 3D */ 333914b7a73SJunchao Zhang k = tid / (srcx * srcy); 334914b7a73SJunchao Zhang j = (tid - k * srcx * srcy) / srcx; 335914b7a73SJunchao Zhang i = tid - k * srcx * srcy - j * srcx; 336914b7a73SJunchao Zhang s = srcStart + k * srcX * srcY + j * srcX + i; 337914b7a73SJunchao Zhang } else { /* src is contiguous */ 338914b7a73SJunchao Zhang s = srcIdx[tid]; 339914b7a73SJunchao Zhang } 340914b7a73SJunchao Zhang 341914b7a73SJunchao Zhang if (!dstIdx) { /* 3D */ 342914b7a73SJunchao Zhang k = tid / (dstx * dsty); 343914b7a73SJunchao Zhang j = (tid - k * dstx * dsty) / dstx; 344914b7a73SJunchao Zhang i = tid - k * dstx * dsty - j * dstx; 345914b7a73SJunchao Zhang t = dstStart + k * dstX * dstY + j * dstX + i; 346914b7a73SJunchao Zhang } else { /* contiguous */ 347914b7a73SJunchao Zhang t = dstIdx[tid]; 348914b7a73SJunchao Zhang } 349914b7a73SJunchao Zhang 350914b7a73SJunchao Zhang s *= MBS; 351914b7a73SJunchao Zhang t *= MBS; 352914b7a73SJunchao Zhang for (i = 0; i < MBS; i++) op(dst[t + i], src[s + i]); 353914b7a73SJunchao Zhang }); 3543ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 355914b7a73SJunchao Zhang } 356914b7a73SJunchao Zhang 357914b7a73SJunchao Zhang /* Specialization for Insert since we may use memcpy */ 358914b7a73SJunchao Zhang template <typename Type, PetscInt BS, PetscInt EQ> 359d71ae5a4SJacob Faibussowitsch static PetscErrorCode ScatterAndInsert(PetscSFLink link, PetscInt count, PetscInt srcStart, PetscSFPackOpt srcOpt, const PetscInt *srcIdx, const void *src_, PetscInt dstStart, PetscSFPackOpt dstOpt, const PetscInt *dstIdx, void *dst_) 360d71ae5a4SJacob Faibussowitsch { 361914b7a73SJunchao Zhang const Type *src = static_cast<const Type *>(src_); 362914b7a73SJunchao Zhang Type *dst = static_cast<Type *>(dst_); 363*524fe776SJunchao Zhang DeviceExecutionSpace &exec = PetscGetKokkosExecutionSpace(); 364914b7a73SJunchao Zhang 365914b7a73SJunchao Zhang PetscFunctionBegin; 3663ba16761SJacob Faibussowitsch if (!count) PetscFunctionReturn(PETSC_SUCCESS); 367914b7a73SJunchao Zhang /*src and dst are contiguous */ 368914b7a73SJunchao Zhang if ((!srcOpt && !srcIdx) && (!dstOpt && !dstIdx) && src != dst) { 369914b7a73SJunchao Zhang size_t sz = count * link->unitbytes; 370914b7a73SJunchao Zhang deviceBuffer_t dbuf(reinterpret_cast<char *>(dst + dstStart * link->bs), sz); 371914b7a73SJunchao Zhang deviceConstBuffer_t sbuf(reinterpret_cast<const char *>(src + srcStart * link->bs), sz); 372914b7a73SJunchao Zhang Kokkos::deep_copy(exec, dbuf, sbuf); 373914b7a73SJunchao Zhang } else { 3749566063dSJacob Faibussowitsch PetscCall(ScatterAndOp<Type, Insert<Type>, BS, EQ>(link, count, srcStart, srcOpt, srcIdx, src, dstStart, dstOpt, dstIdx, dst)); 375914b7a73SJunchao Zhang } 3763ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 377914b7a73SJunchao Zhang } 378914b7a73SJunchao Zhang 379914b7a73SJunchao Zhang template <typename Type, class Op, PetscInt BS, PetscInt EQ> 380d71ae5a4SJacob Faibussowitsch static PetscErrorCode FetchAndOpLocal(PetscSFLink link, PetscInt count, PetscInt rootstart, PetscSFPackOpt rootopt, const PetscInt *rootidx, void *rootdata_, PetscInt leafstart, PetscSFPackOpt leafopt, const PetscInt *leafidx, const void *leafdata_, void *leafupdate_) 381d71ae5a4SJacob Faibussowitsch { 382914b7a73SJunchao Zhang Op op; 383914b7a73SJunchao Zhang const PetscInt M = (EQ) ? 1 : link->bs / BS, MBS = M * BS; 384914b7a73SJunchao Zhang const PetscInt *ropt = rootopt ? rootopt->array : NULL; 385914b7a73SJunchao Zhang const PetscInt *lopt = leafopt ? leafopt->array : NULL; 386914b7a73SJunchao Zhang Type *rootdata = static_cast<Type *>(rootdata_), *leafupdate = static_cast<Type *>(leafupdate_); 387914b7a73SJunchao Zhang const Type *leafdata = static_cast<const Type *>(leafdata_); 388*524fe776SJunchao Zhang DeviceExecutionSpace &exec = PetscGetKokkosExecutionSpace(); 389914b7a73SJunchao Zhang 390914b7a73SJunchao Zhang PetscFunctionBegin; 3919371c9d4SSatish Balay Kokkos::parallel_for( 3929371c9d4SSatish Balay Kokkos::RangePolicy<DeviceExecutionSpace>(exec, 0, count), KOKKOS_LAMBDA(PetscInt tid) { 393914b7a73SJunchao Zhang PetscInt r = (ropt ? MapTidToIndex(ropt, tid) : (rootidx ? rootidx[tid] : rootstart + tid)) * MBS; 394914b7a73SJunchao Zhang PetscInt l = (lopt ? MapTidToIndex(lopt, tid) : (leafidx ? leafidx[tid] : leafstart + tid)) * MBS; 395914b7a73SJunchao Zhang for (int i = 0; i < MBS; i++) leafupdate[l + i] = op(rootdata[r + i], leafdata[l + i]); 396914b7a73SJunchao Zhang }); 3973ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 398914b7a73SJunchao Zhang } 399914b7a73SJunchao Zhang 400914b7a73SJunchao Zhang /*====================================================================================*/ 401914b7a73SJunchao Zhang /* Init various types and instantiate pack/unpack function pointers */ 402914b7a73SJunchao Zhang /*====================================================================================*/ 403914b7a73SJunchao Zhang template <typename Type, PetscInt BS, PetscInt EQ> 404d71ae5a4SJacob Faibussowitsch static void PackInit_RealType(PetscSFLink link) 405d71ae5a4SJacob Faibussowitsch { 406914b7a73SJunchao Zhang /* Pack/unpack for remote communication */ 407914b7a73SJunchao Zhang link->d_Pack = Pack<Type, BS, EQ>; 408914b7a73SJunchao Zhang link->d_UnpackAndInsert = UnpackAndOp<Type, Insert<Type>, BS, EQ>; 409914b7a73SJunchao Zhang link->d_UnpackAndAdd = UnpackAndOp<Type, Add<Type>, BS, EQ>; 410914b7a73SJunchao Zhang link->d_UnpackAndMult = UnpackAndOp<Type, Mult<Type>, BS, EQ>; 411914b7a73SJunchao Zhang link->d_UnpackAndMin = UnpackAndOp<Type, Min<Type>, BS, EQ>; 412914b7a73SJunchao Zhang link->d_UnpackAndMax = UnpackAndOp<Type, Max<Type>, BS, EQ>; 413914b7a73SJunchao Zhang link->d_FetchAndAdd = FetchAndOp<Type, Add<Type>, BS, EQ>; 414914b7a73SJunchao Zhang /* Scatter for local communication */ 415914b7a73SJunchao Zhang link->d_ScatterAndInsert = ScatterAndInsert<Type, BS, EQ>; /* Has special optimizations */ 416914b7a73SJunchao Zhang link->d_ScatterAndAdd = ScatterAndOp<Type, Add<Type>, BS, EQ>; 417914b7a73SJunchao Zhang link->d_ScatterAndMult = ScatterAndOp<Type, Mult<Type>, BS, EQ>; 418914b7a73SJunchao Zhang link->d_ScatterAndMin = ScatterAndOp<Type, Min<Type>, BS, EQ>; 419914b7a73SJunchao Zhang link->d_ScatterAndMax = ScatterAndOp<Type, Max<Type>, BS, EQ>; 420914b7a73SJunchao Zhang link->d_FetchAndAddLocal = FetchAndOpLocal<Type, Add<Type>, BS, EQ>; 421914b7a73SJunchao Zhang /* Atomic versions when there are data-race possibilities */ 422914b7a73SJunchao Zhang link->da_UnpackAndInsert = UnpackAndOp<Type, AtomicInsert<Type>, BS, EQ>; 423914b7a73SJunchao Zhang link->da_UnpackAndAdd = UnpackAndOp<Type, AtomicAdd<Type>, BS, EQ>; 424914b7a73SJunchao Zhang link->da_UnpackAndMult = UnpackAndOp<Type, AtomicMult<Type>, BS, EQ>; 425914b7a73SJunchao Zhang link->da_UnpackAndMin = UnpackAndOp<Type, AtomicMin<Type>, BS, EQ>; 426914b7a73SJunchao Zhang link->da_UnpackAndMax = UnpackAndOp<Type, AtomicMax<Type>, BS, EQ>; 427914b7a73SJunchao Zhang link->da_FetchAndAdd = FetchAndOp<Type, AtomicFetchAdd<Type>, BS, EQ>; 428914b7a73SJunchao Zhang 429914b7a73SJunchao Zhang link->da_ScatterAndInsert = ScatterAndOp<Type, AtomicInsert<Type>, BS, EQ>; 430914b7a73SJunchao Zhang link->da_ScatterAndAdd = ScatterAndOp<Type, AtomicAdd<Type>, BS, EQ>; 431914b7a73SJunchao Zhang link->da_ScatterAndMult = ScatterAndOp<Type, AtomicMult<Type>, BS, EQ>; 432914b7a73SJunchao Zhang link->da_ScatterAndMin = ScatterAndOp<Type, AtomicMin<Type>, BS, EQ>; 433914b7a73SJunchao Zhang link->da_ScatterAndMax = ScatterAndOp<Type, AtomicMax<Type>, BS, EQ>; 434914b7a73SJunchao Zhang link->da_FetchAndAddLocal = FetchAndOpLocal<Type, AtomicFetchAdd<Type>, BS, EQ>; 435914b7a73SJunchao Zhang } 436914b7a73SJunchao Zhang 437914b7a73SJunchao Zhang template <typename Type, PetscInt BS, PetscInt EQ> 438d71ae5a4SJacob Faibussowitsch static void PackInit_IntegerType(PetscSFLink link) 439d71ae5a4SJacob Faibussowitsch { 440914b7a73SJunchao Zhang link->d_Pack = Pack<Type, BS, EQ>; 441914b7a73SJunchao Zhang link->d_UnpackAndInsert = UnpackAndOp<Type, Insert<Type>, BS, EQ>; 442914b7a73SJunchao Zhang link->d_UnpackAndAdd = UnpackAndOp<Type, Add<Type>, BS, EQ>; 443914b7a73SJunchao Zhang link->d_UnpackAndMult = UnpackAndOp<Type, Mult<Type>, BS, EQ>; 444914b7a73SJunchao Zhang link->d_UnpackAndMin = UnpackAndOp<Type, Min<Type>, BS, EQ>; 445914b7a73SJunchao Zhang link->d_UnpackAndMax = UnpackAndOp<Type, Max<Type>, BS, EQ>; 446914b7a73SJunchao Zhang link->d_UnpackAndLAND = UnpackAndOp<Type, LAND<Type>, BS, EQ>; 447914b7a73SJunchao Zhang link->d_UnpackAndLOR = UnpackAndOp<Type, LOR<Type>, BS, EQ>; 448914b7a73SJunchao Zhang link->d_UnpackAndLXOR = UnpackAndOp<Type, LXOR<Type>, BS, EQ>; 449914b7a73SJunchao Zhang link->d_UnpackAndBAND = UnpackAndOp<Type, BAND<Type>, BS, EQ>; 450914b7a73SJunchao Zhang link->d_UnpackAndBOR = UnpackAndOp<Type, BOR<Type>, BS, EQ>; 451914b7a73SJunchao Zhang link->d_UnpackAndBXOR = UnpackAndOp<Type, BXOR<Type>, BS, EQ>; 452914b7a73SJunchao Zhang link->d_FetchAndAdd = FetchAndOp<Type, Add<Type>, BS, EQ>; 453914b7a73SJunchao Zhang 454914b7a73SJunchao Zhang link->d_ScatterAndInsert = ScatterAndInsert<Type, BS, EQ>; 455914b7a73SJunchao Zhang link->d_ScatterAndAdd = ScatterAndOp<Type, Add<Type>, BS, EQ>; 456914b7a73SJunchao Zhang link->d_ScatterAndMult = ScatterAndOp<Type, Mult<Type>, BS, EQ>; 457914b7a73SJunchao Zhang link->d_ScatterAndMin = ScatterAndOp<Type, Min<Type>, BS, EQ>; 458914b7a73SJunchao Zhang link->d_ScatterAndMax = ScatterAndOp<Type, Max<Type>, BS, EQ>; 459914b7a73SJunchao Zhang link->d_ScatterAndLAND = ScatterAndOp<Type, LAND<Type>, BS, EQ>; 460914b7a73SJunchao Zhang link->d_ScatterAndLOR = ScatterAndOp<Type, LOR<Type>, BS, EQ>; 461914b7a73SJunchao Zhang link->d_ScatterAndLXOR = ScatterAndOp<Type, LXOR<Type>, BS, EQ>; 462914b7a73SJunchao Zhang link->d_ScatterAndBAND = ScatterAndOp<Type, BAND<Type>, BS, EQ>; 463914b7a73SJunchao Zhang link->d_ScatterAndBOR = ScatterAndOp<Type, BOR<Type>, BS, EQ>; 464914b7a73SJunchao Zhang link->d_ScatterAndBXOR = ScatterAndOp<Type, BXOR<Type>, BS, EQ>; 465914b7a73SJunchao Zhang link->d_FetchAndAddLocal = FetchAndOpLocal<Type, Add<Type>, BS, EQ>; 466914b7a73SJunchao Zhang 467914b7a73SJunchao Zhang link->da_UnpackAndInsert = UnpackAndOp<Type, AtomicInsert<Type>, BS, EQ>; 468914b7a73SJunchao Zhang link->da_UnpackAndAdd = UnpackAndOp<Type, AtomicAdd<Type>, BS, EQ>; 469914b7a73SJunchao Zhang link->da_UnpackAndMult = UnpackAndOp<Type, AtomicMult<Type>, BS, EQ>; 470914b7a73SJunchao Zhang link->da_UnpackAndMin = UnpackAndOp<Type, AtomicMin<Type>, BS, EQ>; 471914b7a73SJunchao Zhang link->da_UnpackAndMax = UnpackAndOp<Type, AtomicMax<Type>, BS, EQ>; 472914b7a73SJunchao Zhang link->da_UnpackAndLAND = UnpackAndOp<Type, AtomicLAND<Type>, BS, EQ>; 473914b7a73SJunchao Zhang link->da_UnpackAndLOR = UnpackAndOp<Type, AtomicLOR<Type>, BS, EQ>; 474914b7a73SJunchao Zhang link->da_UnpackAndBAND = UnpackAndOp<Type, AtomicBAND<Type>, BS, EQ>; 475914b7a73SJunchao Zhang link->da_UnpackAndBOR = UnpackAndOp<Type, AtomicBOR<Type>, BS, EQ>; 476914b7a73SJunchao Zhang link->da_UnpackAndBXOR = UnpackAndOp<Type, AtomicBXOR<Type>, BS, EQ>; 477914b7a73SJunchao Zhang link->da_FetchAndAdd = FetchAndOp<Type, AtomicFetchAdd<Type>, BS, EQ>; 478914b7a73SJunchao Zhang 479914b7a73SJunchao Zhang link->da_ScatterAndInsert = ScatterAndOp<Type, AtomicInsert<Type>, BS, EQ>; 480914b7a73SJunchao Zhang link->da_ScatterAndAdd = ScatterAndOp<Type, AtomicAdd<Type>, BS, EQ>; 481914b7a73SJunchao Zhang link->da_ScatterAndMult = ScatterAndOp<Type, AtomicMult<Type>, BS, EQ>; 482914b7a73SJunchao Zhang link->da_ScatterAndMin = ScatterAndOp<Type, AtomicMin<Type>, BS, EQ>; 483914b7a73SJunchao Zhang link->da_ScatterAndMax = ScatterAndOp<Type, AtomicMax<Type>, BS, EQ>; 484914b7a73SJunchao Zhang link->da_ScatterAndLAND = ScatterAndOp<Type, AtomicLAND<Type>, BS, EQ>; 485914b7a73SJunchao Zhang link->da_ScatterAndLOR = ScatterAndOp<Type, AtomicLOR<Type>, BS, EQ>; 486914b7a73SJunchao Zhang link->da_ScatterAndBAND = ScatterAndOp<Type, AtomicBAND<Type>, BS, EQ>; 487914b7a73SJunchao Zhang link->da_ScatterAndBOR = ScatterAndOp<Type, AtomicBOR<Type>, BS, EQ>; 488914b7a73SJunchao Zhang link->da_ScatterAndBXOR = ScatterAndOp<Type, AtomicBXOR<Type>, BS, EQ>; 489914b7a73SJunchao Zhang link->da_FetchAndAddLocal = FetchAndOpLocal<Type, AtomicFetchAdd<Type>, BS, EQ>; 490914b7a73SJunchao Zhang } 491914b7a73SJunchao Zhang 492914b7a73SJunchao Zhang #if defined(PETSC_HAVE_COMPLEX) 493914b7a73SJunchao Zhang template <typename Type, PetscInt BS, PetscInt EQ> 494d71ae5a4SJacob Faibussowitsch static void PackInit_ComplexType(PetscSFLink link) 495d71ae5a4SJacob Faibussowitsch { 496914b7a73SJunchao Zhang link->d_Pack = Pack<Type, BS, EQ>; 497914b7a73SJunchao Zhang link->d_UnpackAndInsert = UnpackAndOp<Type, Insert<Type>, BS, EQ>; 498914b7a73SJunchao Zhang link->d_UnpackAndAdd = UnpackAndOp<Type, Add<Type>, BS, EQ>; 499914b7a73SJunchao Zhang link->d_UnpackAndMult = UnpackAndOp<Type, Mult<Type>, BS, EQ>; 500914b7a73SJunchao Zhang link->d_FetchAndAdd = FetchAndOp<Type, Add<Type>, BS, EQ>; 501914b7a73SJunchao Zhang 502914b7a73SJunchao Zhang link->d_ScatterAndInsert = ScatterAndInsert<Type, BS, EQ>; 503914b7a73SJunchao Zhang link->d_ScatterAndAdd = ScatterAndOp<Type, Add<Type>, BS, EQ>; 504914b7a73SJunchao Zhang link->d_ScatterAndMult = ScatterAndOp<Type, Mult<Type>, BS, EQ>; 505914b7a73SJunchao Zhang link->d_FetchAndAddLocal = FetchAndOpLocal<Type, Add<Type>, BS, EQ>; 506914b7a73SJunchao Zhang 507914b7a73SJunchao Zhang link->da_UnpackAndInsert = UnpackAndOp<Type, AtomicInsert<Type>, BS, EQ>; 508914b7a73SJunchao Zhang link->da_UnpackAndAdd = UnpackAndOp<Type, AtomicAdd<Type>, BS, EQ>; 509914b7a73SJunchao Zhang link->da_UnpackAndMult = UnpackAndOp<Type, AtomicMult<Type>, BS, EQ>; 510914b7a73SJunchao Zhang link->da_FetchAndAdd = FetchAndOp<Type, AtomicFetchAdd<Type>, BS, EQ>; 511914b7a73SJunchao Zhang 512914b7a73SJunchao Zhang link->da_ScatterAndInsert = ScatterAndOp<Type, AtomicInsert<Type>, BS, EQ>; 513914b7a73SJunchao Zhang link->da_ScatterAndAdd = ScatterAndOp<Type, AtomicAdd<Type>, BS, EQ>; 514914b7a73SJunchao Zhang link->da_ScatterAndMult = ScatterAndOp<Type, AtomicMult<Type>, BS, EQ>; 515914b7a73SJunchao Zhang link->da_FetchAndAddLocal = FetchAndOpLocal<Type, AtomicFetchAdd<Type>, BS, EQ>; 516914b7a73SJunchao Zhang } 517914b7a73SJunchao Zhang #endif 518914b7a73SJunchao Zhang 519914b7a73SJunchao Zhang template <typename Type> 520d71ae5a4SJacob Faibussowitsch static void PackInit_PairType(PetscSFLink link) 521d71ae5a4SJacob Faibussowitsch { 522914b7a73SJunchao Zhang link->d_Pack = Pack<Type, 1, 1>; 523914b7a73SJunchao Zhang link->d_UnpackAndInsert = UnpackAndOp<Type, Insert<Type>, 1, 1>; 524914b7a73SJunchao Zhang link->d_UnpackAndMaxloc = UnpackAndOp<Type, Maxloc<Type>, 1, 1>; 525914b7a73SJunchao Zhang link->d_UnpackAndMinloc = UnpackAndOp<Type, Minloc<Type>, 1, 1>; 526914b7a73SJunchao Zhang 527914b7a73SJunchao Zhang link->d_ScatterAndInsert = ScatterAndOp<Type, Insert<Type>, 1, 1>; 528914b7a73SJunchao Zhang link->d_ScatterAndMaxloc = ScatterAndOp<Type, Maxloc<Type>, 1, 1>; 529914b7a73SJunchao Zhang link->d_ScatterAndMinloc = ScatterAndOp<Type, Minloc<Type>, 1, 1>; 530914b7a73SJunchao Zhang /* Atomics for pair types are not implemented yet */ 531914b7a73SJunchao Zhang } 532914b7a73SJunchao Zhang 533914b7a73SJunchao Zhang template <typename Type, PetscInt BS, PetscInt EQ> 534d71ae5a4SJacob Faibussowitsch static void PackInit_DumbType(PetscSFLink link) 535d71ae5a4SJacob Faibussowitsch { 536914b7a73SJunchao Zhang link->d_Pack = Pack<Type, BS, EQ>; 537914b7a73SJunchao Zhang link->d_UnpackAndInsert = UnpackAndOp<Type, Insert<Type>, BS, EQ>; 538914b7a73SJunchao Zhang link->d_ScatterAndInsert = ScatterAndInsert<Type, BS, EQ>; 539914b7a73SJunchao Zhang /* Atomics for dumb types are not implemented yet */ 540914b7a73SJunchao Zhang } 541914b7a73SJunchao Zhang 542f4af43b4SJunchao Zhang /* 543f4af43b4SJunchao Zhang Kokkos::DefaultExecutionSpace(stream) is a reference counted pointer object. It has a bug 544f4af43b4SJunchao Zhang that one is not able to repeatedly create and destroy the object. SF's original design was each 545f4af43b4SJunchao Zhang SFLink has a stream (NULL or not) and hence an execution space object. The bug prevents us from 546f4af43b4SJunchao Zhang destroying multiple SFLinks with NULL stream and the default execution space object. To avoid 547f4af43b4SJunchao Zhang memory leaks, SF_Kokkos only supports NULL stream, which is also petsc's default scheme. SF_Kokkos 548f4af43b4SJunchao Zhang does not do its own new/delete. It just uses Kokkos::DefaultExecutionSpace(), which is a singliton 549f4af43b4SJunchao Zhang object in Kokkos. 550f4af43b4SJunchao Zhang */ 551f4af43b4SJunchao Zhang /* 552914b7a73SJunchao Zhang static PetscErrorCode PetscSFLinkDestroy_Kokkos(PetscSFLink link) 553914b7a73SJunchao Zhang { 554914b7a73SJunchao Zhang PetscFunctionBegin; 5553ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 556914b7a73SJunchao Zhang } 557f4af43b4SJunchao Zhang */ 558914b7a73SJunchao Zhang 559914b7a73SJunchao Zhang /* Some device-specific utilities */ 560d71ae5a4SJacob Faibussowitsch static PetscErrorCode PetscSFLinkSyncDevice_Kokkos(PetscSFLink PETSC_UNUSED link) 561d71ae5a4SJacob Faibussowitsch { 562914b7a73SJunchao Zhang PetscFunctionBegin; 563914b7a73SJunchao Zhang Kokkos::fence(); 5643ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 565914b7a73SJunchao Zhang } 566914b7a73SJunchao Zhang 567d71ae5a4SJacob Faibussowitsch static PetscErrorCode PetscSFLinkSyncStream_Kokkos(PetscSFLink PETSC_UNUSED link) 568d71ae5a4SJacob Faibussowitsch { 569*524fe776SJunchao Zhang DeviceExecutionSpace &exec = PetscGetKokkosExecutionSpace(); 570914b7a73SJunchao Zhang PetscFunctionBegin; 571914b7a73SJunchao Zhang exec.fence(); 5723ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 573914b7a73SJunchao Zhang } 574914b7a73SJunchao Zhang 575d71ae5a4SJacob Faibussowitsch static PetscErrorCode PetscSFLinkMemcpy_Kokkos(PetscSFLink PETSC_UNUSED link, PetscMemType dstmtype, void *dst, PetscMemType srcmtype, const void *src, size_t n) 576d71ae5a4SJacob Faibussowitsch { 577*524fe776SJunchao Zhang DeviceExecutionSpace &exec = PetscGetKokkosExecutionSpace(); 578914b7a73SJunchao Zhang 579914b7a73SJunchao Zhang PetscFunctionBegin; 5803ba16761SJacob Faibussowitsch if (!n) PetscFunctionReturn(PETSC_SUCCESS); 58171438e86SJunchao Zhang if (PetscMemTypeHost(dstmtype) && PetscMemTypeHost(srcmtype)) { 5829566063dSJacob Faibussowitsch PetscCall(PetscMemcpy(dst, src, n)); 583914b7a73SJunchao Zhang } else { 58471438e86SJunchao Zhang if (PetscMemTypeDevice(dstmtype) && PetscMemTypeHost(srcmtype)) { 585914b7a73SJunchao Zhang deviceBuffer_t dbuf(static_cast<char *>(dst), n); 586914b7a73SJunchao Zhang HostConstBuffer_t sbuf(static_cast<const char *>(src), n); 587914b7a73SJunchao Zhang Kokkos::deep_copy(exec, dbuf, sbuf); 5889566063dSJacob Faibussowitsch PetscCall(PetscLogCpuToGpu(n)); 58971438e86SJunchao Zhang } else if (PetscMemTypeHost(dstmtype) && PetscMemTypeDevice(srcmtype)) { 590914b7a73SJunchao Zhang HostBuffer_t dbuf(static_cast<char *>(dst), n); 591914b7a73SJunchao Zhang deviceConstBuffer_t sbuf(static_cast<const char *>(src), n); 592914b7a73SJunchao Zhang Kokkos::deep_copy(exec, dbuf, sbuf); 5939566063dSJacob Faibussowitsch PetscCall(PetscLogGpuToCpu(n)); 59471438e86SJunchao Zhang } else if (PetscMemTypeDevice(dstmtype) && PetscMemTypeDevice(srcmtype)) { 595914b7a73SJunchao Zhang deviceBuffer_t dbuf(static_cast<char *>(dst), n); 596914b7a73SJunchao Zhang deviceConstBuffer_t sbuf(static_cast<const char *>(src), n); 597914b7a73SJunchao Zhang Kokkos::deep_copy(exec, dbuf, sbuf); 598914b7a73SJunchao Zhang } 599914b7a73SJunchao Zhang } 6003ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 601914b7a73SJunchao Zhang } 602914b7a73SJunchao Zhang 603d71ae5a4SJacob Faibussowitsch PetscErrorCode PetscSFMalloc_Kokkos(PetscMemType mtype, size_t size, void **ptr) 604d71ae5a4SJacob Faibussowitsch { 605914b7a73SJunchao Zhang PetscFunctionBegin; 6069566063dSJacob Faibussowitsch if (PetscMemTypeHost(mtype)) PetscCall(PetscMalloc(size, ptr)); 60771438e86SJunchao Zhang else if (PetscMemTypeDevice(mtype)) { 6089566063dSJacob Faibussowitsch if (!PetscKokkosInitialized) PetscCall(PetscKokkosInitializeCheck()); 60945639126SStefano Zampini *ptr = Kokkos::kokkos_malloc<DeviceMemorySpace>(size); 61098921bdaSJacob Faibussowitsch } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Wrong PetscMemType %d", (int)mtype); 6113ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 612914b7a73SJunchao Zhang } 613914b7a73SJunchao Zhang 614d71ae5a4SJacob Faibussowitsch PetscErrorCode PetscSFFree_Kokkos(PetscMemType mtype, void *ptr) 615d71ae5a4SJacob Faibussowitsch { 616914b7a73SJunchao Zhang PetscFunctionBegin; 6179566063dSJacob Faibussowitsch if (PetscMemTypeHost(mtype)) PetscCall(PetscFree(ptr)); 6189371c9d4SSatish Balay else if (PetscMemTypeDevice(mtype)) { 6199371c9d4SSatish Balay Kokkos::kokkos_free<DeviceMemorySpace>(ptr); 6209371c9d4SSatish Balay } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Wrong PetscMemType %d", (int)mtype); 6213ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 622914b7a73SJunchao Zhang } 623914b7a73SJunchao Zhang 62471438e86SJunchao Zhang /* Destructor when the link uses MPI for communication */ 625d71ae5a4SJacob Faibussowitsch static PetscErrorCode PetscSFLinkDestroy_Kokkos(PetscSF sf, PetscSFLink link) 626d71ae5a4SJacob Faibussowitsch { 62771438e86SJunchao Zhang PetscFunctionBegin; 62871438e86SJunchao Zhang for (int i = PETSCSF_LOCAL; i <= PETSCSF_REMOTE; i++) { 6299566063dSJacob Faibussowitsch PetscCall(PetscSFFree(sf, PETSC_MEMTYPE_DEVICE, link->rootbuf_alloc[i][PETSC_MEMTYPE_DEVICE])); 6309566063dSJacob Faibussowitsch PetscCall(PetscSFFree(sf, PETSC_MEMTYPE_DEVICE, link->leafbuf_alloc[i][PETSC_MEMTYPE_DEVICE])); 63171438e86SJunchao Zhang } 6323ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 63371438e86SJunchao Zhang } 634914b7a73SJunchao Zhang 635914b7a73SJunchao Zhang /* Some fields of link are initialized by PetscSFPackSetUp_Host. This routine only does what needed on device */ 636d71ae5a4SJacob Faibussowitsch PetscErrorCode PetscSFLinkSetUp_Kokkos(PetscSF PETSC_UNUSED sf, PetscSFLink link, MPI_Datatype unit) 637d71ae5a4SJacob Faibussowitsch { 638914b7a73SJunchao Zhang PetscInt nSignedChar = 0, nUnsignedChar = 0, nInt = 0, nPetscInt = 0, nPetscReal = 0; 639914b7a73SJunchao Zhang PetscBool is2Int, is2PetscInt; 640914b7a73SJunchao Zhang #if defined(PETSC_HAVE_COMPLEX) 641914b7a73SJunchao Zhang PetscInt nPetscComplex = 0; 642914b7a73SJunchao Zhang #endif 643914b7a73SJunchao Zhang 644914b7a73SJunchao Zhang PetscFunctionBegin; 6453ba16761SJacob Faibussowitsch if (link->deviceinited) PetscFunctionReturn(PETSC_SUCCESS); 6469566063dSJacob Faibussowitsch PetscCall(PetscKokkosInitializeCheck()); 6479566063dSJacob Faibussowitsch PetscCall(MPIPetsc_Type_compare_contig(unit, MPI_SIGNED_CHAR, &nSignedChar)); 6489566063dSJacob Faibussowitsch PetscCall(MPIPetsc_Type_compare_contig(unit, MPI_UNSIGNED_CHAR, &nUnsignedChar)); 649914b7a73SJunchao Zhang /* MPI_CHAR is treated below as a dumb type that does not support reduction according to MPI standard */ 6509566063dSJacob Faibussowitsch PetscCall(MPIPetsc_Type_compare_contig(unit, MPI_INT, &nInt)); 6519566063dSJacob Faibussowitsch PetscCall(MPIPetsc_Type_compare_contig(unit, MPIU_INT, &nPetscInt)); 6529566063dSJacob Faibussowitsch PetscCall(MPIPetsc_Type_compare_contig(unit, MPIU_REAL, &nPetscReal)); 653914b7a73SJunchao Zhang #if defined(PETSC_HAVE_COMPLEX) 6549566063dSJacob Faibussowitsch PetscCall(MPIPetsc_Type_compare_contig(unit, MPIU_COMPLEX, &nPetscComplex)); 655914b7a73SJunchao Zhang #endif 6569566063dSJacob Faibussowitsch PetscCall(MPIPetsc_Type_compare(unit, MPI_2INT, &is2Int)); 6579566063dSJacob Faibussowitsch PetscCall(MPIPetsc_Type_compare(unit, MPIU_2INT, &is2PetscInt)); 658914b7a73SJunchao Zhang 659914b7a73SJunchao Zhang if (is2Int) { 660914b7a73SJunchao Zhang PackInit_PairType<Kokkos::pair<int, int>>(link); 661914b7a73SJunchao Zhang } else if (is2PetscInt) { /* TODO: when is2PetscInt and nPetscInt=2, we don't know which path to take. The two paths support different ops. */ 662914b7a73SJunchao Zhang PackInit_PairType<Kokkos::pair<PetscInt, PetscInt>>(link); 663914b7a73SJunchao Zhang } else if (nPetscReal) { 664d941a2f0SJunchao Zhang #if !defined(PETSC_HAVE_DEVICE) /* Skip the unimportant stuff to speed up SF device compilation time */ 6659371c9d4SSatish Balay if (nPetscReal == 8) PackInit_RealType<PetscReal, 8, 1>(link); 6669371c9d4SSatish Balay else if (nPetscReal % 8 == 0) PackInit_RealType<PetscReal, 8, 0>(link); 6679371c9d4SSatish Balay else if (nPetscReal == 4) PackInit_RealType<PetscReal, 4, 1>(link); 6689371c9d4SSatish Balay else if (nPetscReal % 4 == 0) PackInit_RealType<PetscReal, 4, 0>(link); 6699371c9d4SSatish Balay else if (nPetscReal == 2) PackInit_RealType<PetscReal, 2, 1>(link); 6709371c9d4SSatish Balay else if (nPetscReal % 2 == 0) PackInit_RealType<PetscReal, 2, 0>(link); 6719371c9d4SSatish Balay else if (nPetscReal == 1) PackInit_RealType<PetscReal, 1, 1>(link); 6729371c9d4SSatish Balay else if (nPetscReal % 1 == 0) 673eee4e20aSJunchao Zhang #endif 674d941a2f0SJunchao Zhang PackInit_RealType<PetscReal, 1, 0>(link); 675874d28e3SJunchao Zhang } else if (nPetscInt && sizeof(PetscInt) == sizeof(llint)) { 676d941a2f0SJunchao Zhang #if !defined(PETSC_HAVE_DEVICE) 6779371c9d4SSatish Balay if (nPetscInt == 8) PackInit_IntegerType<llint, 8, 1>(link); 6789371c9d4SSatish Balay else if (nPetscInt % 8 == 0) PackInit_IntegerType<llint, 8, 0>(link); 6799371c9d4SSatish Balay else if (nPetscInt == 4) PackInit_IntegerType<llint, 4, 1>(link); 6809371c9d4SSatish Balay else if (nPetscInt % 4 == 0) PackInit_IntegerType<llint, 4, 0>(link); 6819371c9d4SSatish Balay else if (nPetscInt == 2) PackInit_IntegerType<llint, 2, 1>(link); 6829371c9d4SSatish Balay else if (nPetscInt % 2 == 0) PackInit_IntegerType<llint, 2, 0>(link); 6839371c9d4SSatish Balay else if (nPetscInt == 1) PackInit_IntegerType<llint, 1, 1>(link); 6849371c9d4SSatish Balay else if (nPetscInt % 1 == 0) 685eee4e20aSJunchao Zhang #endif 686d941a2f0SJunchao Zhang PackInit_IntegerType<llint, 1, 0>(link); 687914b7a73SJunchao Zhang } else if (nInt) { 688d941a2f0SJunchao Zhang #if !defined(PETSC_HAVE_DEVICE) 6899371c9d4SSatish Balay if (nInt == 8) PackInit_IntegerType<int, 8, 1>(link); 6909371c9d4SSatish Balay else if (nInt % 8 == 0) PackInit_IntegerType<int, 8, 0>(link); 6919371c9d4SSatish Balay else if (nInt == 4) PackInit_IntegerType<int, 4, 1>(link); 6929371c9d4SSatish Balay else if (nInt % 4 == 0) PackInit_IntegerType<int, 4, 0>(link); 6939371c9d4SSatish Balay else if (nInt == 2) PackInit_IntegerType<int, 2, 1>(link); 6949371c9d4SSatish Balay else if (nInt % 2 == 0) PackInit_IntegerType<int, 2, 0>(link); 6959371c9d4SSatish Balay else if (nInt == 1) PackInit_IntegerType<int, 1, 1>(link); 6969371c9d4SSatish Balay else if (nInt % 1 == 0) 697eee4e20aSJunchao Zhang #endif 698d941a2f0SJunchao Zhang PackInit_IntegerType<int, 1, 0>(link); 699914b7a73SJunchao Zhang } else if (nSignedChar) { 700d941a2f0SJunchao Zhang #if !defined(PETSC_HAVE_DEVICE) 7019371c9d4SSatish Balay if (nSignedChar == 8) PackInit_IntegerType<char, 8, 1>(link); 7029371c9d4SSatish Balay else if (nSignedChar % 8 == 0) PackInit_IntegerType<char, 8, 0>(link); 7039371c9d4SSatish Balay else if (nSignedChar == 4) PackInit_IntegerType<char, 4, 1>(link); 7049371c9d4SSatish Balay else if (nSignedChar % 4 == 0) PackInit_IntegerType<char, 4, 0>(link); 7059371c9d4SSatish Balay else if (nSignedChar == 2) PackInit_IntegerType<char, 2, 1>(link); 7069371c9d4SSatish Balay else if (nSignedChar % 2 == 0) PackInit_IntegerType<char, 2, 0>(link); 7079371c9d4SSatish Balay else if (nSignedChar == 1) PackInit_IntegerType<char, 1, 1>(link); 7089371c9d4SSatish Balay else if (nSignedChar % 1 == 0) 709eee4e20aSJunchao Zhang #endif 710d941a2f0SJunchao Zhang PackInit_IntegerType<char, 1, 0>(link); 711914b7a73SJunchao Zhang } else if (nUnsignedChar) { 712d941a2f0SJunchao Zhang #if !defined(PETSC_HAVE_DEVICE) 7139371c9d4SSatish Balay if (nUnsignedChar == 8) PackInit_IntegerType<unsigned char, 8, 1>(link); 7149371c9d4SSatish Balay else if (nUnsignedChar % 8 == 0) PackInit_IntegerType<unsigned char, 8, 0>(link); 7159371c9d4SSatish Balay else if (nUnsignedChar == 4) PackInit_IntegerType<unsigned char, 4, 1>(link); 7169371c9d4SSatish Balay else if (nUnsignedChar % 4 == 0) PackInit_IntegerType<unsigned char, 4, 0>(link); 7179371c9d4SSatish Balay else if (nUnsignedChar == 2) PackInit_IntegerType<unsigned char, 2, 1>(link); 7189371c9d4SSatish Balay else if (nUnsignedChar % 2 == 0) PackInit_IntegerType<unsigned char, 2, 0>(link); 7199371c9d4SSatish Balay else if (nUnsignedChar == 1) PackInit_IntegerType<unsigned char, 1, 1>(link); 7209371c9d4SSatish Balay else if (nUnsignedChar % 1 == 0) 721eee4e20aSJunchao Zhang #endif 722d941a2f0SJunchao Zhang PackInit_IntegerType<unsigned char, 1, 0>(link); 723914b7a73SJunchao Zhang #if defined(PETSC_HAVE_COMPLEX) 724914b7a73SJunchao Zhang } else if (nPetscComplex) { 725d941a2f0SJunchao Zhang #if !defined(PETSC_HAVE_DEVICE) 7269371c9d4SSatish Balay if (nPetscComplex == 8) PackInit_ComplexType<Kokkos::complex<PetscReal>, 8, 1>(link); 7279371c9d4SSatish Balay else if (nPetscComplex % 8 == 0) PackInit_ComplexType<Kokkos::complex<PetscReal>, 8, 0>(link); 7289371c9d4SSatish Balay else if (nPetscComplex == 4) PackInit_ComplexType<Kokkos::complex<PetscReal>, 4, 1>(link); 7299371c9d4SSatish Balay else if (nPetscComplex % 4 == 0) PackInit_ComplexType<Kokkos::complex<PetscReal>, 4, 0>(link); 7309371c9d4SSatish Balay else if (nPetscComplex == 2) PackInit_ComplexType<Kokkos::complex<PetscReal>, 2, 1>(link); 7319371c9d4SSatish Balay else if (nPetscComplex % 2 == 0) PackInit_ComplexType<Kokkos::complex<PetscReal>, 2, 0>(link); 7329371c9d4SSatish Balay else if (nPetscComplex == 1) PackInit_ComplexType<Kokkos::complex<PetscReal>, 1, 1>(link); 7339371c9d4SSatish Balay else if (nPetscComplex % 1 == 0) 734eee4e20aSJunchao Zhang #endif 735d941a2f0SJunchao Zhang PackInit_ComplexType<Kokkos::complex<PetscReal>, 1, 0>(link); 736914b7a73SJunchao Zhang #endif 737914b7a73SJunchao Zhang } else { 738914b7a73SJunchao Zhang MPI_Aint lb, nbyte; 7399566063dSJacob Faibussowitsch PetscCallMPI(MPI_Type_get_extent(unit, &lb, &nbyte)); 74008401ef6SPierre Jolivet PetscCheck(lb == 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "Datatype with nonzero lower bound %ld", (long)lb); 741914b7a73SJunchao Zhang if (nbyte % sizeof(int)) { /* If the type size is not multiple of int */ 742d941a2f0SJunchao Zhang #if !defined(PETSC_HAVE_DEVICE) 7439371c9d4SSatish Balay if (nbyte == 4) PackInit_DumbType<char, 4, 1>(link); 7449371c9d4SSatish Balay else if (nbyte % 4 == 0) PackInit_DumbType<char, 4, 0>(link); 7459371c9d4SSatish Balay else if (nbyte == 2) PackInit_DumbType<char, 2, 1>(link); 7469371c9d4SSatish Balay else if (nbyte % 2 == 0) PackInit_DumbType<char, 2, 0>(link); 7479371c9d4SSatish Balay else if (nbyte == 1) PackInit_DumbType<char, 1, 1>(link); 7489371c9d4SSatish Balay else if (nbyte % 1 == 0) 749eee4e20aSJunchao Zhang #endif 750d941a2f0SJunchao Zhang PackInit_DumbType<char, 1, 0>(link); 751914b7a73SJunchao Zhang } else { 752914b7a73SJunchao Zhang nInt = nbyte / sizeof(int); 753d941a2f0SJunchao Zhang #if !defined(PETSC_HAVE_DEVICE) 7549371c9d4SSatish Balay if (nInt == 8) PackInit_DumbType<int, 8, 1>(link); 7559371c9d4SSatish Balay else if (nInt % 8 == 0) PackInit_DumbType<int, 8, 0>(link); 7569371c9d4SSatish Balay else if (nInt == 4) PackInit_DumbType<int, 4, 1>(link); 7579371c9d4SSatish Balay else if (nInt % 4 == 0) PackInit_DumbType<int, 4, 0>(link); 7589371c9d4SSatish Balay else if (nInt == 2) PackInit_DumbType<int, 2, 1>(link); 7599371c9d4SSatish Balay else if (nInt % 2 == 0) PackInit_DumbType<int, 2, 0>(link); 7609371c9d4SSatish Balay else if (nInt == 1) PackInit_DumbType<int, 1, 1>(link); 7619371c9d4SSatish Balay else if (nInt % 1 == 0) 762eee4e20aSJunchao Zhang #endif 763d941a2f0SJunchao Zhang PackInit_DumbType<int, 1, 0>(link); 764914b7a73SJunchao Zhang } 765914b7a73SJunchao Zhang } 766914b7a73SJunchao Zhang 76771438e86SJunchao Zhang link->SyncDevice = PetscSFLinkSyncDevice_Kokkos; 76871438e86SJunchao Zhang link->SyncStream = PetscSFLinkSyncStream_Kokkos; 76920c24465SJunchao Zhang link->Memcpy = PetscSFLinkMemcpy_Kokkos; 77071438e86SJunchao Zhang link->Destroy = PetscSFLinkDestroy_Kokkos; 771914b7a73SJunchao Zhang link->deviceinited = PETSC_TRUE; 7723ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 773914b7a73SJunchao Zhang } 774