xref: /petsc/src/vec/is/sf/impls/basic/kokkos/sfkok.kokkos.cxx (revision 524fe776c8aa733ff2ef43b738fa4e354b69f6ec)
1914b7a73SJunchao Zhang #include <../src/vec/is/sf/impls/basic/sfpack.h>
2914b7a73SJunchao Zhang 
3*524fe776SJunchao Zhang #include <petsc_kokkos.hpp>
4914b7a73SJunchao Zhang 
5914b7a73SJunchao Zhang using DeviceExecutionSpace = Kokkos::DefaultExecutionSpace;
6914b7a73SJunchao Zhang using DeviceMemorySpace    = typename DeviceExecutionSpace::memory_space;
7914b7a73SJunchao Zhang using HostMemorySpace      = Kokkos::HostSpace;
8914b7a73SJunchao Zhang 
9914b7a73SJunchao Zhang typedef Kokkos::View<char *, DeviceMemorySpace> deviceBuffer_t;
10914b7a73SJunchao Zhang typedef Kokkos::View<char *, HostMemorySpace>   HostBuffer_t;
11914b7a73SJunchao Zhang 
12914b7a73SJunchao Zhang typedef Kokkos::View<const char *, DeviceMemorySpace> deviceConstBuffer_t;
13914b7a73SJunchao Zhang typedef Kokkos::View<const char *, HostMemorySpace>   HostConstBuffer_t;
14914b7a73SJunchao Zhang 
15914b7a73SJunchao Zhang /*====================================================================================*/
16914b7a73SJunchao Zhang /*                             Regular operations                           */
17914b7a73SJunchao Zhang /*====================================================================================*/
189371c9d4SSatish Balay template <typename Type>
199371c9d4SSatish Balay struct Insert {
20d71ae5a4SJacob Faibussowitsch   KOKKOS_INLINE_FUNCTION Type operator()(Type &x, Type y) const
21d71ae5a4SJacob Faibussowitsch   {
229371c9d4SSatish Balay     Type old = x;
239371c9d4SSatish Balay     x        = y;
249371c9d4SSatish Balay     return old;
259371c9d4SSatish Balay   }
269371c9d4SSatish Balay };
279371c9d4SSatish Balay template <typename Type>
289371c9d4SSatish Balay struct Add {
29d71ae5a4SJacob Faibussowitsch   KOKKOS_INLINE_FUNCTION Type operator()(Type &x, Type y) const
30d71ae5a4SJacob Faibussowitsch   {
319371c9d4SSatish Balay     Type old = x;
329371c9d4SSatish Balay     x += y;
339371c9d4SSatish Balay     return old;
349371c9d4SSatish Balay   }
359371c9d4SSatish Balay };
369371c9d4SSatish Balay template <typename Type>
379371c9d4SSatish Balay struct Mult {
38d71ae5a4SJacob Faibussowitsch   KOKKOS_INLINE_FUNCTION Type operator()(Type &x, Type y) const
39d71ae5a4SJacob Faibussowitsch   {
409371c9d4SSatish Balay     Type old = x;
419371c9d4SSatish Balay     x *= y;
429371c9d4SSatish Balay     return old;
439371c9d4SSatish Balay   }
449371c9d4SSatish Balay };
459371c9d4SSatish Balay template <typename Type>
469371c9d4SSatish Balay struct Min {
47d71ae5a4SJacob Faibussowitsch   KOKKOS_INLINE_FUNCTION Type operator()(Type &x, Type y) const
48d71ae5a4SJacob Faibussowitsch   {
499371c9d4SSatish Balay     Type old = x;
509371c9d4SSatish Balay     x        = PetscMin(x, y);
519371c9d4SSatish Balay     return old;
529371c9d4SSatish Balay   }
539371c9d4SSatish Balay };
549371c9d4SSatish Balay template <typename Type>
559371c9d4SSatish Balay struct Max {
56d71ae5a4SJacob Faibussowitsch   KOKKOS_INLINE_FUNCTION Type operator()(Type &x, Type y) const
57d71ae5a4SJacob Faibussowitsch   {
589371c9d4SSatish Balay     Type old = x;
599371c9d4SSatish Balay     x        = PetscMax(x, y);
609371c9d4SSatish Balay     return old;
619371c9d4SSatish Balay   }
629371c9d4SSatish Balay };
639371c9d4SSatish Balay template <typename Type>
649371c9d4SSatish Balay struct LAND {
65d71ae5a4SJacob Faibussowitsch   KOKKOS_INLINE_FUNCTION Type operator()(Type &x, Type y) const
66d71ae5a4SJacob Faibussowitsch   {
679371c9d4SSatish Balay     Type old = x;
689371c9d4SSatish Balay     x        = x && y;
699371c9d4SSatish Balay     return old;
709371c9d4SSatish Balay   }
719371c9d4SSatish Balay };
729371c9d4SSatish Balay template <typename Type>
739371c9d4SSatish Balay struct LOR {
74d71ae5a4SJacob Faibussowitsch   KOKKOS_INLINE_FUNCTION Type operator()(Type &x, Type y) const
75d71ae5a4SJacob Faibussowitsch   {
769371c9d4SSatish Balay     Type old = x;
779371c9d4SSatish Balay     x        = x || y;
789371c9d4SSatish Balay     return old;
799371c9d4SSatish Balay   }
809371c9d4SSatish Balay };
819371c9d4SSatish Balay template <typename Type>
829371c9d4SSatish Balay struct LXOR {
83d71ae5a4SJacob Faibussowitsch   KOKKOS_INLINE_FUNCTION Type operator()(Type &x, Type y) const
84d71ae5a4SJacob Faibussowitsch   {
859371c9d4SSatish Balay     Type old = x;
869371c9d4SSatish Balay     x        = !x != !y;
879371c9d4SSatish Balay     return old;
889371c9d4SSatish Balay   }
899371c9d4SSatish Balay };
909371c9d4SSatish Balay template <typename Type>
919371c9d4SSatish Balay struct BAND {
92d71ae5a4SJacob Faibussowitsch   KOKKOS_INLINE_FUNCTION Type operator()(Type &x, Type y) const
93d71ae5a4SJacob Faibussowitsch   {
949371c9d4SSatish Balay     Type old = x;
959371c9d4SSatish Balay     x        = x & y;
969371c9d4SSatish Balay     return old;
979371c9d4SSatish Balay   }
989371c9d4SSatish Balay };
999371c9d4SSatish Balay template <typename Type>
1009371c9d4SSatish Balay struct BOR {
101d71ae5a4SJacob Faibussowitsch   KOKKOS_INLINE_FUNCTION Type operator()(Type &x, Type y) const
102d71ae5a4SJacob Faibussowitsch   {
1039371c9d4SSatish Balay     Type old = x;
1049371c9d4SSatish Balay     x        = x | y;
1059371c9d4SSatish Balay     return old;
1069371c9d4SSatish Balay   }
1079371c9d4SSatish Balay };
1089371c9d4SSatish Balay template <typename Type>
1099371c9d4SSatish Balay struct BXOR {
110d71ae5a4SJacob Faibussowitsch   KOKKOS_INLINE_FUNCTION Type operator()(Type &x, Type y) const
111d71ae5a4SJacob Faibussowitsch   {
1129371c9d4SSatish Balay     Type old = x;
1139371c9d4SSatish Balay     x        = x ^ y;
1149371c9d4SSatish Balay     return old;
1159371c9d4SSatish Balay   }
1169371c9d4SSatish Balay };
1179371c9d4SSatish Balay template <typename PairType>
1189371c9d4SSatish Balay struct Minloc {
119d71ae5a4SJacob Faibussowitsch   KOKKOS_INLINE_FUNCTION PairType operator()(PairType &x, PairType y) const
120d71ae5a4SJacob Faibussowitsch   {
121914b7a73SJunchao Zhang     PairType old = x;
122914b7a73SJunchao Zhang     if (y.first < x.first) x = y;
123914b7a73SJunchao Zhang     else if (y.first == x.first) x.second = PetscMin(x.second, y.second);
124914b7a73SJunchao Zhang     return old;
125914b7a73SJunchao Zhang   }
126914b7a73SJunchao Zhang };
1279371c9d4SSatish Balay template <typename PairType>
1289371c9d4SSatish Balay struct Maxloc {
129d71ae5a4SJacob Faibussowitsch   KOKKOS_INLINE_FUNCTION PairType operator()(PairType &x, PairType y) const
130d71ae5a4SJacob Faibussowitsch   {
131914b7a73SJunchao Zhang     PairType old = x;
132914b7a73SJunchao Zhang     if (y.first > x.first) x = y;
133914b7a73SJunchao Zhang     else if (y.first == x.first) x.second = PetscMin(x.second, y.second); /* See MPI MAXLOC */
134914b7a73SJunchao Zhang     return old;
135914b7a73SJunchao Zhang   }
136914b7a73SJunchao Zhang };
137914b7a73SJunchao Zhang 
138914b7a73SJunchao Zhang /*====================================================================================*/
139914b7a73SJunchao Zhang /*                             Atomic operations                            */
140914b7a73SJunchao Zhang /*====================================================================================*/
1419371c9d4SSatish Balay template <typename Type>
1429371c9d4SSatish Balay struct AtomicInsert {
1439371c9d4SSatish Balay   KOKKOS_INLINE_FUNCTION void operator()(Type &x, Type y) const { Kokkos::atomic_assign(&x, y); }
1449371c9d4SSatish Balay };
1459371c9d4SSatish Balay template <typename Type>
1469371c9d4SSatish Balay struct AtomicAdd {
1479371c9d4SSatish Balay   KOKKOS_INLINE_FUNCTION void operator()(Type &x, Type y) const { Kokkos::atomic_add(&x, y); }
1489371c9d4SSatish Balay };
1499371c9d4SSatish Balay template <typename Type>
1509371c9d4SSatish Balay struct AtomicBAND {
1519371c9d4SSatish Balay   KOKKOS_INLINE_FUNCTION void operator()(Type &x, Type y) const { Kokkos::atomic_and(&x, y); }
1529371c9d4SSatish Balay };
1539371c9d4SSatish Balay template <typename Type>
1549371c9d4SSatish Balay struct AtomicBOR {
1559371c9d4SSatish Balay   KOKKOS_INLINE_FUNCTION void operator()(Type &x, Type y) const { Kokkos::atomic_or(&x, y); }
1569371c9d4SSatish Balay };
1579371c9d4SSatish Balay template <typename Type>
1589371c9d4SSatish Balay struct AtomicBXOR {
1599371c9d4SSatish Balay   KOKKOS_INLINE_FUNCTION void operator()(Type &x, Type y) const { Kokkos::atomic_fetch_xor(&x, y); }
1609371c9d4SSatish Balay };
1619371c9d4SSatish Balay template <typename Type>
1629371c9d4SSatish Balay struct AtomicLAND {
163d71ae5a4SJacob Faibussowitsch   KOKKOS_INLINE_FUNCTION void operator()(Type &x, Type y) const
164d71ae5a4SJacob Faibussowitsch   {
1659371c9d4SSatish Balay     const Type zero = 0, one = ~0;
1669371c9d4SSatish Balay     Kokkos::atomic_and(&x, y ? one : zero);
1679371c9d4SSatish Balay   }
1689371c9d4SSatish Balay };
1699371c9d4SSatish Balay template <typename Type>
1709371c9d4SSatish Balay struct AtomicLOR {
171d71ae5a4SJacob Faibussowitsch   KOKKOS_INLINE_FUNCTION void operator()(Type &x, Type y) const
172d71ae5a4SJacob Faibussowitsch   {
1739371c9d4SSatish Balay     const Type zero = 0, one = 1;
1749371c9d4SSatish Balay     Kokkos::atomic_or(&x, y ? one : zero);
1759371c9d4SSatish Balay   }
1769371c9d4SSatish Balay };
1779371c9d4SSatish Balay template <typename Type>
1789371c9d4SSatish Balay struct AtomicMult {
1799371c9d4SSatish Balay   KOKKOS_INLINE_FUNCTION void operator()(Type &x, Type y) const { Kokkos::atomic_fetch_mul(&x, y); }
1809371c9d4SSatish Balay };
1819371c9d4SSatish Balay template <typename Type>
1829371c9d4SSatish Balay struct AtomicMin {
1839371c9d4SSatish Balay   KOKKOS_INLINE_FUNCTION void operator()(Type &x, Type y) const { Kokkos::atomic_fetch_min(&x, y); }
1849371c9d4SSatish Balay };
1859371c9d4SSatish Balay template <typename Type>
1869371c9d4SSatish Balay struct AtomicMax {
1879371c9d4SSatish Balay   KOKKOS_INLINE_FUNCTION void operator()(Type &x, Type y) const { Kokkos::atomic_fetch_max(&x, y); }
1889371c9d4SSatish Balay };
189914b7a73SJunchao Zhang /* TODO: struct AtomicLXOR  */
1909371c9d4SSatish Balay template <typename Type>
1919371c9d4SSatish Balay struct AtomicFetchAdd {
1929371c9d4SSatish Balay   KOKKOS_INLINE_FUNCTION Type operator()(Type &x, Type y) const { return Kokkos::atomic_fetch_add(&x, y); }
1939371c9d4SSatish Balay };
194914b7a73SJunchao Zhang 
195914b7a73SJunchao Zhang /* Map a thread id to an index in root/leaf space through a series of 3D subdomains. See PetscSFPackOpt. */
196d71ae5a4SJacob Faibussowitsch static KOKKOS_INLINE_FUNCTION PetscInt MapTidToIndex(const PetscInt *opt, PetscInt tid)
197d71ae5a4SJacob Faibussowitsch {
198914b7a73SJunchao Zhang   PetscInt        i, j, k, m, n, r;
199914b7a73SJunchao Zhang   const PetscInt *offset, *start, *dx, *dy, *X, *Y;
200914b7a73SJunchao Zhang 
201914b7a73SJunchao Zhang   n      = opt[0];
202914b7a73SJunchao Zhang   offset = opt + 1;
203914b7a73SJunchao Zhang   start  = opt + n + 2;
204914b7a73SJunchao Zhang   dx     = opt + 2 * n + 2;
205914b7a73SJunchao Zhang   dy     = opt + 3 * n + 2;
206914b7a73SJunchao Zhang   X      = opt + 5 * n + 2;
207914b7a73SJunchao Zhang   Y      = opt + 6 * n + 2;
2089371c9d4SSatish Balay   for (r = 0; r < n; r++) {
2099371c9d4SSatish Balay     if (tid < offset[r + 1]) break;
2109371c9d4SSatish Balay   }
211914b7a73SJunchao Zhang   m = (tid - offset[r]);
212914b7a73SJunchao Zhang   k = m / (dx[r] * dy[r]);
213914b7a73SJunchao Zhang   j = (m - k * dx[r] * dy[r]) / dx[r];
214914b7a73SJunchao Zhang   i = m - k * dx[r] * dy[r] - j * dx[r];
215914b7a73SJunchao Zhang 
216914b7a73SJunchao Zhang   return (start[r] + k * X[r] * Y[r] + j * X[r] + i);
217914b7a73SJunchao Zhang }
218914b7a73SJunchao Zhang 
219914b7a73SJunchao Zhang /*====================================================================================*/
220914b7a73SJunchao Zhang /*  Wrappers for Pack/Unpack/Scatter kernels. Function pointers are stored in 'link'         */
221914b7a73SJunchao Zhang /*====================================================================================*/
222914b7a73SJunchao Zhang 
223914b7a73SJunchao Zhang /* Suppose user calls PetscSFReduce(sf,unit,...) and <unit> is an MPI data type made of 16 PetscReals, then
224914b7a73SJunchao Zhang    <Type> is PetscReal, which is the primitive type we operate on.
225914b7a73SJunchao Zhang    <bs>   is 16, which says <unit> contains 16 primitive types.
226914b7a73SJunchao Zhang    <BS>   is 8, which is the maximal SIMD width we will try to vectorize operations on <unit>.
227914b7a73SJunchao Zhang    <EQ>   is 0, which is (bs == BS ? 1 : 0)
228914b7a73SJunchao Zhang 
229914b7a73SJunchao Zhang   If instead, <unit> has 8 PetscReals, then bs=8, BS=8, EQ=1, rendering MBS below to a compile time constant.
230914b7a73SJunchao Zhang   For the common case in VecScatter, bs=1, BS=1, EQ=1, MBS=1, the inner for-loops below will be totally unrolled.
231914b7a73SJunchao Zhang */
232914b7a73SJunchao Zhang template <typename Type, PetscInt BS, PetscInt EQ>
233d71ae5a4SJacob Faibussowitsch static PetscErrorCode Pack(PetscSFLink link, PetscInt count, PetscInt start, PetscSFPackOpt opt, const PetscInt *idx, const void *data_, void *buf_)
234d71ae5a4SJacob Faibussowitsch {
235914b7a73SJunchao Zhang   const PetscInt       *iopt = opt ? opt->array : NULL;
236914b7a73SJunchao Zhang   const PetscInt        M = EQ ? 1 : link->bs / BS, MBS = M * BS; /* If EQ, then MBS will be a compile-time const */
237914b7a73SJunchao Zhang   const Type           *data = static_cast<const Type *>(data_);
238914b7a73SJunchao Zhang   Type                 *buf  = static_cast<Type *>(buf_);
239*524fe776SJunchao Zhang   DeviceExecutionSpace &exec = PetscGetKokkosExecutionSpace();
240914b7a73SJunchao Zhang 
241914b7a73SJunchao Zhang   PetscFunctionBegin;
2429371c9d4SSatish Balay   Kokkos::parallel_for(
2439371c9d4SSatish Balay     Kokkos::RangePolicy<DeviceExecutionSpace>(exec, 0, count), KOKKOS_LAMBDA(PetscInt tid) {
244914b7a73SJunchao Zhang       /* iopt != NULL ==> idx == NULL, i.e., the indices have patterns but not contiguous;
245914b7a73SJunchao Zhang        iopt == NULL && idx == NULL ==> the indices are contiguous;
246914b7a73SJunchao Zhang      */
247914b7a73SJunchao Zhang       PetscInt t = (iopt ? MapTidToIndex(iopt, tid) : (idx ? idx[tid] : start + tid)) * MBS;
248914b7a73SJunchao Zhang       PetscInt s = tid * MBS;
249914b7a73SJunchao Zhang       for (int i = 0; i < MBS; i++) buf[s + i] = data[t + i];
250914b7a73SJunchao Zhang     });
2513ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
252914b7a73SJunchao Zhang }
253914b7a73SJunchao Zhang 
254914b7a73SJunchao Zhang template <typename Type, class Op, PetscInt BS, PetscInt EQ>
255d71ae5a4SJacob Faibussowitsch static PetscErrorCode UnpackAndOp(PetscSFLink link, PetscInt count, PetscInt start, PetscSFPackOpt opt, const PetscInt *idx, void *data_, const void *buf_)
256d71ae5a4SJacob Faibussowitsch {
257914b7a73SJunchao Zhang   Op                    op;
258914b7a73SJunchao Zhang   const PetscInt       *iopt = opt ? opt->array : NULL;
259914b7a73SJunchao Zhang   const PetscInt        M = EQ ? 1 : link->bs / BS, MBS = M * BS;
260914b7a73SJunchao Zhang   Type                 *data = static_cast<Type *>(data_);
261914b7a73SJunchao Zhang   const Type           *buf  = static_cast<const Type *>(buf_);
262*524fe776SJunchao Zhang   DeviceExecutionSpace &exec = PetscGetKokkosExecutionSpace();
263914b7a73SJunchao Zhang 
264914b7a73SJunchao Zhang   PetscFunctionBegin;
2659371c9d4SSatish Balay   Kokkos::parallel_for(
2669371c9d4SSatish Balay     Kokkos::RangePolicy<DeviceExecutionSpace>(exec, 0, count), KOKKOS_LAMBDA(PetscInt tid) {
267914b7a73SJunchao Zhang       PetscInt t = (iopt ? MapTidToIndex(iopt, tid) : (idx ? idx[tid] : start + tid)) * MBS;
268914b7a73SJunchao Zhang       PetscInt s = tid * MBS;
269914b7a73SJunchao Zhang       for (int i = 0; i < MBS; i++) op(data[t + i], buf[s + i]);
270914b7a73SJunchao Zhang     });
2713ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
272914b7a73SJunchao Zhang }
273914b7a73SJunchao Zhang 
274914b7a73SJunchao Zhang template <typename Type, class Op, PetscInt BS, PetscInt EQ>
275d71ae5a4SJacob Faibussowitsch static PetscErrorCode FetchAndOp(PetscSFLink link, PetscInt count, PetscInt start, PetscSFPackOpt opt, const PetscInt *idx, void *data, void *buf)
276d71ae5a4SJacob Faibussowitsch {
277914b7a73SJunchao Zhang   Op                    op;
278914b7a73SJunchao Zhang   const PetscInt       *ropt = opt ? opt->array : NULL;
279914b7a73SJunchao Zhang   const PetscInt        M = EQ ? 1 : link->bs / BS, MBS = M * BS;
280914b7a73SJunchao Zhang   Type                 *rootdata = static_cast<Type *>(data), *leafbuf = static_cast<Type *>(buf);
281*524fe776SJunchao Zhang   DeviceExecutionSpace &exec = PetscGetKokkosExecutionSpace();
282914b7a73SJunchao Zhang 
283914b7a73SJunchao Zhang   PetscFunctionBegin;
2849371c9d4SSatish Balay   Kokkos::parallel_for(
2859371c9d4SSatish Balay     Kokkos::RangePolicy<DeviceExecutionSpace>(exec, 0, count), KOKKOS_LAMBDA(PetscInt tid) {
286914b7a73SJunchao Zhang       PetscInt r = (ropt ? MapTidToIndex(ropt, tid) : (idx ? idx[tid] : start + tid)) * MBS;
287914b7a73SJunchao Zhang       PetscInt l = tid * MBS;
288914b7a73SJunchao Zhang       for (int i = 0; i < MBS; i++) leafbuf[l + i] = op(rootdata[r + i], leafbuf[l + i]);
289914b7a73SJunchao Zhang     });
2903ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
291914b7a73SJunchao Zhang }
292914b7a73SJunchao Zhang 
293914b7a73SJunchao Zhang template <typename Type, class Op, PetscInt BS, PetscInt EQ>
294d71ae5a4SJacob Faibussowitsch static PetscErrorCode ScatterAndOp(PetscSFLink link, PetscInt count, PetscInt srcStart, PetscSFPackOpt srcOpt, const PetscInt *srcIdx, const void *src_, PetscInt dstStart, PetscSFPackOpt dstOpt, const PetscInt *dstIdx, void *dst_)
295d71ae5a4SJacob Faibussowitsch {
296914b7a73SJunchao Zhang   PetscInt              srcx = 0, srcy = 0, srcX = 0, srcY = 0, dstx = 0, dsty = 0, dstX = 0, dstY = 0;
297914b7a73SJunchao Zhang   const PetscInt        M = (EQ) ? 1 : link->bs / BS, MBS = M * BS;
298914b7a73SJunchao Zhang   const Type           *src  = static_cast<const Type *>(src_);
299914b7a73SJunchao Zhang   Type                 *dst  = static_cast<Type *>(dst_);
300*524fe776SJunchao Zhang   DeviceExecutionSpace &exec = PetscGetKokkosExecutionSpace();
301914b7a73SJunchao Zhang 
302914b7a73SJunchao Zhang   PetscFunctionBegin;
303914b7a73SJunchao Zhang   /* The 3D shape of source subdomain may be different than that of the destination, which makes it difficult to use CUDA 3D grid and block */
3049371c9d4SSatish Balay   if (srcOpt) {
3059371c9d4SSatish Balay     srcx     = srcOpt->dx[0];
3069371c9d4SSatish Balay     srcy     = srcOpt->dy[0];
3079371c9d4SSatish Balay     srcX     = srcOpt->X[0];
3089371c9d4SSatish Balay     srcY     = srcOpt->Y[0];
3099371c9d4SSatish Balay     srcStart = srcOpt->start[0];
3109371c9d4SSatish Balay     srcIdx   = NULL;
3119371c9d4SSatish Balay   } else if (!srcIdx) {
3129371c9d4SSatish Balay     srcx = srcX = count;
3139371c9d4SSatish Balay     srcy = srcY = 1;
3149371c9d4SSatish Balay   }
315914b7a73SJunchao Zhang 
3169371c9d4SSatish Balay   if (dstOpt) {
3179371c9d4SSatish Balay     dstx     = dstOpt->dx[0];
3189371c9d4SSatish Balay     dsty     = dstOpt->dy[0];
3199371c9d4SSatish Balay     dstX     = dstOpt->X[0];
3209371c9d4SSatish Balay     dstY     = dstOpt->Y[0];
3219371c9d4SSatish Balay     dstStart = dstOpt->start[0];
3229371c9d4SSatish Balay     dstIdx   = NULL;
3239371c9d4SSatish Balay   } else if (!dstIdx) {
3249371c9d4SSatish Balay     dstx = dstX = count;
3259371c9d4SSatish Balay     dsty = dstY = 1;
3269371c9d4SSatish Balay   }
327914b7a73SJunchao Zhang 
3289371c9d4SSatish Balay   Kokkos::parallel_for(
3299371c9d4SSatish Balay     Kokkos::RangePolicy<DeviceExecutionSpace>(exec, 0, count), KOKKOS_LAMBDA(PetscInt tid) {
330914b7a73SJunchao Zhang       PetscInt i, j, k, s, t;
331914b7a73SJunchao Zhang       Op       op;
332914b7a73SJunchao Zhang       if (!srcIdx) { /* src is in 3D */
333914b7a73SJunchao Zhang         k = tid / (srcx * srcy);
334914b7a73SJunchao Zhang         j = (tid - k * srcx * srcy) / srcx;
335914b7a73SJunchao Zhang         i = tid - k * srcx * srcy - j * srcx;
336914b7a73SJunchao Zhang         s = srcStart + k * srcX * srcY + j * srcX + i;
337914b7a73SJunchao Zhang       } else { /* src is contiguous */
338914b7a73SJunchao Zhang         s = srcIdx[tid];
339914b7a73SJunchao Zhang       }
340914b7a73SJunchao Zhang 
341914b7a73SJunchao Zhang       if (!dstIdx) { /* 3D */
342914b7a73SJunchao Zhang         k = tid / (dstx * dsty);
343914b7a73SJunchao Zhang         j = (tid - k * dstx * dsty) / dstx;
344914b7a73SJunchao Zhang         i = tid - k * dstx * dsty - j * dstx;
345914b7a73SJunchao Zhang         t = dstStart + k * dstX * dstY + j * dstX + i;
346914b7a73SJunchao Zhang       } else { /* contiguous */
347914b7a73SJunchao Zhang         t = dstIdx[tid];
348914b7a73SJunchao Zhang       }
349914b7a73SJunchao Zhang 
350914b7a73SJunchao Zhang       s *= MBS;
351914b7a73SJunchao Zhang       t *= MBS;
352914b7a73SJunchao Zhang       for (i = 0; i < MBS; i++) op(dst[t + i], src[s + i]);
353914b7a73SJunchao Zhang     });
3543ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
355914b7a73SJunchao Zhang }
356914b7a73SJunchao Zhang 
357914b7a73SJunchao Zhang /* Specialization for Insert since we may use memcpy */
358914b7a73SJunchao Zhang template <typename Type, PetscInt BS, PetscInt EQ>
359d71ae5a4SJacob Faibussowitsch static PetscErrorCode ScatterAndInsert(PetscSFLink link, PetscInt count, PetscInt srcStart, PetscSFPackOpt srcOpt, const PetscInt *srcIdx, const void *src_, PetscInt dstStart, PetscSFPackOpt dstOpt, const PetscInt *dstIdx, void *dst_)
360d71ae5a4SJacob Faibussowitsch {
361914b7a73SJunchao Zhang   const Type           *src  = static_cast<const Type *>(src_);
362914b7a73SJunchao Zhang   Type                 *dst  = static_cast<Type *>(dst_);
363*524fe776SJunchao Zhang   DeviceExecutionSpace &exec = PetscGetKokkosExecutionSpace();
364914b7a73SJunchao Zhang 
365914b7a73SJunchao Zhang   PetscFunctionBegin;
3663ba16761SJacob Faibussowitsch   if (!count) PetscFunctionReturn(PETSC_SUCCESS);
367914b7a73SJunchao Zhang   /*src and dst are contiguous */
368914b7a73SJunchao Zhang   if ((!srcOpt && !srcIdx) && (!dstOpt && !dstIdx) && src != dst) {
369914b7a73SJunchao Zhang     size_t              sz = count * link->unitbytes;
370914b7a73SJunchao Zhang     deviceBuffer_t      dbuf(reinterpret_cast<char *>(dst + dstStart * link->bs), sz);
371914b7a73SJunchao Zhang     deviceConstBuffer_t sbuf(reinterpret_cast<const char *>(src + srcStart * link->bs), sz);
372914b7a73SJunchao Zhang     Kokkos::deep_copy(exec, dbuf, sbuf);
373914b7a73SJunchao Zhang   } else {
3749566063dSJacob Faibussowitsch     PetscCall(ScatterAndOp<Type, Insert<Type>, BS, EQ>(link, count, srcStart, srcOpt, srcIdx, src, dstStart, dstOpt, dstIdx, dst));
375914b7a73SJunchao Zhang   }
3763ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
377914b7a73SJunchao Zhang }
378914b7a73SJunchao Zhang 
379914b7a73SJunchao Zhang template <typename Type, class Op, PetscInt BS, PetscInt EQ>
380d71ae5a4SJacob Faibussowitsch static PetscErrorCode FetchAndOpLocal(PetscSFLink link, PetscInt count, PetscInt rootstart, PetscSFPackOpt rootopt, const PetscInt *rootidx, void *rootdata_, PetscInt leafstart, PetscSFPackOpt leafopt, const PetscInt *leafidx, const void *leafdata_, void *leafupdate_)
381d71ae5a4SJacob Faibussowitsch {
382914b7a73SJunchao Zhang   Op                    op;
383914b7a73SJunchao Zhang   const PetscInt        M = (EQ) ? 1 : link->bs / BS, MBS = M * BS;
384914b7a73SJunchao Zhang   const PetscInt       *ropt     = rootopt ? rootopt->array : NULL;
385914b7a73SJunchao Zhang   const PetscInt       *lopt     = leafopt ? leafopt->array : NULL;
386914b7a73SJunchao Zhang   Type                 *rootdata = static_cast<Type *>(rootdata_), *leafupdate = static_cast<Type *>(leafupdate_);
387914b7a73SJunchao Zhang   const Type           *leafdata = static_cast<const Type *>(leafdata_);
388*524fe776SJunchao Zhang   DeviceExecutionSpace &exec     = PetscGetKokkosExecutionSpace();
389914b7a73SJunchao Zhang 
390914b7a73SJunchao Zhang   PetscFunctionBegin;
3919371c9d4SSatish Balay   Kokkos::parallel_for(
3929371c9d4SSatish Balay     Kokkos::RangePolicy<DeviceExecutionSpace>(exec, 0, count), KOKKOS_LAMBDA(PetscInt tid) {
393914b7a73SJunchao Zhang       PetscInt r = (ropt ? MapTidToIndex(ropt, tid) : (rootidx ? rootidx[tid] : rootstart + tid)) * MBS;
394914b7a73SJunchao Zhang       PetscInt l = (lopt ? MapTidToIndex(lopt, tid) : (leafidx ? leafidx[tid] : leafstart + tid)) * MBS;
395914b7a73SJunchao Zhang       for (int i = 0; i < MBS; i++) leafupdate[l + i] = op(rootdata[r + i], leafdata[l + i]);
396914b7a73SJunchao Zhang     });
3973ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
398914b7a73SJunchao Zhang }
399914b7a73SJunchao Zhang 
400914b7a73SJunchao Zhang /*====================================================================================*/
401914b7a73SJunchao Zhang /*  Init various types and instantiate pack/unpack function pointers                  */
402914b7a73SJunchao Zhang /*====================================================================================*/
403914b7a73SJunchao Zhang template <typename Type, PetscInt BS, PetscInt EQ>
404d71ae5a4SJacob Faibussowitsch static void PackInit_RealType(PetscSFLink link)
405d71ae5a4SJacob Faibussowitsch {
406914b7a73SJunchao Zhang   /* Pack/unpack for remote communication */
407914b7a73SJunchao Zhang   link->d_Pack            = Pack<Type, BS, EQ>;
408914b7a73SJunchao Zhang   link->d_UnpackAndInsert = UnpackAndOp<Type, Insert<Type>, BS, EQ>;
409914b7a73SJunchao Zhang   link->d_UnpackAndAdd    = UnpackAndOp<Type, Add<Type>, BS, EQ>;
410914b7a73SJunchao Zhang   link->d_UnpackAndMult   = UnpackAndOp<Type, Mult<Type>, BS, EQ>;
411914b7a73SJunchao Zhang   link->d_UnpackAndMin    = UnpackAndOp<Type, Min<Type>, BS, EQ>;
412914b7a73SJunchao Zhang   link->d_UnpackAndMax    = UnpackAndOp<Type, Max<Type>, BS, EQ>;
413914b7a73SJunchao Zhang   link->d_FetchAndAdd     = FetchAndOp<Type, Add<Type>, BS, EQ>;
414914b7a73SJunchao Zhang   /* Scatter for local communication */
415914b7a73SJunchao Zhang   link->d_ScatterAndInsert = ScatterAndInsert<Type, BS, EQ>; /* Has special optimizations */
416914b7a73SJunchao Zhang   link->d_ScatterAndAdd    = ScatterAndOp<Type, Add<Type>, BS, EQ>;
417914b7a73SJunchao Zhang   link->d_ScatterAndMult   = ScatterAndOp<Type, Mult<Type>, BS, EQ>;
418914b7a73SJunchao Zhang   link->d_ScatterAndMin    = ScatterAndOp<Type, Min<Type>, BS, EQ>;
419914b7a73SJunchao Zhang   link->d_ScatterAndMax    = ScatterAndOp<Type, Max<Type>, BS, EQ>;
420914b7a73SJunchao Zhang   link->d_FetchAndAddLocal = FetchAndOpLocal<Type, Add<Type>, BS, EQ>;
421914b7a73SJunchao Zhang   /* Atomic versions when there are data-race possibilities */
422914b7a73SJunchao Zhang   link->da_UnpackAndInsert = UnpackAndOp<Type, AtomicInsert<Type>, BS, EQ>;
423914b7a73SJunchao Zhang   link->da_UnpackAndAdd    = UnpackAndOp<Type, AtomicAdd<Type>, BS, EQ>;
424914b7a73SJunchao Zhang   link->da_UnpackAndMult   = UnpackAndOp<Type, AtomicMult<Type>, BS, EQ>;
425914b7a73SJunchao Zhang   link->da_UnpackAndMin    = UnpackAndOp<Type, AtomicMin<Type>, BS, EQ>;
426914b7a73SJunchao Zhang   link->da_UnpackAndMax    = UnpackAndOp<Type, AtomicMax<Type>, BS, EQ>;
427914b7a73SJunchao Zhang   link->da_FetchAndAdd     = FetchAndOp<Type, AtomicFetchAdd<Type>, BS, EQ>;
428914b7a73SJunchao Zhang 
429914b7a73SJunchao Zhang   link->da_ScatterAndInsert = ScatterAndOp<Type, AtomicInsert<Type>, BS, EQ>;
430914b7a73SJunchao Zhang   link->da_ScatterAndAdd    = ScatterAndOp<Type, AtomicAdd<Type>, BS, EQ>;
431914b7a73SJunchao Zhang   link->da_ScatterAndMult   = ScatterAndOp<Type, AtomicMult<Type>, BS, EQ>;
432914b7a73SJunchao Zhang   link->da_ScatterAndMin    = ScatterAndOp<Type, AtomicMin<Type>, BS, EQ>;
433914b7a73SJunchao Zhang   link->da_ScatterAndMax    = ScatterAndOp<Type, AtomicMax<Type>, BS, EQ>;
434914b7a73SJunchao Zhang   link->da_FetchAndAddLocal = FetchAndOpLocal<Type, AtomicFetchAdd<Type>, BS, EQ>;
435914b7a73SJunchao Zhang }
436914b7a73SJunchao Zhang 
437914b7a73SJunchao Zhang template <typename Type, PetscInt BS, PetscInt EQ>
438d71ae5a4SJacob Faibussowitsch static void PackInit_IntegerType(PetscSFLink link)
439d71ae5a4SJacob Faibussowitsch {
440914b7a73SJunchao Zhang   link->d_Pack            = Pack<Type, BS, EQ>;
441914b7a73SJunchao Zhang   link->d_UnpackAndInsert = UnpackAndOp<Type, Insert<Type>, BS, EQ>;
442914b7a73SJunchao Zhang   link->d_UnpackAndAdd    = UnpackAndOp<Type, Add<Type>, BS, EQ>;
443914b7a73SJunchao Zhang   link->d_UnpackAndMult   = UnpackAndOp<Type, Mult<Type>, BS, EQ>;
444914b7a73SJunchao Zhang   link->d_UnpackAndMin    = UnpackAndOp<Type, Min<Type>, BS, EQ>;
445914b7a73SJunchao Zhang   link->d_UnpackAndMax    = UnpackAndOp<Type, Max<Type>, BS, EQ>;
446914b7a73SJunchao Zhang   link->d_UnpackAndLAND   = UnpackAndOp<Type, LAND<Type>, BS, EQ>;
447914b7a73SJunchao Zhang   link->d_UnpackAndLOR    = UnpackAndOp<Type, LOR<Type>, BS, EQ>;
448914b7a73SJunchao Zhang   link->d_UnpackAndLXOR   = UnpackAndOp<Type, LXOR<Type>, BS, EQ>;
449914b7a73SJunchao Zhang   link->d_UnpackAndBAND   = UnpackAndOp<Type, BAND<Type>, BS, EQ>;
450914b7a73SJunchao Zhang   link->d_UnpackAndBOR    = UnpackAndOp<Type, BOR<Type>, BS, EQ>;
451914b7a73SJunchao Zhang   link->d_UnpackAndBXOR   = UnpackAndOp<Type, BXOR<Type>, BS, EQ>;
452914b7a73SJunchao Zhang   link->d_FetchAndAdd     = FetchAndOp<Type, Add<Type>, BS, EQ>;
453914b7a73SJunchao Zhang 
454914b7a73SJunchao Zhang   link->d_ScatterAndInsert = ScatterAndInsert<Type, BS, EQ>;
455914b7a73SJunchao Zhang   link->d_ScatterAndAdd    = ScatterAndOp<Type, Add<Type>, BS, EQ>;
456914b7a73SJunchao Zhang   link->d_ScatterAndMult   = ScatterAndOp<Type, Mult<Type>, BS, EQ>;
457914b7a73SJunchao Zhang   link->d_ScatterAndMin    = ScatterAndOp<Type, Min<Type>, BS, EQ>;
458914b7a73SJunchao Zhang   link->d_ScatterAndMax    = ScatterAndOp<Type, Max<Type>, BS, EQ>;
459914b7a73SJunchao Zhang   link->d_ScatterAndLAND   = ScatterAndOp<Type, LAND<Type>, BS, EQ>;
460914b7a73SJunchao Zhang   link->d_ScatterAndLOR    = ScatterAndOp<Type, LOR<Type>, BS, EQ>;
461914b7a73SJunchao Zhang   link->d_ScatterAndLXOR   = ScatterAndOp<Type, LXOR<Type>, BS, EQ>;
462914b7a73SJunchao Zhang   link->d_ScatterAndBAND   = ScatterAndOp<Type, BAND<Type>, BS, EQ>;
463914b7a73SJunchao Zhang   link->d_ScatterAndBOR    = ScatterAndOp<Type, BOR<Type>, BS, EQ>;
464914b7a73SJunchao Zhang   link->d_ScatterAndBXOR   = ScatterAndOp<Type, BXOR<Type>, BS, EQ>;
465914b7a73SJunchao Zhang   link->d_FetchAndAddLocal = FetchAndOpLocal<Type, Add<Type>, BS, EQ>;
466914b7a73SJunchao Zhang 
467914b7a73SJunchao Zhang   link->da_UnpackAndInsert = UnpackAndOp<Type, AtomicInsert<Type>, BS, EQ>;
468914b7a73SJunchao Zhang   link->da_UnpackAndAdd    = UnpackAndOp<Type, AtomicAdd<Type>, BS, EQ>;
469914b7a73SJunchao Zhang   link->da_UnpackAndMult   = UnpackAndOp<Type, AtomicMult<Type>, BS, EQ>;
470914b7a73SJunchao Zhang   link->da_UnpackAndMin    = UnpackAndOp<Type, AtomicMin<Type>, BS, EQ>;
471914b7a73SJunchao Zhang   link->da_UnpackAndMax    = UnpackAndOp<Type, AtomicMax<Type>, BS, EQ>;
472914b7a73SJunchao Zhang   link->da_UnpackAndLAND   = UnpackAndOp<Type, AtomicLAND<Type>, BS, EQ>;
473914b7a73SJunchao Zhang   link->da_UnpackAndLOR    = UnpackAndOp<Type, AtomicLOR<Type>, BS, EQ>;
474914b7a73SJunchao Zhang   link->da_UnpackAndBAND   = UnpackAndOp<Type, AtomicBAND<Type>, BS, EQ>;
475914b7a73SJunchao Zhang   link->da_UnpackAndBOR    = UnpackAndOp<Type, AtomicBOR<Type>, BS, EQ>;
476914b7a73SJunchao Zhang   link->da_UnpackAndBXOR   = UnpackAndOp<Type, AtomicBXOR<Type>, BS, EQ>;
477914b7a73SJunchao Zhang   link->da_FetchAndAdd     = FetchAndOp<Type, AtomicFetchAdd<Type>, BS, EQ>;
478914b7a73SJunchao Zhang 
479914b7a73SJunchao Zhang   link->da_ScatterAndInsert = ScatterAndOp<Type, AtomicInsert<Type>, BS, EQ>;
480914b7a73SJunchao Zhang   link->da_ScatterAndAdd    = ScatterAndOp<Type, AtomicAdd<Type>, BS, EQ>;
481914b7a73SJunchao Zhang   link->da_ScatterAndMult   = ScatterAndOp<Type, AtomicMult<Type>, BS, EQ>;
482914b7a73SJunchao Zhang   link->da_ScatterAndMin    = ScatterAndOp<Type, AtomicMin<Type>, BS, EQ>;
483914b7a73SJunchao Zhang   link->da_ScatterAndMax    = ScatterAndOp<Type, AtomicMax<Type>, BS, EQ>;
484914b7a73SJunchao Zhang   link->da_ScatterAndLAND   = ScatterAndOp<Type, AtomicLAND<Type>, BS, EQ>;
485914b7a73SJunchao Zhang   link->da_ScatterAndLOR    = ScatterAndOp<Type, AtomicLOR<Type>, BS, EQ>;
486914b7a73SJunchao Zhang   link->da_ScatterAndBAND   = ScatterAndOp<Type, AtomicBAND<Type>, BS, EQ>;
487914b7a73SJunchao Zhang   link->da_ScatterAndBOR    = ScatterAndOp<Type, AtomicBOR<Type>, BS, EQ>;
488914b7a73SJunchao Zhang   link->da_ScatterAndBXOR   = ScatterAndOp<Type, AtomicBXOR<Type>, BS, EQ>;
489914b7a73SJunchao Zhang   link->da_FetchAndAddLocal = FetchAndOpLocal<Type, AtomicFetchAdd<Type>, BS, EQ>;
490914b7a73SJunchao Zhang }
491914b7a73SJunchao Zhang 
492914b7a73SJunchao Zhang #if defined(PETSC_HAVE_COMPLEX)
493914b7a73SJunchao Zhang template <typename Type, PetscInt BS, PetscInt EQ>
494d71ae5a4SJacob Faibussowitsch static void PackInit_ComplexType(PetscSFLink link)
495d71ae5a4SJacob Faibussowitsch {
496914b7a73SJunchao Zhang   link->d_Pack            = Pack<Type, BS, EQ>;
497914b7a73SJunchao Zhang   link->d_UnpackAndInsert = UnpackAndOp<Type, Insert<Type>, BS, EQ>;
498914b7a73SJunchao Zhang   link->d_UnpackAndAdd    = UnpackAndOp<Type, Add<Type>, BS, EQ>;
499914b7a73SJunchao Zhang   link->d_UnpackAndMult   = UnpackAndOp<Type, Mult<Type>, BS, EQ>;
500914b7a73SJunchao Zhang   link->d_FetchAndAdd     = FetchAndOp<Type, Add<Type>, BS, EQ>;
501914b7a73SJunchao Zhang 
502914b7a73SJunchao Zhang   link->d_ScatterAndInsert = ScatterAndInsert<Type, BS, EQ>;
503914b7a73SJunchao Zhang   link->d_ScatterAndAdd    = ScatterAndOp<Type, Add<Type>, BS, EQ>;
504914b7a73SJunchao Zhang   link->d_ScatterAndMult   = ScatterAndOp<Type, Mult<Type>, BS, EQ>;
505914b7a73SJunchao Zhang   link->d_FetchAndAddLocal = FetchAndOpLocal<Type, Add<Type>, BS, EQ>;
506914b7a73SJunchao Zhang 
507914b7a73SJunchao Zhang   link->da_UnpackAndInsert = UnpackAndOp<Type, AtomicInsert<Type>, BS, EQ>;
508914b7a73SJunchao Zhang   link->da_UnpackAndAdd    = UnpackAndOp<Type, AtomicAdd<Type>, BS, EQ>;
509914b7a73SJunchao Zhang   link->da_UnpackAndMult   = UnpackAndOp<Type, AtomicMult<Type>, BS, EQ>;
510914b7a73SJunchao Zhang   link->da_FetchAndAdd     = FetchAndOp<Type, AtomicFetchAdd<Type>, BS, EQ>;
511914b7a73SJunchao Zhang 
512914b7a73SJunchao Zhang   link->da_ScatterAndInsert = ScatterAndOp<Type, AtomicInsert<Type>, BS, EQ>;
513914b7a73SJunchao Zhang   link->da_ScatterAndAdd    = ScatterAndOp<Type, AtomicAdd<Type>, BS, EQ>;
514914b7a73SJunchao Zhang   link->da_ScatterAndMult   = ScatterAndOp<Type, AtomicMult<Type>, BS, EQ>;
515914b7a73SJunchao Zhang   link->da_FetchAndAddLocal = FetchAndOpLocal<Type, AtomicFetchAdd<Type>, BS, EQ>;
516914b7a73SJunchao Zhang }
517914b7a73SJunchao Zhang #endif
518914b7a73SJunchao Zhang 
519914b7a73SJunchao Zhang template <typename Type>
520d71ae5a4SJacob Faibussowitsch static void PackInit_PairType(PetscSFLink link)
521d71ae5a4SJacob Faibussowitsch {
522914b7a73SJunchao Zhang   link->d_Pack            = Pack<Type, 1, 1>;
523914b7a73SJunchao Zhang   link->d_UnpackAndInsert = UnpackAndOp<Type, Insert<Type>, 1, 1>;
524914b7a73SJunchao Zhang   link->d_UnpackAndMaxloc = UnpackAndOp<Type, Maxloc<Type>, 1, 1>;
525914b7a73SJunchao Zhang   link->d_UnpackAndMinloc = UnpackAndOp<Type, Minloc<Type>, 1, 1>;
526914b7a73SJunchao Zhang 
527914b7a73SJunchao Zhang   link->d_ScatterAndInsert = ScatterAndOp<Type, Insert<Type>, 1, 1>;
528914b7a73SJunchao Zhang   link->d_ScatterAndMaxloc = ScatterAndOp<Type, Maxloc<Type>, 1, 1>;
529914b7a73SJunchao Zhang   link->d_ScatterAndMinloc = ScatterAndOp<Type, Minloc<Type>, 1, 1>;
530914b7a73SJunchao Zhang   /* Atomics for pair types are not implemented yet */
531914b7a73SJunchao Zhang }
532914b7a73SJunchao Zhang 
533914b7a73SJunchao Zhang template <typename Type, PetscInt BS, PetscInt EQ>
534d71ae5a4SJacob Faibussowitsch static void PackInit_DumbType(PetscSFLink link)
535d71ae5a4SJacob Faibussowitsch {
536914b7a73SJunchao Zhang   link->d_Pack             = Pack<Type, BS, EQ>;
537914b7a73SJunchao Zhang   link->d_UnpackAndInsert  = UnpackAndOp<Type, Insert<Type>, BS, EQ>;
538914b7a73SJunchao Zhang   link->d_ScatterAndInsert = ScatterAndInsert<Type, BS, EQ>;
539914b7a73SJunchao Zhang   /* Atomics for dumb types are not implemented yet */
540914b7a73SJunchao Zhang }
541914b7a73SJunchao Zhang 
542f4af43b4SJunchao Zhang /*
543f4af43b4SJunchao Zhang   Kokkos::DefaultExecutionSpace(stream) is a reference counted pointer object. It has a bug
544f4af43b4SJunchao Zhang   that one is not able to repeatedly create and destroy the object. SF's original design was each
545f4af43b4SJunchao Zhang   SFLink has a stream (NULL or not) and hence an execution space object. The bug prevents us from
546f4af43b4SJunchao Zhang   destroying multiple SFLinks with NULL stream and the default execution space object. To avoid
547f4af43b4SJunchao Zhang   memory leaks, SF_Kokkos only supports NULL stream, which is also petsc's default scheme. SF_Kokkos
548f4af43b4SJunchao Zhang   does not do its own new/delete. It just uses Kokkos::DefaultExecutionSpace(), which is a singliton
549f4af43b4SJunchao Zhang   object in Kokkos.
550f4af43b4SJunchao Zhang */
551f4af43b4SJunchao Zhang /*
552914b7a73SJunchao Zhang static PetscErrorCode PetscSFLinkDestroy_Kokkos(PetscSFLink link)
553914b7a73SJunchao Zhang {
554914b7a73SJunchao Zhang   PetscFunctionBegin;
5553ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
556914b7a73SJunchao Zhang }
557f4af43b4SJunchao Zhang */
558914b7a73SJunchao Zhang 
559914b7a73SJunchao Zhang /* Some device-specific utilities */
560d71ae5a4SJacob Faibussowitsch static PetscErrorCode PetscSFLinkSyncDevice_Kokkos(PetscSFLink PETSC_UNUSED link)
561d71ae5a4SJacob Faibussowitsch {
562914b7a73SJunchao Zhang   PetscFunctionBegin;
563914b7a73SJunchao Zhang   Kokkos::fence();
5643ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
565914b7a73SJunchao Zhang }
566914b7a73SJunchao Zhang 
567d71ae5a4SJacob Faibussowitsch static PetscErrorCode PetscSFLinkSyncStream_Kokkos(PetscSFLink PETSC_UNUSED link)
568d71ae5a4SJacob Faibussowitsch {
569*524fe776SJunchao Zhang   DeviceExecutionSpace &exec = PetscGetKokkosExecutionSpace();
570914b7a73SJunchao Zhang   PetscFunctionBegin;
571914b7a73SJunchao Zhang   exec.fence();
5723ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
573914b7a73SJunchao Zhang }
574914b7a73SJunchao Zhang 
575d71ae5a4SJacob Faibussowitsch static PetscErrorCode PetscSFLinkMemcpy_Kokkos(PetscSFLink PETSC_UNUSED link, PetscMemType dstmtype, void *dst, PetscMemType srcmtype, const void *src, size_t n)
576d71ae5a4SJacob Faibussowitsch {
577*524fe776SJunchao Zhang   DeviceExecutionSpace &exec = PetscGetKokkosExecutionSpace();
578914b7a73SJunchao Zhang 
579914b7a73SJunchao Zhang   PetscFunctionBegin;
5803ba16761SJacob Faibussowitsch   if (!n) PetscFunctionReturn(PETSC_SUCCESS);
58171438e86SJunchao Zhang   if (PetscMemTypeHost(dstmtype) && PetscMemTypeHost(srcmtype)) {
5829566063dSJacob Faibussowitsch     PetscCall(PetscMemcpy(dst, src, n));
583914b7a73SJunchao Zhang   } else {
58471438e86SJunchao Zhang     if (PetscMemTypeDevice(dstmtype) && PetscMemTypeHost(srcmtype)) {
585914b7a73SJunchao Zhang       deviceBuffer_t    dbuf(static_cast<char *>(dst), n);
586914b7a73SJunchao Zhang       HostConstBuffer_t sbuf(static_cast<const char *>(src), n);
587914b7a73SJunchao Zhang       Kokkos::deep_copy(exec, dbuf, sbuf);
5889566063dSJacob Faibussowitsch       PetscCall(PetscLogCpuToGpu(n));
58971438e86SJunchao Zhang     } else if (PetscMemTypeHost(dstmtype) && PetscMemTypeDevice(srcmtype)) {
590914b7a73SJunchao Zhang       HostBuffer_t        dbuf(static_cast<char *>(dst), n);
591914b7a73SJunchao Zhang       deviceConstBuffer_t sbuf(static_cast<const char *>(src), n);
592914b7a73SJunchao Zhang       Kokkos::deep_copy(exec, dbuf, sbuf);
5939566063dSJacob Faibussowitsch       PetscCall(PetscLogGpuToCpu(n));
59471438e86SJunchao Zhang     } else if (PetscMemTypeDevice(dstmtype) && PetscMemTypeDevice(srcmtype)) {
595914b7a73SJunchao Zhang       deviceBuffer_t      dbuf(static_cast<char *>(dst), n);
596914b7a73SJunchao Zhang       deviceConstBuffer_t sbuf(static_cast<const char *>(src), n);
597914b7a73SJunchao Zhang       Kokkos::deep_copy(exec, dbuf, sbuf);
598914b7a73SJunchao Zhang     }
599914b7a73SJunchao Zhang   }
6003ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
601914b7a73SJunchao Zhang }
602914b7a73SJunchao Zhang 
603d71ae5a4SJacob Faibussowitsch PetscErrorCode PetscSFMalloc_Kokkos(PetscMemType mtype, size_t size, void **ptr)
604d71ae5a4SJacob Faibussowitsch {
605914b7a73SJunchao Zhang   PetscFunctionBegin;
6069566063dSJacob Faibussowitsch   if (PetscMemTypeHost(mtype)) PetscCall(PetscMalloc(size, ptr));
60771438e86SJunchao Zhang   else if (PetscMemTypeDevice(mtype)) {
6089566063dSJacob Faibussowitsch     if (!PetscKokkosInitialized) PetscCall(PetscKokkosInitializeCheck());
60945639126SStefano Zampini     *ptr = Kokkos::kokkos_malloc<DeviceMemorySpace>(size);
61098921bdaSJacob Faibussowitsch   } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Wrong PetscMemType %d", (int)mtype);
6113ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
612914b7a73SJunchao Zhang }
613914b7a73SJunchao Zhang 
614d71ae5a4SJacob Faibussowitsch PetscErrorCode PetscSFFree_Kokkos(PetscMemType mtype, void *ptr)
615d71ae5a4SJacob Faibussowitsch {
616914b7a73SJunchao Zhang   PetscFunctionBegin;
6179566063dSJacob Faibussowitsch   if (PetscMemTypeHost(mtype)) PetscCall(PetscFree(ptr));
6189371c9d4SSatish Balay   else if (PetscMemTypeDevice(mtype)) {
6199371c9d4SSatish Balay     Kokkos::kokkos_free<DeviceMemorySpace>(ptr);
6209371c9d4SSatish Balay   } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Wrong PetscMemType %d", (int)mtype);
6213ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
622914b7a73SJunchao Zhang }
623914b7a73SJunchao Zhang 
62471438e86SJunchao Zhang /* Destructor when the link uses MPI for communication */
625d71ae5a4SJacob Faibussowitsch static PetscErrorCode PetscSFLinkDestroy_Kokkos(PetscSF sf, PetscSFLink link)
626d71ae5a4SJacob Faibussowitsch {
62771438e86SJunchao Zhang   PetscFunctionBegin;
62871438e86SJunchao Zhang   for (int i = PETSCSF_LOCAL; i <= PETSCSF_REMOTE; i++) {
6299566063dSJacob Faibussowitsch     PetscCall(PetscSFFree(sf, PETSC_MEMTYPE_DEVICE, link->rootbuf_alloc[i][PETSC_MEMTYPE_DEVICE]));
6309566063dSJacob Faibussowitsch     PetscCall(PetscSFFree(sf, PETSC_MEMTYPE_DEVICE, link->leafbuf_alloc[i][PETSC_MEMTYPE_DEVICE]));
63171438e86SJunchao Zhang   }
6323ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
63371438e86SJunchao Zhang }
634914b7a73SJunchao Zhang 
635914b7a73SJunchao Zhang /* Some fields of link are initialized by PetscSFPackSetUp_Host. This routine only does what needed on device */
636d71ae5a4SJacob Faibussowitsch PetscErrorCode PetscSFLinkSetUp_Kokkos(PetscSF PETSC_UNUSED sf, PetscSFLink link, MPI_Datatype unit)
637d71ae5a4SJacob Faibussowitsch {
638914b7a73SJunchao Zhang   PetscInt  nSignedChar = 0, nUnsignedChar = 0, nInt = 0, nPetscInt = 0, nPetscReal = 0;
639914b7a73SJunchao Zhang   PetscBool is2Int, is2PetscInt;
640914b7a73SJunchao Zhang #if defined(PETSC_HAVE_COMPLEX)
641914b7a73SJunchao Zhang   PetscInt nPetscComplex = 0;
642914b7a73SJunchao Zhang #endif
643914b7a73SJunchao Zhang 
644914b7a73SJunchao Zhang   PetscFunctionBegin;
6453ba16761SJacob Faibussowitsch   if (link->deviceinited) PetscFunctionReturn(PETSC_SUCCESS);
6469566063dSJacob Faibussowitsch   PetscCall(PetscKokkosInitializeCheck());
6479566063dSJacob Faibussowitsch   PetscCall(MPIPetsc_Type_compare_contig(unit, MPI_SIGNED_CHAR, &nSignedChar));
6489566063dSJacob Faibussowitsch   PetscCall(MPIPetsc_Type_compare_contig(unit, MPI_UNSIGNED_CHAR, &nUnsignedChar));
649914b7a73SJunchao Zhang   /* MPI_CHAR is treated below as a dumb type that does not support reduction according to MPI standard */
6509566063dSJacob Faibussowitsch   PetscCall(MPIPetsc_Type_compare_contig(unit, MPI_INT, &nInt));
6519566063dSJacob Faibussowitsch   PetscCall(MPIPetsc_Type_compare_contig(unit, MPIU_INT, &nPetscInt));
6529566063dSJacob Faibussowitsch   PetscCall(MPIPetsc_Type_compare_contig(unit, MPIU_REAL, &nPetscReal));
653914b7a73SJunchao Zhang #if defined(PETSC_HAVE_COMPLEX)
6549566063dSJacob Faibussowitsch   PetscCall(MPIPetsc_Type_compare_contig(unit, MPIU_COMPLEX, &nPetscComplex));
655914b7a73SJunchao Zhang #endif
6569566063dSJacob Faibussowitsch   PetscCall(MPIPetsc_Type_compare(unit, MPI_2INT, &is2Int));
6579566063dSJacob Faibussowitsch   PetscCall(MPIPetsc_Type_compare(unit, MPIU_2INT, &is2PetscInt));
658914b7a73SJunchao Zhang 
659914b7a73SJunchao Zhang   if (is2Int) {
660914b7a73SJunchao Zhang     PackInit_PairType<Kokkos::pair<int, int>>(link);
661914b7a73SJunchao Zhang   } else if (is2PetscInt) { /* TODO: when is2PetscInt and nPetscInt=2, we don't know which path to take. The two paths support different ops. */
662914b7a73SJunchao Zhang     PackInit_PairType<Kokkos::pair<PetscInt, PetscInt>>(link);
663914b7a73SJunchao Zhang   } else if (nPetscReal) {
664d941a2f0SJunchao Zhang #if !defined(PETSC_HAVE_DEVICE) /* Skip the unimportant stuff to speed up SF device compilation time */
6659371c9d4SSatish Balay     if (nPetscReal == 8) PackInit_RealType<PetscReal, 8, 1>(link);
6669371c9d4SSatish Balay     else if (nPetscReal % 8 == 0) PackInit_RealType<PetscReal, 8, 0>(link);
6679371c9d4SSatish Balay     else if (nPetscReal == 4) PackInit_RealType<PetscReal, 4, 1>(link);
6689371c9d4SSatish Balay     else if (nPetscReal % 4 == 0) PackInit_RealType<PetscReal, 4, 0>(link);
6699371c9d4SSatish Balay     else if (nPetscReal == 2) PackInit_RealType<PetscReal, 2, 1>(link);
6709371c9d4SSatish Balay     else if (nPetscReal % 2 == 0) PackInit_RealType<PetscReal, 2, 0>(link);
6719371c9d4SSatish Balay     else if (nPetscReal == 1) PackInit_RealType<PetscReal, 1, 1>(link);
6729371c9d4SSatish Balay     else if (nPetscReal % 1 == 0)
673eee4e20aSJunchao Zhang #endif
674d941a2f0SJunchao Zhang       PackInit_RealType<PetscReal, 1, 0>(link);
675874d28e3SJunchao Zhang   } else if (nPetscInt && sizeof(PetscInt) == sizeof(llint)) {
676d941a2f0SJunchao Zhang #if !defined(PETSC_HAVE_DEVICE)
6779371c9d4SSatish Balay     if (nPetscInt == 8) PackInit_IntegerType<llint, 8, 1>(link);
6789371c9d4SSatish Balay     else if (nPetscInt % 8 == 0) PackInit_IntegerType<llint, 8, 0>(link);
6799371c9d4SSatish Balay     else if (nPetscInt == 4) PackInit_IntegerType<llint, 4, 1>(link);
6809371c9d4SSatish Balay     else if (nPetscInt % 4 == 0) PackInit_IntegerType<llint, 4, 0>(link);
6819371c9d4SSatish Balay     else if (nPetscInt == 2) PackInit_IntegerType<llint, 2, 1>(link);
6829371c9d4SSatish Balay     else if (nPetscInt % 2 == 0) PackInit_IntegerType<llint, 2, 0>(link);
6839371c9d4SSatish Balay     else if (nPetscInt == 1) PackInit_IntegerType<llint, 1, 1>(link);
6849371c9d4SSatish Balay     else if (nPetscInt % 1 == 0)
685eee4e20aSJunchao Zhang #endif
686d941a2f0SJunchao Zhang       PackInit_IntegerType<llint, 1, 0>(link);
687914b7a73SJunchao Zhang   } else if (nInt) {
688d941a2f0SJunchao Zhang #if !defined(PETSC_HAVE_DEVICE)
6899371c9d4SSatish Balay     if (nInt == 8) PackInit_IntegerType<int, 8, 1>(link);
6909371c9d4SSatish Balay     else if (nInt % 8 == 0) PackInit_IntegerType<int, 8, 0>(link);
6919371c9d4SSatish Balay     else if (nInt == 4) PackInit_IntegerType<int, 4, 1>(link);
6929371c9d4SSatish Balay     else if (nInt % 4 == 0) PackInit_IntegerType<int, 4, 0>(link);
6939371c9d4SSatish Balay     else if (nInt == 2) PackInit_IntegerType<int, 2, 1>(link);
6949371c9d4SSatish Balay     else if (nInt % 2 == 0) PackInit_IntegerType<int, 2, 0>(link);
6959371c9d4SSatish Balay     else if (nInt == 1) PackInit_IntegerType<int, 1, 1>(link);
6969371c9d4SSatish Balay     else if (nInt % 1 == 0)
697eee4e20aSJunchao Zhang #endif
698d941a2f0SJunchao Zhang       PackInit_IntegerType<int, 1, 0>(link);
699914b7a73SJunchao Zhang   } else if (nSignedChar) {
700d941a2f0SJunchao Zhang #if !defined(PETSC_HAVE_DEVICE)
7019371c9d4SSatish Balay     if (nSignedChar == 8) PackInit_IntegerType<char, 8, 1>(link);
7029371c9d4SSatish Balay     else if (nSignedChar % 8 == 0) PackInit_IntegerType<char, 8, 0>(link);
7039371c9d4SSatish Balay     else if (nSignedChar == 4) PackInit_IntegerType<char, 4, 1>(link);
7049371c9d4SSatish Balay     else if (nSignedChar % 4 == 0) PackInit_IntegerType<char, 4, 0>(link);
7059371c9d4SSatish Balay     else if (nSignedChar == 2) PackInit_IntegerType<char, 2, 1>(link);
7069371c9d4SSatish Balay     else if (nSignedChar % 2 == 0) PackInit_IntegerType<char, 2, 0>(link);
7079371c9d4SSatish Balay     else if (nSignedChar == 1) PackInit_IntegerType<char, 1, 1>(link);
7089371c9d4SSatish Balay     else if (nSignedChar % 1 == 0)
709eee4e20aSJunchao Zhang #endif
710d941a2f0SJunchao Zhang       PackInit_IntegerType<char, 1, 0>(link);
711914b7a73SJunchao Zhang   } else if (nUnsignedChar) {
712d941a2f0SJunchao Zhang #if !defined(PETSC_HAVE_DEVICE)
7139371c9d4SSatish Balay     if (nUnsignedChar == 8) PackInit_IntegerType<unsigned char, 8, 1>(link);
7149371c9d4SSatish Balay     else if (nUnsignedChar % 8 == 0) PackInit_IntegerType<unsigned char, 8, 0>(link);
7159371c9d4SSatish Balay     else if (nUnsignedChar == 4) PackInit_IntegerType<unsigned char, 4, 1>(link);
7169371c9d4SSatish Balay     else if (nUnsignedChar % 4 == 0) PackInit_IntegerType<unsigned char, 4, 0>(link);
7179371c9d4SSatish Balay     else if (nUnsignedChar == 2) PackInit_IntegerType<unsigned char, 2, 1>(link);
7189371c9d4SSatish Balay     else if (nUnsignedChar % 2 == 0) PackInit_IntegerType<unsigned char, 2, 0>(link);
7199371c9d4SSatish Balay     else if (nUnsignedChar == 1) PackInit_IntegerType<unsigned char, 1, 1>(link);
7209371c9d4SSatish Balay     else if (nUnsignedChar % 1 == 0)
721eee4e20aSJunchao Zhang #endif
722d941a2f0SJunchao Zhang       PackInit_IntegerType<unsigned char, 1, 0>(link);
723914b7a73SJunchao Zhang #if defined(PETSC_HAVE_COMPLEX)
724914b7a73SJunchao Zhang   } else if (nPetscComplex) {
725d941a2f0SJunchao Zhang   #if !defined(PETSC_HAVE_DEVICE)
7269371c9d4SSatish Balay     if (nPetscComplex == 8) PackInit_ComplexType<Kokkos::complex<PetscReal>, 8, 1>(link);
7279371c9d4SSatish Balay     else if (nPetscComplex % 8 == 0) PackInit_ComplexType<Kokkos::complex<PetscReal>, 8, 0>(link);
7289371c9d4SSatish Balay     else if (nPetscComplex == 4) PackInit_ComplexType<Kokkos::complex<PetscReal>, 4, 1>(link);
7299371c9d4SSatish Balay     else if (nPetscComplex % 4 == 0) PackInit_ComplexType<Kokkos::complex<PetscReal>, 4, 0>(link);
7309371c9d4SSatish Balay     else if (nPetscComplex == 2) PackInit_ComplexType<Kokkos::complex<PetscReal>, 2, 1>(link);
7319371c9d4SSatish Balay     else if (nPetscComplex % 2 == 0) PackInit_ComplexType<Kokkos::complex<PetscReal>, 2, 0>(link);
7329371c9d4SSatish Balay     else if (nPetscComplex == 1) PackInit_ComplexType<Kokkos::complex<PetscReal>, 1, 1>(link);
7339371c9d4SSatish Balay     else if (nPetscComplex % 1 == 0)
734eee4e20aSJunchao Zhang   #endif
735d941a2f0SJunchao Zhang       PackInit_ComplexType<Kokkos::complex<PetscReal>, 1, 0>(link);
736914b7a73SJunchao Zhang #endif
737914b7a73SJunchao Zhang   } else {
738914b7a73SJunchao Zhang     MPI_Aint lb, nbyte;
7399566063dSJacob Faibussowitsch     PetscCallMPI(MPI_Type_get_extent(unit, &lb, &nbyte));
74008401ef6SPierre Jolivet     PetscCheck(lb == 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "Datatype with nonzero lower bound %ld", (long)lb);
741914b7a73SJunchao Zhang     if (nbyte % sizeof(int)) { /* If the type size is not multiple of int */
742d941a2f0SJunchao Zhang #if !defined(PETSC_HAVE_DEVICE)
7439371c9d4SSatish Balay       if (nbyte == 4) PackInit_DumbType<char, 4, 1>(link);
7449371c9d4SSatish Balay       else if (nbyte % 4 == 0) PackInit_DumbType<char, 4, 0>(link);
7459371c9d4SSatish Balay       else if (nbyte == 2) PackInit_DumbType<char, 2, 1>(link);
7469371c9d4SSatish Balay       else if (nbyte % 2 == 0) PackInit_DumbType<char, 2, 0>(link);
7479371c9d4SSatish Balay       else if (nbyte == 1) PackInit_DumbType<char, 1, 1>(link);
7489371c9d4SSatish Balay       else if (nbyte % 1 == 0)
749eee4e20aSJunchao Zhang #endif
750d941a2f0SJunchao Zhang         PackInit_DumbType<char, 1, 0>(link);
751914b7a73SJunchao Zhang     } else {
752914b7a73SJunchao Zhang       nInt = nbyte / sizeof(int);
753d941a2f0SJunchao Zhang #if !defined(PETSC_HAVE_DEVICE)
7549371c9d4SSatish Balay       if (nInt == 8) PackInit_DumbType<int, 8, 1>(link);
7559371c9d4SSatish Balay       else if (nInt % 8 == 0) PackInit_DumbType<int, 8, 0>(link);
7569371c9d4SSatish Balay       else if (nInt == 4) PackInit_DumbType<int, 4, 1>(link);
7579371c9d4SSatish Balay       else if (nInt % 4 == 0) PackInit_DumbType<int, 4, 0>(link);
7589371c9d4SSatish Balay       else if (nInt == 2) PackInit_DumbType<int, 2, 1>(link);
7599371c9d4SSatish Balay       else if (nInt % 2 == 0) PackInit_DumbType<int, 2, 0>(link);
7609371c9d4SSatish Balay       else if (nInt == 1) PackInit_DumbType<int, 1, 1>(link);
7619371c9d4SSatish Balay       else if (nInt % 1 == 0)
762eee4e20aSJunchao Zhang #endif
763d941a2f0SJunchao Zhang         PackInit_DumbType<int, 1, 0>(link);
764914b7a73SJunchao Zhang     }
765914b7a73SJunchao Zhang   }
766914b7a73SJunchao Zhang 
76771438e86SJunchao Zhang   link->SyncDevice   = PetscSFLinkSyncDevice_Kokkos;
76871438e86SJunchao Zhang   link->SyncStream   = PetscSFLinkSyncStream_Kokkos;
76920c24465SJunchao Zhang   link->Memcpy       = PetscSFLinkMemcpy_Kokkos;
77071438e86SJunchao Zhang   link->Destroy      = PetscSFLinkDestroy_Kokkos;
771914b7a73SJunchao Zhang   link->deviceinited = PETSC_TRUE;
7723ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
773914b7a73SJunchao Zhang }
774