xref: /petsc/src/vec/is/sf/impls/basic/kokkos/sfkok.kokkos.cxx (revision 08401ef684002a709c6d3db98a0c9f54a8bcf1ec)
1914b7a73SJunchao Zhang #include <../src/vec/is/sf/impls/basic/sfpack.h>
2914b7a73SJunchao Zhang 
3914b7a73SJunchao Zhang #include <Kokkos_Core.hpp>
4914b7a73SJunchao Zhang 
5914b7a73SJunchao Zhang using DeviceExecutionSpace = Kokkos::DefaultExecutionSpace;
6914b7a73SJunchao Zhang using DeviceMemorySpace    = typename DeviceExecutionSpace::memory_space;
7914b7a73SJunchao Zhang using HostMemorySpace      = Kokkos::HostSpace;
8914b7a73SJunchao Zhang 
9914b7a73SJunchao Zhang typedef Kokkos::View<char*,DeviceMemorySpace>       deviceBuffer_t;
10914b7a73SJunchao Zhang typedef Kokkos::View<char*,HostMemorySpace>         HostBuffer_t;
11914b7a73SJunchao Zhang 
12914b7a73SJunchao Zhang typedef Kokkos::View<const char*,DeviceMemorySpace> deviceConstBuffer_t;
13914b7a73SJunchao Zhang typedef Kokkos::View<const char*,HostMemorySpace>   HostConstBuffer_t;
14914b7a73SJunchao Zhang 
15914b7a73SJunchao Zhang /*====================================================================================*/
16914b7a73SJunchao Zhang /*                             Regular operations                           */
17914b7a73SJunchao Zhang /*====================================================================================*/
18914b7a73SJunchao Zhang template<typename Type> struct Insert{KOKKOS_INLINE_FUNCTION Type operator()(Type& x,Type y) const {Type old = x; x  = y;             return old;}};
19914b7a73SJunchao Zhang template<typename Type> struct Add   {KOKKOS_INLINE_FUNCTION Type operator()(Type& x,Type y) const {Type old = x; x += y;             return old;}};
20914b7a73SJunchao Zhang template<typename Type> struct Mult  {KOKKOS_INLINE_FUNCTION Type operator()(Type& x,Type y) const {Type old = x; x *= y;             return old;}};
21914b7a73SJunchao Zhang template<typename Type> struct Min   {KOKKOS_INLINE_FUNCTION Type operator()(Type& x,Type y) const {Type old = x; x  = PetscMin(x,y); return old;}};
22914b7a73SJunchao Zhang template<typename Type> struct Max   {KOKKOS_INLINE_FUNCTION Type operator()(Type& x,Type y) const {Type old = x; x  = PetscMax(x,y); return old;}};
23914b7a73SJunchao Zhang template<typename Type> struct LAND  {KOKKOS_INLINE_FUNCTION Type operator()(Type& x,Type y) const {Type old = x; x  = x && y;        return old;}};
24914b7a73SJunchao Zhang template<typename Type> struct LOR   {KOKKOS_INLINE_FUNCTION Type operator()(Type& x,Type y) const {Type old = x; x  = x || y;        return old;}};
25914b7a73SJunchao Zhang template<typename Type> struct LXOR  {KOKKOS_INLINE_FUNCTION Type operator()(Type& x,Type y) const {Type old = x; x  = !x != !y;      return old;}};
26914b7a73SJunchao Zhang template<typename Type> struct BAND  {KOKKOS_INLINE_FUNCTION Type operator()(Type& x,Type y) const {Type old = x; x  = x & y;         return old;}};
27914b7a73SJunchao Zhang template<typename Type> struct BOR   {KOKKOS_INLINE_FUNCTION Type operator()(Type& x,Type y) const {Type old = x; x  = x | y;         return old;}};
28914b7a73SJunchao Zhang template<typename Type> struct BXOR  {KOKKOS_INLINE_FUNCTION Type operator()(Type& x,Type y) const {Type old = x; x  = x ^ y;         return old;}};
29914b7a73SJunchao Zhang template<typename PairType> struct Minloc {
30914b7a73SJunchao Zhang   KOKKOS_INLINE_FUNCTION PairType operator()(PairType& x,PairType y) const {
31914b7a73SJunchao Zhang     PairType old = x;
32914b7a73SJunchao Zhang     if (y.first < x.first) x = y;
33914b7a73SJunchao Zhang     else if (y.first == x.first) x.second = PetscMin(x.second,y.second);
34914b7a73SJunchao Zhang     return old;
35914b7a73SJunchao Zhang   }
36914b7a73SJunchao Zhang };
37914b7a73SJunchao Zhang template<typename PairType> struct Maxloc {
38914b7a73SJunchao Zhang   KOKKOS_INLINE_FUNCTION PairType operator()(PairType& x,PairType y) const {
39914b7a73SJunchao Zhang     PairType old = x;
40914b7a73SJunchao Zhang     if (y.first > x.first) x = y;
41914b7a73SJunchao Zhang     else if (y.first == x.first) x.second = PetscMin(x.second,y.second); /* See MPI MAXLOC */
42914b7a73SJunchao Zhang     return old;
43914b7a73SJunchao Zhang   }
44914b7a73SJunchao Zhang };
45914b7a73SJunchao Zhang 
46914b7a73SJunchao Zhang /*====================================================================================*/
47914b7a73SJunchao Zhang /*                             Atomic operations                            */
48914b7a73SJunchao Zhang /*====================================================================================*/
49914b7a73SJunchao Zhang template<typename Type> struct AtomicInsert  {KOKKOS_INLINE_FUNCTION void operator()(Type& x,Type y) const {Kokkos::atomic_assign(&x,y);}};
50914b7a73SJunchao Zhang template<typename Type> struct AtomicAdd     {KOKKOS_INLINE_FUNCTION void operator()(Type& x,Type y) const {Kokkos::atomic_add(&x,y);}};
51914b7a73SJunchao Zhang template<typename Type> struct AtomicBAND    {KOKKOS_INLINE_FUNCTION void operator()(Type& x,Type y) const {Kokkos::atomic_and(&x,y);}};
52914b7a73SJunchao Zhang template<typename Type> struct AtomicBOR     {KOKKOS_INLINE_FUNCTION void operator()(Type& x,Type y) const {Kokkos::atomic_or (&x,y);}};
53914b7a73SJunchao Zhang template<typename Type> struct AtomicBXOR    {KOKKOS_INLINE_FUNCTION void operator()(Type& x,Type y) const {Kokkos::atomic_fetch_xor(&x,y);}};
54914b7a73SJunchao Zhang template<typename Type> struct AtomicLAND    {KOKKOS_INLINE_FUNCTION void operator()(Type& x,Type y) const {const Type zero=0,one=~0; Kokkos::atomic_and(&x,y?one:zero);}};
55914b7a73SJunchao Zhang template<typename Type> struct AtomicLOR     {KOKKOS_INLINE_FUNCTION void operator()(Type& x,Type y) const {const Type zero=0,one=1;  Kokkos::atomic_or (&x,y?one:zero);}};
56914b7a73SJunchao Zhang template<typename Type> struct AtomicMult    {KOKKOS_INLINE_FUNCTION void operator()(Type& x,Type y) const {Kokkos::atomic_fetch_mul(&x,y);}};
57914b7a73SJunchao Zhang template<typename Type> struct AtomicMin     {KOKKOS_INLINE_FUNCTION void operator()(Type& x,Type y) const {Kokkos::atomic_fetch_min(&x,y);}};
58914b7a73SJunchao Zhang template<typename Type> struct AtomicMax     {KOKKOS_INLINE_FUNCTION void operator()(Type& x,Type y) const {Kokkos::atomic_fetch_max(&x,y);}};
59914b7a73SJunchao Zhang /* TODO: struct AtomicLXOR  */
60914b7a73SJunchao Zhang template<typename Type> struct AtomicFetchAdd{KOKKOS_INLINE_FUNCTION Type operator()(Type& x,Type y) const {return Kokkos::atomic_fetch_add(&x,y);}};
61914b7a73SJunchao Zhang 
62914b7a73SJunchao Zhang /* Map a thread id to an index in root/leaf space through a series of 3D subdomains. See PetscSFPackOpt. */
63914b7a73SJunchao Zhang static KOKKOS_INLINE_FUNCTION PetscInt MapTidToIndex(const PetscInt *opt,PetscInt tid)
64914b7a73SJunchao Zhang {
65914b7a73SJunchao Zhang   PetscInt        i,j,k,m,n,r;
66914b7a73SJunchao Zhang   const PetscInt  *offset,*start,*dx,*dy,*X,*Y;
67914b7a73SJunchao Zhang 
68914b7a73SJunchao Zhang   n      = opt[0];
69914b7a73SJunchao Zhang   offset = opt + 1;
70914b7a73SJunchao Zhang   start  = opt + n + 2;
71914b7a73SJunchao Zhang   dx     = opt + 2*n + 2;
72914b7a73SJunchao Zhang   dy     = opt + 3*n + 2;
73914b7a73SJunchao Zhang   X      = opt + 5*n + 2;
74914b7a73SJunchao Zhang   Y      = opt + 6*n + 2;
75914b7a73SJunchao Zhang   for (r=0; r<n; r++) {if (tid < offset[r+1]) break;}
76914b7a73SJunchao Zhang   m = (tid - offset[r]);
77914b7a73SJunchao Zhang   k = m/(dx[r]*dy[r]);
78914b7a73SJunchao Zhang   j = (m - k*dx[r]*dy[r])/dx[r];
79914b7a73SJunchao Zhang   i = m - k*dx[r]*dy[r] - j*dx[r];
80914b7a73SJunchao Zhang 
81914b7a73SJunchao Zhang   return (start[r] + k*X[r]*Y[r] + j*X[r] + i);
82914b7a73SJunchao Zhang }
83914b7a73SJunchao Zhang 
84914b7a73SJunchao Zhang /*====================================================================================*/
85914b7a73SJunchao Zhang /*  Wrappers for Pack/Unpack/Scatter kernels. Function pointers are stored in 'link'         */
86914b7a73SJunchao Zhang /*====================================================================================*/
87914b7a73SJunchao Zhang 
88914b7a73SJunchao Zhang /* Suppose user calls PetscSFReduce(sf,unit,...) and <unit> is an MPI data type made of 16 PetscReals, then
89914b7a73SJunchao Zhang    <Type> is PetscReal, which is the primitive type we operate on.
90914b7a73SJunchao Zhang    <bs>   is 16, which says <unit> contains 16 primitive types.
91914b7a73SJunchao Zhang    <BS>   is 8, which is the maximal SIMD width we will try to vectorize operations on <unit>.
92914b7a73SJunchao Zhang    <EQ>   is 0, which is (bs == BS ? 1 : 0)
93914b7a73SJunchao Zhang 
94914b7a73SJunchao Zhang   If instead, <unit> has 8 PetscReals, then bs=8, BS=8, EQ=1, rendering MBS below to a compile time constant.
95914b7a73SJunchao Zhang   For the common case in VecScatter, bs=1, BS=1, EQ=1, MBS=1, the inner for-loops below will be totally unrolled.
96914b7a73SJunchao Zhang */
97914b7a73SJunchao Zhang template<typename Type,PetscInt BS,PetscInt EQ>
98914b7a73SJunchao Zhang static PetscErrorCode Pack(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,const void *data_,void *buf_)
99914b7a73SJunchao Zhang {
100914b7a73SJunchao Zhang   const PetscInt          *iopt = opt ? opt->array : NULL;
101914b7a73SJunchao Zhang   const PetscInt          M = EQ ? 1 : link->bs/BS, MBS=M*BS; /* If EQ, then MBS will be a compile-time const */
102914b7a73SJunchao Zhang   const Type              *data = static_cast<const Type*>(data_);
103914b7a73SJunchao Zhang   Type                    *buf = static_cast<Type*>(buf_);
104f4af43b4SJunchao Zhang   DeviceExecutionSpace    exec;
105914b7a73SJunchao Zhang 
106914b7a73SJunchao Zhang   PetscFunctionBegin;
107914b7a73SJunchao Zhang   Kokkos::parallel_for(Kokkos::RangePolicy<DeviceExecutionSpace>(exec,0,count),KOKKOS_LAMBDA(PetscInt tid) {
108914b7a73SJunchao Zhang     /* iopt != NULL ==> idx == NULL, i.e., the indices have patterns but not contiguous;
109914b7a73SJunchao Zhang        iopt == NULL && idx == NULL ==> the indices are contiguous;
110914b7a73SJunchao Zhang      */
111914b7a73SJunchao Zhang     PetscInt t = (iopt? MapTidToIndex(iopt,tid) : (idx? idx[tid] : start+tid))*MBS;
112914b7a73SJunchao Zhang     PetscInt s = tid*MBS;
113914b7a73SJunchao Zhang     for (int i=0; i<MBS; i++) buf[s+i] = data[t+i];
114914b7a73SJunchao Zhang   });
115914b7a73SJunchao Zhang   PetscFunctionReturn(0);
116914b7a73SJunchao Zhang }
117914b7a73SJunchao Zhang 
118914b7a73SJunchao Zhang template<typename Type,class Op,PetscInt BS,PetscInt EQ>
119914b7a73SJunchao Zhang static PetscErrorCode UnpackAndOp(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,void *data_,const void *buf_)
120914b7a73SJunchao Zhang {
121914b7a73SJunchao Zhang   Op                      op;
122914b7a73SJunchao Zhang   const PetscInt          *iopt = opt ? opt->array : NULL;
123914b7a73SJunchao Zhang   const PetscInt          M = EQ ? 1 : link->bs/BS, MBS=M*BS;
124914b7a73SJunchao Zhang   Type                    *data = static_cast<Type*>(data_);
125914b7a73SJunchao Zhang   const Type              *buf = static_cast<const Type*>(buf_);
126f4af43b4SJunchao Zhang   DeviceExecutionSpace    exec;
127914b7a73SJunchao Zhang 
128914b7a73SJunchao Zhang   PetscFunctionBegin;
129914b7a73SJunchao Zhang   Kokkos::parallel_for(Kokkos::RangePolicy<DeviceExecutionSpace>(exec,0,count),KOKKOS_LAMBDA(PetscInt tid) {
130914b7a73SJunchao Zhang     PetscInt t = (iopt? MapTidToIndex(iopt,tid) : (idx? idx[tid] : start+tid))*MBS;
131914b7a73SJunchao Zhang     PetscInt s = tid*MBS;
132914b7a73SJunchao Zhang     for (int i=0; i<MBS; i++) op(data[t+i],buf[s+i]);
133914b7a73SJunchao Zhang   });
134914b7a73SJunchao Zhang   PetscFunctionReturn(0);
135914b7a73SJunchao Zhang }
136914b7a73SJunchao Zhang 
137914b7a73SJunchao Zhang template<typename Type,class Op,PetscInt BS,PetscInt EQ>
138914b7a73SJunchao Zhang static PetscErrorCode FetchAndOp(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,void *data,void *buf)
139914b7a73SJunchao Zhang {
140914b7a73SJunchao Zhang   Op                      op;
141914b7a73SJunchao Zhang   const PetscInt          *ropt = opt ? opt->array : NULL;
142914b7a73SJunchao Zhang   const PetscInt          M = EQ ? 1 : link->bs/BS, MBS=M*BS;
143914b7a73SJunchao Zhang   Type                    *rootdata = static_cast<Type*>(data),*leafbuf=static_cast<Type*>(buf);
144f4af43b4SJunchao Zhang   DeviceExecutionSpace    exec;
145914b7a73SJunchao Zhang 
146914b7a73SJunchao Zhang   PetscFunctionBegin;
147914b7a73SJunchao Zhang   Kokkos::parallel_for(Kokkos::RangePolicy<DeviceExecutionSpace>(exec,0,count),KOKKOS_LAMBDA(PetscInt tid) {
148914b7a73SJunchao Zhang     PetscInt r = (ropt? MapTidToIndex(ropt,tid) : (idx? idx[tid] : start+tid))*MBS;
149914b7a73SJunchao Zhang     PetscInt l = tid*MBS;
150914b7a73SJunchao Zhang     for (int i=0; i<MBS; i++) leafbuf[l+i] = op(rootdata[r+i],leafbuf[l+i]);
151914b7a73SJunchao Zhang   });
152914b7a73SJunchao Zhang   PetscFunctionReturn(0);
153914b7a73SJunchao Zhang }
154914b7a73SJunchao Zhang 
155914b7a73SJunchao Zhang template<typename Type,class Op,PetscInt BS,PetscInt EQ>
156914b7a73SJunchao Zhang static PetscErrorCode ScatterAndOp(PetscSFLink link,PetscInt count,PetscInt srcStart,PetscSFPackOpt srcOpt,const PetscInt *srcIdx,const void *src_,PetscInt dstStart,PetscSFPackOpt dstOpt,const PetscInt *dstIdx,void *dst_)
157914b7a73SJunchao Zhang {
158914b7a73SJunchao Zhang   PetscInt                srcx=0,srcy=0,srcX=0,srcY=0,dstx=0,dsty=0,dstX=0,dstY=0;
159914b7a73SJunchao Zhang   const PetscInt          M = (EQ) ? 1 : link->bs/BS, MBS=M*BS;
160914b7a73SJunchao Zhang   const Type              *src = static_cast<const Type*>(src_);
161914b7a73SJunchao Zhang   Type                    *dst = static_cast<Type*>(dst_);
162f4af43b4SJunchao Zhang   DeviceExecutionSpace    exec;
163914b7a73SJunchao Zhang 
164914b7a73SJunchao Zhang   PetscFunctionBegin;
165914b7a73SJunchao Zhang   /* The 3D shape of source subdomain may be different than that of the destination, which makes it difficult to use CUDA 3D grid and block */
166914b7a73SJunchao Zhang   if (srcOpt)       {srcx = srcOpt->dx[0]; srcy = srcOpt->dy[0]; srcX = srcOpt->X[0]; srcY = srcOpt->Y[0]; srcStart = srcOpt->start[0]; srcIdx = NULL;}
167914b7a73SJunchao Zhang   else if (!srcIdx) {srcx = srcX = count; srcy = srcY = 1;}
168914b7a73SJunchao Zhang 
169914b7a73SJunchao Zhang   if (dstOpt)       {dstx = dstOpt->dx[0]; dsty = dstOpt->dy[0]; dstX = dstOpt->X[0]; dstY = dstOpt->Y[0]; dstStart = dstOpt->start[0]; dstIdx = NULL;}
170914b7a73SJunchao Zhang   else if (!dstIdx) {dstx = dstX = count; dsty = dstY = 1;}
171914b7a73SJunchao Zhang 
172914b7a73SJunchao Zhang   Kokkos::parallel_for(Kokkos::RangePolicy<DeviceExecutionSpace>(exec,0,count),KOKKOS_LAMBDA(PetscInt tid) {
173914b7a73SJunchao Zhang     PetscInt i,j,k,s,t;
174914b7a73SJunchao Zhang     Op       op;
175914b7a73SJunchao Zhang     if (!srcIdx) { /* src is in 3D */
176914b7a73SJunchao Zhang       k = tid/(srcx*srcy);
177914b7a73SJunchao Zhang       j = (tid - k*srcx*srcy)/srcx;
178914b7a73SJunchao Zhang       i = tid - k*srcx*srcy - j*srcx;
179914b7a73SJunchao Zhang       s = srcStart + k*srcX*srcY + j*srcX + i;
180914b7a73SJunchao Zhang     } else { /* src is contiguous */
181914b7a73SJunchao Zhang       s = srcIdx[tid];
182914b7a73SJunchao Zhang     }
183914b7a73SJunchao Zhang 
184914b7a73SJunchao Zhang     if (!dstIdx) { /* 3D */
185914b7a73SJunchao Zhang       k = tid/(dstx*dsty);
186914b7a73SJunchao Zhang       j = (tid - k*dstx*dsty)/dstx;
187914b7a73SJunchao Zhang       i = tid - k*dstx*dsty - j*dstx;
188914b7a73SJunchao Zhang       t = dstStart + k*dstX*dstY + j*dstX + i;
189914b7a73SJunchao Zhang     } else { /* contiguous */
190914b7a73SJunchao Zhang       t = dstIdx[tid];
191914b7a73SJunchao Zhang     }
192914b7a73SJunchao Zhang 
193914b7a73SJunchao Zhang     s *= MBS;
194914b7a73SJunchao Zhang     t *= MBS;
195914b7a73SJunchao Zhang     for (i=0; i<MBS; i++) op(dst[t+i],src[s+i]);
196914b7a73SJunchao Zhang   });
197914b7a73SJunchao Zhang   PetscFunctionReturn(0);
198914b7a73SJunchao Zhang }
199914b7a73SJunchao Zhang 
200914b7a73SJunchao Zhang /* Specialization for Insert since we may use memcpy */
201914b7a73SJunchao Zhang template<typename Type,PetscInt BS,PetscInt EQ>
202914b7a73SJunchao Zhang static PetscErrorCode ScatterAndInsert(PetscSFLink link,PetscInt count,PetscInt srcStart,PetscSFPackOpt srcOpt,const PetscInt *srcIdx,const void *src_,PetscInt dstStart,PetscSFPackOpt dstOpt,const PetscInt *dstIdx,void *dst_)
203914b7a73SJunchao Zhang {
204914b7a73SJunchao Zhang   const Type              *src = static_cast<const Type*>(src_);
205914b7a73SJunchao Zhang   Type                    *dst = static_cast<Type*>(dst_);
206f4af43b4SJunchao Zhang   DeviceExecutionSpace    exec;
207914b7a73SJunchao Zhang 
208914b7a73SJunchao Zhang   PetscFunctionBegin;
209914b7a73SJunchao Zhang   if (!count) PetscFunctionReturn(0);
210914b7a73SJunchao Zhang   /*src and dst are contiguous */
211914b7a73SJunchao Zhang   if ((!srcOpt && !srcIdx) && (!dstOpt && !dstIdx) && src != dst) {
212914b7a73SJunchao Zhang     size_t sz = count*link->unitbytes;
213914b7a73SJunchao Zhang     deviceBuffer_t      dbuf(reinterpret_cast<char*>(dst+dstStart*link->bs),sz);
214914b7a73SJunchao Zhang     deviceConstBuffer_t sbuf(reinterpret_cast<const char*>(src+srcStart*link->bs),sz);
215914b7a73SJunchao Zhang     Kokkos::deep_copy(exec,dbuf,sbuf);
216914b7a73SJunchao Zhang   } else {
2179566063dSJacob Faibussowitsch     PetscCall(ScatterAndOp<Type,Insert<Type>,BS,EQ>(link,count,srcStart,srcOpt,srcIdx,src,dstStart,dstOpt,dstIdx,dst));
218914b7a73SJunchao Zhang   }
219914b7a73SJunchao Zhang   PetscFunctionReturn(0);
220914b7a73SJunchao Zhang }
221914b7a73SJunchao Zhang 
222914b7a73SJunchao Zhang template<typename Type,class Op,PetscInt BS,PetscInt EQ>
223914b7a73SJunchao Zhang static PetscErrorCode FetchAndOpLocal(PetscSFLink link,PetscInt count,PetscInt rootstart,PetscSFPackOpt rootopt,const PetscInt *rootidx,void *rootdata_,PetscInt leafstart,PetscSFPackOpt leafopt,const PetscInt *leafidx,const void *leafdata_,void *leafupdate_)
224914b7a73SJunchao Zhang {
225914b7a73SJunchao Zhang   Op                      op;
226914b7a73SJunchao Zhang   const PetscInt          M = (EQ) ? 1 : link->bs/BS, MBS = M*BS;
227914b7a73SJunchao Zhang   const PetscInt          *ropt = rootopt ? rootopt->array : NULL;
228914b7a73SJunchao Zhang   const PetscInt          *lopt = leafopt ? leafopt->array : NULL;
229914b7a73SJunchao Zhang   Type                    *rootdata = static_cast<Type*>(rootdata_),*leafupdate = static_cast<Type*>(leafupdate_);
230914b7a73SJunchao Zhang   const Type              *leafdata = static_cast<const Type*>(leafdata_);
231f4af43b4SJunchao Zhang   DeviceExecutionSpace    exec;
232914b7a73SJunchao Zhang 
233914b7a73SJunchao Zhang   PetscFunctionBegin;
234914b7a73SJunchao Zhang   Kokkos::parallel_for(Kokkos::RangePolicy<DeviceExecutionSpace>(exec,0,count),KOKKOS_LAMBDA(PetscInt tid) {
235914b7a73SJunchao Zhang     PetscInt r = (ropt? MapTidToIndex(ropt,tid) : (rootidx? rootidx[tid] : rootstart+tid))*MBS;
236914b7a73SJunchao Zhang     PetscInt l = (lopt? MapTidToIndex(lopt,tid) : (leafidx? leafidx[tid] : leafstart+tid))*MBS;
237914b7a73SJunchao Zhang     for (int i=0; i<MBS; i++) leafupdate[l+i] = op(rootdata[r+i],leafdata[l+i]);
238914b7a73SJunchao Zhang   });
239914b7a73SJunchao Zhang   PetscFunctionReturn(0);
240914b7a73SJunchao Zhang }
241914b7a73SJunchao Zhang 
242914b7a73SJunchao Zhang /*====================================================================================*/
243914b7a73SJunchao Zhang /*  Init various types and instantiate pack/unpack function pointers                  */
244914b7a73SJunchao Zhang /*====================================================================================*/
245914b7a73SJunchao Zhang template<typename Type,PetscInt BS,PetscInt EQ>
246914b7a73SJunchao Zhang static void PackInit_RealType(PetscSFLink link)
247914b7a73SJunchao Zhang {
248914b7a73SJunchao Zhang   /* Pack/unpack for remote communication */
249914b7a73SJunchao Zhang   link->d_Pack              = Pack<Type,BS,EQ>;
250914b7a73SJunchao Zhang   link->d_UnpackAndInsert   = UnpackAndOp<Type,Insert<Type>,BS,EQ>;
251914b7a73SJunchao Zhang   link->d_UnpackAndAdd      = UnpackAndOp<Type,Add<Type>   ,BS,EQ>;
252914b7a73SJunchao Zhang   link->d_UnpackAndMult     = UnpackAndOp<Type,Mult<Type>  ,BS,EQ>;
253914b7a73SJunchao Zhang   link->d_UnpackAndMin      = UnpackAndOp<Type,Min<Type>   ,BS,EQ>;
254914b7a73SJunchao Zhang   link->d_UnpackAndMax      = UnpackAndOp<Type,Max<Type>   ,BS,EQ>;
255914b7a73SJunchao Zhang   link->d_FetchAndAdd       = FetchAndOp <Type,Add<Type>   ,BS,EQ>;
256914b7a73SJunchao Zhang   /* Scatter for local communication */
257914b7a73SJunchao Zhang   link->d_ScatterAndInsert  = ScatterAndInsert<Type,BS,EQ>; /* Has special optimizations */
258914b7a73SJunchao Zhang   link->d_ScatterAndAdd     = ScatterAndOp<Type,Add<Type>    ,BS,EQ>;
259914b7a73SJunchao Zhang   link->d_ScatterAndMult    = ScatterAndOp<Type,Mult<Type>   ,BS,EQ>;
260914b7a73SJunchao Zhang   link->d_ScatterAndMin     = ScatterAndOp<Type,Min<Type>    ,BS,EQ>;
261914b7a73SJunchao Zhang   link->d_ScatterAndMax     = ScatterAndOp<Type,Max<Type>    ,BS,EQ>;
262914b7a73SJunchao Zhang   link->d_FetchAndAddLocal  = FetchAndOpLocal<Type,Add <Type>,BS,EQ>;
263914b7a73SJunchao Zhang   /* Atomic versions when there are data-race possibilities */
264914b7a73SJunchao Zhang   link->da_UnpackAndInsert  = UnpackAndOp<Type,AtomicInsert<Type>  ,BS,EQ>;
265914b7a73SJunchao Zhang   link->da_UnpackAndAdd     = UnpackAndOp<Type,AtomicAdd<Type>     ,BS,EQ>;
266914b7a73SJunchao Zhang   link->da_UnpackAndMult    = UnpackAndOp<Type,AtomicMult<Type>    ,BS,EQ>;
267914b7a73SJunchao Zhang   link->da_UnpackAndMin     = UnpackAndOp<Type,AtomicMin<Type>     ,BS,EQ>;
268914b7a73SJunchao Zhang   link->da_UnpackAndMax     = UnpackAndOp<Type,AtomicMax<Type>     ,BS,EQ>;
269914b7a73SJunchao Zhang   link->da_FetchAndAdd      = FetchAndOp <Type,AtomicFetchAdd<Type>,BS,EQ>;
270914b7a73SJunchao Zhang 
271914b7a73SJunchao Zhang   link->da_ScatterAndInsert = ScatterAndOp<Type,AtomicInsert<Type>,BS,EQ>;
272914b7a73SJunchao Zhang   link->da_ScatterAndAdd    = ScatterAndOp<Type,AtomicAdd<Type>   ,BS,EQ>;
273914b7a73SJunchao Zhang   link->da_ScatterAndMult   = ScatterAndOp<Type,AtomicMult<Type>  ,BS,EQ>;
274914b7a73SJunchao Zhang   link->da_ScatterAndMin    = ScatterAndOp<Type,AtomicMin<Type>   ,BS,EQ>;
275914b7a73SJunchao Zhang   link->da_ScatterAndMax    = ScatterAndOp<Type,AtomicMax<Type>   ,BS,EQ>;
276914b7a73SJunchao Zhang   link->da_FetchAndAddLocal = FetchAndOpLocal<Type,AtomicFetchAdd<Type>,BS,EQ>;
277914b7a73SJunchao Zhang }
278914b7a73SJunchao Zhang 
279914b7a73SJunchao Zhang template<typename Type,PetscInt BS,PetscInt EQ>
280914b7a73SJunchao Zhang static void PackInit_IntegerType(PetscSFLink link)
281914b7a73SJunchao Zhang {
282914b7a73SJunchao Zhang   link->d_Pack              = Pack<Type,BS,EQ>;
283914b7a73SJunchao Zhang   link->d_UnpackAndInsert   = UnpackAndOp<Type,Insert<Type> ,BS,EQ>;
284914b7a73SJunchao Zhang   link->d_UnpackAndAdd      = UnpackAndOp<Type,Add<Type>    ,BS,EQ>;
285914b7a73SJunchao Zhang   link->d_UnpackAndMult     = UnpackAndOp<Type,Mult<Type>   ,BS,EQ>;
286914b7a73SJunchao Zhang   link->d_UnpackAndMin      = UnpackAndOp<Type,Min<Type>    ,BS,EQ>;
287914b7a73SJunchao Zhang   link->d_UnpackAndMax      = UnpackAndOp<Type,Max<Type>    ,BS,EQ>;
288914b7a73SJunchao Zhang   link->d_UnpackAndLAND     = UnpackAndOp<Type,LAND<Type>   ,BS,EQ>;
289914b7a73SJunchao Zhang   link->d_UnpackAndLOR      = UnpackAndOp<Type,LOR<Type>    ,BS,EQ>;
290914b7a73SJunchao Zhang   link->d_UnpackAndLXOR     = UnpackAndOp<Type,LXOR<Type>   ,BS,EQ>;
291914b7a73SJunchao Zhang   link->d_UnpackAndBAND     = UnpackAndOp<Type,BAND<Type>   ,BS,EQ>;
292914b7a73SJunchao Zhang   link->d_UnpackAndBOR      = UnpackAndOp<Type,BOR<Type>    ,BS,EQ>;
293914b7a73SJunchao Zhang   link->d_UnpackAndBXOR     = UnpackAndOp<Type,BXOR<Type>   ,BS,EQ>;
294914b7a73SJunchao Zhang   link->d_FetchAndAdd       = FetchAndOp <Type,Add<Type>    ,BS,EQ>;
295914b7a73SJunchao Zhang 
296914b7a73SJunchao Zhang   link->d_ScatterAndInsert  = ScatterAndInsert<Type,BS,EQ>;
297914b7a73SJunchao Zhang   link->d_ScatterAndAdd     = ScatterAndOp<Type,Add<Type>   ,BS,EQ>;
298914b7a73SJunchao Zhang   link->d_ScatterAndMult    = ScatterAndOp<Type,Mult<Type>  ,BS,EQ>;
299914b7a73SJunchao Zhang   link->d_ScatterAndMin     = ScatterAndOp<Type,Min<Type>   ,BS,EQ>;
300914b7a73SJunchao Zhang   link->d_ScatterAndMax     = ScatterAndOp<Type,Max<Type>   ,BS,EQ>;
301914b7a73SJunchao Zhang   link->d_ScatterAndLAND    = ScatterAndOp<Type,LAND<Type>  ,BS,EQ>;
302914b7a73SJunchao Zhang   link->d_ScatterAndLOR     = ScatterAndOp<Type,LOR<Type>   ,BS,EQ>;
303914b7a73SJunchao Zhang   link->d_ScatterAndLXOR    = ScatterAndOp<Type,LXOR<Type>  ,BS,EQ>;
304914b7a73SJunchao Zhang   link->d_ScatterAndBAND    = ScatterAndOp<Type,BAND<Type>  ,BS,EQ>;
305914b7a73SJunchao Zhang   link->d_ScatterAndBOR     = ScatterAndOp<Type,BOR<Type>   ,BS,EQ>;
306914b7a73SJunchao Zhang   link->d_ScatterAndBXOR    = ScatterAndOp<Type,BXOR<Type>  ,BS,EQ>;
307914b7a73SJunchao Zhang   link->d_FetchAndAddLocal  = FetchAndOpLocal<Type,Add<Type>,BS,EQ>;
308914b7a73SJunchao Zhang 
309914b7a73SJunchao Zhang   link->da_UnpackAndInsert  = UnpackAndOp<Type,AtomicInsert<Type>,BS,EQ>;
310914b7a73SJunchao Zhang   link->da_UnpackAndAdd     = UnpackAndOp<Type,AtomicAdd<Type>   ,BS,EQ>;
311914b7a73SJunchao Zhang   link->da_UnpackAndMult    = UnpackAndOp<Type,AtomicMult<Type>  ,BS,EQ>;
312914b7a73SJunchao Zhang   link->da_UnpackAndMin     = UnpackAndOp<Type,AtomicMin<Type>   ,BS,EQ>;
313914b7a73SJunchao Zhang   link->da_UnpackAndMax     = UnpackAndOp<Type,AtomicMax<Type>   ,BS,EQ>;
314914b7a73SJunchao Zhang   link->da_UnpackAndLAND    = UnpackAndOp<Type,AtomicLAND<Type>  ,BS,EQ>;
315914b7a73SJunchao Zhang   link->da_UnpackAndLOR     = UnpackAndOp<Type,AtomicLOR<Type>   ,BS,EQ>;
316914b7a73SJunchao Zhang   link->da_UnpackAndBAND    = UnpackAndOp<Type,AtomicBAND<Type>  ,BS,EQ>;
317914b7a73SJunchao Zhang   link->da_UnpackAndBOR     = UnpackAndOp<Type,AtomicBOR<Type>   ,BS,EQ>;
318914b7a73SJunchao Zhang   link->da_UnpackAndBXOR    = UnpackAndOp<Type,AtomicBXOR<Type>  ,BS,EQ>;
319914b7a73SJunchao Zhang   link->da_FetchAndAdd      = FetchAndOp <Type,AtomicFetchAdd<Type>,BS,EQ>;
320914b7a73SJunchao Zhang 
321914b7a73SJunchao Zhang   link->da_ScatterAndInsert = ScatterAndOp<Type,AtomicInsert<Type>,BS,EQ>;
322914b7a73SJunchao Zhang   link->da_ScatterAndAdd    = ScatterAndOp<Type,AtomicAdd<Type>   ,BS,EQ>;
323914b7a73SJunchao Zhang   link->da_ScatterAndMult   = ScatterAndOp<Type,AtomicMult<Type>  ,BS,EQ>;
324914b7a73SJunchao Zhang   link->da_ScatterAndMin    = ScatterAndOp<Type,AtomicMin<Type>   ,BS,EQ>;
325914b7a73SJunchao Zhang   link->da_ScatterAndMax    = ScatterAndOp<Type,AtomicMax<Type>   ,BS,EQ>;
326914b7a73SJunchao Zhang   link->da_ScatterAndLAND   = ScatterAndOp<Type,AtomicLAND<Type>  ,BS,EQ>;
327914b7a73SJunchao Zhang   link->da_ScatterAndLOR    = ScatterAndOp<Type,AtomicLOR<Type>   ,BS,EQ>;
328914b7a73SJunchao Zhang   link->da_ScatterAndBAND   = ScatterAndOp<Type,AtomicBAND<Type>  ,BS,EQ>;
329914b7a73SJunchao Zhang   link->da_ScatterAndBOR    = ScatterAndOp<Type,AtomicBOR<Type>   ,BS,EQ>;
330914b7a73SJunchao Zhang   link->da_ScatterAndBXOR   = ScatterAndOp<Type,AtomicBXOR<Type>  ,BS,EQ>;
331914b7a73SJunchao Zhang   link->da_FetchAndAddLocal = FetchAndOpLocal<Type,AtomicFetchAdd<Type>,BS,EQ>;
332914b7a73SJunchao Zhang }
333914b7a73SJunchao Zhang 
334914b7a73SJunchao Zhang #if defined(PETSC_HAVE_COMPLEX)
335914b7a73SJunchao Zhang template<typename Type,PetscInt BS,PetscInt EQ>
336914b7a73SJunchao Zhang static void PackInit_ComplexType(PetscSFLink link)
337914b7a73SJunchao Zhang {
338914b7a73SJunchao Zhang   link->d_Pack             = Pack<Type,BS,EQ>;
339914b7a73SJunchao Zhang   link->d_UnpackAndInsert  = UnpackAndOp<Type,Insert<Type>,BS,EQ>;
340914b7a73SJunchao Zhang   link->d_UnpackAndAdd     = UnpackAndOp<Type,Add<Type>   ,BS,EQ>;
341914b7a73SJunchao Zhang   link->d_UnpackAndMult    = UnpackAndOp<Type,Mult<Type>  ,BS,EQ>;
342914b7a73SJunchao Zhang   link->d_FetchAndAdd      = FetchAndOp <Type,Add<Type>   ,BS,EQ>;
343914b7a73SJunchao Zhang 
344914b7a73SJunchao Zhang   link->d_ScatterAndInsert = ScatterAndInsert<Type,BS,EQ>;
345914b7a73SJunchao Zhang   link->d_ScatterAndAdd    = ScatterAndOp<Type,Add<Type>   ,BS,EQ>;
346914b7a73SJunchao Zhang   link->d_ScatterAndMult   = ScatterAndOp<Type,Mult<Type>  ,BS,EQ>;
347914b7a73SJunchao Zhang   link->d_FetchAndAddLocal = FetchAndOpLocal<Type,Add<Type>,BS,EQ>;
348914b7a73SJunchao Zhang 
349914b7a73SJunchao Zhang   link->da_UnpackAndInsert = UnpackAndOp<Type,AtomicInsert<Type> ,BS,EQ>;
350914b7a73SJunchao Zhang   link->da_UnpackAndAdd    = UnpackAndOp<Type,AtomicAdd<Type>    ,BS,EQ>;
351914b7a73SJunchao Zhang   link->da_UnpackAndMult   = UnpackAndOp<Type,AtomicMult<Type>   ,BS,EQ>;
352914b7a73SJunchao Zhang   link->da_FetchAndAdd     = FetchAndOp<Type,AtomicFetchAdd<Type>,BS,EQ>;
353914b7a73SJunchao Zhang 
354914b7a73SJunchao Zhang   link->da_ScatterAndInsert = ScatterAndOp<Type,AtomicInsert<Type>,BS,EQ>;
355914b7a73SJunchao Zhang   link->da_ScatterAndAdd    = ScatterAndOp<Type,AtomicAdd<Type>   ,BS,EQ>;
356914b7a73SJunchao Zhang   link->da_ScatterAndMult   = ScatterAndOp<Type,AtomicMult<Type>  ,BS,EQ>;
357914b7a73SJunchao Zhang   link->da_FetchAndAddLocal = FetchAndOpLocal<Type,AtomicFetchAdd<Type>,BS,EQ>;
358914b7a73SJunchao Zhang }
359914b7a73SJunchao Zhang #endif
360914b7a73SJunchao Zhang 
361914b7a73SJunchao Zhang template<typename Type>
362914b7a73SJunchao Zhang static void PackInit_PairType(PetscSFLink link)
363914b7a73SJunchao Zhang {
364914b7a73SJunchao Zhang   link->d_Pack             = Pack<Type,1,1>;
365914b7a73SJunchao Zhang   link->d_UnpackAndInsert  = UnpackAndOp<Type,Insert<Type>,1,1>;
366914b7a73SJunchao Zhang   link->d_UnpackAndMaxloc  = UnpackAndOp<Type,Maxloc<Type>,1,1>;
367914b7a73SJunchao Zhang   link->d_UnpackAndMinloc  = UnpackAndOp<Type,Minloc<Type>,1,1>;
368914b7a73SJunchao Zhang 
369914b7a73SJunchao Zhang   link->d_ScatterAndInsert = ScatterAndOp<Type,Insert<Type>,1,1>;
370914b7a73SJunchao Zhang   link->d_ScatterAndMaxloc = ScatterAndOp<Type,Maxloc<Type>,1,1>;
371914b7a73SJunchao Zhang   link->d_ScatterAndMinloc = ScatterAndOp<Type,Minloc<Type>,1,1>;
372914b7a73SJunchao Zhang   /* Atomics for pair types are not implemented yet */
373914b7a73SJunchao Zhang }
374914b7a73SJunchao Zhang 
375914b7a73SJunchao Zhang template<typename Type,PetscInt BS,PetscInt EQ>
376914b7a73SJunchao Zhang static void PackInit_DumbType(PetscSFLink link)
377914b7a73SJunchao Zhang {
378914b7a73SJunchao Zhang   link->d_Pack             = Pack<Type,BS,EQ>;
379914b7a73SJunchao Zhang   link->d_UnpackAndInsert  = UnpackAndOp<Type,Insert<Type>,BS,EQ>;
380914b7a73SJunchao Zhang   link->d_ScatterAndInsert = ScatterAndInsert<Type,BS,EQ>;
381914b7a73SJunchao Zhang   /* Atomics for dumb types are not implemented yet */
382914b7a73SJunchao Zhang }
383914b7a73SJunchao Zhang 
384f4af43b4SJunchao Zhang /*
385f4af43b4SJunchao Zhang   Kokkos::DefaultExecutionSpace(stream) is a reference counted pointer object. It has a bug
386f4af43b4SJunchao Zhang   that one is not able to repeatedly create and destroy the object. SF's original design was each
387f4af43b4SJunchao Zhang   SFLink has a stream (NULL or not) and hence an execution space object. The bug prevents us from
388f4af43b4SJunchao Zhang   destroying multiple SFLinks with NULL stream and the default execution space object. To avoid
389f4af43b4SJunchao Zhang   memory leaks, SF_Kokkos only supports NULL stream, which is also petsc's default scheme. SF_Kokkos
390f4af43b4SJunchao Zhang   does not do its own new/delete. It just uses Kokkos::DefaultExecutionSpace(), which is a singliton
391f4af43b4SJunchao Zhang   object in Kokkos.
392f4af43b4SJunchao Zhang */
393f4af43b4SJunchao Zhang /*
394914b7a73SJunchao Zhang static PetscErrorCode PetscSFLinkDestroy_Kokkos(PetscSFLink link)
395914b7a73SJunchao Zhang {
396914b7a73SJunchao Zhang   PetscFunctionBegin;
397914b7a73SJunchao Zhang   PetscFunctionReturn(0);
398914b7a73SJunchao Zhang }
399f4af43b4SJunchao Zhang */
400914b7a73SJunchao Zhang 
401914b7a73SJunchao Zhang /* Some device-specific utilities */
402a4af0ceeSJacob Faibussowitsch static PetscErrorCode PetscSFLinkSyncDevice_Kokkos(PetscSFLink PETSC_UNUSED link)
403914b7a73SJunchao Zhang {
404914b7a73SJunchao Zhang   PetscFunctionBegin;
405914b7a73SJunchao Zhang   Kokkos::fence();
406914b7a73SJunchao Zhang   PetscFunctionReturn(0);
407914b7a73SJunchao Zhang }
408914b7a73SJunchao Zhang 
409a4af0ceeSJacob Faibussowitsch static PetscErrorCode PetscSFLinkSyncStream_Kokkos(PetscSFLink PETSC_UNUSED link)
410914b7a73SJunchao Zhang {
411f4af43b4SJunchao Zhang   DeviceExecutionSpace    exec;
412914b7a73SJunchao Zhang   PetscFunctionBegin;
413914b7a73SJunchao Zhang   exec.fence();
414914b7a73SJunchao Zhang   PetscFunctionReturn(0);
415914b7a73SJunchao Zhang }
416914b7a73SJunchao Zhang 
417a4af0ceeSJacob Faibussowitsch static PetscErrorCode PetscSFLinkMemcpy_Kokkos(PetscSFLink PETSC_UNUSED link,PetscMemType dstmtype,void* dst,PetscMemType srcmtype,const void*src,size_t n)
418914b7a73SJunchao Zhang {
419f4af43b4SJunchao Zhang   DeviceExecutionSpace    exec;
420914b7a73SJunchao Zhang 
421914b7a73SJunchao Zhang   PetscFunctionBegin;
422914b7a73SJunchao Zhang   if (!n) PetscFunctionReturn(0);
42371438e86SJunchao Zhang   if (PetscMemTypeHost(dstmtype) && PetscMemTypeHost(srcmtype)) {
4249566063dSJacob Faibussowitsch     PetscCall(PetscMemcpy(dst,src,n));
425914b7a73SJunchao Zhang   } else {
42671438e86SJunchao Zhang     if (PetscMemTypeDevice(dstmtype) && PetscMemTypeHost(srcmtype)) {
427914b7a73SJunchao Zhang       deviceBuffer_t       dbuf(static_cast<char*>(dst),n);
428914b7a73SJunchao Zhang       HostConstBuffer_t    sbuf(static_cast<const char*>(src),n);
429914b7a73SJunchao Zhang       Kokkos::deep_copy(exec,dbuf,sbuf);
4309566063dSJacob Faibussowitsch       PetscCall(PetscLogCpuToGpu(n));
43171438e86SJunchao Zhang     } else if (PetscMemTypeHost(dstmtype) && PetscMemTypeDevice(srcmtype)) {
432914b7a73SJunchao Zhang       HostBuffer_t         dbuf(static_cast<char*>(dst),n);
433914b7a73SJunchao Zhang       deviceConstBuffer_t  sbuf(static_cast<const char*>(src),n);
434914b7a73SJunchao Zhang       Kokkos::deep_copy(exec,dbuf,sbuf);
4359566063dSJacob Faibussowitsch       PetscCall(PetscLogGpuToCpu(n));
43671438e86SJunchao Zhang     } else if (PetscMemTypeDevice(dstmtype) && PetscMemTypeDevice(srcmtype)) {
437914b7a73SJunchao Zhang       deviceBuffer_t       dbuf(static_cast<char*>(dst),n);
438914b7a73SJunchao Zhang       deviceConstBuffer_t  sbuf(static_cast<const char*>(src),n);
439914b7a73SJunchao Zhang       Kokkos::deep_copy(exec,dbuf,sbuf);
440914b7a73SJunchao Zhang     }
441914b7a73SJunchao Zhang   }
442914b7a73SJunchao Zhang   PetscFunctionReturn(0);
443914b7a73SJunchao Zhang }
444914b7a73SJunchao Zhang 
44520c24465SJunchao Zhang PetscErrorCode PetscSFMalloc_Kokkos(PetscMemType mtype,size_t size,void** ptr)
446914b7a73SJunchao Zhang {
447914b7a73SJunchao Zhang   PetscFunctionBegin;
4489566063dSJacob Faibussowitsch   if (PetscMemTypeHost(mtype)) PetscCall(PetscMalloc(size,ptr));
44971438e86SJunchao Zhang   else if (PetscMemTypeDevice(mtype)) {
4509566063dSJacob Faibussowitsch     if (!PetscKokkosInitialized) PetscCall(PetscKokkosInitializeCheck());
45145639126SStefano Zampini     *ptr = Kokkos::kokkos_malloc<DeviceMemorySpace>(size);
45298921bdaSJacob Faibussowitsch   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Wrong PetscMemType %d", (int)mtype);
453914b7a73SJunchao Zhang   PetscFunctionReturn(0);
454914b7a73SJunchao Zhang }
455914b7a73SJunchao Zhang 
45620c24465SJunchao Zhang PetscErrorCode PetscSFFree_Kokkos(PetscMemType mtype,void* ptr)
457914b7a73SJunchao Zhang {
458914b7a73SJunchao Zhang   PetscFunctionBegin;
4599566063dSJacob Faibussowitsch   if (PetscMemTypeHost(mtype)) PetscCall(PetscFree(ptr));
46071438e86SJunchao Zhang   else if (PetscMemTypeDevice(mtype)) {Kokkos::kokkos_free<DeviceMemorySpace>(ptr);}
46198921bdaSJacob Faibussowitsch   else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Wrong PetscMemType %d",(int)mtype);
462914b7a73SJunchao Zhang   PetscFunctionReturn(0);
463914b7a73SJunchao Zhang }
464914b7a73SJunchao Zhang 
46571438e86SJunchao Zhang /* Destructor when the link uses MPI for communication */
46671438e86SJunchao Zhang static PetscErrorCode PetscSFLinkDestroy_Kokkos(PetscSF sf,PetscSFLink link)
46771438e86SJunchao Zhang {
46871438e86SJunchao Zhang   PetscFunctionBegin;
46971438e86SJunchao Zhang   for (int i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) {
4709566063dSJacob Faibussowitsch     PetscCall(PetscSFFree(sf,PETSC_MEMTYPE_DEVICE,link->rootbuf_alloc[i][PETSC_MEMTYPE_DEVICE]));
4719566063dSJacob Faibussowitsch     PetscCall(PetscSFFree(sf,PETSC_MEMTYPE_DEVICE,link->leafbuf_alloc[i][PETSC_MEMTYPE_DEVICE]));
47271438e86SJunchao Zhang   }
47371438e86SJunchao Zhang   PetscFunctionReturn(0);
47471438e86SJunchao Zhang }
475914b7a73SJunchao Zhang 
476914b7a73SJunchao Zhang /* Some fields of link are initialized by PetscSFPackSetUp_Host. This routine only does what needed on device */
477a4af0ceeSJacob Faibussowitsch PetscErrorCode PetscSFLinkSetUp_Kokkos(PetscSF PETSC_UNUSED sf,PetscSFLink link,MPI_Datatype unit)
478914b7a73SJunchao Zhang {
479914b7a73SJunchao Zhang   PetscInt           nSignedChar=0,nUnsignedChar=0,nInt=0,nPetscInt=0,nPetscReal=0;
480914b7a73SJunchao Zhang   PetscBool          is2Int,is2PetscInt;
481914b7a73SJunchao Zhang #if defined(PETSC_HAVE_COMPLEX)
482914b7a73SJunchao Zhang   PetscInt           nPetscComplex=0;
483914b7a73SJunchao Zhang #endif
484914b7a73SJunchao Zhang 
485914b7a73SJunchao Zhang   PetscFunctionBegin;
486914b7a73SJunchao Zhang   if (link->deviceinited) PetscFunctionReturn(0);
4879566063dSJacob Faibussowitsch   PetscCall(PetscKokkosInitializeCheck());
4889566063dSJacob Faibussowitsch   PetscCall(MPIPetsc_Type_compare_contig(unit,MPI_SIGNED_CHAR,  &nSignedChar));
4899566063dSJacob Faibussowitsch   PetscCall(MPIPetsc_Type_compare_contig(unit,MPI_UNSIGNED_CHAR,&nUnsignedChar));
490914b7a73SJunchao Zhang   /* MPI_CHAR is treated below as a dumb type that does not support reduction according to MPI standard */
4919566063dSJacob Faibussowitsch   PetscCall(MPIPetsc_Type_compare_contig(unit,MPI_INT,  &nInt));
4929566063dSJacob Faibussowitsch   PetscCall(MPIPetsc_Type_compare_contig(unit,MPIU_INT, &nPetscInt));
4939566063dSJacob Faibussowitsch   PetscCall(MPIPetsc_Type_compare_contig(unit,MPIU_REAL,&nPetscReal));
494914b7a73SJunchao Zhang #if defined(PETSC_HAVE_COMPLEX)
4959566063dSJacob Faibussowitsch   PetscCall(MPIPetsc_Type_compare_contig(unit,MPIU_COMPLEX,&nPetscComplex));
496914b7a73SJunchao Zhang #endif
4979566063dSJacob Faibussowitsch   PetscCall(MPIPetsc_Type_compare(unit,MPI_2INT,&is2Int));
4989566063dSJacob Faibussowitsch   PetscCall(MPIPetsc_Type_compare(unit,MPIU_2INT,&is2PetscInt));
499914b7a73SJunchao Zhang 
500914b7a73SJunchao Zhang   if (is2Int) {
501914b7a73SJunchao Zhang     PackInit_PairType<Kokkos::pair<int,int>>(link);
502914b7a73SJunchao Zhang   } else if (is2PetscInt) { /* TODO: when is2PetscInt and nPetscInt=2, we don't know which path to take. The two paths support different ops. */
503914b7a73SJunchao Zhang     PackInit_PairType<Kokkos::pair<PetscInt,PetscInt>>(link);
504914b7a73SJunchao Zhang   } else if (nPetscReal) {
505d941a2f0SJunchao Zhang    #if !defined(PETSC_HAVE_DEVICE)  /* Skip the unimportant stuff to speed up SF device compilation time */
506914b7a73SJunchao Zhang     if      (nPetscReal == 8) PackInit_RealType<PetscReal,8,1>(link); else if (nPetscReal%8 == 0) PackInit_RealType<PetscReal,8,0>(link);
507914b7a73SJunchao Zhang     else if (nPetscReal == 4) PackInit_RealType<PetscReal,4,1>(link); else if (nPetscReal%4 == 0) PackInit_RealType<PetscReal,4,0>(link);
508914b7a73SJunchao Zhang     else if (nPetscReal == 2) PackInit_RealType<PetscReal,2,1>(link); else if (nPetscReal%2 == 0) PackInit_RealType<PetscReal,2,0>(link);
509d941a2f0SJunchao Zhang     else if (nPetscReal == 1) PackInit_RealType<PetscReal,1,1>(link); else if (nPetscReal%1 == 0)
510eee4e20aSJunchao Zhang    #endif
511d941a2f0SJunchao Zhang     PackInit_RealType<PetscReal,1,0>(link);
512874d28e3SJunchao Zhang   } else if (nPetscInt && sizeof(PetscInt) == sizeof(llint)) {
513d941a2f0SJunchao Zhang    #if !defined(PETSC_HAVE_DEVICE)
514874d28e3SJunchao Zhang     if      (nPetscInt == 8) PackInit_IntegerType<llint,8,1>(link); else if (nPetscInt%8 == 0) PackInit_IntegerType<llint,8,0>(link);
515874d28e3SJunchao Zhang     else if (nPetscInt == 4) PackInit_IntegerType<llint,4,1>(link); else if (nPetscInt%4 == 0) PackInit_IntegerType<llint,4,0>(link);
516874d28e3SJunchao Zhang     else if (nPetscInt == 2) PackInit_IntegerType<llint,2,1>(link); else if (nPetscInt%2 == 0) PackInit_IntegerType<llint,2,0>(link);
517d941a2f0SJunchao Zhang     else if (nPetscInt == 1) PackInit_IntegerType<llint,1,1>(link); else if (nPetscInt%1 == 0)
518eee4e20aSJunchao Zhang    #endif
519d941a2f0SJunchao Zhang     PackInit_IntegerType<llint,1,0>(link);
520914b7a73SJunchao Zhang   } else if (nInt) {
521d941a2f0SJunchao Zhang    #if !defined(PETSC_HAVE_DEVICE)
522914b7a73SJunchao Zhang     if      (nInt == 8) PackInit_IntegerType<int,8,1>(link); else if (nInt%8 == 0) PackInit_IntegerType<int,8,0>(link);
523914b7a73SJunchao Zhang     else if (nInt == 4) PackInit_IntegerType<int,4,1>(link); else if (nInt%4 == 0) PackInit_IntegerType<int,4,0>(link);
524914b7a73SJunchao Zhang     else if (nInt == 2) PackInit_IntegerType<int,2,1>(link); else if (nInt%2 == 0) PackInit_IntegerType<int,2,0>(link);
525d941a2f0SJunchao Zhang     else if (nInt == 1) PackInit_IntegerType<int,1,1>(link); else if (nInt%1 == 0)
526eee4e20aSJunchao Zhang    #endif
527d941a2f0SJunchao Zhang     PackInit_IntegerType<int,1,0>(link);
528914b7a73SJunchao Zhang   } else if (nSignedChar) {
529d941a2f0SJunchao Zhang    #if !defined(PETSC_HAVE_DEVICE)
530914b7a73SJunchao Zhang     if      (nSignedChar == 8) PackInit_IntegerType<char,8,1>(link); else if (nSignedChar%8 == 0) PackInit_IntegerType<char,8,0>(link);
531914b7a73SJunchao Zhang     else if (nSignedChar == 4) PackInit_IntegerType<char,4,1>(link); else if (nSignedChar%4 == 0) PackInit_IntegerType<char,4,0>(link);
532914b7a73SJunchao Zhang     else if (nSignedChar == 2) PackInit_IntegerType<char,2,1>(link); else if (nSignedChar%2 == 0) PackInit_IntegerType<char,2,0>(link);
533d941a2f0SJunchao Zhang     else if (nSignedChar == 1) PackInit_IntegerType<char,1,1>(link); else if (nSignedChar%1 == 0)
534eee4e20aSJunchao Zhang    #endif
535d941a2f0SJunchao Zhang     PackInit_IntegerType<char,1,0>(link);
536914b7a73SJunchao Zhang   }  else if (nUnsignedChar) {
537d941a2f0SJunchao Zhang    #if !defined(PETSC_HAVE_DEVICE)
538914b7a73SJunchao Zhang     if      (nUnsignedChar == 8) PackInit_IntegerType<unsigned char,8,1>(link); else if (nUnsignedChar%8 == 0) PackInit_IntegerType<unsigned char,8,0>(link);
539914b7a73SJunchao Zhang     else if (nUnsignedChar == 4) PackInit_IntegerType<unsigned char,4,1>(link); else if (nUnsignedChar%4 == 0) PackInit_IntegerType<unsigned char,4,0>(link);
540914b7a73SJunchao Zhang     else if (nUnsignedChar == 2) PackInit_IntegerType<unsigned char,2,1>(link); else if (nUnsignedChar%2 == 0) PackInit_IntegerType<unsigned char,2,0>(link);
541d941a2f0SJunchao Zhang     else if (nUnsignedChar == 1) PackInit_IntegerType<unsigned char,1,1>(link); else if (nUnsignedChar%1 == 0)
542eee4e20aSJunchao Zhang    #endif
543d941a2f0SJunchao Zhang     PackInit_IntegerType<unsigned char,1,0>(link);
544914b7a73SJunchao Zhang #if defined(PETSC_HAVE_COMPLEX)
545914b7a73SJunchao Zhang   } else if (nPetscComplex) {
546d941a2f0SJunchao Zhang    #if !defined(PETSC_HAVE_DEVICE)
547914b7a73SJunchao Zhang     if      (nPetscComplex == 8) PackInit_ComplexType<Kokkos::complex<PetscReal>,8,1>(link); else if (nPetscComplex%8 == 0) PackInit_ComplexType<Kokkos::complex<PetscReal>,8,0>(link);
548914b7a73SJunchao Zhang     else if (nPetscComplex == 4) PackInit_ComplexType<Kokkos::complex<PetscReal>,4,1>(link); else if (nPetscComplex%4 == 0) PackInit_ComplexType<Kokkos::complex<PetscReal>,4,0>(link);
549914b7a73SJunchao Zhang     else if (nPetscComplex == 2) PackInit_ComplexType<Kokkos::complex<PetscReal>,2,1>(link); else if (nPetscComplex%2 == 0) PackInit_ComplexType<Kokkos::complex<PetscReal>,2,0>(link);
550d941a2f0SJunchao Zhang     else if (nPetscComplex == 1) PackInit_ComplexType<Kokkos::complex<PetscReal>,1,1>(link); else if (nPetscComplex%1 == 0)
551eee4e20aSJunchao Zhang    #endif
552d941a2f0SJunchao Zhang     PackInit_ComplexType<Kokkos::complex<PetscReal>,1,0>(link);
553914b7a73SJunchao Zhang #endif
554914b7a73SJunchao Zhang   } else {
555914b7a73SJunchao Zhang     MPI_Aint lb,nbyte;
5569566063dSJacob Faibussowitsch     PetscCallMPI(MPI_Type_get_extent(unit,&lb,&nbyte));
557*08401ef6SPierre Jolivet     PetscCheck(lb == 0,PETSC_COMM_SELF,PETSC_ERR_SUP,"Datatype with nonzero lower bound %ld",(long)lb);
558914b7a73SJunchao Zhang     if (nbyte % sizeof(int)) { /* If the type size is not multiple of int */
559d941a2f0SJunchao Zhang      #if !defined(PETSC_HAVE_DEVICE)
560914b7a73SJunchao Zhang       if      (nbyte == 4) PackInit_DumbType<char,4,1>(link); else if (nbyte%4 == 0) PackInit_DumbType<char,4,0>(link);
561914b7a73SJunchao Zhang       else if (nbyte == 2) PackInit_DumbType<char,2,1>(link); else if (nbyte%2 == 0) PackInit_DumbType<char,2,0>(link);
562d941a2f0SJunchao Zhang       else if (nbyte == 1) PackInit_DumbType<char,1,1>(link); else if (nbyte%1 == 0)
563eee4e20aSJunchao Zhang      #endif
564d941a2f0SJunchao Zhang       PackInit_DumbType<char,1,0>(link);
565914b7a73SJunchao Zhang     } else {
566914b7a73SJunchao Zhang       nInt = nbyte / sizeof(int);
567d941a2f0SJunchao Zhang      #if !defined(PETSC_HAVE_DEVICE)
568914b7a73SJunchao Zhang       if      (nInt == 8) PackInit_DumbType<int,8,1>(link); else if (nInt%8 == 0) PackInit_DumbType<int,8,0>(link);
569914b7a73SJunchao Zhang       else if (nInt == 4) PackInit_DumbType<int,4,1>(link); else if (nInt%4 == 0) PackInit_DumbType<int,4,0>(link);
570914b7a73SJunchao Zhang       else if (nInt == 2) PackInit_DumbType<int,2,1>(link); else if (nInt%2 == 0) PackInit_DumbType<int,2,0>(link);
571d941a2f0SJunchao Zhang       else if (nInt == 1) PackInit_DumbType<int,1,1>(link); else if (nInt%1 == 0)
572eee4e20aSJunchao Zhang      #endif
573d941a2f0SJunchao Zhang       PackInit_DumbType<int,1,0>(link);
574914b7a73SJunchao Zhang     }
575914b7a73SJunchao Zhang   }
576914b7a73SJunchao Zhang 
57771438e86SJunchao Zhang   link->SyncDevice   = PetscSFLinkSyncDevice_Kokkos;
57871438e86SJunchao Zhang   link->SyncStream   = PetscSFLinkSyncStream_Kokkos;
57920c24465SJunchao Zhang   link->Memcpy       = PetscSFLinkMemcpy_Kokkos;
58071438e86SJunchao Zhang   link->Destroy      = PetscSFLinkDestroy_Kokkos;
581914b7a73SJunchao Zhang   link->deviceinited = PETSC_TRUE;
582914b7a73SJunchao Zhang   PetscFunctionReturn(0);
583914b7a73SJunchao Zhang }
584