xref: /petsc/src/vec/is/sf/impls/basic/kokkos/sfkok.kokkos.cxx (revision eee4e20aa5876b4ea4bd0a86597027cabe16869d)
1914b7a73SJunchao Zhang #include <../src/vec/is/sf/impls/basic/sfpack.h>
2914b7a73SJunchao Zhang 
3914b7a73SJunchao Zhang #include <Kokkos_Core.hpp>
4914b7a73SJunchao Zhang 
5914b7a73SJunchao Zhang using DeviceExecutionSpace = Kokkos::DefaultExecutionSpace;
6914b7a73SJunchao Zhang using DeviceMemorySpace    = typename DeviceExecutionSpace::memory_space;
7914b7a73SJunchao Zhang using HostMemorySpace      = Kokkos::HostSpace;
8914b7a73SJunchao Zhang 
9914b7a73SJunchao Zhang typedef Kokkos::View<char*,DeviceMemorySpace>       deviceBuffer_t;
10914b7a73SJunchao Zhang typedef Kokkos::View<char*,HostMemorySpace>         HostBuffer_t;
11914b7a73SJunchao Zhang 
12914b7a73SJunchao Zhang typedef Kokkos::View<const char*,DeviceMemorySpace> deviceConstBuffer_t;
13914b7a73SJunchao Zhang typedef Kokkos::View<const char*,HostMemorySpace>   HostConstBuffer_t;
14914b7a73SJunchao Zhang 
15914b7a73SJunchao Zhang /*====================================================================================*/
16914b7a73SJunchao Zhang /*                             Regular operations                           */
17914b7a73SJunchao Zhang /*====================================================================================*/
18914b7a73SJunchao Zhang template<typename Type> struct Insert{KOKKOS_INLINE_FUNCTION Type operator()(Type& x,Type y) const {Type old = x; x  = y;             return old;}};
19914b7a73SJunchao Zhang template<typename Type> struct Add   {KOKKOS_INLINE_FUNCTION Type operator()(Type& x,Type y) const {Type old = x; x += y;             return old;}};
20914b7a73SJunchao Zhang template<typename Type> struct Mult  {KOKKOS_INLINE_FUNCTION Type operator()(Type& x,Type y) const {Type old = x; x *= y;             return old;}};
21914b7a73SJunchao Zhang template<typename Type> struct Min   {KOKKOS_INLINE_FUNCTION Type operator()(Type& x,Type y) const {Type old = x; x  = PetscMin(x,y); return old;}};
22914b7a73SJunchao Zhang template<typename Type> struct Max   {KOKKOS_INLINE_FUNCTION Type operator()(Type& x,Type y) const {Type old = x; x  = PetscMax(x,y); return old;}};
23914b7a73SJunchao Zhang template<typename Type> struct LAND  {KOKKOS_INLINE_FUNCTION Type operator()(Type& x,Type y) const {Type old = x; x  = x && y;        return old;}};
24914b7a73SJunchao Zhang template<typename Type> struct LOR   {KOKKOS_INLINE_FUNCTION Type operator()(Type& x,Type y) const {Type old = x; x  = x || y;        return old;}};
25914b7a73SJunchao Zhang template<typename Type> struct LXOR  {KOKKOS_INLINE_FUNCTION Type operator()(Type& x,Type y) const {Type old = x; x  = !x != !y;      return old;}};
26914b7a73SJunchao Zhang template<typename Type> struct BAND  {KOKKOS_INLINE_FUNCTION Type operator()(Type& x,Type y) const {Type old = x; x  = x & y;         return old;}};
27914b7a73SJunchao Zhang template<typename Type> struct BOR   {KOKKOS_INLINE_FUNCTION Type operator()(Type& x,Type y) const {Type old = x; x  = x | y;         return old;}};
28914b7a73SJunchao Zhang template<typename Type> struct BXOR  {KOKKOS_INLINE_FUNCTION Type operator()(Type& x,Type y) const {Type old = x; x  = x ^ y;         return old;}};
29914b7a73SJunchao Zhang template<typename PairType> struct Minloc {
30914b7a73SJunchao Zhang   KOKKOS_INLINE_FUNCTION PairType operator()(PairType& x,PairType y) const {
31914b7a73SJunchao Zhang     PairType old = x;
32914b7a73SJunchao Zhang     if (y.first < x.first) x = y;
33914b7a73SJunchao Zhang     else if (y.first == x.first) x.second = PetscMin(x.second,y.second);
34914b7a73SJunchao Zhang     return old;
35914b7a73SJunchao Zhang   }
36914b7a73SJunchao Zhang };
37914b7a73SJunchao Zhang template<typename PairType> struct Maxloc {
38914b7a73SJunchao Zhang   KOKKOS_INLINE_FUNCTION PairType operator()(PairType& x,PairType y) const {
39914b7a73SJunchao Zhang     PairType old = x;
40914b7a73SJunchao Zhang     if (y.first > x.first) x = y;
41914b7a73SJunchao Zhang     else if (y.first == x.first) x.second = PetscMin(x.second,y.second); /* See MPI MAXLOC */
42914b7a73SJunchao Zhang     return old;
43914b7a73SJunchao Zhang   }
44914b7a73SJunchao Zhang };
45914b7a73SJunchao Zhang 
46914b7a73SJunchao Zhang /*====================================================================================*/
47914b7a73SJunchao Zhang /*                             Atomic operations                            */
48914b7a73SJunchao Zhang /*====================================================================================*/
49914b7a73SJunchao Zhang template<typename Type> struct AtomicInsert  {KOKKOS_INLINE_FUNCTION void operator()(Type& x,Type y) const {Kokkos::atomic_assign(&x,y);}};
50914b7a73SJunchao Zhang template<typename Type> struct AtomicAdd     {KOKKOS_INLINE_FUNCTION void operator()(Type& x,Type y) const {Kokkos::atomic_add(&x,y);}};
51914b7a73SJunchao Zhang template<typename Type> struct AtomicBAND    {KOKKOS_INLINE_FUNCTION void operator()(Type& x,Type y) const {Kokkos::atomic_and(&x,y);}};
52914b7a73SJunchao Zhang template<typename Type> struct AtomicBOR     {KOKKOS_INLINE_FUNCTION void operator()(Type& x,Type y) const {Kokkos::atomic_or (&x,y);}};
53914b7a73SJunchao Zhang template<typename Type> struct AtomicBXOR    {KOKKOS_INLINE_FUNCTION void operator()(Type& x,Type y) const {Kokkos::atomic_fetch_xor(&x,y);}};
54914b7a73SJunchao Zhang template<typename Type> struct AtomicLAND    {KOKKOS_INLINE_FUNCTION void operator()(Type& x,Type y) const {const Type zero=0,one=~0; Kokkos::atomic_and(&x,y?one:zero);}};
55914b7a73SJunchao Zhang template<typename Type> struct AtomicLOR     {KOKKOS_INLINE_FUNCTION void operator()(Type& x,Type y) const {const Type zero=0,one=1;  Kokkos::atomic_or (&x,y?one:zero);}};
56914b7a73SJunchao Zhang template<typename Type> struct AtomicMult    {KOKKOS_INLINE_FUNCTION void operator()(Type& x,Type y) const {Kokkos::atomic_fetch_mul(&x,y);}};
57914b7a73SJunchao Zhang template<typename Type> struct AtomicMin     {KOKKOS_INLINE_FUNCTION void operator()(Type& x,Type y) const {Kokkos::atomic_fetch_min(&x,y);}};
58914b7a73SJunchao Zhang template<typename Type> struct AtomicMax     {KOKKOS_INLINE_FUNCTION void operator()(Type& x,Type y) const {Kokkos::atomic_fetch_max(&x,y);}};
59914b7a73SJunchao Zhang /* TODO: struct AtomicLXOR  */
60914b7a73SJunchao Zhang template<typename Type> struct AtomicFetchAdd{KOKKOS_INLINE_FUNCTION Type operator()(Type& x,Type y) const {return Kokkos::atomic_fetch_add(&x,y);}};
61914b7a73SJunchao Zhang 
62914b7a73SJunchao Zhang /* Map a thread id to an index in root/leaf space through a series of 3D subdomains. See PetscSFPackOpt. */
63914b7a73SJunchao Zhang static KOKKOS_INLINE_FUNCTION PetscInt MapTidToIndex(const PetscInt *opt,PetscInt tid)
64914b7a73SJunchao Zhang {
65914b7a73SJunchao Zhang   PetscInt        i,j,k,m,n,r;
66914b7a73SJunchao Zhang   const PetscInt  *offset,*start,*dx,*dy,*X,*Y;
67914b7a73SJunchao Zhang 
68914b7a73SJunchao Zhang   n      = opt[0];
69914b7a73SJunchao Zhang   offset = opt + 1;
70914b7a73SJunchao Zhang   start  = opt + n + 2;
71914b7a73SJunchao Zhang   dx     = opt + 2*n + 2;
72914b7a73SJunchao Zhang   dy     = opt + 3*n + 2;
73914b7a73SJunchao Zhang   X      = opt + 5*n + 2;
74914b7a73SJunchao Zhang   Y      = opt + 6*n + 2;
75914b7a73SJunchao Zhang   for (r=0; r<n; r++) {if (tid < offset[r+1]) break;}
76914b7a73SJunchao Zhang   m = (tid - offset[r]);
77914b7a73SJunchao Zhang   k = m/(dx[r]*dy[r]);
78914b7a73SJunchao Zhang   j = (m - k*dx[r]*dy[r])/dx[r];
79914b7a73SJunchao Zhang   i = m - k*dx[r]*dy[r] - j*dx[r];
80914b7a73SJunchao Zhang 
81914b7a73SJunchao Zhang   return (start[r] + k*X[r]*Y[r] + j*X[r] + i);
82914b7a73SJunchao Zhang }
83914b7a73SJunchao Zhang 
84914b7a73SJunchao Zhang /*====================================================================================*/
85914b7a73SJunchao Zhang /*  Wrappers for Pack/Unpack/Scatter kernels. Function pointers are stored in 'link'         */
86914b7a73SJunchao Zhang /*====================================================================================*/
87914b7a73SJunchao Zhang 
88914b7a73SJunchao Zhang /* Suppose user calls PetscSFReduce(sf,unit,...) and <unit> is an MPI data type made of 16 PetscReals, then
89914b7a73SJunchao Zhang    <Type> is PetscReal, which is the primitive type we operate on.
90914b7a73SJunchao Zhang    <bs>   is 16, which says <unit> contains 16 primitive types.
91914b7a73SJunchao Zhang    <BS>   is 8, which is the maximal SIMD width we will try to vectorize operations on <unit>.
92914b7a73SJunchao Zhang    <EQ>   is 0, which is (bs == BS ? 1 : 0)
93914b7a73SJunchao Zhang 
94914b7a73SJunchao Zhang   If instead, <unit> has 8 PetscReals, then bs=8, BS=8, EQ=1, rendering MBS below to a compile time constant.
95914b7a73SJunchao Zhang   For the common case in VecScatter, bs=1, BS=1, EQ=1, MBS=1, the inner for-loops below will be totally unrolled.
96914b7a73SJunchao Zhang */
97914b7a73SJunchao Zhang template<typename Type,PetscInt BS,PetscInt EQ>
98914b7a73SJunchao Zhang static PetscErrorCode Pack(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,const void *data_,void *buf_)
99914b7a73SJunchao Zhang {
100914b7a73SJunchao Zhang   const PetscInt          *iopt = opt ? opt->array : NULL;
101914b7a73SJunchao Zhang   const PetscInt          M = EQ ? 1 : link->bs/BS, MBS=M*BS; /* If EQ, then MBS will be a compile-time const */
102914b7a73SJunchao Zhang   const Type              *data = static_cast<const Type*>(data_);
103914b7a73SJunchao Zhang   Type                    *buf = static_cast<Type*>(buf_);
104f4af43b4SJunchao Zhang   DeviceExecutionSpace    exec;
105914b7a73SJunchao Zhang 
106914b7a73SJunchao Zhang   PetscFunctionBegin;
107914b7a73SJunchao Zhang   Kokkos::parallel_for(Kokkos::RangePolicy<DeviceExecutionSpace>(exec,0,count),KOKKOS_LAMBDA(PetscInt tid) {
108914b7a73SJunchao Zhang     /* iopt != NULL ==> idx == NULL, i.e., the indices have patterns but not contiguous;
109914b7a73SJunchao Zhang        iopt == NULL && idx == NULL ==> the indices are contiguous;
110914b7a73SJunchao Zhang      */
111914b7a73SJunchao Zhang     PetscInt t = (iopt? MapTidToIndex(iopt,tid) : (idx? idx[tid] : start+tid))*MBS;
112914b7a73SJunchao Zhang     PetscInt s = tid*MBS;
113914b7a73SJunchao Zhang     for (int i=0; i<MBS; i++) buf[s+i] = data[t+i];
114914b7a73SJunchao Zhang   });
115914b7a73SJunchao Zhang   PetscFunctionReturn(0);
116914b7a73SJunchao Zhang }
117914b7a73SJunchao Zhang 
118914b7a73SJunchao Zhang template<typename Type,class Op,PetscInt BS,PetscInt EQ>
119914b7a73SJunchao Zhang static PetscErrorCode UnpackAndOp(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,void *data_,const void *buf_)
120914b7a73SJunchao Zhang {
121914b7a73SJunchao Zhang   Op                      op;
122914b7a73SJunchao Zhang   const PetscInt          *iopt = opt ? opt->array : NULL;
123914b7a73SJunchao Zhang   const PetscInt          M = EQ ? 1 : link->bs/BS, MBS=M*BS;
124914b7a73SJunchao Zhang   Type                    *data = static_cast<Type*>(data_);
125914b7a73SJunchao Zhang   const Type              *buf = static_cast<const Type*>(buf_);
126f4af43b4SJunchao Zhang   DeviceExecutionSpace    exec;
127914b7a73SJunchao Zhang 
128914b7a73SJunchao Zhang   PetscFunctionBegin;
129914b7a73SJunchao Zhang   Kokkos::parallel_for(Kokkos::RangePolicy<DeviceExecutionSpace>(exec,0,count),KOKKOS_LAMBDA(PetscInt tid) {
130914b7a73SJunchao Zhang     PetscInt t = (iopt? MapTidToIndex(iopt,tid) : (idx? idx[tid] : start+tid))*MBS;
131914b7a73SJunchao Zhang     PetscInt s = tid*MBS;
132914b7a73SJunchao Zhang     for (int i=0; i<MBS; i++) op(data[t+i],buf[s+i]);
133914b7a73SJunchao Zhang   });
134914b7a73SJunchao Zhang   PetscFunctionReturn(0);
135914b7a73SJunchao Zhang }
136914b7a73SJunchao Zhang 
137914b7a73SJunchao Zhang template<typename Type,class Op,PetscInt BS,PetscInt EQ>
138914b7a73SJunchao Zhang static PetscErrorCode FetchAndOp(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,void *data,void *buf)
139914b7a73SJunchao Zhang {
140914b7a73SJunchao Zhang   Op                      op;
141914b7a73SJunchao Zhang   const PetscInt          *ropt = opt ? opt->array : NULL;
142914b7a73SJunchao Zhang   const PetscInt          M = EQ ? 1 : link->bs/BS, MBS=M*BS;
143914b7a73SJunchao Zhang   Type                    *rootdata = static_cast<Type*>(data),*leafbuf=static_cast<Type*>(buf);
144f4af43b4SJunchao Zhang   DeviceExecutionSpace    exec;
145914b7a73SJunchao Zhang 
146914b7a73SJunchao Zhang   PetscFunctionBegin;
147914b7a73SJunchao Zhang   Kokkos::parallel_for(Kokkos::RangePolicy<DeviceExecutionSpace>(exec,0,count),KOKKOS_LAMBDA(PetscInt tid) {
148914b7a73SJunchao Zhang     PetscInt r = (ropt? MapTidToIndex(ropt,tid) : (idx? idx[tid] : start+tid))*MBS;
149914b7a73SJunchao Zhang     PetscInt l = tid*MBS;
150914b7a73SJunchao Zhang     for (int i=0; i<MBS; i++) leafbuf[l+i] = op(rootdata[r+i],leafbuf[l+i]);
151914b7a73SJunchao Zhang   });
152914b7a73SJunchao Zhang   PetscFunctionReturn(0);
153914b7a73SJunchao Zhang }
154914b7a73SJunchao Zhang 
155914b7a73SJunchao Zhang template<typename Type,class Op,PetscInt BS,PetscInt EQ>
156914b7a73SJunchao Zhang static PetscErrorCode ScatterAndOp(PetscSFLink link,PetscInt count,PetscInt srcStart,PetscSFPackOpt srcOpt,const PetscInt *srcIdx,const void *src_,PetscInt dstStart,PetscSFPackOpt dstOpt,const PetscInt *dstIdx,void *dst_)
157914b7a73SJunchao Zhang {
158914b7a73SJunchao Zhang   PetscInt                srcx=0,srcy=0,srcX=0,srcY=0,dstx=0,dsty=0,dstX=0,dstY=0;
159914b7a73SJunchao Zhang   const PetscInt          M = (EQ) ? 1 : link->bs/BS, MBS=M*BS;
160914b7a73SJunchao Zhang   const Type              *src = static_cast<const Type*>(src_);
161914b7a73SJunchao Zhang   Type                    *dst = static_cast<Type*>(dst_);
162f4af43b4SJunchao Zhang   DeviceExecutionSpace    exec;
163914b7a73SJunchao Zhang 
164914b7a73SJunchao Zhang   PetscFunctionBegin;
165914b7a73SJunchao Zhang   /* The 3D shape of source subdomain may be different than that of the destination, which makes it difficult to use CUDA 3D grid and block */
166914b7a73SJunchao Zhang   if (srcOpt)       {srcx = srcOpt->dx[0]; srcy = srcOpt->dy[0]; srcX = srcOpt->X[0]; srcY = srcOpt->Y[0]; srcStart = srcOpt->start[0]; srcIdx = NULL;}
167914b7a73SJunchao Zhang   else if (!srcIdx) {srcx = srcX = count; srcy = srcY = 1;}
168914b7a73SJunchao Zhang 
169914b7a73SJunchao Zhang   if (dstOpt)       {dstx = dstOpt->dx[0]; dsty = dstOpt->dy[0]; dstX = dstOpt->X[0]; dstY = dstOpt->Y[0]; dstStart = dstOpt->start[0]; dstIdx = NULL;}
170914b7a73SJunchao Zhang   else if (!dstIdx) {dstx = dstX = count; dsty = dstY = 1;}
171914b7a73SJunchao Zhang 
172914b7a73SJunchao Zhang   Kokkos::parallel_for(Kokkos::RangePolicy<DeviceExecutionSpace>(exec,0,count),KOKKOS_LAMBDA(PetscInt tid) {
173914b7a73SJunchao Zhang     PetscInt i,j,k,s,t;
174914b7a73SJunchao Zhang     Op       op;
175914b7a73SJunchao Zhang     if (!srcIdx) { /* src is in 3D */
176914b7a73SJunchao Zhang       k = tid/(srcx*srcy);
177914b7a73SJunchao Zhang       j = (tid - k*srcx*srcy)/srcx;
178914b7a73SJunchao Zhang       i = tid - k*srcx*srcy - j*srcx;
179914b7a73SJunchao Zhang       s = srcStart + k*srcX*srcY + j*srcX + i;
180914b7a73SJunchao Zhang     } else { /* src is contiguous */
181914b7a73SJunchao Zhang       s = srcIdx[tid];
182914b7a73SJunchao Zhang     }
183914b7a73SJunchao Zhang 
184914b7a73SJunchao Zhang     if (!dstIdx) { /* 3D */
185914b7a73SJunchao Zhang       k = tid/(dstx*dsty);
186914b7a73SJunchao Zhang       j = (tid - k*dstx*dsty)/dstx;
187914b7a73SJunchao Zhang       i = tid - k*dstx*dsty - j*dstx;
188914b7a73SJunchao Zhang       t = dstStart + k*dstX*dstY + j*dstX + i;
189914b7a73SJunchao Zhang     } else { /* contiguous */
190914b7a73SJunchao Zhang       t = dstIdx[tid];
191914b7a73SJunchao Zhang     }
192914b7a73SJunchao Zhang 
193914b7a73SJunchao Zhang     s *= MBS;
194914b7a73SJunchao Zhang     t *= MBS;
195914b7a73SJunchao Zhang     for (i=0; i<MBS; i++) op(dst[t+i],src[s+i]);
196914b7a73SJunchao Zhang   });
197914b7a73SJunchao Zhang   PetscFunctionReturn(0);
198914b7a73SJunchao Zhang }
199914b7a73SJunchao Zhang 
200914b7a73SJunchao Zhang /* Specialization for Insert since we may use memcpy */
201914b7a73SJunchao Zhang template<typename Type,PetscInt BS,PetscInt EQ>
202914b7a73SJunchao Zhang static PetscErrorCode ScatterAndInsert(PetscSFLink link,PetscInt count,PetscInt srcStart,PetscSFPackOpt srcOpt,const PetscInt *srcIdx,const void *src_,PetscInt dstStart,PetscSFPackOpt dstOpt,const PetscInt *dstIdx,void *dst_)
203914b7a73SJunchao Zhang {
204914b7a73SJunchao Zhang   PetscErrorCode          ierr;
205914b7a73SJunchao Zhang   const Type              *src = static_cast<const Type*>(src_);
206914b7a73SJunchao Zhang   Type                    *dst = static_cast<Type*>(dst_);
207f4af43b4SJunchao Zhang   DeviceExecutionSpace    exec;
208914b7a73SJunchao Zhang 
209914b7a73SJunchao Zhang   PetscFunctionBegin;
210914b7a73SJunchao Zhang   if (!count) PetscFunctionReturn(0);
211914b7a73SJunchao Zhang   /*src and dst are contiguous */
212914b7a73SJunchao Zhang   if ((!srcOpt && !srcIdx) && (!dstOpt && !dstIdx) && src != dst) {
213914b7a73SJunchao Zhang     size_t sz = count*link->unitbytes;
214914b7a73SJunchao Zhang     deviceBuffer_t      dbuf(reinterpret_cast<char*>(dst+dstStart*link->bs),sz);
215914b7a73SJunchao Zhang     deviceConstBuffer_t sbuf(reinterpret_cast<const char*>(src+srcStart*link->bs),sz);
216914b7a73SJunchao Zhang     Kokkos::deep_copy(exec,dbuf,sbuf);
217914b7a73SJunchao Zhang   } else {
218914b7a73SJunchao Zhang     ierr = ScatterAndOp<Type,Insert<Type>,BS,EQ>(link,count,srcStart,srcOpt,srcIdx,src,dstStart,dstOpt,dstIdx,dst);CHKERRQ(ierr);
219914b7a73SJunchao Zhang   }
220914b7a73SJunchao Zhang   PetscFunctionReturn(0);
221914b7a73SJunchao Zhang }
222914b7a73SJunchao Zhang 
223914b7a73SJunchao Zhang template<typename Type,class Op,PetscInt BS,PetscInt EQ>
224914b7a73SJunchao Zhang static PetscErrorCode FetchAndOpLocal(PetscSFLink link,PetscInt count,PetscInt rootstart,PetscSFPackOpt rootopt,const PetscInt *rootidx,void *rootdata_,PetscInt leafstart,PetscSFPackOpt leafopt,const PetscInt *leafidx,const void *leafdata_,void *leafupdate_)
225914b7a73SJunchao Zhang {
226914b7a73SJunchao Zhang   Op                      op;
227914b7a73SJunchao Zhang   const PetscInt          M = (EQ) ? 1 : link->bs/BS, MBS = M*BS;
228914b7a73SJunchao Zhang   const PetscInt          *ropt = rootopt ? rootopt->array : NULL;
229914b7a73SJunchao Zhang   const PetscInt          *lopt = leafopt ? leafopt->array : NULL;
230914b7a73SJunchao Zhang   Type                    *rootdata = static_cast<Type*>(rootdata_),*leafupdate = static_cast<Type*>(leafupdate_);
231914b7a73SJunchao Zhang   const Type              *leafdata = static_cast<const Type*>(leafdata_);
232f4af43b4SJunchao Zhang   DeviceExecutionSpace    exec;
233914b7a73SJunchao Zhang 
234914b7a73SJunchao Zhang   PetscFunctionBegin;
235914b7a73SJunchao Zhang   Kokkos::parallel_for(Kokkos::RangePolicy<DeviceExecutionSpace>(exec,0,count),KOKKOS_LAMBDA(PetscInt tid) {
236914b7a73SJunchao Zhang     PetscInt r = (ropt? MapTidToIndex(ropt,tid) : (rootidx? rootidx[tid] : rootstart+tid))*MBS;
237914b7a73SJunchao Zhang     PetscInt l = (lopt? MapTidToIndex(lopt,tid) : (leafidx? leafidx[tid] : leafstart+tid))*MBS;
238914b7a73SJunchao Zhang     for (int i=0; i<MBS; i++) leafupdate[l+i] = op(rootdata[r+i],leafdata[l+i]);
239914b7a73SJunchao Zhang   });
240914b7a73SJunchao Zhang   PetscFunctionReturn(0);
241914b7a73SJunchao Zhang }
242914b7a73SJunchao Zhang 
243914b7a73SJunchao Zhang /*====================================================================================*/
244914b7a73SJunchao Zhang /*  Init various types and instantiate pack/unpack function pointers                  */
245914b7a73SJunchao Zhang /*====================================================================================*/
246914b7a73SJunchao Zhang template<typename Type,PetscInt BS,PetscInt EQ>
247914b7a73SJunchao Zhang static void PackInit_RealType(PetscSFLink link)
248914b7a73SJunchao Zhang {
249914b7a73SJunchao Zhang   /* Pack/unpack for remote communication */
250914b7a73SJunchao Zhang   link->d_Pack              = Pack<Type,BS,EQ>;
251914b7a73SJunchao Zhang   link->d_UnpackAndInsert   = UnpackAndOp<Type,Insert<Type>,BS,EQ>;
252914b7a73SJunchao Zhang   link->d_UnpackAndAdd      = UnpackAndOp<Type,Add<Type>   ,BS,EQ>;
253914b7a73SJunchao Zhang   link->d_UnpackAndMult     = UnpackAndOp<Type,Mult<Type>  ,BS,EQ>;
254914b7a73SJunchao Zhang   link->d_UnpackAndMin      = UnpackAndOp<Type,Min<Type>   ,BS,EQ>;
255914b7a73SJunchao Zhang   link->d_UnpackAndMax      = UnpackAndOp<Type,Max<Type>   ,BS,EQ>;
256914b7a73SJunchao Zhang   link->d_FetchAndAdd       = FetchAndOp <Type,Add<Type>   ,BS,EQ>;
257914b7a73SJunchao Zhang   /* Scatter for local communication */
258914b7a73SJunchao Zhang   link->d_ScatterAndInsert  = ScatterAndInsert<Type,BS,EQ>; /* Has special optimizations */
259914b7a73SJunchao Zhang   link->d_ScatterAndAdd     = ScatterAndOp<Type,Add<Type>    ,BS,EQ>;
260914b7a73SJunchao Zhang   link->d_ScatterAndMult    = ScatterAndOp<Type,Mult<Type>   ,BS,EQ>;
261914b7a73SJunchao Zhang   link->d_ScatterAndMin     = ScatterAndOp<Type,Min<Type>    ,BS,EQ>;
262914b7a73SJunchao Zhang   link->d_ScatterAndMax     = ScatterAndOp<Type,Max<Type>    ,BS,EQ>;
263914b7a73SJunchao Zhang   link->d_FetchAndAddLocal  = FetchAndOpLocal<Type,Add <Type>,BS,EQ>;
264914b7a73SJunchao Zhang   /* Atomic versions when there are data-race possibilities */
265914b7a73SJunchao Zhang   link->da_UnpackAndInsert  = UnpackAndOp<Type,AtomicInsert<Type>  ,BS,EQ>;
266914b7a73SJunchao Zhang   link->da_UnpackAndAdd     = UnpackAndOp<Type,AtomicAdd<Type>     ,BS,EQ>;
267914b7a73SJunchao Zhang   link->da_UnpackAndMult    = UnpackAndOp<Type,AtomicMult<Type>    ,BS,EQ>;
268914b7a73SJunchao Zhang   link->da_UnpackAndMin     = UnpackAndOp<Type,AtomicMin<Type>     ,BS,EQ>;
269914b7a73SJunchao Zhang   link->da_UnpackAndMax     = UnpackAndOp<Type,AtomicMax<Type>     ,BS,EQ>;
270914b7a73SJunchao Zhang   link->da_FetchAndAdd      = FetchAndOp <Type,AtomicFetchAdd<Type>,BS,EQ>;
271914b7a73SJunchao Zhang 
272914b7a73SJunchao Zhang   link->da_ScatterAndInsert = ScatterAndOp<Type,AtomicInsert<Type>,BS,EQ>;
273914b7a73SJunchao Zhang   link->da_ScatterAndAdd    = ScatterAndOp<Type,AtomicAdd<Type>   ,BS,EQ>;
274914b7a73SJunchao Zhang   link->da_ScatterAndMult   = ScatterAndOp<Type,AtomicMult<Type>  ,BS,EQ>;
275914b7a73SJunchao Zhang   link->da_ScatterAndMin    = ScatterAndOp<Type,AtomicMin<Type>   ,BS,EQ>;
276914b7a73SJunchao Zhang   link->da_ScatterAndMax    = ScatterAndOp<Type,AtomicMax<Type>   ,BS,EQ>;
277914b7a73SJunchao Zhang   link->da_FetchAndAddLocal = FetchAndOpLocal<Type,AtomicFetchAdd<Type>,BS,EQ>;
278914b7a73SJunchao Zhang }
279914b7a73SJunchao Zhang 
280914b7a73SJunchao Zhang template<typename Type,PetscInt BS,PetscInt EQ>
281914b7a73SJunchao Zhang static void PackInit_IntegerType(PetscSFLink link)
282914b7a73SJunchao Zhang {
283914b7a73SJunchao Zhang   link->d_Pack              = Pack<Type,BS,EQ>;
284914b7a73SJunchao Zhang   link->d_UnpackAndInsert   = UnpackAndOp<Type,Insert<Type> ,BS,EQ>;
285914b7a73SJunchao Zhang   link->d_UnpackAndAdd      = UnpackAndOp<Type,Add<Type>    ,BS,EQ>;
286914b7a73SJunchao Zhang   link->d_UnpackAndMult     = UnpackAndOp<Type,Mult<Type>   ,BS,EQ>;
287914b7a73SJunchao Zhang   link->d_UnpackAndMin      = UnpackAndOp<Type,Min<Type>    ,BS,EQ>;
288914b7a73SJunchao Zhang   link->d_UnpackAndMax      = UnpackAndOp<Type,Max<Type>    ,BS,EQ>;
289914b7a73SJunchao Zhang   link->d_UnpackAndLAND     = UnpackAndOp<Type,LAND<Type>   ,BS,EQ>;
290914b7a73SJunchao Zhang   link->d_UnpackAndLOR      = UnpackAndOp<Type,LOR<Type>    ,BS,EQ>;
291914b7a73SJunchao Zhang   link->d_UnpackAndLXOR     = UnpackAndOp<Type,LXOR<Type>   ,BS,EQ>;
292914b7a73SJunchao Zhang   link->d_UnpackAndBAND     = UnpackAndOp<Type,BAND<Type>   ,BS,EQ>;
293914b7a73SJunchao Zhang   link->d_UnpackAndBOR      = UnpackAndOp<Type,BOR<Type>    ,BS,EQ>;
294914b7a73SJunchao Zhang   link->d_UnpackAndBXOR     = UnpackAndOp<Type,BXOR<Type>   ,BS,EQ>;
295914b7a73SJunchao Zhang   link->d_FetchAndAdd       = FetchAndOp <Type,Add<Type>    ,BS,EQ>;
296914b7a73SJunchao Zhang 
297914b7a73SJunchao Zhang   link->d_ScatterAndInsert  = ScatterAndInsert<Type,BS,EQ>;
298914b7a73SJunchao Zhang   link->d_ScatterAndAdd     = ScatterAndOp<Type,Add<Type>   ,BS,EQ>;
299914b7a73SJunchao Zhang   link->d_ScatterAndMult    = ScatterAndOp<Type,Mult<Type>  ,BS,EQ>;
300914b7a73SJunchao Zhang   link->d_ScatterAndMin     = ScatterAndOp<Type,Min<Type>   ,BS,EQ>;
301914b7a73SJunchao Zhang   link->d_ScatterAndMax     = ScatterAndOp<Type,Max<Type>   ,BS,EQ>;
302914b7a73SJunchao Zhang   link->d_ScatterAndLAND    = ScatterAndOp<Type,LAND<Type>  ,BS,EQ>;
303914b7a73SJunchao Zhang   link->d_ScatterAndLOR     = ScatterAndOp<Type,LOR<Type>   ,BS,EQ>;
304914b7a73SJunchao Zhang   link->d_ScatterAndLXOR    = ScatterAndOp<Type,LXOR<Type>  ,BS,EQ>;
305914b7a73SJunchao Zhang   link->d_ScatterAndBAND    = ScatterAndOp<Type,BAND<Type>  ,BS,EQ>;
306914b7a73SJunchao Zhang   link->d_ScatterAndBOR     = ScatterAndOp<Type,BOR<Type>   ,BS,EQ>;
307914b7a73SJunchao Zhang   link->d_ScatterAndBXOR    = ScatterAndOp<Type,BXOR<Type>  ,BS,EQ>;
308914b7a73SJunchao Zhang   link->d_FetchAndAddLocal  = FetchAndOpLocal<Type,Add<Type>,BS,EQ>;
309914b7a73SJunchao Zhang 
310914b7a73SJunchao Zhang   link->da_UnpackAndInsert  = UnpackAndOp<Type,AtomicInsert<Type>,BS,EQ>;
311914b7a73SJunchao Zhang   link->da_UnpackAndAdd     = UnpackAndOp<Type,AtomicAdd<Type>   ,BS,EQ>;
312914b7a73SJunchao Zhang   link->da_UnpackAndMult    = UnpackAndOp<Type,AtomicMult<Type>  ,BS,EQ>;
313914b7a73SJunchao Zhang   link->da_UnpackAndMin     = UnpackAndOp<Type,AtomicMin<Type>   ,BS,EQ>;
314914b7a73SJunchao Zhang   link->da_UnpackAndMax     = UnpackAndOp<Type,AtomicMax<Type>   ,BS,EQ>;
315914b7a73SJunchao Zhang   link->da_UnpackAndLAND    = UnpackAndOp<Type,AtomicLAND<Type>  ,BS,EQ>;
316914b7a73SJunchao Zhang   link->da_UnpackAndLOR     = UnpackAndOp<Type,AtomicLOR<Type>   ,BS,EQ>;
317914b7a73SJunchao Zhang   link->da_UnpackAndBAND    = UnpackAndOp<Type,AtomicBAND<Type>  ,BS,EQ>;
318914b7a73SJunchao Zhang   link->da_UnpackAndBOR     = UnpackAndOp<Type,AtomicBOR<Type>   ,BS,EQ>;
319914b7a73SJunchao Zhang   link->da_UnpackAndBXOR    = UnpackAndOp<Type,AtomicBXOR<Type>  ,BS,EQ>;
320914b7a73SJunchao Zhang   link->da_FetchAndAdd      = FetchAndOp <Type,AtomicFetchAdd<Type>,BS,EQ>;
321914b7a73SJunchao Zhang 
322914b7a73SJunchao Zhang   link->da_ScatterAndInsert = ScatterAndOp<Type,AtomicInsert<Type>,BS,EQ>;
323914b7a73SJunchao Zhang   link->da_ScatterAndAdd    = ScatterAndOp<Type,AtomicAdd<Type>   ,BS,EQ>;
324914b7a73SJunchao Zhang   link->da_ScatterAndMult   = ScatterAndOp<Type,AtomicMult<Type>  ,BS,EQ>;
325914b7a73SJunchao Zhang   link->da_ScatterAndMin    = ScatterAndOp<Type,AtomicMin<Type>   ,BS,EQ>;
326914b7a73SJunchao Zhang   link->da_ScatterAndMax    = ScatterAndOp<Type,AtomicMax<Type>   ,BS,EQ>;
327914b7a73SJunchao Zhang   link->da_ScatterAndLAND   = ScatterAndOp<Type,AtomicLAND<Type>  ,BS,EQ>;
328914b7a73SJunchao Zhang   link->da_ScatterAndLOR    = ScatterAndOp<Type,AtomicLOR<Type>   ,BS,EQ>;
329914b7a73SJunchao Zhang   link->da_ScatterAndBAND   = ScatterAndOp<Type,AtomicBAND<Type>  ,BS,EQ>;
330914b7a73SJunchao Zhang   link->da_ScatterAndBOR    = ScatterAndOp<Type,AtomicBOR<Type>   ,BS,EQ>;
331914b7a73SJunchao Zhang   link->da_ScatterAndBXOR   = ScatterAndOp<Type,AtomicBXOR<Type>  ,BS,EQ>;
332914b7a73SJunchao Zhang   link->da_FetchAndAddLocal = FetchAndOpLocal<Type,AtomicFetchAdd<Type>,BS,EQ>;
333914b7a73SJunchao Zhang }
334914b7a73SJunchao Zhang 
335914b7a73SJunchao Zhang #if defined(PETSC_HAVE_COMPLEX)
336914b7a73SJunchao Zhang template<typename Type,PetscInt BS,PetscInt EQ>
337914b7a73SJunchao Zhang static void PackInit_ComplexType(PetscSFLink link)
338914b7a73SJunchao Zhang {
339914b7a73SJunchao Zhang   link->d_Pack             = Pack<Type,BS,EQ>;
340914b7a73SJunchao Zhang   link->d_UnpackAndInsert  = UnpackAndOp<Type,Insert<Type>,BS,EQ>;
341914b7a73SJunchao Zhang   link->d_UnpackAndAdd     = UnpackAndOp<Type,Add<Type>   ,BS,EQ>;
342914b7a73SJunchao Zhang   link->d_UnpackAndMult    = UnpackAndOp<Type,Mult<Type>  ,BS,EQ>;
343914b7a73SJunchao Zhang   link->d_FetchAndAdd      = FetchAndOp <Type,Add<Type>   ,BS,EQ>;
344914b7a73SJunchao Zhang 
345914b7a73SJunchao Zhang   link->d_ScatterAndInsert = ScatterAndInsert<Type,BS,EQ>;
346914b7a73SJunchao Zhang   link->d_ScatterAndAdd    = ScatterAndOp<Type,Add<Type>   ,BS,EQ>;
347914b7a73SJunchao Zhang   link->d_ScatterAndMult   = ScatterAndOp<Type,Mult<Type>  ,BS,EQ>;
348914b7a73SJunchao Zhang   link->d_FetchAndAddLocal = FetchAndOpLocal<Type,Add<Type>,BS,EQ>;
349914b7a73SJunchao Zhang 
350914b7a73SJunchao Zhang   link->da_UnpackAndInsert = UnpackAndOp<Type,AtomicInsert<Type> ,BS,EQ>;
351914b7a73SJunchao Zhang   link->da_UnpackAndAdd    = UnpackAndOp<Type,AtomicAdd<Type>    ,BS,EQ>;
352914b7a73SJunchao Zhang   link->da_UnpackAndMult   = UnpackAndOp<Type,AtomicMult<Type>   ,BS,EQ>;
353914b7a73SJunchao Zhang   link->da_FetchAndAdd     = FetchAndOp<Type,AtomicFetchAdd<Type>,BS,EQ>;
354914b7a73SJunchao Zhang 
355914b7a73SJunchao Zhang   link->da_ScatterAndInsert = ScatterAndOp<Type,AtomicInsert<Type>,BS,EQ>;
356914b7a73SJunchao Zhang   link->da_ScatterAndAdd    = ScatterAndOp<Type,AtomicAdd<Type>   ,BS,EQ>;
357914b7a73SJunchao Zhang   link->da_ScatterAndMult   = ScatterAndOp<Type,AtomicMult<Type>  ,BS,EQ>;
358914b7a73SJunchao Zhang   link->da_FetchAndAddLocal = FetchAndOpLocal<Type,AtomicFetchAdd<Type>,BS,EQ>;
359914b7a73SJunchao Zhang }
360914b7a73SJunchao Zhang #endif
361914b7a73SJunchao Zhang 
362914b7a73SJunchao Zhang template<typename Type>
363914b7a73SJunchao Zhang static void PackInit_PairType(PetscSFLink link)
364914b7a73SJunchao Zhang {
365914b7a73SJunchao Zhang   link->d_Pack             = Pack<Type,1,1>;
366914b7a73SJunchao Zhang   link->d_UnpackAndInsert  = UnpackAndOp<Type,Insert<Type>,1,1>;
367914b7a73SJunchao Zhang   link->d_UnpackAndMaxloc  = UnpackAndOp<Type,Maxloc<Type>,1,1>;
368914b7a73SJunchao Zhang   link->d_UnpackAndMinloc  = UnpackAndOp<Type,Minloc<Type>,1,1>;
369914b7a73SJunchao Zhang 
370914b7a73SJunchao Zhang   link->d_ScatterAndInsert = ScatterAndOp<Type,Insert<Type>,1,1>;
371914b7a73SJunchao Zhang   link->d_ScatterAndMaxloc = ScatterAndOp<Type,Maxloc<Type>,1,1>;
372914b7a73SJunchao Zhang   link->d_ScatterAndMinloc = ScatterAndOp<Type,Minloc<Type>,1,1>;
373914b7a73SJunchao Zhang   /* Atomics for pair types are not implemented yet */
374914b7a73SJunchao Zhang }
375914b7a73SJunchao Zhang 
376914b7a73SJunchao Zhang template<typename Type,PetscInt BS,PetscInt EQ>
377914b7a73SJunchao Zhang static void PackInit_DumbType(PetscSFLink link)
378914b7a73SJunchao Zhang {
379914b7a73SJunchao Zhang   link->d_Pack             = Pack<Type,BS,EQ>;
380914b7a73SJunchao Zhang   link->d_UnpackAndInsert  = UnpackAndOp<Type,Insert<Type>,BS,EQ>;
381914b7a73SJunchao Zhang   link->d_ScatterAndInsert = ScatterAndInsert<Type,BS,EQ>;
382914b7a73SJunchao Zhang   /* Atomics for dumb types are not implemented yet */
383914b7a73SJunchao Zhang }
384914b7a73SJunchao Zhang 
385f4af43b4SJunchao Zhang /*
386f4af43b4SJunchao Zhang   Kokkos::DefaultExecutionSpace(stream) is a reference counted pointer object. It has a bug
387f4af43b4SJunchao Zhang   that one is not able to repeatedly create and destroy the object. SF's original design was each
388f4af43b4SJunchao Zhang   SFLink has a stream (NULL or not) and hence an execution space object. The bug prevents us from
389f4af43b4SJunchao Zhang   destroying multiple SFLinks with NULL stream and the default execution space object. To avoid
390f4af43b4SJunchao Zhang   memory leaks, SF_Kokkos only supports NULL stream, which is also petsc's default scheme. SF_Kokkos
391f4af43b4SJunchao Zhang   does not do its own new/delete. It just uses Kokkos::DefaultExecutionSpace(), which is a singliton
392f4af43b4SJunchao Zhang   object in Kokkos.
393f4af43b4SJunchao Zhang */
394f4af43b4SJunchao Zhang /*
395914b7a73SJunchao Zhang static PetscErrorCode PetscSFLinkDestroy_Kokkos(PetscSFLink link)
396914b7a73SJunchao Zhang {
397914b7a73SJunchao Zhang   PetscFunctionBegin;
398914b7a73SJunchao Zhang   PetscFunctionReturn(0);
399914b7a73SJunchao Zhang }
400f4af43b4SJunchao Zhang */
401914b7a73SJunchao Zhang 
402914b7a73SJunchao Zhang /* Some device-specific utilities */
403a4af0ceeSJacob Faibussowitsch static PetscErrorCode PetscSFLinkSyncDevice_Kokkos(PetscSFLink PETSC_UNUSED link)
404914b7a73SJunchao Zhang {
405914b7a73SJunchao Zhang   PetscFunctionBegin;
406914b7a73SJunchao Zhang   Kokkos::fence();
407914b7a73SJunchao Zhang   PetscFunctionReturn(0);
408914b7a73SJunchao Zhang }
409914b7a73SJunchao Zhang 
410a4af0ceeSJacob Faibussowitsch static PetscErrorCode PetscSFLinkSyncStream_Kokkos(PetscSFLink PETSC_UNUSED link)
411914b7a73SJunchao Zhang {
412f4af43b4SJunchao Zhang   DeviceExecutionSpace    exec;
413914b7a73SJunchao Zhang   PetscFunctionBegin;
414914b7a73SJunchao Zhang   exec.fence();
415914b7a73SJunchao Zhang   PetscFunctionReturn(0);
416914b7a73SJunchao Zhang }
417914b7a73SJunchao Zhang 
418a4af0ceeSJacob Faibussowitsch static PetscErrorCode PetscSFLinkMemcpy_Kokkos(PetscSFLink PETSC_UNUSED link,PetscMemType dstmtype,void* dst,PetscMemType srcmtype,const void*src,size_t n)
419914b7a73SJunchao Zhang {
420f4af43b4SJunchao Zhang   DeviceExecutionSpace    exec;
421914b7a73SJunchao Zhang 
422914b7a73SJunchao Zhang   PetscFunctionBegin;
423914b7a73SJunchao Zhang   if (!n) PetscFunctionReturn(0);
42471438e86SJunchao Zhang   if (PetscMemTypeHost(dstmtype) && PetscMemTypeHost(srcmtype)) {
425914b7a73SJunchao Zhang     PetscErrorCode ierr = PetscMemcpy(dst,src,n);CHKERRQ(ierr);
426914b7a73SJunchao Zhang   } else {
42771438e86SJunchao Zhang     if (PetscMemTypeDevice(dstmtype) && PetscMemTypeHost(srcmtype)) {
428914b7a73SJunchao Zhang       deviceBuffer_t       dbuf(static_cast<char*>(dst),n);
429914b7a73SJunchao Zhang       HostConstBuffer_t    sbuf(static_cast<const char*>(src),n);
430914b7a73SJunchao Zhang       Kokkos::deep_copy(exec,dbuf,sbuf);
431914b7a73SJunchao Zhang       PetscErrorCode ierr = PetscLogCpuToGpu(n);CHKERRQ(ierr);
43271438e86SJunchao Zhang     } else if (PetscMemTypeHost(dstmtype) && PetscMemTypeDevice(srcmtype)) {
433914b7a73SJunchao Zhang       HostBuffer_t         dbuf(static_cast<char*>(dst),n);
434914b7a73SJunchao Zhang       deviceConstBuffer_t  sbuf(static_cast<const char*>(src),n);
435914b7a73SJunchao Zhang       Kokkos::deep_copy(exec,dbuf,sbuf);
436914b7a73SJunchao Zhang       PetscErrorCode ierr = PetscLogGpuToCpu(n);CHKERRQ(ierr);
43771438e86SJunchao Zhang     } else if (PetscMemTypeDevice(dstmtype) && PetscMemTypeDevice(srcmtype)) {
438914b7a73SJunchao Zhang       deviceBuffer_t       dbuf(static_cast<char*>(dst),n);
439914b7a73SJunchao Zhang       deviceConstBuffer_t  sbuf(static_cast<const char*>(src),n);
440914b7a73SJunchao Zhang       Kokkos::deep_copy(exec,dbuf,sbuf);
441914b7a73SJunchao Zhang     }
442914b7a73SJunchao Zhang   }
443914b7a73SJunchao Zhang   PetscFunctionReturn(0);
444914b7a73SJunchao Zhang }
445914b7a73SJunchao Zhang 
44620c24465SJunchao Zhang PetscErrorCode PetscSFMalloc_Kokkos(PetscMemType mtype,size_t size,void** ptr)
447914b7a73SJunchao Zhang {
448914b7a73SJunchao Zhang   PetscFunctionBegin;
44971438e86SJunchao Zhang   if (PetscMemTypeHost(mtype)) {PetscErrorCode ierr = PetscMalloc(size,ptr);CHKERRQ(ierr);}
45071438e86SJunchao Zhang   else if (PetscMemTypeDevice(mtype)) {
45145639126SStefano Zampini     if (!PetscKokkosInitialized) { PetscErrorCode ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr); }
45245639126SStefano Zampini     *ptr = Kokkos::kokkos_malloc<DeviceMemorySpace>(size);
45371438e86SJunchao Zhang   } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Wrong PetscMemType %d", (int)mtype);
454914b7a73SJunchao Zhang   PetscFunctionReturn(0);
455914b7a73SJunchao Zhang }
456914b7a73SJunchao Zhang 
45720c24465SJunchao Zhang PetscErrorCode PetscSFFree_Kokkos(PetscMemType mtype,void* ptr)
458914b7a73SJunchao Zhang {
459914b7a73SJunchao Zhang   PetscFunctionBegin;
46071438e86SJunchao Zhang   if (PetscMemTypeHost(mtype)) {PetscErrorCode ierr = PetscFree(ptr);CHKERRQ(ierr);}
46171438e86SJunchao Zhang   else if (PetscMemTypeDevice(mtype)) {Kokkos::kokkos_free<DeviceMemorySpace>(ptr);}
462914b7a73SJunchao Zhang   else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Wrong PetscMemType %d",(int)mtype);
463914b7a73SJunchao Zhang   PetscFunctionReturn(0);
464914b7a73SJunchao Zhang }
465914b7a73SJunchao Zhang 
46671438e86SJunchao Zhang /* Destructor when the link uses MPI for communication */
46771438e86SJunchao Zhang static PetscErrorCode PetscSFLinkDestroy_Kokkos(PetscSF sf,PetscSFLink link)
46871438e86SJunchao Zhang {
46971438e86SJunchao Zhang   PetscErrorCode     ierr;
47071438e86SJunchao Zhang 
47171438e86SJunchao Zhang   PetscFunctionBegin;
47271438e86SJunchao Zhang   for (int i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) {
47371438e86SJunchao Zhang     ierr = PetscSFFree(sf,PETSC_MEMTYPE_DEVICE,link->rootbuf_alloc[i][PETSC_MEMTYPE_DEVICE]);CHKERRQ(ierr);
47471438e86SJunchao Zhang     ierr = PetscSFFree(sf,PETSC_MEMTYPE_DEVICE,link->leafbuf_alloc[i][PETSC_MEMTYPE_DEVICE]);CHKERRQ(ierr);
47571438e86SJunchao Zhang   }
47671438e86SJunchao Zhang   PetscFunctionReturn(0);
47771438e86SJunchao Zhang }
478914b7a73SJunchao Zhang 
479914b7a73SJunchao Zhang /* Some fields of link are initialized by PetscSFPackSetUp_Host. This routine only does what needed on device */
480a4af0ceeSJacob Faibussowitsch PetscErrorCode PetscSFLinkSetUp_Kokkos(PetscSF PETSC_UNUSED sf,PetscSFLink link,MPI_Datatype unit)
481914b7a73SJunchao Zhang {
482914b7a73SJunchao Zhang   PetscErrorCode     ierr;
483914b7a73SJunchao Zhang   PetscInt           nSignedChar=0,nUnsignedChar=0,nInt=0,nPetscInt=0,nPetscReal=0;
484914b7a73SJunchao Zhang   PetscBool          is2Int,is2PetscInt;
485914b7a73SJunchao Zhang #if defined(PETSC_HAVE_COMPLEX)
486914b7a73SJunchao Zhang   PetscInt           nPetscComplex=0;
487914b7a73SJunchao Zhang #endif
488914b7a73SJunchao Zhang 
489914b7a73SJunchao Zhang   PetscFunctionBegin;
490914b7a73SJunchao Zhang   if (link->deviceinited) PetscFunctionReturn(0);
49159e55d94SJunchao Zhang   ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr);
492914b7a73SJunchao Zhang   ierr = MPIPetsc_Type_compare_contig(unit,MPI_SIGNED_CHAR,  &nSignedChar);CHKERRQ(ierr);
493914b7a73SJunchao Zhang   ierr = MPIPetsc_Type_compare_contig(unit,MPI_UNSIGNED_CHAR,&nUnsignedChar);CHKERRQ(ierr);
494914b7a73SJunchao Zhang   /* MPI_CHAR is treated below as a dumb type that does not support reduction according to MPI standard */
495914b7a73SJunchao Zhang   ierr = MPIPetsc_Type_compare_contig(unit,MPI_INT,  &nInt);CHKERRQ(ierr);
496914b7a73SJunchao Zhang   ierr = MPIPetsc_Type_compare_contig(unit,MPIU_INT, &nPetscInt);CHKERRQ(ierr);
497914b7a73SJunchao Zhang   ierr = MPIPetsc_Type_compare_contig(unit,MPIU_REAL,&nPetscReal);CHKERRQ(ierr);
498914b7a73SJunchao Zhang #if defined(PETSC_HAVE_COMPLEX)
499914b7a73SJunchao Zhang   ierr = MPIPetsc_Type_compare_contig(unit,MPIU_COMPLEX,&nPetscComplex);CHKERRQ(ierr);
500914b7a73SJunchao Zhang #endif
501914b7a73SJunchao Zhang   ierr = MPIPetsc_Type_compare(unit,MPI_2INT,&is2Int);CHKERRQ(ierr);
502914b7a73SJunchao Zhang   ierr = MPIPetsc_Type_compare(unit,MPIU_2INT,&is2PetscInt);CHKERRQ(ierr);
503914b7a73SJunchao Zhang 
504914b7a73SJunchao Zhang   if (is2Int) {
505914b7a73SJunchao Zhang     PackInit_PairType<Kokkos::pair<int,int>>(link);
506914b7a73SJunchao Zhang   } else if (is2PetscInt) { /* TODO: when is2PetscInt and nPetscInt=2, we don't know which path to take. The two paths support different ops. */
507914b7a73SJunchao Zhang     PackInit_PairType<Kokkos::pair<PetscInt,PetscInt>>(link);
508914b7a73SJunchao Zhang   } else if (nPetscReal) {
509*eee4e20aSJunchao Zhang    #if !defined(PETSC_HAVE_SYCL)  /* Skip the unimportant stuff to speed up dpcpp's compilation time */
510914b7a73SJunchao Zhang     if      (nPetscReal == 8) PackInit_RealType<PetscReal,8,1>(link); else if (nPetscReal%8 == 0) PackInit_RealType<PetscReal,8,0>(link);
511914b7a73SJunchao Zhang     else if (nPetscReal == 4) PackInit_RealType<PetscReal,4,1>(link); else if (nPetscReal%4 == 0) PackInit_RealType<PetscReal,4,0>(link);
512914b7a73SJunchao Zhang     else if (nPetscReal == 2) PackInit_RealType<PetscReal,2,1>(link); else if (nPetscReal%2 == 0) PackInit_RealType<PetscReal,2,0>(link);
513*eee4e20aSJunchao Zhang     else
514*eee4e20aSJunchao Zhang    #endif
515*eee4e20aSJunchao Zhang     if (nPetscReal == 1) PackInit_RealType<PetscReal,1,1>(link); else if (nPetscReal%1 == 0) PackInit_RealType<PetscReal,1,0>(link);
516874d28e3SJunchao Zhang   } else if (nPetscInt && sizeof(PetscInt) == sizeof(llint)) {
517*eee4e20aSJunchao Zhang    #if !defined(PETSC_HAVE_SYCL)
518874d28e3SJunchao Zhang     if      (nPetscInt == 8) PackInit_IntegerType<llint,8,1>(link); else if (nPetscInt%8 == 0) PackInit_IntegerType<llint,8,0>(link);
519874d28e3SJunchao Zhang     else if (nPetscInt == 4) PackInit_IntegerType<llint,4,1>(link); else if (nPetscInt%4 == 0) PackInit_IntegerType<llint,4,0>(link);
520874d28e3SJunchao Zhang     else if (nPetscInt == 2) PackInit_IntegerType<llint,2,1>(link); else if (nPetscInt%2 == 0) PackInit_IntegerType<llint,2,0>(link);
521*eee4e20aSJunchao Zhang     else
522*eee4e20aSJunchao Zhang    #endif
523*eee4e20aSJunchao Zhang     if (nPetscInt == 1) PackInit_IntegerType<llint,1,1>(link); else if (nPetscInt%1 == 0) PackInit_IntegerType<llint,1,0>(link);
524914b7a73SJunchao Zhang   } else if (nInt) {
525*eee4e20aSJunchao Zhang     #if !defined(PETSC_HAVE_SYCL)
526914b7a73SJunchao Zhang     if      (nInt == 8) PackInit_IntegerType<int,8,1>(link); else if (nInt%8 == 0) PackInit_IntegerType<int,8,0>(link);
527914b7a73SJunchao Zhang     else if (nInt == 4) PackInit_IntegerType<int,4,1>(link); else if (nInt%4 == 0) PackInit_IntegerType<int,4,0>(link);
528914b7a73SJunchao Zhang     else if (nInt == 2) PackInit_IntegerType<int,2,1>(link); else if (nInt%2 == 0) PackInit_IntegerType<int,2,0>(link);
529*eee4e20aSJunchao Zhang     else
530*eee4e20aSJunchao Zhang    #endif
531*eee4e20aSJunchao Zhang     if (nInt == 1) PackInit_IntegerType<int,1,1>(link); else if (nInt%1 == 0) PackInit_IntegerType<int,1,0>(link);
532914b7a73SJunchao Zhang   } else if (nSignedChar) {
533*eee4e20aSJunchao Zhang    #if !defined(PETSC_HAVE_SYCL)
534914b7a73SJunchao Zhang     if      (nSignedChar == 8) PackInit_IntegerType<char,8,1>(link); else if (nSignedChar%8 == 0) PackInit_IntegerType<char,8,0>(link);
535914b7a73SJunchao Zhang     else if (nSignedChar == 4) PackInit_IntegerType<char,4,1>(link); else if (nSignedChar%4 == 0) PackInit_IntegerType<char,4,0>(link);
536914b7a73SJunchao Zhang     else if (nSignedChar == 2) PackInit_IntegerType<char,2,1>(link); else if (nSignedChar%2 == 0) PackInit_IntegerType<char,2,0>(link);
537*eee4e20aSJunchao Zhang     else
538*eee4e20aSJunchao Zhang    #endif
539*eee4e20aSJunchao Zhang     if (nSignedChar == 1) PackInit_IntegerType<char,1,1>(link); else if (nSignedChar%1 == 0) PackInit_IntegerType<char,1,0>(link);
540914b7a73SJunchao Zhang   }  else if (nUnsignedChar) {
541*eee4e20aSJunchao Zhang    #if !defined(PETSC_HAVE_SYCL)
542914b7a73SJunchao Zhang     if      (nUnsignedChar == 8) PackInit_IntegerType<unsigned char,8,1>(link); else if (nUnsignedChar%8 == 0) PackInit_IntegerType<unsigned char,8,0>(link);
543914b7a73SJunchao Zhang     else if (nUnsignedChar == 4) PackInit_IntegerType<unsigned char,4,1>(link); else if (nUnsignedChar%4 == 0) PackInit_IntegerType<unsigned char,4,0>(link);
544914b7a73SJunchao Zhang     else if (nUnsignedChar == 2) PackInit_IntegerType<unsigned char,2,1>(link); else if (nUnsignedChar%2 == 0) PackInit_IntegerType<unsigned char,2,0>(link);
545*eee4e20aSJunchao Zhang     else
546*eee4e20aSJunchao Zhang    #endif
547*eee4e20aSJunchao Zhang     if (nUnsignedChar == 1) PackInit_IntegerType<unsigned char,1,1>(link); else if (nUnsignedChar%1 == 0) PackInit_IntegerType<unsigned char,1,0>(link);
548914b7a73SJunchao Zhang #if defined(PETSC_HAVE_COMPLEX)
549914b7a73SJunchao Zhang   } else if (nPetscComplex) {
550*eee4e20aSJunchao Zhang    #if !defined(PETSC_HAVE_SYCL)
551914b7a73SJunchao Zhang     if      (nPetscComplex == 8) PackInit_ComplexType<Kokkos::complex<PetscReal>,8,1>(link); else if (nPetscComplex%8 == 0) PackInit_ComplexType<Kokkos::complex<PetscReal>,8,0>(link);
552914b7a73SJunchao Zhang     else if (nPetscComplex == 4) PackInit_ComplexType<Kokkos::complex<PetscReal>,4,1>(link); else if (nPetscComplex%4 == 0) PackInit_ComplexType<Kokkos::complex<PetscReal>,4,0>(link);
553914b7a73SJunchao Zhang     else if (nPetscComplex == 2) PackInit_ComplexType<Kokkos::complex<PetscReal>,2,1>(link); else if (nPetscComplex%2 == 0) PackInit_ComplexType<Kokkos::complex<PetscReal>,2,0>(link);
554*eee4e20aSJunchao Zhang     else
555*eee4e20aSJunchao Zhang    #endif
556*eee4e20aSJunchao Zhang     if (nPetscComplex == 1) PackInit_ComplexType<Kokkos::complex<PetscReal>,1,1>(link); else if (nPetscComplex%1 == 0) PackInit_ComplexType<Kokkos::complex<PetscReal>,1,0>(link);
557914b7a73SJunchao Zhang #endif
558914b7a73SJunchao Zhang   } else {
559914b7a73SJunchao Zhang     MPI_Aint lb,nbyte;
560ffc4695bSBarry Smith     ierr = MPI_Type_get_extent(unit,&lb,&nbyte);CHKERRMPI(ierr);
561914b7a73SJunchao Zhang     if (lb != 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Datatype with nonzero lower bound %ld\n",(long)lb);
562914b7a73SJunchao Zhang     if (nbyte % sizeof(int)) { /* If the type size is not multiple of int */
563*eee4e20aSJunchao Zhang      #if !defined(PETSC_HAVE_SYCL)
564914b7a73SJunchao Zhang       if      (nbyte == 4) PackInit_DumbType<char,4,1>(link); else if (nbyte%4 == 0) PackInit_DumbType<char,4,0>(link);
565914b7a73SJunchao Zhang       else if (nbyte == 2) PackInit_DumbType<char,2,1>(link); else if (nbyte%2 == 0) PackInit_DumbType<char,2,0>(link);
566*eee4e20aSJunchao Zhang       else
567*eee4e20aSJunchao Zhang      #endif
568*eee4e20aSJunchao Zhang       if (nbyte == 1) PackInit_DumbType<char,1,1>(link); else if (nbyte%1 == 0) PackInit_DumbType<char,1,0>(link);
569914b7a73SJunchao Zhang     } else {
570914b7a73SJunchao Zhang       nInt = nbyte / sizeof(int);
571*eee4e20aSJunchao Zhang      #if !defined(PETSC_HAVE_SYCL)
572914b7a73SJunchao Zhang       if      (nInt == 8) PackInit_DumbType<int,8,1>(link); else if (nInt%8 == 0) PackInit_DumbType<int,8,0>(link);
573914b7a73SJunchao Zhang       else if (nInt == 4) PackInit_DumbType<int,4,1>(link); else if (nInt%4 == 0) PackInit_DumbType<int,4,0>(link);
574914b7a73SJunchao Zhang       else if (nInt == 2) PackInit_DumbType<int,2,1>(link); else if (nInt%2 == 0) PackInit_DumbType<int,2,0>(link);
575*eee4e20aSJunchao Zhang       else
576*eee4e20aSJunchao Zhang      #endif
577*eee4e20aSJunchao Zhang       if (nInt == 1) PackInit_DumbType<int,1,1>(link); else if (nInt%1 == 0) PackInit_DumbType<int,1,0>(link);
578914b7a73SJunchao Zhang     }
579914b7a73SJunchao Zhang   }
580914b7a73SJunchao Zhang 
58171438e86SJunchao Zhang   link->SyncDevice   = PetscSFLinkSyncDevice_Kokkos;
58271438e86SJunchao Zhang   link->SyncStream   = PetscSFLinkSyncStream_Kokkos;
58320c24465SJunchao Zhang   link->Memcpy       = PetscSFLinkMemcpy_Kokkos;
58471438e86SJunchao Zhang   link->Destroy      = PetscSFLinkDestroy_Kokkos;
585914b7a73SJunchao Zhang   link->deviceinited = PETSC_TRUE;
586914b7a73SJunchao Zhang   PetscFunctionReturn(0);
587914b7a73SJunchao Zhang }
588