xref: /petsc/src/mat/impls/aij/seq/kokkos/aijkok.kokkos.cxx (revision 503c0ea9b45bcfbcebbb1ea5341243bbc69f0bea)
1 #include <petscvec_kokkos.hpp>
2 #include <petscpkg_version.h>
3 #include <petsc/private/petscimpl.h>
4 #include <petsc/private/sfimpl.h>
5 #include <petscsystypes.h>
6 #include <petscerror.h>
7 
8 #include <Kokkos_Core.hpp>
9 #include <KokkosBlas.hpp>
10 #include <KokkosKernels_Sorting.hpp>
11 #include <KokkosSparse_CrsMatrix.hpp>
12 #include <KokkosSparse_spmv.hpp>
13 #include <KokkosSparse_spiluk.hpp>
14 #include <KokkosSparse_sptrsv.hpp>
15 #include <KokkosSparse_spgemm.hpp>
16 #include <KokkosSparse_spadd.hpp>
17 
18 #include <../src/mat/impls/aij/seq/kokkos/aijkok.hpp>
19 
20 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */
21 
22 /* MatAssemblyEnd_SeqAIJKokkos() happens when we finalized nonzeros of the matrix, either after
23    we assembled the matrix on host, or after we directly produced the matrix data on device (ex., through MatMatMult).
24    In the latter case, it is important to set a_dual's sync state correctly.
25  */
26 static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A,MatAssemblyType mode)
27 {
28   Mat_SeqAIJ       *aijseq;
29   Mat_SeqAIJKokkos *aijkok;
30 
31   PetscFunctionBegin;
32   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0);
33   PetscCall(MatAssemblyEnd_SeqAIJ(A,mode));
34 
35   aijseq = static_cast<Mat_SeqAIJ*>(A->data);
36   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
37 
38   /* If aijkok does not exist, we just copy i, j to device.
39      If aijkok already exists, but the device's nonzero pattern does not match with the host's, we assume the latest data is on host.
40      In both cases, we build a new aijkok structure.
41   */
42   if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { /* aijkok might not exist yet or nonzero pattern has changed */
43     delete aijkok;
44     aijkok   = new Mat_SeqAIJKokkos(A->rmap->n,A->cmap->n,aijseq->nz,aijseq->i,aijseq->j,aijseq->a,A->nonzerostate,PETSC_FALSE/*don't copy mat values to device*/);
45     A->spptr = aijkok;
46   }
47 
48   if (aijkok && aijkok->device_mat_d.data()) {
49     A->offloadmask = PETSC_OFFLOAD_GPU; // in GPU mode, no going back. MatSetValues checks this
50   }
51   PetscFunctionReturn(0);
52 }
53 
54 /* Sync CSR data to device if not yet */
55 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A)
56 {
57   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
58 
59   PetscFunctionBegin;
60   PetscCheck(A->factortype == MAT_FACTOR_NONE,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync factorized matrix from host to device");
61   PetscCheck(aijkok,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
62   if (aijkok->a_dual.need_sync_device()) {
63     aijkok->a_dual.sync_device();
64     aijkok->transpose_updated = PETSC_FALSE; /* values of the transpose is out-of-date */
65     aijkok->hermitian_updated = PETSC_FALSE;
66   }
67   PetscFunctionReturn(0);
68 }
69 
70 /* Mark the CSR data on device as modified */
71 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosModifyDevice(Mat A)
72 {
73   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
74 
75   PetscFunctionBegin;
76   PetscCheck(A->factortype == MAT_FACTOR_NONE,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not supported for factorized matries");
77   aijkok->a_dual.clear_sync_state();
78   aijkok->a_dual.modify_device();
79   aijkok->transpose_updated = PETSC_FALSE;
80   aijkok->hermitian_updated = PETSC_FALSE;
81   PetscCall(MatSeqAIJInvalidateDiagonal(A));
82   PetscCall(PetscObjectStateIncrease((PetscObject)A));
83   PetscFunctionReturn(0);
84 }
85 
86 static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A)
87 {
88   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
89 
90   PetscFunctionBegin;
91   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
92   /* We do not expect one needs factors on host  */
93   PetscCheck(A->factortype == MAT_FACTOR_NONE,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync factorized matrix from device to host");
94   PetscCheck(aijkok,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing AIJKOK");
95   aijkok->a_dual.sync_host();
96   PetscFunctionReturn(0);
97 }
98 
99 static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A,PetscScalar *array[])
100 {
101   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
102 
103   PetscFunctionBegin;
104   if (aijkok) {
105     aijkok->a_dual.sync_host();
106     *array = aijkok->a_dual.view_host().data();
107   } else { /* Happens when calling MatSetValues on a newly created matrix */
108     *array = static_cast<Mat_SeqAIJ*>(A->data)->a;
109   }
110   PetscFunctionReturn(0);
111 }
112 
113 static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJKokkos(Mat A,PetscScalar *array[])
114 {
115   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
116 
117   PetscFunctionBegin;
118   if (aijkok) aijkok->a_dual.modify_host();
119   PetscFunctionReturn(0);
120 }
121 
122 static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJKokkos(Mat A,const PetscScalar *array[])
123 {
124   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
125 
126   PetscFunctionBegin;
127   if (aijkok) {
128     aijkok->a_dual.sync_host();
129     *array = aijkok->a_dual.view_host().data();
130   } else {
131     *array = static_cast<Mat_SeqAIJ*>(A->data)->a;
132   }
133   PetscFunctionReturn(0);
134 }
135 
136 static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJKokkos(Mat A,const PetscScalar *array[])
137 {
138   PetscFunctionBegin;
139   *array = NULL;
140   PetscFunctionReturn(0);
141 }
142 
143 static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJKokkos(Mat A,PetscScalar *array[])
144 {
145   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
146 
147   PetscFunctionBegin;
148   if (aijkok) {
149     *array = aijkok->a_dual.view_host().data();
150   } else { /* Ex. happens with MatZeroEntries on a preallocated but not assembled matrix */
151     *array = static_cast<Mat_SeqAIJ*>(A->data)->a;
152   }
153   PetscFunctionReturn(0);
154 }
155 
156 static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJKokkos(Mat A,PetscScalar *array[])
157 {
158   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
159 
160   PetscFunctionBegin;
161   if (aijkok) {
162     aijkok->a_dual.clear_sync_state();
163     aijkok->a_dual.modify_host();
164   }
165   PetscFunctionReturn(0);
166 }
167 
168 // MatSeqAIJKokkosSetDeviceMat takes a PetscSplitCSRDataStructure with device data and copies it to the device. Note, "deep_copy" here is really a shallow copy
169 PetscErrorCode MatSeqAIJKokkosSetDeviceMat(Mat A, PetscSplitCSRDataStructure h_mat)
170 {
171   Mat_SeqAIJKokkos                             *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
172   Kokkos::View<SplitCSRMat, Kokkos::HostSpace> h_mat_k(h_mat);
173 
174   PetscFunctionBegin;
175   PetscCheck(aijkok,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
176   aijkok->device_mat_d = create_mirror(DefaultMemorySpace(),h_mat_k);
177   Kokkos::deep_copy (aijkok->device_mat_d, h_mat_k);
178   PetscFunctionReturn(0);
179 }
180 
181 // MatSeqAIJKokkosGetDeviceMat gets the device if it is here, otherwise it creates a place for it and returns NULL
182 PetscErrorCode MatSeqAIJKokkosGetDeviceMat(Mat A, PetscSplitCSRDataStructure *d_mat)
183 {
184   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
185 
186   PetscFunctionBegin;
187   if (aijkok && aijkok->device_mat_d.data()) {
188     *d_mat = aijkok->device_mat_d.data();
189   } else {
190     PetscCall(MatSeqAIJKokkosSyncDevice(A)); // create aijkok (we are making d_mat now so make a place for it)
191     *d_mat  = NULL;
192   }
193   PetscFunctionReturn(0);
194 }
195 
196 /* Generate the transpose on device and cache it internally */
197 static PetscErrorCode MatSeqAIJKokkosGenerateTranspose_Private(Mat A, KokkosCsrMatrix **csrmatT)
198 {
199   Mat_SeqAIJKokkos                 *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
200 
201   PetscFunctionBegin;
202   PetscCheck(aijkok,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
203   if (!aijkok->csrmatT.nnz() || !aijkok->transpose_updated) { /* Generate At for the first time OR just update its values */
204     /* FIXME: KK does not separate symbolic/numeric transpose. We could have a permutation array to help value-only update */
205     PetscCallCXX(aijkok->a_dual.sync_device());
206     PetscCallCXX(aijkok->csrmatT = KokkosKernels::Impl::transpose_matrix(aijkok->csrmat));
207     PetscCallCXX(KokkosKernels::sort_crs_matrix(aijkok->csrmatT));
208     aijkok->transpose_updated = PETSC_TRUE;
209   }
210   *csrmatT = &aijkok->csrmatT;
211   PetscFunctionReturn(0);
212 }
213 
214 /* Generate the Hermitian on device and cache it internally */
215 static PetscErrorCode MatSeqAIJKokkosGenerateHermitian_Private(Mat A, KokkosCsrMatrix **csrmatH)
216 {
217   Mat_SeqAIJKokkos                 *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
218 
219   PetscFunctionBegin;
220   PetscCall(PetscLogGpuTimeBegin());
221   PetscCheck(aijkok,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
222   if (!aijkok->csrmatH.nnz() || !aijkok->hermitian_updated) { /* Generate Ah for the first time OR just update its values */
223     PetscCallCXX(aijkok->a_dual.sync_device());
224     PetscCallCXX(aijkok->csrmatH = KokkosKernels::Impl::transpose_matrix(aijkok->csrmat));
225     PetscCallCXX(KokkosKernels::sort_crs_matrix(aijkok->csrmatH));
226    #if defined(PETSC_USE_COMPLEX)
227     const auto& a = aijkok->csrmatH.values;
228     Kokkos::parallel_for(a.extent(0),KOKKOS_LAMBDA(MatRowMapType i) {a(i) = PetscConj(a(i));});
229    #endif
230     aijkok->hermitian_updated = PETSC_TRUE;
231   }
232   *csrmatH = &aijkok->csrmatH;
233   PetscCall(PetscLogGpuTimeEnd());
234   PetscFunctionReturn(0);
235 }
236 
237 /* y = A x */
238 static PetscErrorCode MatMult_SeqAIJKokkos(Mat A,Vec xx,Vec yy)
239 {
240   Mat_SeqAIJKokkos                 *aijkok;
241   ConstPetscScalarKokkosView       xv;
242   PetscScalarKokkosView            yv;
243 
244   PetscFunctionBegin;
245   PetscCall(PetscLogGpuTimeBegin());
246   PetscCall(MatSeqAIJKokkosSyncDevice(A));
247   PetscCall(VecGetKokkosView(xx,&xv));
248   PetscCall(VecGetKokkosViewWrite(yy,&yv));
249   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
250   KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csrmat,xv,0.0/*beta*/,yv); /* y = alpha A x + beta y */
251   PetscCall(VecRestoreKokkosView(xx,&xv));
252   PetscCall(VecRestoreKokkosViewWrite(yy,&yv));
253   /* 2.0*nnz - numRows seems more accurate here but assumes there are no zero-rows. So a little sloppy here. */
254   PetscCall(PetscLogGpuFlops(2.0*aijkok->csrmat.nnz()));
255   PetscCall(PetscLogGpuTimeEnd());
256   PetscFunctionReturn(0);
257 }
258 
259 /* y = A^T x */
260 static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy)
261 {
262   Mat_SeqAIJKokkos                 *aijkok;
263   const char                       *mode;
264   ConstPetscScalarKokkosView       xv;
265   PetscScalarKokkosView            yv;
266   KokkosCsrMatrix                  *csrmat;
267 
268   PetscFunctionBegin;
269   PetscCall(PetscLogGpuTimeBegin());
270   PetscCall(MatSeqAIJKokkosSyncDevice(A));
271   PetscCall(VecGetKokkosView(xx,&xv));
272   PetscCall(VecGetKokkosViewWrite(yy,&yv));
273   if (A->form_explicit_transpose) {
274     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmat));
275     mode = "N";
276   } else {
277     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
278     csrmat = &aijkok->csrmat;
279     mode = "T";
280   }
281   KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,0.0/*beta*/,yv); /* y = alpha A^T x + beta y */
282   PetscCall(VecRestoreKokkosView(xx,&xv));
283   PetscCall(VecRestoreKokkosViewWrite(yy,&yv));
284   PetscCall(PetscLogGpuFlops(2.0*csrmat->nnz()));
285   PetscCall(PetscLogGpuTimeEnd());
286   PetscFunctionReturn(0);
287 }
288 
289 /* y = A^H x */
290 static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy)
291 {
292   Mat_SeqAIJKokkos                 *aijkok;
293   const char                       *mode;
294   ConstPetscScalarKokkosView       xv;
295   PetscScalarKokkosView            yv;
296   KokkosCsrMatrix                  *csrmat;
297 
298   PetscFunctionBegin;
299   PetscCall(PetscLogGpuTimeBegin());
300   PetscCall(MatSeqAIJKokkosSyncDevice(A));
301   PetscCall(VecGetKokkosView(xx,&xv));
302   PetscCall(VecGetKokkosViewWrite(yy,&yv));
303   if (A->form_explicit_transpose) {
304     PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A,&csrmat));
305     mode = "N";
306   } else {
307     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
308     csrmat = &aijkok->csrmat;
309     mode = "C";
310   }
311   KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,0.0/*beta*/,yv); /* y = alpha A^H x + beta y */
312   PetscCall(VecRestoreKokkosView(xx,&xv));
313   PetscCall(VecRestoreKokkosViewWrite(yy,&yv));
314   PetscCall(PetscLogGpuFlops(2.0*csrmat->nnz()));
315   PetscCall(PetscLogGpuTimeEnd());
316   PetscFunctionReturn(0);
317 }
318 
319 /* z = A x + y */
320 static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy, Vec zz)
321 {
322   Mat_SeqAIJKokkos                 *aijkok;
323   ConstPetscScalarKokkosView       xv,yv;
324   PetscScalarKokkosView            zv;
325 
326   PetscFunctionBegin;
327   PetscCall(PetscLogGpuTimeBegin());
328   PetscCall(MatSeqAIJKokkosSyncDevice(A));
329   PetscCall(VecGetKokkosView(xx,&xv));
330   PetscCall(VecGetKokkosView(yy,&yv));
331   PetscCall(VecGetKokkosViewWrite(zz,&zv));
332   if (zz != yy) Kokkos::deep_copy(zv,yv);
333   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
334   KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csrmat,xv,1.0/*beta*/,zv); /* z = alpha A x + beta z */
335   PetscCall(VecRestoreKokkosView(xx,&xv));
336   PetscCall(VecRestoreKokkosView(yy,&yv));
337   PetscCall(VecRestoreKokkosViewWrite(zz,&zv));
338   PetscCall(PetscLogGpuFlops(2.0*aijkok->csrmat.nnz()));
339   PetscCall(PetscLogGpuTimeEnd());
340   PetscFunctionReturn(0);
341 }
342 
343 /* z = A^T x + y */
344 static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz)
345 {
346   Mat_SeqAIJKokkos                 *aijkok;
347   const char                       *mode;
348   ConstPetscScalarKokkosView       xv,yv;
349   PetscScalarKokkosView            zv;
350   KokkosCsrMatrix                  *csrmat;
351 
352   PetscFunctionBegin;
353   PetscCall(PetscLogGpuTimeBegin());
354   PetscCall(MatSeqAIJKokkosSyncDevice(A));
355   PetscCall(VecGetKokkosView(xx,&xv));
356   PetscCall(VecGetKokkosView(yy,&yv));
357   PetscCall(VecGetKokkosViewWrite(zz,&zv));
358   if (zz != yy) Kokkos::deep_copy(zv,yv);
359   if (A->form_explicit_transpose) {
360     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmat));
361     mode = "N";
362   } else {
363     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
364     csrmat = &aijkok->csrmat;
365     mode = "T";
366   }
367   KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,1.0/*beta*/,zv); /* z = alpha A^T x + beta z */
368   PetscCall(VecRestoreKokkosView(xx,&xv));
369   PetscCall(VecRestoreKokkosView(yy,&yv));
370   PetscCall(VecRestoreKokkosViewWrite(zz,&zv));
371   PetscCall(PetscLogGpuFlops(2.0*csrmat->nnz()));
372   PetscCall(PetscLogGpuTimeEnd());
373   PetscFunctionReturn(0);
374 }
375 
376 /* z = A^H x + y */
377 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz)
378 {
379   Mat_SeqAIJKokkos                 *aijkok;
380   const char                       *mode;
381   ConstPetscScalarKokkosView       xv,yv;
382   PetscScalarKokkosView            zv;
383   KokkosCsrMatrix                  *csrmat;
384 
385   PetscFunctionBegin;
386   PetscCall(PetscLogGpuTimeBegin());
387   PetscCall(MatSeqAIJKokkosSyncDevice(A));
388   PetscCall(VecGetKokkosView(xx,&xv));
389   PetscCall(VecGetKokkosView(yy,&yv));
390   PetscCall(VecGetKokkosViewWrite(zz,&zv));
391   if (zz != yy) Kokkos::deep_copy(zv,yv);
392   if (A->form_explicit_transpose) {
393     PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A,&csrmat));
394     mode = "N";
395   } else {
396     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
397     csrmat = &aijkok->csrmat;
398     mode = "C";
399   }
400   KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,1.0/*beta*/,zv); /* z = alpha A^H x + beta z */
401   PetscCall(VecRestoreKokkosView(xx,&xv));
402   PetscCall(VecRestoreKokkosView(yy,&yv));
403   PetscCall(VecRestoreKokkosViewWrite(zz,&zv));
404   PetscCall(PetscLogGpuFlops(2.0*csrmat->nnz()));
405   PetscCall(PetscLogGpuTimeEnd());
406   PetscFunctionReturn(0);
407 }
408 
409 PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A,MatOption op,PetscBool flg)
410 {
411   Mat_SeqAIJKokkos          *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
412 
413   PetscFunctionBegin;
414   switch (op) {
415     case MAT_FORM_EXPLICIT_TRANSPOSE:
416       /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
417       if (A->form_explicit_transpose && !flg && aijkok) PetscCall(aijkok->DestroyMatTranspose());
418       A->form_explicit_transpose = flg;
419       break;
420     default:
421       PetscCall(MatSetOption_SeqAIJ(A,op,flg));
422       break;
423   }
424   PetscFunctionReturn(0);
425 }
426 
427 /* Depending on reuse, either build a new mat, or use the existing mat */
428 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
429 {
430   Mat_SeqAIJ       *aseq;
431 
432   PetscFunctionBegin;
433   PetscCall(PetscKokkosInitializeCheck());
434   if (reuse == MAT_INITIAL_MATRIX) { /* Build a brand new mat */
435     PetscCall(MatDuplicate(A,MAT_COPY_VALUES,newmat)); /* the returned newmat is a SeqAIJKokkos */
436   } else if (reuse == MAT_REUSE_MATRIX) { /* Reuse the mat created before */
437     PetscCall(MatCopy(A,*newmat,SAME_NONZERO_PATTERN)); /* newmat is already a SeqAIJKokkos */
438   } else if (reuse == MAT_INPLACE_MATRIX) { /* newmat is A */
439     PetscCheck(A == *newmat,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"A != *newmat with MAT_INPLACE_MATRIX");
440     PetscCall(PetscFree(A->defaultvectype));
441     PetscCall(PetscStrallocpy(VECKOKKOS,&A->defaultvectype)); /* Allocate and copy the string */
442     PetscCall(PetscObjectChangeTypeName((PetscObject)A,MATSEQAIJKOKKOS));
443     PetscCall(MatSetOps_SeqAIJKokkos(A));
444     aseq = static_cast<Mat_SeqAIJ*>(A->data);
445     if (A->assembled) { /* Copy i, j (but not values) to device for an assembled matrix if not yet */
446       PetscCheck(!A->spptr,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Expect NULL (Mat_SeqAIJKokkos*)A->spptr");
447       A->spptr = new Mat_SeqAIJKokkos(A->rmap->n,A->cmap->n,aseq->nz,aseq->i,aseq->j,aseq->a,A->nonzerostate,PETSC_FALSE);
448     }
449   }
450   PetscFunctionReturn(0);
451 }
452 
453 /* MatDuplicate always creates a new matrix. MatDuplicate can be called either on an assembled matrix or
454    an unassembled matrix, even though MAT_COPY_VALUES is not allowed for unassembled matrix.
455  */
456 static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A,MatDuplicateOption dupOption,Mat *B)
457 {
458   Mat_SeqAIJ            *bseq;
459   Mat_SeqAIJKokkos      *akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr),*bkok;
460   Mat                   mat;
461 
462   PetscFunctionBegin;
463   /* Do not copy values on host as A's latest values might be on device. We don't want to do sync blindly */
464   PetscCall(MatDuplicate_SeqAIJ(A,MAT_DO_NOT_COPY_VALUES,B));
465   mat  = *B;
466   if (A->assembled) {
467     bseq = static_cast<Mat_SeqAIJ*>(mat->data);
468     bkok = new Mat_SeqAIJKokkos(mat->rmap->n,mat->cmap->n,bseq->nz,bseq->i,bseq->j,bseq->a,mat->nonzerostate,PETSC_FALSE);
469     bkok->a_dual.clear_sync_state(); /* Clear B's sync state as it will be decided below */
470     /* Now copy values to B if needed */
471     if (dupOption == MAT_COPY_VALUES) {
472       if (akok->a_dual.need_sync_device()) {
473         Kokkos::deep_copy(bkok->a_dual.view_host(),akok->a_dual.view_host());
474         bkok->a_dual.modify_host();
475       } else { /* If device has the latest data, we only copy data on device */
476         Kokkos::deep_copy(bkok->a_dual.view_device(),akok->a_dual.view_device());
477         bkok->a_dual.modify_device();
478       }
479     } else { /* MAT_DO_NOT_COPY_VALUES or MAT_SHARE_NONZERO_PATTERN. B's values should be zeroed */
480       /* B's values on host should be already zeroed by MatDuplicate_SeqAIJ() */
481       bkok->a_dual.modify_host();
482     }
483     mat->spptr = bkok;
484   }
485 
486   PetscCall(PetscFree(mat->defaultvectype));
487   PetscCall(PetscStrallocpy(VECKOKKOS,&mat->defaultvectype)); /* Allocate and copy the string */
488   PetscCall(PetscObjectChangeTypeName((PetscObject)mat,MATSEQAIJKOKKOS));
489   PetscCall(MatSetOps_SeqAIJKokkos(mat));
490   PetscFunctionReturn(0);
491 }
492 
493 static PetscErrorCode MatTranspose_SeqAIJKokkos(Mat A,MatReuse reuse,Mat *B)
494 {
495   Mat               At;
496   KokkosCsrMatrix   *internT;
497   Mat_SeqAIJKokkos  *atkok,*bkok;
498 
499   PetscFunctionBegin;
500   PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A,&internT)); /* Generate a transpose internally */
501   if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_INPLACE_MATRIX) {
502     /* Deep copy internT, as we want to isolate the internal transpose */
503     PetscCallCXX(atkok = new Mat_SeqAIJKokkos(KokkosCsrMatrix("csrmat",*internT)));
504     PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PetscObjectComm((PetscObject)A),atkok,&At));
505     if (reuse == MAT_INITIAL_MATRIX) *B = At;
506     else PetscCall(MatHeaderReplace(A,&At)); /* Replace A with At inplace */
507   } else { /* MAT_REUSE_MATRIX, just need to copy values to B on device */
508     if ((*B)->assembled) {
509       bkok = static_cast<Mat_SeqAIJKokkos*>((*B)->spptr);
510       PetscCallCXX(Kokkos::deep_copy(bkok->a_dual.view_device(),internT->values));
511       PetscCall(MatSeqAIJKokkosModifyDevice(*B));
512     } else if ((*B)->preallocated) { /* It is ok for B to be only preallocated, as needed in MatTranspose_MPIAIJ */
513       Mat_SeqAIJ              *bseq = static_cast<Mat_SeqAIJ*>((*B)->data);
514       MatScalarKokkosViewHost a_h(bseq->a,internT->nnz()); /* bseq->nz = 0 if unassembled */
515       MatColIdxKokkosViewHost j_h(bseq->j,internT->nnz());
516       PetscCallCXX(Kokkos::deep_copy(a_h,internT->values));
517       PetscCallCXX(Kokkos::deep_copy(j_h,internT->graph.entries));
518     } else SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"B must be assembled or preallocated");
519   }
520   PetscFunctionReturn(0);
521 }
522 
523 static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A)
524 {
525   Mat_SeqAIJKokkos           *aijkok;
526 
527   PetscFunctionBegin;
528   if (A->factortype == MAT_FACTOR_NONE) {
529     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
530     delete aijkok;
531   } else {
532     delete static_cast<Mat_SeqAIJKokkosTriFactors*>(A->spptr);
533   }
534   A->spptr = NULL;
535   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL));
536   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL));
537   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL));
538   PetscCall(MatDestroy_SeqAIJ(A));
539   PetscFunctionReturn(0);
540 }
541 
542 /*MC
543    MATSEQAIJKOKKOS - MATAIJKOKKOS = "(seq)aijkokkos" - A matrix type to be used for sparse matrices with Kokkos
544 
545    A matrix type type using Kokkos-Kernels CrsMatrix type for portability across different device types
546 
547    Options Database Keys:
548 .  -mat_type aijkokkos - sets the matrix type to "aijkokkos" during a call to MatSetFromOptions()
549 
550   Level: beginner
551 
552 .seealso: MatCreateSeqAIJKokkos(), MATMPIAIJKOKKOS
553 M*/
554 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A)
555 {
556   PetscFunctionBegin;
557   PetscCall(PetscKokkosInitializeCheck());
558   PetscCall(MatCreate_SeqAIJ(A));
559   PetscCall(MatConvert_SeqAIJ_SeqAIJKokkos(A,MATSEQAIJKOKKOS,MAT_INPLACE_MATRIX,&A));
560   PetscFunctionReturn(0);
561 }
562 
563 /* Merge A, B into a matrix C. A is put before B. C's size would be A->rmap->n by (A->cmap->n + B->cmap->n) */
564 PetscErrorCode MatSeqAIJKokkosMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C)
565 {
566   Mat_SeqAIJ                   *a,*b;
567   Mat_SeqAIJKokkos             *akok,*bkok,*ckok;
568   MatScalarKokkosView          aa,ba,ca;
569   MatRowMapKokkosView          ai,bi,ci;
570   MatColIdxKokkosView          aj,bj,cj;
571   PetscInt                     m,n,nnz,aN;
572 
573   PetscFunctionBegin;
574   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
575   PetscValidHeaderSpecific(B,MAT_CLASSID,2);
576   PetscValidPointer(C,4);
577   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
578   PetscCheckTypeName(B,MATSEQAIJKOKKOS);
579   PetscCheck(A->rmap->n == B->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT,A->rmap->n,B->rmap->n);
580   PetscCheck(reuse != MAT_INPLACE_MATRIX,PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported");
581 
582   PetscCall(MatSeqAIJKokkosSyncDevice(A));
583   PetscCall(MatSeqAIJKokkosSyncDevice(B));
584   a    = static_cast<Mat_SeqAIJ*>(A->data);
585   b    = static_cast<Mat_SeqAIJ*>(B->data);
586   akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
587   bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
588   aa   = akok->a_dual.view_device();
589   ai   = akok->i_dual.view_device();
590   ba   = bkok->a_dual.view_device();
591   bi   = bkok->i_dual.view_device();
592   m    = A->rmap->n; /* M, N and nnz of C */
593   n    = A->cmap->n + B->cmap->n;
594   nnz  = a->nz + b->nz;
595   aN   = A->cmap->n; /* N of A */
596   if (reuse == MAT_INITIAL_MATRIX) {
597     aj = akok->j_dual.view_device();
598     bj = bkok->j_dual.view_device();
599     auto ca_dual = MatScalarKokkosDualView("a",aa.extent(0)+ba.extent(0));
600     auto ci_dual = MatRowMapKokkosDualView("i",ai.extent(0));
601     auto cj_dual = MatColIdxKokkosDualView("j",aj.extent(0)+bj.extent(0));
602     ca = ca_dual.view_device();
603     ci = ci_dual.view_device();
604     cj = cj_dual.view_device();
605 
606     /* Concatenate A and B in parallel using Kokkos hierarchical parallelism */
607     Kokkos::parallel_for(Kokkos::TeamPolicy<>(m, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) {
608       PetscInt i = t.league_rank(); /* row i */
609       PetscInt coffset = ai(i) + bi(i), alen = ai(i+1)-ai(i), blen = bi(i+1)-bi(i);
610 
611       Kokkos::single(Kokkos::PerTeam(t), [=]() { /* this side effect only happens once per whole team */
612         ci(i) = coffset;
613         if (i == m-1) ci(m) = ai(m) + bi(m);
614       });
615 
616       Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen+blen), [&](PetscInt k) {
617         if (k < alen) {
618           ca(coffset+k) = aa(ai(i)+k);
619           cj(coffset+k) = aj(ai(i)+k);
620         } else {
621           ca(coffset+k) = ba(bi(i)+k-alen);
622           cj(coffset+k) = bj(bi(i)+k-alen) + aN; /* Entries in B get new column indices in C */
623         }
624       });
625     });
626     ca_dual.modify_device();
627     ci_dual.modify_device();
628     cj_dual.modify_device();
629     PetscCallCXX(ckok = new Mat_SeqAIJKokkos(m,n,nnz,ci_dual,cj_dual,ca_dual));
630     PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,ckok,C));
631   } else if (reuse == MAT_REUSE_MATRIX) {
632     PetscValidHeaderSpecific(*C,MAT_CLASSID,4);
633     PetscCheckTypeName(*C,MATSEQAIJKOKKOS);
634     ckok = static_cast<Mat_SeqAIJKokkos*>((*C)->spptr);
635     ca   = ckok->a_dual.view_device();
636     ci   = ckok->i_dual.view_device();
637 
638     Kokkos::parallel_for(Kokkos::TeamPolicy<>(m, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) {
639       PetscInt i = t.league_rank(); /* row i */
640       PetscInt alen = ai(i+1)-ai(i), blen = bi(i+1)-bi(i);
641       Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen+blen), [&](PetscInt k) {
642         if (k < alen) ca(ci(i)+k) = aa(ai(i)+k);
643         else          ca(ci(i)+k) = ba(bi(i)+k-alen);
644       });
645     });
646     PetscCall(MatSeqAIJKokkosModifyDevice(*C));
647   }
648   PetscFunctionReturn(0);
649 }
650 
651 static PetscErrorCode MatProductDataDestroy_SeqAIJKokkos(void* pdata)
652 {
653   PetscFunctionBegin;
654   delete static_cast<MatProductData_SeqAIJKokkos*>(pdata);
655   PetscFunctionReturn(0);
656 }
657 
658 static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C)
659 {
660   Mat_Product                    *product = C->product;
661   Mat                            A,B;
662   bool                           transA,transB; /* use bool, since KK needs this type */
663   Mat_SeqAIJKokkos               *akok,*bkok,*ckok;
664   Mat_SeqAIJ                     *c;
665   MatProductData_SeqAIJKokkos    *pdata;
666   KokkosCsrMatrix                *csrmatA,*csrmatB;
667 
668   PetscFunctionBegin;
669   MatCheckProduct(C,1);
670   PetscCheck(C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty");
671   pdata = static_cast<MatProductData_SeqAIJKokkos*>(C->product->data);
672 
673   if (pdata->reusesym) { /* We reached here through e.g., MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C), where symbolic/numeric are combined */
674     pdata->reusesym = PETSC_FALSE; /* So that next time when user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C), we still do numeric  */
675     PetscFunctionReturn(0);
676   }
677 
678   switch (product->type) {
679     case MATPRODUCT_AB:  transA = false; transB = false; break;
680     case MATPRODUCT_AtB: transA = true;  transB = false; break;
681     case MATPRODUCT_ABt: transA = false; transB = true;  break;
682     default:
683       SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
684   }
685 
686   A     = product->A;
687   B     = product->B;
688   PetscCall(MatSeqAIJKokkosSyncDevice(A));
689   PetscCall(MatSeqAIJKokkosSyncDevice(B));
690   akok  = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
691   bkok  = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
692   ckok  = static_cast<Mat_SeqAIJKokkos*>(C->spptr);
693 
694   PetscCheck(ckok,PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Device data structure spptr is empty");
695 
696   csrmatA = &akok->csrmat;
697   csrmatB = &bkok->csrmat;
698 
699   /* TODO: Once KK spgemm implements transpose, we can get rid of the explicit transpose here */
700   if (transA) {
701     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmatA));
702     transA = false;
703   }
704 
705   if (transB) {
706     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B,&csrmatB));
707     transB = false;
708   }
709   PetscCall(PetscLogGpuTimeBegin());
710   PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh,*csrmatA,transA,*csrmatB,transB,ckok->csrmat));
711   PetscCallCXX(KokkosKernels::sort_crs_matrix(ckok->csrmat)); /* without the sort, mat_tests-ex62_14_seqaijkokkos failed */
712   PetscCall(PetscLogGpuTimeEnd());
713   PetscCall(MatSeqAIJKokkosModifyDevice(C));
714   /* shorter version of MatAssemblyEnd_SeqAIJ */
715   c = (Mat_SeqAIJ*)C->data;
716   PetscCall(PetscInfo(C,"Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n",C->rmap->n,C->cmap->n,c->nz));
717   PetscCall(PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n"));
718   PetscCall(PetscInfo(C,"Maximum nonzeros in any row is %" PetscInt_FMT "\n",c->rmax));
719   c->reallocs         = 0;
720   C->info.mallocs     = 0;
721   C->info.nz_unneeded = 0;
722   C->assembled        = C->was_assembled = PETSC_TRUE;
723   C->num_ass++;
724   PetscFunctionReturn(0);
725 }
726 
727 static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C)
728 {
729   Mat_Product                    *product = C->product;
730   MatProductType                 ptype;
731   Mat                            A,B;
732   bool                           transA,transB;
733   Mat_SeqAIJKokkos               *akok,*bkok,*ckok;
734   MatProductData_SeqAIJKokkos    *pdata;
735   MPI_Comm                       comm;
736   KokkosCsrMatrix                *csrmatA,*csrmatB,csrmatC;
737 
738   PetscFunctionBegin;
739   MatCheckProduct(C,1);
740   PetscCall(PetscObjectGetComm((PetscObject)C,&comm));
741   PetscCheck(!product->data,comm,PETSC_ERR_PLIB,"Product data not empty");
742   A       = product->A;
743   B       = product->B;
744   PetscCall(MatSeqAIJKokkosSyncDevice(A));
745   PetscCall(MatSeqAIJKokkosSyncDevice(B));
746   akok    = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
747   bkok    = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
748   csrmatA = &akok->csrmat;
749   csrmatB = &bkok->csrmat;
750 
751   ptype   = product->type;
752   switch (ptype) {
753     case MATPRODUCT_AB:  transA = false; transB = false; break;
754     case MATPRODUCT_AtB: transA = true;  transB = false; break;
755     case MATPRODUCT_ABt: transA = false; transB = true;  break;
756     default:
757       SETERRQ(comm,PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
758   }
759 
760   product->data = pdata = new MatProductData_SeqAIJKokkos();
761   pdata->kh.set_team_work_size(16);
762   pdata->kh.set_dynamic_scheduling(true);
763   pdata->reusesym = product->api_user;
764 
765   /* TODO: add command line options to select spgemm algorithms */
766   auto spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_KK;
767   /* CUDA-10.2's spgemm has bugs. As as of 2022-01-19, KK does not support CUDA-11.x's newer spgemm API.
768      We can default to SPGEMMAlgorithm::SPGEMM_CUSPARSE with CUDA-11+ when KK adds the support.
769    */
770   pdata->kh.create_spgemm_handle(spgemm_alg);
771 
772   PetscCall(PetscLogGpuTimeBegin());
773   /* TODO: Get rid of the explicit transpose once KK-spgemm implements the transpose option */
774   if (transA) {
775     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmatA));
776     transA = false;
777   }
778 
779   if (transB) {
780     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B,&csrmatB));
781     transB = false;
782   }
783 
784   PetscCallCXX(KokkosSparse::spgemm_symbolic(pdata->kh,*csrmatA,transA,*csrmatB,transB,csrmatC));
785   /* spgemm_symbolic() only populates C's rowmap, but not C's column indices.
786     So we have to do a fake spgemm_numeric() here to get csrmatC.j_d setup, before
787     calling new Mat_SeqAIJKokkos().
788     TODO: Remove the fake spgemm_numeric() after KK fixed this problem.
789   */
790   PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh,*csrmatA,transA,*csrmatB,transB,csrmatC));
791   PetscCallCXX(KokkosKernels::sort_crs_matrix(csrmatC));
792   PetscCall(PetscLogGpuTimeEnd());
793 
794   PetscCallCXX(ckok = new Mat_SeqAIJKokkos(csrmatC));
795   PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(C,ckok));
796   C->product->destroy = MatProductDataDestroy_SeqAIJKokkos;
797   PetscFunctionReturn(0);
798 }
799 
800 /* handles sparse matrix matrix ops */
801 static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat)
802 {
803   Mat_Product    *product = mat->product;
804   PetscBool      Biskok = PETSC_FALSE,Ciskok = PETSC_TRUE;
805 
806   PetscFunctionBegin;
807   MatCheckProduct(mat,1);
808   PetscCall(PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJKOKKOS,&Biskok));
809   if (product->type == MATPRODUCT_ABC) {
810     PetscCall(PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJKOKKOS,&Ciskok));
811   }
812   if (Biskok && Ciskok) {
813     switch (product->type) {
814     case MATPRODUCT_AB:
815     case MATPRODUCT_AtB:
816     case MATPRODUCT_ABt:
817       mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos;
818       break;
819     case MATPRODUCT_PtAP:
820     case MATPRODUCT_RARt:
821     case MATPRODUCT_ABC:
822       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
823       break;
824     default:
825       break;
826     }
827   } else { /* fallback for AIJ */
828     PetscCall(MatProductSetFromOptions_SeqAIJ(mat));
829   }
830   PetscFunctionReturn(0);
831 }
832 
833 static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a)
834 {
835   Mat_SeqAIJKokkos *aijkok;
836 
837   PetscFunctionBegin;
838   PetscCall(PetscLogGpuTimeBegin());
839   PetscCall(MatSeqAIJKokkosSyncDevice(A));
840   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
841   KokkosBlas::scal(aijkok->a_dual.view_device(),a,aijkok->a_dual.view_device());
842   PetscCall(MatSeqAIJKokkosModifyDevice(A));
843   PetscCall(PetscLogGpuFlops(aijkok->a_dual.extent(0)));
844   PetscCall(PetscLogGpuTimeEnd());
845   PetscFunctionReturn(0);
846 }
847 
848 static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A)
849 {
850   Mat_SeqAIJKokkos *aijkok;
851 
852   PetscFunctionBegin;
853   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
854   if (aijkok) { /* Only zero the device if data is already there */
855     KokkosBlas::fill(aijkok->a_dual.view_device(),0.0);
856     PetscCall(MatSeqAIJKokkosModifyDevice(A));
857   } else { /* Might be preallocated but not assembled */
858     PetscCall(MatZeroEntries_SeqAIJ(A));
859   }
860   PetscFunctionReturn(0);
861 }
862 
863 /* Get a Kokkos View from a mat of type MatSeqAIJKokkos */
864 PetscErrorCode MatSeqAIJGetKokkosView(Mat A,ConstMatScalarKokkosView* kv)
865 {
866   Mat_SeqAIJKokkos   *aijkok;
867 
868   PetscFunctionBegin;
869   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
870   PetscValidPointer(kv,2);
871   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
872   PetscCall(MatSeqAIJKokkosSyncDevice(A));
873   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
874   *kv    = aijkok->a_dual.view_device();
875   PetscFunctionReturn(0);
876 }
877 
878 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A,ConstMatScalarKokkosView* kv)
879 {
880   PetscFunctionBegin;
881   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
882   PetscValidPointer(kv,2);
883   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
884   PetscFunctionReturn(0);
885 }
886 
887 PetscErrorCode MatSeqAIJGetKokkosView(Mat A,MatScalarKokkosView* kv)
888 {
889   Mat_SeqAIJKokkos   *aijkok;
890 
891   PetscFunctionBegin;
892   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
893   PetscValidPointer(kv,2);
894   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
895   PetscCall(MatSeqAIJKokkosSyncDevice(A));
896   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
897   *kv    = aijkok->a_dual.view_device();
898   PetscFunctionReturn(0);
899 }
900 
901 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A,MatScalarKokkosView* kv)
902 {
903   PetscFunctionBegin;
904   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
905   PetscValidPointer(kv,2);
906   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
907   PetscCall(MatSeqAIJKokkosModifyDevice(A));
908   PetscFunctionReturn(0);
909 }
910 
911 PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A,MatScalarKokkosView* kv)
912 {
913   Mat_SeqAIJKokkos   *aijkok;
914 
915   PetscFunctionBegin;
916   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
917   PetscValidPointer(kv,2);
918   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
919   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
920   *kv    = aijkok->a_dual.view_device();
921   PetscFunctionReturn(0);
922 }
923 
924 PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A,MatScalarKokkosView* kv)
925 {
926   PetscFunctionBegin;
927   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
928   PetscValidPointer(kv,2);
929   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
930   PetscCall(MatSeqAIJKokkosModifyDevice(A));
931   PetscFunctionReturn(0);
932 }
933 
934 /* Computes Y += alpha X */
935 static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y,PetscScalar alpha,Mat X,MatStructure pattern)
936 {
937   Mat_SeqAIJ                 *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data;
938   Mat_SeqAIJKokkos           *xkok,*ykok,*zkok;
939   ConstMatScalarKokkosView   Xa;
940   MatScalarKokkosView        Ya;
941 
942   PetscFunctionBegin;
943   PetscCheckTypeName(Y,MATSEQAIJKOKKOS);
944   PetscCheckTypeName(X,MATSEQAIJKOKKOS);
945   PetscCall(MatSeqAIJKokkosSyncDevice(Y));
946   PetscCall(MatSeqAIJKokkosSyncDevice(X));
947   PetscCall(PetscLogGpuTimeBegin());
948 
949   if (pattern != SAME_NONZERO_PATTERN && x->nz == y->nz) {
950     /* We could compare on device, but have to get the comparison result on host. So compare on host instead. */
951     PetscBool e;
952     PetscCall(PetscArraycmp(x->i,y->i,Y->rmap->n+1,&e));
953     if (e) {
954       PetscCall(PetscArraycmp(x->j,y->j,y->nz,&e));
955       if (e) pattern = SAME_NONZERO_PATTERN;
956     }
957   }
958 
959   /* cusparseDcsrgeam2() computes C = alpha A + beta B. If one knew sparsity pattern of C, one can skip
960     cusparseScsrgeam2_bufferSizeExt() / cusparseXcsrgeam2Nnz(), and directly call cusparseScsrgeam2().
961     If X is SUBSET_NONZERO_PATTERN of Y, we could take advantage of this cusparse feature. However,
962     KokkosSparse::spadd(alpha,A,beta,B,C) has symbolic and numeric phases, MatAXPY does not.
963   */
964   ykok = static_cast<Mat_SeqAIJKokkos*>(Y->spptr);
965   xkok = static_cast<Mat_SeqAIJKokkos*>(X->spptr);
966   Xa   = xkok->a_dual.view_device();
967   Ya   = ykok->a_dual.view_device();
968 
969   if (pattern == SAME_NONZERO_PATTERN) {
970     KokkosBlas::axpy(alpha,Xa,Ya);
971     PetscCall(MatSeqAIJKokkosModifyDevice(Y));
972   } else if (pattern == SUBSET_NONZERO_PATTERN) {
973     MatRowMapKokkosView  Xi = xkok->i_dual.view_device(),Yi = ykok->i_dual.view_device();
974     MatColIdxKokkosView  Xj = xkok->j_dual.view_device(),Yj = ykok->j_dual.view_device();
975 
976     Kokkos::parallel_for(Kokkos::TeamPolicy<>(Y->rmap->n, 1),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) {
977       PetscInt i = t.league_rank(); /* row i */
978       Kokkos::single(Kokkos::PerTeam(t), [=] () { /* Only one thread works in a team */
979         PetscInt p,q = Yi(i);
980         for (p=Xi(i); p<Xi(i+1); p++) { /* For each nonzero on row i of X */
981           while (Xj(p) != Yj(q) && q < Yi(i+1)) q++; /* find the matching nonzero on row i of Y */
982           if (Xj(p) == Yj(q)) { /* Found it */
983             Ya(q) += alpha * Xa(p);
984             q++;
985           } else {
986             /* If not found, it indicates the input is wrong (X is not a SUBSET_NONZERO_PATTERN of Y).
987                Just insert a NaN at the beginning of row i if it is not empty, to make the result wrong.
988             */
989             if (Yi(i) != Yi(i+1)) Ya(Yi(i)) = Kokkos::Experimental::nan("1"); /* auto promote the double NaN if needed */
990           }
991         }
992       });
993     });
994     PetscCall(MatSeqAIJKokkosModifyDevice(Y));
995   } else { /* different nonzero patterns */
996     Mat             Z;
997     KokkosCsrMatrix zcsr;
998     KernelHandle    kh;
999     kh.create_spadd_handle(false);
1000     KokkosSparse::spadd_symbolic(&kh,xkok->csrmat,ykok->csrmat,zcsr);
1001     KokkosSparse::spadd_numeric(&kh,alpha,xkok->csrmat,(PetscScalar)1.0,ykok->csrmat,zcsr);
1002     zkok = new Mat_SeqAIJKokkos(zcsr);
1003     PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,zkok,&Z));
1004     PetscCall(MatHeaderReplace(Y,&Z));
1005     kh.destroy_spadd_handle();
1006   }
1007   PetscCall(PetscLogGpuTimeEnd());
1008   PetscCall(PetscLogGpuFlops(xkok->a_dual.extent(0)*2)); /* Because we scaled X and then added it to Y */
1009   PetscFunctionReturn(0);
1010 }
1011 
1012 static PetscErrorCode MatSetPreallocationCOO_SeqAIJKokkos(Mat mat, PetscCount coo_n, const PetscInt coo_i[], const PetscInt coo_j[])
1013 {
1014   Mat_SeqAIJKokkos *akok;
1015   Mat_SeqAIJ       *aseq;
1016 
1017   PetscFunctionBegin;
1018   PetscCall(MatSetPreallocationCOO_SeqAIJ(mat,coo_n,coo_i,coo_j));
1019   aseq = static_cast<Mat_SeqAIJ*>(mat->data);
1020   akok = static_cast<Mat_SeqAIJKokkos*>(mat->spptr);
1021   delete akok;
1022   mat->spptr = akok = new Mat_SeqAIJKokkos(mat->rmap->n,mat->cmap->n,aseq->nz,aseq->i,aseq->j,aseq->a,mat->nonzerostate+1,PETSC_FALSE);
1023   PetscCall(MatZeroEntries_SeqAIJKokkos(mat));
1024   akok->SetUpCOO(aseq);
1025   PetscFunctionReturn(0);
1026 }
1027 
1028 static PetscErrorCode MatSetValuesCOO_SeqAIJKokkos(Mat A,const PetscScalar v[],InsertMode imode)
1029 {
1030   Mat_SeqAIJ                  *aseq = static_cast<Mat_SeqAIJ*>(A->data);
1031   Mat_SeqAIJKokkos            *akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
1032   PetscCount                  Annz = aseq->nz;
1033   const PetscCountKokkosView& jmap = akok->jmap_d;
1034   const PetscCountKokkosView& perm = akok->perm_d;
1035   MatScalarKokkosView         Aa;
1036   ConstMatScalarKokkosView    kv;
1037   PetscMemType                memtype;
1038 
1039   PetscFunctionBegin;
1040   PetscCall(PetscGetMemType(v,&memtype));
1041   if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */
1042     kv = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(),ConstMatScalarKokkosViewHost(v,aseq->coo_n));
1043   } else {
1044     kv = ConstMatScalarKokkosView(v,aseq->coo_n); /* Directly use v[]'s memory */
1045   }
1046 
1047   if (imode == INSERT_VALUES) {
1048     PetscCall(MatSeqAIJGetKokkosViewWrite(A,&Aa)); /* write matrix values */
1049     Kokkos::deep_copy(Aa,0.0); /* Zero matrix values since INSERT_VALUES still requires summing replicated values in v[] */
1050   } else PetscCall(MatSeqAIJGetKokkosView(A,&Aa)); /* read & write matrix values */
1051 
1052   Kokkos::parallel_for(Annz,KOKKOS_LAMBDA(const PetscCount i) {for (PetscCount k=jmap(i); k<jmap(i+1); k++) Aa(i) += kv(perm(k));});
1053 
1054   if (imode == INSERT_VALUES) PetscCall(MatSeqAIJRestoreKokkosViewWrite(A,&Aa));
1055   else PetscCall(MatSeqAIJRestoreKokkosView(A,&Aa));
1056   PetscFunctionReturn(0);
1057 }
1058 
1059 static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B,Mat A,const MatFactorInfo *info)
1060 {
1061   PetscFunctionBegin;
1062   PetscCall(MatSeqAIJKokkosSyncHost(A));
1063   PetscCall(MatLUFactorNumeric_SeqAIJ(B,A,info));
1064   B->offloadmask = PETSC_OFFLOAD_CPU;
1065   PetscFunctionReturn(0);
1066 }
1067 
1068 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A)
1069 {
1070   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
1071 
1072   PetscFunctionBegin;
1073   A->offloadmask = PETSC_OFFLOAD_KOKKOS; /* We do not really use this flag */
1074   A->boundtocpu  = PETSC_FALSE;
1075 
1076   A->ops->assemblyend               = MatAssemblyEnd_SeqAIJKokkos;
1077   A->ops->destroy                   = MatDestroy_SeqAIJKokkos;
1078   A->ops->duplicate                 = MatDuplicate_SeqAIJKokkos;
1079   A->ops->axpy                      = MatAXPY_SeqAIJKokkos;
1080   A->ops->scale                     = MatScale_SeqAIJKokkos;
1081   A->ops->zeroentries               = MatZeroEntries_SeqAIJKokkos;
1082   A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJKokkos;
1083   A->ops->mult                      = MatMult_SeqAIJKokkos;
1084   A->ops->multadd                   = MatMultAdd_SeqAIJKokkos;
1085   A->ops->multtranspose             = MatMultTranspose_SeqAIJKokkos;
1086   A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJKokkos;
1087   A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJKokkos;
1088   A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos;
1089   A->ops->productnumeric            = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos;
1090   A->ops->transpose                 = MatTranspose_SeqAIJKokkos;
1091   A->ops->setoption                 = MatSetOption_SeqAIJKokkos;
1092   a->ops->getarray                  = MatSeqAIJGetArray_SeqAIJKokkos;
1093   a->ops->restorearray              = MatSeqAIJRestoreArray_SeqAIJKokkos;
1094   a->ops->getarrayread              = MatSeqAIJGetArrayRead_SeqAIJKokkos;
1095   a->ops->restorearrayread          = MatSeqAIJRestoreArrayRead_SeqAIJKokkos;
1096   a->ops->getarraywrite             = MatSeqAIJGetArrayWrite_SeqAIJKokkos;
1097   a->ops->restorearraywrite         = MatSeqAIJRestoreArrayWrite_SeqAIJKokkos;
1098 
1099   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJKokkos));
1100   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_SeqAIJKokkos));
1101   PetscFunctionReturn(0);
1102 }
1103 
1104 PETSC_INTERN PetscErrorCode  MatSetSeqAIJKokkosWithCSRMatrix(Mat A,Mat_SeqAIJKokkos *akok)
1105 {
1106   Mat_SeqAIJ *aseq;
1107   PetscInt    i,m,n;
1108 
1109   PetscFunctionBegin;
1110   PetscCheck(!A->spptr,PETSC_COMM_SELF,PETSC_ERR_PLIB,"A->spptr is supposed to be empty");
1111 
1112   m = akok->nrows();
1113   n = akok->ncols();
1114   PetscCall(MatSetSizes(A,m,n,m,n));
1115   PetscCall(MatSetType(A,MATSEQAIJKOKKOS));
1116 
1117   /* Set up data structures of A as a MATSEQAIJ */
1118   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(A,MAT_SKIP_ALLOCATION,NULL));
1119   aseq = (Mat_SeqAIJ*)(A)->data;
1120 
1121   akok->i_dual.sync_host(); /* We always need sync'ed i, j on host */
1122   akok->j_dual.sync_host();
1123 
1124   aseq->i            = akok->i_host_data();
1125   aseq->j            = akok->j_host_data();
1126   aseq->a            = akok->a_host_data();
1127   aseq->nonew        = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/
1128   aseq->singlemalloc = PETSC_FALSE;
1129   aseq->free_a       = PETSC_FALSE;
1130   aseq->free_ij      = PETSC_FALSE;
1131   aseq->nz           = akok->nnz();
1132   aseq->maxnz        = aseq->nz;
1133 
1134   PetscCall(PetscMalloc1(m,&aseq->imax));
1135   PetscCall(PetscMalloc1(m,&aseq->ilen));
1136   for (i=0; i<m; i++) {
1137     aseq->ilen[i] = aseq->imax[i] = aseq->i[i+1] - aseq->i[i];
1138   }
1139 
1140   /* It is critical to set the nonzerostate, as we use it to check if sparsity pattern (hence data) has changed on host in MatAssemblyEnd */
1141   akok->nonzerostate = A->nonzerostate;
1142   A->spptr = akok; /* Set A->spptr before MatAssembly so that A->spptr won't be allocated again there */
1143   PetscCall(MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY));
1144   PetscCall(MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY));
1145   PetscFunctionReturn(0);
1146 }
1147 
1148 /* Crete a SEQAIJKOKKOS matrix with a Mat_SeqAIJKokkos data structure
1149 
1150    Note we have names like MatSeqAIJSetPreallocationCSR, so I use capitalized CSR
1151  */
1152 PETSC_INTERN PetscErrorCode  MatCreateSeqAIJKokkosWithCSRMatrix(MPI_Comm comm,Mat_SeqAIJKokkos *akok,Mat *A)
1153 {
1154   PetscFunctionBegin;
1155   PetscCall(MatCreate(comm,A));
1156   PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A,akok));
1157   PetscFunctionReturn(0);
1158 }
1159 
1160 /* --------------------------------------------------------------------------------*/
1161 /*@C
1162    MatCreateSeqAIJKokkos - Creates a sparse matrix in AIJ (compressed row) format
1163    (the default parallel PETSc format). This matrix will ultimately be handled by
1164    Kokkos for calculations. For good matrix
1165    assembly performance the user should preallocate the matrix storage by setting
1166    the parameter nz (or the array nnz).  By setting these parameters accurately,
1167    performance during matrix assembly can be increased by more than a factor of 50.
1168 
1169    Collective
1170 
1171    Input Parameters:
1172 +  comm - MPI communicator, set to PETSC_COMM_SELF
1173 .  m - number of rows
1174 .  n - number of columns
1175 .  nz - number of nonzeros per row (same for all rows)
1176 -  nnz - array containing the number of nonzeros in the various rows
1177          (possibly different for each row) or NULL
1178 
1179    Output Parameter:
1180 .  A - the matrix
1181 
1182    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
1183    MatXXXXSetPreallocation() paradgm instead of this routine directly.
1184    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
1185 
1186    Notes:
1187    If nnz is given then nz is ignored
1188 
1189    The AIJ format (also called the Yale sparse matrix format or
1190    compressed row storage), is fully compatible with standard Fortran 77
1191    storage.  That is, the stored row and column indices can begin at
1192    either one (as in Fortran) or zero.  See the users' manual for details.
1193 
1194    Specify the preallocated storage with either nz or nnz (not both).
1195    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
1196    allocation.  For large problems you MUST preallocate memory or you
1197    will get TERRIBLE performance, see the users' manual chapter on matrices.
1198 
1199    By default, this format uses inodes (identical nodes) when possible, to
1200    improve numerical efficiency of matrix-vector products and solves. We
1201    search for consecutive rows with the same nonzero structure, thereby
1202    reusing matrix information to achieve increased efficiency.
1203 
1204    Level: intermediate
1205 
1206 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ()
1207 @*/
1208 PetscErrorCode  MatCreateSeqAIJKokkos(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
1209 {
1210   PetscFunctionBegin;
1211   PetscCall(PetscKokkosInitializeCheck());
1212   PetscCall(MatCreate(comm,A));
1213   PetscCall(MatSetSizes(*A,m,n,m,n));
1214   PetscCall(MatSetType(*A,MATSEQAIJKOKKOS));
1215   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz));
1216   PetscFunctionReturn(0);
1217 }
1218 
1219 typedef Kokkos::TeamPolicy<>::member_type team_member;
1220 //
1221 // This factorization exploits block diagonal matrices with "Nf" (not used).
1222 // Use -pc_factor_mat_ordering_type rcm to order decouple blocks of size N/Nf for this optimization
1223 //
1224 static PetscErrorCode MatLUFactorNumeric_SeqAIJKOKKOSDEVICE(Mat B,Mat A,const MatFactorInfo *info)
1225 {
1226   Mat_SeqAIJ         *b=(Mat_SeqAIJ*)B->data;
1227   Mat_SeqAIJKokkos   *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr), *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
1228   IS                 isrow = b->row,isicol = b->icol;
1229   const PetscInt     *r_h,*ic_h;
1230   const PetscInt     n=A->rmap->n, *ai_d=aijkok->i_dual.view_device().data(), *aj_d=aijkok->j_dual.view_device().data(), *bi_d=baijkok->i_dual.view_device().data(), *bj_d=baijkok->j_dual.view_device().data(), *bdiag_d = baijkok->diag_d.data();
1231   const PetscScalar  *aa_d = aijkok->a_dual.view_device().data();
1232   PetscScalar        *ba_d = baijkok->a_dual.view_device().data();
1233   PetscBool          row_identity,col_identity;
1234   PetscInt           nc, Nf=1, nVec=32; // should be a parameter, Nf is batch size - not used
1235 
1236   PetscFunctionBegin;
1237   PetscCheck(A->rmap->n == n,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"square matrices only supported %" PetscInt_FMT " %" PetscInt_FMT,A->rmap->n,n);
1238   PetscCall(MatGetOption(A,MAT_STRUCTURALLY_SYMMETRIC,&row_identity));
1239   PetscCheck(row_identity,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"structurally symmetric matrices only supported");
1240   PetscCall(ISGetIndices(isrow,&r_h));
1241   PetscCall(ISGetIndices(isicol,&ic_h));
1242   PetscCall(ISGetSize(isicol,&nc));
1243   PetscCall(PetscLogGpuTimeBegin());
1244   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1245   {
1246 #define KOKKOS_SHARED_LEVEL 1
1247     using scr_mem_t = Kokkos::DefaultExecutionSpace::scratch_memory_space;
1248     using sizet_scr_t = Kokkos::View<size_t, scr_mem_t>;
1249     using scalar_scr_t = Kokkos::View<PetscScalar, scr_mem_t>;
1250     const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_r_k (r_h, n);
1251     Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_r_k ("r", n);
1252     const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_ic_k (ic_h, nc);
1253     Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_ic_k ("ic", nc);
1254     size_t flops_h = 0.0;
1255     Kokkos::View<size_t, Kokkos::HostSpace> h_flops_k (&flops_h);
1256     Kokkos::View<size_t> d_flops_k ("flops");
1257     const int conc = Kokkos::DefaultExecutionSpace().concurrency(), team_size = conc > 1 ? 16 : 1; // 8*32 = 256
1258     const int nloc = n/Nf, Ni = (conc > 8) ? 1 /* some intelegent number of SMs -- but need league_barrier */ : 1;
1259     Kokkos::deep_copy (d_flops_k, h_flops_k);
1260     Kokkos::deep_copy (d_r_k, h_r_k);
1261     Kokkos::deep_copy (d_ic_k, h_ic_k);
1262     // Fill A --> fact
1263     Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec), KOKKOS_LAMBDA (const team_member team) {
1264         const PetscInt  field = team.league_rank()/Ni, field_block = team.league_rank()%Ni; // use grid.x/y in CUDA
1265         const PetscInt  nloc_i =  (nloc/Ni + !!(nloc%Ni)), start_i = field*nloc + field_block*nloc_i, end_i = (start_i + nloc_i) > (field+1)*nloc ? (field+1)*nloc : (start_i + nloc_i);
1266         const PetscInt  *ic = d_ic_k.data(), *r = d_r_k.data();
1267         // zero rows of B
1268         Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) {
1269             PetscInt    nzbL = bi_d[rowb+1] - bi_d[rowb], nzbU = bdiag_d[rowb] - bdiag_d[rowb+1]; // with diag
1270             PetscScalar *baL = ba_d + bi_d[rowb];
1271             PetscScalar *baU = ba_d + bdiag_d[rowb+1]+1;
1272             /* zero (unfactored row) */
1273             for (int j=0;j<nzbL;j++) baL[j] = 0;
1274             for (int j=0;j<nzbU;j++) baU[j] = 0;
1275           });
1276         // copy A into B
1277         Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) {
1278             PetscInt          rowa = r[rowb], nza = ai_d[rowa+1] - ai_d[rowa];
1279             const PetscScalar *av    = aa_d + ai_d[rowa];
1280             const PetscInt    *ajtmp = aj_d + ai_d[rowa];
1281             /* load in initial (unfactored row) */
1282             for (int j=0;j<nza;j++) {
1283               PetscInt    colb = ic[ajtmp[j]];
1284               PetscScalar vala = av[j];
1285               if (colb == rowb) {
1286                 *(ba_d + bdiag_d[rowb]) = vala;
1287               } else {
1288                 const PetscInt    *pbj = bj_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]);
1289                 PetscScalar       *pba = ba_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]);
1290                 PetscInt          nz   = (colb > rowb) ? bdiag_d[rowb] - (bdiag_d[rowb+1]+1) : bi_d[rowb+1] - bi_d[rowb], set=0;
1291                 for (int j=0; j<nz ; j++) {
1292                   if (pbj[j] == colb) {
1293                     pba[j] = vala;
1294                     set++;
1295                     break;
1296                   }
1297                 }
1298                #if !defined(PETSC_HAVE_SYCL)
1299                 if (set!=1) printf("\t\t\t ERROR DID NOT SET ?????\n");
1300                #endif
1301               }
1302             }
1303           });
1304       });
1305     Kokkos::fence();
1306 
1307     Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec).set_scratch_size(KOKKOS_SHARED_LEVEL, Kokkos::PerThread(sizet_scr_t::shmem_size()+scalar_scr_t::shmem_size()), Kokkos::PerTeam(sizet_scr_t::shmem_size())), KOKKOS_LAMBDA (const team_member team) {
1308         sizet_scr_t     colkIdx(team.thread_scratch(KOKKOS_SHARED_LEVEL));
1309         scalar_scr_t    L_ki(team.thread_scratch(KOKKOS_SHARED_LEVEL));
1310         sizet_scr_t     flops(team.team_scratch(KOKKOS_SHARED_LEVEL));
1311         const PetscInt  field = team.league_rank()/Ni, field_block_idx = team.league_rank()%Ni; // use grid.x/y in CUDA
1312         const PetscInt  start = field*nloc, end = start + nloc;
1313         Kokkos::single(Kokkos::PerTeam(team), [=]() { flops() = 0; });
1314         // A22 panel update for each row A(1,:) and col A(:,1)
1315         for (int ii=start; ii<end-1; ii++) {
1316           const PetscInt    *bjUi = bj_d + bdiag_d[ii+1]+1, nzUi = bdiag_d[ii] - (bdiag_d[ii+1]+1); // vector, and vector size, of column indices of U(i,(i+1):end)
1317           const PetscScalar *baUi = ba_d + bdiag_d[ii+1]+1; // vector of data  U(i,i+1:end)
1318           const PetscInt    nUi_its = nzUi/Ni + !!(nzUi%Ni);
1319           const PetscScalar Bii = *(ba_d + bdiag_d[ii]); // diagonal in its special place
1320           Kokkos::parallel_for(Kokkos::TeamThreadRange(team, nUi_its), [=] (const int j) {
1321               PetscInt kIdx = j*Ni + field_block_idx;
1322               if (kIdx >= nzUi) /* void */ ;
1323               else {
1324                 const PetscInt myk  = bjUi[kIdx]; // assume symmetric structure, need a transposed meta-data here in general
1325                 const PetscInt *pjL = bj_d + bi_d[myk]; // look for L(myk,ii) in start of row
1326                 const PetscInt nzL  = bi_d[myk+1] - bi_d[myk]; // size of L_k(:)
1327                 size_t         st_idx;
1328                 // find and do L(k,i) = A(:k,i) / A(i,i)
1329                 Kokkos::single(Kokkos::PerThread(team), [&]() { colkIdx() = PETSC_MAX_INT; });
1330                 // get column, there has got to be a better way
1331                 Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,nzL), [&] (const int &j, size_t &idx) {
1332                     if (pjL[j] == ii) {
1333                       PetscScalar *pLki = ba_d + bi_d[myk] + j;
1334                       idx = j; // output
1335                       *pLki = *pLki/Bii; // column scaling:  L(k,i) = A(:k,i) / A(i,i)
1336                     }
1337                 }, st_idx);
1338                 Kokkos::single(Kokkos::PerThread(team), [=]() { colkIdx() = st_idx; L_ki() = *(ba_d + bi_d[myk] + st_idx); });
1339 #if defined(PETSC_USE_DEBUG) && !defined(PETSC_HAVE_SYCL)
1340                 if (colkIdx() == PETSC_MAX_INT) printf("\t\t\t\t\t\t\tERROR: failed to find L_ki(%d,%d)\n",(int)myk,ii); // uses a register
1341 #endif
1342                 // active row k, do  A_kj -= Lki * U_ij; j \in U(i,:) j != i
1343                 // U(i+1,:end)
1344                 Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,nzUi), [=] (const int &uiIdx) { // index into i (U)
1345                       PetscScalar Uij = baUi[uiIdx];
1346                       PetscInt    col = bjUi[uiIdx];
1347                       if (col==myk) {
1348                         // A_kk = A_kk - L_ki * U_ij(k)
1349                         PetscScalar *Akkv = (ba_d + bdiag_d[myk]); // diagonal in its special place
1350                         *Akkv = *Akkv - L_ki() * Uij; // UiK
1351                       } else {
1352                         PetscScalar    *start, *end, *pAkjv=NULL;
1353                         PetscInt       high, low;
1354                         const PetscInt *startj;
1355                         if (col<myk) { // L
1356                           PetscScalar *pLki = ba_d + bi_d[myk] + colkIdx();
1357                           PetscInt idx = (pLki+1) - (ba_d + bi_d[myk]); // index into row
1358                           start = pLki+1; // start at pLki+1, A22(myk,1)
1359                           startj= bj_d + bi_d[myk] + idx;
1360                           end   = ba_d + bi_d[myk+1];
1361                         } else {
1362                           PetscInt idx = bdiag_d[myk+1]+1;
1363                           start = ba_d + idx;
1364                           startj= bj_d + idx;
1365                           end   = ba_d + bdiag_d[myk];
1366                         }
1367                         // search for 'col', use bisection search - TODO
1368                         low  = 0;
1369                         high = (PetscInt)(end-start);
1370                         while (high-low > 5) {
1371                           int t = (low+high)/2;
1372                           if (startj[t] > col) high = t;
1373                           else                 low  = t;
1374                         }
1375                         for (pAkjv=start+low; pAkjv<start+high; pAkjv++) {
1376                           if (startj[pAkjv-start] == col) break;
1377                         }
1378 #if defined(PETSC_USE_DEBUG) && !defined(PETSC_HAVE_SYCL)
1379                         if (pAkjv==start+high) printf("\t\t\t\t\t\t\t\t\t\t\tERROR: *** failed to find Akj(%d,%d)\n",(int)myk,(int)col); // uses a register
1380 #endif
1381                         *pAkjv = *pAkjv - L_ki() * Uij; // A_kj = A_kj - L_ki * U_ij
1382                       }
1383                     });
1384               }
1385             });
1386           team.team_barrier(); // this needs to be a league barrier to use more that one SM per block
1387           if (field_block_idx==0) Kokkos::single(Kokkos::PerTeam(team), [&]() { Kokkos::atomic_add( flops.data(), (size_t)(2*(nzUi*nzUi)+2)); });
1388         } /* endof for (i=0; i<n; i++) { */
1389         Kokkos::single(Kokkos::PerTeam(team), [=]() { Kokkos::atomic_add( &d_flops_k(), flops()); flops() = 0; });
1390       });
1391     Kokkos::fence();
1392     Kokkos::deep_copy (h_flops_k, d_flops_k);
1393     PetscCall(PetscLogGpuFlops((PetscLogDouble)h_flops_k()));
1394     Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, 1, 256), KOKKOS_LAMBDA (const team_member team) {
1395         const PetscInt  lg_rank = team.league_rank(), field = lg_rank/Ni; //, field_offset = lg_rank%Ni;
1396         const PetscInt  start = field*nloc, end = start + nloc, n_its = (nloc/Ni + !!(nloc%Ni)); // 1/Ni iters
1397         /* Invert diagonal for simpler triangular solves */
1398         Kokkos::parallel_for(Kokkos::TeamVectorRange(team, n_its), [=] (int outer_index) {
1399             int i = start + outer_index*Ni + lg_rank%Ni;
1400             if (i < end) {
1401               PetscScalar *pv = ba_d + bdiag_d[i];
1402               *pv = 1.0/(*pv);
1403             }
1404           });
1405       });
1406   }
1407   PetscCall(PetscLogGpuTimeEnd());
1408   PetscCall(ISRestoreIndices(isicol,&ic_h));
1409   PetscCall(ISRestoreIndices(isrow,&r_h));
1410 
1411   PetscCall(ISIdentity(isrow,&row_identity));
1412   PetscCall(ISIdentity(isicol,&col_identity));
1413   if (b->inode.size) {
1414     B->ops->solve = MatSolve_SeqAIJ_Inode;
1415   } else if (row_identity && col_identity) {
1416     B->ops->solve = MatSolve_SeqAIJ_NaturalOrdering;
1417   } else {
1418     B->ops->solve = MatSolve_SeqAIJ; // at least this needs to be in Kokkos
1419   }
1420   B->offloadmask = PETSC_OFFLOAD_GPU;
1421   PetscCall(MatSeqAIJKokkosSyncHost(B)); // solve on CPU
1422   B->ops->solveadd          = MatSolveAdd_SeqAIJ; // and this
1423   B->ops->solvetranspose    = MatSolveTranspose_SeqAIJ;
1424   B->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ;
1425   B->ops->matsolve          = MatMatSolve_SeqAIJ;
1426   B->assembled              = PETSC_TRUE;
1427   B->preallocated           = PETSC_TRUE;
1428 
1429   PetscFunctionReturn(0);
1430 }
1431 
1432 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
1433 {
1434   PetscFunctionBegin;
1435   PetscCall(MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info));
1436   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos;
1437   PetscFunctionReturn(0);
1438 }
1439 
1440 static PetscErrorCode MatSeqAIJKokkosSymbolicSolveCheck(Mat A)
1441 {
1442   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr;
1443 
1444   PetscFunctionBegin;
1445   if (!factors->sptrsv_symbolic_completed) {
1446     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khU,factors->iU_d,factors->jU_d,factors->aU_d);
1447     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khL,factors->iL_d,factors->jL_d,factors->aL_d);
1448     factors->sptrsv_symbolic_completed = PETSC_TRUE;
1449   }
1450   PetscFunctionReturn(0);
1451 }
1452 
1453 /* Check if we need to update factors etc for transpose solve */
1454 static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A)
1455 {
1456   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr;
1457   MatColIdxType             n = A->rmap->n;
1458 
1459   PetscFunctionBegin;
1460   if (!factors->transpose_updated) { /* TODO: KK needs to provide functions to do numeric transpose only */
1461     /* Update L^T and do sptrsv symbolic */
1462     factors->iLt_d = MatRowMapKokkosView("factors->iLt_d",n+1);
1463     Kokkos::deep_copy(factors->iLt_d,0); /* KK requires 0 */
1464     factors->jLt_d = MatColIdxKokkosView("factors->jLt_d",factors->jL_d.extent(0));
1465     factors->aLt_d = MatScalarKokkosView("factors->aLt_d",factors->aL_d.extent(0));
1466 
1467     KokkosKernels::Impl::transpose_matrix<
1468       ConstMatRowMapKokkosView,ConstMatColIdxKokkosView,ConstMatScalarKokkosView,
1469       MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView,
1470       MatRowMapKokkosView,DefaultExecutionSpace>(
1471         n,n,factors->iL_d,factors->jL_d,factors->aL_d,
1472         factors->iLt_d,factors->jLt_d,factors->aLt_d);
1473 
1474     /* TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices.
1475       We have to sort the indices, until KK provides finer control options.
1476     */
1477     KokkosKernels::sort_crs_matrix<DefaultExecutionSpace,
1478       MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView>(
1479         factors->iLt_d,factors->jLt_d,factors->aLt_d);
1480 
1481     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khLt,factors->iLt_d,factors->jLt_d,factors->aLt_d);
1482 
1483     /* Update U^T and do sptrsv symbolic */
1484     factors->iUt_d = MatRowMapKokkosView("factors->iUt_d",n+1);
1485     Kokkos::deep_copy(factors->iUt_d,0); /* KK requires 0 */
1486     factors->jUt_d = MatColIdxKokkosView("factors->jUt_d",factors->jU_d.extent(0));
1487     factors->aUt_d = MatScalarKokkosView("factors->aUt_d",factors->aU_d.extent(0));
1488 
1489     KokkosKernels::Impl::transpose_matrix<
1490       ConstMatRowMapKokkosView,ConstMatColIdxKokkosView,ConstMatScalarKokkosView,
1491       MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView,
1492       MatRowMapKokkosView,DefaultExecutionSpace>(
1493         n,n,factors->iU_d, factors->jU_d, factors->aU_d,
1494         factors->iUt_d,factors->jUt_d,factors->aUt_d);
1495 
1496     /* Sort indices. See comments above */
1497     KokkosKernels::sort_crs_matrix<DefaultExecutionSpace,
1498       MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView>(
1499         factors->iUt_d,factors->jUt_d,factors->aUt_d);
1500 
1501     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khUt,factors->iUt_d,factors->jUt_d,factors->aUt_d);
1502     factors->transpose_updated = PETSC_TRUE;
1503   }
1504   PetscFunctionReturn(0);
1505 }
1506 
1507 /* Solve Ax = b, with A = LU */
1508 static PetscErrorCode MatSolve_SeqAIJKokkos(Mat A,Vec b,Vec x)
1509 {
1510   ConstPetscScalarKokkosView     bv;
1511   PetscScalarKokkosView          xv;
1512   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr;
1513 
1514   PetscFunctionBegin;
1515   PetscCall(PetscLogGpuTimeBegin());
1516   PetscCall(MatSeqAIJKokkosSymbolicSolveCheck(A));
1517   PetscCall(VecGetKokkosView(b,&bv));
1518   PetscCall(VecGetKokkosViewWrite(x,&xv));
1519   /* Solve L tmpv = b */
1520   PetscCallCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khL,factors->iL_d,factors->jL_d,factors->aL_d,bv,factors->workVector));
1521   /* Solve Ux = tmpv */
1522   PetscCallCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khU,factors->iU_d,factors->jU_d,factors->aU_d,factors->workVector,xv));
1523   PetscCall(VecRestoreKokkosView(b,&bv));
1524   PetscCall(VecRestoreKokkosViewWrite(x,&xv));
1525   PetscCall(PetscLogGpuTimeEnd());
1526   PetscFunctionReturn(0);
1527 }
1528 
1529 /* Solve A^T x = b, where A^T = U^T L^T */
1530 static PetscErrorCode MatSolveTranspose_SeqAIJKokkos(Mat A,Vec b,Vec x)
1531 {
1532   ConstPetscScalarKokkosView     bv;
1533   PetscScalarKokkosView          xv;
1534   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr;
1535 
1536   PetscFunctionBegin;
1537   PetscCall(PetscLogGpuTimeBegin());
1538   PetscCall(MatSeqAIJKokkosTransposeSolveCheck(A));
1539   PetscCall(VecGetKokkosView(b,&bv));
1540   PetscCall(VecGetKokkosViewWrite(x,&xv));
1541   /* Solve U^T tmpv = b */
1542   KokkosSparse::Experimental::sptrsv_solve(&factors->khUt,factors->iUt_d,factors->jUt_d,factors->aUt_d,bv,factors->workVector);
1543 
1544   /* Solve L^T x = tmpv */
1545   KokkosSparse::Experimental::sptrsv_solve(&factors->khLt,factors->iLt_d,factors->jLt_d,factors->aLt_d,factors->workVector,xv);
1546   PetscCall(VecRestoreKokkosView(b,&bv));
1547   PetscCall(VecRestoreKokkosViewWrite(x,&xv));
1548   PetscCall(PetscLogGpuTimeEnd());
1549   PetscFunctionReturn(0);
1550 }
1551 
1552 static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos(Mat B,Mat A,const MatFactorInfo *info)
1553 {
1554   Mat_SeqAIJKokkos               *aijkok = (Mat_SeqAIJKokkos*)A->spptr;
1555   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)B->spptr;
1556   PetscInt                       fill_lev = info->levels;
1557 
1558   PetscFunctionBegin;
1559   PetscCall(PetscLogGpuTimeBegin());
1560   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1561 
1562   auto a_d = aijkok->a_dual.view_device();
1563   auto i_d = aijkok->i_dual.view_device();
1564   auto j_d = aijkok->j_dual.view_device();
1565 
1566   KokkosSparse::Experimental::spiluk_numeric(&factors->kh,fill_lev,i_d,j_d,a_d,factors->iL_d,factors->jL_d,factors->aL_d,factors->iU_d,factors->jU_d,factors->aU_d);
1567 
1568   B->assembled                       = PETSC_TRUE;
1569   B->preallocated                    = PETSC_TRUE;
1570   B->ops->solve                      = MatSolve_SeqAIJKokkos;
1571   B->ops->solvetranspose             = MatSolveTranspose_SeqAIJKokkos;
1572   B->ops->matsolve                   = NULL;
1573   B->ops->matsolvetranspose          = NULL;
1574   B->offloadmask                     = PETSC_OFFLOAD_GPU;
1575 
1576   /* Once the factors' value changed, we need to update their transpose and sptrsv handle */
1577   factors->transpose_updated         = PETSC_FALSE;
1578   factors->sptrsv_symbolic_completed = PETSC_FALSE;
1579   /* TODO: log flops, but how to know that? */
1580   PetscCall(PetscLogGpuTimeEnd());
1581   PetscFunctionReturn(0);
1582 }
1583 
1584 static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
1585 {
1586   Mat_SeqAIJKokkos               *aijkok;
1587   Mat_SeqAIJ                     *b;
1588   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)B->spptr;
1589   PetscInt                       fill_lev = info->levels;
1590   PetscInt                       nnzA = ((Mat_SeqAIJ*)A->data)->nz,nnzL,nnzU;
1591   PetscInt                       n = A->rmap->n;
1592 
1593   PetscFunctionBegin;
1594   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1595   /* Rebuild factors */
1596   if (factors) {factors->Destroy();} /* Destroy the old if it exists */
1597   else {B->spptr = factors = new Mat_SeqAIJKokkosTriFactors(n);}
1598 
1599   /* Create a spiluk handle and then do symbolic factorization */
1600   nnzL = nnzU = PetscRealIntMultTruncate(info->fill,nnzA);
1601   factors->kh.create_spiluk_handle(KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1,n,nnzL,nnzU);
1602 
1603   auto spiluk_handle = factors->kh.get_spiluk_handle();
1604 
1605   Kokkos::realloc(factors->iL_d,n+1); /* Free old arrays and realloc */
1606   Kokkos::realloc(factors->jL_d,spiluk_handle->get_nnzL());
1607   Kokkos::realloc(factors->iU_d,n+1);
1608   Kokkos::realloc(factors->jU_d,spiluk_handle->get_nnzU());
1609 
1610   aijkok = (Mat_SeqAIJKokkos*)A->spptr;
1611   auto i_d = aijkok->i_dual.view_device();
1612   auto j_d = aijkok->j_dual.view_device();
1613   KokkosSparse::Experimental::spiluk_symbolic(&factors->kh,fill_lev,i_d,j_d,factors->iL_d,factors->jL_d,factors->iU_d,factors->jU_d);
1614   /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */
1615 
1616   Kokkos::resize (factors->jL_d,spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */
1617   Kokkos::resize (factors->jU_d,spiluk_handle->get_nnzU());
1618   Kokkos::realloc(factors->aL_d,spiluk_handle->get_nnzL()); /* No need to retain old value */
1619   Kokkos::realloc(factors->aU_d,spiluk_handle->get_nnzU());
1620 
1621   /* TODO: add options to select sptrsv algorithms */
1622   /* Create sptrsv handles for L, U and their transpose */
1623  #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
1624   auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SPTRSV_CUSPARSE;
1625  #else
1626   auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1;
1627  #endif
1628 
1629   factors->khL.create_sptrsv_handle(sptrsv_alg,n,true/* L is lower tri */);
1630   factors->khU.create_sptrsv_handle(sptrsv_alg,n,false/* U is not lower tri */);
1631   factors->khLt.create_sptrsv_handle(sptrsv_alg,n,false/* L^T is not lower tri */);
1632   factors->khUt.create_sptrsv_handle(sptrsv_alg,n,true/* U^T is lower tri */);
1633 
1634   /* Fill fields of the factor matrix B */
1635   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(B,MAT_SKIP_ALLOCATION,NULL));
1636   b     = (Mat_SeqAIJ*)B->data;
1637   b->nz = b->maxnz = spiluk_handle->get_nnzL()+spiluk_handle->get_nnzU();
1638   B->info.fill_ratio_given  = info->fill;
1639   B->info.fill_ratio_needed = ((PetscReal)b->nz)/((PetscReal)nnzA);
1640 
1641   B->offloadmask            = PETSC_OFFLOAD_GPU;
1642   B->ops->lufactornumeric   = MatILUFactorNumeric_SeqAIJKokkos;
1643   PetscFunctionReturn(0);
1644 }
1645 
1646 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
1647 {
1648   Mat_SeqAIJ       *b=(Mat_SeqAIJ*)B->data;
1649   const PetscInt   nrows   = A->rmap->n;
1650 
1651   PetscFunctionBegin;
1652   PetscCall(MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info));
1653   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKOKKOSDEVICE;
1654   // move B data into Kokkos
1655   PetscCall(MatSeqAIJKokkosSyncDevice(B)); // create aijkok
1656   PetscCall(MatSeqAIJKokkosSyncDevice(A)); // create aijkok
1657   {
1658     Mat_SeqAIJKokkos *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
1659     if (!baijkok->diag_d.extent(0)) {
1660       const Kokkos::View<PetscInt*, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_diag (b->diag,nrows+1);
1661       baijkok->diag_d = Kokkos::View<PetscInt*>(Kokkos::create_mirror(DefaultMemorySpace(),h_diag));
1662       Kokkos::deep_copy (baijkok->diag_d, h_diag);
1663     }
1664   }
1665   PetscFunctionReturn(0);
1666 }
1667 
1668 static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos(Mat A,MatSolverType *type)
1669 {
1670   PetscFunctionBegin;
1671   *type = MATSOLVERKOKKOS;
1672   PetscFunctionReturn(0);
1673 }
1674 
1675 static PetscErrorCode MatFactorGetSolverType_seqaij_kokkos_device(Mat A,MatSolverType *type)
1676 {
1677   PetscFunctionBegin;
1678   *type = MATSOLVERKOKKOSDEVICE;
1679   PetscFunctionReturn(0);
1680 }
1681 
1682 /*MC
1683   MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices
1684   on a single GPU of type, SeqAIJKokkos, AIJKokkos.
1685 
1686   Level: beginner
1687 
1688 .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJKokkos(), MATAIJKOKKOS, MatKokkosSetFormat(), MatKokkosStorageFormat, MatKokkosFormatOperation
1689 M*/
1690 PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A,MatFactorType ftype,Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */
1691 {
1692   PetscInt       n = A->rmap->n;
1693 
1694   PetscFunctionBegin;
1695   PetscCall(MatCreate(PetscObjectComm((PetscObject)A),B));
1696   PetscCall(MatSetSizes(*B,n,n,n,n));
1697   (*B)->factortype = ftype;
1698   PetscCall(PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]));
1699   PetscCall(MatSetType(*B,MATSEQAIJKOKKOS));
1700 
1701   if (ftype == MAT_FACTOR_LU) {
1702     PetscCall(MatSetBlockSizesFromMats(*B,A,A));
1703     (*B)->canuseordering         = PETSC_TRUE;
1704     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJKokkos;
1705   } else if (ftype == MAT_FACTOR_ILU) {
1706     PetscCall(MatSetBlockSizesFromMats(*B,A,A));
1707     (*B)->canuseordering         = PETSC_FALSE;
1708     (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos;
1709   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]);
1710 
1711   PetscCall(MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL));
1712   PetscCall(PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_SeqAIJKokkos));
1713   PetscFunctionReturn(0);
1714 }
1715 
1716 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijkokkos_kokkos_device(Mat A,MatFactorType ftype,Mat *B)
1717 {
1718   PetscInt       n = A->rmap->n;
1719 
1720   PetscFunctionBegin;
1721   PetscCall(MatCreate(PetscObjectComm((PetscObject)A),B));
1722   PetscCall(MatSetSizes(*B,n,n,n,n));
1723   (*B)->factortype = ftype;
1724   (*B)->canuseordering = PETSC_TRUE;
1725   PetscCall(PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]));
1726   PetscCall(MatSetType(*B,MATSEQAIJKOKKOS));
1727 
1728   if (ftype == MAT_FACTOR_LU) {
1729     PetscCall(MatSetBlockSizesFromMats(*B,A,A));
1730     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE;
1731   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for KOKKOS Matrix Types");
1732 
1733   PetscCall(MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL));
1734   PetscCall(PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_kokkos_device));
1735   PetscFunctionReturn(0);
1736 }
1737 
1738 PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_KOKKOS(void)
1739 {
1740   PetscFunctionBegin;
1741   PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_SeqAIJKokkos_Kokkos));
1742   PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_ILU,MatGetFactor_SeqAIJKokkos_Kokkos));
1743   PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOSDEVICE,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_seqaijkokkos_kokkos_device));
1744   PetscFunctionReturn(0);
1745 }
1746 
1747 /* Utility to print out a KokkosCsrMatrix for debugging */
1748 PETSC_INTERN PetscErrorCode PrintCsrMatrix(const KokkosCsrMatrix& csrmat)
1749 {
1750   const auto&       iv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.graph.row_map);
1751   const auto&       jv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.graph.entries);
1752   const auto&       av = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.values);
1753   const PetscInt    *i = iv.data();
1754   const PetscInt    *j = jv.data();
1755   const PetscScalar *a = av.data();
1756   PetscInt          m = csrmat.numRows(),n = csrmat.numCols(),nnz = csrmat.nnz();
1757 
1758   PetscFunctionBegin;
1759   PetscCall(PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT " x %" PetscInt_FMT " SeqAIJKokkos, with %" PetscInt_FMT " nonzeros\n",m,n,nnz));
1760   for (PetscInt k=0; k<m; k++) {
1761     PetscCall(PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT ": ",k));
1762     for (PetscInt p=i[k]; p<i[k+1]; p++) {
1763       PetscCall(PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT "(%.1f), ",j[p],a[p]));
1764     }
1765     PetscCall(PetscPrintf(PETSC_COMM_SELF,"\n"));
1766   }
1767   PetscFunctionReturn(0);
1768 }
1769