xref: /petsc/src/mat/impls/aij/seq/kokkos/aijkok.kokkos.cxx (revision 8e010fa18d00c25e58c23165efbc8aef665b8a2c)
1 #include <petsc_kokkos.hpp>
2 #include <petscvec_kokkos.hpp>
3 #include <petscpkg_version.h>
4 #include <petsc/private/petscimpl.h>
5 #include <petsc/private/sfimpl.h>
6 #include <petscsystypes.h>
7 #include <petscerror.h>
8 
9 #include <Kokkos_Core.hpp>
10 #include <KokkosBlas.hpp>
11 #include <KokkosSparse_CrsMatrix.hpp>
12 #include <KokkosSparse_spmv.hpp>
13 #include <KokkosSparse_spiluk.hpp>
14 #include <KokkosSparse_sptrsv.hpp>
15 #include <KokkosSparse_spgemm.hpp>
16 #include <KokkosSparse_spadd.hpp>
17 #include <KokkosBatched_LU_Decl.hpp>
18 #include <KokkosBatched_InverseLU_Decl.hpp>
19 
20 #include <../src/mat/impls/aij/seq/kokkos/aijkok.hpp>
21 
22 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_GE(3, 7, 0)
23   #include <KokkosSparse_Utils.hpp>
24 using KokkosSparse::sort_crs_matrix;
25 using KokkosSparse::Impl::transpose_matrix;
26 #else
27   #include <KokkosKernels_Sorting.hpp>
28 using KokkosKernels::sort_crs_matrix;
29 using KokkosKernels::Impl::transpose_matrix;
30 #endif
31 
32 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */
33 
34 /* MatAssemblyEnd_SeqAIJKokkos() happens when we finalized nonzeros of the matrix, either after
35    we assembled the matrix on host, or after we directly produced the matrix data on device (ex., through MatMatMult).
36    In the latter case, it is important to set a_dual's sync state correctly.
37  */
38 static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A, MatAssemblyType mode)
39 {
40   Mat_SeqAIJ       *aijseq;
41   Mat_SeqAIJKokkos *aijkok;
42 
43   PetscFunctionBegin;
44   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS);
45   PetscCall(MatAssemblyEnd_SeqAIJ(A, mode));
46 
47   aijseq = static_cast<Mat_SeqAIJ *>(A->data);
48   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
49 
50   /* If aijkok does not exist, we just copy i, j to device.
51      If aijkok already exists, but the device's nonzero pattern does not match with the host's, we assume the latest data is on host.
52      In both cases, we build a new aijkok structure.
53   */
54   if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { /* aijkok might not exist yet or nonzero pattern has changed */
55     delete aijkok;
56     aijkok   = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aijseq->nz, aijseq->i, aijseq->j, aijseq->a, A->nonzerostate, PETSC_FALSE /*don't copy mat values to device*/);
57     A->spptr = aijkok;
58   }
59 
60   PetscFunctionReturn(PETSC_SUCCESS);
61 }
62 
63 /* Sync CSR data to device if not yet */
64 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A)
65 {
66   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
67 
68   PetscFunctionBegin;
69   PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Can't sync factorized matrix from host to device");
70   PetscCheck(aijkok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
71   if (aijkok->a_dual.need_sync_device()) {
72     aijkok->a_dual.sync_device();
73     aijkok->transpose_updated = PETSC_FALSE; /* values of the transpose is out-of-date */
74     aijkok->hermitian_updated = PETSC_FALSE;
75   }
76   PetscFunctionReturn(PETSC_SUCCESS);
77 }
78 
79 /* Mark the CSR data on device as modified */
80 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosModifyDevice(Mat A)
81 {
82   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
83 
84   PetscFunctionBegin;
85   PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Not supported for factorized matries");
86   aijkok->a_dual.clear_sync_state();
87   aijkok->a_dual.modify_device();
88   aijkok->transpose_updated = PETSC_FALSE;
89   aijkok->hermitian_updated = PETSC_FALSE;
90   PetscCall(MatSeqAIJInvalidateDiagonal(A));
91   PetscCall(PetscObjectStateIncrease((PetscObject)A));
92   PetscFunctionReturn(PETSC_SUCCESS);
93 }
94 
95 static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A)
96 {
97   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
98   auto             &exec   = PetscGetKokkosExecutionSpace();
99 
100   PetscFunctionBegin;
101   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
102   /* We do not expect one needs factors on host  */
103   PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Can't sync factorized matrix from device to host");
104   PetscCheck(aijkok, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing AIJKOK");
105   PetscCallCXX(aijkok->a_dual.sync_host(exec));
106   PetscCallCXX(exec.fence());
107   PetscFunctionReturn(PETSC_SUCCESS);
108 }
109 
110 static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A, PetscScalar *array[])
111 {
112   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
113 
114   PetscFunctionBegin;
115   /* aijkok contains valid pointers only if the host's nonzerostate matches with the device's.
116     Calling MatSeqAIJSetPreallocation() or MatSetValues() on host, where aijseq->{i,j,a} might be
117     reallocated, will lead to stale {i,j,a}_dual in aijkok. In both operations, the hosts's nonzerostate
118     must have been updated. The stale aijkok will be rebuilt during MatAssemblyEnd.
119   */
120   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
121     auto &exec = PetscGetKokkosExecutionSpace();
122     PetscCallCXX(aijkok->a_dual.sync_host(exec));
123     PetscCallCXX(exec.fence());
124     *array = aijkok->a_dual.view_host().data();
125   } else { /* Happens when calling MatSetValues on a newly created matrix */
126     *array = static_cast<Mat_SeqAIJ *>(A->data)->a;
127   }
128   PetscFunctionReturn(PETSC_SUCCESS);
129 }
130 
131 static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJKokkos(Mat A, PetscScalar *array[])
132 {
133   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
134 
135   PetscFunctionBegin;
136   if (aijkok && A->nonzerostate == aijkok->nonzerostate) aijkok->a_dual.modify_host();
137   PetscFunctionReturn(PETSC_SUCCESS);
138 }
139 
140 static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[])
141 {
142   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
143 
144   PetscFunctionBegin;
145   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
146     auto &exec = PetscGetKokkosExecutionSpace();
147     PetscCallCXX(aijkok->a_dual.sync_host(exec));
148     PetscCallCXX(exec.fence());
149     *array = aijkok->a_dual.view_host().data();
150   } else {
151     *array = static_cast<Mat_SeqAIJ *>(A->data)->a;
152   }
153   PetscFunctionReturn(PETSC_SUCCESS);
154 }
155 
156 static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[])
157 {
158   PetscFunctionBegin;
159   *array = NULL;
160   PetscFunctionReturn(PETSC_SUCCESS);
161 }
162 
163 static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[])
164 {
165   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
166 
167   PetscFunctionBegin;
168   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
169     *array = aijkok->a_dual.view_host().data();
170   } else { /* Ex. happens with MatZeroEntries on a preallocated but not assembled matrix */
171     *array = static_cast<Mat_SeqAIJ *>(A->data)->a;
172   }
173   PetscFunctionReturn(PETSC_SUCCESS);
174 }
175 
176 static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[])
177 {
178   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
179 
180   PetscFunctionBegin;
181   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
182     aijkok->a_dual.clear_sync_state();
183     aijkok->a_dual.modify_host();
184   }
185   PetscFunctionReturn(PETSC_SUCCESS);
186 }
187 
188 static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJKokkos(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype)
189 {
190   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
191 
192   PetscFunctionBegin;
193   PetscCheck(aijkok != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "aijkok is NULL");
194 
195   if (i) *i = aijkok->i_device_data();
196   if (j) *j = aijkok->j_device_data();
197   if (a) {
198     aijkok->a_dual.sync_device();
199     *a = aijkok->a_device_data();
200   }
201   if (mtype) *mtype = PETSC_MEMTYPE_KOKKOS;
202   PetscFunctionReturn(PETSC_SUCCESS);
203 }
204 
205 /*
206   Generate the sparsity pattern of a MatSeqAIJKokkos matrix's transpose on device.
207 
208   Input Parameter:
209 .  A       - the MATSEQAIJKOKKOS matrix
210 
211   Output Parameters:
212 +  perm_d - the permutation array on device, which connects Ta(i) = Aa(perm(i))
213 -  T_d    - the transpose on device, whose value array is allocated but not initialized
214 */
215 static PetscErrorCode MatSeqAIJKokkosGenerateTransposeStructure(Mat A, MatRowMapKokkosView &perm_d, KokkosCsrMatrix &T_d)
216 {
217   Mat_SeqAIJ             *aseq = static_cast<Mat_SeqAIJ *>(A->data);
218   PetscInt                nz = aseq->nz, m = A->rmap->N, n = A->cmap->n;
219   const PetscInt         *Ai = aseq->i, *Aj = aseq->j;
220   MatRowMapKokkosViewHost Ti_h(NoInit("Ti"), n + 1);
221   MatRowMapType          *Ti = Ti_h.data();
222   MatColIdxKokkosViewHost Tj_h(NoInit("Tj"), nz);
223   MatRowMapKokkosViewHost perm_h(NoInit("permutation"), nz);
224   PetscInt               *Tj   = Tj_h.data();
225   PetscInt               *perm = perm_h.data();
226   PetscInt               *offset;
227 
228   PetscFunctionBegin;
229   // Populate Ti
230   PetscCallCXX(Kokkos::deep_copy(Ti_h, 0));
231   Ti++;
232   for (PetscInt i = 0; i < nz; i++) Ti[Aj[i]]++;
233   Ti--;
234   for (PetscInt i = 0; i < n; i++) Ti[i + 1] += Ti[i];
235 
236   // Populate Tj and the permutation array
237   PetscCall(PetscCalloc1(n, &offset)); // offset in each T row to fill in its column indices
238   for (PetscInt i = 0; i < m; i++) {
239     for (PetscInt j = Ai[i]; j < Ai[i + 1]; j++) { // A's (i,j) is T's (j,i)
240       PetscInt r    = Aj[j];                       // row r of T
241       PetscInt disp = Ti[r] + offset[r];
242 
243       Tj[disp]   = i; // col i of T
244       perm[disp] = j;
245       offset[r]++;
246     }
247   }
248   PetscCall(PetscFree(offset));
249 
250   // Sort each row of T, along with the permutation array
251   for (PetscInt i = 0; i < n; i++) PetscCall(PetscSortIntWithArray(Ti[i + 1] - Ti[i], Tj + Ti[i], perm + Ti[i]));
252 
253   // Output perm and T on device
254   auto Ti_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), Ti_h);
255   auto Tj_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), Tj_h);
256   PetscCallCXX(T_d = KokkosCsrMatrix("csrmatT", n, m, nz, MatScalarKokkosView("Ta", nz), Ti_d, Tj_d));
257   PetscCallCXX(perm_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), perm_h));
258   PetscFunctionReturn(PETSC_SUCCESS);
259 }
260 
261 // Generate the transpose on device and cache it internally
262 // Note: KK transpose_matrix() does not have support symbolic/numeric transpose, so we do it on our own
263 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGenerateTranspose_Private(Mat A, KokkosCsrMatrix *csrmatT)
264 {
265   Mat_SeqAIJ       *aseq = static_cast<Mat_SeqAIJ *>(A->data);
266   Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
267   PetscInt          nz = aseq->nz, m = A->rmap->N, n = A->cmap->n;
268   KokkosCsrMatrix  &T = akok->csrmatT;
269 
270   PetscFunctionBegin;
271   PetscCheck(akok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
272   PetscCallCXX(akok->a_dual.sync_device()); // Sync A's values since we are going to access them on device
273 
274   const auto &Aa = akok->a_dual.view_device();
275 
276   if (A->symmetric == PETSC_BOOL3_TRUE) {
277     *csrmatT = akok->csrmat;
278   } else {
279     // See if we already have a cached transpose and its value is up to date
280     if (T.numRows() == n && T.numCols() == m) {  // this indicates csrmatT had been generated before, otherwise T has 0 rows/cols after construction
281       if (!akok->transpose_updated) {            // if the value is out of date, update the cached version
282         const auto &perm = akok->transpose_perm; // get the permutation array
283         auto       &Ta   = T.values;
284 
285         PetscCallCXX(Kokkos::parallel_for(
286           nz, KOKKOS_LAMBDA(const PetscInt i) { Ta(i) = Aa(perm(i)); }));
287       }
288     } else { // Generate T of size n x m for the first time
289       MatRowMapKokkosView perm;
290 
291       PetscCall(MatSeqAIJKokkosGenerateTransposeStructure(A, perm, T));
292       akok->transpose_perm = perm; // cache the perm in this matrix for reuse
293       PetscCallCXX(Kokkos::parallel_for(
294         nz, KOKKOS_LAMBDA(const PetscInt i) { T.values(i) = Aa(perm(i)); }));
295     }
296     akok->transpose_updated = PETSC_TRUE;
297     *csrmatT                = akok->csrmatT;
298   }
299   PetscFunctionReturn(PETSC_SUCCESS);
300 }
301 
302 // Generate the Hermitian on device and cache it internally
303 static PetscErrorCode MatSeqAIJKokkosGenerateHermitian_Private(Mat A, KokkosCsrMatrix *csrmatH)
304 {
305   Mat_SeqAIJ       *aseq = static_cast<Mat_SeqAIJ *>(A->data);
306   Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
307   PetscInt          nz = aseq->nz, m = A->rmap->N, n = A->cmap->n;
308   KokkosCsrMatrix  &T = akok->csrmatH;
309 
310   PetscFunctionBegin;
311   PetscCheck(akok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
312   PetscCallCXX(akok->a_dual.sync_device()); // Sync A's values since we are going to access them on device
313 
314   const auto &Aa = akok->a_dual.view_device();
315 
316   if (A->hermitian == PETSC_BOOL3_TRUE) {
317     *csrmatH = akok->csrmat;
318   } else {
319     // See if we already have a cached hermitian and its value is up to date
320     if (T.numRows() == n && T.numCols() == m) {  // this indicates csrmatT had been generated before, otherwise T has 0 rows/cols after construction
321       if (!akok->hermitian_updated) {            // if the value is out of date, update the cached version
322         const auto &perm = akok->transpose_perm; // get the permutation array
323         auto       &Ta   = T.values;
324 
325         PetscCallCXX(Kokkos::parallel_for(
326           nz, KOKKOS_LAMBDA(const PetscInt i) { Ta(i) = PetscConj(Aa(perm(i))); }));
327       }
328     } else { // Generate T of size n x m for the first time
329       MatRowMapKokkosView perm;
330 
331       PetscCall(MatSeqAIJKokkosGenerateTransposeStructure(A, perm, T));
332       akok->transpose_perm = perm; // cache the perm in this matrix for reuse
333       PetscCallCXX(Kokkos::parallel_for(
334         nz, KOKKOS_LAMBDA(const PetscInt i) { T.values(i) = PetscConj(Aa(perm(i))); }));
335     }
336     akok->hermitian_updated = PETSC_TRUE;
337     *csrmatH                = akok->csrmatH;
338   }
339   PetscFunctionReturn(PETSC_SUCCESS);
340 }
341 
342 /* y = A x */
343 static PetscErrorCode MatMult_SeqAIJKokkos(Mat A, Vec xx, Vec yy)
344 {
345   Mat_SeqAIJKokkos          *aijkok;
346   ConstPetscScalarKokkosView xv;
347   PetscScalarKokkosView      yv;
348 
349   PetscFunctionBegin;
350   PetscCall(PetscLogGpuTimeBegin());
351   PetscCall(MatSeqAIJKokkosSyncDevice(A));
352   PetscCall(VecGetKokkosView(xx, &xv));
353   PetscCall(VecGetKokkosViewWrite(yy, &yv));
354   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
355   PetscCallCXX(KokkosSparse::spmv("N", 1.0 /*alpha*/, aijkok->csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A x + beta y */
356   PetscCall(VecRestoreKokkosView(xx, &xv));
357   PetscCall(VecRestoreKokkosViewWrite(yy, &yv));
358   /* 2.0*nnz - numRows seems more accurate here but assumes there are no zero-rows. So a little sloppy here. */
359   PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz()));
360   PetscCall(PetscLogGpuTimeEnd());
361   PetscFunctionReturn(PETSC_SUCCESS);
362 }
363 
364 /* y = A^T x */
365 static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy)
366 {
367   Mat_SeqAIJKokkos          *aijkok;
368   const char                *mode;
369   ConstPetscScalarKokkosView xv;
370   PetscScalarKokkosView      yv;
371   KokkosCsrMatrix            csrmat;
372 
373   PetscFunctionBegin;
374   PetscCall(PetscLogGpuTimeBegin());
375   PetscCall(MatSeqAIJKokkosSyncDevice(A));
376   PetscCall(VecGetKokkosView(xx, &xv));
377   PetscCall(VecGetKokkosViewWrite(yy, &yv));
378   if (A->form_explicit_transpose) {
379     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat));
380     mode = "N";
381   } else {
382     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
383     csrmat = aijkok->csrmat;
384     mode   = "T";
385   }
386   PetscCallCXX(KokkosSparse::spmv(mode, 1.0 /*alpha*/, csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^T x + beta y */
387   PetscCall(VecRestoreKokkosView(xx, &xv));
388   PetscCall(VecRestoreKokkosViewWrite(yy, &yv));
389   PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
390   PetscCall(PetscLogGpuTimeEnd());
391   PetscFunctionReturn(PETSC_SUCCESS);
392 }
393 
394 /* y = A^H x */
395 static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy)
396 {
397   Mat_SeqAIJKokkos          *aijkok;
398   const char                *mode;
399   ConstPetscScalarKokkosView xv;
400   PetscScalarKokkosView      yv;
401   KokkosCsrMatrix            csrmat;
402 
403   PetscFunctionBegin;
404   PetscCall(PetscLogGpuTimeBegin());
405   PetscCall(MatSeqAIJKokkosSyncDevice(A));
406   PetscCall(VecGetKokkosView(xx, &xv));
407   PetscCall(VecGetKokkosViewWrite(yy, &yv));
408   if (A->form_explicit_transpose) {
409     PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat));
410     mode = "N";
411   } else {
412     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
413     csrmat = aijkok->csrmat;
414     mode   = "C";
415   }
416   PetscCallCXX(KokkosSparse::spmv(mode, 1.0 /*alpha*/, csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^H x + beta y */
417   PetscCall(VecRestoreKokkosView(xx, &xv));
418   PetscCall(VecRestoreKokkosViewWrite(yy, &yv));
419   PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
420   PetscCall(PetscLogGpuTimeEnd());
421   PetscFunctionReturn(PETSC_SUCCESS);
422 }
423 
424 /* z = A x + y */
425 static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz)
426 {
427   Mat_SeqAIJKokkos          *aijkok;
428   ConstPetscScalarKokkosView xv, yv;
429   PetscScalarKokkosView      zv;
430 
431   PetscFunctionBegin;
432   PetscCall(PetscLogGpuTimeBegin());
433   PetscCall(MatSeqAIJKokkosSyncDevice(A));
434   PetscCall(VecGetKokkosView(xx, &xv));
435   PetscCall(VecGetKokkosView(yy, &yv));
436   PetscCall(VecGetKokkosViewWrite(zz, &zv));
437   if (zz != yy) Kokkos::deep_copy(zv, yv);
438   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
439   PetscCallCXX(KokkosSparse::spmv("N", 1.0 /*alpha*/, aijkok->csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A x + beta z */
440   PetscCall(VecRestoreKokkosView(xx, &xv));
441   PetscCall(VecRestoreKokkosView(yy, &yv));
442   PetscCall(VecRestoreKokkosViewWrite(zz, &zv));
443   PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz()));
444   PetscCall(PetscLogGpuTimeEnd());
445   PetscFunctionReturn(PETSC_SUCCESS);
446 }
447 
448 /* z = A^T x + y */
449 static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz)
450 {
451   Mat_SeqAIJKokkos          *aijkok;
452   const char                *mode;
453   ConstPetscScalarKokkosView xv, yv;
454   PetscScalarKokkosView      zv;
455   KokkosCsrMatrix            csrmat;
456 
457   PetscFunctionBegin;
458   PetscCall(PetscLogGpuTimeBegin());
459   PetscCall(MatSeqAIJKokkosSyncDevice(A));
460   PetscCall(VecGetKokkosView(xx, &xv));
461   PetscCall(VecGetKokkosView(yy, &yv));
462   PetscCall(VecGetKokkosViewWrite(zz, &zv));
463   if (zz != yy) Kokkos::deep_copy(zv, yv);
464   if (A->form_explicit_transpose) {
465     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat));
466     mode = "N";
467   } else {
468     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
469     csrmat = aijkok->csrmat;
470     mode   = "T";
471   }
472   PetscCallCXX(KokkosSparse::spmv(mode, 1.0 /*alpha*/, csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^T x + beta z */
473   PetscCall(VecRestoreKokkosView(xx, &xv));
474   PetscCall(VecRestoreKokkosView(yy, &yv));
475   PetscCall(VecRestoreKokkosViewWrite(zz, &zv));
476   PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
477   PetscCall(PetscLogGpuTimeEnd());
478   PetscFunctionReturn(PETSC_SUCCESS);
479 }
480 
481 /* z = A^H x + y */
482 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz)
483 {
484   Mat_SeqAIJKokkos          *aijkok;
485   const char                *mode;
486   ConstPetscScalarKokkosView xv, yv;
487   PetscScalarKokkosView      zv;
488   KokkosCsrMatrix            csrmat;
489 
490   PetscFunctionBegin;
491   PetscCall(PetscLogGpuTimeBegin());
492   PetscCall(MatSeqAIJKokkosSyncDevice(A));
493   PetscCall(VecGetKokkosView(xx, &xv));
494   PetscCall(VecGetKokkosView(yy, &yv));
495   PetscCall(VecGetKokkosViewWrite(zz, &zv));
496   if (zz != yy) Kokkos::deep_copy(zv, yv);
497   if (A->form_explicit_transpose) {
498     PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat));
499     mode = "N";
500   } else {
501     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
502     csrmat = aijkok->csrmat;
503     mode   = "C";
504   }
505   PetscCallCXX(KokkosSparse::spmv(mode, 1.0 /*alpha*/, csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^H x + beta z */
506   PetscCall(VecRestoreKokkosView(xx, &xv));
507   PetscCall(VecRestoreKokkosView(yy, &yv));
508   PetscCall(VecRestoreKokkosViewWrite(zz, &zv));
509   PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
510   PetscCall(PetscLogGpuTimeEnd());
511   PetscFunctionReturn(PETSC_SUCCESS);
512 }
513 
514 static PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A, MatOption op, PetscBool flg)
515 {
516   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
517 
518   PetscFunctionBegin;
519   switch (op) {
520   case MAT_FORM_EXPLICIT_TRANSPOSE:
521     /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
522     if (A->form_explicit_transpose && !flg && aijkok) PetscCall(aijkok->DestroyMatTranspose());
523     A->form_explicit_transpose = flg;
524     break;
525   default:
526     PetscCall(MatSetOption_SeqAIJ(A, op, flg));
527     break;
528   }
529   PetscFunctionReturn(PETSC_SUCCESS);
530 }
531 
532 /* Depending on reuse, either build a new mat, or use the existing mat */
533 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat *newmat)
534 {
535   Mat_SeqAIJ *aseq;
536 
537   PetscFunctionBegin;
538   PetscCall(PetscKokkosInitializeCheck());
539   if (reuse == MAT_INITIAL_MATRIX) {                      /* Build a brand new mat */
540     PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat));  /* the returned newmat is a SeqAIJKokkos */
541   } else if (reuse == MAT_REUSE_MATRIX) {                 /* Reuse the mat created before */
542     PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN)); /* newmat is already a SeqAIJKokkos */
543   } else if (reuse == MAT_INPLACE_MATRIX) {               /* newmat is A */
544     PetscCheck(A == *newmat, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "A != *newmat with MAT_INPLACE_MATRIX");
545     PetscCall(PetscFree(A->defaultvectype));
546     PetscCall(PetscStrallocpy(VECKOKKOS, &A->defaultvectype)); /* Allocate and copy the string */
547     PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATSEQAIJKOKKOS));
548     PetscCall(MatSetOps_SeqAIJKokkos(A));
549     aseq = static_cast<Mat_SeqAIJ *>(A->data);
550     if (A->assembled) { /* Copy i, j (but not values) to device for an assembled matrix if not yet */
551       PetscCheck(!A->spptr, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Expect NULL (Mat_SeqAIJKokkos*)A->spptr");
552       A->spptr = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aseq->nz, aseq->i, aseq->j, aseq->a, A->nonzerostate, PETSC_FALSE);
553     }
554   }
555   PetscFunctionReturn(PETSC_SUCCESS);
556 }
557 
558 /* MatDuplicate always creates a new matrix. MatDuplicate can be called either on an assembled matrix or
559    an unassembled matrix, even though MAT_COPY_VALUES is not allowed for unassembled matrix.
560  */
561 static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A, MatDuplicateOption dupOption, Mat *B)
562 {
563   Mat_SeqAIJ       *bseq;
564   Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr), *bkok;
565   Mat               mat;
566 
567   PetscFunctionBegin;
568   /* Do not copy values on host as A's latest values might be on device. We don't want to do sync blindly */
569   PetscCall(MatDuplicate_SeqAIJ(A, MAT_DO_NOT_COPY_VALUES, B));
570   mat = *B;
571   if (A->preallocated) {
572     bseq = static_cast<Mat_SeqAIJ *>(mat->data);
573     bkok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, bseq->nz, bseq->i, bseq->j, bseq->a, mat->nonzerostate, PETSC_FALSE);
574     bkok->a_dual.clear_sync_state(); /* Clear B's sync state as it will be decided below */
575     /* Now copy values to B if needed */
576     if (dupOption == MAT_COPY_VALUES) {
577       if (akok->a_dual.need_sync_device()) {
578         Kokkos::deep_copy(bkok->a_dual.view_host(), akok->a_dual.view_host());
579         bkok->a_dual.modify_host();
580       } else { /* If device has the latest data, we only copy data on device */
581         Kokkos::deep_copy(bkok->a_dual.view_device(), akok->a_dual.view_device());
582         bkok->a_dual.modify_device();
583       }
584     } else { /* MAT_DO_NOT_COPY_VALUES or MAT_SHARE_NONZERO_PATTERN. B's values should be zeroed */
585       /* B's values on host should be already zeroed by MatDuplicate_SeqAIJ() */
586       bkok->a_dual.modify_host();
587     }
588     mat->spptr = bkok;
589   }
590 
591   PetscCall(PetscFree(mat->defaultvectype));
592   PetscCall(PetscStrallocpy(VECKOKKOS, &mat->defaultvectype)); /* Allocate and copy the string */
593   PetscCall(PetscObjectChangeTypeName((PetscObject)mat, MATSEQAIJKOKKOS));
594   PetscCall(MatSetOps_SeqAIJKokkos(mat));
595   PetscFunctionReturn(PETSC_SUCCESS);
596 }
597 
598 static PetscErrorCode MatTranspose_SeqAIJKokkos(Mat A, MatReuse reuse, Mat *B)
599 {
600   Mat               At;
601   KokkosCsrMatrix   internT;
602   Mat_SeqAIJKokkos *atkok, *bkok;
603 
604   PetscFunctionBegin;
605   if (reuse == MAT_REUSE_MATRIX) PetscCall(MatTransposeCheckNonzeroState_Private(A, *B));
606   PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &internT)); /* Generate a transpose internally */
607   if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_INPLACE_MATRIX) {
608     /* Deep copy internT, as we want to isolate the internal transpose */
609     PetscCallCXX(atkok = new Mat_SeqAIJKokkos(KokkosCsrMatrix("csrmat", internT)));
610     PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PetscObjectComm((PetscObject)A), atkok, &At));
611     if (reuse == MAT_INITIAL_MATRIX) *B = At;
612     else PetscCall(MatHeaderReplace(A, &At)); /* Replace A with At inplace */
613   } else {                                    /* MAT_REUSE_MATRIX, just need to copy values to B on device */
614     if ((*B)->assembled) {
615       bkok = static_cast<Mat_SeqAIJKokkos *>((*B)->spptr);
616       PetscCallCXX(Kokkos::deep_copy(bkok->a_dual.view_device(), internT.values));
617       PetscCall(MatSeqAIJKokkosModifyDevice(*B));
618     } else if ((*B)->preallocated) { /* It is ok for B to be only preallocated, as needed in MatTranspose_MPIAIJ */
619       Mat_SeqAIJ             *bseq = static_cast<Mat_SeqAIJ *>((*B)->data);
620       MatScalarKokkosViewHost a_h(bseq->a, internT.nnz()); /* bseq->nz = 0 if unassembled */
621       MatColIdxKokkosViewHost j_h(bseq->j, internT.nnz());
622       PetscCallCXX(Kokkos::deep_copy(a_h, internT.values));
623       PetscCallCXX(Kokkos::deep_copy(j_h, internT.graph.entries));
624     } else SETERRQ(PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "B must be assembled or preallocated");
625   }
626   PetscFunctionReturn(PETSC_SUCCESS);
627 }
628 
629 static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A)
630 {
631   Mat_SeqAIJKokkos *aijkok;
632 
633   PetscFunctionBegin;
634   if (A->factortype == MAT_FACTOR_NONE) {
635     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
636     delete aijkok;
637   } else {
638     delete static_cast<Mat_SeqAIJKokkosTriFactors *>(A->spptr);
639   }
640   A->spptr = NULL;
641   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatFactorGetSolverType_C", NULL));
642   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
643   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
644   PetscCall(MatDestroy_SeqAIJ(A));
645   PetscFunctionReturn(PETSC_SUCCESS);
646 }
647 
648 /*MC
649    MATSEQAIJKOKKOS - MATAIJKOKKOS = "(seq)aijkokkos" - A matrix type to be used for sparse matrices with Kokkos
650 
651    A matrix type type using Kokkos-Kernels CrsMatrix type for portability across different device types
652 
653    Options Database Key:
654 .  -mat_type aijkokkos - sets the matrix type to `MATSEQAIJKOKKOS` during a call to `MatSetFromOptions()`
655 
656   Level: beginner
657 
658 .seealso: [](ch_matrices), `Mat`, `MatCreateSeqAIJKokkos()`, `MATMPIAIJKOKKOS`
659 M*/
660 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A)
661 {
662   PetscFunctionBegin;
663   PetscCall(PetscKokkosInitializeCheck());
664   PetscCall(MatCreate_SeqAIJ(A));
665   PetscCall(MatConvert_SeqAIJ_SeqAIJKokkos(A, MATSEQAIJKOKKOS, MAT_INPLACE_MATRIX, &A));
666   PetscFunctionReturn(PETSC_SUCCESS);
667 }
668 
669 /* Merge A, B into a matrix C. A is put before B. C's size would be A->rmap->n by (A->cmap->n + B->cmap->n) */
670 PetscErrorCode MatSeqAIJKokkosMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C)
671 {
672   Mat_SeqAIJ         *a, *b;
673   Mat_SeqAIJKokkos   *akok, *bkok, *ckok;
674   MatScalarKokkosView aa, ba, ca;
675   MatRowMapKokkosView ai, bi, ci;
676   MatColIdxKokkosView aj, bj, cj;
677   PetscInt            m, n, nnz, aN;
678 
679   PetscFunctionBegin;
680   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
681   PetscValidHeaderSpecific(B, MAT_CLASSID, 2);
682   PetscAssertPointer(C, 4);
683   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
684   PetscCheckTypeName(B, MATSEQAIJKOKKOS);
685   PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n);
686   PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported");
687 
688   PetscCall(MatSeqAIJKokkosSyncDevice(A));
689   PetscCall(MatSeqAIJKokkosSyncDevice(B));
690   a    = static_cast<Mat_SeqAIJ *>(A->data);
691   b    = static_cast<Mat_SeqAIJ *>(B->data);
692   akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
693   bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
694   aa   = akok->a_dual.view_device();
695   ai   = akok->i_dual.view_device();
696   ba   = bkok->a_dual.view_device();
697   bi   = bkok->i_dual.view_device();
698   m    = A->rmap->n; /* M, N and nnz of C */
699   n    = A->cmap->n + B->cmap->n;
700   nnz  = a->nz + b->nz;
701   aN   = A->cmap->n; /* N of A */
702   if (reuse == MAT_INITIAL_MATRIX) {
703     aj           = akok->j_dual.view_device();
704     bj           = bkok->j_dual.view_device();
705     auto ca_dual = MatScalarKokkosDualView("a", aa.extent(0) + ba.extent(0));
706     auto ci_dual = MatRowMapKokkosDualView("i", ai.extent(0));
707     auto cj_dual = MatColIdxKokkosDualView("j", aj.extent(0) + bj.extent(0));
708     ca           = ca_dual.view_device();
709     ci           = ci_dual.view_device();
710     cj           = cj_dual.view_device();
711 
712     /* Concatenate A and B in parallel using Kokkos hierarchical parallelism */
713     Kokkos::parallel_for(
714       Kokkos::TeamPolicy<>(m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
715         PetscInt i       = t.league_rank(); /* row i */
716         PetscInt coffset = ai(i) + bi(i), alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i);
717 
718         Kokkos::single(Kokkos::PerTeam(t), [=]() { /* this side effect only happens once per whole team */
719                                                    ci(i) = coffset;
720                                                    if (i == m - 1) ci(m) = ai(m) + bi(m);
721         });
722 
723         Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) {
724           if (k < alen) {
725             ca(coffset + k) = aa(ai(i) + k);
726             cj(coffset + k) = aj(ai(i) + k);
727           } else {
728             ca(coffset + k) = ba(bi(i) + k - alen);
729             cj(coffset + k) = bj(bi(i) + k - alen) + aN; /* Entries in B get new column indices in C */
730           }
731         });
732       });
733     ca_dual.modify_device();
734     ci_dual.modify_device();
735     cj_dual.modify_device();
736     PetscCallCXX(ckok = new Mat_SeqAIJKokkos(m, n, nnz, ci_dual, cj_dual, ca_dual));
737     PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, ckok, C));
738   } else if (reuse == MAT_REUSE_MATRIX) {
739     PetscValidHeaderSpecific(*C, MAT_CLASSID, 4);
740     PetscCheckTypeName(*C, MATSEQAIJKOKKOS);
741     ckok = static_cast<Mat_SeqAIJKokkos *>((*C)->spptr);
742     ca   = ckok->a_dual.view_device();
743     ci   = ckok->i_dual.view_device();
744 
745     Kokkos::parallel_for(
746       Kokkos::TeamPolicy<>(m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
747         PetscInt i    = t.league_rank(); /* row i */
748         PetscInt alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i);
749         Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) {
750           if (k < alen) ca(ci(i) + k) = aa(ai(i) + k);
751           else ca(ci(i) + k) = ba(bi(i) + k - alen);
752         });
753       });
754     PetscCall(MatSeqAIJKokkosModifyDevice(*C));
755   }
756   PetscFunctionReturn(PETSC_SUCCESS);
757 }
758 
759 static PetscErrorCode MatProductDataDestroy_SeqAIJKokkos(void *pdata)
760 {
761   PetscFunctionBegin;
762   delete static_cast<MatProductData_SeqAIJKokkos *>(pdata);
763   PetscFunctionReturn(PETSC_SUCCESS);
764 }
765 
766 static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C)
767 {
768   Mat_Product                 *product = C->product;
769   Mat                          A, B;
770   bool                         transA, transB; /* use bool, since KK needs this type */
771   Mat_SeqAIJKokkos            *akok, *bkok, *ckok;
772   Mat_SeqAIJ                  *c;
773   MatProductData_SeqAIJKokkos *pdata;
774   KokkosCsrMatrix              csrmatA, csrmatB;
775 
776   PetscFunctionBegin;
777   MatCheckProduct(C, 1);
778   PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Product data empty");
779   pdata = static_cast<MatProductData_SeqAIJKokkos *>(C->product->data);
780 
781   // See if numeric has already been done in symbolic (e.g., user calls MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C)).
782   // If yes, skip the numeric, but reset the flag so that next time when user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C),
783   // we still do numeric.
784   if (pdata->reusesym) { // numeric reuses results from symbolic
785     pdata->reusesym = PETSC_FALSE;
786     PetscFunctionReturn(PETSC_SUCCESS);
787   }
788 
789   switch (product->type) {
790   case MATPRODUCT_AB:
791     transA = false;
792     transB = false;
793     break;
794   case MATPRODUCT_AtB:
795     transA = true;
796     transB = false;
797     break;
798   case MATPRODUCT_ABt:
799     transA = false;
800     transB = true;
801     break;
802   default:
803     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]);
804   }
805 
806   A = product->A;
807   B = product->B;
808   PetscCall(MatSeqAIJKokkosSyncDevice(A));
809   PetscCall(MatSeqAIJKokkosSyncDevice(B));
810   akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
811   bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
812   ckok = static_cast<Mat_SeqAIJKokkos *>(C->spptr);
813 
814   PetscCheck(ckok, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Device data structure spptr is empty");
815 
816   csrmatA = akok->csrmat;
817   csrmatB = bkok->csrmat;
818 
819   /* TODO: Once KK spgemm implements transpose, we can get rid of the explicit transpose here */
820   if (transA) {
821     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA));
822     transA = false;
823   }
824 
825   if (transB) {
826     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB));
827     transB = false;
828   }
829   PetscCall(PetscLogGpuTimeBegin());
830   PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, csrmatA, transA, csrmatB, transB, ckok->csrmat));
831 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(4, 0, 0)
832   auto spgemmHandle = pdata->kh.get_spgemm_handle();
833   if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(ckok->csrmat)); /* without sort, mat_tests-ex62_14_seqaijkokkos fails */
834 #endif
835 
836   PetscCall(PetscLogGpuTimeEnd());
837   PetscCall(MatSeqAIJKokkosModifyDevice(C));
838   /* shorter version of MatAssemblyEnd_SeqAIJ */
839   c = (Mat_SeqAIJ *)C->data;
840   PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz));
841   PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n"));
842   PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax));
843   c->reallocs         = 0;
844   C->info.mallocs     = 0;
845   C->info.nz_unneeded = 0;
846   C->assembled = C->was_assembled = PETSC_TRUE;
847   C->num_ass++;
848   PetscFunctionReturn(PETSC_SUCCESS);
849 }
850 
851 static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C)
852 {
853   Mat_Product                 *product = C->product;
854   MatProductType               ptype;
855   Mat                          A, B;
856   bool                         transA, transB;
857   Mat_SeqAIJKokkos            *akok, *bkok, *ckok;
858   MatProductData_SeqAIJKokkos *pdata;
859   MPI_Comm                     comm;
860   KokkosCsrMatrix              csrmatA, csrmatB, csrmatC;
861 
862   PetscFunctionBegin;
863   MatCheckProduct(C, 1);
864   PetscCall(PetscObjectGetComm((PetscObject)C, &comm));
865   PetscCheck(!product->data, comm, PETSC_ERR_PLIB, "Product data not empty");
866   A = product->A;
867   B = product->B;
868   PetscCall(MatSeqAIJKokkosSyncDevice(A));
869   PetscCall(MatSeqAIJKokkosSyncDevice(B));
870   akok    = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
871   bkok    = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
872   csrmatA = akok->csrmat;
873   csrmatB = bkok->csrmat;
874 
875   ptype = product->type;
876   // Take advantage of the symmetry if true
877   if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
878     ptype                                          = MATPRODUCT_AB;
879     product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE;
880   }
881   if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
882     ptype                                          = MATPRODUCT_AB;
883     product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE;
884   }
885 
886   switch (ptype) {
887   case MATPRODUCT_AB:
888     transA = false;
889     transB = false;
890     break;
891   case MATPRODUCT_AtB:
892     transA = true;
893     transB = false;
894     break;
895   case MATPRODUCT_ABt:
896     transA = false;
897     transB = true;
898     break;
899   default:
900     SETERRQ(comm, PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]);
901   }
902   PetscCallCXX(product->data = pdata = new MatProductData_SeqAIJKokkos());
903   pdata->reusesym = product->api_user;
904 
905   /* TODO: add command line options to select spgemm algorithms */
906   auto spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_DEFAULT; /* default alg is TPL if enabled, otherwise KK */
907 
908   /* CUDA-10.2's spgemm has bugs. We prefer the SpGEMMreuse APIs introduced in cuda-11.4 */
909 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
910   #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
911   spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_KK;
912   #endif
913 #endif
914   PetscCallCXX(pdata->kh.create_spgemm_handle(spgemm_alg));
915 
916   PetscCall(PetscLogGpuTimeBegin());
917   /* TODO: Get rid of the explicit transpose once KK-spgemm implements the transpose option */
918   if (transA) {
919     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA));
920     transA = false;
921   }
922 
923   if (transB) {
924     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB));
925     transB = false;
926   }
927 
928   PetscCallCXX(KokkosSparse::spgemm_symbolic(pdata->kh, csrmatA, transA, csrmatB, transB, csrmatC));
929   /* spgemm_symbolic() only populates C's rowmap, but not C's column indices.
930     So we have to do a fake spgemm_numeric() here to get csrmatC.j_d setup, before
931     calling new Mat_SeqAIJKokkos().
932     TODO: Remove the fake spgemm_numeric() after KK fixed this problem.
933   */
934   PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, csrmatA, transA, csrmatB, transB, csrmatC));
935 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(4, 0, 0)
936   /* Query if KK outputs a sorted matrix. If not, we need to sort it */
937   auto spgemmHandle = pdata->kh.get_spgemm_handle();
938   if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(csrmatC)); /* sort_option defaults to -1 in KK!*/
939 #endif
940   PetscCall(PetscLogGpuTimeEnd());
941 
942   PetscCallCXX(ckok = new Mat_SeqAIJKokkos(csrmatC));
943   PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(C, ckok));
944   C->product->destroy = MatProductDataDestroy_SeqAIJKokkos;
945   PetscFunctionReturn(PETSC_SUCCESS);
946 }
947 
948 /* handles sparse matrix matrix ops */
949 static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat)
950 {
951   Mat_Product *product = mat->product;
952   PetscBool    Biskok = PETSC_FALSE, Ciskok = PETSC_TRUE;
953 
954   PetscFunctionBegin;
955   MatCheckProduct(mat, 1);
956   PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJKOKKOS, &Biskok));
957   if (product->type == MATPRODUCT_ABC) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJKOKKOS, &Ciskok));
958   if (Biskok && Ciskok) {
959     switch (product->type) {
960     case MATPRODUCT_AB:
961     case MATPRODUCT_AtB:
962     case MATPRODUCT_ABt:
963       mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos;
964       break;
965     case MATPRODUCT_PtAP:
966     case MATPRODUCT_RARt:
967     case MATPRODUCT_ABC:
968       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
969       break;
970     default:
971       break;
972     }
973   } else { /* fallback for AIJ */
974     PetscCall(MatProductSetFromOptions_SeqAIJ(mat));
975   }
976   PetscFunctionReturn(PETSC_SUCCESS);
977 }
978 
979 static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a)
980 {
981   Mat_SeqAIJKokkos *aijkok;
982 
983   PetscFunctionBegin;
984   PetscCall(PetscLogGpuTimeBegin());
985   PetscCall(MatSeqAIJKokkosSyncDevice(A));
986   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
987   KokkosBlas::scal(aijkok->a_dual.view_device(), a, aijkok->a_dual.view_device());
988   PetscCall(MatSeqAIJKokkosModifyDevice(A));
989   PetscCall(PetscLogGpuFlops(aijkok->a_dual.extent(0)));
990   PetscCall(PetscLogGpuTimeEnd());
991   PetscFunctionReturn(PETSC_SUCCESS);
992 }
993 
994 static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A)
995 {
996   Mat_SeqAIJKokkos *aijkok;
997 
998   PetscFunctionBegin;
999   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1000   if (aijkok) { /* Only zero the device if data is already there */
1001     KokkosBlas::fill(aijkok->a_dual.view_device(), 0.0);
1002     PetscCall(MatSeqAIJKokkosModifyDevice(A));
1003   } else { /* Might be preallocated but not assembled */
1004     PetscCall(MatZeroEntries_SeqAIJ(A));
1005   }
1006   PetscFunctionReturn(PETSC_SUCCESS);
1007 }
1008 
1009 static PetscErrorCode MatGetDiagonal_SeqAIJKokkos(Mat A, Vec x)
1010 {
1011   Mat_SeqAIJ           *aijseq;
1012   Mat_SeqAIJKokkos     *aijkok;
1013   PetscInt              n;
1014   PetscScalarKokkosView xv;
1015 
1016   PetscFunctionBegin;
1017   PetscCall(VecGetLocalSize(x, &n));
1018   PetscCheck(n == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector");
1019   PetscCheck(A->factortype == MAT_FACTOR_NONE, PETSC_COMM_SELF, PETSC_ERR_SUP, "MatGetDiagonal_SeqAIJKokkos not supported on factored matrices");
1020 
1021   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1022   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1023 
1024   if (A->rmap->n && aijkok->diag_dual.extent(0) == 0) { /* Set the diagonal pointer if not already */
1025     PetscCall(MatMarkDiagonal_SeqAIJ(A));
1026     aijseq = static_cast<Mat_SeqAIJ *>(A->data);
1027     aijkok->SetDiagonal(aijseq->diag);
1028   }
1029 
1030   const auto &Aa    = aijkok->a_dual.view_device();
1031   const auto &Ai    = aijkok->i_dual.view_device();
1032   const auto &Adiag = aijkok->diag_dual.view_device();
1033 
1034   PetscCall(VecGetKokkosViewWrite(x, &xv));
1035   Kokkos::parallel_for(
1036     n, KOKKOS_LAMBDA(const PetscInt i) {
1037       if (Adiag(i) < Ai(i + 1)) xv(i) = Aa(Adiag(i));
1038       else xv(i) = 0;
1039     });
1040   PetscCall(VecRestoreKokkosViewWrite(x, &xv));
1041   PetscFunctionReturn(PETSC_SUCCESS);
1042 }
1043 
1044 /* Get a Kokkos View from a mat of type MatSeqAIJKokkos */
1045 PetscErrorCode MatSeqAIJGetKokkosView(Mat A, ConstMatScalarKokkosView *kv)
1046 {
1047   Mat_SeqAIJKokkos *aijkok;
1048 
1049   PetscFunctionBegin;
1050   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1051   PetscAssertPointer(kv, 2);
1052   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1053   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1054   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1055   *kv    = aijkok->a_dual.view_device();
1056   PetscFunctionReturn(PETSC_SUCCESS);
1057 }
1058 
1059 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, ConstMatScalarKokkosView *kv)
1060 {
1061   PetscFunctionBegin;
1062   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1063   PetscAssertPointer(kv, 2);
1064   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1065   PetscFunctionReturn(PETSC_SUCCESS);
1066 }
1067 
1068 PetscErrorCode MatSeqAIJGetKokkosView(Mat A, MatScalarKokkosView *kv)
1069 {
1070   Mat_SeqAIJKokkos *aijkok;
1071 
1072   PetscFunctionBegin;
1073   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1074   PetscAssertPointer(kv, 2);
1075   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1076   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1077   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1078   *kv    = aijkok->a_dual.view_device();
1079   PetscFunctionReturn(PETSC_SUCCESS);
1080 }
1081 
1082 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, MatScalarKokkosView *kv)
1083 {
1084   PetscFunctionBegin;
1085   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1086   PetscAssertPointer(kv, 2);
1087   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1088   PetscCall(MatSeqAIJKokkosModifyDevice(A));
1089   PetscFunctionReturn(PETSC_SUCCESS);
1090 }
1091 
1092 PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A, MatScalarKokkosView *kv)
1093 {
1094   Mat_SeqAIJKokkos *aijkok;
1095 
1096   PetscFunctionBegin;
1097   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1098   PetscAssertPointer(kv, 2);
1099   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1100   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1101   *kv    = aijkok->a_dual.view_device();
1102   PetscFunctionReturn(PETSC_SUCCESS);
1103 }
1104 
1105 PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A, MatScalarKokkosView *kv)
1106 {
1107   PetscFunctionBegin;
1108   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1109   PetscAssertPointer(kv, 2);
1110   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1111   PetscCall(MatSeqAIJKokkosModifyDevice(A));
1112   PetscFunctionReturn(PETSC_SUCCESS);
1113 }
1114 
1115 /* Computes Y += alpha X */
1116 static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y, PetscScalar alpha, Mat X, MatStructure pattern)
1117 {
1118   Mat_SeqAIJ              *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data;
1119   Mat_SeqAIJKokkos        *xkok, *ykok, *zkok;
1120   ConstMatScalarKokkosView Xa;
1121   MatScalarKokkosView      Ya;
1122 
1123   PetscFunctionBegin;
1124   PetscCheckTypeName(Y, MATSEQAIJKOKKOS);
1125   PetscCheckTypeName(X, MATSEQAIJKOKKOS);
1126   PetscCall(MatSeqAIJKokkosSyncDevice(Y));
1127   PetscCall(MatSeqAIJKokkosSyncDevice(X));
1128   PetscCall(PetscLogGpuTimeBegin());
1129 
1130   if (pattern != SAME_NONZERO_PATTERN && x->nz == y->nz) {
1131     PetscBool e;
1132     PetscCall(PetscArraycmp(x->i, y->i, Y->rmap->n + 1, &e));
1133     if (e) {
1134       PetscCall(PetscArraycmp(x->j, y->j, y->nz, &e));
1135       if (e) pattern = SAME_NONZERO_PATTERN;
1136     }
1137   }
1138 
1139   /* cusparseDcsrgeam2() computes C = alpha A + beta B. If one knew sparsity pattern of C, one can skip
1140     cusparseScsrgeam2_bufferSizeExt() / cusparseXcsrgeam2Nnz(), and directly call cusparseScsrgeam2().
1141     If X is SUBSET_NONZERO_PATTERN of Y, we could take advantage of this cusparse feature. However,
1142     KokkosSparse::spadd(alpha,A,beta,B,C) has symbolic and numeric phases, MatAXPY does not.
1143   */
1144   ykok = static_cast<Mat_SeqAIJKokkos *>(Y->spptr);
1145   xkok = static_cast<Mat_SeqAIJKokkos *>(X->spptr);
1146   Xa   = xkok->a_dual.view_device();
1147   Ya   = ykok->a_dual.view_device();
1148 
1149   if (pattern == SAME_NONZERO_PATTERN) {
1150     KokkosBlas::axpy(alpha, Xa, Ya);
1151     PetscCall(MatSeqAIJKokkosModifyDevice(Y));
1152   } else if (pattern == SUBSET_NONZERO_PATTERN) {
1153     MatRowMapKokkosView Xi = xkok->i_dual.view_device(), Yi = ykok->i_dual.view_device();
1154     MatColIdxKokkosView Xj = xkok->j_dual.view_device(), Yj = ykok->j_dual.view_device();
1155 
1156     Kokkos::parallel_for(
1157       Kokkos::TeamPolicy<>(Y->rmap->n, 1), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
1158         PetscInt i = t.league_rank(); // row i
1159         Kokkos::single(Kokkos::PerTeam(t), [=]() {
1160           // Only one thread works in a team
1161           PetscInt p, q = Yi(i);
1162           for (p = Xi(i); p < Xi(i + 1); p++) {          // For each nonzero on row i of X,
1163             while (Xj(p) != Yj(q) && q < Yi(i + 1)) q++; // find the matching nonzero on row i of Y.
1164             if (Xj(p) == Yj(q)) {                        // Found it
1165               Ya(q) += alpha * Xa(p);
1166               q++;
1167             } else {
1168             // If not found, it indicates the input is wrong (X is not a SUBSET_NONZERO_PATTERN of Y).
1169             // Just insert a NaN at the beginning of row i if it is not empty, to make the result wrong.
1170 #if PETSC_PKG_KOKKOS_VERSION_GE(3, 7, 0)
1171               if (Yi(i) != Yi(i + 1)) Ya(Yi(i)) = Kokkos::ArithTraits<PetscScalar>::nan();
1172 #else
1173               if (Yi(i) != Yi(i + 1)) Ya(Yi(i)) = Kokkos::Experimental::nan("1");
1174 #endif
1175             }
1176           }
1177         });
1178       });
1179     PetscCall(MatSeqAIJKokkosModifyDevice(Y));
1180   } else { // different nonzero patterns
1181     Mat             Z;
1182     KokkosCsrMatrix zcsr;
1183     KernelHandle    kh;
1184     kh.create_spadd_handle(true); // X, Y are sorted
1185     KokkosSparse::spadd_symbolic(&kh, xkok->csrmat, ykok->csrmat, zcsr);
1186     KokkosSparse::spadd_numeric(&kh, alpha, xkok->csrmat, (PetscScalar)1.0, ykok->csrmat, zcsr);
1187     zkok = new Mat_SeqAIJKokkos(zcsr);
1188     PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, zkok, &Z));
1189     PetscCall(MatHeaderReplace(Y, &Z));
1190     kh.destroy_spadd_handle();
1191   }
1192   PetscCall(PetscLogGpuTimeEnd());
1193   PetscCall(PetscLogGpuFlops(xkok->a_dual.extent(0) * 2)); // Because we scaled X and then added it to Y
1194   PetscFunctionReturn(PETSC_SUCCESS);
1195 }
1196 
1197 struct MatCOOStruct_SeqAIJKokkos {
1198   PetscCount           n;
1199   PetscCount           Atot;
1200   PetscInt             nz;
1201   PetscCountKokkosView jmap;
1202   PetscCountKokkosView perm;
1203 
1204   MatCOOStruct_SeqAIJKokkos(const MatCOOStruct_SeqAIJ *coo_h)
1205   {
1206     nz   = coo_h->nz;
1207     n    = coo_h->n;
1208     Atot = coo_h->Atot;
1209     jmap = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), PetscCountKokkosViewHost(coo_h->jmap, nz + 1));
1210     perm = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), PetscCountKokkosViewHost(coo_h->perm, Atot));
1211   }
1212 };
1213 
1214 static PetscErrorCode MatCOOStructDestroy_SeqAIJKokkos(void *data)
1215 {
1216   PetscFunctionBegin;
1217   PetscCallCXX(delete static_cast<MatCOOStruct_SeqAIJKokkos *>(data));
1218   PetscFunctionReturn(PETSC_SUCCESS);
1219 }
1220 
1221 static PetscErrorCode MatSetPreallocationCOO_SeqAIJKokkos(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
1222 {
1223   Mat_SeqAIJKokkos          *akok;
1224   Mat_SeqAIJ                *aseq;
1225   PetscContainer             container_h, container_d;
1226   MatCOOStruct_SeqAIJ       *coo_h;
1227   MatCOOStruct_SeqAIJKokkos *coo_d;
1228 
1229   PetscFunctionBegin;
1230   PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, coo_i, coo_j));
1231   aseq = static_cast<Mat_SeqAIJ *>(mat->data);
1232   akok = static_cast<Mat_SeqAIJKokkos *>(mat->spptr);
1233   delete akok;
1234   mat->spptr = akok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, aseq->nz, aseq->i, aseq->j, aseq->a, mat->nonzerostate + 1, PETSC_FALSE);
1235   PetscCall(MatZeroEntries_SeqAIJKokkos(mat));
1236 
1237   // Copy the COO struct to device
1238   PetscCall(PetscObjectQuery((PetscObject)mat, "__PETSc_MatCOOStruct_Host", (PetscObject *)&container_h));
1239   PetscCall(PetscContainerGetPointer(container_h, (void **)&coo_h));
1240   PetscCallCXX(coo_d = new MatCOOStruct_SeqAIJKokkos(coo_h));
1241 
1242   // Put the COO struct in a container and then attach that to the matrix
1243   PetscCall(PetscContainerCreate(PETSC_COMM_SELF, &container_d));
1244   PetscCall(PetscContainerSetPointer(container_d, coo_d));
1245   PetscCall(PetscContainerSetUserDestroy(container_d, MatCOOStructDestroy_SeqAIJKokkos));
1246   PetscCall(PetscObjectCompose((PetscObject)mat, "__PETSc_MatCOOStruct_Device", (PetscObject)container_d));
1247   PetscCall(PetscContainerDestroy(&container_d));
1248   PetscFunctionReturn(PETSC_SUCCESS);
1249 }
1250 
1251 static PetscErrorCode MatSetValuesCOO_SeqAIJKokkos(Mat A, const PetscScalar v[], InsertMode imode)
1252 {
1253   MatScalarKokkosView        Aa;
1254   ConstMatScalarKokkosView   kv;
1255   PetscMemType               memtype;
1256   PetscContainer             container;
1257   MatCOOStruct_SeqAIJKokkos *coo;
1258 
1259   PetscFunctionBegin;
1260   PetscCall(PetscObjectQuery((PetscObject)A, "__PETSc_MatCOOStruct_Device", (PetscObject *)&container));
1261   PetscCall(PetscContainerGetPointer(container, (void **)&coo));
1262 
1263   const auto &n    = coo->n;
1264   const auto &Annz = coo->nz;
1265   const auto &jmap = coo->jmap;
1266   const auto &perm = coo->perm;
1267 
1268   PetscCall(PetscGetMemType(v, &memtype));
1269   if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */
1270     kv = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), ConstMatScalarKokkosViewHost(v, n));
1271   } else {
1272     kv = ConstMatScalarKokkosView(v, n); /* Directly use v[]'s memory */
1273   }
1274 
1275   if (imode == INSERT_VALUES) PetscCall(MatSeqAIJGetKokkosViewWrite(A, &Aa)); /* write matrix values */
1276   else PetscCall(MatSeqAIJGetKokkosView(A, &Aa));                             /* read & write matrix values */
1277 
1278   PetscCall(PetscLogGpuTimeBegin());
1279   Kokkos::parallel_for(
1280     Annz, KOKKOS_LAMBDA(const PetscCount i) {
1281       PetscScalar sum = 0.0;
1282       for (PetscCount k = jmap(i); k < jmap(i + 1); k++) sum += kv(perm(k));
1283       Aa(i) = (imode == INSERT_VALUES ? 0.0 : Aa(i)) + sum;
1284     });
1285   PetscCall(PetscLogGpuTimeEnd());
1286 
1287   if (imode == INSERT_VALUES) PetscCall(MatSeqAIJRestoreKokkosViewWrite(A, &Aa));
1288   else PetscCall(MatSeqAIJRestoreKokkosView(A, &Aa));
1289   PetscFunctionReturn(PETSC_SUCCESS);
1290 }
1291 
1292 static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info)
1293 {
1294   PetscFunctionBegin;
1295   PetscCall(MatSeqAIJKokkosSyncHost(A));
1296   PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info));
1297   B->offloadmask = PETSC_OFFLOAD_CPU;
1298   PetscFunctionReturn(PETSC_SUCCESS);
1299 }
1300 
1301 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A)
1302 {
1303   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
1304 
1305   PetscFunctionBegin;
1306   A->offloadmask = PETSC_OFFLOAD_KOKKOS; /* We do not really use this flag */
1307   A->boundtocpu  = PETSC_FALSE;
1308 
1309   A->ops->assemblyend               = MatAssemblyEnd_SeqAIJKokkos;
1310   A->ops->destroy                   = MatDestroy_SeqAIJKokkos;
1311   A->ops->duplicate                 = MatDuplicate_SeqAIJKokkos;
1312   A->ops->axpy                      = MatAXPY_SeqAIJKokkos;
1313   A->ops->scale                     = MatScale_SeqAIJKokkos;
1314   A->ops->zeroentries               = MatZeroEntries_SeqAIJKokkos;
1315   A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJKokkos;
1316   A->ops->mult                      = MatMult_SeqAIJKokkos;
1317   A->ops->multadd                   = MatMultAdd_SeqAIJKokkos;
1318   A->ops->multtranspose             = MatMultTranspose_SeqAIJKokkos;
1319   A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJKokkos;
1320   A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJKokkos;
1321   A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos;
1322   A->ops->productnumeric            = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos;
1323   A->ops->transpose                 = MatTranspose_SeqAIJKokkos;
1324   A->ops->setoption                 = MatSetOption_SeqAIJKokkos;
1325   A->ops->getdiagonal               = MatGetDiagonal_SeqAIJKokkos;
1326   a->ops->getarray                  = MatSeqAIJGetArray_SeqAIJKokkos;
1327   a->ops->restorearray              = MatSeqAIJRestoreArray_SeqAIJKokkos;
1328   a->ops->getarrayread              = MatSeqAIJGetArrayRead_SeqAIJKokkos;
1329   a->ops->restorearrayread          = MatSeqAIJRestoreArrayRead_SeqAIJKokkos;
1330   a->ops->getarraywrite             = MatSeqAIJGetArrayWrite_SeqAIJKokkos;
1331   a->ops->restorearraywrite         = MatSeqAIJRestoreArrayWrite_SeqAIJKokkos;
1332   a->ops->getcsrandmemtype          = MatSeqAIJGetCSRAndMemType_SeqAIJKokkos;
1333 
1334   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJKokkos));
1335   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJKokkos));
1336   PetscFunctionReturn(PETSC_SUCCESS);
1337 }
1338 
1339 /*
1340    Extract the (prescribled) diagonal blocks of the matrix and then invert them
1341 
1342   Input Parameters:
1343 +  A       - the MATSEQAIJKOKKOS matrix
1344 .  bs      - block sizes in 'csr' format, i.e., the i-th block has size bs(i+1) - bs(i)
1345 .  bs2     - square of block sizes in 'csr' format, i.e., the i-th block should be stored at offset bs2(i) in diagVal[]
1346 .  blkMap  - map row ids to block ids, i.e., row i belongs to the block blkMap(i)
1347 -  work    - a pre-allocated work buffer (as big as diagVal) for use by this routine
1348 
1349   Output Parameter:
1350 .  diagVal - the (pre-allocated) buffer to store the inverted blocks (each block is stored in column-major order)
1351 */
1352 PETSC_INTERN PetscErrorCode MatInvertVariableBlockDiagonal_SeqAIJKokkos(Mat A, const PetscIntKokkosView &bs, const PetscIntKokkosView &bs2, const PetscIntKokkosView &blkMap, PetscScalarKokkosView &work, PetscScalarKokkosView &diagVal)
1353 {
1354   Mat_SeqAIJKokkos *akok    = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1355   PetscInt          N       = A->rmap->n;
1356   PetscInt          nblocks = bs.extent(0) - 1;
1357 
1358   PetscFunctionBegin;
1359   // Set the diagonal pointer on device if not already
1360   if (N && akok->diag_dual.extent(0) == 0) {
1361     PetscCall(MatMarkDiagonal_SeqAIJ(A));
1362     akok->SetDiagonal(static_cast<Mat_SeqAIJ *>(A->data)->diag);
1363   }
1364 
1365   PetscCall(MatSeqAIJKokkosSyncDevice(A)); // Since we'll access A's value on device
1366 
1367   // Pull out the diagonal blocks of the matrix and then invert the blocks
1368   auto Aa    = akok->a_dual.view_device();
1369   auto Ai    = akok->i_dual.view_device();
1370   auto Aj    = akok->j_dual.view_device();
1371   auto Adiag = akok->diag_dual.view_device();
1372   // TODO: how to tune the team size?
1373 #if defined(KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_HOST)
1374   auto ts = Kokkos::AUTO();
1375 #else
1376   auto ts         = 16; // improved performance 30% over Kokkos::AUTO() with CUDA, but failed with "Kokkos::abort: Requested Team Size is too large!" on CPUs
1377 #endif
1378   PetscCallCXX(Kokkos::parallel_for(
1379     Kokkos::TeamPolicy<>(nblocks, ts), KOKKOS_LAMBDA(const KokkosTeamMemberType &teamMember) {
1380       const PetscInt bid    = teamMember.league_rank();                                                   // block id
1381       const PetscInt rstart = bs(bid);                                                                    // this block starts from this row
1382       const PetscInt m      = bs(bid + 1) - bs(bid);                                                      // size of this block
1383       const auto    &B      = Kokkos::View<PetscScalar **, Kokkos::LayoutLeft>(&diagVal(bs2(bid)), m, m); // column-major order
1384       const auto    &W      = PetscScalarKokkosView(&work(bs2(bid)), m * m);
1385 
1386       Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, m), [=](const PetscInt &r) { // r-th row in B
1387         PetscInt i = rstart + r;                                                            // i-th row in A
1388 
1389         if (Ai(i) <= Adiag(i) && Adiag(i) < Ai(i + 1)) { // if the diagonal exists (common case)
1390           PetscInt first = Adiag(i) - r;                 // we start to check nonzeros from here along this row
1391 
1392           for (PetscInt c = 0; c < m; c++) {                   // walk n steps to see what column indices we will meet
1393             if (first + c < Ai(i) || first + c >= Ai(i + 1)) { // this entry (first+c) is out of range of this row, in other words, its value is zero
1394               B(r, c) = 0.0;
1395             } else if (Aj(first + c) == rstart + c) { // this entry is right on the (rstart+c) column
1396               B(r, c) = Aa(first + c);
1397             } else { // this entry does not show up in the CSR
1398               B(r, c) = 0.0;
1399             }
1400           }
1401         } else { // rare case that the diagonal does not exist
1402           const PetscInt begin = Ai(i);
1403           const PetscInt end   = Ai(i + 1);
1404           for (PetscInt c = 0; c < m; c++) B(r, c) = 0.0;
1405           for (PetscInt j = begin; j < end; j++) { // scan the whole row; could use binary search but this is a rare case so we did not.
1406             if (rstart <= Aj(j) && Aj(j) < rstart + m) B(r, Aj(j) - rstart) = Aa(j);
1407             else if (Aj(j) >= rstart + m) break;
1408           }
1409         }
1410       });
1411 
1412       // LU-decompose B (w/o pivoting) and then invert B
1413       KokkosBatched::TeamLU<KokkosTeamMemberType, KokkosBatched::Algo::LU::Unblocked>::invoke(teamMember, B, 0.0);
1414       KokkosBatched::TeamInverseLU<KokkosTeamMemberType, KokkosBatched::Algo::InverseLU::Unblocked>::invoke(teamMember, B, W);
1415     }));
1416   // PetscLogGpuFlops() is done in the caller PCSetUp_VPBJacobi_Kokkos as we don't want to compute the flops in kernels
1417   PetscFunctionReturn(PETSC_SUCCESS);
1418 }
1419 
1420 PETSC_INTERN PetscErrorCode MatSetSeqAIJKokkosWithCSRMatrix(Mat A, Mat_SeqAIJKokkos *akok)
1421 {
1422   Mat_SeqAIJ *aseq;
1423   PetscInt    i, m, n;
1424   auto       &exec = PetscGetKokkosExecutionSpace();
1425 
1426   PetscFunctionBegin;
1427   PetscCheck(!A->spptr, PETSC_COMM_SELF, PETSC_ERR_PLIB, "A->spptr is supposed to be empty");
1428 
1429   m = akok->nrows();
1430   n = akok->ncols();
1431   PetscCall(MatSetSizes(A, m, n, m, n));
1432   PetscCall(MatSetType(A, MATSEQAIJKOKKOS));
1433 
1434   /* Set up data structures of A as a MATSEQAIJ */
1435   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(A, MAT_SKIP_ALLOCATION, NULL));
1436   aseq = (Mat_SeqAIJ *)(A)->data;
1437 
1438   PetscCallCXX(akok->i_dual.sync_host(exec)); /* We always need sync'ed i, j on host */
1439   PetscCallCXX(akok->j_dual.sync_host(exec));
1440   PetscCallCXX(exec.fence());
1441 
1442   aseq->i            = akok->i_host_data();
1443   aseq->j            = akok->j_host_data();
1444   aseq->a            = akok->a_host_data();
1445   aseq->nonew        = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/
1446   aseq->singlemalloc = PETSC_FALSE;
1447   aseq->free_a       = PETSC_FALSE;
1448   aseq->free_ij      = PETSC_FALSE;
1449   aseq->nz           = akok->nnz();
1450   aseq->maxnz        = aseq->nz;
1451 
1452   PetscCall(PetscMalloc1(m, &aseq->imax));
1453   PetscCall(PetscMalloc1(m, &aseq->ilen));
1454   for (i = 0; i < m; i++) aseq->ilen[i] = aseq->imax[i] = aseq->i[i + 1] - aseq->i[i];
1455 
1456   /* It is critical to set the nonzerostate, as we use it to check if sparsity pattern (hence data) has changed on host in MatAssemblyEnd */
1457   akok->nonzerostate = A->nonzerostate;
1458   A->spptr           = akok; /* Set A->spptr before MatAssembly so that A->spptr won't be allocated again there */
1459   PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY));
1460   PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY));
1461   PetscFunctionReturn(PETSC_SUCCESS);
1462 }
1463 
1464 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGetKokkosCsrMatrix(Mat A, KokkosCsrMatrix *csr)
1465 {
1466   PetscFunctionBegin;
1467   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1468   *csr = static_cast<Mat_SeqAIJKokkos *>(A->spptr)->csrmat;
1469   PetscFunctionReturn(PETSC_SUCCESS);
1470 }
1471 
1472 PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithKokkosCsrMatrix(MPI_Comm comm, KokkosCsrMatrix csr, Mat *A)
1473 {
1474   Mat_SeqAIJKokkos *akok;
1475   PetscFunctionBegin;
1476   PetscCallCXX(akok = new Mat_SeqAIJKokkos(csr));
1477   PetscCall(MatCreate(comm, A));
1478   PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok));
1479   PetscFunctionReturn(PETSC_SUCCESS);
1480 }
1481 
1482 /* Crete a SEQAIJKOKKOS matrix with a Mat_SeqAIJKokkos data structure
1483 
1484    Note we have names like MatSeqAIJSetPreallocationCSR, so I use capitalized CSR
1485  */
1486 PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithCSRMatrix(MPI_Comm comm, Mat_SeqAIJKokkos *akok, Mat *A)
1487 {
1488   PetscFunctionBegin;
1489   PetscCall(MatCreate(comm, A));
1490   PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok));
1491   PetscFunctionReturn(PETSC_SUCCESS);
1492 }
1493 
1494 /*@C
1495   MatCreateSeqAIJKokkos - Creates a sparse matrix in `MATSEQAIJKOKKOS` (compressed row) format
1496   (the default parallel PETSc format). This matrix will ultimately be handled by
1497   Kokkos for calculations.
1498 
1499   Collective
1500 
1501   Input Parameters:
1502 + comm - MPI communicator, set to `PETSC_COMM_SELF`
1503 . m    - number of rows
1504 . n    - number of columns
1505 . nz   - number of nonzeros per row (same for all rows), ignored if `nnz` is provided
1506 - nnz  - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL`
1507 
1508   Output Parameter:
1509 . A - the matrix
1510 
1511   Level: intermediate
1512 
1513   Notes:
1514   It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
1515   MatXXXXSetPreallocation() paradgm instead of this routine directly.
1516   [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]
1517 
1518   The AIJ format, also called
1519   compressed row storage, is fully compatible with standard Fortran
1520   storage.  That is, the stored row and column indices can begin at
1521   either one (as in Fortran) or zero.
1522 
1523   Specify the preallocated storage with either `nz` or `nnz` (not both).
1524   Set `nz` = `PETSC_DEFAULT` and `nnz` = `NULL` for PETSc to control dynamic memory
1525   allocation.
1526 
1527 .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`
1528 @*/
1529 PetscErrorCode MatCreateSeqAIJKokkos(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A)
1530 {
1531   PetscFunctionBegin;
1532   PetscCall(PetscKokkosInitializeCheck());
1533   PetscCall(MatCreate(comm, A));
1534   PetscCall(MatSetSizes(*A, m, n, m, n));
1535   PetscCall(MatSetType(*A, MATSEQAIJKOKKOS));
1536   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz));
1537   PetscFunctionReturn(PETSC_SUCCESS);
1538 }
1539 
1540 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
1541 {
1542   PetscFunctionBegin;
1543   PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
1544   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos;
1545   PetscFunctionReturn(PETSC_SUCCESS);
1546 }
1547 
1548 static PetscErrorCode MatSeqAIJKokkosSymbolicSolveCheck(Mat A)
1549 {
1550   Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1551 
1552   PetscFunctionBegin;
1553   if (!factors->sptrsv_symbolic_completed) {
1554     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khU, factors->iU_d, factors->jU_d, factors->aU_d);
1555     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khL, factors->iL_d, factors->jL_d, factors->aL_d);
1556     factors->sptrsv_symbolic_completed = PETSC_TRUE;
1557   }
1558   PetscFunctionReturn(PETSC_SUCCESS);
1559 }
1560 
1561 /* Check if we need to update factors etc for transpose solve */
1562 static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A)
1563 {
1564   Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1565   MatColIdxType               n       = A->rmap->n;
1566 
1567   PetscFunctionBegin;
1568   if (!factors->transpose_updated) { /* TODO: KK needs to provide functions to do numeric transpose only */
1569     /* Update L^T and do sptrsv symbolic */
1570     factors->iLt_d = MatRowMapKokkosView("factors->iLt_d", n + 1); // KK requires 0
1571     factors->jLt_d = MatColIdxKokkosView(NoInit("factors->jLt_d"), factors->jL_d.extent(0));
1572     factors->aLt_d = MatScalarKokkosView(NoInit("factors->aLt_d"), factors->aL_d.extent(0));
1573 
1574     transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iL_d, factors->jL_d, factors->aL_d,
1575                                                                                                                                                                                                               factors->iLt_d, factors->jLt_d, factors->aLt_d);
1576 
1577     /* TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices.
1578       We have to sort the indices, until KK provides finer control options.
1579     */
1580     sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iLt_d, factors->jLt_d, factors->aLt_d);
1581 
1582     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d);
1583 
1584     /* Update U^T and do sptrsv symbolic */
1585     factors->iUt_d = MatRowMapKokkosView("factors->iUt_d", n + 1); // KK requires 0
1586     factors->jUt_d = MatColIdxKokkosView(NoInit("factors->jUt_d"), factors->jU_d.extent(0));
1587     factors->aUt_d = MatScalarKokkosView(NoInit("factors->aUt_d"), factors->aU_d.extent(0));
1588 
1589     transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iU_d, factors->jU_d, factors->aU_d,
1590                                                                                                                                                                                                               factors->iUt_d, factors->jUt_d, factors->aUt_d);
1591 
1592     /* Sort indices. See comments above */
1593     sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iUt_d, factors->jUt_d, factors->aUt_d);
1594 
1595     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d);
1596     factors->transpose_updated = PETSC_TRUE;
1597   }
1598   PetscFunctionReturn(PETSC_SUCCESS);
1599 }
1600 
1601 /* Solve Ax = b, with A = LU */
1602 static PetscErrorCode MatSolve_SeqAIJKokkos(Mat A, Vec b, Vec x)
1603 {
1604   ConstPetscScalarKokkosView  bv;
1605   PetscScalarKokkosView       xv;
1606   Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1607 
1608   PetscFunctionBegin;
1609   PetscCall(PetscLogGpuTimeBegin());
1610   PetscCall(MatSeqAIJKokkosSymbolicSolveCheck(A));
1611   PetscCall(VecGetKokkosView(b, &bv));
1612   PetscCall(VecGetKokkosViewWrite(x, &xv));
1613   /* Solve L tmpv = b */
1614   PetscCallCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khL, factors->iL_d, factors->jL_d, factors->aL_d, bv, factors->workVector));
1615   /* Solve Ux = tmpv */
1616   PetscCallCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khU, factors->iU_d, factors->jU_d, factors->aU_d, factors->workVector, xv));
1617   PetscCall(VecRestoreKokkosView(b, &bv));
1618   PetscCall(VecRestoreKokkosViewWrite(x, &xv));
1619   PetscCall(PetscLogGpuTimeEnd());
1620   PetscFunctionReturn(PETSC_SUCCESS);
1621 }
1622 
1623 /* Solve A^T x = b, where A^T = U^T L^T */
1624 static PetscErrorCode MatSolveTranspose_SeqAIJKokkos(Mat A, Vec b, Vec x)
1625 {
1626   ConstPetscScalarKokkosView  bv;
1627   PetscScalarKokkosView       xv;
1628   Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1629 
1630   PetscFunctionBegin;
1631   PetscCall(PetscLogGpuTimeBegin());
1632   PetscCall(MatSeqAIJKokkosTransposeSolveCheck(A));
1633   PetscCall(VecGetKokkosView(b, &bv));
1634   PetscCall(VecGetKokkosViewWrite(x, &xv));
1635   /* Solve U^T tmpv = b */
1636   KokkosSparse::Experimental::sptrsv_solve(&factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d, bv, factors->workVector);
1637 
1638   /* Solve L^T x = tmpv */
1639   KokkosSparse::Experimental::sptrsv_solve(&factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d, factors->workVector, xv);
1640   PetscCall(VecRestoreKokkosView(b, &bv));
1641   PetscCall(VecRestoreKokkosViewWrite(x, &xv));
1642   PetscCall(PetscLogGpuTimeEnd());
1643   PetscFunctionReturn(PETSC_SUCCESS);
1644 }
1645 
1646 static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info)
1647 {
1648   Mat_SeqAIJKokkos           *aijkok   = (Mat_SeqAIJKokkos *)A->spptr;
1649   Mat_SeqAIJKokkosTriFactors *factors  = (Mat_SeqAIJKokkosTriFactors *)B->spptr;
1650   PetscInt                    fill_lev = info->levels;
1651 
1652   PetscFunctionBegin;
1653   PetscCall(PetscLogGpuTimeBegin());
1654   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1655 
1656   auto a_d = aijkok->a_dual.view_device();
1657   auto i_d = aijkok->i_dual.view_device();
1658   auto j_d = aijkok->j_dual.view_device();
1659 
1660   KokkosSparse::Experimental::spiluk_numeric(&factors->kh, fill_lev, i_d, j_d, a_d, factors->iL_d, factors->jL_d, factors->aL_d, factors->iU_d, factors->jU_d, factors->aU_d);
1661 
1662   B->assembled              = PETSC_TRUE;
1663   B->preallocated           = PETSC_TRUE;
1664   B->ops->solve             = MatSolve_SeqAIJKokkos;
1665   B->ops->solvetranspose    = MatSolveTranspose_SeqAIJKokkos;
1666   B->ops->matsolve          = NULL;
1667   B->ops->matsolvetranspose = NULL;
1668   B->offloadmask            = PETSC_OFFLOAD_GPU;
1669 
1670   /* Once the factors' value changed, we need to update their transpose and sptrsv handle */
1671   factors->transpose_updated         = PETSC_FALSE;
1672   factors->sptrsv_symbolic_completed = PETSC_FALSE;
1673   /* TODO: log flops, but how to know that? */
1674   PetscCall(PetscLogGpuTimeEnd());
1675   PetscFunctionReturn(PETSC_SUCCESS);
1676 }
1677 
1678 static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
1679 {
1680   Mat_SeqAIJKokkos           *aijkok;
1681   Mat_SeqAIJ                 *b;
1682   Mat_SeqAIJKokkosTriFactors *factors  = (Mat_SeqAIJKokkosTriFactors *)B->spptr;
1683   PetscInt                    fill_lev = info->levels;
1684   PetscInt                    nnzA     = ((Mat_SeqAIJ *)A->data)->nz, nnzL, nnzU;
1685   PetscInt                    n        = A->rmap->n;
1686 
1687   PetscFunctionBegin;
1688   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1689   /* Rebuild factors */
1690   if (factors) {
1691     factors->Destroy();
1692   } /* Destroy the old if it exists */
1693   else {
1694     B->spptr = factors = new Mat_SeqAIJKokkosTriFactors(n);
1695   }
1696 
1697   /* Create a spiluk handle and then do symbolic factorization */
1698   nnzL = nnzU = PetscRealIntMultTruncate(info->fill, nnzA);
1699   factors->kh.create_spiluk_handle(KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1, n, nnzL, nnzU);
1700 
1701   auto spiluk_handle = factors->kh.get_spiluk_handle();
1702 
1703   Kokkos::realloc(factors->iL_d, n + 1); /* Free old arrays and realloc */
1704   Kokkos::realloc(factors->jL_d, spiluk_handle->get_nnzL());
1705   Kokkos::realloc(factors->iU_d, n + 1);
1706   Kokkos::realloc(factors->jU_d, spiluk_handle->get_nnzU());
1707 
1708   aijkok   = (Mat_SeqAIJKokkos *)A->spptr;
1709   auto i_d = aijkok->i_dual.view_device();
1710   auto j_d = aijkok->j_dual.view_device();
1711   KokkosSparse::Experimental::spiluk_symbolic(&factors->kh, fill_lev, i_d, j_d, factors->iL_d, factors->jL_d, factors->iU_d, factors->jU_d);
1712   /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */
1713 
1714   Kokkos::resize(factors->jL_d, spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */
1715   Kokkos::resize(factors->jU_d, spiluk_handle->get_nnzU());
1716   Kokkos::realloc(factors->aL_d, spiluk_handle->get_nnzL()); /* No need to retain old value */
1717   Kokkos::realloc(factors->aU_d, spiluk_handle->get_nnzU());
1718 
1719   /* TODO: add options to select sptrsv algorithms */
1720   /* Create sptrsv handles for L, U and their transpose */
1721 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
1722   auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SPTRSV_CUSPARSE;
1723 #else
1724   auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1;
1725 #endif
1726 
1727   factors->khL.create_sptrsv_handle(sptrsv_alg, n, true /* L is lower tri */);
1728   factors->khU.create_sptrsv_handle(sptrsv_alg, n, false /* U is not lower tri */);
1729   factors->khLt.create_sptrsv_handle(sptrsv_alg, n, false /* L^T is not lower tri */);
1730   factors->khUt.create_sptrsv_handle(sptrsv_alg, n, true /* U^T is lower tri */);
1731 
1732   /* Fill fields of the factor matrix B */
1733   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(B, MAT_SKIP_ALLOCATION, NULL));
1734   b     = (Mat_SeqAIJ *)B->data;
1735   b->nz = b->maxnz          = spiluk_handle->get_nnzL() + spiluk_handle->get_nnzU();
1736   B->info.fill_ratio_given  = info->fill;
1737   B->info.fill_ratio_needed = nnzA > 0 ? ((PetscReal)b->nz) / ((PetscReal)nnzA) : 1.0;
1738 
1739   B->offloadmask          = PETSC_OFFLOAD_GPU;
1740   B->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJKokkos;
1741   PetscFunctionReturn(PETSC_SUCCESS);
1742 }
1743 
1744 static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos(Mat A, MatSolverType *type)
1745 {
1746   PetscFunctionBegin;
1747   *type = MATSOLVERKOKKOS;
1748   PetscFunctionReturn(PETSC_SUCCESS);
1749 }
1750 
1751 /*MC
1752   MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices
1753   on a single GPU of type, `MATSEQAIJKOKKOS`, `MATAIJKOKKOS`.
1754 
1755   Level: beginner
1756 
1757 .seealso: [](ch_matrices), `Mat`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJKokkos()`, `MATAIJKOKKOS`, `MatKokkosSetFormat()`, `MatKokkosStorageFormat`, `MatKokkosFormatOperation`
1758 M*/
1759 PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A, MatFactorType ftype, Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */
1760 {
1761   PetscInt n = A->rmap->n;
1762 
1763   PetscFunctionBegin;
1764   PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B));
1765   PetscCall(MatSetSizes(*B, n, n, n, n));
1766   (*B)->factortype = ftype;
1767   PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU]));
1768   PetscCall(MatSetType(*B, MATSEQAIJKOKKOS));
1769 
1770   if (ftype == MAT_FACTOR_LU) {
1771     PetscCall(MatSetBlockSizesFromMats(*B, A, A));
1772     (*B)->canuseordering        = PETSC_TRUE;
1773     (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKokkos;
1774   } else if (ftype == MAT_FACTOR_ILU) {
1775     PetscCall(MatSetBlockSizesFromMats(*B, A, A));
1776     (*B)->canuseordering         = PETSC_FALSE;
1777     (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos;
1778   } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]);
1779 
1780   PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL));
1781   PetscCall(PetscObjectComposeFunction((PetscObject)(*B), "MatFactorGetSolverType_C", MatFactorGetSolverType_SeqAIJKokkos));
1782   PetscFunctionReturn(PETSC_SUCCESS);
1783 }
1784 
1785 PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_KOKKOS(void)
1786 {
1787   PetscFunctionBegin;
1788   PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_LU, MatGetFactor_SeqAIJKokkos_Kokkos));
1789   PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_ILU, MatGetFactor_SeqAIJKokkos_Kokkos));
1790   PetscFunctionReturn(PETSC_SUCCESS);
1791 }
1792 
1793 /* Utility to print out a KokkosCsrMatrix for debugging */
1794 PETSC_INTERN PetscErrorCode PrintCsrMatrix(const KokkosCsrMatrix &csrmat)
1795 {
1796   const auto        &iv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), csrmat.graph.row_map);
1797   const auto        &jv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), csrmat.graph.entries);
1798   const auto        &av = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), csrmat.values);
1799   const PetscInt    *i  = iv.data();
1800   const PetscInt    *j  = jv.data();
1801   const PetscScalar *a  = av.data();
1802   PetscInt           m = csrmat.numRows(), n = csrmat.numCols(), nnz = csrmat.nnz();
1803 
1804   PetscFunctionBegin;
1805   PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT " x %" PetscInt_FMT " SeqAIJKokkos, with %" PetscInt_FMT " nonzeros\n", m, n, nnz));
1806   for (PetscInt k = 0; k < m; k++) {
1807     PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT ": ", k));
1808     for (PetscInt p = i[k]; p < i[k + 1]; p++) PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT "(%.1f), ", j[p], (double)PetscRealPart(a[p])));
1809     PetscCall(PetscPrintf(PETSC_COMM_SELF, "\n"));
1810   }
1811   PetscFunctionReturn(PETSC_SUCCESS);
1812 }
1813