xref: /petsc/src/mat/impls/aij/seq/kokkos/aijkok.kokkos.cxx (revision 3f7878b3ed00daeee6ae8d2839a0d41a4d3e79ca)
1 #include <petsc_kokkos.hpp>
2 #include <petscvec_kokkos.hpp>
3 #include <petscmat_kokkos.hpp>
4 #include <petscpkg_version.h>
5 #include <petsc/private/petscimpl.h>
6 #include <petsc/private/sfimpl.h>
7 #include <petsc/private/kokkosimpl.hpp>
8 #include <petscsys.h>
9 
10 #include <Kokkos_Core.hpp>
11 #include <KokkosBlas.hpp>
12 #include <KokkosSparse_CrsMatrix.hpp>
13 
14 // To suppress compiler warnings:
15 // /path/include/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp:434:63:
16 // warning: 'cusparseStatus_t cusparseDbsrmm(cusparseHandle_t, cusparseDirection_t, cusparseOperation_t,
17 // cusparseOperation_t, int, int, int, int, const double*, cusparseMatDescr_t, const double*, const int*, const int*,
18 // int, const double*, int, const double*, double*, int)' is deprecated: please use cusparseSpMM instead [-Wdeprecated-declarations]
19 PETSC_PRAGMA_DIAGNOSTIC_IGNORED_BEGIN("-Wdeprecated-declarations")
20 #include <KokkosSparse_spmv.hpp>
21 PETSC_PRAGMA_DIAGNOSTIC_IGNORED_END()
22 
23 #include <KokkosSparse_spiluk.hpp>
24 #include <KokkosSparse_sptrsv.hpp>
25 #include <KokkosSparse_spgemm.hpp>
26 #include <KokkosSparse_spadd.hpp>
27 #include <KokkosBatched_LU_Decl.hpp>
28 #include <KokkosBatched_InverseLU_Decl.hpp>
29 
30 #include <../src/mat/impls/aij/seq/kokkos/aijkok.hpp>
31 
32 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_GE(3, 7, 0)
33   #include <KokkosSparse_Utils.hpp>
34 using KokkosSparse::sort_crs_matrix;
35 using KokkosSparse::Impl::transpose_matrix;
36 #else
37   #include <KokkosKernels_Sorting.hpp>
38 using KokkosKernels::sort_crs_matrix;
39 using KokkosKernels::Impl::transpose_matrix;
40 #endif
41 
42 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_GE(4, 6, 0)
43 using KokkosSparse::spiluk_symbolic;
44 using KokkosSparse::spiluk_numeric;
45 using KokkosSparse::sptrsv_symbolic;
46 using KokkosSparse::sptrsv_solve;
47 using KokkosSparse::Experimental::SPTRSVAlgorithm;
48 using KokkosSparse::Experimental::SPILUKAlgorithm;
49 #else
50 using KokkosSparse::Experimental::spiluk_symbolic;
51 using KokkosSparse::Experimental::spiluk_numeric;
52 using KokkosSparse::Experimental::sptrsv_symbolic;
53 using KokkosSparse::Experimental::sptrsv_solve;
54 using KokkosSparse::Experimental::SPTRSVAlgorithm;
55 using KokkosSparse::Experimental::SPILUKAlgorithm;
56 #endif
57 
58 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */
59 
60 /* MatAssemblyEnd_SeqAIJKokkos() happens when we finalized nonzeros of the matrix, either after
61    we assembled the matrix on host, or after we directly produced the matrix data on device (ex., through MatMatMult).
62    In the latter case, it is important to set a_dual's sync state correctly.
63  */
64 static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A, MatAssemblyType mode)
65 {
66   Mat_SeqAIJ       *aijseq;
67   Mat_SeqAIJKokkos *aijkok;
68 
69   PetscFunctionBegin;
70   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS);
71   PetscCall(MatAssemblyEnd_SeqAIJ(A, mode));
72 
73   aijseq = static_cast<Mat_SeqAIJ *>(A->data);
74   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
75 
76   /* If aijkok does not exist, we just copy i, j to device.
77      If aijkok already exists, but the device's nonzero pattern does not match with the host's, we assume the latest data is on host.
78      In both cases, we build a new aijkok structure.
79   */
80   if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { /* aijkok might not exist yet or nonzero pattern has changed */
81     if (aijkok && aijkok->host_aij_allocated_by_kokkos) {   /* Avoid accidentally freeing much needed a,i,j on host when deleting aijkok */
82       PetscCall(PetscShmgetAllocateArray(aijkok->nrows() + 1, sizeof(PetscInt), (void **)&aijseq->i));
83       PetscCall(PetscShmgetAllocateArray(aijkok->nnz(), sizeof(PetscInt), (void **)&aijseq->j));
84       PetscCall(PetscShmgetAllocateArray(aijkok->nnz(), sizeof(PetscInt), (void **)&aijseq->a));
85       PetscCall(PetscArraycpy(aijseq->i, aijkok->i_host_data(), aijkok->nrows() + 1));
86       PetscCall(PetscArraycpy(aijseq->j, aijkok->j_host_data(), aijkok->nnz()));
87       PetscCall(PetscArraycpy(aijseq->a, aijkok->a_host_data(), aijkok->nnz()));
88       aijseq->free_a  = PETSC_TRUE;
89       aijseq->free_ij = PETSC_TRUE;
90       /* This arises from MatCreateSeqAIJKokkosWithKokkosCsrMatrix() used in MatMatMult, where
91          we have the CsrMatrix on device first and then copy to host, followed by
92          MatSetMPIAIJWithSplitSeqAIJ() with garray = NULL.
93          One could improve it by not using NULL garray.
94       */
95     }
96     delete aijkok;
97     aijkok   = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aijseq, A->nonzerostate, PETSC_FALSE /*don't copy mat values to device*/);
98     A->spptr = aijkok;
99   } else if (A->rmap->n && aijkok->diag_dual.extent(0) == 0) { // MatProduct might directly produce AIJ on device, but not the diag.
100     MatRowMapKokkosViewHost diag_h(aijseq->diag, A->rmap->n);
101     auto                    diag_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), diag_h);
102     aijkok->diag_dual              = MatRowMapKokkosDualView(diag_d, diag_h);
103   }
104   PetscFunctionReturn(PETSC_SUCCESS);
105 }
106 
107 /* Sync CSR data to device if not yet */
108 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A)
109 {
110   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
111 
112   PetscFunctionBegin;
113   PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Can't sync factorized matrix from host to device");
114   PetscCheck(aijkok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
115   if (aijkok->a_dual.need_sync_device()) {
116     PetscCall(KokkosDualViewSyncDevice(aijkok->a_dual, PetscGetKokkosExecutionSpace()));
117     aijkok->transpose_updated = PETSC_FALSE; /* values of the transpose is out-of-date */
118     aijkok->hermitian_updated = PETSC_FALSE;
119   }
120   PetscFunctionReturn(PETSC_SUCCESS);
121 }
122 
123 /* Mark the CSR data on device as modified */
124 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosModifyDevice(Mat A)
125 {
126   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
127 
128   PetscFunctionBegin;
129   PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Not supported for factorized matries");
130   aijkok->a_dual.clear_sync_state();
131   aijkok->a_dual.modify_device();
132   aijkok->transpose_updated = PETSC_FALSE;
133   aijkok->hermitian_updated = PETSC_FALSE;
134   PetscCall(MatSeqAIJInvalidateDiagonal(A));
135   PetscCall(PetscObjectStateIncrease((PetscObject)A));
136   PetscFunctionReturn(PETSC_SUCCESS);
137 }
138 
139 static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A)
140 {
141   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
142   auto              exec   = PetscGetKokkosExecutionSpace();
143 
144   PetscFunctionBegin;
145   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
146   /* We do not expect one needs factors on host  */
147   PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Can't sync factorized matrix from device to host");
148   PetscCheck(aijkok, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing AIJKOK");
149   PetscCall(KokkosDualViewSyncHost(aijkok->a_dual, exec));
150   PetscFunctionReturn(PETSC_SUCCESS);
151 }
152 
153 static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A, PetscScalar *array[])
154 {
155   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
156 
157   PetscFunctionBegin;
158   /* aijkok contains valid pointers only if the host's nonzerostate matches with the device's.
159     Calling MatSeqAIJSetPreallocation() or MatSetValues() on host, where aijseq->{i,j,a} might be
160     reallocated, will lead to stale {i,j,a}_dual in aijkok. In both operations, the hosts's nonzerostate
161     must have been updated. The stale aijkok will be rebuilt during MatAssemblyEnd.
162   */
163   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
164     PetscCall(KokkosDualViewSyncHost(aijkok->a_dual, PetscGetKokkosExecutionSpace()));
165     *array = aijkok->a_dual.view_host().data();
166   } else { /* Happens when calling MatSetValues on a newly created matrix */
167     *array = static_cast<Mat_SeqAIJ *>(A->data)->a;
168   }
169   PetscFunctionReturn(PETSC_SUCCESS);
170 }
171 
172 static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJKokkos(Mat A, PetscScalar *array[])
173 {
174   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
175 
176   PetscFunctionBegin;
177   if (aijkok && A->nonzerostate == aijkok->nonzerostate) aijkok->a_dual.modify_host();
178   PetscFunctionReturn(PETSC_SUCCESS);
179 }
180 
181 static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[])
182 {
183   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
184 
185   PetscFunctionBegin;
186   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
187     PetscCall(KokkosDualViewSyncHost(aijkok->a_dual, PetscGetKokkosExecutionSpace()));
188     *array = aijkok->a_dual.view_host().data();
189   } else {
190     *array = static_cast<Mat_SeqAIJ *>(A->data)->a;
191   }
192   PetscFunctionReturn(PETSC_SUCCESS);
193 }
194 
195 static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[])
196 {
197   PetscFunctionBegin;
198   *array = NULL;
199   PetscFunctionReturn(PETSC_SUCCESS);
200 }
201 
202 static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[])
203 {
204   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
205 
206   PetscFunctionBegin;
207   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
208     *array = aijkok->a_dual.view_host().data();
209   } else { /* Ex. happens with MatZeroEntries on a preallocated but not assembled matrix */
210     *array = static_cast<Mat_SeqAIJ *>(A->data)->a;
211   }
212   PetscFunctionReturn(PETSC_SUCCESS);
213 }
214 
215 static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[])
216 {
217   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
218 
219   PetscFunctionBegin;
220   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
221     aijkok->a_dual.clear_sync_state();
222     aijkok->a_dual.modify_host();
223   }
224   PetscFunctionReturn(PETSC_SUCCESS);
225 }
226 
227 static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJKokkos(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype)
228 {
229   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
230 
231   PetscFunctionBegin;
232   PetscCheck(aijkok != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "aijkok is NULL");
233 
234   if (i) *i = aijkok->i_device_data();
235   if (j) *j = aijkok->j_device_data();
236   if (a) {
237     PetscCall(KokkosDualViewSyncDevice(aijkok->a_dual, PetscGetKokkosExecutionSpace()));
238     *a = aijkok->a_device_data();
239   }
240   if (mtype) *mtype = PETSC_MEMTYPE_KOKKOS;
241   PetscFunctionReturn(PETSC_SUCCESS);
242 }
243 
244 static PetscErrorCode MatGetCurrentMemType_SeqAIJKokkos(PETSC_UNUSED Mat A, PetscMemType *mtype)
245 {
246   PetscFunctionBegin;
247   *mtype = PETSC_MEMTYPE_KOKKOS;
248   PetscFunctionReturn(PETSC_SUCCESS);
249 }
250 
251 /*
252   Generate the sparsity pattern of a MatSeqAIJKokkos matrix's transpose on device.
253 
254   Input Parameter:
255 .  A       - the MATSEQAIJKOKKOS matrix
256 
257   Output Parameters:
258 +  perm_d - the permutation array on device, which connects Ta(i) = Aa(perm(i))
259 -  T_d    - the transpose on device, whose value array is allocated but not initialized
260 */
261 static PetscErrorCode MatSeqAIJKokkosGenerateTransposeStructure(Mat A, MatRowMapKokkosView &perm_d, KokkosCsrMatrix &T_d)
262 {
263   Mat_SeqAIJ             *aseq = static_cast<Mat_SeqAIJ *>(A->data);
264   PetscInt                nz = aseq->nz, m = A->rmap->N, n = A->cmap->n;
265   const PetscInt         *Ai = aseq->i, *Aj = aseq->j;
266   MatRowMapKokkosViewHost Ti_h(NoInit("Ti"), n + 1);
267   MatRowMapType          *Ti = Ti_h.data();
268   MatColIdxKokkosViewHost Tj_h(NoInit("Tj"), nz);
269   MatRowMapKokkosViewHost perm_h(NoInit("permutation"), nz);
270   PetscInt               *Tj   = Tj_h.data();
271   PetscInt               *perm = perm_h.data();
272   PetscInt               *offset;
273 
274   PetscFunctionBegin;
275   // Populate Ti
276   PetscCallCXX(Kokkos::deep_copy(Ti_h, 0));
277   Ti++;
278   for (PetscInt i = 0; i < nz; i++) Ti[Aj[i]]++;
279   Ti--;
280   for (PetscInt i = 0; i < n; i++) Ti[i + 1] += Ti[i];
281 
282   // Populate Tj and the permutation array
283   PetscCall(PetscCalloc1(n, &offset)); // offset in each T row to fill in its column indices
284   for (PetscInt i = 0; i < m; i++) {
285     for (PetscInt j = Ai[i]; j < Ai[i + 1]; j++) { // A's (i,j) is T's (j,i)
286       PetscInt r    = Aj[j];                       // row r of T
287       PetscInt disp = Ti[r] + offset[r];
288 
289       Tj[disp]   = i; // col i of T
290       perm[disp] = j;
291       offset[r]++;
292     }
293   }
294   PetscCall(PetscFree(offset));
295 
296   // Sort each row of T, along with the permutation array
297   for (PetscInt i = 0; i < n; i++) PetscCall(PetscSortIntWithArray(Ti[i + 1] - Ti[i], Tj + Ti[i], perm + Ti[i]));
298 
299   // Output perm and T on device
300   auto Ti_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), Ti_h);
301   auto Tj_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), Tj_h);
302   PetscCallCXX(T_d = KokkosCsrMatrix("csrmatT", n, m, nz, MatScalarKokkosView("Ta", nz), Ti_d, Tj_d));
303   PetscCallCXX(perm_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), perm_h));
304   PetscFunctionReturn(PETSC_SUCCESS);
305 }
306 
307 // Generate the transpose on device and cache it internally
308 // Note: KK transpose_matrix() does not have support symbolic/numeric transpose, so we do it on our own
309 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGenerateTranspose_Private(Mat A, KokkosCsrMatrix *csrmatT)
310 {
311   Mat_SeqAIJ       *aseq = static_cast<Mat_SeqAIJ *>(A->data);
312   Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
313   PetscInt          nz = aseq->nz, m = A->rmap->N, n = A->cmap->n;
314   KokkosCsrMatrix  &T = akok->csrmatT;
315 
316   PetscFunctionBegin;
317   PetscCheck(akok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
318   PetscCall(KokkosDualViewSyncDevice(akok->a_dual, PetscGetKokkosExecutionSpace())); // Sync A's values since we are going to access them on device
319 
320   const auto &Aa = akok->a_dual.view_device();
321 
322   if (A->symmetric == PETSC_BOOL3_TRUE) {
323     *csrmatT = akok->csrmat;
324   } else {
325     // See if we already have a cached transpose and its value is up to date
326     if (T.numRows() == n && T.numCols() == m) {  // this indicates csrmatT had been generated before, otherwise T has 0 rows/cols after construction
327       if (!akok->transpose_updated) {            // if the value is out of date, update the cached version
328         const auto &perm = akok->transpose_perm; // get the permutation array
329         auto       &Ta   = T.values;
330 
331         PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { Ta(i) = Aa(perm(i)); }));
332       }
333     } else { // Generate T of size n x m for the first time
334       MatRowMapKokkosView perm;
335 
336       PetscCall(MatSeqAIJKokkosGenerateTransposeStructure(A, perm, T));
337       akok->transpose_perm = perm; // cache the perm in this matrix for reuse
338       PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { T.values(i) = Aa(perm(i)); }));
339     }
340     akok->transpose_updated = PETSC_TRUE;
341     *csrmatT                = akok->csrmatT;
342   }
343   PetscFunctionReturn(PETSC_SUCCESS);
344 }
345 
346 // Generate the Hermitian on device and cache it internally
347 static PetscErrorCode MatSeqAIJKokkosGenerateHermitian_Private(Mat A, KokkosCsrMatrix *csrmatH)
348 {
349   Mat_SeqAIJ       *aseq = static_cast<Mat_SeqAIJ *>(A->data);
350   Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
351   PetscInt          nz = aseq->nz, m = A->rmap->N, n = A->cmap->n;
352   KokkosCsrMatrix  &T = akok->csrmatH;
353 
354   PetscFunctionBegin;
355   PetscCheck(akok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
356   PetscCall(KokkosDualViewSyncDevice(akok->a_dual, PetscGetKokkosExecutionSpace())); // Sync A's values since we are going to access them on device
357 
358   const auto &Aa = akok->a_dual.view_device();
359 
360   if (A->hermitian == PETSC_BOOL3_TRUE) {
361     *csrmatH = akok->csrmat;
362   } else {
363     // See if we already have a cached hermitian and its value is up to date
364     if (T.numRows() == n && T.numCols() == m) {  // this indicates csrmatT had been generated before, otherwise T has 0 rows/cols after construction
365       if (!akok->hermitian_updated) {            // if the value is out of date, update the cached version
366         const auto &perm = akok->transpose_perm; // get the permutation array
367         auto       &Ta   = T.values;
368 
369         PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { Ta(i) = PetscConj(Aa(perm(i))); }));
370       }
371     } else { // Generate T of size n x m for the first time
372       MatRowMapKokkosView perm;
373 
374       PetscCall(MatSeqAIJKokkosGenerateTransposeStructure(A, perm, T));
375       akok->transpose_perm = perm; // cache the perm in this matrix for reuse
376       PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { T.values(i) = PetscConj(Aa(perm(i))); }));
377     }
378     akok->hermitian_updated = PETSC_TRUE;
379     *csrmatH                = akok->csrmatH;
380   }
381   PetscFunctionReturn(PETSC_SUCCESS);
382 }
383 
384 /* y = A x */
385 static PetscErrorCode MatMult_SeqAIJKokkos(Mat A, Vec xx, Vec yy)
386 {
387   Mat_SeqAIJKokkos          *aijkok;
388   ConstPetscScalarKokkosView xv;
389   PetscScalarKokkosView      yv;
390 
391   PetscFunctionBegin;
392   PetscCall(PetscLogGpuTimeBegin());
393   PetscCall(MatSeqAIJKokkosSyncDevice(A));
394   PetscCall(VecGetKokkosView(xx, &xv));
395   PetscCall(VecGetKokkosViewWrite(yy, &yv));
396   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
397   PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), "N", 1.0 /*alpha*/, aijkok->csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A x + beta y */
398   PetscCall(VecRestoreKokkosView(xx, &xv));
399   PetscCall(VecRestoreKokkosViewWrite(yy, &yv));
400   /* 2.0*nnz - numRows seems more accurate here but assumes there are no zero-rows. So a little sloppy here. */
401   PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz()));
402   PetscCall(PetscLogGpuTimeEnd());
403   PetscFunctionReturn(PETSC_SUCCESS);
404 }
405 
406 /* y = A^T x */
407 static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy)
408 {
409   Mat_SeqAIJKokkos          *aijkok;
410   const char                *mode;
411   ConstPetscScalarKokkosView xv;
412   PetscScalarKokkosView      yv;
413   KokkosCsrMatrix            csrmat;
414 
415   PetscFunctionBegin;
416   PetscCall(PetscLogGpuTimeBegin());
417   PetscCall(MatSeqAIJKokkosSyncDevice(A));
418   PetscCall(VecGetKokkosView(xx, &xv));
419   PetscCall(VecGetKokkosViewWrite(yy, &yv));
420   if (A->form_explicit_transpose) {
421     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat));
422     mode = "N";
423   } else {
424     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
425     csrmat = aijkok->csrmat;
426     mode   = "T";
427   }
428   PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^T x + beta y */
429   PetscCall(VecRestoreKokkosView(xx, &xv));
430   PetscCall(VecRestoreKokkosViewWrite(yy, &yv));
431   PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
432   PetscCall(PetscLogGpuTimeEnd());
433   PetscFunctionReturn(PETSC_SUCCESS);
434 }
435 
436 /* y = A^H x */
437 static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy)
438 {
439   Mat_SeqAIJKokkos          *aijkok;
440   const char                *mode;
441   ConstPetscScalarKokkosView xv;
442   PetscScalarKokkosView      yv;
443   KokkosCsrMatrix            csrmat;
444 
445   PetscFunctionBegin;
446   PetscCall(PetscLogGpuTimeBegin());
447   PetscCall(MatSeqAIJKokkosSyncDevice(A));
448   PetscCall(VecGetKokkosView(xx, &xv));
449   PetscCall(VecGetKokkosViewWrite(yy, &yv));
450   if (A->form_explicit_transpose) {
451     PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat));
452     mode = "N";
453   } else {
454     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
455     csrmat = aijkok->csrmat;
456     mode   = "C";
457   }
458   PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^H x + beta y */
459   PetscCall(VecRestoreKokkosView(xx, &xv));
460   PetscCall(VecRestoreKokkosViewWrite(yy, &yv));
461   PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
462   PetscCall(PetscLogGpuTimeEnd());
463   PetscFunctionReturn(PETSC_SUCCESS);
464 }
465 
466 /* z = A x + y */
467 static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz)
468 {
469   Mat_SeqAIJKokkos          *aijkok;
470   ConstPetscScalarKokkosView xv;
471   PetscScalarKokkosView      zv;
472 
473   PetscFunctionBegin;
474   PetscCall(PetscLogGpuTimeBegin());
475   PetscCall(MatSeqAIJKokkosSyncDevice(A));
476   if (zz != yy) PetscCall(VecCopy(yy, zz)); // depending on yy's sync flags, zz might get its latest data on host
477   PetscCall(VecGetKokkosView(xx, &xv));
478   PetscCall(VecGetKokkosView(zz, &zv)); // do after VecCopy(yy, zz) to get the latest data on device
479   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
480   PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), "N", 1.0 /*alpha*/, aijkok->csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A x + beta z */
481   PetscCall(VecRestoreKokkosView(xx, &xv));
482   PetscCall(VecRestoreKokkosView(zz, &zv));
483   PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz()));
484   PetscCall(PetscLogGpuTimeEnd());
485   PetscFunctionReturn(PETSC_SUCCESS);
486 }
487 
488 /* z = A^T x + y */
489 static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz)
490 {
491   Mat_SeqAIJKokkos          *aijkok;
492   const char                *mode;
493   ConstPetscScalarKokkosView xv;
494   PetscScalarKokkosView      zv;
495   KokkosCsrMatrix            csrmat;
496 
497   PetscFunctionBegin;
498   PetscCall(PetscLogGpuTimeBegin());
499   PetscCall(MatSeqAIJKokkosSyncDevice(A));
500   if (zz != yy) PetscCall(VecCopy(yy, zz));
501   PetscCall(VecGetKokkosView(xx, &xv));
502   PetscCall(VecGetKokkosView(zz, &zv));
503   if (A->form_explicit_transpose) {
504     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat));
505     mode = "N";
506   } else {
507     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
508     csrmat = aijkok->csrmat;
509     mode   = "T";
510   }
511   PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^T x + beta z */
512   PetscCall(VecRestoreKokkosView(xx, &xv));
513   PetscCall(VecRestoreKokkosView(zz, &zv));
514   PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
515   PetscCall(PetscLogGpuTimeEnd());
516   PetscFunctionReturn(PETSC_SUCCESS);
517 }
518 
519 /* z = A^H x + y */
520 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz)
521 {
522   Mat_SeqAIJKokkos          *aijkok;
523   const char                *mode;
524   ConstPetscScalarKokkosView xv;
525   PetscScalarKokkosView      zv;
526   KokkosCsrMatrix            csrmat;
527 
528   PetscFunctionBegin;
529   PetscCall(PetscLogGpuTimeBegin());
530   PetscCall(MatSeqAIJKokkosSyncDevice(A));
531   if (zz != yy) PetscCall(VecCopy(yy, zz));
532   PetscCall(VecGetKokkosView(xx, &xv));
533   PetscCall(VecGetKokkosView(zz, &zv));
534   if (A->form_explicit_transpose) {
535     PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat));
536     mode = "N";
537   } else {
538     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
539     csrmat = aijkok->csrmat;
540     mode   = "C";
541   }
542   PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^H x + beta z */
543   PetscCall(VecRestoreKokkosView(xx, &xv));
544   PetscCall(VecRestoreKokkosView(zz, &zv));
545   PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
546   PetscCall(PetscLogGpuTimeEnd());
547   PetscFunctionReturn(PETSC_SUCCESS);
548 }
549 
550 static PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A, MatOption op, PetscBool flg)
551 {
552   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
553 
554   PetscFunctionBegin;
555   switch (op) {
556   case MAT_FORM_EXPLICIT_TRANSPOSE:
557     /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
558     if (A->form_explicit_transpose && !flg && aijkok) PetscCall(aijkok->DestroyMatTranspose());
559     A->form_explicit_transpose = flg;
560     break;
561   default:
562     PetscCall(MatSetOption_SeqAIJ(A, op, flg));
563     break;
564   }
565   PetscFunctionReturn(PETSC_SUCCESS);
566 }
567 
568 /* Depending on reuse, either build a new mat, or use the existing mat */
569 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat *newmat)
570 {
571   Mat_SeqAIJ *aseq;
572 
573   PetscFunctionBegin;
574   PetscCall(PetscKokkosInitializeCheck());
575   if (reuse == MAT_INITIAL_MATRIX) { /* Build a brand new mat */
576     PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat));
577     PetscCall(MatSetType(*newmat, mtype));
578   } else if (reuse == MAT_REUSE_MATRIX) {                 /* Reuse the mat created before */
579     PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN)); /* newmat is already a SeqAIJKokkos */
580   } else if (reuse == MAT_INPLACE_MATRIX) {               /* newmat is A */
581     PetscCheck(A == *newmat, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "A != *newmat with MAT_INPLACE_MATRIX");
582     PetscCall(PetscFree(A->defaultvectype));
583     PetscCall(PetscStrallocpy(VECKOKKOS, &A->defaultvectype)); /* Allocate and copy the string */
584     PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATSEQAIJKOKKOS));
585     PetscCall(MatSetOps_SeqAIJKokkos(A));
586     aseq = static_cast<Mat_SeqAIJ *>(A->data);
587     if (A->assembled) { /* Copy i, j (but not values) to device for an assembled matrix if not yet */
588       PetscCheck(!A->spptr, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Expect NULL (Mat_SeqAIJKokkos*)A->spptr");
589       A->spptr = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aseq, A->nonzerostate, PETSC_FALSE);
590     }
591   }
592   PetscFunctionReturn(PETSC_SUCCESS);
593 }
594 
595 /* MatDuplicate always creates a new matrix. MatDuplicate can be called either on an assembled matrix or
596    an unassembled matrix, even though MAT_COPY_VALUES is not allowed for unassembled matrix.
597  */
598 static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A, MatDuplicateOption dupOption, Mat *B)
599 {
600   Mat_SeqAIJ       *bseq;
601   Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr), *bkok;
602   Mat               mat;
603 
604   PetscFunctionBegin;
605   /* Do not copy values on host as A's latest values might be on device. We don't want to do sync blindly */
606   PetscCall(MatDuplicate_SeqAIJ(A, MAT_DO_NOT_COPY_VALUES, B));
607   mat = *B;
608   if (A->assembled) {
609     bseq = static_cast<Mat_SeqAIJ *>(mat->data);
610     bkok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, bseq, mat->nonzerostate, PETSC_FALSE);
611     bkok->a_dual.clear_sync_state(); /* Clear B's sync state as it will be decided below */
612     /* Now copy values to B if needed */
613     if (dupOption == MAT_COPY_VALUES) {
614       if (akok->a_dual.need_sync_device()) {
615         Kokkos::deep_copy(bkok->a_dual.view_host(), akok->a_dual.view_host());
616         bkok->a_dual.modify_host();
617       } else { /* If device has the latest data, we only copy data on device */
618         Kokkos::deep_copy(bkok->a_dual.view_device(), akok->a_dual.view_device());
619         bkok->a_dual.modify_device();
620       }
621     } else { /* MAT_DO_NOT_COPY_VALUES or MAT_SHARE_NONZERO_PATTERN. B's values should be zeroed */
622       /* B's values on host should be already zeroed by MatDuplicate_SeqAIJ() */
623       bkok->a_dual.modify_host();
624     }
625     mat->spptr = bkok;
626   }
627 
628   PetscCall(PetscFree(mat->defaultvectype));
629   PetscCall(PetscStrallocpy(VECKOKKOS, &mat->defaultvectype)); /* Allocate and copy the string */
630   PetscCall(PetscObjectChangeTypeName((PetscObject)mat, MATSEQAIJKOKKOS));
631   PetscCall(MatSetOps_SeqAIJKokkos(mat));
632   PetscFunctionReturn(PETSC_SUCCESS);
633 }
634 
635 static PetscErrorCode MatTranspose_SeqAIJKokkos(Mat A, MatReuse reuse, Mat *B)
636 {
637   Mat               At;
638   KokkosCsrMatrix   internT;
639   Mat_SeqAIJKokkos *atkok, *bkok;
640 
641   PetscFunctionBegin;
642   if (reuse == MAT_REUSE_MATRIX) PetscCall(MatTransposeCheckNonzeroState_Private(A, *B));
643   PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &internT)); /* Generate a transpose internally */
644   if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_INPLACE_MATRIX) {
645     /* Deep copy internT, as we want to isolate the internal transpose */
646     PetscCallCXX(atkok = new Mat_SeqAIJKokkos(KokkosCsrMatrix("csrmat", internT)));
647     PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PetscObjectComm((PetscObject)A), atkok, &At));
648     if (reuse == MAT_INITIAL_MATRIX) *B = At;
649     else PetscCall(MatHeaderReplace(A, &At)); /* Replace A with At inplace */
650   } else {                                    /* MAT_REUSE_MATRIX, just need to copy values to B on device */
651     if ((*B)->assembled) {
652       bkok = static_cast<Mat_SeqAIJKokkos *>((*B)->spptr);
653       PetscCallCXX(Kokkos::deep_copy(bkok->a_dual.view_device(), internT.values));
654       PetscCall(MatSeqAIJKokkosModifyDevice(*B));
655     } else if ((*B)->preallocated) { /* It is ok for B to be only preallocated, as needed in MatTranspose_MPIAIJ */
656       Mat_SeqAIJ             *bseq = static_cast<Mat_SeqAIJ *>((*B)->data);
657       MatScalarKokkosViewHost a_h(bseq->a, internT.nnz()); /* bseq->nz = 0 if unassembled */
658       MatColIdxKokkosViewHost j_h(bseq->j, internT.nnz());
659       PetscCallCXX(Kokkos::deep_copy(a_h, internT.values));
660       PetscCallCXX(Kokkos::deep_copy(j_h, internT.graph.entries));
661     } else SETERRQ(PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "B must be assembled or preallocated");
662   }
663   PetscFunctionReturn(PETSC_SUCCESS);
664 }
665 
666 static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A)
667 {
668   Mat_SeqAIJKokkos *aijkok;
669 
670   PetscFunctionBegin;
671   if (A->factortype == MAT_FACTOR_NONE) {
672     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
673     delete aijkok;
674   } else {
675     delete static_cast<Mat_SeqAIJKokkosTriFactors *>(A->spptr);
676   }
677   A->spptr = NULL;
678   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatFactorGetSolverType_C", NULL));
679   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
680   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
681 #if defined(PETSC_HAVE_HYPRE)
682   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijkokkos_hypre_C", NULL));
683 #endif
684   PetscCall(MatDestroy_SeqAIJ(A));
685   PetscFunctionReturn(PETSC_SUCCESS);
686 }
687 
688 /*MC
689    MATSEQAIJKOKKOS - MATAIJKOKKOS = "(seq)aijkokkos" - A matrix type to be used for sparse matrices with Kokkos
690 
691    A matrix type using Kokkos-Kernels CrsMatrix type for portability across different device types
692 
693    Options Database Key:
694 .  -mat_type aijkokkos - sets the matrix type to `MATSEQAIJKOKKOS` during a call to `MatSetFromOptions()`
695 
696   Level: beginner
697 
698 .seealso: [](ch_matrices), `Mat`, `MatCreateSeqAIJKokkos()`, `MATMPIAIJKOKKOS`
699 M*/
700 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A)
701 {
702   PetscFunctionBegin;
703   PetscCall(PetscKokkosInitializeCheck());
704   PetscCall(MatCreate_SeqAIJ(A));
705   PetscCall(MatConvert_SeqAIJ_SeqAIJKokkos(A, MATSEQAIJKOKKOS, MAT_INPLACE_MATRIX, &A));
706   PetscFunctionReturn(PETSC_SUCCESS);
707 }
708 
709 /* Merge A, B into a matrix C. A is put before B. C's size would be A->rmap->n by (A->cmap->n + B->cmap->n) */
710 PetscErrorCode MatSeqAIJKokkosMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C)
711 {
712   Mat_SeqAIJ         *a, *b;
713   Mat_SeqAIJKokkos   *akok, *bkok, *ckok;
714   MatScalarKokkosView aa, ba, ca;
715   MatRowMapKokkosView ai, bi, ci;
716   MatColIdxKokkosView aj, bj, cj;
717   PetscInt            m, n, nnz, aN;
718 
719   PetscFunctionBegin;
720   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
721   PetscValidHeaderSpecific(B, MAT_CLASSID, 2);
722   PetscAssertPointer(C, 4);
723   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
724   PetscCheckTypeName(B, MATSEQAIJKOKKOS);
725   PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n);
726   PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported");
727 
728   PetscCall(MatSeqAIJKokkosSyncDevice(A));
729   PetscCall(MatSeqAIJKokkosSyncDevice(B));
730   a    = static_cast<Mat_SeqAIJ *>(A->data);
731   b    = static_cast<Mat_SeqAIJ *>(B->data);
732   akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
733   bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
734   aa   = akok->a_dual.view_device();
735   ai   = akok->i_dual.view_device();
736   ba   = bkok->a_dual.view_device();
737   bi   = bkok->i_dual.view_device();
738   m    = A->rmap->n; /* M, N and nnz of C */
739   n    = A->cmap->n + B->cmap->n;
740   nnz  = a->nz + b->nz;
741   aN   = A->cmap->n; /* N of A */
742   if (reuse == MAT_INITIAL_MATRIX) {
743     aj      = akok->j_dual.view_device();
744     bj      = bkok->j_dual.view_device();
745     auto ca = MatScalarKokkosView("a", aa.extent(0) + ba.extent(0));
746     auto ci = MatRowMapKokkosView("i", ai.extent(0));
747     auto cj = MatColIdxKokkosView("j", aj.extent(0) + bj.extent(0));
748 
749     /* Concatenate A and B in parallel using Kokkos hierarchical parallelism */
750     Kokkos::parallel_for(
751       Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
752         PetscInt i       = t.league_rank(); /* row i */
753         PetscInt coffset = ai(i) + bi(i), alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i);
754 
755         Kokkos::single(Kokkos::PerTeam(t), [=]() { /* this side effect only happens once per whole team */
756                                                    ci(i) = coffset;
757                                                    if (i == m - 1) ci(m) = ai(m) + bi(m);
758         });
759 
760         Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) {
761           if (k < alen) {
762             ca(coffset + k) = aa(ai(i) + k);
763             cj(coffset + k) = aj(ai(i) + k);
764           } else {
765             ca(coffset + k) = ba(bi(i) + k - alen);
766             cj(coffset + k) = bj(bi(i) + k - alen) + aN; /* Entries in B get new column indices in C */
767           }
768         });
769       });
770     PetscCallCXX(ckok = new Mat_SeqAIJKokkos(m, n, nnz, ci, cj, ca));
771     PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, ckok, C));
772   } else if (reuse == MAT_REUSE_MATRIX) {
773     PetscValidHeaderSpecific(*C, MAT_CLASSID, 4);
774     PetscCheckTypeName(*C, MATSEQAIJKOKKOS);
775     ckok = static_cast<Mat_SeqAIJKokkos *>((*C)->spptr);
776     ca   = ckok->a_dual.view_device();
777     ci   = ckok->i_dual.view_device();
778 
779     Kokkos::parallel_for(
780       Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
781         PetscInt i    = t.league_rank(); /* row i */
782         PetscInt alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i);
783         Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) {
784           if (k < alen) ca(ci(i) + k) = aa(ai(i) + k);
785           else ca(ci(i) + k) = ba(bi(i) + k - alen);
786         });
787       });
788     PetscCall(MatSeqAIJKokkosModifyDevice(*C));
789   }
790   PetscFunctionReturn(PETSC_SUCCESS);
791 }
792 
793 static PetscErrorCode MatProductDataDestroy_SeqAIJKokkos(void *pdata)
794 {
795   PetscFunctionBegin;
796   delete static_cast<MatProductData_SeqAIJKokkos *>(pdata);
797   PetscFunctionReturn(PETSC_SUCCESS);
798 }
799 
800 static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C)
801 {
802   Mat_Product                 *product = C->product;
803   Mat                          A, B;
804   bool                         transA, transB; /* use bool, since KK needs this type */
805   Mat_SeqAIJKokkos            *akok, *bkok, *ckok;
806   Mat_SeqAIJ                  *c;
807   MatProductData_SeqAIJKokkos *pdata;
808   KokkosCsrMatrix              csrmatA, csrmatB;
809 
810   PetscFunctionBegin;
811   MatCheckProduct(C, 1);
812   PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Product data empty");
813   pdata = static_cast<MatProductData_SeqAIJKokkos *>(C->product->data);
814 
815   // See if numeric has already been done in symbolic (e.g., user calls MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C)).
816   // If yes, skip the numeric, but reset the flag so that next time when user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C),
817   // we still do numeric.
818   if (pdata->reusesym) { // numeric reuses results from symbolic
819     pdata->reusesym = PETSC_FALSE;
820     PetscFunctionReturn(PETSC_SUCCESS);
821   }
822 
823   switch (product->type) {
824   case MATPRODUCT_AB:
825     transA = false;
826     transB = false;
827     break;
828   case MATPRODUCT_AtB:
829     transA = true;
830     transB = false;
831     break;
832   case MATPRODUCT_ABt:
833     transA = false;
834     transB = true;
835     break;
836   default:
837     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]);
838   }
839 
840   A = product->A;
841   B = product->B;
842   PetscCall(MatSeqAIJKokkosSyncDevice(A));
843   PetscCall(MatSeqAIJKokkosSyncDevice(B));
844   akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
845   bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
846   ckok = static_cast<Mat_SeqAIJKokkos *>(C->spptr);
847 
848   PetscCheck(ckok, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Device data structure spptr is empty");
849 
850   csrmatA = akok->csrmat;
851   csrmatB = bkok->csrmat;
852 
853   /* TODO: Once KK spgemm implements transpose, we can get rid of the explicit transpose here */
854   if (transA) {
855     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA));
856     transA = false;
857   }
858 
859   if (transB) {
860     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB));
861     transB = false;
862   }
863   PetscCall(PetscLogGpuTimeBegin());
864   PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, csrmatA, transA, csrmatB, transB, ckok->csrmat));
865 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(4, 0, 0)
866   auto spgemmHandle = pdata->kh.get_spgemm_handle();
867   if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(ckok->csrmat)); /* without sort, mat_tests-ex62_14_seqaijkokkos fails */
868 #endif
869 
870   PetscCall(PetscLogGpuTimeEnd());
871   PetscCall(MatSeqAIJKokkosModifyDevice(C));
872   /* shorter version of MatAssemblyEnd_SeqAIJ */
873   c = (Mat_SeqAIJ *)C->data;
874   PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz));
875   PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n"));
876   PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax));
877   c->reallocs         = 0;
878   C->info.mallocs     = 0;
879   C->info.nz_unneeded = 0;
880   C->assembled = C->was_assembled = PETSC_TRUE;
881   C->num_ass++;
882   PetscFunctionReturn(PETSC_SUCCESS);
883 }
884 
885 static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C)
886 {
887   Mat_Product                 *product = C->product;
888   MatProductType               ptype;
889   Mat                          A, B;
890   bool                         transA, transB;
891   Mat_SeqAIJKokkos            *akok, *bkok, *ckok;
892   MatProductData_SeqAIJKokkos *pdata;
893   MPI_Comm                     comm;
894   KokkosCsrMatrix              csrmatA, csrmatB, csrmatC;
895 
896   PetscFunctionBegin;
897   MatCheckProduct(C, 1);
898   PetscCall(PetscObjectGetComm((PetscObject)C, &comm));
899   PetscCheck(!product->data, comm, PETSC_ERR_PLIB, "Product data not empty");
900   A = product->A;
901   B = product->B;
902   PetscCall(MatSeqAIJKokkosSyncDevice(A));
903   PetscCall(MatSeqAIJKokkosSyncDevice(B));
904   akok    = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
905   bkok    = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
906   csrmatA = akok->csrmat;
907   csrmatB = bkok->csrmat;
908 
909   ptype = product->type;
910   // Take advantage of the symmetry if true
911   if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
912     ptype                                          = MATPRODUCT_AB;
913     product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE;
914   }
915   if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
916     ptype                                          = MATPRODUCT_AB;
917     product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE;
918   }
919 
920   switch (ptype) {
921   case MATPRODUCT_AB:
922     transA = false;
923     transB = false;
924     PetscCall(MatSetBlockSizesFromMats(C, A, B));
925     break;
926   case MATPRODUCT_AtB:
927     transA = true;
928     transB = false;
929     if (A->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, A->cmap->bs));
930     if (B->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->cmap->bs));
931     break;
932   case MATPRODUCT_ABt:
933     transA = false;
934     transB = true;
935     if (A->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, A->rmap->bs));
936     if (B->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->rmap->bs));
937     break;
938   default:
939     SETERRQ(comm, PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]);
940   }
941   PetscCallCXX(product->data = pdata = new MatProductData_SeqAIJKokkos());
942   pdata->reusesym = product->api_user;
943 
944   /* TODO: add command line options to select spgemm algorithms */
945   auto spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_DEFAULT; /* default alg is TPL if enabled, otherwise KK */
946 
947   /* CUDA-10.2's spgemm has bugs. We prefer the SpGEMMreuse APIs introduced in cuda-11.4 */
948 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
949   #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
950   spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_KK;
951   #endif
952 #endif
953   PetscCallCXX(pdata->kh.create_spgemm_handle(spgemm_alg));
954 
955   PetscCall(PetscLogGpuTimeBegin());
956   /* TODO: Get rid of the explicit transpose once KK-spgemm implements the transpose option */
957   if (transA) {
958     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA));
959     transA = false;
960   }
961 
962   if (transB) {
963     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB));
964     transB = false;
965   }
966 
967   PetscCallCXX(KokkosSparse::spgemm_symbolic(pdata->kh, csrmatA, transA, csrmatB, transB, csrmatC));
968   /* spgemm_symbolic() only populates C's rowmap, but not C's column indices.
969     So we have to do a fake spgemm_numeric() here to get csrmatC.j_d setup, before
970     calling new Mat_SeqAIJKokkos().
971     TODO: Remove the fake spgemm_numeric() after KK fixed this problem.
972   */
973   PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, csrmatA, transA, csrmatB, transB, csrmatC));
974 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(4, 0, 0)
975   /* Query if KK outputs a sorted matrix. If not, we need to sort it */
976   auto spgemmHandle = pdata->kh.get_spgemm_handle();
977   if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(csrmatC)); /* sort_option defaults to -1 in KK!*/
978 #endif
979   PetscCall(PetscLogGpuTimeEnd());
980 
981   PetscCallCXX(ckok = new Mat_SeqAIJKokkos(csrmatC));
982   PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(C, ckok));
983   C->product->destroy = MatProductDataDestroy_SeqAIJKokkos;
984   PetscFunctionReturn(PETSC_SUCCESS);
985 }
986 
987 /* handles sparse matrix matrix ops */
988 static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat)
989 {
990   Mat_Product *product = mat->product;
991   PetscBool    Biskok = PETSC_FALSE, Ciskok = PETSC_TRUE;
992 
993   PetscFunctionBegin;
994   MatCheckProduct(mat, 1);
995   PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJKOKKOS, &Biskok));
996   if (product->type == MATPRODUCT_ABC) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJKOKKOS, &Ciskok));
997   if (Biskok && Ciskok) {
998     switch (product->type) {
999     case MATPRODUCT_AB:
1000     case MATPRODUCT_AtB:
1001     case MATPRODUCT_ABt:
1002       mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos;
1003       break;
1004     case MATPRODUCT_PtAP:
1005     case MATPRODUCT_RARt:
1006     case MATPRODUCT_ABC:
1007       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
1008       break;
1009     default:
1010       break;
1011     }
1012   } else { /* fallback for AIJ */
1013     PetscCall(MatProductSetFromOptions_SeqAIJ(mat));
1014   }
1015   PetscFunctionReturn(PETSC_SUCCESS);
1016 }
1017 
1018 static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a)
1019 {
1020   Mat_SeqAIJKokkos *aijkok;
1021 
1022   PetscFunctionBegin;
1023   PetscCall(PetscLogGpuTimeBegin());
1024   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1025   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1026   KokkosBlas::scal(PetscGetKokkosExecutionSpace(), aijkok->a_dual.view_device(), a, aijkok->a_dual.view_device());
1027   PetscCall(MatSeqAIJKokkosModifyDevice(A));
1028   PetscCall(PetscLogGpuFlops(aijkok->a_dual.extent(0)));
1029   PetscCall(PetscLogGpuTimeEnd());
1030   PetscFunctionReturn(PETSC_SUCCESS);
1031 }
1032 
1033 // add a to A's diagonal (if A is square) or main diagonal (if A is rectangular)
1034 static PetscErrorCode MatShift_SeqAIJKokkos(Mat A, PetscScalar a)
1035 {
1036   Mat_SeqAIJ *aijseq = static_cast<Mat_SeqAIJ *>(A->data);
1037 
1038   PetscFunctionBegin;
1039   if (A->assembled && aijseq->diagonaldense) { // no missing diagonals
1040     PetscInt n = PetscMin(A->rmap->n, A->cmap->n);
1041 
1042     PetscCall(PetscLogGpuTimeBegin());
1043     PetscCall(MatSeqAIJKokkosSyncDevice(A));
1044     const auto  aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1045     const auto &Aa     = aijkok->a_dual.view_device();
1046     const auto &Adiag  = aijkok->diag_dual.view_device();
1047     PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) { Aa(Adiag(i)) += a; }));
1048     PetscCall(MatSeqAIJKokkosModifyDevice(A));
1049     PetscCall(PetscLogGpuFlops(n));
1050     PetscCall(PetscLogGpuTimeEnd());
1051   } else { // need reassembly, very slow!
1052     PetscCall(MatShift_Basic(A, a));
1053   }
1054   PetscFunctionReturn(PETSC_SUCCESS);
1055 }
1056 
1057 static PetscErrorCode MatDiagonalSet_SeqAIJKokkos(Mat Y, Vec D, InsertMode is)
1058 {
1059   Mat_SeqAIJ *aijseq = static_cast<Mat_SeqAIJ *>(Y->data);
1060 
1061   PetscFunctionBegin;
1062   if (Y->assembled && aijseq->diagonaldense) { // no missing diagonals
1063     ConstPetscScalarKokkosView dv;
1064     PetscInt                   n, nv;
1065 
1066     PetscCall(PetscLogGpuTimeBegin());
1067     PetscCall(MatSeqAIJKokkosSyncDevice(Y));
1068     PetscCall(VecGetKokkosView(D, &dv));
1069     PetscCall(VecGetLocalSize(D, &nv));
1070     n = PetscMin(Y->rmap->n, Y->cmap->n);
1071     PetscCheck(n == nv, PetscObjectComm((PetscObject)Y), PETSC_ERR_ARG_SIZ, "Matrix size and vector size do not match");
1072 
1073     const auto  aijkok = static_cast<Mat_SeqAIJKokkos *>(Y->spptr);
1074     const auto &Aa     = aijkok->a_dual.view_device();
1075     const auto &Adiag  = aijkok->diag_dual.view_device();
1076     PetscCallCXX(Kokkos::parallel_for(
1077       Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) {
1078         if (is == INSERT_VALUES) Aa(Adiag(i)) = dv(i);
1079         else Aa(Adiag(i)) += dv(i);
1080       }));
1081     PetscCall(VecRestoreKokkosView(D, &dv));
1082     PetscCall(MatSeqAIJKokkosModifyDevice(Y));
1083     PetscCall(PetscLogGpuFlops(n));
1084     PetscCall(PetscLogGpuTimeEnd());
1085   } else { // need reassembly, very slow!
1086     PetscCall(MatDiagonalSet_Default(Y, D, is));
1087   }
1088   PetscFunctionReturn(PETSC_SUCCESS);
1089 }
1090 
1091 static PetscErrorCode MatDiagonalScale_SeqAIJKokkos(Mat A, Vec ll, Vec rr)
1092 {
1093   Mat_SeqAIJ                *aijseq = static_cast<Mat_SeqAIJ *>(A->data);
1094   PetscInt                   m = A->rmap->n, n = A->cmap->n, nz = aijseq->nz;
1095   ConstPetscScalarKokkosView lv, rv;
1096 
1097   PetscFunctionBegin;
1098   PetscCall(PetscLogGpuTimeBegin());
1099   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1100   const auto  aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1101   const auto &Aa     = aijkok->a_dual.view_device();
1102   const auto &Ai     = aijkok->i_dual.view_device();
1103   const auto &Aj     = aijkok->j_dual.view_device();
1104   if (ll) {
1105     PetscCall(VecGetLocalSize(ll, &m));
1106     PetscCheck(m == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling vector wrong length");
1107     PetscCall(VecGetKokkosView(ll, &lv));
1108     PetscCallCXX(Kokkos::parallel_for( // for each row
1109       Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
1110         PetscInt i   = t.league_rank(); // row i
1111         PetscInt len = Ai(i + 1) - Ai(i);
1112         // scale entries on the row
1113         Kokkos::parallel_for(Kokkos::TeamThreadRange(t, len), [&](PetscInt j) { Aa(Ai(i) + j) *= lv(i); });
1114       }));
1115     PetscCall(VecRestoreKokkosView(ll, &lv));
1116     PetscCall(PetscLogGpuFlops(nz));
1117   }
1118   if (rr) {
1119     PetscCall(VecGetLocalSize(rr, &n));
1120     PetscCheck(n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Right scaling vector wrong length");
1121     PetscCall(VecGetKokkosView(rr, &rv));
1122     PetscCallCXX(Kokkos::parallel_for( // for each nonzero
1123       Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt k) { Aa(k) *= rv(Aj(k)); }));
1124     PetscCall(VecRestoreKokkosView(rr, &lv));
1125     PetscCall(PetscLogGpuFlops(nz));
1126   }
1127   PetscCall(MatSeqAIJKokkosModifyDevice(A));
1128   PetscCall(PetscLogGpuTimeEnd());
1129   PetscFunctionReturn(PETSC_SUCCESS);
1130 }
1131 
1132 static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A)
1133 {
1134   Mat_SeqAIJKokkos *aijkok;
1135 
1136   PetscFunctionBegin;
1137   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1138   if (aijkok) { /* Only zero the device if data is already there */
1139     KokkosBlas::fill(PetscGetKokkosExecutionSpace(), aijkok->a_dual.view_device(), 0.0);
1140     PetscCall(MatSeqAIJKokkosModifyDevice(A));
1141   } else { /* Might be preallocated but not assembled */
1142     PetscCall(MatZeroEntries_SeqAIJ(A));
1143   }
1144   PetscFunctionReturn(PETSC_SUCCESS);
1145 }
1146 
1147 static PetscErrorCode MatGetDiagonal_SeqAIJKokkos(Mat A, Vec x)
1148 {
1149   Mat_SeqAIJKokkos     *aijkok;
1150   PetscInt              n;
1151   PetscScalarKokkosView xv;
1152 
1153   PetscFunctionBegin;
1154   PetscCall(VecGetLocalSize(x, &n));
1155   PetscCheck(n == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector");
1156   PetscCheck(A->factortype == MAT_FACTOR_NONE, PETSC_COMM_SELF, PETSC_ERR_SUP, "MatGetDiagonal_SeqAIJKokkos not supported on factored matrices");
1157 
1158   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1159   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1160 
1161   const auto &Aa    = aijkok->a_dual.view_device();
1162   const auto &Ai    = aijkok->i_dual.view_device();
1163   const auto &Adiag = aijkok->diag_dual.view_device();
1164 
1165   PetscCall(VecGetKokkosViewWrite(x, &xv));
1166   Kokkos::parallel_for(
1167     Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) {
1168       if (Adiag(i) < Ai(i + 1)) xv(i) = Aa(Adiag(i));
1169       else xv(i) = 0;
1170     });
1171   PetscCall(VecRestoreKokkosViewWrite(x, &xv));
1172   PetscFunctionReturn(PETSC_SUCCESS);
1173 }
1174 
1175 /* Get a Kokkos View from a mat of type MatSeqAIJKokkos */
1176 PetscErrorCode MatSeqAIJGetKokkosView(Mat A, ConstMatScalarKokkosView *kv)
1177 {
1178   Mat_SeqAIJKokkos *aijkok;
1179 
1180   PetscFunctionBegin;
1181   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1182   PetscAssertPointer(kv, 2);
1183   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1184   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1185   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1186   *kv    = aijkok->a_dual.view_device();
1187   PetscFunctionReturn(PETSC_SUCCESS);
1188 }
1189 
1190 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, ConstMatScalarKokkosView *kv)
1191 {
1192   PetscFunctionBegin;
1193   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1194   PetscAssertPointer(kv, 2);
1195   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1196   PetscFunctionReturn(PETSC_SUCCESS);
1197 }
1198 
1199 PetscErrorCode MatSeqAIJGetKokkosView(Mat A, MatScalarKokkosView *kv)
1200 {
1201   Mat_SeqAIJKokkos *aijkok;
1202 
1203   PetscFunctionBegin;
1204   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1205   PetscAssertPointer(kv, 2);
1206   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1207   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1208   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1209   *kv    = aijkok->a_dual.view_device();
1210   PetscFunctionReturn(PETSC_SUCCESS);
1211 }
1212 
1213 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, MatScalarKokkosView *kv)
1214 {
1215   PetscFunctionBegin;
1216   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1217   PetscAssertPointer(kv, 2);
1218   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1219   PetscCall(MatSeqAIJKokkosModifyDevice(A));
1220   PetscFunctionReturn(PETSC_SUCCESS);
1221 }
1222 
1223 PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A, MatScalarKokkosView *kv)
1224 {
1225   Mat_SeqAIJKokkos *aijkok;
1226 
1227   PetscFunctionBegin;
1228   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1229   PetscAssertPointer(kv, 2);
1230   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1231   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1232   *kv    = aijkok->a_dual.view_device();
1233   PetscFunctionReturn(PETSC_SUCCESS);
1234 }
1235 
1236 PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A, MatScalarKokkosView *kv)
1237 {
1238   PetscFunctionBegin;
1239   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1240   PetscAssertPointer(kv, 2);
1241   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1242   PetscCall(MatSeqAIJKokkosModifyDevice(A));
1243   PetscFunctionReturn(PETSC_SUCCESS);
1244 }
1245 
1246 PetscErrorCode MatCreateSeqAIJKokkosWithKokkosViews(MPI_Comm comm, PetscInt m, PetscInt n, Kokkos::View<PetscInt *> &i_d, Kokkos::View<PetscInt *> &j_d, Kokkos::View<PetscScalar *> &a_d, Mat *A)
1247 {
1248   Mat_SeqAIJKokkos *akok;
1249 
1250   PetscFunctionBegin;
1251   PetscCallCXX(akok = new Mat_SeqAIJKokkos(m, n, j_d.extent(0), i_d, j_d, a_d));
1252   PetscCall(MatCreate(comm, A));
1253   PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok));
1254   PetscFunctionReturn(PETSC_SUCCESS);
1255 }
1256 
1257 /* Computes Y += alpha X */
1258 static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y, PetscScalar alpha, Mat X, MatStructure pattern)
1259 {
1260   Mat_SeqAIJ              *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data;
1261   Mat_SeqAIJKokkos        *xkok, *ykok, *zkok;
1262   ConstMatScalarKokkosView Xa;
1263   MatScalarKokkosView      Ya;
1264   auto                     exec = PetscGetKokkosExecutionSpace();
1265 
1266   PetscFunctionBegin;
1267   PetscCheckTypeName(Y, MATSEQAIJKOKKOS);
1268   PetscCheckTypeName(X, MATSEQAIJKOKKOS);
1269   PetscCall(MatSeqAIJKokkosSyncDevice(Y));
1270   PetscCall(MatSeqAIJKokkosSyncDevice(X));
1271   PetscCall(PetscLogGpuTimeBegin());
1272 
1273   if (pattern != SAME_NONZERO_PATTERN && x->nz == y->nz) {
1274     PetscBool e;
1275     PetscCall(PetscArraycmp(x->i, y->i, Y->rmap->n + 1, &e));
1276     if (e) {
1277       PetscCall(PetscArraycmp(x->j, y->j, y->nz, &e));
1278       if (e) pattern = SAME_NONZERO_PATTERN;
1279     }
1280   }
1281 
1282   /* cusparseDcsrgeam2() computes C = alpha A + beta B. If one knew sparsity pattern of C, one can skip
1283     cusparseScsrgeam2_bufferSizeExt() / cusparseXcsrgeam2Nnz(), and directly call cusparseScsrgeam2().
1284     If X is SUBSET_NONZERO_PATTERN of Y, we could take advantage of this cusparse feature. However,
1285     KokkosSparse::spadd(alpha,A,beta,B,C) has symbolic and numeric phases, MatAXPY does not.
1286   */
1287   ykok = static_cast<Mat_SeqAIJKokkos *>(Y->spptr);
1288   xkok = static_cast<Mat_SeqAIJKokkos *>(X->spptr);
1289   Xa   = xkok->a_dual.view_device();
1290   Ya   = ykok->a_dual.view_device();
1291 
1292   if (pattern == SAME_NONZERO_PATTERN) {
1293     KokkosBlas::axpy(exec, alpha, Xa, Ya);
1294     PetscCall(MatSeqAIJKokkosModifyDevice(Y));
1295   } else if (pattern == SUBSET_NONZERO_PATTERN) {
1296     MatRowMapKokkosView Xi = xkok->i_dual.view_device(), Yi = ykok->i_dual.view_device();
1297     MatColIdxKokkosView Xj = xkok->j_dual.view_device(), Yj = ykok->j_dual.view_device();
1298 
1299     Kokkos::parallel_for(
1300       Kokkos::TeamPolicy<>(exec, Y->rmap->n, 1), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
1301         PetscInt i = t.league_rank(); // row i
1302         Kokkos::single(Kokkos::PerTeam(t), [=]() {
1303           // Only one thread works in a team
1304           PetscInt p, q = Yi(i);
1305           for (p = Xi(i); p < Xi(i + 1); p++) {          // For each nonzero on row i of X,
1306             while (Xj(p) != Yj(q) && q < Yi(i + 1)) q++; // find the matching nonzero on row i of Y.
1307             if (Xj(p) == Yj(q)) {                        // Found it
1308               Ya(q) += alpha * Xa(p);
1309               q++;
1310             } else {
1311             // If not found, it indicates the input is wrong (X is not a SUBSET_NONZERO_PATTERN of Y).
1312             // Just insert a NaN at the beginning of row i if it is not empty, to make the result wrong.
1313 #if PETSC_PKG_KOKKOS_VERSION_GE(3, 7, 0)
1314               if (Yi(i) != Yi(i + 1)) Ya(Yi(i)) = Kokkos::ArithTraits<PetscScalar>::nan();
1315 #else
1316               if (Yi(i) != Yi(i + 1)) Ya(Yi(i)) = Kokkos::Experimental::nan("1");
1317 #endif
1318             }
1319           }
1320         });
1321       });
1322     PetscCall(MatSeqAIJKokkosModifyDevice(Y));
1323   } else { // different nonzero patterns
1324     Mat             Z;
1325     KokkosCsrMatrix zcsr;
1326     KernelHandle    kh;
1327     kh.create_spadd_handle(true); // X, Y are sorted
1328     KokkosSparse::spadd_symbolic(&kh, xkok->csrmat, ykok->csrmat, zcsr);
1329     KokkosSparse::spadd_numeric(&kh, alpha, xkok->csrmat, (PetscScalar)1.0, ykok->csrmat, zcsr);
1330     zkok = new Mat_SeqAIJKokkos(zcsr);
1331     PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, zkok, &Z));
1332     PetscCall(MatHeaderReplace(Y, &Z));
1333     kh.destroy_spadd_handle();
1334   }
1335   PetscCall(PetscLogGpuTimeEnd());
1336   PetscCall(PetscLogGpuFlops(xkok->a_dual.extent(0) * 2)); // Because we scaled X and then added it to Y
1337   PetscFunctionReturn(PETSC_SUCCESS);
1338 }
1339 
1340 struct MatCOOStruct_SeqAIJKokkos {
1341   PetscCount           n;
1342   PetscCount           Atot;
1343   PetscInt             nz;
1344   PetscCountKokkosView jmap;
1345   PetscCountKokkosView perm;
1346 
1347   MatCOOStruct_SeqAIJKokkos(const MatCOOStruct_SeqAIJ *coo_h)
1348   {
1349     nz   = coo_h->nz;
1350     n    = coo_h->n;
1351     Atot = coo_h->Atot;
1352     jmap = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), PetscCountKokkosViewHost(coo_h->jmap, nz + 1));
1353     perm = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), PetscCountKokkosViewHost(coo_h->perm, Atot));
1354   }
1355 };
1356 
1357 static PetscErrorCode MatCOOStructDestroy_SeqAIJKokkos(void **data)
1358 {
1359   PetscFunctionBegin;
1360   PetscCallCXX(delete static_cast<MatCOOStruct_SeqAIJKokkos *>(*data));
1361   PetscFunctionReturn(PETSC_SUCCESS);
1362 }
1363 
1364 static PetscErrorCode MatSetPreallocationCOO_SeqAIJKokkos(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
1365 {
1366   Mat_SeqAIJKokkos          *akok;
1367   Mat_SeqAIJ                *aseq;
1368   PetscContainer             container_h;
1369   MatCOOStruct_SeqAIJ       *coo_h;
1370   MatCOOStruct_SeqAIJKokkos *coo_d;
1371 
1372   PetscFunctionBegin;
1373   PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, coo_i, coo_j));
1374   aseq = static_cast<Mat_SeqAIJ *>(mat->data);
1375   akok = static_cast<Mat_SeqAIJKokkos *>(mat->spptr);
1376   delete akok;
1377   mat->spptr = akok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, aseq, mat->nonzerostate + 1, PETSC_FALSE);
1378   PetscCall(MatZeroEntries_SeqAIJKokkos(mat));
1379 
1380   // Copy the COO struct to device
1381   PetscCall(PetscObjectQuery((PetscObject)mat, "__PETSc_MatCOOStruct_Host", (PetscObject *)&container_h));
1382   PetscCall(PetscContainerGetPointer(container_h, (void **)&coo_h));
1383   PetscCallCXX(coo_d = new MatCOOStruct_SeqAIJKokkos(coo_h));
1384 
1385   // Put the COO struct in a container and then attach that to the matrix
1386   PetscCall(PetscObjectContainerCompose((PetscObject)mat, "__PETSc_MatCOOStruct_Device", coo_d, MatCOOStructDestroy_SeqAIJKokkos));
1387   PetscFunctionReturn(PETSC_SUCCESS);
1388 }
1389 
1390 static PetscErrorCode MatSetValuesCOO_SeqAIJKokkos(Mat A, const PetscScalar v[], InsertMode imode)
1391 {
1392   MatScalarKokkosView        Aa;
1393   ConstMatScalarKokkosView   kv;
1394   PetscMemType               memtype;
1395   PetscContainer             container;
1396   MatCOOStruct_SeqAIJKokkos *coo;
1397 
1398   PetscFunctionBegin;
1399   PetscCall(PetscObjectQuery((PetscObject)A, "__PETSc_MatCOOStruct_Device", (PetscObject *)&container));
1400   PetscCall(PetscContainerGetPointer(container, (void **)&coo));
1401 
1402   const auto &n    = coo->n;
1403   const auto &Annz = coo->nz;
1404   const auto &jmap = coo->jmap;
1405   const auto &perm = coo->perm;
1406 
1407   PetscCall(PetscGetMemType(v, &memtype));
1408   if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */
1409     kv = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), ConstMatScalarKokkosViewHost(v, n));
1410   } else {
1411     kv = ConstMatScalarKokkosView(v, n); /* Directly use v[]'s memory */
1412   }
1413 
1414   if (imode == INSERT_VALUES) PetscCall(MatSeqAIJGetKokkosViewWrite(A, &Aa)); /* write matrix values */
1415   else PetscCall(MatSeqAIJGetKokkosView(A, &Aa));                             /* read & write matrix values */
1416 
1417   PetscCall(PetscLogGpuTimeBegin());
1418   Kokkos::parallel_for(
1419     Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, Annz), KOKKOS_LAMBDA(const PetscCount i) {
1420       PetscScalar sum = 0.0;
1421       for (PetscCount k = jmap(i); k < jmap(i + 1); k++) sum += kv(perm(k));
1422       Aa(i) = (imode == INSERT_VALUES ? 0.0 : Aa(i)) + sum;
1423     });
1424   PetscCall(PetscLogGpuTimeEnd());
1425 
1426   if (imode == INSERT_VALUES) PetscCall(MatSeqAIJRestoreKokkosViewWrite(A, &Aa));
1427   else PetscCall(MatSeqAIJRestoreKokkosView(A, &Aa));
1428   PetscFunctionReturn(PETSC_SUCCESS);
1429 }
1430 
1431 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A)
1432 {
1433   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
1434 
1435   PetscFunctionBegin;
1436   A->offloadmask = PETSC_OFFLOAD_KOKKOS; /* We do not really use this flag */
1437   A->boundtocpu  = PETSC_FALSE;
1438 
1439   A->ops->assemblyend               = MatAssemblyEnd_SeqAIJKokkos;
1440   A->ops->destroy                   = MatDestroy_SeqAIJKokkos;
1441   A->ops->duplicate                 = MatDuplicate_SeqAIJKokkos;
1442   A->ops->axpy                      = MatAXPY_SeqAIJKokkos;
1443   A->ops->scale                     = MatScale_SeqAIJKokkos;
1444   A->ops->zeroentries               = MatZeroEntries_SeqAIJKokkos;
1445   A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJKokkos;
1446   A->ops->mult                      = MatMult_SeqAIJKokkos;
1447   A->ops->multadd                   = MatMultAdd_SeqAIJKokkos;
1448   A->ops->multtranspose             = MatMultTranspose_SeqAIJKokkos;
1449   A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJKokkos;
1450   A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJKokkos;
1451   A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos;
1452   A->ops->productnumeric            = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos;
1453   A->ops->transpose                 = MatTranspose_SeqAIJKokkos;
1454   A->ops->setoption                 = MatSetOption_SeqAIJKokkos;
1455   A->ops->getdiagonal               = MatGetDiagonal_SeqAIJKokkos;
1456   A->ops->shift                     = MatShift_SeqAIJKokkos;
1457   A->ops->diagonalset               = MatDiagonalSet_SeqAIJKokkos;
1458   A->ops->diagonalscale             = MatDiagonalScale_SeqAIJKokkos;
1459   A->ops->getcurrentmemtype         = MatGetCurrentMemType_SeqAIJKokkos;
1460   a->ops->getarray                  = MatSeqAIJGetArray_SeqAIJKokkos;
1461   a->ops->restorearray              = MatSeqAIJRestoreArray_SeqAIJKokkos;
1462   a->ops->getarrayread              = MatSeqAIJGetArrayRead_SeqAIJKokkos;
1463   a->ops->restorearrayread          = MatSeqAIJRestoreArrayRead_SeqAIJKokkos;
1464   a->ops->getarraywrite             = MatSeqAIJGetArrayWrite_SeqAIJKokkos;
1465   a->ops->restorearraywrite         = MatSeqAIJRestoreArrayWrite_SeqAIJKokkos;
1466   a->ops->getcsrandmemtype          = MatSeqAIJGetCSRAndMemType_SeqAIJKokkos;
1467 
1468   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJKokkos));
1469   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJKokkos));
1470 #if defined(PETSC_HAVE_HYPRE)
1471   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijkokkos_hypre_C", MatConvert_AIJ_HYPRE));
1472 #endif
1473   PetscFunctionReturn(PETSC_SUCCESS);
1474 }
1475 
1476 /*
1477    Extract the (prescribled) diagonal blocks of the matrix and then invert them
1478 
1479   Input Parameters:
1480 +  A       - the MATSEQAIJKOKKOS matrix
1481 .  bs      - block sizes in 'csr' format, i.e., the i-th block has size bs(i+1) - bs(i)
1482 .  bs2     - square of block sizes in 'csr' format, i.e., the i-th block should be stored at offset bs2(i) in diagVal[]
1483 .  blkMap  - map row ids to block ids, i.e., row i belongs to the block blkMap(i)
1484 -  work    - a pre-allocated work buffer (as big as diagVal) for use by this routine
1485 
1486   Output Parameter:
1487 .  diagVal - the (pre-allocated) buffer to store the inverted blocks (each block is stored in column-major order)
1488 */
1489 PETSC_INTERN PetscErrorCode MatInvertVariableBlockDiagonal_SeqAIJKokkos(Mat A, const PetscIntKokkosView &bs, const PetscIntKokkosView &bs2, const PetscIntKokkosView &blkMap, PetscScalarKokkosView &work, PetscScalarKokkosView &diagVal)
1490 {
1491   Mat_SeqAIJKokkos *akok    = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1492   PetscInt          nblocks = bs.extent(0) - 1;
1493 
1494   PetscFunctionBegin;
1495   PetscCall(MatSeqAIJKokkosSyncDevice(A)); // Since we'll access A's value on device
1496 
1497   // Pull out the diagonal blocks of the matrix and then invert the blocks
1498   auto Aa    = akok->a_dual.view_device();
1499   auto Ai    = akok->i_dual.view_device();
1500   auto Aj    = akok->j_dual.view_device();
1501   auto Adiag = akok->diag_dual.view_device();
1502   // TODO: how to tune the team size?
1503 #if defined(KOKKOS_ENABLE_UNIFIED_MEMORY)
1504   auto ts = Kokkos::AUTO();
1505 #else
1506   auto ts = 16; // improved performance 30% over Kokkos::AUTO() with CUDA, but failed with "Kokkos::abort: Requested Team Size is too large!" on CPUs
1507 #endif
1508   PetscCallCXX(Kokkos::parallel_for(
1509     Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), nblocks, ts), KOKKOS_LAMBDA(const KokkosTeamMemberType &teamMember) {
1510       const PetscInt bid    = teamMember.league_rank();                                                   // block id
1511       const PetscInt rstart = bs(bid);                                                                    // this block starts from this row
1512       const PetscInt m      = bs(bid + 1) - bs(bid);                                                      // size of this block
1513       const auto    &B      = Kokkos::View<PetscScalar **, Kokkos::LayoutLeft>(&diagVal(bs2(bid)), m, m); // column-major order
1514       const auto    &W      = PetscScalarKokkosView(&work(bs2(bid)), m * m);
1515 
1516       Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, m), [=](const PetscInt &r) { // r-th row in B
1517         PetscInt i = rstart + r;                                                            // i-th row in A
1518 
1519         if (Ai(i) <= Adiag(i) && Adiag(i) < Ai(i + 1)) { // if the diagonal exists (common case)
1520           PetscInt first = Adiag(i) - r;                 // we start to check nonzeros from here along this row
1521 
1522           for (PetscInt c = 0; c < m; c++) {                   // walk n steps to see what column indices we will meet
1523             if (first + c < Ai(i) || first + c >= Ai(i + 1)) { // this entry (first+c) is out of range of this row, in other words, its value is zero
1524               B(r, c) = 0.0;
1525             } else if (Aj(first + c) == rstart + c) { // this entry is right on the (rstart+c) column
1526               B(r, c) = Aa(first + c);
1527             } else { // this entry does not show up in the CSR
1528               B(r, c) = 0.0;
1529             }
1530           }
1531         } else { // rare case that the diagonal does not exist
1532           const PetscInt begin = Ai(i);
1533           const PetscInt end   = Ai(i + 1);
1534           for (PetscInt c = 0; c < m; c++) B(r, c) = 0.0;
1535           for (PetscInt j = begin; j < end; j++) { // scan the whole row; could use binary search but this is a rare case so we did not.
1536             if (rstart <= Aj(j) && Aj(j) < rstart + m) B(r, Aj(j) - rstart) = Aa(j);
1537             else if (Aj(j) >= rstart + m) break;
1538           }
1539         }
1540       });
1541 
1542       // LU-decompose B (w/o pivoting) and then invert B
1543       KokkosBatched::TeamLU<KokkosTeamMemberType, KokkosBatched::Algo::LU::Unblocked>::invoke(teamMember, B, 0.0);
1544       KokkosBatched::TeamInverseLU<KokkosTeamMemberType, KokkosBatched::Algo::InverseLU::Unblocked>::invoke(teamMember, B, W);
1545     }));
1546   // PetscLogGpuFlops() is done in the caller PCSetUp_VPBJacobi_Kokkos as we don't want to compute the flops in kernels
1547   PetscFunctionReturn(PETSC_SUCCESS);
1548 }
1549 
1550 PETSC_INTERN PetscErrorCode MatSetSeqAIJKokkosWithCSRMatrix(Mat A, Mat_SeqAIJKokkos *akok)
1551 {
1552   Mat_SeqAIJ *aseq;
1553   PetscInt    i, m, n;
1554   auto        exec = PetscGetKokkosExecutionSpace();
1555 
1556   PetscFunctionBegin;
1557   PetscCheck(!A->spptr, PETSC_COMM_SELF, PETSC_ERR_PLIB, "A->spptr is supposed to be empty");
1558 
1559   m = akok->nrows();
1560   n = akok->ncols();
1561   PetscCall(MatSetSizes(A, m, n, m, n));
1562   PetscCall(MatSetType(A, MATSEQAIJKOKKOS));
1563 
1564   /* Set up data structures of A as a MATSEQAIJ */
1565   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(A, MAT_SKIP_ALLOCATION, NULL));
1566   aseq = (Mat_SeqAIJ *)A->data;
1567 
1568   PetscCall(KokkosDualViewSyncHost(akok->i_dual, exec)); /* We always need sync'ed i, j on host */
1569   PetscCall(KokkosDualViewSyncHost(akok->j_dual, exec));
1570 
1571   aseq->i       = akok->i_host_data();
1572   aseq->j       = akok->j_host_data();
1573   aseq->a       = akok->a_host_data();
1574   aseq->nonew   = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/
1575   aseq->free_a  = PETSC_FALSE;
1576   aseq->free_ij = PETSC_FALSE;
1577   aseq->nz      = akok->nnz();
1578   aseq->maxnz   = aseq->nz;
1579 
1580   PetscCall(PetscMalloc1(m, &aseq->imax));
1581   PetscCall(PetscMalloc1(m, &aseq->ilen));
1582   for (i = 0; i < m; i++) aseq->ilen[i] = aseq->imax[i] = aseq->i[i + 1] - aseq->i[i];
1583 
1584   /* It is critical to set the nonzerostate, as we use it to check if sparsity pattern (hence data) has changed on host in MatAssemblyEnd */
1585   akok->nonzerostate = A->nonzerostate;
1586   A->spptr           = akok; /* Set A->spptr before MatAssembly so that A->spptr won't be allocated again there */
1587   PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY));
1588   PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY));
1589   PetscFunctionReturn(PETSC_SUCCESS);
1590 }
1591 
1592 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGetKokkosCsrMatrix(Mat A, KokkosCsrMatrix *csr)
1593 {
1594   PetscFunctionBegin;
1595   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1596   *csr = static_cast<Mat_SeqAIJKokkos *>(A->spptr)->csrmat;
1597   PetscFunctionReturn(PETSC_SUCCESS);
1598 }
1599 
1600 PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithKokkosCsrMatrix(MPI_Comm comm, KokkosCsrMatrix csr, Mat *A)
1601 {
1602   Mat_SeqAIJKokkos *akok;
1603 
1604   PetscFunctionBegin;
1605   PetscCallCXX(akok = new Mat_SeqAIJKokkos(csr));
1606   PetscCall(MatCreate(comm, A));
1607   PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok));
1608   PetscFunctionReturn(PETSC_SUCCESS);
1609 }
1610 
1611 /* Crete a SEQAIJKOKKOS matrix with a Mat_SeqAIJKokkos data structure
1612 
1613    Note we have names like MatSeqAIJSetPreallocationCSR, so I use capitalized CSR
1614  */
1615 PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithCSRMatrix(MPI_Comm comm, Mat_SeqAIJKokkos *akok, Mat *A)
1616 {
1617   PetscFunctionBegin;
1618   PetscCall(MatCreate(comm, A));
1619   PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok));
1620   PetscFunctionReturn(PETSC_SUCCESS);
1621 }
1622 
1623 /*@C
1624   MatCreateSeqAIJKokkos - Creates a sparse matrix in `MATSEQAIJKOKKOS` (compressed row) format
1625   (the default parallel PETSc format). This matrix will ultimately be handled by
1626   Kokkos for calculations.
1627 
1628   Collective
1629 
1630   Input Parameters:
1631 + comm - MPI communicator, set to `PETSC_COMM_SELF`
1632 . m    - number of rows
1633 . n    - number of columns
1634 . nz   - number of nonzeros per row (same for all rows), ignored if `nnz` is provided
1635 - nnz  - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL`
1636 
1637   Output Parameter:
1638 . A - the matrix
1639 
1640   Level: intermediate
1641 
1642   Notes:
1643   It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
1644   MatXXXXSetPreallocation() paradgm instead of this routine directly.
1645   [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]
1646 
1647   The AIJ format, also called
1648   compressed row storage, is fully compatible with standard Fortran
1649   storage.  That is, the stored row and column indices can begin at
1650   either one (as in Fortran) or zero.
1651 
1652   Specify the preallocated storage with either `nz` or `nnz` (not both).
1653   Set `nz` = `PETSC_DEFAULT` and `nnz` = `NULL` for PETSc to control dynamic memory
1654   allocation.
1655 
1656 .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`
1657 @*/
1658 PetscErrorCode MatCreateSeqAIJKokkos(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A)
1659 {
1660   PetscFunctionBegin;
1661   PetscCall(PetscKokkosInitializeCheck());
1662   PetscCall(MatCreate(comm, A));
1663   PetscCall(MatSetSizes(*A, m, n, m, n));
1664   PetscCall(MatSetType(*A, MATSEQAIJKOKKOS));
1665   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz));
1666   PetscFunctionReturn(PETSC_SUCCESS);
1667 }
1668 
1669 // After matrix numeric factorization, there are still steps to do before triangular solve can be called.
1670 // For example, for transpose solve, we might need to compute the transpose matrices if the solver does not support it (such as KK, while cusparse does).
1671 // In cusparse, one has to call cusparseSpSV_analysis() with updated triangular matrix values before calling cusparseSpSV_solve().
1672 // Simiarily, in KK sptrsv_symbolic() has to be called before sptrsv_solve(). We put these steps in MatSeqAIJKokkos{Transpose}SolveCheck.
1673 static PetscErrorCode MatSeqAIJKokkosSolveCheck(Mat A)
1674 {
1675   Mat_SeqAIJKokkosTriFactors *factors   = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1676   const PetscBool             has_lower = factors->iL_d.extent(0) ? PETSC_TRUE : PETSC_FALSE; // false with Choleksy
1677   const PetscBool             has_upper = factors->iU_d.extent(0) ? PETSC_TRUE : PETSC_FALSE; // true with LU and Choleksy
1678 
1679   PetscFunctionBegin;
1680   if (!factors->sptrsv_symbolic_completed) { // If sptrsv_symbolic was not called yet
1681     if (has_upper) PetscCallCXX(sptrsv_symbolic(&factors->khU, factors->iU_d, factors->jU_d, factors->aU_d));
1682     if (has_lower) PetscCallCXX(sptrsv_symbolic(&factors->khL, factors->iL_d, factors->jL_d, factors->aL_d));
1683     factors->sptrsv_symbolic_completed = PETSC_TRUE;
1684   }
1685   PetscFunctionReturn(PETSC_SUCCESS);
1686 }
1687 
1688 static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A)
1689 {
1690   const PetscInt              n         = A->rmap->n;
1691   Mat_SeqAIJKokkosTriFactors *factors   = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1692   const PetscBool             has_lower = factors->iL_d.extent(0) ? PETSC_TRUE : PETSC_FALSE; // false with Choleksy
1693   const PetscBool             has_upper = factors->iU_d.extent(0) ? PETSC_TRUE : PETSC_FALSE; // true with LU or Choleksy
1694 
1695   PetscFunctionBegin;
1696   if (!factors->transpose_updated) {
1697     if (has_upper) {
1698       if (!factors->iUt_d.extent(0)) {                                 // Allocate Ut on device if not yet
1699         factors->iUt_d = MatRowMapKokkosView("factors->iUt_d", n + 1); // KK requires this view to be initialized to 0 to call transpose_matrix
1700         factors->jUt_d = MatColIdxKokkosView(NoInit("factors->jUt_d"), factors->jU_d.extent(0));
1701         factors->aUt_d = MatScalarKokkosView(NoInit("factors->aUt_d"), factors->aU_d.extent(0));
1702       }
1703 
1704       if (factors->iU_h.extent(0)) { // If U is on host (factorization was done on host), we also compute the transpose on host
1705         if (!factors->U) {
1706           Mat_SeqAIJ *seq;
1707 
1708           PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF, n, n, factors->iU_h.data(), factors->jU_h.data(), factors->aU_h.data(), &factors->U));
1709           PetscCall(MatTranspose(factors->U, MAT_INITIAL_MATRIX, &factors->Ut));
1710 
1711           seq            = static_cast<Mat_SeqAIJ *>(factors->Ut->data);
1712           factors->iUt_h = MatRowMapKokkosViewHost(seq->i, n + 1);
1713           factors->jUt_h = MatColIdxKokkosViewHost(seq->j, seq->nz);
1714           factors->aUt_h = MatScalarKokkosViewHost(seq->a, seq->nz);
1715         } else {
1716           PetscCall(MatTranspose(factors->U, MAT_REUSE_MATRIX, &factors->Ut)); // Matrix Ut' data is aliased with {i, j, a}Ut_h
1717         }
1718         // Copy Ut from host to device
1719         PetscCallCXX(Kokkos::deep_copy(factors->iUt_d, factors->iUt_h));
1720         PetscCallCXX(Kokkos::deep_copy(factors->jUt_d, factors->jUt_h));
1721         PetscCallCXX(Kokkos::deep_copy(factors->aUt_d, factors->aUt_h));
1722       } else { // If U was computed on device, we also compute the transpose there
1723         // TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices. We have to sort the indices, until KK provides finer control options.
1724         PetscCallCXX(transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iU_d,
1725                                                                                                                                                                                                                                factors->jU_d, factors->aU_d,
1726                                                                                                                                                                                                                                factors->iUt_d, factors->jUt_d,
1727                                                                                                                                                                                                                                factors->aUt_d));
1728         PetscCallCXX(sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iUt_d, factors->jUt_d, factors->aUt_d));
1729       }
1730       PetscCallCXX(sptrsv_symbolic(&factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d));
1731     }
1732 
1733     // do the same for L with LU
1734     if (has_lower) {
1735       if (!factors->iLt_d.extent(0)) {                                 // Allocate Lt on device if not yet
1736         factors->iLt_d = MatRowMapKokkosView("factors->iLt_d", n + 1); // KK requires this view to be initialized to 0 to call transpose_matrix
1737         factors->jLt_d = MatColIdxKokkosView(NoInit("factors->jLt_d"), factors->jL_d.extent(0));
1738         factors->aLt_d = MatScalarKokkosView(NoInit("factors->aLt_d"), factors->aL_d.extent(0));
1739       }
1740 
1741       if (factors->iL_h.extent(0)) { // If L is on host, we also compute the transpose on host
1742         if (!factors->L) {
1743           Mat_SeqAIJ *seq;
1744 
1745           PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF, n, n, factors->iL_h.data(), factors->jL_h.data(), factors->aL_h.data(), &factors->L));
1746           PetscCall(MatTranspose(factors->L, MAT_INITIAL_MATRIX, &factors->Lt));
1747 
1748           seq            = static_cast<Mat_SeqAIJ *>(factors->Lt->data);
1749           factors->iLt_h = MatRowMapKokkosViewHost(seq->i, n + 1);
1750           factors->jLt_h = MatColIdxKokkosViewHost(seq->j, seq->nz);
1751           factors->aLt_h = MatScalarKokkosViewHost(seq->a, seq->nz);
1752         } else {
1753           PetscCall(MatTranspose(factors->L, MAT_REUSE_MATRIX, &factors->Lt)); // Matrix Lt' data is aliased with {i, j, a}Lt_h
1754         }
1755         // Copy Lt from host to device
1756         PetscCallCXX(Kokkos::deep_copy(factors->iLt_d, factors->iLt_h));
1757         PetscCallCXX(Kokkos::deep_copy(factors->jLt_d, factors->jLt_h));
1758         PetscCallCXX(Kokkos::deep_copy(factors->aLt_d, factors->aLt_h));
1759       } else { // If L was computed on device, we also compute the transpose there
1760         // TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices. We have to sort the indices, until KK provides finer control options.
1761         PetscCallCXX(transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iL_d,
1762                                                                                                                                                                                                                                factors->jL_d, factors->aL_d,
1763                                                                                                                                                                                                                                factors->iLt_d, factors->jLt_d,
1764                                                                                                                                                                                                                                factors->aLt_d));
1765         PetscCallCXX(sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iLt_d, factors->jLt_d, factors->aLt_d));
1766       }
1767       PetscCallCXX(sptrsv_symbolic(&factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d));
1768     }
1769 
1770     factors->transpose_updated = PETSC_TRUE;
1771   }
1772   PetscFunctionReturn(PETSC_SUCCESS);
1773 }
1774 
1775 // Solve Ax = b, with RAR = U^T D U, where R is the row (and col) permutation matrix on A.
1776 // R is represented by rowperm in factors. If R is identity (i.e, no reordering), then rowperm is empty.
1777 static PetscErrorCode MatSolve_SeqAIJKokkos_Cholesky(Mat A, Vec bb, Vec xx)
1778 {
1779   auto                        exec    = PetscGetKokkosExecutionSpace();
1780   Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1781   PetscInt                    m       = A->rmap->n;
1782   PetscScalarKokkosView       D       = factors->D_d;
1783   PetscScalarKokkosView       X, Y, B; // alias
1784   ConstPetscScalarKokkosView  b;
1785   PetscScalarKokkosView       x;
1786   PetscIntKokkosView         &rowperm  = factors->rowperm;
1787   PetscBool                   identity = rowperm.extent(0) ? PETSC_FALSE : PETSC_TRUE;
1788 
1789   PetscFunctionBegin;
1790   PetscCall(PetscLogGpuTimeBegin());
1791   PetscCall(MatSeqAIJKokkosSolveCheck(A));          // for UX = T
1792   PetscCall(MatSeqAIJKokkosTransposeSolveCheck(A)); // for U^T Y = B
1793   PetscCall(VecGetKokkosView(bb, &b));
1794   PetscCall(VecGetKokkosViewWrite(xx, &x));
1795 
1796   // Solve U^T Y = B
1797   if (identity) { // Reorder b with the row permutation
1798     B = PetscScalarKokkosView(const_cast<PetscScalar *>(b.data()), b.extent(0));
1799     Y = factors->workVector;
1800   } else {
1801     B = factors->workVector;
1802     PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { B(i) = b(rowperm(i)); }));
1803     Y = x;
1804   }
1805   PetscCallCXX(sptrsv_solve(exec, &factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d, B, Y));
1806 
1807   // Solve diag(D) Y' = Y.
1808   // Actually just do Y' = Y*D since D is already inverted in MatCholeskyFactorNumeric_SeqAIJ(). It is basically a vector element-wise multiplication.
1809   PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { Y(i) = Y(i) * D(i); }));
1810 
1811   // Solve UX = Y
1812   if (identity) {
1813     X = x;
1814   } else {
1815     X = factors->workVector; // B is not needed anymore
1816   }
1817   PetscCallCXX(sptrsv_solve(exec, &factors->khU, factors->iU_d, factors->jU_d, factors->aU_d, Y, X));
1818 
1819   // Reorder X with the inverse column (row) permutation
1820   if (!identity) {
1821     PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { x(rowperm(i)) = X(i); }));
1822   }
1823 
1824   PetscCall(VecRestoreKokkosView(bb, &b));
1825   PetscCall(VecRestoreKokkosViewWrite(xx, &x));
1826   PetscCall(PetscLogGpuTimeEnd());
1827   PetscFunctionReturn(PETSC_SUCCESS);
1828 }
1829 
1830 // Solve Ax = b, with RAC = LU, where R and C are row and col permutation matrices on A respectively.
1831 // R and C are represented by rowperm and colperm in factors.
1832 // If R or C is identity (i.e, no reordering), then rowperm or colperm is empty.
1833 static PetscErrorCode MatSolve_SeqAIJKokkos_LU(Mat A, Vec bb, Vec xx)
1834 {
1835   auto                        exec    = PetscGetKokkosExecutionSpace();
1836   Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1837   PetscInt                    m       = A->rmap->n;
1838   PetscScalarKokkosView       X, Y, B; // alias
1839   ConstPetscScalarKokkosView  b;
1840   PetscScalarKokkosView       x;
1841   PetscIntKokkosView         &rowperm      = factors->rowperm;
1842   PetscIntKokkosView         &colperm      = factors->colperm;
1843   PetscBool                   row_identity = rowperm.extent(0) ? PETSC_FALSE : PETSC_TRUE;
1844   PetscBool                   col_identity = colperm.extent(0) ? PETSC_FALSE : PETSC_TRUE;
1845 
1846   PetscFunctionBegin;
1847   PetscCall(PetscLogGpuTimeBegin());
1848   PetscCall(MatSeqAIJKokkosSolveCheck(A));
1849   PetscCall(VecGetKokkosView(bb, &b));
1850   PetscCall(VecGetKokkosViewWrite(xx, &x));
1851 
1852   // Solve L Y = B (i.e., L (U C^- x) = R b).  R b indicates applying the row permutation on b.
1853   if (row_identity) {
1854     B = PetscScalarKokkosView(const_cast<PetscScalar *>(b.data()), b.extent(0));
1855     Y = factors->workVector;
1856   } else {
1857     B = factors->workVector;
1858     PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { B(i) = b(rowperm(i)); }));
1859     Y = x;
1860   }
1861   PetscCallCXX(sptrsv_solve(exec, &factors->khL, factors->iL_d, factors->jL_d, factors->aL_d, B, Y));
1862 
1863   // Solve U C^- x = Y
1864   if (col_identity) {
1865     X = x;
1866   } else {
1867     X = factors->workVector;
1868   }
1869   PetscCallCXX(sptrsv_solve(exec, &factors->khU, factors->iU_d, factors->jU_d, factors->aU_d, Y, X));
1870 
1871   // x = C X; Reorder X with the inverse col permutation
1872   if (!col_identity) {
1873     PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { x(colperm(i)) = X(i); }));
1874   }
1875 
1876   PetscCall(VecRestoreKokkosView(bb, &b));
1877   PetscCall(VecRestoreKokkosViewWrite(xx, &x));
1878   PetscCall(PetscLogGpuTimeEnd());
1879   PetscFunctionReturn(PETSC_SUCCESS);
1880 }
1881 
1882 // Solve A^T x = b, with RAC = LU, where R and C are row and col permutation matrices on A respectively.
1883 // R and C are represented by rowperm and colperm in factors.
1884 // If R or C is identity (i.e, no reordering), then rowperm or colperm is empty.
1885 // A = R^-1 L U C^-1, so A^T = C^-T U^T L^T R^-T. But since C^- = C^T, R^- = R^T, we have A^T = C U^T L^T R.
1886 static PetscErrorCode MatSolveTranspose_SeqAIJKokkos_LU(Mat A, Vec bb, Vec xx)
1887 {
1888   auto                        exec    = PetscGetKokkosExecutionSpace();
1889   Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1890   PetscInt                    m       = A->rmap->n;
1891   PetscScalarKokkosView       X, Y, B; // alias
1892   ConstPetscScalarKokkosView  b;
1893   PetscScalarKokkosView       x;
1894   PetscIntKokkosView         &rowperm      = factors->rowperm;
1895   PetscIntKokkosView         &colperm      = factors->colperm;
1896   PetscBool                   row_identity = rowperm.extent(0) ? PETSC_FALSE : PETSC_TRUE;
1897   PetscBool                   col_identity = colperm.extent(0) ? PETSC_FALSE : PETSC_TRUE;
1898 
1899   PetscFunctionBegin;
1900   PetscCall(PetscLogGpuTimeBegin());
1901   PetscCall(MatSeqAIJKokkosTransposeSolveCheck(A)); // Update L^T, U^T if needed, and do sptrsv symbolic for L^T, U^T
1902   PetscCall(VecGetKokkosView(bb, &b));
1903   PetscCall(VecGetKokkosViewWrite(xx, &x));
1904 
1905   // Solve U^T Y = B (i.e., U^T (L^T R x) = C^- b).  Note C^- b = C^T b, which means applying the column permutation on b.
1906   if (col_identity) { // Reorder b with the col permutation
1907     B = PetscScalarKokkosView(const_cast<PetscScalar *>(b.data()), b.extent(0));
1908     Y = factors->workVector;
1909   } else {
1910     B = factors->workVector;
1911     PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { B(i) = b(colperm(i)); }));
1912     Y = x;
1913   }
1914   PetscCallCXX(sptrsv_solve(exec, &factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d, B, Y));
1915 
1916   // Solve L^T X = Y
1917   if (row_identity) {
1918     X = x;
1919   } else {
1920     X = factors->workVector;
1921   }
1922   PetscCallCXX(sptrsv_solve(exec, &factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d, Y, X));
1923 
1924   // x = R^- X = R^T X; Reorder X with the inverse row permutation
1925   if (!row_identity) {
1926     PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { x(rowperm(i)) = X(i); }));
1927   }
1928 
1929   PetscCall(VecRestoreKokkosView(bb, &b));
1930   PetscCall(VecRestoreKokkosViewWrite(xx, &x));
1931   PetscCall(PetscLogGpuTimeEnd());
1932   PetscFunctionReturn(PETSC_SUCCESS);
1933 }
1934 
1935 static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info)
1936 {
1937   PetscFunctionBegin;
1938   PetscCall(MatSeqAIJKokkosSyncHost(A));
1939   PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info));
1940 
1941   if (!info->solveonhost) { // if solve on host, then we don't need to copy L, U to device
1942     Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr;
1943     Mat_SeqAIJ                 *b       = static_cast<Mat_SeqAIJ *>(B->data);
1944     const PetscInt             *Bi = b->i, *Bj = b->j, *Bdiag = b->diag;
1945     const MatScalar            *Ba = b->a;
1946     PetscInt                    m = B->rmap->n, n = B->cmap->n;
1947 
1948     if (factors->iL_h.extent(0) == 0) { // Allocate memory and copy the L, U structure for the first time
1949       // Allocate memory and copy the structure
1950       factors->iL_h = MatRowMapKokkosViewHost(NoInit("iL_h"), m + 1);
1951       factors->jL_h = MatColIdxKokkosViewHost(NoInit("jL_h"), (Bi[m] - Bi[0]) + m); // + the diagonal entries
1952       factors->aL_h = MatScalarKokkosViewHost(NoInit("aL_h"), (Bi[m] - Bi[0]) + m);
1953       factors->iU_h = MatRowMapKokkosViewHost(NoInit("iU_h"), m + 1);
1954       factors->jU_h = MatColIdxKokkosViewHost(NoInit("jU_h"), (Bdiag[0] - Bdiag[m]));
1955       factors->aU_h = MatScalarKokkosViewHost(NoInit("aU_h"), (Bdiag[0] - Bdiag[m]));
1956 
1957       PetscInt *Li = factors->iL_h.data();
1958       PetscInt *Lj = factors->jL_h.data();
1959       PetscInt *Ui = factors->iU_h.data();
1960       PetscInt *Uj = factors->jU_h.data();
1961 
1962       Li[0] = Ui[0] = 0;
1963       for (PetscInt i = 0; i < m; i++) {
1964         PetscInt llen = Bi[i + 1] - Bi[i];       // exclusive of the diagonal entry
1965         PetscInt ulen = Bdiag[i] - Bdiag[i + 1]; // inclusive of the diagonal entry
1966 
1967         PetscArraycpy(Lj + Li[i], Bj + Bi[i], llen); // entries of L on the left of the diagonal
1968         Lj[Li[i] + llen] = i;                        // diagonal entry of L
1969 
1970         Uj[Ui[i]] = i;                                                  // diagonal entry of U
1971         PetscArraycpy(Uj + Ui[i] + 1, Bj + Bdiag[i + 1] + 1, ulen - 1); // entries of U on  the right of the diagonal
1972 
1973         Li[i + 1] = Li[i] + llen + 1;
1974         Ui[i + 1] = Ui[i] + ulen;
1975       }
1976 
1977       factors->iL_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->iL_h);
1978       factors->jL_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->jL_h);
1979       factors->iU_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->iU_h);
1980       factors->jU_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->jU_h);
1981       factors->aL_d = Kokkos::create_mirror_view(DefaultMemorySpace(), factors->aL_h);
1982       factors->aU_d = Kokkos::create_mirror_view(DefaultMemorySpace(), factors->aU_h);
1983 
1984       // Copy row/col permutation to device
1985       IS        rowperm = ((Mat_SeqAIJ *)B->data)->row;
1986       PetscBool row_identity;
1987       PetscCall(ISIdentity(rowperm, &row_identity));
1988       if (!row_identity) {
1989         const PetscInt *ip;
1990 
1991         PetscCall(ISGetIndices(rowperm, &ip));
1992         factors->rowperm = PetscIntKokkosView(NoInit("rowperm"), m);
1993         PetscCallCXX(Kokkos::deep_copy(factors->rowperm, PetscIntKokkosViewHost(const_cast<PetscInt *>(ip), m)));
1994         PetscCall(ISRestoreIndices(rowperm, &ip));
1995         PetscCall(PetscLogCpuToGpu(m * sizeof(PetscInt)));
1996       }
1997 
1998       IS        colperm = ((Mat_SeqAIJ *)B->data)->col;
1999       PetscBool col_identity;
2000       PetscCall(ISIdentity(colperm, &col_identity));
2001       if (!col_identity) {
2002         const PetscInt *ip;
2003 
2004         PetscCall(ISGetIndices(colperm, &ip));
2005         factors->colperm = PetscIntKokkosView(NoInit("colperm"), n);
2006         PetscCallCXX(Kokkos::deep_copy(factors->colperm, PetscIntKokkosViewHost(const_cast<PetscInt *>(ip), n)));
2007         PetscCall(ISRestoreIndices(colperm, &ip));
2008         PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
2009       }
2010 
2011       /* Create sptrsv handles for L, U and their transpose */
2012 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
2013       auto sptrsv_alg = SPTRSVAlgorithm::SPTRSV_CUSPARSE;
2014 #else
2015       auto sptrsv_alg = SPTRSVAlgorithm::SEQLVLSCHD_TP1;
2016 #endif
2017       factors->khL.create_sptrsv_handle(sptrsv_alg, m, true /* L is lower tri */);
2018       factors->khU.create_sptrsv_handle(sptrsv_alg, m, false /* U is not lower tri */);
2019       factors->khLt.create_sptrsv_handle(sptrsv_alg, m, false /* L^T is not lower tri */);
2020       factors->khUt.create_sptrsv_handle(sptrsv_alg, m, true /* U^T is lower tri */);
2021     }
2022 
2023     // Copy the value
2024     for (PetscInt i = 0; i < m; i++) {
2025       PetscInt        llen = Bi[i + 1] - Bi[i];
2026       PetscInt        ulen = Bdiag[i] - Bdiag[i + 1];
2027       const PetscInt *Li   = factors->iL_h.data();
2028       const PetscInt *Ui   = factors->iU_h.data();
2029 
2030       PetscScalar *La = factors->aL_h.data();
2031       PetscScalar *Ua = factors->aU_h.data();
2032 
2033       PetscArraycpy(La + Li[i], Ba + Bi[i], llen); // entries of L
2034       La[Li[i] + llen] = 1.0;                      // diagonal entry
2035 
2036       Ua[Ui[i]] = 1.0 / Ba[Bdiag[i]];                                 // diagonal entry
2037       PetscArraycpy(Ua + Ui[i] + 1, Ba + Bdiag[i + 1] + 1, ulen - 1); // entries of U
2038     }
2039 
2040     PetscCallCXX(Kokkos::deep_copy(factors->aL_d, factors->aL_h));
2041     PetscCallCXX(Kokkos::deep_copy(factors->aU_d, factors->aU_h));
2042     // Once the factors' values have changed, we need to update their transpose and redo sptrsv symbolic
2043     factors->transpose_updated         = PETSC_FALSE;
2044     factors->sptrsv_symbolic_completed = PETSC_FALSE;
2045 
2046     B->ops->solve          = MatSolve_SeqAIJKokkos_LU;
2047     B->ops->solvetranspose = MatSolveTranspose_SeqAIJKokkos_LU;
2048   }
2049 
2050   B->ops->matsolve          = NULL;
2051   B->ops->matsolvetranspose = NULL;
2052   PetscFunctionReturn(PETSC_SUCCESS);
2053 }
2054 
2055 static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos_ILU0(Mat B, Mat A, const MatFactorInfo *info)
2056 {
2057   Mat_SeqAIJKokkos           *aijkok   = (Mat_SeqAIJKokkos *)A->spptr;
2058   Mat_SeqAIJKokkosTriFactors *factors  = (Mat_SeqAIJKokkosTriFactors *)B->spptr;
2059   PetscInt                    fill_lev = info->levels;
2060 
2061   PetscFunctionBegin;
2062   PetscCall(PetscLogGpuTimeBegin());
2063   PetscCheck(!info->factoronhost, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "MatFactorInfo.factoronhost should be false");
2064   PetscCall(MatSeqAIJKokkosSyncDevice(A));
2065 
2066   auto a_d = aijkok->a_dual.view_device();
2067   auto i_d = aijkok->i_dual.view_device();
2068   auto j_d = aijkok->j_dual.view_device();
2069 
2070   PetscCallCXX(spiluk_numeric(&factors->kh, fill_lev, i_d, j_d, a_d, factors->iL_d, factors->jL_d, factors->aL_d, factors->iU_d, factors->jU_d, factors->aU_d));
2071 
2072   B->assembled              = PETSC_TRUE;
2073   B->preallocated           = PETSC_TRUE;
2074   B->ops->solve             = MatSolve_SeqAIJKokkos_LU;
2075   B->ops->solvetranspose    = MatSolveTranspose_SeqAIJKokkos_LU;
2076   B->ops->matsolve          = NULL;
2077   B->ops->matsolvetranspose = NULL;
2078 
2079   /* Once the factors' value changed, we need to update their transpose and sptrsv handle */
2080   factors->transpose_updated         = PETSC_FALSE;
2081   factors->sptrsv_symbolic_completed = PETSC_FALSE;
2082   /* TODO: log flops, but how to know that? */
2083   PetscCall(PetscLogGpuTimeEnd());
2084   PetscFunctionReturn(PETSC_SUCCESS);
2085 }
2086 
2087 // Use KK's spiluk_symbolic() to do ILU0 symbolic factorization, with no row/col reordering
2088 static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos_ILU0(Mat B, Mat A, IS, IS, const MatFactorInfo *info)
2089 {
2090   Mat_SeqAIJKokkos           *aijkok;
2091   Mat_SeqAIJ                 *b;
2092   Mat_SeqAIJKokkosTriFactors *factors  = (Mat_SeqAIJKokkosTriFactors *)B->spptr;
2093   PetscInt                    fill_lev = info->levels;
2094   PetscInt                    nnzA     = ((Mat_SeqAIJ *)A->data)->nz, nnzL, nnzU;
2095   PetscInt                    n        = A->rmap->n;
2096 
2097   PetscFunctionBegin;
2098   PetscCheck(!info->factoronhost, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "MatFactorInfo's factoronhost should be false as we are doing it on device right now");
2099   PetscCall(MatSeqAIJKokkosSyncDevice(A));
2100 
2101   /* Create a spiluk handle and then do symbolic factorization */
2102   nnzL = nnzU = PetscRealIntMultTruncate(info->fill, nnzA);
2103   factors->kh.create_spiluk_handle(SPILUKAlgorithm::SEQLVLSCHD_TP1, n, nnzL, nnzU);
2104 
2105   auto spiluk_handle = factors->kh.get_spiluk_handle();
2106 
2107   Kokkos::realloc(factors->iL_d, n + 1); /* Free old arrays and realloc */
2108   Kokkos::realloc(factors->jL_d, spiluk_handle->get_nnzL());
2109   Kokkos::realloc(factors->iU_d, n + 1);
2110   Kokkos::realloc(factors->jU_d, spiluk_handle->get_nnzU());
2111 
2112   aijkok   = (Mat_SeqAIJKokkos *)A->spptr;
2113   auto i_d = aijkok->i_dual.view_device();
2114   auto j_d = aijkok->j_dual.view_device();
2115   PetscCallCXX(spiluk_symbolic(&factors->kh, fill_lev, i_d, j_d, factors->iL_d, factors->jL_d, factors->iU_d, factors->jU_d));
2116   /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */
2117 
2118   Kokkos::resize(factors->jL_d, spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */
2119   Kokkos::resize(factors->jU_d, spiluk_handle->get_nnzU());
2120   Kokkos::realloc(factors->aL_d, spiluk_handle->get_nnzL()); /* No need to retain old value */
2121   Kokkos::realloc(factors->aU_d, spiluk_handle->get_nnzU());
2122 
2123   /* TODO: add options to select sptrsv algorithms */
2124   /* Create sptrsv handles for L, U and their transpose */
2125 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
2126   auto sptrsv_alg = SPTRSVAlgorithm::SPTRSV_CUSPARSE;
2127 #else
2128   auto sptrsv_alg = SPTRSVAlgorithm::SEQLVLSCHD_TP1;
2129 #endif
2130 
2131   factors->khL.create_sptrsv_handle(sptrsv_alg, n, true /* L is lower tri */);
2132   factors->khU.create_sptrsv_handle(sptrsv_alg, n, false /* U is not lower tri */);
2133   factors->khLt.create_sptrsv_handle(sptrsv_alg, n, false /* L^T is not lower tri */);
2134   factors->khUt.create_sptrsv_handle(sptrsv_alg, n, true /* U^T is lower tri */);
2135 
2136   /* Fill fields of the factor matrix B */
2137   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(B, MAT_SKIP_ALLOCATION, NULL));
2138   b     = (Mat_SeqAIJ *)B->data;
2139   b->nz = b->maxnz          = spiluk_handle->get_nnzL() + spiluk_handle->get_nnzU();
2140   B->info.fill_ratio_given  = info->fill;
2141   B->info.fill_ratio_needed = nnzA > 0 ? ((PetscReal)b->nz) / ((PetscReal)nnzA) : 1.0;
2142 
2143   B->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJKokkos_ILU0;
2144   PetscFunctionReturn(PETSC_SUCCESS);
2145 }
2146 
2147 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
2148 {
2149   PetscFunctionBegin;
2150   PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
2151   PetscCheck(!B->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr");
2152   PetscCallCXX(B->spptr = new Mat_SeqAIJKokkosTriFactors(B->rmap->n));
2153   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos;
2154   PetscFunctionReturn(PETSC_SUCCESS);
2155 }
2156 
2157 static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
2158 {
2159   PetscBool row_identity = PETSC_FALSE, col_identity = PETSC_FALSE;
2160 
2161   PetscFunctionBegin;
2162   if (!info->factoronhost) {
2163     PetscCall(ISIdentity(isrow, &row_identity));
2164     PetscCall(ISIdentity(iscol, &col_identity));
2165   }
2166 
2167   PetscCheck(!B->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr");
2168   PetscCallCXX(B->spptr = new Mat_SeqAIJKokkosTriFactors(B->rmap->n));
2169 
2170   if (!info->factoronhost && !info->levels && row_identity && col_identity) { // if level 0 and no reordering
2171     PetscCall(MatILUFactorSymbolic_SeqAIJKokkos_ILU0(B, A, isrow, iscol, info));
2172   } else {
2173     PetscCall(MatILUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info)); // otherwise, use PETSc's ILU on host
2174     B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos;
2175   }
2176   PetscFunctionReturn(PETSC_SUCCESS);
2177 }
2178 
2179 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info)
2180 {
2181   PetscFunctionBegin;
2182   PetscCall(MatSeqAIJKokkosSyncHost(A));
2183   PetscCall(MatCholeskyFactorNumeric_SeqAIJ(B, A, info));
2184 
2185   if (!info->solveonhost) { // if solve on host, then we don't need to copy L, U to device
2186     Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr;
2187     Mat_SeqAIJ                 *b       = static_cast<Mat_SeqAIJ *>(B->data);
2188     const PetscInt             *Bi = b->i, *Bj = b->j, *Bdiag = b->diag;
2189     const MatScalar            *Ba = b->a;
2190     PetscInt                    m  = B->rmap->n;
2191 
2192     if (factors->iU_h.extent(0) == 0) { // First time of numeric factorization
2193       // Allocate memory and copy the structure
2194       factors->iU_h = PetscIntKokkosViewHost(const_cast<PetscInt *>(Bi), m + 1); // wrap Bi as iU_h
2195       factors->jU_h = MatColIdxKokkosViewHost(NoInit("jU_h"), Bi[m]);
2196       factors->aU_h = MatScalarKokkosViewHost(NoInit("aU_h"), Bi[m]);
2197       factors->D_h  = MatScalarKokkosViewHost(NoInit("D_h"), m);
2198       factors->aU_d = Kokkos::create_mirror_view(DefaultMemorySpace(), factors->aU_h);
2199       factors->D_d  = Kokkos::create_mirror_view(DefaultMemorySpace(), factors->D_h);
2200 
2201       // Build jU_h from the skewed Aj
2202       PetscInt *Uj = factors->jU_h.data();
2203       for (PetscInt i = 0; i < m; i++) {
2204         PetscInt ulen = Bi[i + 1] - Bi[i];
2205         Uj[Bi[i]]     = i;                                              // diagonal entry
2206         PetscCall(PetscArraycpy(Uj + Bi[i] + 1, Bj + Bi[i], ulen - 1)); // entries of U on the right of the diagonal
2207       }
2208 
2209       // Copy iU, jU to device
2210       PetscCallCXX(factors->iU_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->iU_h));
2211       PetscCallCXX(factors->jU_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->jU_h));
2212 
2213       // Copy row/col permutation to device
2214       IS        rowperm = ((Mat_SeqAIJ *)B->data)->row;
2215       PetscBool row_identity;
2216       PetscCall(ISIdentity(rowperm, &row_identity));
2217       if (!row_identity) {
2218         const PetscInt *ip;
2219 
2220         PetscCall(ISGetIndices(rowperm, &ip));
2221         PetscCallCXX(factors->rowperm = PetscIntKokkosView(NoInit("rowperm"), m));
2222         PetscCallCXX(Kokkos::deep_copy(factors->rowperm, PetscIntKokkosViewHost(const_cast<PetscInt *>(ip), m)));
2223         PetscCall(ISRestoreIndices(rowperm, &ip));
2224         PetscCall(PetscLogCpuToGpu(m * sizeof(PetscInt)));
2225       }
2226 
2227       // Create sptrsv handles for U and U^T
2228 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
2229       auto sptrsv_alg = SPTRSVAlgorithm::SPTRSV_CUSPARSE;
2230 #else
2231       auto sptrsv_alg = SPTRSVAlgorithm::SEQLVLSCHD_TP1;
2232 #endif
2233       factors->khU.create_sptrsv_handle(sptrsv_alg, m, false /* U is not lower tri */);
2234       factors->khUt.create_sptrsv_handle(sptrsv_alg, m, true /* U^T is lower tri */);
2235     }
2236     // These pointers were set MatCholeskyFactorNumeric_SeqAIJ(), so we always need to update them
2237     B->ops->solve          = MatSolve_SeqAIJKokkos_Cholesky;
2238     B->ops->solvetranspose = MatSolve_SeqAIJKokkos_Cholesky;
2239 
2240     // Copy the value
2241     PetscScalar *Ua = factors->aU_h.data();
2242     PetscScalar *D  = factors->D_h.data();
2243     for (PetscInt i = 0; i < m; i++) {
2244       D[i]      = Ba[Bdiag[i]];     // actually Aa[Adiag[i]] is the inverse of the diagonal
2245       Ua[Bi[i]] = (PetscScalar)1.0; // set the unit diagonal for U
2246       for (PetscInt k = 0; k < Bi[i + 1] - Bi[i] - 1; k++) Ua[Bi[i] + 1 + k] = -Ba[Bi[i] + k];
2247     }
2248     PetscCallCXX(Kokkos::deep_copy(factors->aU_d, factors->aU_h));
2249     PetscCallCXX(Kokkos::deep_copy(factors->D_d, factors->D_h));
2250 
2251     factors->sptrsv_symbolic_completed = PETSC_FALSE; // When numeric value changed, we must do these again
2252     factors->transpose_updated         = PETSC_FALSE;
2253   }
2254 
2255   B->ops->matsolve          = NULL;
2256   B->ops->matsolvetranspose = NULL;
2257   PetscFunctionReturn(PETSC_SUCCESS);
2258 }
2259 
2260 static PetscErrorCode MatICCFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS perm, const MatFactorInfo *info)
2261 {
2262   PetscFunctionBegin;
2263   if (info->solveonhost) {
2264     // If solve on host, we have to change the type, as eventually we need to call MatSolve_SeqSBAIJ_1_NaturalOrdering() etc.
2265     PetscCall(MatSetType(B, MATSEQSBAIJ));
2266     PetscCall(MatSeqSBAIJSetPreallocation(B, 1, MAT_SKIP_ALLOCATION, NULL));
2267   }
2268 
2269   PetscCall(MatICCFactorSymbolic_SeqAIJ(B, A, perm, info));
2270 
2271   if (!info->solveonhost) {
2272     // If solve on device, B is still a MATSEQAIJKOKKOS, so we are good to allocate B->spptr
2273     PetscCheck(!B->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr");
2274     PetscCallCXX(B->spptr = new Mat_SeqAIJKokkosTriFactors(B->rmap->n));
2275     B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJKokkos;
2276   }
2277   PetscFunctionReturn(PETSC_SUCCESS);
2278 }
2279 
2280 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS perm, const MatFactorInfo *info)
2281 {
2282   PetscFunctionBegin;
2283   if (info->solveonhost) {
2284     // If solve on host, we have to change the type, as eventually we need to call MatSolve_SeqSBAIJ_1_NaturalOrdering() etc.
2285     PetscCall(MatSetType(B, MATSEQSBAIJ));
2286     PetscCall(MatSeqSBAIJSetPreallocation(B, 1, MAT_SKIP_ALLOCATION, NULL));
2287   }
2288 
2289   PetscCall(MatCholeskyFactorSymbolic_SeqAIJ(B, A, perm, info)); // it sets B's two ISes ((Mat_SeqAIJ*)B->data)->{row, col} to perm
2290 
2291   if (!info->solveonhost) {
2292     // If solve on device, B is still a MATSEQAIJKOKKOS, so we are good to allocate B->spptr
2293     PetscCheck(!B->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr");
2294     PetscCallCXX(B->spptr = new Mat_SeqAIJKokkosTriFactors(B->rmap->n));
2295     B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJKokkos;
2296   }
2297   PetscFunctionReturn(PETSC_SUCCESS);
2298 }
2299 
2300 // The _Kokkos suffix means we will use Kokkos as a solver for the SeqAIJKokkos matrix
2301 static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos_Kokkos(Mat A, MatSolverType *type)
2302 {
2303   PetscFunctionBegin;
2304   *type = MATSOLVERKOKKOS;
2305   PetscFunctionReturn(PETSC_SUCCESS);
2306 }
2307 
2308 /*MC
2309   MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices
2310   on a single GPU of type, `MATSEQAIJKOKKOS`, `MATAIJKOKKOS`.
2311 
2312   Level: beginner
2313 
2314 .seealso: [](ch_matrices), `Mat`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJKokkos()`, `MATAIJKOKKOS`, `MatKokkosSetFormat()`, `MatKokkosStorageFormat`, `MatKokkosFormatOperation`
2315 M*/
2316 PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A, MatFactorType ftype, Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */
2317 {
2318   PetscInt n = A->rmap->n;
2319   MPI_Comm comm;
2320 
2321   PetscFunctionBegin;
2322   PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
2323   PetscCall(MatCreate(comm, B));
2324   PetscCall(MatSetSizes(*B, n, n, n, n));
2325   PetscCall(MatSetBlockSizesFromMats(*B, A, A));
2326   (*B)->factortype = ftype;
2327   PetscCall(MatSetType(*B, MATSEQAIJKOKKOS));
2328   PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL));
2329   PetscCheck(!(*B)->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr");
2330 
2331   if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
2332     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJKokkos;
2333     (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos;
2334     PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU]));
2335     PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILU]));
2336     PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILUDT]));
2337   } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
2338     (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJKokkos;
2339     (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJKokkos;
2340     PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]));
2341     PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ICC]));
2342   } else SETERRQ(comm, PETSC_ERR_SUP, "MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]);
2343 
2344   // The factorization can use the ordering provided in MatLUFactorSymbolic(), MatCholeskyFactorSymbolic() etc, though we do it on host
2345   (*B)->canuseordering = PETSC_TRUE;
2346   PetscCall(PetscObjectComposeFunction((PetscObject)*B, "MatFactorGetSolverType_C", MatFactorGetSolverType_SeqAIJKokkos_Kokkos));
2347   PetscFunctionReturn(PETSC_SUCCESS);
2348 }
2349 
2350 PETSC_INTERN PetscErrorCode MatSolverTypeRegister_Kokkos(void)
2351 {
2352   PetscFunctionBegin;
2353   PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_LU, MatGetFactor_SeqAIJKokkos_Kokkos));
2354   PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_CHOLESKY, MatGetFactor_SeqAIJKokkos_Kokkos));
2355   PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_ILU, MatGetFactor_SeqAIJKokkos_Kokkos));
2356   PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_ICC, MatGetFactor_SeqAIJKokkos_Kokkos));
2357   PetscFunctionReturn(PETSC_SUCCESS);
2358 }
2359 
2360 /* Utility to print out a KokkosCsrMatrix for debugging */
2361 PETSC_INTERN PetscErrorCode PrintCsrMatrix(const KokkosCsrMatrix &csrmat)
2362 {
2363   const auto        &iv = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), csrmat.graph.row_map);
2364   const auto        &jv = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), csrmat.graph.entries);
2365   const auto        &av = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), csrmat.values);
2366   const PetscInt    *i  = iv.data();
2367   const PetscInt    *j  = jv.data();
2368   const PetscScalar *a  = av.data();
2369   PetscInt           m = csrmat.numRows(), n = csrmat.numCols(), nnz = csrmat.nnz();
2370 
2371   PetscFunctionBegin;
2372   PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT " x %" PetscInt_FMT " SeqAIJKokkos, with %" PetscInt_FMT " nonzeros\n", m, n, nnz));
2373   for (PetscInt k = 0; k < m; k++) {
2374     PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT ": ", k));
2375     for (PetscInt p = i[k]; p < i[k + 1]; p++) PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT "(%.1f), ", j[p], (double)PetscRealPart(a[p])));
2376     PetscCall(PetscPrintf(PETSC_COMM_SELF, "\n"));
2377   }
2378   PetscFunctionReturn(PETSC_SUCCESS);
2379 }
2380