xref: /petsc/src/mat/impls/aij/seq/kokkos/aijkok.kokkos.cxx (revision d2dffbf663ccc3d78d082b483cae224715b9dfa9)
1 #include <petsc_kokkos.hpp>
2 #include <petscvec_kokkos.hpp>
3 #include <petscmat_kokkos.hpp>
4 #include <petscpkg_version.h>
5 #include <petsc/private/petscimpl.h>
6 #include <petsc/private/sfimpl.h>
7 #include <petsc/private/kokkosimpl.hpp>
8 #include <petscsys.h>
9 
10 #include <Kokkos_Core.hpp>
11 #include <KokkosBlas.hpp>
12 #include <KokkosSparse_CrsMatrix.hpp>
13 
14 // To suppress compiler warnings:
15 // /path/include/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp:434:63:
16 // warning: 'cusparseStatus_t cusparseDbsrmm(cusparseHandle_t, cusparseDirection_t, cusparseOperation_t,
17 // cusparseOperation_t, int, int, int, int, const double*, cusparseMatDescr_t, const double*, const int*, const int*,
18 // int, const double*, int, const double*, double*, int)' is deprecated: please use cusparseSpMM instead [-Wdeprecated-declarations]
19 PETSC_PRAGMA_DIAGNOSTIC_IGNORED_BEGIN("-Wdeprecated-declarations")
20 #include <KokkosSparse_spmv.hpp>
21 PETSC_PRAGMA_DIAGNOSTIC_IGNORED_END()
22 
23 #include <KokkosSparse_spiluk.hpp>
24 #include <KokkosSparse_sptrsv.hpp>
25 #include <KokkosSparse_spgemm.hpp>
26 #include <KokkosSparse_spadd.hpp>
27 #include <KokkosBatched_LU_Decl.hpp>
28 #include <KokkosBatched_InverseLU_Decl.hpp>
29 
30 #include <../src/mat/impls/aij/seq/kokkos/aijkok.hpp>
31 
32 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_GE(3, 7, 0)
33   #include <KokkosSparse_Utils.hpp>
34 using KokkosSparse::sort_crs_matrix;
35 using KokkosSparse::Impl::transpose_matrix;
36 #else
37   #include <KokkosKernels_Sorting.hpp>
38 using KokkosKernels::sort_crs_matrix;
39 using KokkosKernels::Impl::transpose_matrix;
40 #endif
41 
42 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_GE(4, 6, 0)
43 using KokkosSparse::spiluk_symbolic;
44 using KokkosSparse::spiluk_numeric;
45 using KokkosSparse::sptrsv_symbolic;
46 using KokkosSparse::sptrsv_solve;
47 using KokkosSparse::Experimental::SPTRSVAlgorithm;
48 using KokkosSparse::Experimental::SPILUKAlgorithm;
49 #else
50 using KokkosSparse::Experimental::spiluk_symbolic;
51 using KokkosSparse::Experimental::spiluk_numeric;
52 using KokkosSparse::Experimental::sptrsv_symbolic;
53 using KokkosSparse::Experimental::sptrsv_solve;
54 using KokkosSparse::Experimental::SPTRSVAlgorithm;
55 using KokkosSparse::Experimental::SPILUKAlgorithm;
56 #endif
57 
58 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */
59 
60 /* MatAssemblyEnd_SeqAIJKokkos() happens when we finalized nonzeros of the matrix, either after
61    we assembled the matrix on host, or after we directly produced the matrix data on device (ex., through MatMatMult).
62    In the latter case, it is important to set a_dual's sync state correctly.
63  */
64 static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A, MatAssemblyType mode)
65 {
66   Mat_SeqAIJ       *aijseq;
67   Mat_SeqAIJKokkos *aijkok;
68 
69   PetscFunctionBegin;
70   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS);
71   PetscCall(MatAssemblyEnd_SeqAIJ(A, mode));
72 
73   aijseq = static_cast<Mat_SeqAIJ *>(A->data);
74   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
75 
76   /* If aijkok does not exist, we just copy i, j to device.
77      If aijkok already exists, but the device's nonzero pattern does not match with the host's, we assume the latest data is on host.
78      In both cases, we build a new aijkok structure.
79   */
80   if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { /* aijkok might not exist yet or nonzero pattern has changed */
81     if (aijkok && aijkok->host_aij_allocated_by_kokkos) {   /* Avoid accidently freeing much needed a,i,j on host when deleting aijkok */
82       PetscCall(PetscShmgetAllocateArray(aijkok->nrows() + 1, sizeof(PetscInt), (void **)&aijseq->i));
83       PetscCall(PetscShmgetAllocateArray(aijkok->nnz(), sizeof(PetscInt), (void **)&aijseq->j));
84       PetscCall(PetscShmgetAllocateArray(aijkok->nnz(), sizeof(PetscInt), (void **)&aijseq->a));
85       PetscCall(PetscArraycpy(aijseq->i, aijkok->i_host_data(), aijkok->nrows() + 1));
86       PetscCall(PetscArraycpy(aijseq->j, aijkok->j_host_data(), aijkok->nnz()));
87       PetscCall(PetscArraycpy(aijseq->a, aijkok->a_host_data(), aijkok->nnz()));
88       aijseq->free_a  = PETSC_TRUE;
89       aijseq->free_ij = PETSC_TRUE;
90       /* This arises from MatCreateSeqAIJKokkosWithKokkosCsrMatrix() used in MatMatMult, where
91          we have the CsrMatrix on device first and then copy to host, followed by
92          MatSetMPIAIJWithSplitSeqAIJ() with garray = NULL.
93          One could improve it by not using NULL garray.
94       */
95     }
96     delete aijkok;
97     aijkok   = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aijseq, A->nonzerostate, PETSC_FALSE /*don't copy mat values to device*/);
98     A->spptr = aijkok;
99   } else if (A->rmap->n && aijkok->diag_dual.extent(0) == 0) { // MatProduct might directly produce AIJ on device, but not the diag.
100     MatRowMapKokkosViewHost diag_h(aijseq->diag, A->rmap->n);
101     auto                    diag_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), diag_h);
102     aijkok->diag_dual              = MatRowMapKokkosDualView(diag_d, diag_h);
103   }
104   PetscFunctionReturn(PETSC_SUCCESS);
105 }
106 
107 /* Sync CSR data to device if not yet */
108 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A)
109 {
110   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
111 
112   PetscFunctionBegin;
113   PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Can't sync factorized matrix from host to device");
114   PetscCheck(aijkok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
115   if (aijkok->a_dual.need_sync_device()) {
116     aijkok->a_dual.sync_device();
117     aijkok->transpose_updated = PETSC_FALSE; /* values of the transpose is out-of-date */
118     aijkok->hermitian_updated = PETSC_FALSE;
119   }
120   PetscFunctionReturn(PETSC_SUCCESS);
121 }
122 
123 /* Mark the CSR data on device as modified */
124 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosModifyDevice(Mat A)
125 {
126   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
127 
128   PetscFunctionBegin;
129   PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Not supported for factorized matries");
130   aijkok->a_dual.clear_sync_state();
131   aijkok->a_dual.modify_device();
132   aijkok->transpose_updated = PETSC_FALSE;
133   aijkok->hermitian_updated = PETSC_FALSE;
134   PetscCall(MatSeqAIJInvalidateDiagonal(A));
135   PetscCall(PetscObjectStateIncrease((PetscObject)A));
136   PetscFunctionReturn(PETSC_SUCCESS);
137 }
138 
139 static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A)
140 {
141   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
142   auto              exec   = PetscGetKokkosExecutionSpace();
143 
144   PetscFunctionBegin;
145   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
146   /* We do not expect one needs factors on host  */
147   PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Can't sync factorized matrix from device to host");
148   PetscCheck(aijkok, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing AIJKOK");
149   PetscCall(KokkosDualViewSync<HostMirrorMemorySpace>(aijkok->a_dual, exec));
150   PetscFunctionReturn(PETSC_SUCCESS);
151 }
152 
153 static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A, PetscScalar *array[])
154 {
155   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
156 
157   PetscFunctionBegin;
158   /* aijkok contains valid pointers only if the host's nonzerostate matches with the device's.
159     Calling MatSeqAIJSetPreallocation() or MatSetValues() on host, where aijseq->{i,j,a} might be
160     reallocated, will lead to stale {i,j,a}_dual in aijkok. In both operations, the hosts's nonzerostate
161     must have been updated. The stale aijkok will be rebuilt during MatAssemblyEnd.
162   */
163   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
164     auto exec = PetscGetKokkosExecutionSpace();
165     PetscCallCXX(aijkok->a_dual.sync_host(exec));
166     PetscCallCXX(exec.fence());
167     *array = aijkok->a_dual.view_host().data();
168   } else { /* Happens when calling MatSetValues on a newly created matrix */
169     *array = static_cast<Mat_SeqAIJ *>(A->data)->a;
170   }
171   PetscFunctionReturn(PETSC_SUCCESS);
172 }
173 
174 static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJKokkos(Mat A, PetscScalar *array[])
175 {
176   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
177 
178   PetscFunctionBegin;
179   if (aijkok && A->nonzerostate == aijkok->nonzerostate) aijkok->a_dual.modify_host();
180   PetscFunctionReturn(PETSC_SUCCESS);
181 }
182 
183 static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[])
184 {
185   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
186 
187   PetscFunctionBegin;
188   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
189     auto exec = PetscGetKokkosExecutionSpace();
190     PetscCallCXX(aijkok->a_dual.sync_host(exec));
191     PetscCallCXX(exec.fence());
192     *array = aijkok->a_dual.view_host().data();
193   } else {
194     *array = static_cast<Mat_SeqAIJ *>(A->data)->a;
195   }
196   PetscFunctionReturn(PETSC_SUCCESS);
197 }
198 
199 static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[])
200 {
201   PetscFunctionBegin;
202   *array = NULL;
203   PetscFunctionReturn(PETSC_SUCCESS);
204 }
205 
206 static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[])
207 {
208   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
209 
210   PetscFunctionBegin;
211   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
212     *array = aijkok->a_dual.view_host().data();
213   } else { /* Ex. happens with MatZeroEntries on a preallocated but not assembled matrix */
214     *array = static_cast<Mat_SeqAIJ *>(A->data)->a;
215   }
216   PetscFunctionReturn(PETSC_SUCCESS);
217 }
218 
219 static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[])
220 {
221   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
222 
223   PetscFunctionBegin;
224   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
225     aijkok->a_dual.clear_sync_state();
226     aijkok->a_dual.modify_host();
227   }
228   PetscFunctionReturn(PETSC_SUCCESS);
229 }
230 
231 static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJKokkos(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype)
232 {
233   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
234 
235   PetscFunctionBegin;
236   PetscCheck(aijkok != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "aijkok is NULL");
237 
238   if (i) *i = aijkok->i_device_data();
239   if (j) *j = aijkok->j_device_data();
240   if (a) {
241     aijkok->a_dual.sync_device();
242     *a = aijkok->a_device_data();
243   }
244   if (mtype) *mtype = PETSC_MEMTYPE_KOKKOS;
245   PetscFunctionReturn(PETSC_SUCCESS);
246 }
247 
248 /*
249   Generate the sparsity pattern of a MatSeqAIJKokkos matrix's transpose on device.
250 
251   Input Parameter:
252 .  A       - the MATSEQAIJKOKKOS matrix
253 
254   Output Parameters:
255 +  perm_d - the permutation array on device, which connects Ta(i) = Aa(perm(i))
256 -  T_d    - the transpose on device, whose value array is allocated but not initialized
257 */
258 static PetscErrorCode MatSeqAIJKokkosGenerateTransposeStructure(Mat A, MatRowMapKokkosView &perm_d, KokkosCsrMatrix &T_d)
259 {
260   Mat_SeqAIJ             *aseq = static_cast<Mat_SeqAIJ *>(A->data);
261   PetscInt                nz = aseq->nz, m = A->rmap->N, n = A->cmap->n;
262   const PetscInt         *Ai = aseq->i, *Aj = aseq->j;
263   MatRowMapKokkosViewHost Ti_h(NoInit("Ti"), n + 1);
264   MatRowMapType          *Ti = Ti_h.data();
265   MatColIdxKokkosViewHost Tj_h(NoInit("Tj"), nz);
266   MatRowMapKokkosViewHost perm_h(NoInit("permutation"), nz);
267   PetscInt               *Tj   = Tj_h.data();
268   PetscInt               *perm = perm_h.data();
269   PetscInt               *offset;
270 
271   PetscFunctionBegin;
272   // Populate Ti
273   PetscCallCXX(Kokkos::deep_copy(Ti_h, 0));
274   Ti++;
275   for (PetscInt i = 0; i < nz; i++) Ti[Aj[i]]++;
276   Ti--;
277   for (PetscInt i = 0; i < n; i++) Ti[i + 1] += Ti[i];
278 
279   // Populate Tj and the permutation array
280   PetscCall(PetscCalloc1(n, &offset)); // offset in each T row to fill in its column indices
281   for (PetscInt i = 0; i < m; i++) {
282     for (PetscInt j = Ai[i]; j < Ai[i + 1]; j++) { // A's (i,j) is T's (j,i)
283       PetscInt r    = Aj[j];                       // row r of T
284       PetscInt disp = Ti[r] + offset[r];
285 
286       Tj[disp]   = i; // col i of T
287       perm[disp] = j;
288       offset[r]++;
289     }
290   }
291   PetscCall(PetscFree(offset));
292 
293   // Sort each row of T, along with the permutation array
294   for (PetscInt i = 0; i < n; i++) PetscCall(PetscSortIntWithArray(Ti[i + 1] - Ti[i], Tj + Ti[i], perm + Ti[i]));
295 
296   // Output perm and T on device
297   auto Ti_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), Ti_h);
298   auto Tj_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), Tj_h);
299   PetscCallCXX(T_d = KokkosCsrMatrix("csrmatT", n, m, nz, MatScalarKokkosView("Ta", nz), Ti_d, Tj_d));
300   PetscCallCXX(perm_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), perm_h));
301   PetscFunctionReturn(PETSC_SUCCESS);
302 }
303 
304 // Generate the transpose on device and cache it internally
305 // Note: KK transpose_matrix() does not have support symbolic/numeric transpose, so we do it on our own
306 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGenerateTranspose_Private(Mat A, KokkosCsrMatrix *csrmatT)
307 {
308   Mat_SeqAIJ       *aseq = static_cast<Mat_SeqAIJ *>(A->data);
309   Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
310   PetscInt          nz = aseq->nz, m = A->rmap->N, n = A->cmap->n;
311   KokkosCsrMatrix  &T = akok->csrmatT;
312 
313   PetscFunctionBegin;
314   PetscCheck(akok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
315   PetscCallCXX(akok->a_dual.sync_device()); // Sync A's values since we are going to access them on device
316 
317   const auto &Aa = akok->a_dual.view_device();
318 
319   if (A->symmetric == PETSC_BOOL3_TRUE) {
320     *csrmatT = akok->csrmat;
321   } else {
322     // See if we already have a cached transpose and its value is up to date
323     if (T.numRows() == n && T.numCols() == m) {  // this indicates csrmatT had been generated before, otherwise T has 0 rows/cols after construction
324       if (!akok->transpose_updated) {            // if the value is out of date, update the cached version
325         const auto &perm = akok->transpose_perm; // get the permutation array
326         auto       &Ta   = T.values;
327 
328         PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { Ta(i) = Aa(perm(i)); }));
329       }
330     } else { // Generate T of size n x m for the first time
331       MatRowMapKokkosView perm;
332 
333       PetscCall(MatSeqAIJKokkosGenerateTransposeStructure(A, perm, T));
334       akok->transpose_perm = perm; // cache the perm in this matrix for reuse
335       PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { T.values(i) = Aa(perm(i)); }));
336     }
337     akok->transpose_updated = PETSC_TRUE;
338     *csrmatT                = akok->csrmatT;
339   }
340   PetscFunctionReturn(PETSC_SUCCESS);
341 }
342 
343 // Generate the Hermitian on device and cache it internally
344 static PetscErrorCode MatSeqAIJKokkosGenerateHermitian_Private(Mat A, KokkosCsrMatrix *csrmatH)
345 {
346   Mat_SeqAIJ       *aseq = static_cast<Mat_SeqAIJ *>(A->data);
347   Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
348   PetscInt          nz = aseq->nz, m = A->rmap->N, n = A->cmap->n;
349   KokkosCsrMatrix  &T = akok->csrmatH;
350 
351   PetscFunctionBegin;
352   PetscCheck(akok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
353   PetscCallCXX(akok->a_dual.sync_device()); // Sync A's values since we are going to access them on device
354 
355   const auto &Aa = akok->a_dual.view_device();
356 
357   if (A->hermitian == PETSC_BOOL3_TRUE) {
358     *csrmatH = akok->csrmat;
359   } else {
360     // See if we already have a cached hermitian and its value is up to date
361     if (T.numRows() == n && T.numCols() == m) {  // this indicates csrmatT had been generated before, otherwise T has 0 rows/cols after construction
362       if (!akok->hermitian_updated) {            // if the value is out of date, update the cached version
363         const auto &perm = akok->transpose_perm; // get the permutation array
364         auto       &Ta   = T.values;
365 
366         PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { Ta(i) = PetscConj(Aa(perm(i))); }));
367       }
368     } else { // Generate T of size n x m for the first time
369       MatRowMapKokkosView perm;
370 
371       PetscCall(MatSeqAIJKokkosGenerateTransposeStructure(A, perm, T));
372       akok->transpose_perm = perm; // cache the perm in this matrix for reuse
373       PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { T.values(i) = PetscConj(Aa(perm(i))); }));
374     }
375     akok->hermitian_updated = PETSC_TRUE;
376     *csrmatH                = akok->csrmatH;
377   }
378   PetscFunctionReturn(PETSC_SUCCESS);
379 }
380 
381 /* y = A x */
382 static PetscErrorCode MatMult_SeqAIJKokkos(Mat A, Vec xx, Vec yy)
383 {
384   Mat_SeqAIJKokkos          *aijkok;
385   ConstPetscScalarKokkosView xv;
386   PetscScalarKokkosView      yv;
387 
388   PetscFunctionBegin;
389   PetscCall(PetscLogGpuTimeBegin());
390   PetscCall(MatSeqAIJKokkosSyncDevice(A));
391   PetscCall(VecGetKokkosView(xx, &xv));
392   PetscCall(VecGetKokkosViewWrite(yy, &yv));
393   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
394   PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), "N", 1.0 /*alpha*/, aijkok->csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A x + beta y */
395   PetscCall(VecRestoreKokkosView(xx, &xv));
396   PetscCall(VecRestoreKokkosViewWrite(yy, &yv));
397   /* 2.0*nnz - numRows seems more accurate here but assumes there are no zero-rows. So a little sloppy here. */
398   PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz()));
399   PetscCall(PetscLogGpuTimeEnd());
400   PetscFunctionReturn(PETSC_SUCCESS);
401 }
402 
403 /* y = A^T x */
404 static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy)
405 {
406   Mat_SeqAIJKokkos          *aijkok;
407   const char                *mode;
408   ConstPetscScalarKokkosView xv;
409   PetscScalarKokkosView      yv;
410   KokkosCsrMatrix            csrmat;
411 
412   PetscFunctionBegin;
413   PetscCall(PetscLogGpuTimeBegin());
414   PetscCall(MatSeqAIJKokkosSyncDevice(A));
415   PetscCall(VecGetKokkosView(xx, &xv));
416   PetscCall(VecGetKokkosViewWrite(yy, &yv));
417   if (A->form_explicit_transpose) {
418     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat));
419     mode = "N";
420   } else {
421     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
422     csrmat = aijkok->csrmat;
423     mode   = "T";
424   }
425   PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^T x + beta y */
426   PetscCall(VecRestoreKokkosView(xx, &xv));
427   PetscCall(VecRestoreKokkosViewWrite(yy, &yv));
428   PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
429   PetscCall(PetscLogGpuTimeEnd());
430   PetscFunctionReturn(PETSC_SUCCESS);
431 }
432 
433 /* y = A^H x */
434 static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy)
435 {
436   Mat_SeqAIJKokkos          *aijkok;
437   const char                *mode;
438   ConstPetscScalarKokkosView xv;
439   PetscScalarKokkosView      yv;
440   KokkosCsrMatrix            csrmat;
441 
442   PetscFunctionBegin;
443   PetscCall(PetscLogGpuTimeBegin());
444   PetscCall(MatSeqAIJKokkosSyncDevice(A));
445   PetscCall(VecGetKokkosView(xx, &xv));
446   PetscCall(VecGetKokkosViewWrite(yy, &yv));
447   if (A->form_explicit_transpose) {
448     PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat));
449     mode = "N";
450   } else {
451     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
452     csrmat = aijkok->csrmat;
453     mode   = "C";
454   }
455   PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^H x + beta y */
456   PetscCall(VecRestoreKokkosView(xx, &xv));
457   PetscCall(VecRestoreKokkosViewWrite(yy, &yv));
458   PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
459   PetscCall(PetscLogGpuTimeEnd());
460   PetscFunctionReturn(PETSC_SUCCESS);
461 }
462 
463 /* z = A x + y */
464 static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz)
465 {
466   Mat_SeqAIJKokkos          *aijkok;
467   ConstPetscScalarKokkosView xv;
468   PetscScalarKokkosView      zv;
469 
470   PetscFunctionBegin;
471   PetscCall(PetscLogGpuTimeBegin());
472   PetscCall(MatSeqAIJKokkosSyncDevice(A));
473   if (zz != yy) PetscCall(VecCopy(yy, zz)); // depending on yy's sync flags, zz might get its latest data on host
474   PetscCall(VecGetKokkosView(xx, &xv));
475   PetscCall(VecGetKokkosView(zz, &zv)); // do after VecCopy(yy, zz) to get the latest data on device
476   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
477   PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), "N", 1.0 /*alpha*/, aijkok->csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A x + beta z */
478   PetscCall(VecRestoreKokkosView(xx, &xv));
479   PetscCall(VecRestoreKokkosView(zz, &zv));
480   PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz()));
481   PetscCall(PetscLogGpuTimeEnd());
482   PetscFunctionReturn(PETSC_SUCCESS);
483 }
484 
485 /* z = A^T x + y */
486 static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz)
487 {
488   Mat_SeqAIJKokkos          *aijkok;
489   const char                *mode;
490   ConstPetscScalarKokkosView xv;
491   PetscScalarKokkosView      zv;
492   KokkosCsrMatrix            csrmat;
493 
494   PetscFunctionBegin;
495   PetscCall(PetscLogGpuTimeBegin());
496   PetscCall(MatSeqAIJKokkosSyncDevice(A));
497   if (zz != yy) PetscCall(VecCopy(yy, zz));
498   PetscCall(VecGetKokkosView(xx, &xv));
499   PetscCall(VecGetKokkosView(zz, &zv));
500   if (A->form_explicit_transpose) {
501     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat));
502     mode = "N";
503   } else {
504     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
505     csrmat = aijkok->csrmat;
506     mode   = "T";
507   }
508   PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^T x + beta z */
509   PetscCall(VecRestoreKokkosView(xx, &xv));
510   PetscCall(VecRestoreKokkosView(zz, &zv));
511   PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
512   PetscCall(PetscLogGpuTimeEnd());
513   PetscFunctionReturn(PETSC_SUCCESS);
514 }
515 
516 /* z = A^H x + y */
517 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz)
518 {
519   Mat_SeqAIJKokkos          *aijkok;
520   const char                *mode;
521   ConstPetscScalarKokkosView xv;
522   PetscScalarKokkosView      zv;
523   KokkosCsrMatrix            csrmat;
524 
525   PetscFunctionBegin;
526   PetscCall(PetscLogGpuTimeBegin());
527   PetscCall(MatSeqAIJKokkosSyncDevice(A));
528   if (zz != yy) PetscCall(VecCopy(yy, zz));
529   PetscCall(VecGetKokkosView(xx, &xv));
530   PetscCall(VecGetKokkosView(zz, &zv));
531   if (A->form_explicit_transpose) {
532     PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat));
533     mode = "N";
534   } else {
535     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
536     csrmat = aijkok->csrmat;
537     mode   = "C";
538   }
539   PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^H x + beta z */
540   PetscCall(VecRestoreKokkosView(xx, &xv));
541   PetscCall(VecRestoreKokkosView(zz, &zv));
542   PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
543   PetscCall(PetscLogGpuTimeEnd());
544   PetscFunctionReturn(PETSC_SUCCESS);
545 }
546 
547 static PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A, MatOption op, PetscBool flg)
548 {
549   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
550 
551   PetscFunctionBegin;
552   switch (op) {
553   case MAT_FORM_EXPLICIT_TRANSPOSE:
554     /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
555     if (A->form_explicit_transpose && !flg && aijkok) PetscCall(aijkok->DestroyMatTranspose());
556     A->form_explicit_transpose = flg;
557     break;
558   default:
559     PetscCall(MatSetOption_SeqAIJ(A, op, flg));
560     break;
561   }
562   PetscFunctionReturn(PETSC_SUCCESS);
563 }
564 
565 /* Depending on reuse, either build a new mat, or use the existing mat */
566 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat *newmat)
567 {
568   Mat_SeqAIJ *aseq;
569 
570   PetscFunctionBegin;
571   PetscCall(PetscKokkosInitializeCheck());
572   if (reuse == MAT_INITIAL_MATRIX) { /* Build a brand new mat */
573     PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat));
574     PetscCall(MatSetType(*newmat, mtype));
575   } else if (reuse == MAT_REUSE_MATRIX) {                 /* Reuse the mat created before */
576     PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN)); /* newmat is already a SeqAIJKokkos */
577   } else if (reuse == MAT_INPLACE_MATRIX) {               /* newmat is A */
578     PetscCheck(A == *newmat, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "A != *newmat with MAT_INPLACE_MATRIX");
579     PetscCall(PetscFree(A->defaultvectype));
580     PetscCall(PetscStrallocpy(VECKOKKOS, &A->defaultvectype)); /* Allocate and copy the string */
581     PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATSEQAIJKOKKOS));
582     PetscCall(MatSetOps_SeqAIJKokkos(A));
583     aseq = static_cast<Mat_SeqAIJ *>(A->data);
584     if (A->assembled) { /* Copy i, j (but not values) to device for an assembled matrix if not yet */
585       PetscCheck(!A->spptr, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Expect NULL (Mat_SeqAIJKokkos*)A->spptr");
586       A->spptr = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aseq, A->nonzerostate, PETSC_FALSE);
587     }
588   }
589   PetscFunctionReturn(PETSC_SUCCESS);
590 }
591 
592 /* MatDuplicate always creates a new matrix. MatDuplicate can be called either on an assembled matrix or
593    an unassembled matrix, even though MAT_COPY_VALUES is not allowed for unassembled matrix.
594  */
595 static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A, MatDuplicateOption dupOption, Mat *B)
596 {
597   Mat_SeqAIJ       *bseq;
598   Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr), *bkok;
599   Mat               mat;
600 
601   PetscFunctionBegin;
602   /* Do not copy values on host as A's latest values might be on device. We don't want to do sync blindly */
603   PetscCall(MatDuplicate_SeqAIJ(A, MAT_DO_NOT_COPY_VALUES, B));
604   mat = *B;
605   if (A->assembled) {
606     bseq = static_cast<Mat_SeqAIJ *>(mat->data);
607     bkok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, bseq, mat->nonzerostate, PETSC_FALSE);
608     bkok->a_dual.clear_sync_state(); /* Clear B's sync state as it will be decided below */
609     /* Now copy values to B if needed */
610     if (dupOption == MAT_COPY_VALUES) {
611       if (akok->a_dual.need_sync_device()) {
612         Kokkos::deep_copy(bkok->a_dual.view_host(), akok->a_dual.view_host());
613         bkok->a_dual.modify_host();
614       } else { /* If device has the latest data, we only copy data on device */
615         Kokkos::deep_copy(bkok->a_dual.view_device(), akok->a_dual.view_device());
616         bkok->a_dual.modify_device();
617       }
618     } else { /* MAT_DO_NOT_COPY_VALUES or MAT_SHARE_NONZERO_PATTERN. B's values should be zeroed */
619       /* B's values on host should be already zeroed by MatDuplicate_SeqAIJ() */
620       bkok->a_dual.modify_host();
621     }
622     mat->spptr = bkok;
623   }
624 
625   PetscCall(PetscFree(mat->defaultvectype));
626   PetscCall(PetscStrallocpy(VECKOKKOS, &mat->defaultvectype)); /* Allocate and copy the string */
627   PetscCall(PetscObjectChangeTypeName((PetscObject)mat, MATSEQAIJKOKKOS));
628   PetscCall(MatSetOps_SeqAIJKokkos(mat));
629   PetscFunctionReturn(PETSC_SUCCESS);
630 }
631 
632 static PetscErrorCode MatTranspose_SeqAIJKokkos(Mat A, MatReuse reuse, Mat *B)
633 {
634   Mat               At;
635   KokkosCsrMatrix   internT;
636   Mat_SeqAIJKokkos *atkok, *bkok;
637 
638   PetscFunctionBegin;
639   if (reuse == MAT_REUSE_MATRIX) PetscCall(MatTransposeCheckNonzeroState_Private(A, *B));
640   PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &internT)); /* Generate a transpose internally */
641   if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_INPLACE_MATRIX) {
642     /* Deep copy internT, as we want to isolate the internal transpose */
643     PetscCallCXX(atkok = new Mat_SeqAIJKokkos(KokkosCsrMatrix("csrmat", internT)));
644     PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PetscObjectComm((PetscObject)A), atkok, &At));
645     if (reuse == MAT_INITIAL_MATRIX) *B = At;
646     else PetscCall(MatHeaderReplace(A, &At)); /* Replace A with At inplace */
647   } else {                                    /* MAT_REUSE_MATRIX, just need to copy values to B on device */
648     if ((*B)->assembled) {
649       bkok = static_cast<Mat_SeqAIJKokkos *>((*B)->spptr);
650       PetscCallCXX(Kokkos::deep_copy(bkok->a_dual.view_device(), internT.values));
651       PetscCall(MatSeqAIJKokkosModifyDevice(*B));
652     } else if ((*B)->preallocated) { /* It is ok for B to be only preallocated, as needed in MatTranspose_MPIAIJ */
653       Mat_SeqAIJ             *bseq = static_cast<Mat_SeqAIJ *>((*B)->data);
654       MatScalarKokkosViewHost a_h(bseq->a, internT.nnz()); /* bseq->nz = 0 if unassembled */
655       MatColIdxKokkosViewHost j_h(bseq->j, internT.nnz());
656       PetscCallCXX(Kokkos::deep_copy(a_h, internT.values));
657       PetscCallCXX(Kokkos::deep_copy(j_h, internT.graph.entries));
658     } else SETERRQ(PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "B must be assembled or preallocated");
659   }
660   PetscFunctionReturn(PETSC_SUCCESS);
661 }
662 
663 static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A)
664 {
665   Mat_SeqAIJKokkos *aijkok;
666 
667   PetscFunctionBegin;
668   if (A->factortype == MAT_FACTOR_NONE) {
669     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
670     delete aijkok;
671   } else {
672     delete static_cast<Mat_SeqAIJKokkosTriFactors *>(A->spptr);
673   }
674   A->spptr = NULL;
675   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatFactorGetSolverType_C", NULL));
676   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
677   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
678 #if defined(PETSC_HAVE_HYPRE)
679   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijkokkos_hypre_C", NULL));
680 #endif
681   PetscCall(MatDestroy_SeqAIJ(A));
682   PetscFunctionReturn(PETSC_SUCCESS);
683 }
684 
685 /*MC
686    MATSEQAIJKOKKOS - MATAIJKOKKOS = "(seq)aijkokkos" - A matrix type to be used for sparse matrices with Kokkos
687 
688    A matrix type using Kokkos-Kernels CrsMatrix type for portability across different device types
689 
690    Options Database Key:
691 .  -mat_type aijkokkos - sets the matrix type to `MATSEQAIJKOKKOS` during a call to `MatSetFromOptions()`
692 
693   Level: beginner
694 
695 .seealso: [](ch_matrices), `Mat`, `MatCreateSeqAIJKokkos()`, `MATMPIAIJKOKKOS`
696 M*/
697 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A)
698 {
699   PetscFunctionBegin;
700   PetscCall(PetscKokkosInitializeCheck());
701   PetscCall(MatCreate_SeqAIJ(A));
702   PetscCall(MatConvert_SeqAIJ_SeqAIJKokkos(A, MATSEQAIJKOKKOS, MAT_INPLACE_MATRIX, &A));
703   PetscFunctionReturn(PETSC_SUCCESS);
704 }
705 
706 /* Merge A, B into a matrix C. A is put before B. C's size would be A->rmap->n by (A->cmap->n + B->cmap->n) */
707 PetscErrorCode MatSeqAIJKokkosMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C)
708 {
709   Mat_SeqAIJ         *a, *b;
710   Mat_SeqAIJKokkos   *akok, *bkok, *ckok;
711   MatScalarKokkosView aa, ba, ca;
712   MatRowMapKokkosView ai, bi, ci;
713   MatColIdxKokkosView aj, bj, cj;
714   PetscInt            m, n, nnz, aN;
715 
716   PetscFunctionBegin;
717   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
718   PetscValidHeaderSpecific(B, MAT_CLASSID, 2);
719   PetscAssertPointer(C, 4);
720   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
721   PetscCheckTypeName(B, MATSEQAIJKOKKOS);
722   PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n);
723   PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported");
724 
725   PetscCall(MatSeqAIJKokkosSyncDevice(A));
726   PetscCall(MatSeqAIJKokkosSyncDevice(B));
727   a    = static_cast<Mat_SeqAIJ *>(A->data);
728   b    = static_cast<Mat_SeqAIJ *>(B->data);
729   akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
730   bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
731   aa   = akok->a_dual.view_device();
732   ai   = akok->i_dual.view_device();
733   ba   = bkok->a_dual.view_device();
734   bi   = bkok->i_dual.view_device();
735   m    = A->rmap->n; /* M, N and nnz of C */
736   n    = A->cmap->n + B->cmap->n;
737   nnz  = a->nz + b->nz;
738   aN   = A->cmap->n; /* N of A */
739   if (reuse == MAT_INITIAL_MATRIX) {
740     aj           = akok->j_dual.view_device();
741     bj           = bkok->j_dual.view_device();
742     auto ca_dual = MatScalarKokkosDualView("a", aa.extent(0) + ba.extent(0));
743     auto ci_dual = MatRowMapKokkosDualView("i", ai.extent(0));
744     auto cj_dual = MatColIdxKokkosDualView("j", aj.extent(0) + bj.extent(0));
745     ca           = ca_dual.view_device();
746     ci           = ci_dual.view_device();
747     cj           = cj_dual.view_device();
748 
749     /* Concatenate A and B in parallel using Kokkos hierarchical parallelism */
750     Kokkos::parallel_for(
751       Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
752         PetscInt i       = t.league_rank(); /* row i */
753         PetscInt coffset = ai(i) + bi(i), alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i);
754 
755         Kokkos::single(Kokkos::PerTeam(t), [=]() { /* this side effect only happens once per whole team */
756                                                    ci(i) = coffset;
757                                                    if (i == m - 1) ci(m) = ai(m) + bi(m);
758         });
759 
760         Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) {
761           if (k < alen) {
762             ca(coffset + k) = aa(ai(i) + k);
763             cj(coffset + k) = aj(ai(i) + k);
764           } else {
765             ca(coffset + k) = ba(bi(i) + k - alen);
766             cj(coffset + k) = bj(bi(i) + k - alen) + aN; /* Entries in B get new column indices in C */
767           }
768         });
769       });
770     ca_dual.modify_device();
771     ci_dual.modify_device();
772     cj_dual.modify_device();
773     PetscCallCXX(ckok = new Mat_SeqAIJKokkos(m, n, nnz, ci_dual, cj_dual, ca_dual));
774     PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, ckok, C));
775   } else if (reuse == MAT_REUSE_MATRIX) {
776     PetscValidHeaderSpecific(*C, MAT_CLASSID, 4);
777     PetscCheckTypeName(*C, MATSEQAIJKOKKOS);
778     ckok = static_cast<Mat_SeqAIJKokkos *>((*C)->spptr);
779     ca   = ckok->a_dual.view_device();
780     ci   = ckok->i_dual.view_device();
781 
782     Kokkos::parallel_for(
783       Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
784         PetscInt i    = t.league_rank(); /* row i */
785         PetscInt alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i);
786         Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) {
787           if (k < alen) ca(ci(i) + k) = aa(ai(i) + k);
788           else ca(ci(i) + k) = ba(bi(i) + k - alen);
789         });
790       });
791     PetscCall(MatSeqAIJKokkosModifyDevice(*C));
792   }
793   PetscFunctionReturn(PETSC_SUCCESS);
794 }
795 
796 static PetscErrorCode MatProductDataDestroy_SeqAIJKokkos(void *pdata)
797 {
798   PetscFunctionBegin;
799   delete static_cast<MatProductData_SeqAIJKokkos *>(pdata);
800   PetscFunctionReturn(PETSC_SUCCESS);
801 }
802 
803 static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C)
804 {
805   Mat_Product                 *product = C->product;
806   Mat                          A, B;
807   bool                         transA, transB; /* use bool, since KK needs this type */
808   Mat_SeqAIJKokkos            *akok, *bkok, *ckok;
809   Mat_SeqAIJ                  *c;
810   MatProductData_SeqAIJKokkos *pdata;
811   KokkosCsrMatrix              csrmatA, csrmatB;
812 
813   PetscFunctionBegin;
814   MatCheckProduct(C, 1);
815   PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Product data empty");
816   pdata = static_cast<MatProductData_SeqAIJKokkos *>(C->product->data);
817 
818   // See if numeric has already been done in symbolic (e.g., user calls MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C)).
819   // If yes, skip the numeric, but reset the flag so that next time when user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C),
820   // we still do numeric.
821   if (pdata->reusesym) { // numeric reuses results from symbolic
822     pdata->reusesym = PETSC_FALSE;
823     PetscFunctionReturn(PETSC_SUCCESS);
824   }
825 
826   switch (product->type) {
827   case MATPRODUCT_AB:
828     transA = false;
829     transB = false;
830     break;
831   case MATPRODUCT_AtB:
832     transA = true;
833     transB = false;
834     break;
835   case MATPRODUCT_ABt:
836     transA = false;
837     transB = true;
838     break;
839   default:
840     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]);
841   }
842 
843   A = product->A;
844   B = product->B;
845   PetscCall(MatSeqAIJKokkosSyncDevice(A));
846   PetscCall(MatSeqAIJKokkosSyncDevice(B));
847   akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
848   bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
849   ckok = static_cast<Mat_SeqAIJKokkos *>(C->spptr);
850 
851   PetscCheck(ckok, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Device data structure spptr is empty");
852 
853   csrmatA = akok->csrmat;
854   csrmatB = bkok->csrmat;
855 
856   /* TODO: Once KK spgemm implements transpose, we can get rid of the explicit transpose here */
857   if (transA) {
858     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA));
859     transA = false;
860   }
861 
862   if (transB) {
863     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB));
864     transB = false;
865   }
866   PetscCall(PetscLogGpuTimeBegin());
867   PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, csrmatA, transA, csrmatB, transB, ckok->csrmat));
868 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(4, 0, 0)
869   auto spgemmHandle = pdata->kh.get_spgemm_handle();
870   if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(ckok->csrmat)); /* without sort, mat_tests-ex62_14_seqaijkokkos fails */
871 #endif
872 
873   PetscCall(PetscLogGpuTimeEnd());
874   PetscCall(MatSeqAIJKokkosModifyDevice(C));
875   /* shorter version of MatAssemblyEnd_SeqAIJ */
876   c = (Mat_SeqAIJ *)C->data;
877   PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz));
878   PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n"));
879   PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax));
880   c->reallocs         = 0;
881   C->info.mallocs     = 0;
882   C->info.nz_unneeded = 0;
883   C->assembled = C->was_assembled = PETSC_TRUE;
884   C->num_ass++;
885   PetscFunctionReturn(PETSC_SUCCESS);
886 }
887 
888 static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C)
889 {
890   Mat_Product                 *product = C->product;
891   MatProductType               ptype;
892   Mat                          A, B;
893   bool                         transA, transB;
894   Mat_SeqAIJKokkos            *akok, *bkok, *ckok;
895   MatProductData_SeqAIJKokkos *pdata;
896   MPI_Comm                     comm;
897   KokkosCsrMatrix              csrmatA, csrmatB, csrmatC;
898 
899   PetscFunctionBegin;
900   MatCheckProduct(C, 1);
901   PetscCall(PetscObjectGetComm((PetscObject)C, &comm));
902   PetscCheck(!product->data, comm, PETSC_ERR_PLIB, "Product data not empty");
903   A = product->A;
904   B = product->B;
905   PetscCall(MatSeqAIJKokkosSyncDevice(A));
906   PetscCall(MatSeqAIJKokkosSyncDevice(B));
907   akok    = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
908   bkok    = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
909   csrmatA = akok->csrmat;
910   csrmatB = bkok->csrmat;
911 
912   ptype = product->type;
913   // Take advantage of the symmetry if true
914   if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
915     ptype                                          = MATPRODUCT_AB;
916     product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE;
917   }
918   if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
919     ptype                                          = MATPRODUCT_AB;
920     product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE;
921   }
922 
923   switch (ptype) {
924   case MATPRODUCT_AB:
925     transA = false;
926     transB = false;
927     PetscCall(MatSetBlockSizesFromMats(C, A, B));
928     break;
929   case MATPRODUCT_AtB:
930     transA = true;
931     transB = false;
932     if (A->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, A->cmap->bs));
933     if (B->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->cmap->bs));
934     break;
935   case MATPRODUCT_ABt:
936     transA = false;
937     transB = true;
938     if (A->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, A->rmap->bs));
939     if (B->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->rmap->bs));
940     break;
941   default:
942     SETERRQ(comm, PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]);
943   }
944   PetscCallCXX(product->data = pdata = new MatProductData_SeqAIJKokkos());
945   pdata->reusesym = product->api_user;
946 
947   /* TODO: add command line options to select spgemm algorithms */
948   auto spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_DEFAULT; /* default alg is TPL if enabled, otherwise KK */
949 
950   /* CUDA-10.2's spgemm has bugs. We prefer the SpGEMMreuse APIs introduced in cuda-11.4 */
951 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
952   #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
953   spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_KK;
954   #endif
955 #endif
956   PetscCallCXX(pdata->kh.create_spgemm_handle(spgemm_alg));
957 
958   PetscCall(PetscLogGpuTimeBegin());
959   /* TODO: Get rid of the explicit transpose once KK-spgemm implements the transpose option */
960   if (transA) {
961     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA));
962     transA = false;
963   }
964 
965   if (transB) {
966     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB));
967     transB = false;
968   }
969 
970   PetscCallCXX(KokkosSparse::spgemm_symbolic(pdata->kh, csrmatA, transA, csrmatB, transB, csrmatC));
971   /* spgemm_symbolic() only populates C's rowmap, but not C's column indices.
972     So we have to do a fake spgemm_numeric() here to get csrmatC.j_d setup, before
973     calling new Mat_SeqAIJKokkos().
974     TODO: Remove the fake spgemm_numeric() after KK fixed this problem.
975   */
976   PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, csrmatA, transA, csrmatB, transB, csrmatC));
977 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(4, 0, 0)
978   /* Query if KK outputs a sorted matrix. If not, we need to sort it */
979   auto spgemmHandle = pdata->kh.get_spgemm_handle();
980   if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(csrmatC)); /* sort_option defaults to -1 in KK!*/
981 #endif
982   PetscCall(PetscLogGpuTimeEnd());
983 
984   PetscCallCXX(ckok = new Mat_SeqAIJKokkos(csrmatC));
985   PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(C, ckok));
986   C->product->destroy = MatProductDataDestroy_SeqAIJKokkos;
987   PetscFunctionReturn(PETSC_SUCCESS);
988 }
989 
990 /* handles sparse matrix matrix ops */
991 static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat)
992 {
993   Mat_Product *product = mat->product;
994   PetscBool    Biskok = PETSC_FALSE, Ciskok = PETSC_TRUE;
995 
996   PetscFunctionBegin;
997   MatCheckProduct(mat, 1);
998   PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJKOKKOS, &Biskok));
999   if (product->type == MATPRODUCT_ABC) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJKOKKOS, &Ciskok));
1000   if (Biskok && Ciskok) {
1001     switch (product->type) {
1002     case MATPRODUCT_AB:
1003     case MATPRODUCT_AtB:
1004     case MATPRODUCT_ABt:
1005       mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos;
1006       break;
1007     case MATPRODUCT_PtAP:
1008     case MATPRODUCT_RARt:
1009     case MATPRODUCT_ABC:
1010       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
1011       break;
1012     default:
1013       break;
1014     }
1015   } else { /* fallback for AIJ */
1016     PetscCall(MatProductSetFromOptions_SeqAIJ(mat));
1017   }
1018   PetscFunctionReturn(PETSC_SUCCESS);
1019 }
1020 
1021 static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a)
1022 {
1023   Mat_SeqAIJKokkos *aijkok;
1024 
1025   PetscFunctionBegin;
1026   PetscCall(PetscLogGpuTimeBegin());
1027   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1028   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1029   KokkosBlas::scal(PetscGetKokkosExecutionSpace(), aijkok->a_dual.view_device(), a, aijkok->a_dual.view_device());
1030   PetscCall(MatSeqAIJKokkosModifyDevice(A));
1031   PetscCall(PetscLogGpuFlops(aijkok->a_dual.extent(0)));
1032   PetscCall(PetscLogGpuTimeEnd());
1033   PetscFunctionReturn(PETSC_SUCCESS);
1034 }
1035 
1036 // add a to A's diagonal (if A is square) or main diagonal (if A is rectangular)
1037 static PetscErrorCode MatShift_SeqAIJKokkos(Mat A, PetscScalar a)
1038 {
1039   Mat_SeqAIJ *aijseq = static_cast<Mat_SeqAIJ *>(A->data);
1040 
1041   PetscFunctionBegin;
1042   if (A->assembled && aijseq->diagonaldense) { // no missing diagonals
1043     PetscInt n = PetscMin(A->rmap->n, A->cmap->n);
1044 
1045     PetscCall(PetscLogGpuTimeBegin());
1046     PetscCall(MatSeqAIJKokkosSyncDevice(A));
1047     const auto  aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1048     const auto &Aa     = aijkok->a_dual.view_device();
1049     const auto &Adiag  = aijkok->diag_dual.view_device();
1050     PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) { Aa(Adiag(i)) += a; }));
1051     PetscCall(MatSeqAIJKokkosModifyDevice(A));
1052     PetscCall(PetscLogGpuFlops(n));
1053     PetscCall(PetscLogGpuTimeEnd());
1054   } else { // need reassembly, very slow!
1055     PetscCall(MatShift_Basic(A, a));
1056   }
1057   PetscFunctionReturn(PETSC_SUCCESS);
1058 }
1059 
1060 static PetscErrorCode MatDiagonalSet_SeqAIJKokkos(Mat Y, Vec D, InsertMode is)
1061 {
1062   Mat_SeqAIJ *aijseq = static_cast<Mat_SeqAIJ *>(Y->data);
1063 
1064   PetscFunctionBegin;
1065   if (Y->assembled && aijseq->diagonaldense) { // no missing diagonals
1066     ConstPetscScalarKokkosView dv;
1067     PetscInt                   n, nv;
1068 
1069     PetscCall(PetscLogGpuTimeBegin());
1070     PetscCall(MatSeqAIJKokkosSyncDevice(Y));
1071     PetscCall(VecGetKokkosView(D, &dv));
1072     PetscCall(VecGetLocalSize(D, &nv));
1073     n = PetscMin(Y->rmap->n, Y->cmap->n);
1074     PetscCheck(n == nv, PetscObjectComm((PetscObject)Y), PETSC_ERR_ARG_SIZ, "Matrix size and vector size do not match");
1075 
1076     const auto  aijkok = static_cast<Mat_SeqAIJKokkos *>(Y->spptr);
1077     const auto &Aa     = aijkok->a_dual.view_device();
1078     const auto &Adiag  = aijkok->diag_dual.view_device();
1079     PetscCallCXX(Kokkos::parallel_for(
1080       Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) {
1081         if (is == INSERT_VALUES) Aa(Adiag(i)) = dv(i);
1082         else Aa(Adiag(i)) += dv(i);
1083       }));
1084     PetscCall(VecRestoreKokkosView(D, &dv));
1085     PetscCall(MatSeqAIJKokkosModifyDevice(Y));
1086     PetscCall(PetscLogGpuFlops(n));
1087     PetscCall(PetscLogGpuTimeEnd());
1088   } else { // need reassembly, very slow!
1089     PetscCall(MatDiagonalSet_Default(Y, D, is));
1090   }
1091   PetscFunctionReturn(PETSC_SUCCESS);
1092 }
1093 
1094 static PetscErrorCode MatDiagonalScale_SeqAIJKokkos(Mat A, Vec ll, Vec rr)
1095 {
1096   Mat_SeqAIJ                *aijseq = static_cast<Mat_SeqAIJ *>(A->data);
1097   PetscInt                   m = A->rmap->n, n = A->cmap->n, nz = aijseq->nz;
1098   ConstPetscScalarKokkosView lv, rv;
1099 
1100   PetscFunctionBegin;
1101   PetscCall(PetscLogGpuTimeBegin());
1102   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1103   const auto  aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1104   const auto &Aa     = aijkok->a_dual.view_device();
1105   const auto &Ai     = aijkok->i_dual.view_device();
1106   const auto &Aj     = aijkok->j_dual.view_device();
1107   if (ll) {
1108     PetscCall(VecGetLocalSize(ll, &m));
1109     PetscCheck(m == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling vector wrong length");
1110     PetscCall(VecGetKokkosView(ll, &lv));
1111     PetscCallCXX(Kokkos::parallel_for( // for each row
1112       Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
1113         PetscInt i   = t.league_rank(); // row i
1114         PetscInt len = Ai(i + 1) - Ai(i);
1115         // scale entries on the row
1116         Kokkos::parallel_for(Kokkos::TeamThreadRange(t, len), [&](PetscInt j) { Aa(Ai(i) + j) *= lv(i); });
1117       }));
1118     PetscCall(VecRestoreKokkosView(ll, &lv));
1119     PetscCall(PetscLogGpuFlops(nz));
1120   }
1121   if (rr) {
1122     PetscCall(VecGetLocalSize(rr, &n));
1123     PetscCheck(n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Right scaling vector wrong length");
1124     PetscCall(VecGetKokkosView(rr, &rv));
1125     PetscCallCXX(Kokkos::parallel_for( // for each nonzero
1126       Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt k) { Aa(k) *= rv(Aj(k)); }));
1127     PetscCall(VecRestoreKokkosView(rr, &lv));
1128     PetscCall(PetscLogGpuFlops(nz));
1129   }
1130   PetscCall(MatSeqAIJKokkosModifyDevice(A));
1131   PetscCall(PetscLogGpuTimeEnd());
1132   PetscFunctionReturn(PETSC_SUCCESS);
1133 }
1134 
1135 static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A)
1136 {
1137   Mat_SeqAIJKokkos *aijkok;
1138 
1139   PetscFunctionBegin;
1140   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1141   if (aijkok) { /* Only zero the device if data is already there */
1142     KokkosBlas::fill(PetscGetKokkosExecutionSpace(), aijkok->a_dual.view_device(), 0.0);
1143     PetscCall(MatSeqAIJKokkosModifyDevice(A));
1144   } else { /* Might be preallocated but not assembled */
1145     PetscCall(MatZeroEntries_SeqAIJ(A));
1146   }
1147   PetscFunctionReturn(PETSC_SUCCESS);
1148 }
1149 
1150 static PetscErrorCode MatGetDiagonal_SeqAIJKokkos(Mat A, Vec x)
1151 {
1152   Mat_SeqAIJKokkos     *aijkok;
1153   PetscInt              n;
1154   PetscScalarKokkosView xv;
1155 
1156   PetscFunctionBegin;
1157   PetscCall(VecGetLocalSize(x, &n));
1158   PetscCheck(n == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector");
1159   PetscCheck(A->factortype == MAT_FACTOR_NONE, PETSC_COMM_SELF, PETSC_ERR_SUP, "MatGetDiagonal_SeqAIJKokkos not supported on factored matrices");
1160 
1161   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1162   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1163 
1164   const auto &Aa    = aijkok->a_dual.view_device();
1165   const auto &Ai    = aijkok->i_dual.view_device();
1166   const auto &Adiag = aijkok->diag_dual.view_device();
1167 
1168   PetscCall(VecGetKokkosViewWrite(x, &xv));
1169   Kokkos::parallel_for(
1170     Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) {
1171       if (Adiag(i) < Ai(i + 1)) xv(i) = Aa(Adiag(i));
1172       else xv(i) = 0;
1173     });
1174   PetscCall(VecRestoreKokkosViewWrite(x, &xv));
1175   PetscFunctionReturn(PETSC_SUCCESS);
1176 }
1177 
1178 /* Get a Kokkos View from a mat of type MatSeqAIJKokkos */
1179 PetscErrorCode MatSeqAIJGetKokkosView(Mat A, ConstMatScalarKokkosView *kv)
1180 {
1181   Mat_SeqAIJKokkos *aijkok;
1182 
1183   PetscFunctionBegin;
1184   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1185   PetscAssertPointer(kv, 2);
1186   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1187   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1188   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1189   *kv    = aijkok->a_dual.view_device();
1190   PetscFunctionReturn(PETSC_SUCCESS);
1191 }
1192 
1193 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, ConstMatScalarKokkosView *kv)
1194 {
1195   PetscFunctionBegin;
1196   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1197   PetscAssertPointer(kv, 2);
1198   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1199   PetscFunctionReturn(PETSC_SUCCESS);
1200 }
1201 
1202 PetscErrorCode MatSeqAIJGetKokkosView(Mat A, MatScalarKokkosView *kv)
1203 {
1204   Mat_SeqAIJKokkos *aijkok;
1205 
1206   PetscFunctionBegin;
1207   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1208   PetscAssertPointer(kv, 2);
1209   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1210   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1211   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1212   *kv    = aijkok->a_dual.view_device();
1213   PetscFunctionReturn(PETSC_SUCCESS);
1214 }
1215 
1216 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, MatScalarKokkosView *kv)
1217 {
1218   PetscFunctionBegin;
1219   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1220   PetscAssertPointer(kv, 2);
1221   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1222   PetscCall(MatSeqAIJKokkosModifyDevice(A));
1223   PetscFunctionReturn(PETSC_SUCCESS);
1224 }
1225 
1226 PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A, MatScalarKokkosView *kv)
1227 {
1228   Mat_SeqAIJKokkos *aijkok;
1229 
1230   PetscFunctionBegin;
1231   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1232   PetscAssertPointer(kv, 2);
1233   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1234   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1235   *kv    = aijkok->a_dual.view_device();
1236   PetscFunctionReturn(PETSC_SUCCESS);
1237 }
1238 
1239 PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A, MatScalarKokkosView *kv)
1240 {
1241   PetscFunctionBegin;
1242   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1243   PetscAssertPointer(kv, 2);
1244   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1245   PetscCall(MatSeqAIJKokkosModifyDevice(A));
1246   PetscFunctionReturn(PETSC_SUCCESS);
1247 }
1248 
1249 PetscErrorCode MatCreateSeqAIJKokkosWithKokkosViews(MPI_Comm comm, PetscInt m, PetscInt n, Kokkos::View<PetscInt *> &i_d, Kokkos::View<PetscInt *> &j_d, Kokkos::View<PetscScalar *> &a_d, Mat *A)
1250 {
1251   Mat_SeqAIJKokkos *akok;
1252 
1253   PetscFunctionBegin;
1254   // Create host copies of the input aij
1255   auto i_h = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), i_d);
1256   auto j_h = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), j_d);
1257   // Don't copy the vals to the host now
1258   auto a_h = Kokkos::create_mirror_view(HostMirrorMemorySpace(), a_d);
1259 
1260   MatScalarKokkosDualView a_dual = MatScalarKokkosDualView(a_d, a_h);
1261   // Note we have modified device data so it will copy lazily
1262   a_dual.modify_device();
1263   MatRowMapKokkosDualView i_dual = MatRowMapKokkosDualView(i_d, i_h);
1264   MatColIdxKokkosDualView j_dual = MatColIdxKokkosDualView(j_d, j_h);
1265 
1266   PetscCallCXX(akok = new Mat_SeqAIJKokkos(m, n, j_dual.extent(0), i_dual, j_dual, a_dual));
1267   PetscCall(MatCreate(comm, A));
1268   PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok));
1269   PetscFunctionReturn(PETSC_SUCCESS);
1270 }
1271 
1272 /* Computes Y += alpha X */
1273 static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y, PetscScalar alpha, Mat X, MatStructure pattern)
1274 {
1275   Mat_SeqAIJ              *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data;
1276   Mat_SeqAIJKokkos        *xkok, *ykok, *zkok;
1277   ConstMatScalarKokkosView Xa;
1278   MatScalarKokkosView      Ya;
1279   auto                     exec = PetscGetKokkosExecutionSpace();
1280 
1281   PetscFunctionBegin;
1282   PetscCheckTypeName(Y, MATSEQAIJKOKKOS);
1283   PetscCheckTypeName(X, MATSEQAIJKOKKOS);
1284   PetscCall(MatSeqAIJKokkosSyncDevice(Y));
1285   PetscCall(MatSeqAIJKokkosSyncDevice(X));
1286   PetscCall(PetscLogGpuTimeBegin());
1287 
1288   if (pattern != SAME_NONZERO_PATTERN && x->nz == y->nz) {
1289     PetscBool e;
1290     PetscCall(PetscArraycmp(x->i, y->i, Y->rmap->n + 1, &e));
1291     if (e) {
1292       PetscCall(PetscArraycmp(x->j, y->j, y->nz, &e));
1293       if (e) pattern = SAME_NONZERO_PATTERN;
1294     }
1295   }
1296 
1297   /* cusparseDcsrgeam2() computes C = alpha A + beta B. If one knew sparsity pattern of C, one can skip
1298     cusparseScsrgeam2_bufferSizeExt() / cusparseXcsrgeam2Nnz(), and directly call cusparseScsrgeam2().
1299     If X is SUBSET_NONZERO_PATTERN of Y, we could take advantage of this cusparse feature. However,
1300     KokkosSparse::spadd(alpha,A,beta,B,C) has symbolic and numeric phases, MatAXPY does not.
1301   */
1302   ykok = static_cast<Mat_SeqAIJKokkos *>(Y->spptr);
1303   xkok = static_cast<Mat_SeqAIJKokkos *>(X->spptr);
1304   Xa   = xkok->a_dual.view_device();
1305   Ya   = ykok->a_dual.view_device();
1306 
1307   if (pattern == SAME_NONZERO_PATTERN) {
1308     KokkosBlas::axpy(exec, alpha, Xa, Ya);
1309     PetscCall(MatSeqAIJKokkosModifyDevice(Y));
1310   } else if (pattern == SUBSET_NONZERO_PATTERN) {
1311     MatRowMapKokkosView Xi = xkok->i_dual.view_device(), Yi = ykok->i_dual.view_device();
1312     MatColIdxKokkosView Xj = xkok->j_dual.view_device(), Yj = ykok->j_dual.view_device();
1313 
1314     Kokkos::parallel_for(
1315       Kokkos::TeamPolicy<>(exec, Y->rmap->n, 1), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
1316         PetscInt i = t.league_rank(); // row i
1317         Kokkos::single(Kokkos::PerTeam(t), [=]() {
1318           // Only one thread works in a team
1319           PetscInt p, q = Yi(i);
1320           for (p = Xi(i); p < Xi(i + 1); p++) {          // For each nonzero on row i of X,
1321             while (Xj(p) != Yj(q) && q < Yi(i + 1)) q++; // find the matching nonzero on row i of Y.
1322             if (Xj(p) == Yj(q)) {                        // Found it
1323               Ya(q) += alpha * Xa(p);
1324               q++;
1325             } else {
1326             // If not found, it indicates the input is wrong (X is not a SUBSET_NONZERO_PATTERN of Y).
1327             // Just insert a NaN at the beginning of row i if it is not empty, to make the result wrong.
1328 #if PETSC_PKG_KOKKOS_VERSION_GE(3, 7, 0)
1329               if (Yi(i) != Yi(i + 1)) Ya(Yi(i)) = Kokkos::ArithTraits<PetscScalar>::nan();
1330 #else
1331               if (Yi(i) != Yi(i + 1)) Ya(Yi(i)) = Kokkos::Experimental::nan("1");
1332 #endif
1333             }
1334           }
1335         });
1336       });
1337     PetscCall(MatSeqAIJKokkosModifyDevice(Y));
1338   } else { // different nonzero patterns
1339     Mat             Z;
1340     KokkosCsrMatrix zcsr;
1341     KernelHandle    kh;
1342     kh.create_spadd_handle(true); // X, Y are sorted
1343     KokkosSparse::spadd_symbolic(&kh, xkok->csrmat, ykok->csrmat, zcsr);
1344     KokkosSparse::spadd_numeric(&kh, alpha, xkok->csrmat, (PetscScalar)1.0, ykok->csrmat, zcsr);
1345     zkok = new Mat_SeqAIJKokkos(zcsr);
1346     PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, zkok, &Z));
1347     PetscCall(MatHeaderReplace(Y, &Z));
1348     kh.destroy_spadd_handle();
1349   }
1350   PetscCall(PetscLogGpuTimeEnd());
1351   PetscCall(PetscLogGpuFlops(xkok->a_dual.extent(0) * 2)); // Because we scaled X and then added it to Y
1352   PetscFunctionReturn(PETSC_SUCCESS);
1353 }
1354 
1355 struct MatCOOStruct_SeqAIJKokkos {
1356   PetscCount           n;
1357   PetscCount           Atot;
1358   PetscInt             nz;
1359   PetscCountKokkosView jmap;
1360   PetscCountKokkosView perm;
1361 
1362   MatCOOStruct_SeqAIJKokkos(const MatCOOStruct_SeqAIJ *coo_h)
1363   {
1364     nz   = coo_h->nz;
1365     n    = coo_h->n;
1366     Atot = coo_h->Atot;
1367     jmap = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), PetscCountKokkosViewHost(coo_h->jmap, nz + 1));
1368     perm = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), PetscCountKokkosViewHost(coo_h->perm, Atot));
1369   }
1370 };
1371 
1372 static PetscErrorCode MatCOOStructDestroy_SeqAIJKokkos(void **data)
1373 {
1374   PetscFunctionBegin;
1375   PetscCallCXX(delete static_cast<MatCOOStruct_SeqAIJKokkos *>(*data));
1376   PetscFunctionReturn(PETSC_SUCCESS);
1377 }
1378 
1379 static PetscErrorCode MatSetPreallocationCOO_SeqAIJKokkos(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
1380 {
1381   Mat_SeqAIJKokkos          *akok;
1382   Mat_SeqAIJ                *aseq;
1383   PetscContainer             container_h;
1384   MatCOOStruct_SeqAIJ       *coo_h;
1385   MatCOOStruct_SeqAIJKokkos *coo_d;
1386 
1387   PetscFunctionBegin;
1388   PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, coo_i, coo_j));
1389   aseq = static_cast<Mat_SeqAIJ *>(mat->data);
1390   akok = static_cast<Mat_SeqAIJKokkos *>(mat->spptr);
1391   delete akok;
1392   mat->spptr = akok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, aseq, mat->nonzerostate + 1, PETSC_FALSE);
1393   PetscCall(MatZeroEntries_SeqAIJKokkos(mat));
1394 
1395   // Copy the COO struct to device
1396   PetscCall(PetscObjectQuery((PetscObject)mat, "__PETSc_MatCOOStruct_Host", (PetscObject *)&container_h));
1397   PetscCall(PetscContainerGetPointer(container_h, (void **)&coo_h));
1398   PetscCallCXX(coo_d = new MatCOOStruct_SeqAIJKokkos(coo_h));
1399 
1400   // Put the COO struct in a container and then attach that to the matrix
1401   PetscCall(PetscObjectContainerCompose((PetscObject)mat, "__PETSc_MatCOOStruct_Device", coo_d, MatCOOStructDestroy_SeqAIJKokkos));
1402   PetscFunctionReturn(PETSC_SUCCESS);
1403 }
1404 
1405 static PetscErrorCode MatSetValuesCOO_SeqAIJKokkos(Mat A, const PetscScalar v[], InsertMode imode)
1406 {
1407   MatScalarKokkosView        Aa;
1408   ConstMatScalarKokkosView   kv;
1409   PetscMemType               memtype;
1410   PetscContainer             container;
1411   MatCOOStruct_SeqAIJKokkos *coo;
1412 
1413   PetscFunctionBegin;
1414   PetscCall(PetscObjectQuery((PetscObject)A, "__PETSc_MatCOOStruct_Device", (PetscObject *)&container));
1415   PetscCall(PetscContainerGetPointer(container, (void **)&coo));
1416 
1417   const auto &n    = coo->n;
1418   const auto &Annz = coo->nz;
1419   const auto &jmap = coo->jmap;
1420   const auto &perm = coo->perm;
1421 
1422   PetscCall(PetscGetMemType(v, &memtype));
1423   if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */
1424     kv = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), ConstMatScalarKokkosViewHost(v, n));
1425   } else {
1426     kv = ConstMatScalarKokkosView(v, n); /* Directly use v[]'s memory */
1427   }
1428 
1429   if (imode == INSERT_VALUES) PetscCall(MatSeqAIJGetKokkosViewWrite(A, &Aa)); /* write matrix values */
1430   else PetscCall(MatSeqAIJGetKokkosView(A, &Aa));                             /* read & write matrix values */
1431 
1432   PetscCall(PetscLogGpuTimeBegin());
1433   Kokkos::parallel_for(
1434     Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, Annz), KOKKOS_LAMBDA(const PetscCount i) {
1435       PetscScalar sum = 0.0;
1436       for (PetscCount k = jmap(i); k < jmap(i + 1); k++) sum += kv(perm(k));
1437       Aa(i) = (imode == INSERT_VALUES ? 0.0 : Aa(i)) + sum;
1438     });
1439   PetscCall(PetscLogGpuTimeEnd());
1440 
1441   if (imode == INSERT_VALUES) PetscCall(MatSeqAIJRestoreKokkosViewWrite(A, &Aa));
1442   else PetscCall(MatSeqAIJRestoreKokkosView(A, &Aa));
1443   PetscFunctionReturn(PETSC_SUCCESS);
1444 }
1445 
1446 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A)
1447 {
1448   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
1449 
1450   PetscFunctionBegin;
1451   A->offloadmask = PETSC_OFFLOAD_KOKKOS; /* We do not really use this flag */
1452   A->boundtocpu  = PETSC_FALSE;
1453 
1454   A->ops->assemblyend               = MatAssemblyEnd_SeqAIJKokkos;
1455   A->ops->destroy                   = MatDestroy_SeqAIJKokkos;
1456   A->ops->duplicate                 = MatDuplicate_SeqAIJKokkos;
1457   A->ops->axpy                      = MatAXPY_SeqAIJKokkos;
1458   A->ops->scale                     = MatScale_SeqAIJKokkos;
1459   A->ops->zeroentries               = MatZeroEntries_SeqAIJKokkos;
1460   A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJKokkos;
1461   A->ops->mult                      = MatMult_SeqAIJKokkos;
1462   A->ops->multadd                   = MatMultAdd_SeqAIJKokkos;
1463   A->ops->multtranspose             = MatMultTranspose_SeqAIJKokkos;
1464   A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJKokkos;
1465   A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJKokkos;
1466   A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos;
1467   A->ops->productnumeric            = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos;
1468   A->ops->transpose                 = MatTranspose_SeqAIJKokkos;
1469   A->ops->setoption                 = MatSetOption_SeqAIJKokkos;
1470   A->ops->getdiagonal               = MatGetDiagonal_SeqAIJKokkos;
1471   A->ops->shift                     = MatShift_SeqAIJKokkos;
1472   A->ops->diagonalset               = MatDiagonalSet_SeqAIJKokkos;
1473   A->ops->diagonalscale             = MatDiagonalScale_SeqAIJKokkos;
1474   a->ops->getarray                  = MatSeqAIJGetArray_SeqAIJKokkos;
1475   a->ops->restorearray              = MatSeqAIJRestoreArray_SeqAIJKokkos;
1476   a->ops->getarrayread              = MatSeqAIJGetArrayRead_SeqAIJKokkos;
1477   a->ops->restorearrayread          = MatSeqAIJRestoreArrayRead_SeqAIJKokkos;
1478   a->ops->getarraywrite             = MatSeqAIJGetArrayWrite_SeqAIJKokkos;
1479   a->ops->restorearraywrite         = MatSeqAIJRestoreArrayWrite_SeqAIJKokkos;
1480   a->ops->getcsrandmemtype          = MatSeqAIJGetCSRAndMemType_SeqAIJKokkos;
1481 
1482   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJKokkos));
1483   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJKokkos));
1484 #if defined(PETSC_HAVE_HYPRE)
1485   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijkokkos_hypre_C", MatConvert_AIJ_HYPRE));
1486 #endif
1487   PetscFunctionReturn(PETSC_SUCCESS);
1488 }
1489 
1490 /*
1491    Extract the (prescribled) diagonal blocks of the matrix and then invert them
1492 
1493   Input Parameters:
1494 +  A       - the MATSEQAIJKOKKOS matrix
1495 .  bs      - block sizes in 'csr' format, i.e., the i-th block has size bs(i+1) - bs(i)
1496 .  bs2     - square of block sizes in 'csr' format, i.e., the i-th block should be stored at offset bs2(i) in diagVal[]
1497 .  blkMap  - map row ids to block ids, i.e., row i belongs to the block blkMap(i)
1498 -  work    - a pre-allocated work buffer (as big as diagVal) for use by this routine
1499 
1500   Output Parameter:
1501 .  diagVal - the (pre-allocated) buffer to store the inverted blocks (each block is stored in column-major order)
1502 */
1503 PETSC_INTERN PetscErrorCode MatInvertVariableBlockDiagonal_SeqAIJKokkos(Mat A, const PetscIntKokkosView &bs, const PetscIntKokkosView &bs2, const PetscIntKokkosView &blkMap, PetscScalarKokkosView &work, PetscScalarKokkosView &diagVal)
1504 {
1505   Mat_SeqAIJKokkos *akok    = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1506   PetscInt          nblocks = bs.extent(0) - 1;
1507 
1508   PetscFunctionBegin;
1509   PetscCall(MatSeqAIJKokkosSyncDevice(A)); // Since we'll access A's value on device
1510 
1511   // Pull out the diagonal blocks of the matrix and then invert the blocks
1512   auto Aa    = akok->a_dual.view_device();
1513   auto Ai    = akok->i_dual.view_device();
1514   auto Aj    = akok->j_dual.view_device();
1515   auto Adiag = akok->diag_dual.view_device();
1516   // TODO: how to tune the team size?
1517 #if defined(KOKKOS_ENABLE_UNIFIED_MEMORY)
1518   auto ts = Kokkos::AUTO();
1519 #else
1520   auto ts = 16; // improved performance 30% over Kokkos::AUTO() with CUDA, but failed with "Kokkos::abort: Requested Team Size is too large!" on CPUs
1521 #endif
1522   PetscCallCXX(Kokkos::parallel_for(
1523     Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), nblocks, ts), KOKKOS_LAMBDA(const KokkosTeamMemberType &teamMember) {
1524       const PetscInt bid    = teamMember.league_rank();                                                   // block id
1525       const PetscInt rstart = bs(bid);                                                                    // this block starts from this row
1526       const PetscInt m      = bs(bid + 1) - bs(bid);                                                      // size of this block
1527       const auto    &B      = Kokkos::View<PetscScalar **, Kokkos::LayoutLeft>(&diagVal(bs2(bid)), m, m); // column-major order
1528       const auto    &W      = PetscScalarKokkosView(&work(bs2(bid)), m * m);
1529 
1530       Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, m), [=](const PetscInt &r) { // r-th row in B
1531         PetscInt i = rstart + r;                                                            // i-th row in A
1532 
1533         if (Ai(i) <= Adiag(i) && Adiag(i) < Ai(i + 1)) { // if the diagonal exists (common case)
1534           PetscInt first = Adiag(i) - r;                 // we start to check nonzeros from here along this row
1535 
1536           for (PetscInt c = 0; c < m; c++) {                   // walk n steps to see what column indices we will meet
1537             if (first + c < Ai(i) || first + c >= Ai(i + 1)) { // this entry (first+c) is out of range of this row, in other words, its value is zero
1538               B(r, c) = 0.0;
1539             } else if (Aj(first + c) == rstart + c) { // this entry is right on the (rstart+c) column
1540               B(r, c) = Aa(first + c);
1541             } else { // this entry does not show up in the CSR
1542               B(r, c) = 0.0;
1543             }
1544           }
1545         } else { // rare case that the diagonal does not exist
1546           const PetscInt begin = Ai(i);
1547           const PetscInt end   = Ai(i + 1);
1548           for (PetscInt c = 0; c < m; c++) B(r, c) = 0.0;
1549           for (PetscInt j = begin; j < end; j++) { // scan the whole row; could use binary search but this is a rare case so we did not.
1550             if (rstart <= Aj(j) && Aj(j) < rstart + m) B(r, Aj(j) - rstart) = Aa(j);
1551             else if (Aj(j) >= rstart + m) break;
1552           }
1553         }
1554       });
1555 
1556       // LU-decompose B (w/o pivoting) and then invert B
1557       KokkosBatched::TeamLU<KokkosTeamMemberType, KokkosBatched::Algo::LU::Unblocked>::invoke(teamMember, B, 0.0);
1558       KokkosBatched::TeamInverseLU<KokkosTeamMemberType, KokkosBatched::Algo::InverseLU::Unblocked>::invoke(teamMember, B, W);
1559     }));
1560   // PetscLogGpuFlops() is done in the caller PCSetUp_VPBJacobi_Kokkos as we don't want to compute the flops in kernels
1561   PetscFunctionReturn(PETSC_SUCCESS);
1562 }
1563 
1564 PETSC_INTERN PetscErrorCode MatSetSeqAIJKokkosWithCSRMatrix(Mat A, Mat_SeqAIJKokkos *akok)
1565 {
1566   Mat_SeqAIJ *aseq;
1567   PetscInt    i, m, n;
1568   auto        exec = PetscGetKokkosExecutionSpace();
1569 
1570   PetscFunctionBegin;
1571   PetscCheck(!A->spptr, PETSC_COMM_SELF, PETSC_ERR_PLIB, "A->spptr is supposed to be empty");
1572 
1573   m = akok->nrows();
1574   n = akok->ncols();
1575   PetscCall(MatSetSizes(A, m, n, m, n));
1576   PetscCall(MatSetType(A, MATSEQAIJKOKKOS));
1577 
1578   /* Set up data structures of A as a MATSEQAIJ */
1579   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(A, MAT_SKIP_ALLOCATION, NULL));
1580   aseq = (Mat_SeqAIJ *)A->data;
1581 
1582   PetscCallCXX(akok->i_dual.sync_host(exec)); /* We always need sync'ed i, j on host */
1583   PetscCallCXX(akok->j_dual.sync_host(exec));
1584   PetscCallCXX(exec.fence());
1585 
1586   aseq->i       = akok->i_host_data();
1587   aseq->j       = akok->j_host_data();
1588   aseq->a       = akok->a_host_data();
1589   aseq->nonew   = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/
1590   aseq->free_a  = PETSC_FALSE;
1591   aseq->free_ij = PETSC_FALSE;
1592   aseq->nz      = akok->nnz();
1593   aseq->maxnz   = aseq->nz;
1594 
1595   PetscCall(PetscMalloc1(m, &aseq->imax));
1596   PetscCall(PetscMalloc1(m, &aseq->ilen));
1597   for (i = 0; i < m; i++) aseq->ilen[i] = aseq->imax[i] = aseq->i[i + 1] - aseq->i[i];
1598 
1599   /* It is critical to set the nonzerostate, as we use it to check if sparsity pattern (hence data) has changed on host in MatAssemblyEnd */
1600   akok->nonzerostate = A->nonzerostate;
1601   A->spptr           = akok; /* Set A->spptr before MatAssembly so that A->spptr won't be allocated again there */
1602   PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY));
1603   PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY));
1604   PetscFunctionReturn(PETSC_SUCCESS);
1605 }
1606 
1607 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGetKokkosCsrMatrix(Mat A, KokkosCsrMatrix *csr)
1608 {
1609   PetscFunctionBegin;
1610   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1611   *csr = static_cast<Mat_SeqAIJKokkos *>(A->spptr)->csrmat;
1612   PetscFunctionReturn(PETSC_SUCCESS);
1613 }
1614 
1615 PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithKokkosCsrMatrix(MPI_Comm comm, KokkosCsrMatrix csr, Mat *A)
1616 {
1617   Mat_SeqAIJKokkos *akok;
1618 
1619   PetscFunctionBegin;
1620   PetscCallCXX(akok = new Mat_SeqAIJKokkos(csr));
1621   PetscCall(MatCreate(comm, A));
1622   PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok));
1623   PetscFunctionReturn(PETSC_SUCCESS);
1624 }
1625 
1626 /* Crete a SEQAIJKOKKOS matrix with a Mat_SeqAIJKokkos data structure
1627 
1628    Note we have names like MatSeqAIJSetPreallocationCSR, so I use capitalized CSR
1629  */
1630 PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithCSRMatrix(MPI_Comm comm, Mat_SeqAIJKokkos *akok, Mat *A)
1631 {
1632   PetscFunctionBegin;
1633   PetscCall(MatCreate(comm, A));
1634   PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok));
1635   PetscFunctionReturn(PETSC_SUCCESS);
1636 }
1637 
1638 /*@C
1639   MatCreateSeqAIJKokkos - Creates a sparse matrix in `MATSEQAIJKOKKOS` (compressed row) format
1640   (the default parallel PETSc format). This matrix will ultimately be handled by
1641   Kokkos for calculations.
1642 
1643   Collective
1644 
1645   Input Parameters:
1646 + comm - MPI communicator, set to `PETSC_COMM_SELF`
1647 . m    - number of rows
1648 . n    - number of columns
1649 . nz   - number of nonzeros per row (same for all rows), ignored if `nnz` is provided
1650 - nnz  - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL`
1651 
1652   Output Parameter:
1653 . A - the matrix
1654 
1655   Level: intermediate
1656 
1657   Notes:
1658   It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
1659   MatXXXXSetPreallocation() paradgm instead of this routine directly.
1660   [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]
1661 
1662   The AIJ format, also called
1663   compressed row storage, is fully compatible with standard Fortran
1664   storage.  That is, the stored row and column indices can begin at
1665   either one (as in Fortran) or zero.
1666 
1667   Specify the preallocated storage with either `nz` or `nnz` (not both).
1668   Set `nz` = `PETSC_DEFAULT` and `nnz` = `NULL` for PETSc to control dynamic memory
1669   allocation.
1670 
1671 .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`
1672 @*/
1673 PetscErrorCode MatCreateSeqAIJKokkos(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A)
1674 {
1675   PetscFunctionBegin;
1676   PetscCall(PetscKokkosInitializeCheck());
1677   PetscCall(MatCreate(comm, A));
1678   PetscCall(MatSetSizes(*A, m, n, m, n));
1679   PetscCall(MatSetType(*A, MATSEQAIJKOKKOS));
1680   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz));
1681   PetscFunctionReturn(PETSC_SUCCESS);
1682 }
1683 
1684 // After matrix numeric factorization, there are still steps to do before triangular solve can be called.
1685 // For example, for transpose solve, we might need to compute the transpose matrices if the solver does not support it (such as KK, while cusparse does).
1686 // In cusparse, one has to call cusparseSpSV_analysis() with updated triangular matrix values before calling cusparseSpSV_solve().
1687 // Simiarily, in KK sptrsv_symbolic() has to be called before sptrsv_solve(). We put these steps in MatSeqAIJKokkos{Transpose}SolveCheck.
1688 static PetscErrorCode MatSeqAIJKokkosSolveCheck(Mat A)
1689 {
1690   Mat_SeqAIJKokkosTriFactors *factors   = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1691   const PetscBool             has_lower = factors->iL_d.extent(0) ? PETSC_TRUE : PETSC_FALSE; // false with Choleksy
1692   const PetscBool             has_upper = factors->iU_d.extent(0) ? PETSC_TRUE : PETSC_FALSE; // true with LU and Choleksy
1693 
1694   PetscFunctionBegin;
1695   if (!factors->sptrsv_symbolic_completed) { // If sptrsv_symbolic was not called yet
1696     if (has_upper) PetscCallCXX(sptrsv_symbolic(&factors->khU, factors->iU_d, factors->jU_d, factors->aU_d));
1697     if (has_lower) PetscCallCXX(sptrsv_symbolic(&factors->khL, factors->iL_d, factors->jL_d, factors->aL_d));
1698     factors->sptrsv_symbolic_completed = PETSC_TRUE;
1699   }
1700   PetscFunctionReturn(PETSC_SUCCESS);
1701 }
1702 
1703 static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A)
1704 {
1705   const PetscInt              n         = A->rmap->n;
1706   Mat_SeqAIJKokkosTriFactors *factors   = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1707   const PetscBool             has_lower = factors->iL_d.extent(0) ? PETSC_TRUE : PETSC_FALSE; // false with Choleksy
1708   const PetscBool             has_upper = factors->iU_d.extent(0) ? PETSC_TRUE : PETSC_FALSE; // true with LU or Choleksy
1709 
1710   PetscFunctionBegin;
1711   if (!factors->transpose_updated) {
1712     if (has_upper) {
1713       if (!factors->iUt_d.extent(0)) {                                 // Allocate Ut on device if not yet
1714         factors->iUt_d = MatRowMapKokkosView("factors->iUt_d", n + 1); // KK requires this view to be initialized to 0 to call transpose_matrix
1715         factors->jUt_d = MatColIdxKokkosView(NoInit("factors->jUt_d"), factors->jU_d.extent(0));
1716         factors->aUt_d = MatScalarKokkosView(NoInit("factors->aUt_d"), factors->aU_d.extent(0));
1717       }
1718 
1719       if (factors->iU_h.extent(0)) { // If U is on host (factorization was done on host), we also compute the transpose on host
1720         if (!factors->U) {
1721           Mat_SeqAIJ *seq;
1722 
1723           PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF, n, n, factors->iU_h.data(), factors->jU_h.data(), factors->aU_h.data(), &factors->U));
1724           PetscCall(MatTranspose(factors->U, MAT_INITIAL_MATRIX, &factors->Ut));
1725 
1726           seq            = static_cast<Mat_SeqAIJ *>(factors->Ut->data);
1727           factors->iUt_h = MatRowMapKokkosViewHost(seq->i, n + 1);
1728           factors->jUt_h = MatColIdxKokkosViewHost(seq->j, seq->nz);
1729           factors->aUt_h = MatScalarKokkosViewHost(seq->a, seq->nz);
1730         } else {
1731           PetscCall(MatTranspose(factors->U, MAT_REUSE_MATRIX, &factors->Ut)); // Matrix Ut' data is aliased with {i, j, a}Ut_h
1732         }
1733         // Copy Ut from host to device
1734         PetscCallCXX(Kokkos::deep_copy(factors->iUt_d, factors->iUt_h));
1735         PetscCallCXX(Kokkos::deep_copy(factors->jUt_d, factors->jUt_h));
1736         PetscCallCXX(Kokkos::deep_copy(factors->aUt_d, factors->aUt_h));
1737       } else { // If U was computed on device, we also compute the transpose there
1738         // TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices. We have to sort the indices, until KK provides finer control options.
1739         PetscCallCXX(transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iU_d,
1740                                                                                                                                                                                                                                factors->jU_d, factors->aU_d,
1741                                                                                                                                                                                                                                factors->iUt_d, factors->jUt_d,
1742                                                                                                                                                                                                                                factors->aUt_d));
1743         PetscCallCXX(sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iUt_d, factors->jUt_d, factors->aUt_d));
1744       }
1745       PetscCallCXX(sptrsv_symbolic(&factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d));
1746     }
1747 
1748     // do the same for L with LU
1749     if (has_lower) {
1750       if (!factors->iLt_d.extent(0)) {                                 // Allocate Lt on device if not yet
1751         factors->iLt_d = MatRowMapKokkosView("factors->iLt_d", n + 1); // KK requires this view to be initialized to 0 to call transpose_matrix
1752         factors->jLt_d = MatColIdxKokkosView(NoInit("factors->jLt_d"), factors->jL_d.extent(0));
1753         factors->aLt_d = MatScalarKokkosView(NoInit("factors->aLt_d"), factors->aL_d.extent(0));
1754       }
1755 
1756       if (factors->iL_h.extent(0)) { // If L is on host, we also compute the transpose on host
1757         if (!factors->L) {
1758           Mat_SeqAIJ *seq;
1759 
1760           PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF, n, n, factors->iL_h.data(), factors->jL_h.data(), factors->aL_h.data(), &factors->L));
1761           PetscCall(MatTranspose(factors->L, MAT_INITIAL_MATRIX, &factors->Lt));
1762 
1763           seq            = static_cast<Mat_SeqAIJ *>(factors->Lt->data);
1764           factors->iLt_h = MatRowMapKokkosViewHost(seq->i, n + 1);
1765           factors->jLt_h = MatColIdxKokkosViewHost(seq->j, seq->nz);
1766           factors->aLt_h = MatScalarKokkosViewHost(seq->a, seq->nz);
1767         } else {
1768           PetscCall(MatTranspose(factors->L, MAT_REUSE_MATRIX, &factors->Lt)); // Matrix Lt' data is aliased with {i, j, a}Lt_h
1769         }
1770         // Copy Lt from host to device
1771         PetscCallCXX(Kokkos::deep_copy(factors->iLt_d, factors->iLt_h));
1772         PetscCallCXX(Kokkos::deep_copy(factors->jLt_d, factors->jLt_h));
1773         PetscCallCXX(Kokkos::deep_copy(factors->aLt_d, factors->aLt_h));
1774       } else { // If L was computed on device, we also compute the transpose there
1775         // TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices. We have to sort the indices, until KK provides finer control options.
1776         PetscCallCXX(transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iL_d,
1777                                                                                                                                                                                                                                factors->jL_d, factors->aL_d,
1778                                                                                                                                                                                                                                factors->iLt_d, factors->jLt_d,
1779                                                                                                                                                                                                                                factors->aLt_d));
1780         PetscCallCXX(sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iLt_d, factors->jLt_d, factors->aLt_d));
1781       }
1782       PetscCallCXX(sptrsv_symbolic(&factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d));
1783     }
1784 
1785     factors->transpose_updated = PETSC_TRUE;
1786   }
1787   PetscFunctionReturn(PETSC_SUCCESS);
1788 }
1789 
1790 // Solve Ax = b, with RAR = U^T D U, where R is the row (and col) permutation matrix on A.
1791 // R is represented by rowperm in factors. If R is identity (i.e, no reordering), then rowperm is empty.
1792 static PetscErrorCode MatSolve_SeqAIJKokkos_Cholesky(Mat A, Vec bb, Vec xx)
1793 {
1794   auto                        exec    = PetscGetKokkosExecutionSpace();
1795   Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1796   PetscInt                    m       = A->rmap->n;
1797   PetscScalarKokkosView       D       = factors->D_d;
1798   PetscScalarKokkosView       X, Y, B; // alias
1799   ConstPetscScalarKokkosView  b;
1800   PetscScalarKokkosView       x;
1801   PetscIntKokkosView         &rowperm  = factors->rowperm;
1802   PetscBool                   identity = rowperm.extent(0) ? PETSC_FALSE : PETSC_TRUE;
1803 
1804   PetscFunctionBegin;
1805   PetscCall(PetscLogGpuTimeBegin());
1806   PetscCall(MatSeqAIJKokkosSolveCheck(A));          // for UX = T
1807   PetscCall(MatSeqAIJKokkosTransposeSolveCheck(A)); // for U^T Y = B
1808   PetscCall(VecGetKokkosView(bb, &b));
1809   PetscCall(VecGetKokkosViewWrite(xx, &x));
1810 
1811   // Solve U^T Y = B
1812   if (identity) { // Reorder b with the row permutation
1813     B = PetscScalarKokkosView(const_cast<PetscScalar *>(b.data()), b.extent(0));
1814     Y = factors->workVector;
1815   } else {
1816     B = factors->workVector;
1817     PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { B(i) = b(rowperm(i)); }));
1818     Y = x;
1819   }
1820   PetscCallCXX(sptrsv_solve(exec, &factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d, B, Y));
1821 
1822   // Solve diag(D) Y' = Y.
1823   // Actually just do Y' = Y*D since D is already inverted in MatCholeskyFactorNumeric_SeqAIJ(). It is basically a vector element-wise multiplication.
1824   PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { Y(i) = Y(i) * D(i); }));
1825 
1826   // Solve UX = Y
1827   if (identity) {
1828     X = x;
1829   } else {
1830     X = factors->workVector; // B is not needed anymore
1831   }
1832   PetscCallCXX(sptrsv_solve(exec, &factors->khU, factors->iU_d, factors->jU_d, factors->aU_d, Y, X));
1833 
1834   // Reorder X with the inverse column (row) permutation
1835   if (!identity) {
1836     PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { x(rowperm(i)) = X(i); }));
1837   }
1838 
1839   PetscCall(VecRestoreKokkosView(bb, &b));
1840   PetscCall(VecRestoreKokkosViewWrite(xx, &x));
1841   PetscCall(PetscLogGpuTimeEnd());
1842   PetscFunctionReturn(PETSC_SUCCESS);
1843 }
1844 
1845 // Solve Ax = b, with RAC = LU, where R and C are row and col permutation matrices on A respectively.
1846 // R and C are represented by rowperm and colperm in factors.
1847 // If R or C is identity (i.e, no reordering), then rowperm or colperm is empty.
1848 static PetscErrorCode MatSolve_SeqAIJKokkos_LU(Mat A, Vec bb, Vec xx)
1849 {
1850   auto                        exec    = PetscGetKokkosExecutionSpace();
1851   Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1852   PetscInt                    m       = A->rmap->n;
1853   PetscScalarKokkosView       X, Y, B; // alias
1854   ConstPetscScalarKokkosView  b;
1855   PetscScalarKokkosView       x;
1856   PetscIntKokkosView         &rowperm      = factors->rowperm;
1857   PetscIntKokkosView         &colperm      = factors->colperm;
1858   PetscBool                   row_identity = rowperm.extent(0) ? PETSC_FALSE : PETSC_TRUE;
1859   PetscBool                   col_identity = colperm.extent(0) ? PETSC_FALSE : PETSC_TRUE;
1860 
1861   PetscFunctionBegin;
1862   PetscCall(PetscLogGpuTimeBegin());
1863   PetscCall(MatSeqAIJKokkosSolveCheck(A));
1864   PetscCall(VecGetKokkosView(bb, &b));
1865   PetscCall(VecGetKokkosViewWrite(xx, &x));
1866 
1867   // Solve L Y = B (i.e., L (U C^- x) = R b).  R b indicates applying the row permutation on b.
1868   if (row_identity) {
1869     B = PetscScalarKokkosView(const_cast<PetscScalar *>(b.data()), b.extent(0));
1870     Y = factors->workVector;
1871   } else {
1872     B = factors->workVector;
1873     PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { B(i) = b(rowperm(i)); }));
1874     Y = x;
1875   }
1876   PetscCallCXX(sptrsv_solve(exec, &factors->khL, factors->iL_d, factors->jL_d, factors->aL_d, B, Y));
1877 
1878   // Solve U C^- x = Y
1879   if (col_identity) {
1880     X = x;
1881   } else {
1882     X = factors->workVector;
1883   }
1884   PetscCallCXX(sptrsv_solve(exec, &factors->khU, factors->iU_d, factors->jU_d, factors->aU_d, Y, X));
1885 
1886   // x = C X; Reorder X with the inverse col permutation
1887   if (!col_identity) {
1888     PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { x(colperm(i)) = X(i); }));
1889   }
1890 
1891   PetscCall(VecRestoreKokkosView(bb, &b));
1892   PetscCall(VecRestoreKokkosViewWrite(xx, &x));
1893   PetscCall(PetscLogGpuTimeEnd());
1894   PetscFunctionReturn(PETSC_SUCCESS);
1895 }
1896 
1897 // Solve A^T x = b, with RAC = LU, where R and C are row and col permutation matrices on A respectively.
1898 // R and C are represented by rowperm and colperm in factors.
1899 // If R or C is identity (i.e, no reordering), then rowperm or colperm is empty.
1900 // A = R^-1 L U C^-1, so A^T = C^-T U^T L^T R^-T. But since C^- = C^T, R^- = R^T, we have A^T = C U^T L^T R.
1901 static PetscErrorCode MatSolveTranspose_SeqAIJKokkos_LU(Mat A, Vec bb, Vec xx)
1902 {
1903   auto                        exec    = PetscGetKokkosExecutionSpace();
1904   Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1905   PetscInt                    m       = A->rmap->n;
1906   PetscScalarKokkosView       X, Y, B; // alias
1907   ConstPetscScalarKokkosView  b;
1908   PetscScalarKokkosView       x;
1909   PetscIntKokkosView         &rowperm      = factors->rowperm;
1910   PetscIntKokkosView         &colperm      = factors->colperm;
1911   PetscBool                   row_identity = rowperm.extent(0) ? PETSC_FALSE : PETSC_TRUE;
1912   PetscBool                   col_identity = colperm.extent(0) ? PETSC_FALSE : PETSC_TRUE;
1913 
1914   PetscFunctionBegin;
1915   PetscCall(PetscLogGpuTimeBegin());
1916   PetscCall(MatSeqAIJKokkosTransposeSolveCheck(A)); // Update L^T, U^T if needed, and do sptrsv symbolic for L^T, U^T
1917   PetscCall(VecGetKokkosView(bb, &b));
1918   PetscCall(VecGetKokkosViewWrite(xx, &x));
1919 
1920   // Solve U^T Y = B (i.e., U^T (L^T R x) = C^- b).  Note C^- b = C^T b, which means applying the column permutation on b.
1921   if (col_identity) { // Reorder b with the col permutation
1922     B = PetscScalarKokkosView(const_cast<PetscScalar *>(b.data()), b.extent(0));
1923     Y = factors->workVector;
1924   } else {
1925     B = factors->workVector;
1926     PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { B(i) = b(colperm(i)); }));
1927     Y = x;
1928   }
1929   PetscCallCXX(sptrsv_solve(exec, &factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d, B, Y));
1930 
1931   // Solve L^T X = Y
1932   if (row_identity) {
1933     X = x;
1934   } else {
1935     X = factors->workVector;
1936   }
1937   PetscCallCXX(sptrsv_solve(exec, &factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d, Y, X));
1938 
1939   // x = R^- X = R^T X; Reorder X with the inverse row permutation
1940   if (!row_identity) {
1941     PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { x(rowperm(i)) = X(i); }));
1942   }
1943 
1944   PetscCall(VecRestoreKokkosView(bb, &b));
1945   PetscCall(VecRestoreKokkosViewWrite(xx, &x));
1946   PetscCall(PetscLogGpuTimeEnd());
1947   PetscFunctionReturn(PETSC_SUCCESS);
1948 }
1949 
1950 static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info)
1951 {
1952   PetscFunctionBegin;
1953   PetscCall(MatSeqAIJKokkosSyncHost(A));
1954   PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info));
1955 
1956   if (!info->solveonhost) { // if solve on host, then we don't need to copy L, U to device
1957     Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr;
1958     Mat_SeqAIJ                 *b       = static_cast<Mat_SeqAIJ *>(B->data);
1959     const PetscInt             *Bi = b->i, *Bj = b->j, *Bdiag = b->diag;
1960     const MatScalar            *Ba = b->a;
1961     PetscInt                    m = B->rmap->n, n = B->cmap->n;
1962 
1963     if (factors->iL_h.extent(0) == 0) { // Allocate memory and copy the L, U structure for the first time
1964       // Allocate memory and copy the structure
1965       factors->iL_h = MatRowMapKokkosViewHost(NoInit("iL_h"), m + 1);
1966       factors->jL_h = MatColIdxKokkosViewHost(NoInit("jL_h"), (Bi[m] - Bi[0]) + m); // + the diagonal entries
1967       factors->aL_h = MatScalarKokkosViewHost(NoInit("aL_h"), (Bi[m] - Bi[0]) + m);
1968       factors->iU_h = MatRowMapKokkosViewHost(NoInit("iU_h"), m + 1);
1969       factors->jU_h = MatColIdxKokkosViewHost(NoInit("jU_h"), (Bdiag[0] - Bdiag[m]));
1970       factors->aU_h = MatScalarKokkosViewHost(NoInit("aU_h"), (Bdiag[0] - Bdiag[m]));
1971 
1972       PetscInt *Li = factors->iL_h.data();
1973       PetscInt *Lj = factors->jL_h.data();
1974       PetscInt *Ui = factors->iU_h.data();
1975       PetscInt *Uj = factors->jU_h.data();
1976 
1977       Li[0] = Ui[0] = 0;
1978       for (PetscInt i = 0; i < m; i++) {
1979         PetscInt llen = Bi[i + 1] - Bi[i];       // exclusive of the diagonal entry
1980         PetscInt ulen = Bdiag[i] - Bdiag[i + 1]; // inclusive of the diagonal entry
1981 
1982         PetscArraycpy(Lj + Li[i], Bj + Bi[i], llen); // entries of L on the left of the diagonal
1983         Lj[Li[i] + llen] = i;                        // diagonal entry of L
1984 
1985         Uj[Ui[i]] = i;                                                  // diagonal entry of U
1986         PetscArraycpy(Uj + Ui[i] + 1, Bj + Bdiag[i + 1] + 1, ulen - 1); // entries of U on  the right of the diagonal
1987 
1988         Li[i + 1] = Li[i] + llen + 1;
1989         Ui[i + 1] = Ui[i] + ulen;
1990       }
1991 
1992       factors->iL_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->iL_h);
1993       factors->jL_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->jL_h);
1994       factors->iU_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->iU_h);
1995       factors->jU_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->jU_h);
1996       factors->aL_d = Kokkos::create_mirror_view(DefaultMemorySpace(), factors->aL_h);
1997       factors->aU_d = Kokkos::create_mirror_view(DefaultMemorySpace(), factors->aU_h);
1998 
1999       // Copy row/col permutation to device
2000       IS        rowperm = ((Mat_SeqAIJ *)B->data)->row;
2001       PetscBool row_identity;
2002       PetscCall(ISIdentity(rowperm, &row_identity));
2003       if (!row_identity) {
2004         const PetscInt *ip;
2005 
2006         PetscCall(ISGetIndices(rowperm, &ip));
2007         factors->rowperm = PetscIntKokkosView(NoInit("rowperm"), m);
2008         PetscCallCXX(Kokkos::deep_copy(factors->rowperm, PetscIntKokkosViewHost(const_cast<PetscInt *>(ip), m)));
2009         PetscCall(ISRestoreIndices(rowperm, &ip));
2010         PetscCall(PetscLogCpuToGpu(m * sizeof(PetscInt)));
2011       }
2012 
2013       IS        colperm = ((Mat_SeqAIJ *)B->data)->col;
2014       PetscBool col_identity;
2015       PetscCall(ISIdentity(colperm, &col_identity));
2016       if (!col_identity) {
2017         const PetscInt *ip;
2018 
2019         PetscCall(ISGetIndices(colperm, &ip));
2020         factors->colperm = PetscIntKokkosView(NoInit("colperm"), n);
2021         PetscCallCXX(Kokkos::deep_copy(factors->colperm, PetscIntKokkosViewHost(const_cast<PetscInt *>(ip), n)));
2022         PetscCall(ISRestoreIndices(colperm, &ip));
2023         PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
2024       }
2025 
2026       /* Create sptrsv handles for L, U and their transpose */
2027 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
2028       auto sptrsv_alg = SPTRSVAlgorithm::SPTRSV_CUSPARSE;
2029 #else
2030       auto sptrsv_alg = SPTRSVAlgorithm::SEQLVLSCHD_TP1;
2031 #endif
2032       factors->khL.create_sptrsv_handle(sptrsv_alg, m, true /* L is lower tri */);
2033       factors->khU.create_sptrsv_handle(sptrsv_alg, m, false /* U is not lower tri */);
2034       factors->khLt.create_sptrsv_handle(sptrsv_alg, m, false /* L^T is not lower tri */);
2035       factors->khUt.create_sptrsv_handle(sptrsv_alg, m, true /* U^T is lower tri */);
2036     }
2037 
2038     // Copy the value
2039     for (PetscInt i = 0; i < m; i++) {
2040       PetscInt        llen = Bi[i + 1] - Bi[i];
2041       PetscInt        ulen = Bdiag[i] - Bdiag[i + 1];
2042       const PetscInt *Li   = factors->iL_h.data();
2043       const PetscInt *Ui   = factors->iU_h.data();
2044 
2045       PetscScalar *La = factors->aL_h.data();
2046       PetscScalar *Ua = factors->aU_h.data();
2047 
2048       PetscArraycpy(La + Li[i], Ba + Bi[i], llen); // entries of L
2049       La[Li[i] + llen] = 1.0;                      // diagonal entry
2050 
2051       Ua[Ui[i]] = 1.0 / Ba[Bdiag[i]];                                 // diagonal entry
2052       PetscArraycpy(Ua + Ui[i] + 1, Ba + Bdiag[i + 1] + 1, ulen - 1); // entries of U
2053     }
2054 
2055     PetscCallCXX(Kokkos::deep_copy(factors->aL_d, factors->aL_h));
2056     PetscCallCXX(Kokkos::deep_copy(factors->aU_d, factors->aU_h));
2057     // Once the factors' values have changed, we need to update their transpose and redo sptrsv symbolic
2058     factors->transpose_updated         = PETSC_FALSE;
2059     factors->sptrsv_symbolic_completed = PETSC_FALSE;
2060 
2061     B->ops->solve          = MatSolve_SeqAIJKokkos_LU;
2062     B->ops->solvetranspose = MatSolveTranspose_SeqAIJKokkos_LU;
2063   }
2064 
2065   B->ops->matsolve          = NULL;
2066   B->ops->matsolvetranspose = NULL;
2067   PetscFunctionReturn(PETSC_SUCCESS);
2068 }
2069 
2070 static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos_ILU0(Mat B, Mat A, const MatFactorInfo *info)
2071 {
2072   Mat_SeqAIJKokkos           *aijkok   = (Mat_SeqAIJKokkos *)A->spptr;
2073   Mat_SeqAIJKokkosTriFactors *factors  = (Mat_SeqAIJKokkosTriFactors *)B->spptr;
2074   PetscInt                    fill_lev = info->levels;
2075 
2076   PetscFunctionBegin;
2077   PetscCall(PetscLogGpuTimeBegin());
2078   PetscCheck(!info->factoronhost, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "MatFactorInfo.factoronhost should be false");
2079   PetscCall(MatSeqAIJKokkosSyncDevice(A));
2080 
2081   auto a_d = aijkok->a_dual.view_device();
2082   auto i_d = aijkok->i_dual.view_device();
2083   auto j_d = aijkok->j_dual.view_device();
2084 
2085   PetscCallCXX(spiluk_numeric(&factors->kh, fill_lev, i_d, j_d, a_d, factors->iL_d, factors->jL_d, factors->aL_d, factors->iU_d, factors->jU_d, factors->aU_d));
2086 
2087   B->assembled              = PETSC_TRUE;
2088   B->preallocated           = PETSC_TRUE;
2089   B->ops->solve             = MatSolve_SeqAIJKokkos_LU;
2090   B->ops->solvetranspose    = MatSolveTranspose_SeqAIJKokkos_LU;
2091   B->ops->matsolve          = NULL;
2092   B->ops->matsolvetranspose = NULL;
2093 
2094   /* Once the factors' value changed, we need to update their transpose and sptrsv handle */
2095   factors->transpose_updated         = PETSC_FALSE;
2096   factors->sptrsv_symbolic_completed = PETSC_FALSE;
2097   /* TODO: log flops, but how to know that? */
2098   PetscCall(PetscLogGpuTimeEnd());
2099   PetscFunctionReturn(PETSC_SUCCESS);
2100 }
2101 
2102 // Use KK's spiluk_symbolic() to do ILU0 symbolic factorization, with no row/col reordering
2103 static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos_ILU0(Mat B, Mat A, IS, IS, const MatFactorInfo *info)
2104 {
2105   Mat_SeqAIJKokkos           *aijkok;
2106   Mat_SeqAIJ                 *b;
2107   Mat_SeqAIJKokkosTriFactors *factors  = (Mat_SeqAIJKokkosTriFactors *)B->spptr;
2108   PetscInt                    fill_lev = info->levels;
2109   PetscInt                    nnzA     = ((Mat_SeqAIJ *)A->data)->nz, nnzL, nnzU;
2110   PetscInt                    n        = A->rmap->n;
2111 
2112   PetscFunctionBegin;
2113   PetscCheck(!info->factoronhost, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "MatFactorInfo's factoronhost should be false as we are doing it on device right now");
2114   PetscCall(MatSeqAIJKokkosSyncDevice(A));
2115 
2116   /* Create a spiluk handle and then do symbolic factorization */
2117   nnzL = nnzU = PetscRealIntMultTruncate(info->fill, nnzA);
2118   factors->kh.create_spiluk_handle(SPILUKAlgorithm::SEQLVLSCHD_TP1, n, nnzL, nnzU);
2119 
2120   auto spiluk_handle = factors->kh.get_spiluk_handle();
2121 
2122   Kokkos::realloc(factors->iL_d, n + 1); /* Free old arrays and realloc */
2123   Kokkos::realloc(factors->jL_d, spiluk_handle->get_nnzL());
2124   Kokkos::realloc(factors->iU_d, n + 1);
2125   Kokkos::realloc(factors->jU_d, spiluk_handle->get_nnzU());
2126 
2127   aijkok   = (Mat_SeqAIJKokkos *)A->spptr;
2128   auto i_d = aijkok->i_dual.view_device();
2129   auto j_d = aijkok->j_dual.view_device();
2130   PetscCallCXX(spiluk_symbolic(&factors->kh, fill_lev, i_d, j_d, factors->iL_d, factors->jL_d, factors->iU_d, factors->jU_d));
2131   /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */
2132 
2133   Kokkos::resize(factors->jL_d, spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */
2134   Kokkos::resize(factors->jU_d, spiluk_handle->get_nnzU());
2135   Kokkos::realloc(factors->aL_d, spiluk_handle->get_nnzL()); /* No need to retain old value */
2136   Kokkos::realloc(factors->aU_d, spiluk_handle->get_nnzU());
2137 
2138   /* TODO: add options to select sptrsv algorithms */
2139   /* Create sptrsv handles for L, U and their transpose */
2140 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
2141   auto sptrsv_alg = SPTRSVAlgorithm::SPTRSV_CUSPARSE;
2142 #else
2143   auto sptrsv_alg = SPTRSVAlgorithm::SEQLVLSCHD_TP1;
2144 #endif
2145 
2146   factors->khL.create_sptrsv_handle(sptrsv_alg, n, true /* L is lower tri */);
2147   factors->khU.create_sptrsv_handle(sptrsv_alg, n, false /* U is not lower tri */);
2148   factors->khLt.create_sptrsv_handle(sptrsv_alg, n, false /* L^T is not lower tri */);
2149   factors->khUt.create_sptrsv_handle(sptrsv_alg, n, true /* U^T is lower tri */);
2150 
2151   /* Fill fields of the factor matrix B */
2152   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(B, MAT_SKIP_ALLOCATION, NULL));
2153   b     = (Mat_SeqAIJ *)B->data;
2154   b->nz = b->maxnz          = spiluk_handle->get_nnzL() + spiluk_handle->get_nnzU();
2155   B->info.fill_ratio_given  = info->fill;
2156   B->info.fill_ratio_needed = nnzA > 0 ? ((PetscReal)b->nz) / ((PetscReal)nnzA) : 1.0;
2157 
2158   B->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJKokkos_ILU0;
2159   PetscFunctionReturn(PETSC_SUCCESS);
2160 }
2161 
2162 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
2163 {
2164   PetscFunctionBegin;
2165   PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
2166   PetscCheck(!B->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr");
2167   PetscCallCXX(B->spptr = new Mat_SeqAIJKokkosTriFactors(B->rmap->n));
2168   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos;
2169   PetscFunctionReturn(PETSC_SUCCESS);
2170 }
2171 
2172 static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
2173 {
2174   PetscBool row_identity = PETSC_FALSE, col_identity = PETSC_FALSE;
2175 
2176   PetscFunctionBegin;
2177   if (!info->factoronhost) {
2178     PetscCall(ISIdentity(isrow, &row_identity));
2179     PetscCall(ISIdentity(iscol, &col_identity));
2180   }
2181 
2182   PetscCheck(!B->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr");
2183   PetscCallCXX(B->spptr = new Mat_SeqAIJKokkosTriFactors(B->rmap->n));
2184 
2185   if (!info->factoronhost && !info->levels && row_identity && col_identity) { // if level 0 and no reordering
2186     PetscCall(MatILUFactorSymbolic_SeqAIJKokkos_ILU0(B, A, isrow, iscol, info));
2187   } else {
2188     PetscCall(MatILUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info)); // otherwise, use PETSc's ILU on host
2189     B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos;
2190   }
2191   PetscFunctionReturn(PETSC_SUCCESS);
2192 }
2193 
2194 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info)
2195 {
2196   PetscFunctionBegin;
2197   PetscCall(MatSeqAIJKokkosSyncHost(A));
2198   PetscCall(MatCholeskyFactorNumeric_SeqAIJ(B, A, info));
2199 
2200   if (!info->solveonhost) { // if solve on host, then we don't need to copy L, U to device
2201     Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr;
2202     Mat_SeqAIJ                 *b       = static_cast<Mat_SeqAIJ *>(B->data);
2203     const PetscInt             *Bi = b->i, *Bj = b->j, *Bdiag = b->diag;
2204     const MatScalar            *Ba = b->a;
2205     PetscInt                    m  = B->rmap->n;
2206 
2207     if (factors->iU_h.extent(0) == 0) { // First time of numeric factorization
2208       // Allocate memory and copy the structure
2209       factors->iU_h = PetscIntKokkosViewHost(const_cast<PetscInt *>(Bi), m + 1); // wrap Bi as iU_h
2210       factors->jU_h = MatColIdxKokkosViewHost(NoInit("jU_h"), Bi[m]);
2211       factors->aU_h = MatScalarKokkosViewHost(NoInit("aU_h"), Bi[m]);
2212       factors->D_h  = MatScalarKokkosViewHost(NoInit("D_h"), m);
2213       factors->aU_d = Kokkos::create_mirror_view(DefaultMemorySpace(), factors->aU_h);
2214       factors->D_d  = Kokkos::create_mirror_view(DefaultMemorySpace(), factors->D_h);
2215 
2216       // Build jU_h from the skewed Aj
2217       PetscInt *Uj = factors->jU_h.data();
2218       for (PetscInt i = 0; i < m; i++) {
2219         PetscInt ulen = Bi[i + 1] - Bi[i];
2220         Uj[Bi[i]]     = i;                                              // diagonal entry
2221         PetscCall(PetscArraycpy(Uj + Bi[i] + 1, Bj + Bi[i], ulen - 1)); // entries of U on the right of the diagonal
2222       }
2223 
2224       // Copy iU, jU to device
2225       PetscCallCXX(factors->iU_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->iU_h));
2226       PetscCallCXX(factors->jU_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->jU_h));
2227 
2228       // Copy row/col permutation to device
2229       IS        rowperm = ((Mat_SeqAIJ *)B->data)->row;
2230       PetscBool row_identity;
2231       PetscCall(ISIdentity(rowperm, &row_identity));
2232       if (!row_identity) {
2233         const PetscInt *ip;
2234 
2235         PetscCall(ISGetIndices(rowperm, &ip));
2236         PetscCallCXX(factors->rowperm = PetscIntKokkosView(NoInit("rowperm"), m));
2237         PetscCallCXX(Kokkos::deep_copy(factors->rowperm, PetscIntKokkosViewHost(const_cast<PetscInt *>(ip), m)));
2238         PetscCall(ISRestoreIndices(rowperm, &ip));
2239         PetscCall(PetscLogCpuToGpu(m * sizeof(PetscInt)));
2240       }
2241 
2242       // Create sptrsv handles for U and U^T
2243 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
2244       auto sptrsv_alg = SPTRSVAlgorithm::SPTRSV_CUSPARSE;
2245 #else
2246       auto sptrsv_alg = SPTRSVAlgorithm::SEQLVLSCHD_TP1;
2247 #endif
2248       factors->khU.create_sptrsv_handle(sptrsv_alg, m, false /* U is not lower tri */);
2249       factors->khUt.create_sptrsv_handle(sptrsv_alg, m, true /* U^T is lower tri */);
2250     }
2251     // These pointers were set MatCholeskyFactorNumeric_SeqAIJ(), so we always need to update them
2252     B->ops->solve          = MatSolve_SeqAIJKokkos_Cholesky;
2253     B->ops->solvetranspose = MatSolve_SeqAIJKokkos_Cholesky;
2254 
2255     // Copy the value
2256     PetscScalar *Ua = factors->aU_h.data();
2257     PetscScalar *D  = factors->D_h.data();
2258     for (PetscInt i = 0; i < m; i++) {
2259       D[i]      = Ba[Bdiag[i]];     // actually Aa[Adiag[i]] is the inverse of the diagonal
2260       Ua[Bi[i]] = (PetscScalar)1.0; // set the unit diagonal for U
2261       for (PetscInt k = 0; k < Bi[i + 1] - Bi[i] - 1; k++) Ua[Bi[i] + 1 + k] = -Ba[Bi[i] + k];
2262     }
2263     PetscCallCXX(Kokkos::deep_copy(factors->aU_d, factors->aU_h));
2264     PetscCallCXX(Kokkos::deep_copy(factors->D_d, factors->D_h));
2265 
2266     factors->sptrsv_symbolic_completed = PETSC_FALSE; // When numeric value changed, we must do these again
2267     factors->transpose_updated         = PETSC_FALSE;
2268   }
2269 
2270   B->ops->matsolve          = NULL;
2271   B->ops->matsolvetranspose = NULL;
2272   PetscFunctionReturn(PETSC_SUCCESS);
2273 }
2274 
2275 static PetscErrorCode MatICCFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS perm, const MatFactorInfo *info)
2276 {
2277   PetscFunctionBegin;
2278   if (info->solveonhost) {
2279     // If solve on host, we have to change the type, as eventually we need to call MatSolve_SeqSBAIJ_1_NaturalOrdering() etc.
2280     PetscCall(MatSetType(B, MATSEQSBAIJ));
2281     PetscCall(MatSeqSBAIJSetPreallocation(B, 1, MAT_SKIP_ALLOCATION, NULL));
2282   }
2283 
2284   PetscCall(MatICCFactorSymbolic_SeqAIJ(B, A, perm, info));
2285 
2286   if (!info->solveonhost) {
2287     // If solve on device, B is still a MATSEQAIJKOKKOS, so we are good to allocate B->spptr
2288     PetscCheck(!B->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr");
2289     PetscCallCXX(B->spptr = new Mat_SeqAIJKokkosTriFactors(B->rmap->n));
2290     B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJKokkos;
2291   }
2292   PetscFunctionReturn(PETSC_SUCCESS);
2293 }
2294 
2295 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS perm, const MatFactorInfo *info)
2296 {
2297   PetscFunctionBegin;
2298   if (info->solveonhost) {
2299     // If solve on host, we have to change the type, as eventually we need to call MatSolve_SeqSBAIJ_1_NaturalOrdering() etc.
2300     PetscCall(MatSetType(B, MATSEQSBAIJ));
2301     PetscCall(MatSeqSBAIJSetPreallocation(B, 1, MAT_SKIP_ALLOCATION, NULL));
2302   }
2303 
2304   PetscCall(MatCholeskyFactorSymbolic_SeqAIJ(B, A, perm, info)); // it sets B's two ISes ((Mat_SeqAIJ*)B->data)->{row, col} to perm
2305 
2306   if (!info->solveonhost) {
2307     // If solve on device, B is still a MATSEQAIJKOKKOS, so we are good to allocate B->spptr
2308     PetscCheck(!B->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr");
2309     PetscCallCXX(B->spptr = new Mat_SeqAIJKokkosTriFactors(B->rmap->n));
2310     B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJKokkos;
2311   }
2312   PetscFunctionReturn(PETSC_SUCCESS);
2313 }
2314 
2315 // The _Kokkos suffix means we will use Kokkos as a solver for the SeqAIJKokkos matrix
2316 static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos_Kokkos(Mat A, MatSolverType *type)
2317 {
2318   PetscFunctionBegin;
2319   *type = MATSOLVERKOKKOS;
2320   PetscFunctionReturn(PETSC_SUCCESS);
2321 }
2322 
2323 /*MC
2324   MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices
2325   on a single GPU of type, `MATSEQAIJKOKKOS`, `MATAIJKOKKOS`.
2326 
2327   Level: beginner
2328 
2329 .seealso: [](ch_matrices), `Mat`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJKokkos()`, `MATAIJKOKKOS`, `MatKokkosSetFormat()`, `MatKokkosStorageFormat`, `MatKokkosFormatOperation`
2330 M*/
2331 PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A, MatFactorType ftype, Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */
2332 {
2333   PetscInt n = A->rmap->n;
2334   MPI_Comm comm;
2335 
2336   PetscFunctionBegin;
2337   PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
2338   PetscCall(MatCreate(comm, B));
2339   PetscCall(MatSetSizes(*B, n, n, n, n));
2340   PetscCall(MatSetBlockSizesFromMats(*B, A, A));
2341   (*B)->factortype = ftype;
2342   PetscCall(MatSetType(*B, MATSEQAIJKOKKOS));
2343   PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL));
2344   PetscCheck(!(*B)->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr");
2345 
2346   if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
2347     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJKokkos;
2348     (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos;
2349     PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU]));
2350     PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILU]));
2351     PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILUDT]));
2352   } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
2353     (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJKokkos;
2354     (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJKokkos;
2355     PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]));
2356     PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ICC]));
2357   } else SETERRQ(comm, PETSC_ERR_SUP, "MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]);
2358 
2359   // The factorization can use the ordering provided in MatLUFactorSymbolic(), MatCholeskyFactorSymbolic() etc, though we do it on host
2360   (*B)->canuseordering = PETSC_TRUE;
2361   PetscCall(PetscObjectComposeFunction((PetscObject)*B, "MatFactorGetSolverType_C", MatFactorGetSolverType_SeqAIJKokkos_Kokkos));
2362   PetscFunctionReturn(PETSC_SUCCESS);
2363 }
2364 
2365 PETSC_INTERN PetscErrorCode MatSolverTypeRegister_Kokkos(void)
2366 {
2367   PetscFunctionBegin;
2368   PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_LU, MatGetFactor_SeqAIJKokkos_Kokkos));
2369   PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_CHOLESKY, MatGetFactor_SeqAIJKokkos_Kokkos));
2370   PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_ILU, MatGetFactor_SeqAIJKokkos_Kokkos));
2371   PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_ICC, MatGetFactor_SeqAIJKokkos_Kokkos));
2372   PetscFunctionReturn(PETSC_SUCCESS);
2373 }
2374 
2375 /* Utility to print out a KokkosCsrMatrix for debugging */
2376 PETSC_INTERN PetscErrorCode PrintCsrMatrix(const KokkosCsrMatrix &csrmat)
2377 {
2378   const auto        &iv = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), csrmat.graph.row_map);
2379   const auto        &jv = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), csrmat.graph.entries);
2380   const auto        &av = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), csrmat.values);
2381   const PetscInt    *i  = iv.data();
2382   const PetscInt    *j  = jv.data();
2383   const PetscScalar *a  = av.data();
2384   PetscInt           m = csrmat.numRows(), n = csrmat.numCols(), nnz = csrmat.nnz();
2385 
2386   PetscFunctionBegin;
2387   PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT " x %" PetscInt_FMT " SeqAIJKokkos, with %" PetscInt_FMT " nonzeros\n", m, n, nnz));
2388   for (PetscInt k = 0; k < m; k++) {
2389     PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT ": ", k));
2390     for (PetscInt p = i[k]; p < i[k + 1]; p++) PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT "(%.1f), ", j[p], (double)PetscRealPart(a[p])));
2391     PetscCall(PetscPrintf(PETSC_COMM_SELF, "\n"));
2392   }
2393   PetscFunctionReturn(PETSC_SUCCESS);
2394 }
2395