xref: /petsc/src/mat/impls/aij/seq/kokkos/aijkok.kokkos.cxx (revision ad49c6e3020e95528bc71effe5cd79a1b737ce7c)
1 #include <petsc_kokkos.hpp>
2 #include <petscvec_kokkos.hpp>
3 #include <petscmat_kokkos.hpp>
4 #include <petscpkg_version.h>
5 #include <petsc/private/petscimpl.h>
6 #include <petsc/private/sfimpl.h>
7 #include <petsc/private/kokkosimpl.hpp>
8 #include <petscsys.h>
9 
10 #include <Kokkos_Core.hpp>
11 #include <KokkosBlas.hpp>
12 #include <KokkosSparse_CrsMatrix.hpp>
13 
14 // To suppress compiler warnings:
15 // /path/include/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp:434:63:
16 // warning: 'cusparseStatus_t cusparseDbsrmm(cusparseHandle_t, cusparseDirection_t, cusparseOperation_t,
17 // cusparseOperation_t, int, int, int, int, const double*, cusparseMatDescr_t, const double*, const int*, const int*,
18 // int, const double*, int, const double*, double*, int)' is deprecated: please use cusparseSpMM instead [-Wdeprecated-declarations]
19 PETSC_PRAGMA_DIAGNOSTIC_IGNORED_BEGIN("-Wdeprecated-declarations")
20 #include <KokkosSparse_spmv.hpp>
21 PETSC_PRAGMA_DIAGNOSTIC_IGNORED_END()
22 
23 #include <KokkosSparse_spiluk.hpp>
24 #include <KokkosSparse_sptrsv.hpp>
25 #include <KokkosSparse_spgemm.hpp>
26 #include <KokkosSparse_spadd.hpp>
27 #include <KokkosBatched_LU_Decl.hpp>
28 #include <KokkosBatched_InverseLU_Decl.hpp>
29 
30 #include <../src/mat/impls/aij/seq/kokkos/aijkok.hpp>
31 
32 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_GE(3, 7, 0)
33   #include <KokkosSparse_Utils.hpp>
34 using KokkosSparse::sort_crs_matrix;
35 using KokkosSparse::Impl::transpose_matrix;
36 #else
37   #include <KokkosKernels_Sorting.hpp>
38 using KokkosKernels::sort_crs_matrix;
39 using KokkosKernels::Impl::transpose_matrix;
40 #endif
41 
42 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_GE(4, 6, 0)
43 using KokkosSparse::spiluk_symbolic;
44 using KokkosSparse::spiluk_numeric;
45 using KokkosSparse::sptrsv_symbolic;
46 using KokkosSparse::sptrsv_solve;
47 using KokkosSparse::Experimental::SPTRSVAlgorithm;
48 using KokkosSparse::Experimental::SPILUKAlgorithm;
49 #else
50 using KokkosSparse::Experimental::spiluk_symbolic;
51 using KokkosSparse::Experimental::spiluk_numeric;
52 using KokkosSparse::Experimental::sptrsv_symbolic;
53 using KokkosSparse::Experimental::sptrsv_solve;
54 using KokkosSparse::Experimental::SPTRSVAlgorithm;
55 using KokkosSparse::Experimental::SPILUKAlgorithm;
56 #endif
57 
58 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */
59 
60 /* MatAssemblyEnd_SeqAIJKokkos() happens when we finalized nonzeros of the matrix, either after
61    we assembled the matrix on host, or after we directly produced the matrix data on device (ex., through MatMatMult).
62    In the latter case, it is important to set a_dual's sync state correctly.
63  */
64 static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A, MatAssemblyType mode)
65 {
66   Mat_SeqAIJ       *aijseq;
67   Mat_SeqAIJKokkos *aijkok;
68 
69   PetscFunctionBegin;
70   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS);
71   PetscCall(MatAssemblyEnd_SeqAIJ(A, mode));
72 
73   aijseq = static_cast<Mat_SeqAIJ *>(A->data);
74   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
75 
76   /* If aijkok does not exist, we just copy i, j to device.
77      If aijkok already exists, but the device's nonzero pattern does not match with the host's, we assume the latest data is on host.
78      In both cases, we build a new aijkok structure.
79   */
80   if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { /* aijkok might not exist yet or nonzero pattern has changed */
81     if (aijkok && aijkok->host_aij_allocated_by_kokkos) {   /* Avoid accidentally freeing much needed a,i,j on host when deleting aijkok */
82       PetscCall(PetscShmgetAllocateArray(aijkok->nrows() + 1, sizeof(PetscInt), (void **)&aijseq->i));
83       PetscCall(PetscShmgetAllocateArray(aijkok->nnz(), sizeof(PetscInt), (void **)&aijseq->j));
84       PetscCall(PetscShmgetAllocateArray(aijkok->nnz(), sizeof(PetscInt), (void **)&aijseq->a));
85       PetscCall(PetscArraycpy(aijseq->i, aijkok->i_host_data(), aijkok->nrows() + 1));
86       PetscCall(PetscArraycpy(aijseq->j, aijkok->j_host_data(), aijkok->nnz()));
87       PetscCall(PetscArraycpy(aijseq->a, aijkok->a_host_data(), aijkok->nnz()));
88       aijseq->free_a  = PETSC_TRUE;
89       aijseq->free_ij = PETSC_TRUE;
90       /* This arises from MatCreateSeqAIJKokkosWithKokkosCsrMatrix() used in MatMatMult, where
91          we have the CsrMatrix on device first and then copy to host, followed by
92          MatSetMPIAIJWithSplitSeqAIJ() with garray = NULL.
93          One could improve it by not using NULL garray.
94       */
95     }
96     delete aijkok;
97     aijkok   = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aijseq, A->nonzerostate, PETSC_FALSE /*don't copy mat values to device*/);
98     A->spptr = aijkok;
99   } else if (A->rmap->n && aijkok->diag_dual.extent(0) == 0) { // MatProduct might directly produce AIJ on device, but not the diag.
100     MatRowMapKokkosViewHost diag_h(aijseq->diag, A->rmap->n);
101     auto                    diag_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), diag_h);
102     aijkok->diag_dual              = MatRowMapKokkosDualView(diag_d, diag_h);
103   }
104   PetscFunctionReturn(PETSC_SUCCESS);
105 }
106 
107 /* Sync CSR data to device if not yet */
108 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A)
109 {
110   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
111 
112   PetscFunctionBegin;
113   PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Can't sync factorized matrix from host to device");
114   PetscCheck(aijkok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
115   if (aijkok->a_dual.need_sync_device()) {
116     PetscCall(KokkosDualViewSyncDevice(aijkok->a_dual, PetscGetKokkosExecutionSpace()));
117     aijkok->transpose_updated = PETSC_FALSE; /* values of the transpose is out-of-date */
118     aijkok->hermitian_updated = PETSC_FALSE;
119   }
120   PetscFunctionReturn(PETSC_SUCCESS);
121 }
122 
123 /* Mark the CSR data on device as modified */
124 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosModifyDevice(Mat A)
125 {
126   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
127 
128   PetscFunctionBegin;
129   PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Not supported for factorized matries");
130   aijkok->a_dual.clear_sync_state();
131   aijkok->a_dual.modify_device();
132   aijkok->transpose_updated = PETSC_FALSE;
133   aijkok->hermitian_updated = PETSC_FALSE;
134   PetscCall(MatSeqAIJInvalidateDiagonal(A));
135   PetscCall(PetscObjectStateIncrease((PetscObject)A));
136   PetscFunctionReturn(PETSC_SUCCESS);
137 }
138 
139 static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A)
140 {
141   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
142   auto              exec   = PetscGetKokkosExecutionSpace();
143 
144   PetscFunctionBegin;
145   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
146   /* We do not expect one needs factors on host  */
147   PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Can't sync factorized matrix from device to host");
148   PetscCheck(aijkok, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing AIJKOK");
149   PetscCall(KokkosDualViewSyncHost(aijkok->a_dual, exec));
150   PetscFunctionReturn(PETSC_SUCCESS);
151 }
152 
153 static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A, PetscScalar *array[])
154 {
155   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
156 
157   PetscFunctionBegin;
158   /* aijkok contains valid pointers only if the host's nonzerostate matches with the device's.
159     Calling MatSeqAIJSetPreallocation() or MatSetValues() on host, where aijseq->{i,j,a} might be
160     reallocated, will lead to stale {i,j,a}_dual in aijkok. In both operations, the hosts's nonzerostate
161     must have been updated. The stale aijkok will be rebuilt during MatAssemblyEnd.
162   */
163   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
164     PetscCall(KokkosDualViewSyncHost(aijkok->a_dual, PetscGetKokkosExecutionSpace()));
165     *array = aijkok->a_dual.view_host().data();
166   } else { /* Happens when calling MatSetValues on a newly created matrix */
167     *array = static_cast<Mat_SeqAIJ *>(A->data)->a;
168   }
169   PetscFunctionReturn(PETSC_SUCCESS);
170 }
171 
172 static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJKokkos(Mat A, PetscScalar *array[])
173 {
174   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
175 
176   PetscFunctionBegin;
177   if (aijkok && A->nonzerostate == aijkok->nonzerostate) aijkok->a_dual.modify_host();
178   PetscFunctionReturn(PETSC_SUCCESS);
179 }
180 
181 static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[])
182 {
183   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
184 
185   PetscFunctionBegin;
186   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
187     PetscCall(KokkosDualViewSyncHost(aijkok->a_dual, PetscGetKokkosExecutionSpace()));
188     *array = aijkok->a_dual.view_host().data();
189   } else {
190     *array = static_cast<Mat_SeqAIJ *>(A->data)->a;
191   }
192   PetscFunctionReturn(PETSC_SUCCESS);
193 }
194 
195 static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[])
196 {
197   PetscFunctionBegin;
198   *array = NULL;
199   PetscFunctionReturn(PETSC_SUCCESS);
200 }
201 
202 static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[])
203 {
204   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
205 
206   PetscFunctionBegin;
207   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
208     *array = aijkok->a_dual.view_host().data();
209   } else { /* Ex. happens with MatZeroEntries on a preallocated but not assembled matrix */
210     *array = static_cast<Mat_SeqAIJ *>(A->data)->a;
211   }
212   PetscFunctionReturn(PETSC_SUCCESS);
213 }
214 
215 static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[])
216 {
217   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
218 
219   PetscFunctionBegin;
220   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
221     aijkok->a_dual.clear_sync_state();
222     aijkok->a_dual.modify_host();
223   }
224   PetscFunctionReturn(PETSC_SUCCESS);
225 }
226 
227 static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJKokkos(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype)
228 {
229   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
230 
231   PetscFunctionBegin;
232   PetscCheck(aijkok != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "aijkok is NULL");
233 
234   if (i) *i = aijkok->i_device_data();
235   if (j) *j = aijkok->j_device_data();
236   if (a) {
237     PetscCall(KokkosDualViewSyncDevice(aijkok->a_dual, PetscGetKokkosExecutionSpace()));
238     *a = aijkok->a_device_data();
239   }
240   if (mtype) *mtype = PETSC_MEMTYPE_KOKKOS;
241   PetscFunctionReturn(PETSC_SUCCESS);
242 }
243 
244 static PetscErrorCode MatGetCurrentMemType_SeqAIJKokkos(PETSC_UNUSED Mat A, PetscMemType *mtype)
245 {
246   PetscFunctionBegin;
247   *mtype = PETSC_MEMTYPE_KOKKOS;
248   PetscFunctionReturn(PETSC_SUCCESS);
249 }
250 
251 /*
252   Generate the sparsity pattern of a MatSeqAIJKokkos matrix's transpose on device.
253 
254   Input Parameter:
255 .  A       - the MATSEQAIJKOKKOS matrix
256 
257   Output Parameters:
258 +  perm_d - the permutation array on device, which connects Ta(i) = Aa(perm(i))
259 -  T_d    - the transpose on device, whose value array is allocated but not initialized
260 */
261 static PetscErrorCode MatSeqAIJKokkosGenerateTransposeStructure(Mat A, MatRowMapKokkosView &perm_d, KokkosCsrMatrix &T_d)
262 {
263   Mat_SeqAIJ             *aseq = static_cast<Mat_SeqAIJ *>(A->data);
264   PetscInt                nz = aseq->nz, m = A->rmap->N, n = A->cmap->n;
265   const PetscInt         *Ai = aseq->i, *Aj = aseq->j;
266   MatRowMapKokkosViewHost Ti_h(NoInit("Ti"), n + 1);
267   MatRowMapType          *Ti = Ti_h.data();
268   MatColIdxKokkosViewHost Tj_h(NoInit("Tj"), nz);
269   MatRowMapKokkosViewHost perm_h(NoInit("permutation"), nz);
270   PetscInt               *Tj   = Tj_h.data();
271   PetscInt               *perm = perm_h.data();
272   PetscInt               *offset;
273 
274   PetscFunctionBegin;
275   // Populate Ti
276   PetscCallCXX(Kokkos::deep_copy(Ti_h, 0));
277   Ti++;
278   for (PetscInt i = 0; i < nz; i++) Ti[Aj[i]]++;
279   Ti--;
280   for (PetscInt i = 0; i < n; i++) Ti[i + 1] += Ti[i];
281 
282   // Populate Tj and the permutation array
283   PetscCall(PetscCalloc1(n, &offset)); // offset in each T row to fill in its column indices
284   for (PetscInt i = 0; i < m; i++) {
285     for (PetscInt j = Ai[i]; j < Ai[i + 1]; j++) { // A's (i,j) is T's (j,i)
286       PetscInt r    = Aj[j];                       // row r of T
287       PetscInt disp = Ti[r] + offset[r];
288 
289       Tj[disp]   = i; // col i of T
290       perm[disp] = j;
291       offset[r]++;
292     }
293   }
294   PetscCall(PetscFree(offset));
295 
296   // Sort each row of T, along with the permutation array
297   for (PetscInt i = 0; i < n; i++) PetscCall(PetscSortIntWithArray(Ti[i + 1] - Ti[i], Tj + Ti[i], perm + Ti[i]));
298 
299   // Output perm and T on device
300   auto Ti_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), Ti_h);
301   auto Tj_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), Tj_h);
302   PetscCallCXX(T_d = KokkosCsrMatrix("csrmatT", n, m, nz, MatScalarKokkosView("Ta", nz), Ti_d, Tj_d));
303   PetscCallCXX(perm_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), perm_h));
304   PetscFunctionReturn(PETSC_SUCCESS);
305 }
306 
307 // Generate the transpose on device and cache it internally
308 // Note: KK transpose_matrix() does not have support symbolic/numeric transpose, so we do it on our own
309 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGenerateTranspose_Private(Mat A, KokkosCsrMatrix *csrmatT)
310 {
311   Mat_SeqAIJ       *aseq = static_cast<Mat_SeqAIJ *>(A->data);
312   Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
313   PetscInt          nz = aseq->nz, m = A->rmap->N, n = A->cmap->n;
314   KokkosCsrMatrix  &T = akok->csrmatT;
315 
316   PetscFunctionBegin;
317   PetscCheck(akok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
318   PetscCall(KokkosDualViewSyncDevice(akok->a_dual, PetscGetKokkosExecutionSpace())); // Sync A's values since we are going to access them on device
319 
320   const auto &Aa = akok->a_dual.view_device();
321 
322   if (A->symmetric == PETSC_BOOL3_TRUE) {
323     *csrmatT = akok->csrmat;
324   } else {
325     // See if we already have a cached transpose and its value is up to date
326     if (T.numRows() == n && T.numCols() == m) {  // this indicates csrmatT had been generated before, otherwise T has 0 rows/cols after construction
327       if (!akok->transpose_updated) {            // if the value is out of date, update the cached version
328         const auto &perm = akok->transpose_perm; // get the permutation array
329         auto       &Ta   = T.values;
330 
331         PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { Ta(i) = Aa(perm(i)); }));
332       }
333     } else { // Generate T of size n x m for the first time
334       MatRowMapKokkosView perm;
335 
336       PetscCall(MatSeqAIJKokkosGenerateTransposeStructure(A, perm, T));
337       akok->transpose_perm = perm; // cache the perm in this matrix for reuse
338       PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { T.values(i) = Aa(perm(i)); }));
339     }
340     akok->transpose_updated = PETSC_TRUE;
341     *csrmatT                = akok->csrmatT;
342   }
343   PetscFunctionReturn(PETSC_SUCCESS);
344 }
345 
346 // Generate the Hermitian on device and cache it internally
347 static PetscErrorCode MatSeqAIJKokkosGenerateHermitian_Private(Mat A, KokkosCsrMatrix *csrmatH)
348 {
349   Mat_SeqAIJ       *aseq = static_cast<Mat_SeqAIJ *>(A->data);
350   Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
351   PetscInt          nz = aseq->nz, m = A->rmap->N, n = A->cmap->n;
352   KokkosCsrMatrix  &T = akok->csrmatH;
353 
354   PetscFunctionBegin;
355   PetscCheck(akok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
356   PetscCall(KokkosDualViewSyncDevice(akok->a_dual, PetscGetKokkosExecutionSpace())); // Sync A's values since we are going to access them on device
357 
358   const auto &Aa = akok->a_dual.view_device();
359 
360   if (A->hermitian == PETSC_BOOL3_TRUE) {
361     *csrmatH = akok->csrmat;
362   } else {
363     // See if we already have a cached hermitian and its value is up to date
364     if (T.numRows() == n && T.numCols() == m) {  // this indicates csrmatT had been generated before, otherwise T has 0 rows/cols after construction
365       if (!akok->hermitian_updated) {            // if the value is out of date, update the cached version
366         const auto &perm = akok->transpose_perm; // get the permutation array
367         auto       &Ta   = T.values;
368 
369         PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { Ta(i) = PetscConj(Aa(perm(i))); }));
370       }
371     } else { // Generate T of size n x m for the first time
372       MatRowMapKokkosView perm;
373 
374       PetscCall(MatSeqAIJKokkosGenerateTransposeStructure(A, perm, T));
375       akok->transpose_perm = perm; // cache the perm in this matrix for reuse
376       PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { T.values(i) = PetscConj(Aa(perm(i))); }));
377     }
378     akok->hermitian_updated = PETSC_TRUE;
379     *csrmatH                = akok->csrmatH;
380   }
381   PetscFunctionReturn(PETSC_SUCCESS);
382 }
383 
384 /* y = A x */
385 static PetscErrorCode MatMult_SeqAIJKokkos(Mat A, Vec xx, Vec yy)
386 {
387   Mat_SeqAIJKokkos          *aijkok;
388   ConstPetscScalarKokkosView xv;
389   PetscScalarKokkosView      yv;
390 
391   PetscFunctionBegin;
392   PetscCall(PetscLogGpuTimeBegin());
393   PetscCall(MatSeqAIJKokkosSyncDevice(A));
394   PetscCall(VecGetKokkosView(xx, &xv));
395   PetscCall(VecGetKokkosViewWrite(yy, &yv));
396   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
397   PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), "N", 1.0 /*alpha*/, aijkok->csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A x + beta y */
398   PetscCall(VecRestoreKokkosView(xx, &xv));
399   PetscCall(VecRestoreKokkosViewWrite(yy, &yv));
400   /* 2.0*nnz - numRows seems more accurate here but assumes there are no zero-rows. So a little sloppy here. */
401   PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz()));
402   PetscCall(PetscLogGpuTimeEnd());
403   PetscFunctionReturn(PETSC_SUCCESS);
404 }
405 
406 /* y = A^T x */
407 static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy)
408 {
409   Mat_SeqAIJKokkos          *aijkok;
410   const char                *mode;
411   ConstPetscScalarKokkosView xv;
412   PetscScalarKokkosView      yv;
413   KokkosCsrMatrix            csrmat;
414 
415   PetscFunctionBegin;
416   PetscCall(PetscLogGpuTimeBegin());
417   PetscCall(MatSeqAIJKokkosSyncDevice(A));
418   PetscCall(VecGetKokkosView(xx, &xv));
419   PetscCall(VecGetKokkosViewWrite(yy, &yv));
420   if (A->form_explicit_transpose) {
421     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat));
422     mode = "N";
423   } else {
424     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
425     csrmat = aijkok->csrmat;
426     mode   = "T";
427   }
428   PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^T x + beta y */
429   PetscCall(VecRestoreKokkosView(xx, &xv));
430   PetscCall(VecRestoreKokkosViewWrite(yy, &yv));
431   PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
432   PetscCall(PetscLogGpuTimeEnd());
433   PetscFunctionReturn(PETSC_SUCCESS);
434 }
435 
436 /* y = A^H x */
437 static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy)
438 {
439   Mat_SeqAIJKokkos          *aijkok;
440   const char                *mode;
441   ConstPetscScalarKokkosView xv;
442   PetscScalarKokkosView      yv;
443   KokkosCsrMatrix            csrmat;
444 
445   PetscFunctionBegin;
446   PetscCall(PetscLogGpuTimeBegin());
447   PetscCall(MatSeqAIJKokkosSyncDevice(A));
448   PetscCall(VecGetKokkosView(xx, &xv));
449   PetscCall(VecGetKokkosViewWrite(yy, &yv));
450   if (A->form_explicit_transpose) {
451     PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat));
452     mode = "N";
453   } else {
454     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
455     csrmat = aijkok->csrmat;
456     mode   = "C";
457   }
458   PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^H x + beta y */
459   PetscCall(VecRestoreKokkosView(xx, &xv));
460   PetscCall(VecRestoreKokkosViewWrite(yy, &yv));
461   PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
462   PetscCall(PetscLogGpuTimeEnd());
463   PetscFunctionReturn(PETSC_SUCCESS);
464 }
465 
466 /* z = A x + y */
467 static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz)
468 {
469   Mat_SeqAIJKokkos          *aijkok;
470   ConstPetscScalarKokkosView xv;
471   PetscScalarKokkosView      zv;
472 
473   PetscFunctionBegin;
474   PetscCall(PetscLogGpuTimeBegin());
475   PetscCall(MatSeqAIJKokkosSyncDevice(A));
476   if (zz != yy) PetscCall(VecCopy(yy, zz)); // depending on yy's sync flags, zz might get its latest data on host
477   PetscCall(VecGetKokkosView(xx, &xv));
478   PetscCall(VecGetKokkosView(zz, &zv)); // do after VecCopy(yy, zz) to get the latest data on device
479   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
480   PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), "N", 1.0 /*alpha*/, aijkok->csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A x + beta z */
481   PetscCall(VecRestoreKokkosView(xx, &xv));
482   PetscCall(VecRestoreKokkosView(zz, &zv));
483   PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz()));
484   PetscCall(PetscLogGpuTimeEnd());
485   PetscFunctionReturn(PETSC_SUCCESS);
486 }
487 
488 /* z = A^T x + y */
489 static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz)
490 {
491   Mat_SeqAIJKokkos          *aijkok;
492   const char                *mode;
493   ConstPetscScalarKokkosView xv;
494   PetscScalarKokkosView      zv;
495   KokkosCsrMatrix            csrmat;
496 
497   PetscFunctionBegin;
498   PetscCall(PetscLogGpuTimeBegin());
499   PetscCall(MatSeqAIJKokkosSyncDevice(A));
500   if (zz != yy) PetscCall(VecCopy(yy, zz));
501   PetscCall(VecGetKokkosView(xx, &xv));
502   PetscCall(VecGetKokkosView(zz, &zv));
503   if (A->form_explicit_transpose) {
504     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat));
505     mode = "N";
506   } else {
507     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
508     csrmat = aijkok->csrmat;
509     mode   = "T";
510   }
511   PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^T x + beta z */
512   PetscCall(VecRestoreKokkosView(xx, &xv));
513   PetscCall(VecRestoreKokkosView(zz, &zv));
514   PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
515   PetscCall(PetscLogGpuTimeEnd());
516   PetscFunctionReturn(PETSC_SUCCESS);
517 }
518 
519 /* z = A^H x + y */
520 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz)
521 {
522   Mat_SeqAIJKokkos          *aijkok;
523   const char                *mode;
524   ConstPetscScalarKokkosView xv;
525   PetscScalarKokkosView      zv;
526   KokkosCsrMatrix            csrmat;
527 
528   PetscFunctionBegin;
529   PetscCall(PetscLogGpuTimeBegin());
530   PetscCall(MatSeqAIJKokkosSyncDevice(A));
531   if (zz != yy) PetscCall(VecCopy(yy, zz));
532   PetscCall(VecGetKokkosView(xx, &xv));
533   PetscCall(VecGetKokkosView(zz, &zv));
534   if (A->form_explicit_transpose) {
535     PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat));
536     mode = "N";
537   } else {
538     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
539     csrmat = aijkok->csrmat;
540     mode   = "C";
541   }
542   PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^H x + beta z */
543   PetscCall(VecRestoreKokkosView(xx, &xv));
544   PetscCall(VecRestoreKokkosView(zz, &zv));
545   PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
546   PetscCall(PetscLogGpuTimeEnd());
547   PetscFunctionReturn(PETSC_SUCCESS);
548 }
549 
550 static PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A, MatOption op, PetscBool flg)
551 {
552   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
553 
554   PetscFunctionBegin;
555   switch (op) {
556   case MAT_FORM_EXPLICIT_TRANSPOSE:
557     /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
558     if (A->form_explicit_transpose && !flg && aijkok) PetscCall(aijkok->DestroyMatTranspose());
559     A->form_explicit_transpose = flg;
560     break;
561   default:
562     PetscCall(MatSetOption_SeqAIJ(A, op, flg));
563     break;
564   }
565   PetscFunctionReturn(PETSC_SUCCESS);
566 }
567 
568 /* Depending on reuse, either build a new mat, or use the existing mat */
569 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat *newmat)
570 {
571   Mat_SeqAIJ *aseq;
572 
573   PetscFunctionBegin;
574   PetscCall(PetscKokkosInitializeCheck());
575   if (reuse == MAT_INITIAL_MATRIX) { /* Build a brand new mat */
576     PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat));
577     PetscCall(MatSetType(*newmat, mtype));
578   } else if (reuse == MAT_REUSE_MATRIX) {                 /* Reuse the mat created before */
579     PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN)); /* newmat is already a SeqAIJKokkos */
580   } else if (reuse == MAT_INPLACE_MATRIX) {               /* newmat is A */
581     PetscCheck(A == *newmat, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "A != *newmat with MAT_INPLACE_MATRIX");
582     PetscCall(PetscFree(A->defaultvectype));
583     PetscCall(PetscStrallocpy(VECKOKKOS, &A->defaultvectype)); /* Allocate and copy the string */
584     PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATSEQAIJKOKKOS));
585     PetscCall(MatSetOps_SeqAIJKokkos(A));
586     aseq = static_cast<Mat_SeqAIJ *>(A->data);
587     if (A->assembled) { /* Copy i, j (but not values) to device for an assembled matrix if not yet */
588       PetscCheck(!A->spptr, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Expect NULL (Mat_SeqAIJKokkos*)A->spptr");
589       A->spptr = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aseq, A->nonzerostate, PETSC_FALSE);
590     }
591   }
592   PetscFunctionReturn(PETSC_SUCCESS);
593 }
594 
595 /* MatDuplicate always creates a new matrix. MatDuplicate can be called either on an assembled matrix or
596    an unassembled matrix, even though MAT_COPY_VALUES is not allowed for unassembled matrix.
597  */
598 static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A, MatDuplicateOption dupOption, Mat *B)
599 {
600   Mat_SeqAIJ       *bseq;
601   Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr), *bkok;
602   Mat               mat;
603 
604   PetscFunctionBegin;
605   /* Do not copy values on host as A's latest values might be on device. We don't want to do sync blindly */
606   PetscCall(MatDuplicate_SeqAIJ(A, MAT_DO_NOT_COPY_VALUES, B));
607   mat = *B;
608   if (A->assembled) {
609     bseq = static_cast<Mat_SeqAIJ *>(mat->data);
610     bkok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, bseq, mat->nonzerostate, PETSC_FALSE);
611     bkok->a_dual.clear_sync_state(); /* Clear B's sync state as it will be decided below */
612     /* Now copy values to B if needed */
613     if (dupOption == MAT_COPY_VALUES) {
614       if (akok->a_dual.need_sync_device()) {
615         Kokkos::deep_copy(bkok->a_dual.view_host(), akok->a_dual.view_host());
616         bkok->a_dual.modify_host();
617       } else { /* If device has the latest data, we only copy data on device */
618         Kokkos::deep_copy(bkok->a_dual.view_device(), akok->a_dual.view_device());
619         bkok->a_dual.modify_device();
620       }
621     } else { /* MAT_DO_NOT_COPY_VALUES or MAT_SHARE_NONZERO_PATTERN. B's values should be zeroed */
622       /* B's values on host should be already zeroed by MatDuplicate_SeqAIJ() */
623       bkok->a_dual.modify_host();
624     }
625     mat->spptr = bkok;
626   }
627 
628   PetscCall(PetscFree(mat->defaultvectype));
629   PetscCall(PetscStrallocpy(VECKOKKOS, &mat->defaultvectype)); /* Allocate and copy the string */
630   PetscCall(PetscObjectChangeTypeName((PetscObject)mat, MATSEQAIJKOKKOS));
631   PetscCall(MatSetOps_SeqAIJKokkos(mat));
632   PetscFunctionReturn(PETSC_SUCCESS);
633 }
634 
635 static PetscErrorCode MatTranspose_SeqAIJKokkos(Mat A, MatReuse reuse, Mat *B)
636 {
637   Mat               At;
638   KokkosCsrMatrix   internT;
639   Mat_SeqAIJKokkos *atkok, *bkok;
640 
641   PetscFunctionBegin;
642   if (reuse == MAT_REUSE_MATRIX) PetscCall(MatTransposeCheckNonzeroState_Private(A, *B));
643   PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &internT)); /* Generate a transpose internally */
644   if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_INPLACE_MATRIX) {
645     /* Deep copy internT, as we want to isolate the internal transpose */
646     PetscCallCXX(atkok = new Mat_SeqAIJKokkos(KokkosCsrMatrix("csrmat", internT)));
647     PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PetscObjectComm((PetscObject)A), atkok, &At));
648     if (reuse == MAT_INITIAL_MATRIX) *B = At;
649     else PetscCall(MatHeaderReplace(A, &At)); /* Replace A with At inplace */
650   } else {                                    /* MAT_REUSE_MATRIX, just need to copy values to B on device */
651     if ((*B)->assembled) {
652       bkok = static_cast<Mat_SeqAIJKokkos *>((*B)->spptr);
653       PetscCallCXX(Kokkos::deep_copy(bkok->a_dual.view_device(), internT.values));
654       PetscCall(MatSeqAIJKokkosModifyDevice(*B));
655     } else if ((*B)->preallocated) { /* It is ok for B to be only preallocated, as needed in MatTranspose_MPIAIJ */
656       Mat_SeqAIJ             *bseq = static_cast<Mat_SeqAIJ *>((*B)->data);
657       MatScalarKokkosViewHost a_h(bseq->a, internT.nnz()); /* bseq->nz = 0 if unassembled */
658       MatColIdxKokkosViewHost j_h(bseq->j, internT.nnz());
659       PetscCallCXX(Kokkos::deep_copy(a_h, internT.values));
660       PetscCallCXX(Kokkos::deep_copy(j_h, internT.graph.entries));
661     } else SETERRQ(PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "B must be assembled or preallocated");
662   }
663   PetscFunctionReturn(PETSC_SUCCESS);
664 }
665 
666 static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A)
667 {
668   Mat_SeqAIJKokkos *aijkok;
669 
670   PetscFunctionBegin;
671   if (A->factortype == MAT_FACTOR_NONE) {
672     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
673     delete aijkok;
674   } else {
675     delete static_cast<Mat_SeqAIJKokkosTriFactors *>(A->spptr);
676   }
677   A->spptr = NULL;
678   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatFactorGetSolverType_C", NULL));
679   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
680   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
681 #if defined(PETSC_HAVE_HYPRE)
682   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijkokkos_hypre_C", NULL));
683 #endif
684   PetscCall(MatDestroy_SeqAIJ(A));
685   PetscFunctionReturn(PETSC_SUCCESS);
686 }
687 
688 /*MC
689    MATSEQAIJKOKKOS - MATAIJKOKKOS = "(seq)aijkokkos" - A matrix type to be used for sparse matrices with Kokkos
690 
691    A matrix type using Kokkos-Kernels CrsMatrix type for portability across different device types
692 
693    Options Database Key:
694 .  -mat_type aijkokkos - sets the matrix type to `MATSEQAIJKOKKOS` during a call to `MatSetFromOptions()`
695 
696   Level: beginner
697 
698 .seealso: [](ch_matrices), `Mat`, `MatCreateSeqAIJKokkos()`, `MATMPIAIJKOKKOS`
699 M*/
700 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A)
701 {
702   PetscFunctionBegin;
703   PetscCall(PetscKokkosInitializeCheck());
704   PetscCall(MatCreate_SeqAIJ(A));
705   PetscCall(MatConvert_SeqAIJ_SeqAIJKokkos(A, MATSEQAIJKOKKOS, MAT_INPLACE_MATRIX, &A));
706   PetscFunctionReturn(PETSC_SUCCESS);
707 }
708 
709 /* Merge A, B into a matrix C. A is put before B. C's size would be A->rmap->n by (A->cmap->n + B->cmap->n) */
710 PetscErrorCode MatSeqAIJKokkosMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C)
711 {
712   Mat_SeqAIJ         *a, *b;
713   Mat_SeqAIJKokkos   *akok, *bkok, *ckok;
714   MatScalarKokkosView aa, ba, ca;
715   MatRowMapKokkosView ai, bi, ci;
716   MatColIdxKokkosView aj, bj, cj;
717   PetscInt            m, n, nnz, aN;
718 
719   PetscFunctionBegin;
720   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
721   PetscValidHeaderSpecific(B, MAT_CLASSID, 2);
722   PetscAssertPointer(C, 4);
723   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
724   PetscCheckTypeName(B, MATSEQAIJKOKKOS);
725   PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n);
726   PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported");
727 
728   PetscCall(MatSeqAIJKokkosSyncDevice(A));
729   PetscCall(MatSeqAIJKokkosSyncDevice(B));
730   a    = static_cast<Mat_SeqAIJ *>(A->data);
731   b    = static_cast<Mat_SeqAIJ *>(B->data);
732   akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
733   bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
734   aa   = akok->a_dual.view_device();
735   ai   = akok->i_dual.view_device();
736   ba   = bkok->a_dual.view_device();
737   bi   = bkok->i_dual.view_device();
738   m    = A->rmap->n; /* M, N and nnz of C */
739   n    = A->cmap->n + B->cmap->n;
740   nnz  = a->nz + b->nz;
741   aN   = A->cmap->n; /* N of A */
742   if (reuse == MAT_INITIAL_MATRIX) {
743     aj      = akok->j_dual.view_device();
744     bj      = bkok->j_dual.view_device();
745     auto ca = MatScalarKokkosView("a", aa.extent(0) + ba.extent(0));
746     auto ci = MatRowMapKokkosView("i", ai.extent(0));
747     auto cj = MatColIdxKokkosView("j", aj.extent(0) + bj.extent(0));
748 
749     /* Concatenate A and B in parallel using Kokkos hierarchical parallelism */
750     Kokkos::parallel_for(
751       Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
752         PetscInt i       = t.league_rank(); /* row i */
753         PetscInt coffset = ai(i) + bi(i), alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i);
754 
755         Kokkos::single(Kokkos::PerTeam(t), [=]() { /* this side effect only happens once per whole team */
756                                                    ci(i) = coffset;
757                                                    if (i == m - 1) ci(m) = ai(m) + bi(m);
758         });
759 
760         Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) {
761           if (k < alen) {
762             ca(coffset + k) = aa(ai(i) + k);
763             cj(coffset + k) = aj(ai(i) + k);
764           } else {
765             ca(coffset + k) = ba(bi(i) + k - alen);
766             cj(coffset + k) = bj(bi(i) + k - alen) + aN; /* Entries in B get new column indices in C */
767           }
768         });
769       });
770     PetscCallCXX(ckok = new Mat_SeqAIJKokkos(m, n, nnz, ci, cj, ca));
771     PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, ckok, C));
772   } else if (reuse == MAT_REUSE_MATRIX) {
773     PetscValidHeaderSpecific(*C, MAT_CLASSID, 4);
774     PetscCheckTypeName(*C, MATSEQAIJKOKKOS);
775     ckok = static_cast<Mat_SeqAIJKokkos *>((*C)->spptr);
776     ca   = ckok->a_dual.view_device();
777     ci   = ckok->i_dual.view_device();
778 
779     Kokkos::parallel_for(
780       Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
781         PetscInt i    = t.league_rank(); /* row i */
782         PetscInt alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i);
783         Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) {
784           if (k < alen) ca(ci(i) + k) = aa(ai(i) + k);
785           else ca(ci(i) + k) = ba(bi(i) + k - alen);
786         });
787       });
788     PetscCall(MatSeqAIJKokkosModifyDevice(*C));
789   }
790   PetscFunctionReturn(PETSC_SUCCESS);
791 }
792 
793 static PetscErrorCode MatProductDataDestroy_SeqAIJKokkos(void *pdata)
794 {
795   PetscFunctionBegin;
796   delete static_cast<MatProductData_SeqAIJKokkos *>(pdata);
797   PetscFunctionReturn(PETSC_SUCCESS);
798 }
799 
800 static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C)
801 {
802   Mat_Product                 *product = C->product;
803   Mat                          A, B;
804   bool                         transA, transB; /* use bool, since KK needs this type */
805   Mat_SeqAIJKokkos            *akok, *bkok, *ckok;
806   Mat_SeqAIJ                  *c;
807   MatProductData_SeqAIJKokkos *pdata;
808   KokkosCsrMatrix              csrmatA, csrmatB;
809 
810   PetscFunctionBegin;
811   MatCheckProduct(C, 1);
812   PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Product data empty");
813   pdata = static_cast<MatProductData_SeqAIJKokkos *>(C->product->data);
814 
815   // See if numeric has already been done in symbolic (e.g., user calls MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C)).
816   // If yes, skip the numeric, but reset the flag so that next time when user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C),
817   // we still do numeric.
818   if (pdata->reusesym) { // numeric reuses results from symbolic
819     pdata->reusesym = PETSC_FALSE;
820     PetscFunctionReturn(PETSC_SUCCESS);
821   }
822 
823   switch (product->type) {
824   case MATPRODUCT_AB:
825     transA = false;
826     transB = false;
827     break;
828   case MATPRODUCT_AtB:
829     transA = true;
830     transB = false;
831     break;
832   case MATPRODUCT_ABt:
833     transA = false;
834     transB = true;
835     break;
836   default:
837     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]);
838   }
839 
840   A = product->A;
841   B = product->B;
842   PetscCall(MatSeqAIJKokkosSyncDevice(A));
843   PetscCall(MatSeqAIJKokkosSyncDevice(B));
844   akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
845   bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
846   ckok = static_cast<Mat_SeqAIJKokkos *>(C->spptr);
847 
848   PetscCheck(ckok, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Device data structure spptr is empty");
849 
850   csrmatA = akok->csrmat;
851   csrmatB = bkok->csrmat;
852 
853   /* TODO: Once KK spgemm implements transpose, we can get rid of the explicit transpose here */
854   if (transA) {
855     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA));
856     transA = false;
857   }
858 
859   if (transB) {
860     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB));
861     transB = false;
862   }
863   PetscCall(PetscLogGpuTimeBegin());
864   PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, csrmatA, transA, csrmatB, transB, ckok->csrmat));
865 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(4, 0, 0)
866   auto spgemmHandle = pdata->kh.get_spgemm_handle();
867   if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(ckok->csrmat)); /* without sort, mat_tests-ex62_14_seqaijkokkos fails */
868 #endif
869 
870   PetscCall(PetscLogGpuTimeEnd());
871   PetscCall(MatSeqAIJKokkosModifyDevice(C));
872   /* shorter version of MatAssemblyEnd_SeqAIJ */
873   c = (Mat_SeqAIJ *)C->data;
874   PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz));
875   PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n"));
876   PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax));
877   c->reallocs         = 0;
878   C->info.mallocs     = 0;
879   C->info.nz_unneeded = 0;
880   C->assembled = C->was_assembled = PETSC_TRUE;
881   C->num_ass++;
882   PetscFunctionReturn(PETSC_SUCCESS);
883 }
884 
885 static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C)
886 {
887   Mat_Product                 *product = C->product;
888   MatProductType               ptype;
889   Mat                          A, B;
890   bool                         transA, transB;
891   Mat_SeqAIJKokkos            *akok, *bkok, *ckok;
892   MatProductData_SeqAIJKokkos *pdata;
893   MPI_Comm                     comm;
894   KokkosCsrMatrix              csrmatA, csrmatB, csrmatC;
895 
896   PetscFunctionBegin;
897   MatCheckProduct(C, 1);
898   PetscCall(PetscObjectGetComm((PetscObject)C, &comm));
899   PetscCheck(!product->data, comm, PETSC_ERR_PLIB, "Product data not empty");
900   A = product->A;
901   B = product->B;
902   PetscCall(MatSeqAIJKokkosSyncDevice(A));
903   PetscCall(MatSeqAIJKokkosSyncDevice(B));
904   akok    = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
905   bkok    = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
906   csrmatA = akok->csrmat;
907   csrmatB = bkok->csrmat;
908 
909   ptype = product->type;
910   // Take advantage of the symmetry if true
911   if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
912     ptype                                          = MATPRODUCT_AB;
913     product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE;
914   }
915   if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
916     ptype                                          = MATPRODUCT_AB;
917     product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE;
918   }
919 
920   switch (ptype) {
921   case MATPRODUCT_AB:
922     transA = false;
923     transB = false;
924     PetscCall(MatSetBlockSizesFromMats(C, A, B));
925     break;
926   case MATPRODUCT_AtB:
927     transA = true;
928     transB = false;
929     if (A->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, A->cmap->bs));
930     if (B->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->cmap->bs));
931     break;
932   case MATPRODUCT_ABt:
933     transA = false;
934     transB = true;
935     if (A->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, A->rmap->bs));
936     if (B->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->rmap->bs));
937     break;
938   default:
939     SETERRQ(comm, PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]);
940   }
941   PetscCallCXX(product->data = pdata = new MatProductData_SeqAIJKokkos());
942   pdata->reusesym = product->api_user;
943 
944   /* TODO: add command line options to select spgemm algorithms */
945   auto spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_DEFAULT; /* default alg is TPL if enabled, otherwise KK */
946 
947   /* CUDA-10.2's spgemm has bugs. We prefer the SpGEMMreuse APIs introduced in cuda-11.4 */
948 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
949   #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
950   spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_KK;
951   #endif
952 #endif
953   PetscCallCXX(pdata->kh.create_spgemm_handle(spgemm_alg));
954 
955   PetscCall(PetscLogGpuTimeBegin());
956   /* TODO: Get rid of the explicit transpose once KK-spgemm implements the transpose option */
957   if (transA) {
958     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA));
959     transA = false;
960   }
961 
962   if (transB) {
963     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB));
964     transB = false;
965   }
966 
967   PetscCallCXX(KokkosSparse::spgemm_symbolic(pdata->kh, csrmatA, transA, csrmatB, transB, csrmatC));
968   /* spgemm_symbolic() only populates C's rowmap, but not C's column indices.
969     So we have to do a fake spgemm_numeric() here to get csrmatC.j_d setup, before
970     calling new Mat_SeqAIJKokkos().
971     TODO: Remove the fake spgemm_numeric() after KK fixed this problem.
972   */
973   PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, csrmatA, transA, csrmatB, transB, csrmatC));
974 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(4, 0, 0)
975   /* Query if KK outputs a sorted matrix. If not, we need to sort it */
976   auto spgemmHandle = pdata->kh.get_spgemm_handle();
977   if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(csrmatC)); /* sort_option defaults to -1 in KK!*/
978 #endif
979   PetscCall(PetscLogGpuTimeEnd());
980 
981   PetscCallCXX(ckok = new Mat_SeqAIJKokkos(csrmatC));
982   PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(C, ckok));
983   C->product->destroy = MatProductDataDestroy_SeqAIJKokkos;
984   PetscFunctionReturn(PETSC_SUCCESS);
985 }
986 
987 /* handles sparse matrix matrix ops */
988 static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat)
989 {
990   Mat_Product *product = mat->product;
991   PetscBool    Biskok = PETSC_FALSE, Ciskok = PETSC_TRUE;
992 
993   PetscFunctionBegin;
994   MatCheckProduct(mat, 1);
995   PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJKOKKOS, &Biskok));
996   if (product->type == MATPRODUCT_ABC) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJKOKKOS, &Ciskok));
997   if (Biskok && Ciskok) {
998     switch (product->type) {
999     case MATPRODUCT_AB:
1000     case MATPRODUCT_AtB:
1001     case MATPRODUCT_ABt:
1002       mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos;
1003       break;
1004     case MATPRODUCT_PtAP:
1005     case MATPRODUCT_RARt:
1006     case MATPRODUCT_ABC:
1007       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
1008       break;
1009     default:
1010       break;
1011     }
1012   } else { /* fallback for AIJ */
1013     PetscCall(MatProductSetFromOptions_SeqAIJ(mat));
1014   }
1015   PetscFunctionReturn(PETSC_SUCCESS);
1016 }
1017 
1018 static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a)
1019 {
1020   Mat_SeqAIJKokkos *aijkok;
1021 
1022   PetscFunctionBegin;
1023   PetscCall(PetscLogGpuTimeBegin());
1024   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1025   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1026   KokkosBlas::scal(PetscGetKokkosExecutionSpace(), aijkok->a_dual.view_device(), a, aijkok->a_dual.view_device());
1027   PetscCall(MatSeqAIJKokkosModifyDevice(A));
1028   PetscCall(PetscLogGpuFlops(aijkok->a_dual.extent(0)));
1029   PetscCall(PetscLogGpuTimeEnd());
1030   PetscFunctionReturn(PETSC_SUCCESS);
1031 }
1032 
1033 // add a to A's diagonal (if A is square) or main diagonal (if A is rectangular)
1034 static PetscErrorCode MatShift_SeqAIJKokkos(Mat A, PetscScalar a)
1035 {
1036   Mat_SeqAIJ *aijseq = static_cast<Mat_SeqAIJ *>(A->data);
1037 
1038   PetscFunctionBegin;
1039   if (A->assembled && aijseq->diagonaldense) { // no missing diagonals
1040     PetscInt n = PetscMin(A->rmap->n, A->cmap->n);
1041 
1042     PetscCall(PetscLogGpuTimeBegin());
1043     PetscCall(MatSeqAIJKokkosSyncDevice(A));
1044     const auto  aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1045     const auto &Aa     = aijkok->a_dual.view_device();
1046     const auto &Adiag  = aijkok->diag_dual.view_device();
1047     PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) { Aa(Adiag(i)) += a; }));
1048     PetscCall(MatSeqAIJKokkosModifyDevice(A));
1049     PetscCall(PetscLogGpuFlops(n));
1050     PetscCall(PetscLogGpuTimeEnd());
1051   } else { // need reassembly, very slow!
1052     PetscCall(MatShift_Basic(A, a));
1053   }
1054   PetscFunctionReturn(PETSC_SUCCESS);
1055 }
1056 
1057 static PetscErrorCode MatDiagonalSet_SeqAIJKokkos(Mat Y, Vec D, InsertMode is)
1058 {
1059   Mat_SeqAIJ *aijseq = static_cast<Mat_SeqAIJ *>(Y->data);
1060 
1061   PetscFunctionBegin;
1062   if (Y->assembled && aijseq->diagonaldense) { // no missing diagonals
1063     ConstPetscScalarKokkosView dv;
1064     PetscInt                   n, nv;
1065 
1066     PetscCall(PetscLogGpuTimeBegin());
1067     PetscCall(MatSeqAIJKokkosSyncDevice(Y));
1068     PetscCall(VecGetKokkosView(D, &dv));
1069     PetscCall(VecGetLocalSize(D, &nv));
1070     n = PetscMin(Y->rmap->n, Y->cmap->n);
1071     PetscCheck(n == nv, PetscObjectComm((PetscObject)Y), PETSC_ERR_ARG_SIZ, "Matrix size and vector size do not match");
1072 
1073     const auto  aijkok = static_cast<Mat_SeqAIJKokkos *>(Y->spptr);
1074     const auto &Aa     = aijkok->a_dual.view_device();
1075     const auto &Adiag  = aijkok->diag_dual.view_device();
1076     PetscCallCXX(Kokkos::parallel_for(
1077       Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) {
1078         if (is == INSERT_VALUES) Aa(Adiag(i)) = dv(i);
1079         else Aa(Adiag(i)) += dv(i);
1080       }));
1081     PetscCall(VecRestoreKokkosView(D, &dv));
1082     PetscCall(MatSeqAIJKokkosModifyDevice(Y));
1083     PetscCall(PetscLogGpuFlops(n));
1084     PetscCall(PetscLogGpuTimeEnd());
1085   } else { // need reassembly, very slow!
1086     PetscCall(MatDiagonalSet_Default(Y, D, is));
1087   }
1088   PetscFunctionReturn(PETSC_SUCCESS);
1089 }
1090 
1091 static PetscErrorCode MatDiagonalScale_SeqAIJKokkos(Mat A, Vec ll, Vec rr)
1092 {
1093   Mat_SeqAIJ                *aijseq = static_cast<Mat_SeqAIJ *>(A->data);
1094   PetscInt                   m = A->rmap->n, n = A->cmap->n, nz = aijseq->nz;
1095   ConstPetscScalarKokkosView lv, rv;
1096 
1097   PetscFunctionBegin;
1098   PetscCall(PetscLogGpuTimeBegin());
1099   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1100   const auto  aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1101   const auto &Aa     = aijkok->a_dual.view_device();
1102   const auto &Ai     = aijkok->i_dual.view_device();
1103   const auto &Aj     = aijkok->j_dual.view_device();
1104   if (ll) {
1105     PetscCall(VecGetLocalSize(ll, &m));
1106     PetscCheck(m == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling vector wrong length");
1107     PetscCall(VecGetKokkosView(ll, &lv));
1108     PetscCallCXX(Kokkos::parallel_for( // for each row
1109       Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
1110         PetscInt i   = t.league_rank(); // row i
1111         PetscInt len = Ai(i + 1) - Ai(i);
1112         // scale entries on the row
1113         Kokkos::parallel_for(Kokkos::TeamThreadRange(t, len), [&](PetscInt j) { Aa(Ai(i) + j) *= lv(i); });
1114       }));
1115     PetscCall(VecRestoreKokkosView(ll, &lv));
1116     PetscCall(PetscLogGpuFlops(nz));
1117   }
1118   if (rr) {
1119     PetscCall(VecGetLocalSize(rr, &n));
1120     PetscCheck(n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Right scaling vector wrong length");
1121     PetscCall(VecGetKokkosView(rr, &rv));
1122     PetscCallCXX(Kokkos::parallel_for( // for each nonzero
1123       Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt k) { Aa(k) *= rv(Aj(k)); }));
1124     PetscCall(VecRestoreKokkosView(rr, &lv));
1125     PetscCall(PetscLogGpuFlops(nz));
1126   }
1127   PetscCall(MatSeqAIJKokkosModifyDevice(A));
1128   PetscCall(PetscLogGpuTimeEnd());
1129   PetscFunctionReturn(PETSC_SUCCESS);
1130 }
1131 
1132 static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A)
1133 {
1134   Mat_SeqAIJKokkos *aijkok;
1135 
1136   PetscFunctionBegin;
1137   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1138   if (aijkok) { /* Only zero the device if data is already there */
1139     KokkosBlas::fill(PetscGetKokkosExecutionSpace(), aijkok->a_dual.view_device(), 0.0);
1140     PetscCall(MatSeqAIJKokkosModifyDevice(A));
1141   } else { /* Might be preallocated but not assembled */
1142     PetscCall(MatZeroEntries_SeqAIJ(A));
1143   }
1144   PetscFunctionReturn(PETSC_SUCCESS);
1145 }
1146 
1147 static PetscErrorCode MatGetDiagonal_SeqAIJKokkos(Mat A, Vec x)
1148 {
1149   Mat_SeqAIJKokkos     *aijkok;
1150   PetscInt              n;
1151   PetscScalarKokkosView xv;
1152 
1153   PetscFunctionBegin;
1154   PetscCall(VecGetLocalSize(x, &n));
1155   PetscCheck(n == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector");
1156   PetscCheck(A->factortype == MAT_FACTOR_NONE, PETSC_COMM_SELF, PETSC_ERR_SUP, "MatGetDiagonal_SeqAIJKokkos not supported on factored matrices");
1157 
1158   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1159   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1160 
1161   const auto &Aa    = aijkok->a_dual.view_device();
1162   const auto &Ai    = aijkok->i_dual.view_device();
1163   const auto &Adiag = aijkok->diag_dual.view_device();
1164 
1165   PetscCall(VecGetKokkosViewWrite(x, &xv));
1166   Kokkos::parallel_for(
1167     Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) {
1168       if (Adiag(i) < Ai(i + 1)) xv(i) = Aa(Adiag(i));
1169       else xv(i) = 0;
1170     });
1171   PetscCall(VecRestoreKokkosViewWrite(x, &xv));
1172   PetscFunctionReturn(PETSC_SUCCESS);
1173 }
1174 
1175 /* Get a Kokkos View from a mat of type MatSeqAIJKokkos */
1176 PetscErrorCode MatSeqAIJGetKokkosView(Mat A, ConstMatScalarKokkosView *kv)
1177 {
1178   Mat_SeqAIJKokkos *aijkok;
1179 
1180   PetscFunctionBegin;
1181   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1182   PetscAssertPointer(kv, 2);
1183   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1184   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1185   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1186   *kv    = aijkok->a_dual.view_device();
1187   PetscFunctionReturn(PETSC_SUCCESS);
1188 }
1189 
1190 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, ConstMatScalarKokkosView *kv)
1191 {
1192   PetscFunctionBegin;
1193   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1194   PetscAssertPointer(kv, 2);
1195   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1196   PetscFunctionReturn(PETSC_SUCCESS);
1197 }
1198 
1199 PetscErrorCode MatSeqAIJGetKokkosView(Mat A, MatScalarKokkosView *kv)
1200 {
1201   Mat_SeqAIJKokkos *aijkok;
1202 
1203   PetscFunctionBegin;
1204   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1205   PetscAssertPointer(kv, 2);
1206   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1207   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1208   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1209   *kv    = aijkok->a_dual.view_device();
1210   PetscFunctionReturn(PETSC_SUCCESS);
1211 }
1212 
1213 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, MatScalarKokkosView *kv)
1214 {
1215   PetscFunctionBegin;
1216   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1217   PetscAssertPointer(kv, 2);
1218   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1219   PetscCall(MatSeqAIJKokkosModifyDevice(A));
1220   PetscFunctionReturn(PETSC_SUCCESS);
1221 }
1222 
1223 PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A, MatScalarKokkosView *kv)
1224 {
1225   Mat_SeqAIJKokkos *aijkok;
1226 
1227   PetscFunctionBegin;
1228   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1229   PetscAssertPointer(kv, 2);
1230   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1231   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1232   *kv    = aijkok->a_dual.view_device();
1233   PetscFunctionReturn(PETSC_SUCCESS);
1234 }
1235 
1236 PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A, MatScalarKokkosView *kv)
1237 {
1238   PetscFunctionBegin;
1239   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1240   PetscAssertPointer(kv, 2);
1241   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1242   PetscCall(MatSeqAIJKokkosModifyDevice(A));
1243   PetscFunctionReturn(PETSC_SUCCESS);
1244 }
1245 
1246 PetscErrorCode MatCreateSeqAIJKokkosWithKokkosViews(MPI_Comm comm, PetscInt m, PetscInt n, Kokkos::View<PetscInt *> &i_d, Kokkos::View<PetscInt *> &j_d, Kokkos::View<PetscScalar *> &a_d, Mat *A)
1247 {
1248   Mat_SeqAIJKokkos *akok;
1249 
1250   PetscFunctionBegin;
1251   PetscCallCXX(akok = new Mat_SeqAIJKokkos(m, n, j_d.extent(0), i_d, j_d, a_d));
1252   PetscCall(MatCreate(comm, A));
1253   PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok));
1254   PetscFunctionReturn(PETSC_SUCCESS);
1255 }
1256 
1257 /* Computes Y += alpha X */
1258 static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y, PetscScalar alpha, Mat X, MatStructure pattern)
1259 {
1260   Mat_SeqAIJ              *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data;
1261   Mat_SeqAIJKokkos        *xkok, *ykok, *zkok;
1262   ConstMatScalarKokkosView Xa;
1263   MatScalarKokkosView      Ya;
1264   auto                     exec = PetscGetKokkosExecutionSpace();
1265 
1266   PetscFunctionBegin;
1267   PetscCheckTypeName(Y, MATSEQAIJKOKKOS);
1268   PetscCheckTypeName(X, MATSEQAIJKOKKOS);
1269   PetscCall(MatSeqAIJKokkosSyncDevice(Y));
1270   PetscCall(MatSeqAIJKokkosSyncDevice(X));
1271   PetscCall(PetscLogGpuTimeBegin());
1272 
1273   if (pattern != SAME_NONZERO_PATTERN && x->nz == y->nz) {
1274     PetscBool e;
1275     PetscCall(PetscArraycmp(x->i, y->i, Y->rmap->n + 1, &e));
1276     if (e) {
1277       PetscCall(PetscArraycmp(x->j, y->j, y->nz, &e));
1278       if (e) pattern = SAME_NONZERO_PATTERN;
1279     }
1280   }
1281 
1282   /* cusparseDcsrgeam2() computes C = alpha A + beta B. If one knew sparsity pattern of C, one can skip
1283     cusparseScsrgeam2_bufferSizeExt() / cusparseXcsrgeam2Nnz(), and directly call cusparseScsrgeam2().
1284     If X is SUBSET_NONZERO_PATTERN of Y, we could take advantage of this cusparse feature. However,
1285     KokkosSparse::spadd(alpha,A,beta,B,C) has symbolic and numeric phases, MatAXPY does not.
1286   */
1287   ykok = static_cast<Mat_SeqAIJKokkos *>(Y->spptr);
1288   xkok = static_cast<Mat_SeqAIJKokkos *>(X->spptr);
1289   Xa   = xkok->a_dual.view_device();
1290   Ya   = ykok->a_dual.view_device();
1291 
1292   if (pattern == SAME_NONZERO_PATTERN) {
1293     KokkosBlas::axpy(exec, alpha, Xa, Ya);
1294     PetscCall(MatSeqAIJKokkosModifyDevice(Y));
1295   } else if (pattern == SUBSET_NONZERO_PATTERN) {
1296     MatRowMapKokkosView Xi = xkok->i_dual.view_device(), Yi = ykok->i_dual.view_device();
1297     MatColIdxKokkosView Xj = xkok->j_dual.view_device(), Yj = ykok->j_dual.view_device();
1298 
1299     Kokkos::parallel_for(
1300       Kokkos::TeamPolicy<>(exec, Y->rmap->n, 1), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
1301         PetscInt i = t.league_rank(); // row i
1302         Kokkos::single(Kokkos::PerTeam(t), [=]() {
1303           // Only one thread works in a team
1304           PetscInt p, q = Yi(i);
1305           for (p = Xi(i); p < Xi(i + 1); p++) {          // For each nonzero on row i of X,
1306             while (Xj(p) != Yj(q) && q < Yi(i + 1)) q++; // find the matching nonzero on row i of Y.
1307             if (Xj(p) == Yj(q)) {                        // Found it
1308               Ya(q) += alpha * Xa(p);
1309               q++;
1310             } else {
1311             // If not found, it indicates the input is wrong (X is not a SUBSET_NONZERO_PATTERN of Y).
1312             // Just insert a NaN at the beginning of row i if it is not empty, to make the result wrong.
1313 #if PETSC_PKG_KOKKOS_VERSION_GE(3, 7, 0)
1314               if (Yi(i) != Yi(i + 1)) Ya(Yi(i)) = Kokkos::ArithTraits<PetscScalar>::nan();
1315 #else
1316               if (Yi(i) != Yi(i + 1)) Ya(Yi(i)) = Kokkos::Experimental::nan("1");
1317 #endif
1318             }
1319           }
1320         });
1321       });
1322     PetscCall(MatSeqAIJKokkosModifyDevice(Y));
1323   } else { // different nonzero patterns
1324     Mat             Z;
1325     KokkosCsrMatrix zcsr;
1326     KernelHandle    kh;
1327     kh.create_spadd_handle(true); // X, Y are sorted
1328     KokkosSparse::spadd_symbolic(&kh, xkok->csrmat, ykok->csrmat, zcsr);
1329     KokkosSparse::spadd_numeric(&kh, alpha, xkok->csrmat, (PetscScalar)1.0, ykok->csrmat, zcsr);
1330     zkok = new Mat_SeqAIJKokkos(zcsr);
1331     PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, zkok, &Z));
1332     PetscCall(MatHeaderReplace(Y, &Z));
1333     kh.destroy_spadd_handle();
1334   }
1335   PetscCall(PetscLogGpuTimeEnd());
1336   PetscCall(PetscLogGpuFlops(xkok->a_dual.extent(0) * 2)); // Because we scaled X and then added it to Y
1337   PetscFunctionReturn(PETSC_SUCCESS);
1338 }
1339 
1340 struct MatCOOStruct_SeqAIJKokkos {
1341   PetscCount           n;
1342   PetscCount           Atot;
1343   PetscInt             nz;
1344   PetscCountKokkosView jmap;
1345   PetscCountKokkosView perm;
1346 
1347   MatCOOStruct_SeqAIJKokkos(const MatCOOStruct_SeqAIJ *coo_h)
1348   {
1349     nz   = coo_h->nz;
1350     n    = coo_h->n;
1351     Atot = coo_h->Atot;
1352     jmap = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), PetscCountKokkosViewHost(coo_h->jmap, nz + 1));
1353     perm = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), PetscCountKokkosViewHost(coo_h->perm, Atot));
1354   }
1355 };
1356 
1357 static PetscErrorCode MatCOOStructDestroy_SeqAIJKokkos(void **data)
1358 {
1359   PetscFunctionBegin;
1360   PetscCallCXX(delete static_cast<MatCOOStruct_SeqAIJKokkos *>(*data));
1361   PetscFunctionReturn(PETSC_SUCCESS);
1362 }
1363 
1364 static PetscErrorCode MatSetPreallocationCOO_SeqAIJKokkos(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
1365 {
1366   Mat_SeqAIJKokkos          *akok;
1367   Mat_SeqAIJ                *aseq;
1368   PetscContainer             container_h;
1369   MatCOOStruct_SeqAIJ       *coo_h;
1370   MatCOOStruct_SeqAIJKokkos *coo_d;
1371 
1372   PetscFunctionBegin;
1373   PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, coo_i, coo_j));
1374   aseq = static_cast<Mat_SeqAIJ *>(mat->data);
1375   akok = static_cast<Mat_SeqAIJKokkos *>(mat->spptr);
1376   delete akok;
1377   mat->spptr = akok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, aseq, mat->nonzerostate + 1, PETSC_FALSE);
1378   PetscCall(MatZeroEntries_SeqAIJKokkos(mat));
1379 
1380   // Copy the COO struct to device
1381   PetscCall(PetscObjectQuery((PetscObject)mat, "__PETSc_MatCOOStruct_Host", (PetscObject *)&container_h));
1382   PetscCall(PetscContainerGetPointer(container_h, (void **)&coo_h));
1383   PetscCallCXX(coo_d = new MatCOOStruct_SeqAIJKokkos(coo_h));
1384 
1385   // Put the COO struct in a container and then attach that to the matrix
1386   PetscCall(PetscObjectContainerCompose((PetscObject)mat, "__PETSc_MatCOOStruct_Device", coo_d, MatCOOStructDestroy_SeqAIJKokkos));
1387   PetscFunctionReturn(PETSC_SUCCESS);
1388 }
1389 
1390 static PetscErrorCode MatSetValuesCOO_SeqAIJKokkos(Mat A, const PetscScalar v[], InsertMode imode)
1391 {
1392   MatScalarKokkosView        Aa;
1393   ConstMatScalarKokkosView   kv;
1394   PetscMemType               memtype;
1395   PetscContainer             container;
1396   MatCOOStruct_SeqAIJKokkos *coo;
1397 
1398   PetscFunctionBegin;
1399   PetscCall(PetscObjectQuery((PetscObject)A, "__PETSc_MatCOOStruct_Device", (PetscObject *)&container));
1400   PetscCall(PetscContainerGetPointer(container, (void **)&coo));
1401 
1402   const auto &n    = coo->n;
1403   const auto &Annz = coo->nz;
1404   const auto &jmap = coo->jmap;
1405   const auto &perm = coo->perm;
1406 
1407   PetscCall(PetscGetMemType(v, &memtype));
1408   if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */
1409     kv = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), ConstMatScalarKokkosViewHost(v, n));
1410   } else {
1411     kv = ConstMatScalarKokkosView(v, n); /* Directly use v[]'s memory */
1412   }
1413 
1414   if (imode == INSERT_VALUES) PetscCall(MatSeqAIJGetKokkosViewWrite(A, &Aa)); /* write matrix values */
1415   else PetscCall(MatSeqAIJGetKokkosView(A, &Aa));                             /* read & write matrix values */
1416 
1417   PetscCall(PetscLogGpuTimeBegin());
1418   Kokkos::parallel_for(
1419     Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, Annz), KOKKOS_LAMBDA(const PetscCount i) {
1420       PetscScalar sum = 0.0;
1421       for (PetscCount k = jmap(i); k < jmap(i + 1); k++) sum += kv(perm(k));
1422       Aa(i) = (imode == INSERT_VALUES ? 0.0 : Aa(i)) + sum;
1423     });
1424   PetscCall(PetscLogGpuTimeEnd());
1425 
1426   if (imode == INSERT_VALUES) PetscCall(MatSeqAIJRestoreKokkosViewWrite(A, &Aa));
1427   else PetscCall(MatSeqAIJRestoreKokkosView(A, &Aa));
1428   PetscFunctionReturn(PETSC_SUCCESS);
1429 }
1430 
1431 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A)
1432 {
1433   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
1434 
1435   PetscFunctionBegin;
1436   A->offloadmask = PETSC_OFFLOAD_KOKKOS; /* We do not really use this flag */
1437   A->boundtocpu  = PETSC_FALSE;
1438 
1439   A->ops->assemblyend               = MatAssemblyEnd_SeqAIJKokkos;
1440   A->ops->destroy                   = MatDestroy_SeqAIJKokkos;
1441   A->ops->duplicate                 = MatDuplicate_SeqAIJKokkos;
1442   A->ops->axpy                      = MatAXPY_SeqAIJKokkos;
1443   A->ops->scale                     = MatScale_SeqAIJKokkos;
1444   A->ops->zeroentries               = MatZeroEntries_SeqAIJKokkos;
1445   A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJKokkos;
1446   A->ops->mult                      = MatMult_SeqAIJKokkos;
1447   A->ops->multadd                   = MatMultAdd_SeqAIJKokkos;
1448   A->ops->multtranspose             = MatMultTranspose_SeqAIJKokkos;
1449   A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJKokkos;
1450   A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJKokkos;
1451   A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos;
1452   A->ops->productnumeric            = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos;
1453   A->ops->transpose                 = MatTranspose_SeqAIJKokkos;
1454   A->ops->setoption                 = MatSetOption_SeqAIJKokkos;
1455   A->ops->getdiagonal               = MatGetDiagonal_SeqAIJKokkos;
1456   A->ops->shift                     = MatShift_SeqAIJKokkos;
1457   A->ops->diagonalset               = MatDiagonalSet_SeqAIJKokkos;
1458   A->ops->diagonalscale             = MatDiagonalScale_SeqAIJKokkos;
1459   A->ops->getcurrentmemtype         = MatGetCurrentMemType_SeqAIJKokkos;
1460   a->ops->getarray                  = MatSeqAIJGetArray_SeqAIJKokkos;
1461   a->ops->restorearray              = MatSeqAIJRestoreArray_SeqAIJKokkos;
1462   a->ops->getarrayread              = MatSeqAIJGetArrayRead_SeqAIJKokkos;
1463   a->ops->restorearrayread          = MatSeqAIJRestoreArrayRead_SeqAIJKokkos;
1464   a->ops->getarraywrite             = MatSeqAIJGetArrayWrite_SeqAIJKokkos;
1465   a->ops->restorearraywrite         = MatSeqAIJRestoreArrayWrite_SeqAIJKokkos;
1466   a->ops->getcsrandmemtype          = MatSeqAIJGetCSRAndMemType_SeqAIJKokkos;
1467 
1468   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJKokkos));
1469   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJKokkos));
1470 #if defined(PETSC_HAVE_HYPRE)
1471   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijkokkos_hypre_C", MatConvert_AIJ_HYPRE));
1472 #endif
1473   PetscFunctionReturn(PETSC_SUCCESS);
1474 }
1475 
1476 /*
1477    Extract the (prescribled) diagonal blocks of the matrix and then invert them
1478 
1479   Input Parameters:
1480 +  A       - the MATSEQAIJKOKKOS matrix
1481 .  bs      - block sizes in 'csr' format, i.e., the i-th block has size bs(i+1) - bs(i)
1482 .  bs2     - square of block sizes in 'csr' format, i.e., the i-th block should be stored at offset bs2(i) in diagVal[]
1483 .  blkMap  - map row ids to block ids, i.e., row i belongs to the block blkMap(i)
1484 -  work    - a pre-allocated work buffer (as big as diagVal) for use by this routine
1485 
1486   Output Parameter:
1487 .  diagVal - the (pre-allocated) buffer to store the inverted blocks (each block is stored in column-major order)
1488 */
1489 PETSC_INTERN PetscErrorCode MatInvertVariableBlockDiagonal_SeqAIJKokkos(Mat A, const PetscIntKokkosView &bs, const PetscIntKokkosView &bs2, const PetscIntKokkosView &blkMap, PetscScalarKokkosView &work, PetscScalarKokkosView &diagVal)
1490 {
1491   Mat_SeqAIJKokkos *akok    = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1492   PetscInt          nblocks = bs.extent(0) - 1;
1493 
1494   PetscFunctionBegin;
1495   PetscCall(MatSeqAIJKokkosSyncDevice(A)); // Since we'll access A's value on device
1496 
1497   // Pull out the diagonal blocks of the matrix and then invert the blocks
1498   auto Aa    = akok->a_dual.view_device();
1499   auto Ai    = akok->i_dual.view_device();
1500   auto Aj    = akok->j_dual.view_device();
1501   auto Adiag = akok->diag_dual.view_device();
1502   // TODO: how to tune the team size?
1503 #if defined(KOKKOS_ENABLE_UNIFIED_MEMORY)
1504   auto ts = Kokkos::AUTO();
1505 #else
1506   auto ts = 16; // improved performance 30% over Kokkos::AUTO() with CUDA, but failed with "Kokkos::abort: Requested Team Size is too large!" on CPUs
1507 #endif
1508   PetscCallCXX(Kokkos::parallel_for(
1509     Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), nblocks, ts), KOKKOS_LAMBDA(const KokkosTeamMemberType &teamMember) {
1510       const PetscInt bid    = teamMember.league_rank();                                                   // block id
1511       const PetscInt rstart = bs(bid);                                                                    // this block starts from this row
1512       const PetscInt m      = bs(bid + 1) - bs(bid);                                                      // size of this block
1513       const auto    &B      = Kokkos::View<PetscScalar **, Kokkos::LayoutLeft>(&diagVal(bs2(bid)), m, m); // column-major order
1514       const auto    &W      = PetscScalarKokkosView(&work(bs2(bid)), m * m);
1515 
1516       Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, m), [=](const PetscInt &r) { // r-th row in B
1517         PetscInt i = rstart + r;                                                            // i-th row in A
1518 
1519         if (Ai(i) <= Adiag(i) && Adiag(i) < Ai(i + 1)) { // if the diagonal exists (common case)
1520           PetscInt first = Adiag(i) - r;                 // we start to check nonzeros from here along this row
1521 
1522           for (PetscInt c = 0; c < m; c++) {                   // walk n steps to see what column indices we will meet
1523             if (first + c < Ai(i) || first + c >= Ai(i + 1)) { // this entry (first+c) is out of range of this row, in other words, its value is zero
1524               B(r, c) = 0.0;
1525             } else if (Aj(first + c) == rstart + c) { // this entry is right on the (rstart+c) column
1526               B(r, c) = Aa(first + c);
1527             } else { // this entry does not show up in the CSR
1528               B(r, c) = 0.0;
1529             }
1530           }
1531         } else { // rare case that the diagonal does not exist
1532           const PetscInt begin = Ai(i);
1533           const PetscInt end   = Ai(i + 1);
1534           for (PetscInt c = 0; c < m; c++) B(r, c) = 0.0;
1535           for (PetscInt j = begin; j < end; j++) { // scan the whole row; could use binary search but this is a rare case so we did not.
1536             if (rstart <= Aj(j) && Aj(j) < rstart + m) B(r, Aj(j) - rstart) = Aa(j);
1537             else if (Aj(j) >= rstart + m) break;
1538           }
1539         }
1540       });
1541 
1542       // LU-decompose B (w/o pivoting) and then invert B
1543       KokkosBatched::TeamLU<KokkosTeamMemberType, KokkosBatched::Algo::LU::Unblocked>::invoke(teamMember, B, 0.0);
1544       KokkosBatched::TeamInverseLU<KokkosTeamMemberType, KokkosBatched::Algo::InverseLU::Unblocked>::invoke(teamMember, B, W);
1545     }));
1546   // PetscLogGpuFlops() is done in the caller PCSetUp_VPBJacobi_Kokkos as we don't want to compute the flops in kernels
1547   PetscFunctionReturn(PETSC_SUCCESS);
1548 }
1549 
1550 PETSC_INTERN PetscErrorCode MatSetSeqAIJKokkosWithCSRMatrix(Mat A, Mat_SeqAIJKokkos *akok)
1551 {
1552   Mat_SeqAIJ *aseq;
1553   PetscInt    i, m, n;
1554   auto        exec = PetscGetKokkosExecutionSpace();
1555 
1556   PetscFunctionBegin;
1557   PetscCheck(!A->spptr, PETSC_COMM_SELF, PETSC_ERR_PLIB, "A->spptr is supposed to be empty");
1558 
1559   m = akok->nrows();
1560   n = akok->ncols();
1561   PetscCall(MatSetSizes(A, m, n, m, n));
1562   PetscCall(MatSetType(A, MATSEQAIJKOKKOS));
1563 
1564   /* Set up data structures of A as a MATSEQAIJ */
1565   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(A, MAT_SKIP_ALLOCATION, NULL));
1566   aseq = (Mat_SeqAIJ *)A->data;
1567 
1568   PetscCall(KokkosDualViewSyncHost(akok->i_dual, exec)); /* We always need sync'ed i, j on host */
1569   PetscCall(KokkosDualViewSyncHost(akok->j_dual, exec));
1570 
1571   aseq->i       = akok->i_host_data();
1572   aseq->j       = akok->j_host_data();
1573   aseq->a       = akok->a_host_data();
1574   aseq->nonew   = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/
1575   aseq->free_a  = PETSC_FALSE;
1576   aseq->free_ij = PETSC_FALSE;
1577   aseq->nz      = akok->nnz();
1578   aseq->maxnz   = aseq->nz;
1579 
1580   PetscCall(PetscMalloc1(m, &aseq->imax));
1581   PetscCall(PetscMalloc1(m, &aseq->ilen));
1582   for (i = 0; i < m; i++) aseq->ilen[i] = aseq->imax[i] = aseq->i[i + 1] - aseq->i[i];
1583 
1584   /* It is critical to set the nonzerostate, as we use it to check if sparsity pattern (hence data) has changed on host in MatAssemblyEnd */
1585   akok->nonzerostate = A->nonzerostate;
1586   A->spptr           = akok; /* Set A->spptr before MatAssembly so that A->spptr won't be allocated again there */
1587   PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY));
1588   PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY));
1589   PetscFunctionReturn(PETSC_SUCCESS);
1590 }
1591 
1592 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGetKokkosCsrMatrix(Mat A, KokkosCsrMatrix *csr)
1593 {
1594   PetscFunctionBegin;
1595   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1596   *csr = static_cast<Mat_SeqAIJKokkos *>(A->spptr)->csrmat;
1597   PetscFunctionReturn(PETSC_SUCCESS);
1598 }
1599 
1600 PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithKokkosCsrMatrix(MPI_Comm comm, KokkosCsrMatrix csr, Mat *A)
1601 {
1602   Mat_SeqAIJKokkos *akok;
1603 
1604   PetscFunctionBegin;
1605   PetscCallCXX(akok = new Mat_SeqAIJKokkos(csr));
1606   PetscCall(MatCreate(comm, A));
1607   PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok));
1608   PetscFunctionReturn(PETSC_SUCCESS);
1609 }
1610 
1611 /* Crete a SEQAIJKOKKOS matrix with a Mat_SeqAIJKokkos data structure
1612 
1613    Note we have names like MatSeqAIJSetPreallocationCSR, so I use capitalized CSR
1614  */
1615 PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithCSRMatrix(MPI_Comm comm, Mat_SeqAIJKokkos *akok, Mat *A)
1616 {
1617   PetscFunctionBegin;
1618   PetscCall(MatCreate(comm, A));
1619   PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok));
1620   PetscFunctionReturn(PETSC_SUCCESS);
1621 }
1622 
1623 /*@C
1624   MatCreateSeqAIJKokkos - Creates a sparse matrix in `MATSEQAIJKOKKOS` (compressed row) format
1625   (the default parallel PETSc format). This matrix will ultimately be handled by
1626   Kokkos for calculations.
1627 
1628   Collective
1629 
1630   Input Parameters:
1631 + comm - MPI communicator, set to `PETSC_COMM_SELF`
1632 . m    - number of rows
1633 . n    - number of columns
1634 . nz   - number of nonzeros per row (same for all rows), ignored if `nnz` is provided
1635 - nnz  - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL`
1636 
1637   Output Parameter:
1638 . A - the matrix
1639 
1640   Level: intermediate
1641 
1642   Notes:
1643   It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
1644   MatXXXXSetPreallocation() paradgm instead of this routine directly.
1645   [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]
1646 
1647   The AIJ format, also called
1648   compressed row storage, is fully compatible with standard Fortran
1649   storage.  That is, the stored row and column indices can begin at
1650   either one (as in Fortran) or zero.
1651 
1652   Specify the preallocated storage with either `nz` or `nnz` (not both).
1653   Set `nz` = `PETSC_DEFAULT` and `nnz` = `NULL` for PETSc to control dynamic memory
1654   allocation.
1655 
1656 .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`
1657 @*/
1658 PetscErrorCode MatCreateSeqAIJKokkos(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A)
1659 {
1660   PetscFunctionBegin;
1661   PetscCall(PetscKokkosInitializeCheck());
1662   PetscCall(MatCreate(comm, A));
1663   PetscCall(MatSetSizes(*A, m, n, m, n));
1664   PetscCall(MatSetType(*A, MATSEQAIJKOKKOS));
1665   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz));
1666   PetscFunctionReturn(PETSC_SUCCESS);
1667 }
1668 
1669 // After matrix numeric factorization, there are still steps to do before triangular solve can be called.
1670 // For example, for transpose solve, we might need to compute the transpose matrices if the solver does not support it (such as KK, while cusparse does).
1671 // In cusparse, one has to call cusparseSpSV_analysis() with updated triangular matrix values before calling cusparseSpSV_solve().
1672 // Simiarily, in KK sptrsv_symbolic() has to be called before sptrsv_solve(). We put these steps in MatSeqAIJKokkos{Transpose}SolveCheck.
1673 static PetscErrorCode MatSeqAIJKokkosSolveCheck(Mat A)
1674 {
1675   Mat_SeqAIJKokkosTriFactors *factors   = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1676   const PetscBool             has_lower = factors->iL_d.extent(0) ? PETSC_TRUE : PETSC_FALSE; // false with Choleksy
1677   const PetscBool             has_upper = factors->iU_d.extent(0) ? PETSC_TRUE : PETSC_FALSE; // true with LU and Choleksy
1678 
1679   PetscFunctionBegin;
1680   if (!factors->sptrsv_symbolic_completed) { // If sptrsv_symbolic was not called yet
1681     if (has_upper) PetscCallCXX(sptrsv_symbolic(&factors->khU, factors->iU_d, factors->jU_d, factors->aU_d));
1682     if (has_lower) PetscCallCXX(sptrsv_symbolic(&factors->khL, factors->iL_d, factors->jL_d, factors->aL_d));
1683     factors->sptrsv_symbolic_completed = PETSC_TRUE;
1684   }
1685   PetscFunctionReturn(PETSC_SUCCESS);
1686 }
1687 
1688 static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A)
1689 {
1690   const PetscInt              n         = A->rmap->n;
1691   Mat_SeqAIJKokkosTriFactors *factors   = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1692   const PetscBool             has_lower = factors->iL_d.extent(0) ? PETSC_TRUE : PETSC_FALSE; // false with Choleksy
1693   const PetscBool             has_upper = factors->iU_d.extent(0) ? PETSC_TRUE : PETSC_FALSE; // true with LU or Choleksy
1694 
1695   PetscFunctionBegin;
1696   if (!factors->transpose_updated) {
1697     if (has_upper) {
1698       if (!factors->iUt_d.extent(0)) {                                 // Allocate Ut on device if not yet
1699         factors->iUt_d = MatRowMapKokkosView("factors->iUt_d", n + 1); // KK requires this view to be initialized to 0 to call transpose_matrix
1700         factors->jUt_d = MatColIdxKokkosView(NoInit("factors->jUt_d"), factors->jU_d.extent(0));
1701         factors->aUt_d = MatScalarKokkosView(NoInit("factors->aUt_d"), factors->aU_d.extent(0));
1702       }
1703 
1704       if (factors->iU_h.extent(0)) { // If U is on host (factorization was done on host), we also compute the transpose on host
1705         if (!factors->U) {
1706           Mat_SeqAIJ *seq;
1707 
1708           PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF, n, n, factors->iU_h.data(), factors->jU_h.data(), factors->aU_h.data(), &factors->U));
1709           PetscCall(MatTranspose(factors->U, MAT_INITIAL_MATRIX, &factors->Ut));
1710 
1711           seq            = static_cast<Mat_SeqAIJ *>(factors->Ut->data);
1712           factors->iUt_h = MatRowMapKokkosViewHost(seq->i, n + 1);
1713           factors->jUt_h = MatColIdxKokkosViewHost(seq->j, seq->nz);
1714           factors->aUt_h = MatScalarKokkosViewHost(seq->a, seq->nz);
1715         } else {
1716           PetscCall(MatTranspose(factors->U, MAT_REUSE_MATRIX, &factors->Ut)); // Matrix Ut' data is aliased with {i, j, a}Ut_h
1717         }
1718         // Copy Ut from host to device
1719         PetscCallCXX(Kokkos::deep_copy(factors->iUt_d, factors->iUt_h));
1720         PetscCallCXX(Kokkos::deep_copy(factors->jUt_d, factors->jUt_h));
1721         PetscCallCXX(Kokkos::deep_copy(factors->aUt_d, factors->aUt_h));
1722       } else { // If U was computed on device, we also compute the transpose there
1723         // TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices. We have to sort the indices, until KK provides finer control options.
1724         PetscCallCXX(transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iU_d,
1725                                                                                                                                                                                                                                factors->jU_d, factors->aU_d,
1726                                                                                                                                                                                                                                factors->iUt_d, factors->jUt_d,
1727                                                                                                                                                                                                                                factors->aUt_d));
1728         PetscCallCXX(sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iUt_d, factors->jUt_d, factors->aUt_d));
1729       }
1730       PetscCallCXX(sptrsv_symbolic(&factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d));
1731     }
1732 
1733     // do the same for L with LU
1734     if (has_lower) {
1735       if (!factors->iLt_d.extent(0)) {                                 // Allocate Lt on device if not yet
1736         factors->iLt_d = MatRowMapKokkosView("factors->iLt_d", n + 1); // KK requires this view to be initialized to 0 to call transpose_matrix
1737         factors->jLt_d = MatColIdxKokkosView(NoInit("factors->jLt_d"), factors->jL_d.extent(0));
1738         factors->aLt_d = MatScalarKokkosView(NoInit("factors->aLt_d"), factors->aL_d.extent(0));
1739       }
1740 
1741       if (factors->iL_h.extent(0)) { // If L is on host, we also compute the transpose on host
1742         if (!factors->L) {
1743           Mat_SeqAIJ *seq;
1744 
1745           PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF, n, n, factors->iL_h.data(), factors->jL_h.data(), factors->aL_h.data(), &factors->L));
1746           PetscCall(MatTranspose(factors->L, MAT_INITIAL_MATRIX, &factors->Lt));
1747 
1748           seq            = static_cast<Mat_SeqAIJ *>(factors->Lt->data);
1749           factors->iLt_h = MatRowMapKokkosViewHost(seq->i, n + 1);
1750           factors->jLt_h = MatColIdxKokkosViewHost(seq->j, seq->nz);
1751           factors->aLt_h = MatScalarKokkosViewHost(seq->a, seq->nz);
1752         } else {
1753           PetscCall(MatTranspose(factors->L, MAT_REUSE_MATRIX, &factors->Lt)); // Matrix Lt' data is aliased with {i, j, a}Lt_h
1754         }
1755         // Copy Lt from host to device
1756         PetscCallCXX(Kokkos::deep_copy(factors->iLt_d, factors->iLt_h));
1757         PetscCallCXX(Kokkos::deep_copy(factors->jLt_d, factors->jLt_h));
1758         PetscCallCXX(Kokkos::deep_copy(factors->aLt_d, factors->aLt_h));
1759       } else { // If L was computed on device, we also compute the transpose there
1760         // TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices. We have to sort the indices, until KK provides finer control options.
1761         PetscCallCXX(transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iL_d,
1762                                                                                                                                                                                                                                factors->jL_d, factors->aL_d,
1763                                                                                                                                                                                                                                factors->iLt_d, factors->jLt_d,
1764                                                                                                                                                                                                                                factors->aLt_d));
1765         PetscCallCXX(sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iLt_d, factors->jLt_d, factors->aLt_d));
1766       }
1767       PetscCallCXX(sptrsv_symbolic(&factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d));
1768     }
1769 
1770     factors->transpose_updated = PETSC_TRUE;
1771   }
1772   PetscFunctionReturn(PETSC_SUCCESS);
1773 }
1774 
1775 // Solve Ax = b, with RAR = U^T D U, where R is the row (and col) permutation matrix on A.
1776 // R is represented by rowperm in factors. If R is identity (i.e, no reordering), then rowperm is empty.
1777 static PetscErrorCode MatSolve_SeqAIJKokkos_Cholesky(Mat A, Vec bb, Vec xx)
1778 {
1779   auto                        exec    = PetscGetKokkosExecutionSpace();
1780   Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1781   PetscInt                    m       = A->rmap->n;
1782   PetscScalarKokkosView       D       = factors->D_d;
1783   PetscScalarKokkosView       X, Y, B; // alias
1784   ConstPetscScalarKokkosView  b;
1785   PetscScalarKokkosView       x;
1786   PetscIntKokkosView         &rowperm  = factors->rowperm;
1787   PetscBool                   identity = rowperm.extent(0) ? PETSC_FALSE : PETSC_TRUE;
1788 
1789   PetscFunctionBegin;
1790   PetscCall(PetscLogGpuTimeBegin());
1791   PetscCall(MatSeqAIJKokkosSolveCheck(A));          // for UX = T
1792   PetscCall(MatSeqAIJKokkosTransposeSolveCheck(A)); // for U^T Y = B
1793   PetscCall(VecGetKokkosView(bb, &b));
1794   PetscCall(VecGetKokkosViewWrite(xx, &x));
1795 
1796   // Solve U^T Y = B
1797   if (identity) { // Reorder b with the row permutation
1798     B = PetscScalarKokkosView(const_cast<PetscScalar *>(b.data()), b.extent(0));
1799     Y = factors->workVector;
1800   } else {
1801     B = factors->workVector;
1802     PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { B(i) = b(rowperm(i)); }));
1803     Y = x;
1804   }
1805   PetscCallCXX(sptrsv_solve(exec, &factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d, B, Y));
1806 
1807   // Solve diag(D) Y' = Y.
1808   // Actually just do Y' = Y*D since D is already inverted in MatCholeskyFactorNumeric_SeqAIJ(). It is basically a vector element-wise multiplication.
1809   PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { Y(i) = Y(i) * D(i); }));
1810 
1811   // Solve UX = Y
1812   if (identity) {
1813     X = x;
1814   } else {
1815     X = factors->workVector; // B is not needed anymore
1816   }
1817   PetscCallCXX(sptrsv_solve(exec, &factors->khU, factors->iU_d, factors->jU_d, factors->aU_d, Y, X));
1818 
1819   // Reorder X with the inverse column (row) permutation
1820   if (!identity) PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { x(rowperm(i)) = X(i); }));
1821 
1822   PetscCall(VecRestoreKokkosView(bb, &b));
1823   PetscCall(VecRestoreKokkosViewWrite(xx, &x));
1824   PetscCall(PetscLogGpuTimeEnd());
1825   PetscFunctionReturn(PETSC_SUCCESS);
1826 }
1827 
1828 // Solve Ax = b, with RAC = LU, where R and C are row and col permutation matrices on A respectively.
1829 // R and C are represented by rowperm and colperm in factors.
1830 // If R or C is identity (i.e, no reordering), then rowperm or colperm is empty.
1831 static PetscErrorCode MatSolve_SeqAIJKokkos_LU(Mat A, Vec bb, Vec xx)
1832 {
1833   auto                        exec    = PetscGetKokkosExecutionSpace();
1834   Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1835   PetscInt                    m       = A->rmap->n;
1836   PetscScalarKokkosView       X, Y, B; // alias
1837   ConstPetscScalarKokkosView  b;
1838   PetscScalarKokkosView       x;
1839   PetscIntKokkosView         &rowperm      = factors->rowperm;
1840   PetscIntKokkosView         &colperm      = factors->colperm;
1841   PetscBool                   row_identity = rowperm.extent(0) ? PETSC_FALSE : PETSC_TRUE;
1842   PetscBool                   col_identity = colperm.extent(0) ? PETSC_FALSE : PETSC_TRUE;
1843 
1844   PetscFunctionBegin;
1845   PetscCall(PetscLogGpuTimeBegin());
1846   PetscCall(MatSeqAIJKokkosSolveCheck(A));
1847   PetscCall(VecGetKokkosView(bb, &b));
1848   PetscCall(VecGetKokkosViewWrite(xx, &x));
1849 
1850   // Solve L Y = B (i.e., L (U C^- x) = R b).  R b indicates applying the row permutation on b.
1851   if (row_identity) {
1852     B = PetscScalarKokkosView(const_cast<PetscScalar *>(b.data()), b.extent(0));
1853     Y = factors->workVector;
1854   } else {
1855     B = factors->workVector;
1856     PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { B(i) = b(rowperm(i)); }));
1857     Y = x;
1858   }
1859   PetscCallCXX(sptrsv_solve(exec, &factors->khL, factors->iL_d, factors->jL_d, factors->aL_d, B, Y));
1860 
1861   // Solve U C^- x = Y
1862   if (col_identity) {
1863     X = x;
1864   } else {
1865     X = factors->workVector;
1866   }
1867   PetscCallCXX(sptrsv_solve(exec, &factors->khU, factors->iU_d, factors->jU_d, factors->aU_d, Y, X));
1868 
1869   // x = C X; Reorder X with the inverse col permutation
1870   if (!col_identity) PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { x(colperm(i)) = X(i); }));
1871 
1872   PetscCall(VecRestoreKokkosView(bb, &b));
1873   PetscCall(VecRestoreKokkosViewWrite(xx, &x));
1874   PetscCall(PetscLogGpuTimeEnd());
1875   PetscFunctionReturn(PETSC_SUCCESS);
1876 }
1877 
1878 // Solve A^T x = b, with RAC = LU, where R and C are row and col permutation matrices on A respectively.
1879 // R and C are represented by rowperm and colperm in factors.
1880 // If R or C is identity (i.e, no reordering), then rowperm or colperm is empty.
1881 // A = R^-1 L U C^-1, so A^T = C^-T U^T L^T R^-T. But since C^- = C^T, R^- = R^T, we have A^T = C U^T L^T R.
1882 static PetscErrorCode MatSolveTranspose_SeqAIJKokkos_LU(Mat A, Vec bb, Vec xx)
1883 {
1884   auto                        exec    = PetscGetKokkosExecutionSpace();
1885   Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1886   PetscInt                    m       = A->rmap->n;
1887   PetscScalarKokkosView       X, Y, B; // alias
1888   ConstPetscScalarKokkosView  b;
1889   PetscScalarKokkosView       x;
1890   PetscIntKokkosView         &rowperm      = factors->rowperm;
1891   PetscIntKokkosView         &colperm      = factors->colperm;
1892   PetscBool                   row_identity = rowperm.extent(0) ? PETSC_FALSE : PETSC_TRUE;
1893   PetscBool                   col_identity = colperm.extent(0) ? PETSC_FALSE : PETSC_TRUE;
1894 
1895   PetscFunctionBegin;
1896   PetscCall(PetscLogGpuTimeBegin());
1897   PetscCall(MatSeqAIJKokkosTransposeSolveCheck(A)); // Update L^T, U^T if needed, and do sptrsv symbolic for L^T, U^T
1898   PetscCall(VecGetKokkosView(bb, &b));
1899   PetscCall(VecGetKokkosViewWrite(xx, &x));
1900 
1901   // Solve U^T Y = B (i.e., U^T (L^T R x) = C^- b).  Note C^- b = C^T b, which means applying the column permutation on b.
1902   if (col_identity) { // Reorder b with the col permutation
1903     B = PetscScalarKokkosView(const_cast<PetscScalar *>(b.data()), b.extent(0));
1904     Y = factors->workVector;
1905   } else {
1906     B = factors->workVector;
1907     PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { B(i) = b(colperm(i)); }));
1908     Y = x;
1909   }
1910   PetscCallCXX(sptrsv_solve(exec, &factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d, B, Y));
1911 
1912   // Solve L^T X = Y
1913   if (row_identity) {
1914     X = x;
1915   } else {
1916     X = factors->workVector;
1917   }
1918   PetscCallCXX(sptrsv_solve(exec, &factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d, Y, X));
1919 
1920   // x = R^- X = R^T X; Reorder X with the inverse row permutation
1921   if (!row_identity) PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { x(rowperm(i)) = X(i); }));
1922 
1923   PetscCall(VecRestoreKokkosView(bb, &b));
1924   PetscCall(VecRestoreKokkosViewWrite(xx, &x));
1925   PetscCall(PetscLogGpuTimeEnd());
1926   PetscFunctionReturn(PETSC_SUCCESS);
1927 }
1928 
1929 static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info)
1930 {
1931   PetscFunctionBegin;
1932   PetscCall(MatSeqAIJKokkosSyncHost(A));
1933   PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info));
1934 
1935   if (!info->solveonhost) { // if solve on host, then we don't need to copy L, U to device
1936     Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr;
1937     Mat_SeqAIJ                 *b       = static_cast<Mat_SeqAIJ *>(B->data);
1938     const PetscInt             *Bi = b->i, *Bj = b->j, *Bdiag = b->diag;
1939     const MatScalar            *Ba = b->a;
1940     PetscInt                    m = B->rmap->n, n = B->cmap->n;
1941 
1942     if (factors->iL_h.extent(0) == 0) { // Allocate memory and copy the L, U structure for the first time
1943       // Allocate memory and copy the structure
1944       factors->iL_h = MatRowMapKokkosViewHost(NoInit("iL_h"), m + 1);
1945       factors->jL_h = MatColIdxKokkosViewHost(NoInit("jL_h"), (Bi[m] - Bi[0]) + m); // + the diagonal entries
1946       factors->aL_h = MatScalarKokkosViewHost(NoInit("aL_h"), (Bi[m] - Bi[0]) + m);
1947       factors->iU_h = MatRowMapKokkosViewHost(NoInit("iU_h"), m + 1);
1948       factors->jU_h = MatColIdxKokkosViewHost(NoInit("jU_h"), (Bdiag[0] - Bdiag[m]));
1949       factors->aU_h = MatScalarKokkosViewHost(NoInit("aU_h"), (Bdiag[0] - Bdiag[m]));
1950 
1951       PetscInt *Li = factors->iL_h.data();
1952       PetscInt *Lj = factors->jL_h.data();
1953       PetscInt *Ui = factors->iU_h.data();
1954       PetscInt *Uj = factors->jU_h.data();
1955 
1956       Li[0] = Ui[0] = 0;
1957       for (PetscInt i = 0; i < m; i++) {
1958         PetscInt llen = Bi[i + 1] - Bi[i];       // exclusive of the diagonal entry
1959         PetscInt ulen = Bdiag[i] - Bdiag[i + 1]; // inclusive of the diagonal entry
1960 
1961         PetscArraycpy(Lj + Li[i], Bj + Bi[i], llen); // entries of L on the left of the diagonal
1962         Lj[Li[i] + llen] = i;                        // diagonal entry of L
1963 
1964         Uj[Ui[i]] = i;                                                  // diagonal entry of U
1965         PetscArraycpy(Uj + Ui[i] + 1, Bj + Bdiag[i + 1] + 1, ulen - 1); // entries of U on  the right of the diagonal
1966 
1967         Li[i + 1] = Li[i] + llen + 1;
1968         Ui[i + 1] = Ui[i] + ulen;
1969       }
1970 
1971       factors->iL_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->iL_h);
1972       factors->jL_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->jL_h);
1973       factors->iU_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->iU_h);
1974       factors->jU_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->jU_h);
1975       factors->aL_d = Kokkos::create_mirror_view(DefaultMemorySpace(), factors->aL_h);
1976       factors->aU_d = Kokkos::create_mirror_view(DefaultMemorySpace(), factors->aU_h);
1977 
1978       // Copy row/col permutation to device
1979       IS        rowperm = ((Mat_SeqAIJ *)B->data)->row;
1980       PetscBool row_identity;
1981       PetscCall(ISIdentity(rowperm, &row_identity));
1982       if (!row_identity) {
1983         const PetscInt *ip;
1984 
1985         PetscCall(ISGetIndices(rowperm, &ip));
1986         factors->rowperm = PetscIntKokkosView(NoInit("rowperm"), m);
1987         PetscCallCXX(Kokkos::deep_copy(factors->rowperm, PetscIntKokkosViewHost(const_cast<PetscInt *>(ip), m)));
1988         PetscCall(ISRestoreIndices(rowperm, &ip));
1989         PetscCall(PetscLogCpuToGpu(m * sizeof(PetscInt)));
1990       }
1991 
1992       IS        colperm = ((Mat_SeqAIJ *)B->data)->col;
1993       PetscBool col_identity;
1994       PetscCall(ISIdentity(colperm, &col_identity));
1995       if (!col_identity) {
1996         const PetscInt *ip;
1997 
1998         PetscCall(ISGetIndices(colperm, &ip));
1999         factors->colperm = PetscIntKokkosView(NoInit("colperm"), n);
2000         PetscCallCXX(Kokkos::deep_copy(factors->colperm, PetscIntKokkosViewHost(const_cast<PetscInt *>(ip), n)));
2001         PetscCall(ISRestoreIndices(colperm, &ip));
2002         PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
2003       }
2004 
2005       /* Create sptrsv handles for L, U and their transpose */
2006 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
2007       auto sptrsv_alg = SPTRSVAlgorithm::SPTRSV_CUSPARSE;
2008 #else
2009       auto sptrsv_alg = SPTRSVAlgorithm::SEQLVLSCHD_TP1;
2010 #endif
2011       factors->khL.create_sptrsv_handle(sptrsv_alg, m, true /* L is lower tri */);
2012       factors->khU.create_sptrsv_handle(sptrsv_alg, m, false /* U is not lower tri */);
2013       factors->khLt.create_sptrsv_handle(sptrsv_alg, m, false /* L^T is not lower tri */);
2014       factors->khUt.create_sptrsv_handle(sptrsv_alg, m, true /* U^T is lower tri */);
2015     }
2016 
2017     // Copy the value
2018     for (PetscInt i = 0; i < m; i++) {
2019       PetscInt        llen = Bi[i + 1] - Bi[i];
2020       PetscInt        ulen = Bdiag[i] - Bdiag[i + 1];
2021       const PetscInt *Li   = factors->iL_h.data();
2022       const PetscInt *Ui   = factors->iU_h.data();
2023 
2024       PetscScalar *La = factors->aL_h.data();
2025       PetscScalar *Ua = factors->aU_h.data();
2026 
2027       PetscArraycpy(La + Li[i], Ba + Bi[i], llen); // entries of L
2028       La[Li[i] + llen] = 1.0;                      // diagonal entry
2029 
2030       Ua[Ui[i]] = 1.0 / Ba[Bdiag[i]];                                 // diagonal entry
2031       PetscArraycpy(Ua + Ui[i] + 1, Ba + Bdiag[i + 1] + 1, ulen - 1); // entries of U
2032     }
2033 
2034     PetscCallCXX(Kokkos::deep_copy(factors->aL_d, factors->aL_h));
2035     PetscCallCXX(Kokkos::deep_copy(factors->aU_d, factors->aU_h));
2036     // Once the factors' values have changed, we need to update their transpose and redo sptrsv symbolic
2037     factors->transpose_updated         = PETSC_FALSE;
2038     factors->sptrsv_symbolic_completed = PETSC_FALSE;
2039 
2040     B->ops->solve          = MatSolve_SeqAIJKokkos_LU;
2041     B->ops->solvetranspose = MatSolveTranspose_SeqAIJKokkos_LU;
2042   }
2043 
2044   B->ops->matsolve          = NULL;
2045   B->ops->matsolvetranspose = NULL;
2046   PetscFunctionReturn(PETSC_SUCCESS);
2047 }
2048 
2049 static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos_ILU0(Mat B, Mat A, const MatFactorInfo *info)
2050 {
2051   Mat_SeqAIJKokkos           *aijkok   = (Mat_SeqAIJKokkos *)A->spptr;
2052   Mat_SeqAIJKokkosTriFactors *factors  = (Mat_SeqAIJKokkosTriFactors *)B->spptr;
2053   PetscInt                    fill_lev = info->levels;
2054 
2055   PetscFunctionBegin;
2056   PetscCall(PetscLogGpuTimeBegin());
2057   PetscCheck(!info->factoronhost, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "MatFactorInfo.factoronhost should be false");
2058   PetscCall(MatSeqAIJKokkosSyncDevice(A));
2059 
2060   auto a_d = aijkok->a_dual.view_device();
2061   auto i_d = aijkok->i_dual.view_device();
2062   auto j_d = aijkok->j_dual.view_device();
2063 
2064   PetscCallCXX(spiluk_numeric(&factors->kh, fill_lev, i_d, j_d, a_d, factors->iL_d, factors->jL_d, factors->aL_d, factors->iU_d, factors->jU_d, factors->aU_d));
2065 
2066   B->assembled              = PETSC_TRUE;
2067   B->preallocated           = PETSC_TRUE;
2068   B->ops->solve             = MatSolve_SeqAIJKokkos_LU;
2069   B->ops->solvetranspose    = MatSolveTranspose_SeqAIJKokkos_LU;
2070   B->ops->matsolve          = NULL;
2071   B->ops->matsolvetranspose = NULL;
2072 
2073   /* Once the factors' value changed, we need to update their transpose and sptrsv handle */
2074   factors->transpose_updated         = PETSC_FALSE;
2075   factors->sptrsv_symbolic_completed = PETSC_FALSE;
2076   /* TODO: log flops, but how to know that? */
2077   PetscCall(PetscLogGpuTimeEnd());
2078   PetscFunctionReturn(PETSC_SUCCESS);
2079 }
2080 
2081 // Use KK's spiluk_symbolic() to do ILU0 symbolic factorization, with no row/col reordering
2082 static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos_ILU0(Mat B, Mat A, IS, IS, const MatFactorInfo *info)
2083 {
2084   Mat_SeqAIJKokkos           *aijkok;
2085   Mat_SeqAIJ                 *b;
2086   Mat_SeqAIJKokkosTriFactors *factors  = (Mat_SeqAIJKokkosTriFactors *)B->spptr;
2087   PetscInt                    fill_lev = info->levels;
2088   PetscInt                    nnzA     = ((Mat_SeqAIJ *)A->data)->nz, nnzL, nnzU;
2089   PetscInt                    n        = A->rmap->n;
2090 
2091   PetscFunctionBegin;
2092   PetscCheck(!info->factoronhost, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "MatFactorInfo's factoronhost should be false as we are doing it on device right now");
2093   PetscCall(MatSeqAIJKokkosSyncDevice(A));
2094 
2095   /* Create a spiluk handle and then do symbolic factorization */
2096   nnzL = nnzU = PetscRealIntMultTruncate(info->fill, nnzA);
2097   factors->kh.create_spiluk_handle(SPILUKAlgorithm::SEQLVLSCHD_TP1, n, nnzL, nnzU);
2098 
2099   auto spiluk_handle = factors->kh.get_spiluk_handle();
2100 
2101   Kokkos::realloc(factors->iL_d, n + 1); /* Free old arrays and realloc */
2102   Kokkos::realloc(factors->jL_d, spiluk_handle->get_nnzL());
2103   Kokkos::realloc(factors->iU_d, n + 1);
2104   Kokkos::realloc(factors->jU_d, spiluk_handle->get_nnzU());
2105 
2106   aijkok   = (Mat_SeqAIJKokkos *)A->spptr;
2107   auto i_d = aijkok->i_dual.view_device();
2108   auto j_d = aijkok->j_dual.view_device();
2109   PetscCallCXX(spiluk_symbolic(&factors->kh, fill_lev, i_d, j_d, factors->iL_d, factors->jL_d, factors->iU_d, factors->jU_d));
2110   /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */
2111 
2112   Kokkos::resize(factors->jL_d, spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */
2113   Kokkos::resize(factors->jU_d, spiluk_handle->get_nnzU());
2114   Kokkos::realloc(factors->aL_d, spiluk_handle->get_nnzL()); /* No need to retain old value */
2115   Kokkos::realloc(factors->aU_d, spiluk_handle->get_nnzU());
2116 
2117   /* TODO: add options to select sptrsv algorithms */
2118   /* Create sptrsv handles for L, U and their transpose */
2119 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
2120   auto sptrsv_alg = SPTRSVAlgorithm::SPTRSV_CUSPARSE;
2121 #else
2122   auto sptrsv_alg = SPTRSVAlgorithm::SEQLVLSCHD_TP1;
2123 #endif
2124 
2125   factors->khL.create_sptrsv_handle(sptrsv_alg, n, true /* L is lower tri */);
2126   factors->khU.create_sptrsv_handle(sptrsv_alg, n, false /* U is not lower tri */);
2127   factors->khLt.create_sptrsv_handle(sptrsv_alg, n, false /* L^T is not lower tri */);
2128   factors->khUt.create_sptrsv_handle(sptrsv_alg, n, true /* U^T is lower tri */);
2129 
2130   /* Fill fields of the factor matrix B */
2131   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(B, MAT_SKIP_ALLOCATION, NULL));
2132   b     = (Mat_SeqAIJ *)B->data;
2133   b->nz = b->maxnz          = spiluk_handle->get_nnzL() + spiluk_handle->get_nnzU();
2134   B->info.fill_ratio_given  = info->fill;
2135   B->info.fill_ratio_needed = nnzA > 0 ? ((PetscReal)b->nz) / ((PetscReal)nnzA) : 1.0;
2136 
2137   B->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJKokkos_ILU0;
2138   PetscFunctionReturn(PETSC_SUCCESS);
2139 }
2140 
2141 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
2142 {
2143   PetscFunctionBegin;
2144   PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
2145   PetscCheck(!B->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr");
2146   PetscCallCXX(B->spptr = new Mat_SeqAIJKokkosTriFactors(B->rmap->n));
2147   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos;
2148   PetscFunctionReturn(PETSC_SUCCESS);
2149 }
2150 
2151 static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
2152 {
2153   PetscBool row_identity = PETSC_FALSE, col_identity = PETSC_FALSE;
2154 
2155   PetscFunctionBegin;
2156   if (!info->factoronhost) {
2157     PetscCall(ISIdentity(isrow, &row_identity));
2158     PetscCall(ISIdentity(iscol, &col_identity));
2159   }
2160 
2161   PetscCheck(!B->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr");
2162   PetscCallCXX(B->spptr = new Mat_SeqAIJKokkosTriFactors(B->rmap->n));
2163 
2164   if (!info->factoronhost && !info->levels && row_identity && col_identity) { // if level 0 and no reordering
2165     PetscCall(MatILUFactorSymbolic_SeqAIJKokkos_ILU0(B, A, isrow, iscol, info));
2166   } else {
2167     PetscCall(MatILUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info)); // otherwise, use PETSc's ILU on host
2168     B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos;
2169   }
2170   PetscFunctionReturn(PETSC_SUCCESS);
2171 }
2172 
2173 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info)
2174 {
2175   PetscFunctionBegin;
2176   PetscCall(MatSeqAIJKokkosSyncHost(A));
2177   PetscCall(MatCholeskyFactorNumeric_SeqAIJ(B, A, info));
2178 
2179   if (!info->solveonhost) { // if solve on host, then we don't need to copy L, U to device
2180     Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr;
2181     Mat_SeqAIJ                 *b       = static_cast<Mat_SeqAIJ *>(B->data);
2182     const PetscInt             *Bi = b->i, *Bj = b->j, *Bdiag = b->diag;
2183     const MatScalar            *Ba = b->a;
2184     PetscInt                    m  = B->rmap->n;
2185 
2186     if (factors->iU_h.extent(0) == 0) { // First time of numeric factorization
2187       // Allocate memory and copy the structure
2188       factors->iU_h = PetscIntKokkosViewHost(const_cast<PetscInt *>(Bi), m + 1); // wrap Bi as iU_h
2189       factors->jU_h = MatColIdxKokkosViewHost(NoInit("jU_h"), Bi[m]);
2190       factors->aU_h = MatScalarKokkosViewHost(NoInit("aU_h"), Bi[m]);
2191       factors->D_h  = MatScalarKokkosViewHost(NoInit("D_h"), m);
2192       factors->aU_d = Kokkos::create_mirror_view(DefaultMemorySpace(), factors->aU_h);
2193       factors->D_d  = Kokkos::create_mirror_view(DefaultMemorySpace(), factors->D_h);
2194 
2195       // Build jU_h from the skewed Aj
2196       PetscInt *Uj = factors->jU_h.data();
2197       for (PetscInt i = 0; i < m; i++) {
2198         PetscInt ulen = Bi[i + 1] - Bi[i];
2199         Uj[Bi[i]]     = i;                                              // diagonal entry
2200         PetscCall(PetscArraycpy(Uj + Bi[i] + 1, Bj + Bi[i], ulen - 1)); // entries of U on the right of the diagonal
2201       }
2202 
2203       // Copy iU, jU to device
2204       PetscCallCXX(factors->iU_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->iU_h));
2205       PetscCallCXX(factors->jU_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->jU_h));
2206 
2207       // Copy row/col permutation to device
2208       IS        rowperm = ((Mat_SeqAIJ *)B->data)->row;
2209       PetscBool row_identity;
2210       PetscCall(ISIdentity(rowperm, &row_identity));
2211       if (!row_identity) {
2212         const PetscInt *ip;
2213 
2214         PetscCall(ISGetIndices(rowperm, &ip));
2215         PetscCallCXX(factors->rowperm = PetscIntKokkosView(NoInit("rowperm"), m));
2216         PetscCallCXX(Kokkos::deep_copy(factors->rowperm, PetscIntKokkosViewHost(const_cast<PetscInt *>(ip), m)));
2217         PetscCall(ISRestoreIndices(rowperm, &ip));
2218         PetscCall(PetscLogCpuToGpu(m * sizeof(PetscInt)));
2219       }
2220 
2221       // Create sptrsv handles for U and U^T
2222 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
2223       auto sptrsv_alg = SPTRSVAlgorithm::SPTRSV_CUSPARSE;
2224 #else
2225       auto sptrsv_alg = SPTRSVAlgorithm::SEQLVLSCHD_TP1;
2226 #endif
2227       factors->khU.create_sptrsv_handle(sptrsv_alg, m, false /* U is not lower tri */);
2228       factors->khUt.create_sptrsv_handle(sptrsv_alg, m, true /* U^T is lower tri */);
2229     }
2230     // These pointers were set MatCholeskyFactorNumeric_SeqAIJ(), so we always need to update them
2231     B->ops->solve          = MatSolve_SeqAIJKokkos_Cholesky;
2232     B->ops->solvetranspose = MatSolve_SeqAIJKokkos_Cholesky;
2233 
2234     // Copy the value
2235     PetscScalar *Ua = factors->aU_h.data();
2236     PetscScalar *D  = factors->D_h.data();
2237     for (PetscInt i = 0; i < m; i++) {
2238       D[i]      = Ba[Bdiag[i]];     // actually Aa[Adiag[i]] is the inverse of the diagonal
2239       Ua[Bi[i]] = (PetscScalar)1.0; // set the unit diagonal for U
2240       for (PetscInt k = 0; k < Bi[i + 1] - Bi[i] - 1; k++) Ua[Bi[i] + 1 + k] = -Ba[Bi[i] + k];
2241     }
2242     PetscCallCXX(Kokkos::deep_copy(factors->aU_d, factors->aU_h));
2243     PetscCallCXX(Kokkos::deep_copy(factors->D_d, factors->D_h));
2244 
2245     factors->sptrsv_symbolic_completed = PETSC_FALSE; // When numeric value changed, we must do these again
2246     factors->transpose_updated         = PETSC_FALSE;
2247   }
2248 
2249   B->ops->matsolve          = NULL;
2250   B->ops->matsolvetranspose = NULL;
2251   PetscFunctionReturn(PETSC_SUCCESS);
2252 }
2253 
2254 static PetscErrorCode MatICCFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS perm, const MatFactorInfo *info)
2255 {
2256   PetscFunctionBegin;
2257   if (info->solveonhost) {
2258     // If solve on host, we have to change the type, as eventually we need to call MatSolve_SeqSBAIJ_1_NaturalOrdering() etc.
2259     PetscCall(MatSetType(B, MATSEQSBAIJ));
2260     PetscCall(MatSeqSBAIJSetPreallocation(B, 1, MAT_SKIP_ALLOCATION, NULL));
2261   }
2262 
2263   PetscCall(MatICCFactorSymbolic_SeqAIJ(B, A, perm, info));
2264 
2265   if (!info->solveonhost) {
2266     // If solve on device, B is still a MATSEQAIJKOKKOS, so we are good to allocate B->spptr
2267     PetscCheck(!B->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr");
2268     PetscCallCXX(B->spptr = new Mat_SeqAIJKokkosTriFactors(B->rmap->n));
2269     B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJKokkos;
2270   }
2271   PetscFunctionReturn(PETSC_SUCCESS);
2272 }
2273 
2274 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS perm, const MatFactorInfo *info)
2275 {
2276   PetscFunctionBegin;
2277   if (info->solveonhost) {
2278     // If solve on host, we have to change the type, as eventually we need to call MatSolve_SeqSBAIJ_1_NaturalOrdering() etc.
2279     PetscCall(MatSetType(B, MATSEQSBAIJ));
2280     PetscCall(MatSeqSBAIJSetPreallocation(B, 1, MAT_SKIP_ALLOCATION, NULL));
2281   }
2282 
2283   PetscCall(MatCholeskyFactorSymbolic_SeqAIJ(B, A, perm, info)); // it sets B's two ISes ((Mat_SeqAIJ*)B->data)->{row, col} to perm
2284 
2285   if (!info->solveonhost) {
2286     // If solve on device, B is still a MATSEQAIJKOKKOS, so we are good to allocate B->spptr
2287     PetscCheck(!B->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr");
2288     PetscCallCXX(B->spptr = new Mat_SeqAIJKokkosTriFactors(B->rmap->n));
2289     B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJKokkos;
2290   }
2291   PetscFunctionReturn(PETSC_SUCCESS);
2292 }
2293 
2294 // The _Kokkos suffix means we will use Kokkos as a solver for the SeqAIJKokkos matrix
2295 static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos_Kokkos(Mat A, MatSolverType *type)
2296 {
2297   PetscFunctionBegin;
2298   *type = MATSOLVERKOKKOS;
2299   PetscFunctionReturn(PETSC_SUCCESS);
2300 }
2301 
2302 /*MC
2303   MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices
2304   on a single GPU of type, `MATSEQAIJKOKKOS`, `MATAIJKOKKOS`.
2305 
2306   Level: beginner
2307 
2308 .seealso: [](ch_matrices), `Mat`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJKokkos()`, `MATAIJKOKKOS`, `MatKokkosSetFormat()`, `MatKokkosStorageFormat`, `MatKokkosFormatOperation`
2309 M*/
2310 PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A, MatFactorType ftype, Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */
2311 {
2312   PetscInt n = A->rmap->n;
2313   MPI_Comm comm;
2314 
2315   PetscFunctionBegin;
2316   PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
2317   PetscCall(MatCreate(comm, B));
2318   PetscCall(MatSetSizes(*B, n, n, n, n));
2319   PetscCall(MatSetBlockSizesFromMats(*B, A, A));
2320   (*B)->factortype = ftype;
2321   PetscCall(MatSetType(*B, MATSEQAIJKOKKOS));
2322   PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL));
2323   PetscCheck(!(*B)->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr");
2324 
2325   if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
2326     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJKokkos;
2327     (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos;
2328     PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU]));
2329     PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILU]));
2330     PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILUDT]));
2331   } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
2332     (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJKokkos;
2333     (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJKokkos;
2334     PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]));
2335     PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ICC]));
2336   } else SETERRQ(comm, PETSC_ERR_SUP, "MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]);
2337 
2338   // The factorization can use the ordering provided in MatLUFactorSymbolic(), MatCholeskyFactorSymbolic() etc, though we do it on host
2339   (*B)->canuseordering = PETSC_TRUE;
2340   PetscCall(PetscObjectComposeFunction((PetscObject)*B, "MatFactorGetSolverType_C", MatFactorGetSolverType_SeqAIJKokkos_Kokkos));
2341   PetscFunctionReturn(PETSC_SUCCESS);
2342 }
2343 
2344 PETSC_INTERN PetscErrorCode MatSolverTypeRegister_Kokkos(void)
2345 {
2346   PetscFunctionBegin;
2347   PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_LU, MatGetFactor_SeqAIJKokkos_Kokkos));
2348   PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_CHOLESKY, MatGetFactor_SeqAIJKokkos_Kokkos));
2349   PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_ILU, MatGetFactor_SeqAIJKokkos_Kokkos));
2350   PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_ICC, MatGetFactor_SeqAIJKokkos_Kokkos));
2351   PetscFunctionReturn(PETSC_SUCCESS);
2352 }
2353 
2354 /* Utility to print out a KokkosCsrMatrix for debugging */
2355 PETSC_INTERN PetscErrorCode PrintCsrMatrix(const KokkosCsrMatrix &csrmat)
2356 {
2357   const auto        &iv = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), csrmat.graph.row_map);
2358   const auto        &jv = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), csrmat.graph.entries);
2359   const auto        &av = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), csrmat.values);
2360   const PetscInt    *i  = iv.data();
2361   const PetscInt    *j  = jv.data();
2362   const PetscScalar *a  = av.data();
2363   PetscInt           m = csrmat.numRows(), n = csrmat.numCols(), nnz = csrmat.nnz();
2364 
2365   PetscFunctionBegin;
2366   PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT " x %" PetscInt_FMT " SeqAIJKokkos, with %" PetscInt_FMT " nonzeros\n", m, n, nnz));
2367   for (PetscInt k = 0; k < m; k++) {
2368     PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT ": ", k));
2369     for (PetscInt p = i[k]; p < i[k + 1]; p++) PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT "(%.1f), ", j[p], (double)PetscRealPart(a[p])));
2370     PetscCall(PetscPrintf(PETSC_COMM_SELF, "\n"));
2371   }
2372   PetscFunctionReturn(PETSC_SUCCESS);
2373 }
2374