xref: /petsc/src/mat/impls/aij/seq/kokkos/aijkok.kokkos.cxx (revision f3d3cd90648576fafae91a24e2611daaef7bcd2e)
1 #include <petsc_kokkos.hpp>
2 #include <petscvec_kokkos.hpp>
3 #include <petscmat_kokkos.hpp>
4 #include <petscpkg_version.h>
5 #include <petsc/private/petscimpl.h>
6 #include <petsc/private/sfimpl.h>
7 #include <petsc/private/kokkosimpl.hpp>
8 #include <petscsys.h>
9 
10 #include <Kokkos_Core.hpp>
11 #include <KokkosBlas.hpp>
12 #include <KokkosSparse_CrsMatrix.hpp>
13 
14 // To suppress compiler warnings:
15 // /path/include/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp:434:63:
16 // warning: 'cusparseStatus_t cusparseDbsrmm(cusparseHandle_t, cusparseDirection_t, cusparseOperation_t,
17 // cusparseOperation_t, int, int, int, int, const double*, cusparseMatDescr_t, const double*, const int*, const int*,
18 // int, const double*, int, const double*, double*, int)' is deprecated: please use cusparseSpMM instead [-Wdeprecated-declarations]
19 PETSC_PRAGMA_DIAGNOSTIC_IGNORED_BEGIN("-Wdeprecated-declarations")
20 #include <KokkosSparse_spmv.hpp>
21 PETSC_PRAGMA_DIAGNOSTIC_IGNORED_END()
22 
23 #include <KokkosSparse_spiluk.hpp>
24 #include <KokkosSparse_sptrsv.hpp>
25 #include <KokkosSparse_spgemm.hpp>
26 #include <KokkosSparse_spadd.hpp>
27 #include <KokkosBatched_LU_Decl.hpp>
28 #include <KokkosBatched_InverseLU_Decl.hpp>
29 
30 #include <../src/mat/impls/aij/seq/kokkos/aijkok.hpp>
31 
32 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_GE(3, 7, 0)
33   #include <KokkosSparse_Utils.hpp>
34 using KokkosSparse::sort_crs_matrix;
35 using KokkosSparse::Impl::transpose_matrix;
36 #else
37   #include <KokkosKernels_Sorting.hpp>
38 using KokkosKernels::sort_crs_matrix;
39 using KokkosKernels::Impl::transpose_matrix;
40 #endif
41 
42 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_GE(4, 6, 0)
43 using KokkosSparse::spiluk_symbolic;
44 using KokkosSparse::spiluk_numeric;
45 using KokkosSparse::sptrsv_symbolic;
46 using KokkosSparse::sptrsv_solve;
47 using KokkosSparse::Experimental::SPTRSVAlgorithm;
48 using KokkosSparse::Experimental::SPILUKAlgorithm;
49 #else
50 using KokkosSparse::Experimental::spiluk_symbolic;
51 using KokkosSparse::Experimental::spiluk_numeric;
52 using KokkosSparse::Experimental::sptrsv_symbolic;
53 using KokkosSparse::Experimental::sptrsv_solve;
54 using KokkosSparse::Experimental::SPTRSVAlgorithm;
55 using KokkosSparse::Experimental::SPILUKAlgorithm;
56 #endif
57 
58 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */
59 
60 /* MatAssemblyEnd_SeqAIJKokkos() happens when we finalized nonzeros of the matrix, either after
61    we assembled the matrix on host, or after we directly produced the matrix data on device (ex., through MatMatMult).
62    In the latter case, it is important to set a_dual's sync state correctly.
63  */
64 static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A, MatAssemblyType mode)
65 {
66   Mat_SeqAIJ       *aijseq;
67   Mat_SeqAIJKokkos *aijkok;
68 
69   PetscFunctionBegin;
70   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS);
71   PetscCall(MatAssemblyEnd_SeqAIJ(A, mode));
72 
73   aijseq = static_cast<Mat_SeqAIJ *>(A->data);
74   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
75 
76   /* If aijkok does not exist, we just copy i, j to device.
77      If aijkok already exists, but the device's nonzero pattern does not match with the host's, we assume the latest data is on host.
78      In both cases, we build a new aijkok structure.
79   */
80   if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { /* aijkok might not exist yet or nonzero pattern has changed */
81     delete aijkok;
82     aijkok   = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aijseq, A->nonzerostate, PETSC_FALSE /*don't copy mat values to device*/);
83     A->spptr = aijkok;
84   } else if (A->rmap->n && aijkok->diag_dual.extent(0) == 0) { // MatProduct might directly produce AIJ on device, but not the diag.
85     MatRowMapKokkosViewHost diag_h(aijseq->diag, A->rmap->n);
86     auto                    diag_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), diag_h);
87     aijkok->diag_dual              = MatRowMapKokkosDualView(diag_d, diag_h);
88   }
89   PetscFunctionReturn(PETSC_SUCCESS);
90 }
91 
92 /* Sync CSR data to device if not yet */
93 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A)
94 {
95   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
96 
97   PetscFunctionBegin;
98   PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Can't sync factorized matrix from host to device");
99   PetscCheck(aijkok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
100   if (aijkok->a_dual.need_sync_device()) {
101     PetscCall(KokkosDualViewSyncDevice(aijkok->a_dual, PetscGetKokkosExecutionSpace()));
102     aijkok->transpose_updated = PETSC_FALSE; /* values of the transpose is out-of-date */
103     aijkok->hermitian_updated = PETSC_FALSE;
104   }
105   PetscFunctionReturn(PETSC_SUCCESS);
106 }
107 
108 /* Mark the CSR data on device as modified */
109 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosModifyDevice(Mat A)
110 {
111   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
112 
113   PetscFunctionBegin;
114   PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Not supported for factorized matries");
115   aijkok->a_dual.clear_sync_state();
116   aijkok->a_dual.modify_device();
117   aijkok->transpose_updated = PETSC_FALSE;
118   aijkok->hermitian_updated = PETSC_FALSE;
119   PetscCall(MatSeqAIJInvalidateDiagonal(A));
120   PetscCall(PetscObjectStateIncrease((PetscObject)A));
121   PetscFunctionReturn(PETSC_SUCCESS);
122 }
123 
124 static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A)
125 {
126   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
127   auto              exec   = PetscGetKokkosExecutionSpace();
128 
129   PetscFunctionBegin;
130   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
131   /* We do not expect one needs factors on host  */
132   PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Can't sync factorized matrix from device to host");
133   PetscCheck(aijkok, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing AIJKOK");
134   PetscCall(KokkosDualViewSyncHost(aijkok->a_dual, exec));
135   PetscFunctionReturn(PETSC_SUCCESS);
136 }
137 
138 static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A, PetscScalar *array[])
139 {
140   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
141 
142   PetscFunctionBegin;
143   /* aijkok contains valid pointers only if the host's nonzerostate matches with the device's.
144     Calling MatSeqAIJSetPreallocation() or MatSetValues() on host, where aijseq->{i,j,a} might be
145     reallocated, will lead to stale {i,j,a}_dual in aijkok. In both operations, the hosts's nonzerostate
146     must have been updated. The stale aijkok will be rebuilt during MatAssemblyEnd.
147   */
148   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
149     PetscCall(KokkosDualViewSyncHost(aijkok->a_dual, PetscGetKokkosExecutionSpace()));
150     *array = aijkok->a_dual.view_host().data();
151   } else { /* Happens when calling MatSetValues on a newly created matrix */
152     *array = static_cast<Mat_SeqAIJ *>(A->data)->a;
153   }
154   PetscFunctionReturn(PETSC_SUCCESS);
155 }
156 
157 static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJKokkos(Mat A, PetscScalar *array[])
158 {
159 #if !defined(KOKKOS_ENABLE_UNIFIED_MEMORY)
160   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
161 #endif
162 
163   PetscFunctionBegin;
164 #if !defined(KOKKOS_ENABLE_UNIFIED_MEMORY)
165   if (aijkok && A->nonzerostate == aijkok->nonzerostate) aijkok->a_dual.modify_host();
166 #endif
167   PetscFunctionReturn(PETSC_SUCCESS);
168 }
169 
170 static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[])
171 {
172   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
173 
174   PetscFunctionBegin;
175   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
176     PetscCall(KokkosDualViewSyncHost(aijkok->a_dual, PetscGetKokkosExecutionSpace()));
177     *array = aijkok->a_dual.view_host().data();
178   } else {
179     *array = static_cast<Mat_SeqAIJ *>(A->data)->a;
180   }
181   PetscFunctionReturn(PETSC_SUCCESS);
182 }
183 
184 static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[])
185 {
186   PetscFunctionBegin;
187   *array = NULL;
188   PetscFunctionReturn(PETSC_SUCCESS);
189 }
190 
191 static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[])
192 {
193   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
194 
195   PetscFunctionBegin;
196   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
197     *array = aijkok->a_dual.view_host().data();
198   } else { /* Ex. happens with MatZeroEntries on a preallocated but not assembled matrix */
199     *array = static_cast<Mat_SeqAIJ *>(A->data)->a;
200   }
201   PetscFunctionReturn(PETSC_SUCCESS);
202 }
203 
204 static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[])
205 {
206   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
207 
208   PetscFunctionBegin;
209 #if !defined(KOKKOS_ENABLE_UNIFIED_MEMORY)
210   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
211     aijkok->a_dual.clear_sync_state();
212     aijkok->a_dual.modify_host();
213   }
214 #else
215   (void)aijkok;
216 #endif
217   PetscFunctionReturn(PETSC_SUCCESS);
218 }
219 
220 static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJKokkos(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype)
221 {
222   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
223 
224   PetscFunctionBegin;
225   PetscCheck(aijkok != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "aijkok is NULL");
226 
227   if (i) *i = aijkok->i_device_data();
228   if (j) *j = aijkok->j_device_data();
229   if (a) {
230     PetscCall(KokkosDualViewSyncDevice(aijkok->a_dual, PetscGetKokkosExecutionSpace()));
231     *a = aijkok->a_device_data();
232   }
233   if (mtype) *mtype = PETSC_MEMTYPE_KOKKOS;
234   PetscFunctionReturn(PETSC_SUCCESS);
235 }
236 
237 /*
238   Generate the sparsity pattern of a MatSeqAIJKokkos matrix's transpose on device.
239 
240   Input Parameter:
241 .  A       - the MATSEQAIJKOKKOS matrix
242 
243   Output Parameters:
244 +  perm_d - the permutation array on device, which connects Ta(i) = Aa(perm(i))
245 -  T_d    - the transpose on device, whose value array is allocated but not initialized
246 */
247 static PetscErrorCode MatSeqAIJKokkosGenerateTransposeStructure(Mat A, MatRowMapKokkosView &perm_d, KokkosCsrMatrix &T_d)
248 {
249   Mat_SeqAIJ             *aseq = static_cast<Mat_SeqAIJ *>(A->data);
250   PetscInt                nz = aseq->nz, m = A->rmap->N, n = A->cmap->n;
251   const PetscInt         *Ai = aseq->i, *Aj = aseq->j;
252   MatRowMapKokkosViewHost Ti_h(NoInit("Ti"), n + 1);
253   MatRowMapType          *Ti = Ti_h.data();
254   MatColIdxKokkosViewHost Tj_h(NoInit("Tj"), nz);
255   MatRowMapKokkosViewHost perm_h(NoInit("permutation"), nz);
256   PetscInt               *Tj   = Tj_h.data();
257   PetscInt               *perm = perm_h.data();
258   PetscInt               *offset;
259 
260   PetscFunctionBegin;
261   // Populate Ti
262   PetscCallCXX(Kokkos::deep_copy(Ti_h, 0));
263   Ti++;
264   for (PetscInt i = 0; i < nz; i++) Ti[Aj[i]]++;
265   Ti--;
266   for (PetscInt i = 0; i < n; i++) Ti[i + 1] += Ti[i];
267 
268   // Populate Tj and the permutation array
269   PetscCall(PetscCalloc1(n, &offset)); // offset in each T row to fill in its column indices
270   for (PetscInt i = 0; i < m; i++) {
271     for (PetscInt j = Ai[i]; j < Ai[i + 1]; j++) { // A's (i,j) is T's (j,i)
272       PetscInt r    = Aj[j];                       // row r of T
273       PetscInt disp = Ti[r] + offset[r];
274 
275       Tj[disp]   = i; // col i of T
276       perm[disp] = j;
277       offset[r]++;
278     }
279   }
280   PetscCall(PetscFree(offset));
281 
282   // Sort each row of T, along with the permutation array
283   for (PetscInt i = 0; i < n; i++) PetscCall(PetscSortIntWithArray(Ti[i + 1] - Ti[i], Tj + Ti[i], perm + Ti[i]));
284 
285   // Output perm and T on device
286   auto Ti_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), Ti_h);
287   auto Tj_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), Tj_h);
288   PetscCallCXX(T_d = KokkosCsrMatrix("csrmatT", n, m, nz, MatScalarKokkosView("Ta", nz), Ti_d, Tj_d));
289   PetscCallCXX(perm_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), perm_h));
290   PetscFunctionReturn(PETSC_SUCCESS);
291 }
292 
293 // Generate the transpose on device and cache it internally
294 // Note: KK transpose_matrix() does not have support symbolic/numeric transpose, so we do it on our own
295 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGenerateTranspose_Private(Mat A, KokkosCsrMatrix *csrmatT)
296 {
297   Mat_SeqAIJ       *aseq = static_cast<Mat_SeqAIJ *>(A->data);
298   Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
299   PetscInt          nz = aseq->nz, m = A->rmap->N, n = A->cmap->n;
300   KokkosCsrMatrix  &T = akok->csrmatT;
301 
302   PetscFunctionBegin;
303   PetscCheck(akok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
304   PetscCall(KokkosDualViewSyncDevice(akok->a_dual, PetscGetKokkosExecutionSpace())); // Sync A's values since we are going to access them on device
305 
306   const auto &Aa = akok->a_dual.view_device();
307 
308   if (A->symmetric == PETSC_BOOL3_TRUE) {
309     *csrmatT = akok->csrmat;
310   } else {
311     // See if we already have a cached transpose and its value is up to date
312     if (T.numRows() == n && T.numCols() == m) {  // this indicates csrmatT had been generated before, otherwise T has 0 rows/cols after construction
313       if (!akok->transpose_updated) {            // if the value is out of date, update the cached version
314         const auto &perm = akok->transpose_perm; // get the permutation array
315         auto       &Ta   = T.values;
316 
317         PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { Ta(i) = Aa(perm(i)); }));
318       }
319     } else { // Generate T of size n x m for the first time
320       MatRowMapKokkosView perm;
321 
322       PetscCall(MatSeqAIJKokkosGenerateTransposeStructure(A, perm, T));
323       akok->transpose_perm = perm; // cache the perm in this matrix for reuse
324       PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { T.values(i) = Aa(perm(i)); }));
325     }
326     akok->transpose_updated = PETSC_TRUE;
327     *csrmatT                = akok->csrmatT;
328   }
329   PetscFunctionReturn(PETSC_SUCCESS);
330 }
331 
332 // Generate the Hermitian on device and cache it internally
333 static PetscErrorCode MatSeqAIJKokkosGenerateHermitian_Private(Mat A, KokkosCsrMatrix *csrmatH)
334 {
335   Mat_SeqAIJ       *aseq = static_cast<Mat_SeqAIJ *>(A->data);
336   Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
337   PetscInt          nz = aseq->nz, m = A->rmap->N, n = A->cmap->n;
338   KokkosCsrMatrix  &T = akok->csrmatH;
339 
340   PetscFunctionBegin;
341   PetscCheck(akok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
342   PetscCall(KokkosDualViewSyncDevice(akok->a_dual, PetscGetKokkosExecutionSpace())); // Sync A's values since we are going to access them on device
343 
344   const auto &Aa = akok->a_dual.view_device();
345 
346   if (A->hermitian == PETSC_BOOL3_TRUE) {
347     *csrmatH = akok->csrmat;
348   } else {
349     // See if we already have a cached hermitian and its value is up to date
350     if (T.numRows() == n && T.numCols() == m) {  // this indicates csrmatT had been generated before, otherwise T has 0 rows/cols after construction
351       if (!akok->hermitian_updated) {            // if the value is out of date, update the cached version
352         const auto &perm = akok->transpose_perm; // get the permutation array
353         auto       &Ta   = T.values;
354 
355         PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { Ta(i) = PetscConj(Aa(perm(i))); }));
356       }
357     } else { // Generate T of size n x m for the first time
358       MatRowMapKokkosView perm;
359 
360       PetscCall(MatSeqAIJKokkosGenerateTransposeStructure(A, perm, T));
361       akok->transpose_perm = perm; // cache the perm in this matrix for reuse
362       PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { T.values(i) = PetscConj(Aa(perm(i))); }));
363     }
364     akok->hermitian_updated = PETSC_TRUE;
365     *csrmatH                = akok->csrmatH;
366   }
367   PetscFunctionReturn(PETSC_SUCCESS);
368 }
369 
370 /* y = A x */
371 static PetscErrorCode MatMult_SeqAIJKokkos(Mat A, Vec xx, Vec yy)
372 {
373   Mat_SeqAIJKokkos          *aijkok;
374   ConstPetscScalarKokkosView xv;
375   PetscScalarKokkosView      yv;
376 
377   PetscFunctionBegin;
378   PetscCall(PetscLogGpuTimeBegin());
379   PetscCall(MatSeqAIJKokkosSyncDevice(A));
380   PetscCall(VecGetKokkosView(xx, &xv));
381   PetscCall(VecGetKokkosViewWrite(yy, &yv));
382   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
383   PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), "N", 1.0 /*alpha*/, aijkok->csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A x + beta y */
384   PetscCall(VecRestoreKokkosView(xx, &xv));
385   PetscCall(VecRestoreKokkosViewWrite(yy, &yv));
386   /* 2.0*nnz - numRows seems more accurate here but assumes there are no zero-rows. So a little sloppy here. */
387   PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz()));
388   PetscCall(PetscLogGpuTimeEnd());
389   PetscFunctionReturn(PETSC_SUCCESS);
390 }
391 
392 /* y = A^T x */
393 static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy)
394 {
395   Mat_SeqAIJKokkos          *aijkok;
396   const char                *mode;
397   ConstPetscScalarKokkosView xv;
398   PetscScalarKokkosView      yv;
399   KokkosCsrMatrix            csrmat;
400 
401   PetscFunctionBegin;
402   PetscCall(PetscLogGpuTimeBegin());
403   PetscCall(MatSeqAIJKokkosSyncDevice(A));
404   PetscCall(VecGetKokkosView(xx, &xv));
405   PetscCall(VecGetKokkosViewWrite(yy, &yv));
406   if (A->form_explicit_transpose) {
407     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat));
408     mode = "N";
409   } else {
410     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
411     csrmat = aijkok->csrmat;
412     mode   = "T";
413   }
414   PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^T x + beta y */
415   PetscCall(VecRestoreKokkosView(xx, &xv));
416   PetscCall(VecRestoreKokkosViewWrite(yy, &yv));
417   PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
418   PetscCall(PetscLogGpuTimeEnd());
419   PetscFunctionReturn(PETSC_SUCCESS);
420 }
421 
422 /* y = A^H x */
423 static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy)
424 {
425   Mat_SeqAIJKokkos          *aijkok;
426   const char                *mode;
427   ConstPetscScalarKokkosView xv;
428   PetscScalarKokkosView      yv;
429   KokkosCsrMatrix            csrmat;
430 
431   PetscFunctionBegin;
432   PetscCall(PetscLogGpuTimeBegin());
433   PetscCall(MatSeqAIJKokkosSyncDevice(A));
434   PetscCall(VecGetKokkosView(xx, &xv));
435   PetscCall(VecGetKokkosViewWrite(yy, &yv));
436   if (A->form_explicit_transpose) {
437     PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat));
438     mode = "N";
439   } else {
440     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
441     csrmat = aijkok->csrmat;
442     mode   = "C";
443   }
444   PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^H x + beta y */
445   PetscCall(VecRestoreKokkosView(xx, &xv));
446   PetscCall(VecRestoreKokkosViewWrite(yy, &yv));
447   PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
448   PetscCall(PetscLogGpuTimeEnd());
449   PetscFunctionReturn(PETSC_SUCCESS);
450 }
451 
452 /* z = A x + y */
453 static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz)
454 {
455   Mat_SeqAIJKokkos          *aijkok;
456   ConstPetscScalarKokkosView xv;
457   PetscScalarKokkosView      zv;
458 
459   PetscFunctionBegin;
460   PetscCall(PetscLogGpuTimeBegin());
461   PetscCall(MatSeqAIJKokkosSyncDevice(A));
462   if (zz != yy) PetscCall(VecCopy(yy, zz)); // depending on yy's sync flags, zz might get its latest data on host
463   PetscCall(VecGetKokkosView(xx, &xv));
464   PetscCall(VecGetKokkosView(zz, &zv)); // do after VecCopy(yy, zz) to get the latest data on device
465   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
466   PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), "N", 1.0 /*alpha*/, aijkok->csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A x + beta z */
467   PetscCall(VecRestoreKokkosView(xx, &xv));
468   PetscCall(VecRestoreKokkosView(zz, &zv));
469   PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz()));
470   PetscCall(PetscLogGpuTimeEnd());
471   PetscFunctionReturn(PETSC_SUCCESS);
472 }
473 
474 /* z = A^T x + y */
475 static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz)
476 {
477   Mat_SeqAIJKokkos          *aijkok;
478   const char                *mode;
479   ConstPetscScalarKokkosView xv;
480   PetscScalarKokkosView      zv;
481   KokkosCsrMatrix            csrmat;
482 
483   PetscFunctionBegin;
484   PetscCall(PetscLogGpuTimeBegin());
485   PetscCall(MatSeqAIJKokkosSyncDevice(A));
486   if (zz != yy) PetscCall(VecCopy(yy, zz));
487   PetscCall(VecGetKokkosView(xx, &xv));
488   PetscCall(VecGetKokkosView(zz, &zv));
489   if (A->form_explicit_transpose) {
490     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat));
491     mode = "N";
492   } else {
493     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
494     csrmat = aijkok->csrmat;
495     mode   = "T";
496   }
497   PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^T x + beta z */
498   PetscCall(VecRestoreKokkosView(xx, &xv));
499   PetscCall(VecRestoreKokkosView(zz, &zv));
500   PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
501   PetscCall(PetscLogGpuTimeEnd());
502   PetscFunctionReturn(PETSC_SUCCESS);
503 }
504 
505 /* z = A^H x + y */
506 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz)
507 {
508   Mat_SeqAIJKokkos          *aijkok;
509   const char                *mode;
510   ConstPetscScalarKokkosView xv;
511   PetscScalarKokkosView      zv;
512   KokkosCsrMatrix            csrmat;
513 
514   PetscFunctionBegin;
515   PetscCall(PetscLogGpuTimeBegin());
516   PetscCall(MatSeqAIJKokkosSyncDevice(A));
517   if (zz != yy) PetscCall(VecCopy(yy, zz));
518   PetscCall(VecGetKokkosView(xx, &xv));
519   PetscCall(VecGetKokkosView(zz, &zv));
520   if (A->form_explicit_transpose) {
521     PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat));
522     mode = "N";
523   } else {
524     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
525     csrmat = aijkok->csrmat;
526     mode   = "C";
527   }
528   PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^H x + beta z */
529   PetscCall(VecRestoreKokkosView(xx, &xv));
530   PetscCall(VecRestoreKokkosView(zz, &zv));
531   PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
532   PetscCall(PetscLogGpuTimeEnd());
533   PetscFunctionReturn(PETSC_SUCCESS);
534 }
535 
536 static PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A, MatOption op, PetscBool flg)
537 {
538   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
539 
540   PetscFunctionBegin;
541   switch (op) {
542   case MAT_FORM_EXPLICIT_TRANSPOSE:
543     /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
544     if (A->form_explicit_transpose && !flg && aijkok) PetscCall(aijkok->DestroyMatTranspose());
545     A->form_explicit_transpose = flg;
546     break;
547   default:
548     PetscCall(MatSetOption_SeqAIJ(A, op, flg));
549     break;
550   }
551   PetscFunctionReturn(PETSC_SUCCESS);
552 }
553 
554 /* Depending on reuse, either build a new mat, or use the existing mat */
555 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat *newmat)
556 {
557   Mat_SeqAIJ *aseq;
558 
559   PetscFunctionBegin;
560   PetscCall(PetscKokkosInitializeCheck());
561   if (reuse == MAT_INITIAL_MATRIX) {                      /* Build a brand new mat */
562     PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat));  /* the returned newmat is a SeqAIJKokkos */
563   } else if (reuse == MAT_REUSE_MATRIX) {                 /* Reuse the mat created before */
564     PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN)); /* newmat is already a SeqAIJKokkos */
565   } else if (reuse == MAT_INPLACE_MATRIX) {               /* newmat is A */
566     PetscCheck(A == *newmat, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "A != *newmat with MAT_INPLACE_MATRIX");
567     PetscCall(PetscFree(A->defaultvectype));
568     PetscCall(PetscStrallocpy(VECKOKKOS, &A->defaultvectype)); /* Allocate and copy the string */
569     PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATSEQAIJKOKKOS));
570     PetscCall(MatSetOps_SeqAIJKokkos(A));
571     aseq = static_cast<Mat_SeqAIJ *>(A->data);
572     if (A->assembled) { /* Copy i, j (but not values) to device for an assembled matrix if not yet */
573       PetscCheck(!A->spptr, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Expect NULL (Mat_SeqAIJKokkos*)A->spptr");
574       A->spptr = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aseq, A->nonzerostate, PETSC_FALSE);
575     }
576   }
577   PetscFunctionReturn(PETSC_SUCCESS);
578 }
579 
580 /* MatDuplicate always creates a new matrix. MatDuplicate can be called either on an assembled matrix or
581    an unassembled matrix, even though MAT_COPY_VALUES is not allowed for unassembled matrix.
582  */
583 static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A, MatDuplicateOption dupOption, Mat *B)
584 {
585   Mat_SeqAIJ       *bseq;
586   Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr), *bkok;
587   Mat               mat;
588 
589   PetscFunctionBegin;
590   /* Do not copy values on host as A's latest values might be on device. We don't want to do sync blindly */
591   PetscCall(MatDuplicate_SeqAIJ(A, MAT_DO_NOT_COPY_VALUES, B));
592   mat = *B;
593   if (A->assembled) {
594     bseq = static_cast<Mat_SeqAIJ *>(mat->data);
595     bkok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, bseq, mat->nonzerostate, PETSC_FALSE);
596     bkok->a_dual.clear_sync_state(); /* Clear B's sync state as it will be decided below */
597     /* Now copy values to B if needed */
598     if (dupOption == MAT_COPY_VALUES) {
599       if (akok->a_dual.need_sync_device()) {
600         Kokkos::deep_copy(bkok->a_dual.view_host(), akok->a_dual.view_host());
601         bkok->a_dual.modify_host();
602       } else { /* If device has the latest data, we only copy data on device */
603         Kokkos::deep_copy(bkok->a_dual.view_device(), akok->a_dual.view_device());
604         bkok->a_dual.modify_device();
605       }
606     } else { /* MAT_DO_NOT_COPY_VALUES or MAT_SHARE_NONZERO_PATTERN. B's values should be zeroed */
607       /* B's values on host should be already zeroed by MatDuplicate_SeqAIJ() */
608       bkok->a_dual.modify_host();
609     }
610     mat->spptr = bkok;
611   }
612 
613   PetscCall(PetscFree(mat->defaultvectype));
614   PetscCall(PetscStrallocpy(VECKOKKOS, &mat->defaultvectype)); /* Allocate and copy the string */
615   PetscCall(PetscObjectChangeTypeName((PetscObject)mat, MATSEQAIJKOKKOS));
616   PetscCall(MatSetOps_SeqAIJKokkos(mat));
617   PetscFunctionReturn(PETSC_SUCCESS);
618 }
619 
620 static PetscErrorCode MatTranspose_SeqAIJKokkos(Mat A, MatReuse reuse, Mat *B)
621 {
622   Mat               At;
623   KokkosCsrMatrix   internT;
624   Mat_SeqAIJKokkos *atkok, *bkok;
625 
626   PetscFunctionBegin;
627   if (reuse == MAT_REUSE_MATRIX) PetscCall(MatTransposeCheckNonzeroState_Private(A, *B));
628   PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &internT)); /* Generate a transpose internally */
629   if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_INPLACE_MATRIX) {
630     /* Deep copy internT, as we want to isolate the internal transpose */
631     PetscCallCXX(atkok = new Mat_SeqAIJKokkos(KokkosCsrMatrix("csrmat", internT)));
632     PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PetscObjectComm((PetscObject)A), atkok, &At));
633     if (reuse == MAT_INITIAL_MATRIX) *B = At;
634     else PetscCall(MatHeaderReplace(A, &At)); /* Replace A with At inplace */
635   } else {                                    /* MAT_REUSE_MATRIX, just need to copy values to B on device */
636     if ((*B)->assembled) {
637       bkok = static_cast<Mat_SeqAIJKokkos *>((*B)->spptr);
638       PetscCallCXX(Kokkos::deep_copy(bkok->a_dual.view_device(), internT.values));
639       PetscCall(MatSeqAIJKokkosModifyDevice(*B));
640     } else if ((*B)->preallocated) { /* It is ok for B to be only preallocated, as needed in MatTranspose_MPIAIJ */
641       Mat_SeqAIJ             *bseq = static_cast<Mat_SeqAIJ *>((*B)->data);
642       MatScalarKokkosViewHost a_h(bseq->a, internT.nnz()); /* bseq->nz = 0 if unassembled */
643       MatColIdxKokkosViewHost j_h(bseq->j, internT.nnz());
644       PetscCallCXX(Kokkos::deep_copy(a_h, internT.values));
645       PetscCallCXX(Kokkos::deep_copy(j_h, internT.graph.entries));
646     } else SETERRQ(PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "B must be assembled or preallocated");
647   }
648   PetscFunctionReturn(PETSC_SUCCESS);
649 }
650 
651 static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A)
652 {
653   Mat_SeqAIJKokkos *aijkok;
654 
655   PetscFunctionBegin;
656   if (A->factortype == MAT_FACTOR_NONE) {
657     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
658     delete aijkok;
659   } else {
660     delete static_cast<Mat_SeqAIJKokkosTriFactors *>(A->spptr);
661   }
662   A->spptr = NULL;
663   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatFactorGetSolverType_C", NULL));
664   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
665   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
666 #if defined(PETSC_HAVE_HYPRE)
667   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijkokkos_hypre_C", NULL));
668 #endif
669   PetscCall(MatDestroy_SeqAIJ(A));
670   PetscFunctionReturn(PETSC_SUCCESS);
671 }
672 
673 /*MC
674    MATSEQAIJKOKKOS - MATAIJKOKKOS = "(seq)aijkokkos" - A matrix type to be used for sparse matrices with Kokkos
675 
676    A matrix type using Kokkos-Kernels CrsMatrix type for portability across different device types
677 
678    Options Database Key:
679 .  -mat_type aijkokkos - sets the matrix type to `MATSEQAIJKOKKOS` during a call to `MatSetFromOptions()`
680 
681   Level: beginner
682 
683 .seealso: [](ch_matrices), `Mat`, `MatCreateSeqAIJKokkos()`, `MATMPIAIJKOKKOS`
684 M*/
685 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A)
686 {
687   PetscFunctionBegin;
688   PetscCall(PetscKokkosInitializeCheck());
689   PetscCall(MatCreate_SeqAIJ(A));
690   PetscCall(MatConvert_SeqAIJ_SeqAIJKokkos(A, MATSEQAIJKOKKOS, MAT_INPLACE_MATRIX, &A));
691   PetscFunctionReturn(PETSC_SUCCESS);
692 }
693 
694 /* Merge A, B into a matrix C. A is put before B. C's size would be A->rmap->n by (A->cmap->n + B->cmap->n) */
695 PetscErrorCode MatSeqAIJKokkosMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C)
696 {
697   Mat_SeqAIJ         *a, *b;
698   Mat_SeqAIJKokkos   *akok, *bkok, *ckok;
699   MatScalarKokkosView aa, ba, ca;
700   MatRowMapKokkosView ai, bi, ci;
701   MatColIdxKokkosView aj, bj, cj;
702   PetscInt            m, n, nnz, aN;
703 
704   PetscFunctionBegin;
705   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
706   PetscValidHeaderSpecific(B, MAT_CLASSID, 2);
707   PetscAssertPointer(C, 4);
708   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
709   PetscCheckTypeName(B, MATSEQAIJKOKKOS);
710   PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n);
711   PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported");
712 
713   PetscCall(MatSeqAIJKokkosSyncDevice(A));
714   PetscCall(MatSeqAIJKokkosSyncDevice(B));
715   a    = static_cast<Mat_SeqAIJ *>(A->data);
716   b    = static_cast<Mat_SeqAIJ *>(B->data);
717   akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
718   bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
719   aa   = akok->a_dual.view_device();
720   ai   = akok->i_dual.view_device();
721   ba   = bkok->a_dual.view_device();
722   bi   = bkok->i_dual.view_device();
723   m    = A->rmap->n; /* M, N and nnz of C */
724   n    = A->cmap->n + B->cmap->n;
725   nnz  = a->nz + b->nz;
726   aN   = A->cmap->n; /* N of A */
727   if (reuse == MAT_INITIAL_MATRIX) {
728     aj           = akok->j_dual.view_device();
729     bj           = bkok->j_dual.view_device();
730     auto ca_dual = MatScalarKokkosDualView("a", aa.extent(0) + ba.extent(0));
731     auto ci_dual = MatRowMapKokkosDualView("i", ai.extent(0));
732     auto cj_dual = MatColIdxKokkosDualView("j", aj.extent(0) + bj.extent(0));
733     ca           = ca_dual.view_device();
734     ci           = ci_dual.view_device();
735     cj           = cj_dual.view_device();
736 
737     /* Concatenate A and B in parallel using Kokkos hierarchical parallelism */
738     Kokkos::parallel_for(
739       Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
740         PetscInt i       = t.league_rank(); /* row i */
741         PetscInt coffset = ai(i) + bi(i), alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i);
742 
743         Kokkos::single(Kokkos::PerTeam(t), [=]() { /* this side effect only happens once per whole team */
744                                                    ci(i) = coffset;
745                                                    if (i == m - 1) ci(m) = ai(m) + bi(m);
746         });
747 
748         Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) {
749           if (k < alen) {
750             ca(coffset + k) = aa(ai(i) + k);
751             cj(coffset + k) = aj(ai(i) + k);
752           } else {
753             ca(coffset + k) = ba(bi(i) + k - alen);
754             cj(coffset + k) = bj(bi(i) + k - alen) + aN; /* Entries in B get new column indices in C */
755           }
756         });
757       });
758     ca_dual.modify_device();
759     ci_dual.modify_device();
760     cj_dual.modify_device();
761     PetscCallCXX(ckok = new Mat_SeqAIJKokkos(m, n, nnz, ci_dual, cj_dual, ca_dual));
762     PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, ckok, C));
763   } else if (reuse == MAT_REUSE_MATRIX) {
764     PetscValidHeaderSpecific(*C, MAT_CLASSID, 4);
765     PetscCheckTypeName(*C, MATSEQAIJKOKKOS);
766     ckok = static_cast<Mat_SeqAIJKokkos *>((*C)->spptr);
767     ca   = ckok->a_dual.view_device();
768     ci   = ckok->i_dual.view_device();
769 
770     Kokkos::parallel_for(
771       Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
772         PetscInt i    = t.league_rank(); /* row i */
773         PetscInt alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i);
774         Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) {
775           if (k < alen) ca(ci(i) + k) = aa(ai(i) + k);
776           else ca(ci(i) + k) = ba(bi(i) + k - alen);
777         });
778       });
779     PetscCall(MatSeqAIJKokkosModifyDevice(*C));
780   }
781   PetscFunctionReturn(PETSC_SUCCESS);
782 }
783 
784 static PetscErrorCode MatProductDataDestroy_SeqAIJKokkos(void *pdata)
785 {
786   PetscFunctionBegin;
787   delete static_cast<MatProductData_SeqAIJKokkos *>(pdata);
788   PetscFunctionReturn(PETSC_SUCCESS);
789 }
790 
791 static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C)
792 {
793   Mat_Product                 *product = C->product;
794   Mat                          A, B;
795   bool                         transA, transB; /* use bool, since KK needs this type */
796   Mat_SeqAIJKokkos            *akok, *bkok, *ckok;
797   Mat_SeqAIJ                  *c;
798   MatProductData_SeqAIJKokkos *pdata;
799   KokkosCsrMatrix              csrmatA, csrmatB;
800 
801   PetscFunctionBegin;
802   MatCheckProduct(C, 1);
803   PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Product data empty");
804   pdata = static_cast<MatProductData_SeqAIJKokkos *>(C->product->data);
805 
806   // See if numeric has already been done in symbolic (e.g., user calls MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C)).
807   // If yes, skip the numeric, but reset the flag so that next time when user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C),
808   // we still do numeric.
809   if (pdata->reusesym) { // numeric reuses results from symbolic
810     pdata->reusesym = PETSC_FALSE;
811     PetscFunctionReturn(PETSC_SUCCESS);
812   }
813 
814   switch (product->type) {
815   case MATPRODUCT_AB:
816     transA = false;
817     transB = false;
818     break;
819   case MATPRODUCT_AtB:
820     transA = true;
821     transB = false;
822     break;
823   case MATPRODUCT_ABt:
824     transA = false;
825     transB = true;
826     break;
827   default:
828     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]);
829   }
830 
831   A = product->A;
832   B = product->B;
833   PetscCall(MatSeqAIJKokkosSyncDevice(A));
834   PetscCall(MatSeqAIJKokkosSyncDevice(B));
835   akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
836   bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
837   ckok = static_cast<Mat_SeqAIJKokkos *>(C->spptr);
838 
839   PetscCheck(ckok, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Device data structure spptr is empty");
840 
841   csrmatA = akok->csrmat;
842   csrmatB = bkok->csrmat;
843 
844   /* TODO: Once KK spgemm implements transpose, we can get rid of the explicit transpose here */
845   if (transA) {
846     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA));
847     transA = false;
848   }
849 
850   if (transB) {
851     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB));
852     transB = false;
853   }
854   PetscCall(PetscLogGpuTimeBegin());
855   PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, csrmatA, transA, csrmatB, transB, ckok->csrmat));
856 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(4, 0, 0)
857   auto spgemmHandle = pdata->kh.get_spgemm_handle();
858   if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(ckok->csrmat)); /* without sort, mat_tests-ex62_14_seqaijkokkos fails */
859 #endif
860 
861   PetscCall(PetscLogGpuTimeEnd());
862   PetscCall(MatSeqAIJKokkosModifyDevice(C));
863   /* shorter version of MatAssemblyEnd_SeqAIJ */
864   c = (Mat_SeqAIJ *)C->data;
865   PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz));
866   PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n"));
867   PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax));
868   c->reallocs         = 0;
869   C->info.mallocs     = 0;
870   C->info.nz_unneeded = 0;
871   C->assembled = C->was_assembled = PETSC_TRUE;
872   C->num_ass++;
873   PetscFunctionReturn(PETSC_SUCCESS);
874 }
875 
876 static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C)
877 {
878   Mat_Product                 *product = C->product;
879   MatProductType               ptype;
880   Mat                          A, B;
881   bool                         transA, transB;
882   Mat_SeqAIJKokkos            *akok, *bkok, *ckok;
883   MatProductData_SeqAIJKokkos *pdata;
884   MPI_Comm                     comm;
885   KokkosCsrMatrix              csrmatA, csrmatB, csrmatC;
886 
887   PetscFunctionBegin;
888   MatCheckProduct(C, 1);
889   PetscCall(PetscObjectGetComm((PetscObject)C, &comm));
890   PetscCheck(!product->data, comm, PETSC_ERR_PLIB, "Product data not empty");
891   A = product->A;
892   B = product->B;
893   PetscCall(MatSeqAIJKokkosSyncDevice(A));
894   PetscCall(MatSeqAIJKokkosSyncDevice(B));
895   akok    = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
896   bkok    = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
897   csrmatA = akok->csrmat;
898   csrmatB = bkok->csrmat;
899 
900   ptype = product->type;
901   // Take advantage of the symmetry if true
902   if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
903     ptype                                          = MATPRODUCT_AB;
904     product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE;
905   }
906   if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
907     ptype                                          = MATPRODUCT_AB;
908     product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE;
909   }
910 
911   switch (ptype) {
912   case MATPRODUCT_AB:
913     transA = false;
914     transB = false;
915     PetscCall(MatSetBlockSizesFromMats(C, A, B));
916     break;
917   case MATPRODUCT_AtB:
918     transA = true;
919     transB = false;
920     if (A->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, A->cmap->bs));
921     if (B->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->cmap->bs));
922     break;
923   case MATPRODUCT_ABt:
924     transA = false;
925     transB = true;
926     if (A->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, A->rmap->bs));
927     if (B->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->rmap->bs));
928     break;
929   default:
930     SETERRQ(comm, PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]);
931   }
932   PetscCallCXX(product->data = pdata = new MatProductData_SeqAIJKokkos());
933   pdata->reusesym = product->api_user;
934 
935   /* TODO: add command line options to select spgemm algorithms */
936   auto spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_DEFAULT; /* default alg is TPL if enabled, otherwise KK */
937 
938   /* CUDA-10.2's spgemm has bugs. We prefer the SpGEMMreuse APIs introduced in cuda-11.4 */
939 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
940   #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
941   spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_KK;
942   #endif
943 #endif
944   PetscCallCXX(pdata->kh.create_spgemm_handle(spgemm_alg));
945 
946   PetscCall(PetscLogGpuTimeBegin());
947   /* TODO: Get rid of the explicit transpose once KK-spgemm implements the transpose option */
948   if (transA) {
949     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA));
950     transA = false;
951   }
952 
953   if (transB) {
954     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB));
955     transB = false;
956   }
957 
958   PetscCallCXX(KokkosSparse::spgemm_symbolic(pdata->kh, csrmatA, transA, csrmatB, transB, csrmatC));
959   /* spgemm_symbolic() only populates C's rowmap, but not C's column indices.
960     So we have to do a fake spgemm_numeric() here to get csrmatC.j_d setup, before
961     calling new Mat_SeqAIJKokkos().
962     TODO: Remove the fake spgemm_numeric() after KK fixed this problem.
963   */
964   PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, csrmatA, transA, csrmatB, transB, csrmatC));
965 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(4, 0, 0)
966   /* Query if KK outputs a sorted matrix. If not, we need to sort it */
967   auto spgemmHandle = pdata->kh.get_spgemm_handle();
968   if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(csrmatC)); /* sort_option defaults to -1 in KK!*/
969 #endif
970   PetscCall(PetscLogGpuTimeEnd());
971 
972   PetscCallCXX(ckok = new Mat_SeqAIJKokkos(csrmatC));
973   PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(C, ckok));
974   C->product->destroy = MatProductDataDestroy_SeqAIJKokkos;
975   PetscFunctionReturn(PETSC_SUCCESS);
976 }
977 
978 /* handles sparse matrix matrix ops */
979 static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat)
980 {
981   Mat_Product *product = mat->product;
982   PetscBool    Biskok = PETSC_FALSE, Ciskok = PETSC_TRUE;
983 
984   PetscFunctionBegin;
985   MatCheckProduct(mat, 1);
986   PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJKOKKOS, &Biskok));
987   if (product->type == MATPRODUCT_ABC) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJKOKKOS, &Ciskok));
988   if (Biskok && Ciskok) {
989     switch (product->type) {
990     case MATPRODUCT_AB:
991     case MATPRODUCT_AtB:
992     case MATPRODUCT_ABt:
993       mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos;
994       break;
995     case MATPRODUCT_PtAP:
996     case MATPRODUCT_RARt:
997     case MATPRODUCT_ABC:
998       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
999       break;
1000     default:
1001       break;
1002     }
1003   } else { /* fallback for AIJ */
1004     PetscCall(MatProductSetFromOptions_SeqAIJ(mat));
1005   }
1006   PetscFunctionReturn(PETSC_SUCCESS);
1007 }
1008 
1009 static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a)
1010 {
1011   Mat_SeqAIJKokkos *aijkok;
1012 
1013   PetscFunctionBegin;
1014   PetscCall(PetscLogGpuTimeBegin());
1015   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1016   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1017   KokkosBlas::scal(PetscGetKokkosExecutionSpace(), aijkok->a_dual.view_device(), a, aijkok->a_dual.view_device());
1018   PetscCall(MatSeqAIJKokkosModifyDevice(A));
1019   PetscCall(PetscLogGpuFlops(aijkok->a_dual.extent(0)));
1020   PetscCall(PetscLogGpuTimeEnd());
1021   PetscFunctionReturn(PETSC_SUCCESS);
1022 }
1023 
1024 // add a to A's diagonal (if A is square) or main diagonal (if A is rectangular)
1025 static PetscErrorCode MatShift_SeqAIJKokkos(Mat A, PetscScalar a)
1026 {
1027   Mat_SeqAIJ *aijseq = static_cast<Mat_SeqAIJ *>(A->data);
1028 
1029   PetscFunctionBegin;
1030   if (A->assembled && aijseq->diagonaldense) { // no missing diagonals
1031     PetscInt n = PetscMin(A->rmap->n, A->cmap->n);
1032 
1033     PetscCall(PetscLogGpuTimeBegin());
1034     PetscCall(MatSeqAIJKokkosSyncDevice(A));
1035     const auto  aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1036     const auto &Aa     = aijkok->a_dual.view_device();
1037     const auto &Adiag  = aijkok->diag_dual.view_device();
1038     PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) { Aa(Adiag(i)) += a; }));
1039     PetscCall(MatSeqAIJKokkosModifyDevice(A));
1040     PetscCall(PetscLogGpuFlops(n));
1041     PetscCall(PetscLogGpuTimeEnd());
1042   } else { // need reassembly, very slow!
1043     PetscCall(MatShift_Basic(A, a));
1044   }
1045   PetscFunctionReturn(PETSC_SUCCESS);
1046 }
1047 
1048 static PetscErrorCode MatDiagonalSet_SeqAIJKokkos(Mat Y, Vec D, InsertMode is)
1049 {
1050   Mat_SeqAIJ *aijseq = static_cast<Mat_SeqAIJ *>(Y->data);
1051 
1052   PetscFunctionBegin;
1053   if (Y->assembled && aijseq->diagonaldense) { // no missing diagonals
1054     ConstPetscScalarKokkosView dv;
1055     PetscInt                   n, nv;
1056 
1057     PetscCall(PetscLogGpuTimeBegin());
1058     PetscCall(MatSeqAIJKokkosSyncDevice(Y));
1059     PetscCall(VecGetKokkosView(D, &dv));
1060     PetscCall(VecGetLocalSize(D, &nv));
1061     n = PetscMin(Y->rmap->n, Y->cmap->n);
1062     PetscCheck(n == nv, PetscObjectComm((PetscObject)Y), PETSC_ERR_ARG_SIZ, "Matrix size and vector size do not match");
1063 
1064     const auto  aijkok = static_cast<Mat_SeqAIJKokkos *>(Y->spptr);
1065     const auto &Aa     = aijkok->a_dual.view_device();
1066     const auto &Adiag  = aijkok->diag_dual.view_device();
1067     PetscCallCXX(Kokkos::parallel_for(
1068       Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) {
1069         if (is == INSERT_VALUES) Aa(Adiag(i)) = dv(i);
1070         else Aa(Adiag(i)) += dv(i);
1071       }));
1072     PetscCall(VecRestoreKokkosView(D, &dv));
1073     PetscCall(MatSeqAIJKokkosModifyDevice(Y));
1074     PetscCall(PetscLogGpuFlops(n));
1075     PetscCall(PetscLogGpuTimeEnd());
1076   } else { // need reassembly, very slow!
1077     PetscCall(MatDiagonalSet_Default(Y, D, is));
1078   }
1079   PetscFunctionReturn(PETSC_SUCCESS);
1080 }
1081 
1082 static PetscErrorCode MatDiagonalScale_SeqAIJKokkos(Mat A, Vec ll, Vec rr)
1083 {
1084   Mat_SeqAIJ                *aijseq = static_cast<Mat_SeqAIJ *>(A->data);
1085   PetscInt                   m = A->rmap->n, n = A->cmap->n, nz = aijseq->nz;
1086   ConstPetscScalarKokkosView lv, rv;
1087 
1088   PetscFunctionBegin;
1089   PetscCall(PetscLogGpuTimeBegin());
1090   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1091   const auto  aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1092   const auto &Aa     = aijkok->a_dual.view_device();
1093   const auto &Ai     = aijkok->i_dual.view_device();
1094   const auto &Aj     = aijkok->j_dual.view_device();
1095   if (ll) {
1096     PetscCall(VecGetLocalSize(ll, &m));
1097     PetscCheck(m == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling vector wrong length");
1098     PetscCall(VecGetKokkosView(ll, &lv));
1099     PetscCallCXX(Kokkos::parallel_for( // for each row
1100       Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
1101         PetscInt i   = t.league_rank(); // row i
1102         PetscInt len = Ai(i + 1) - Ai(i);
1103         // scale entries on the row
1104         Kokkos::parallel_for(Kokkos::TeamThreadRange(t, len), [&](PetscInt j) { Aa(Ai(i) + j) *= lv(i); });
1105       }));
1106     PetscCall(VecRestoreKokkosView(ll, &lv));
1107     PetscCall(PetscLogGpuFlops(nz));
1108   }
1109   if (rr) {
1110     PetscCall(VecGetLocalSize(rr, &n));
1111     PetscCheck(n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Right scaling vector wrong length");
1112     PetscCall(VecGetKokkosView(rr, &rv));
1113     PetscCallCXX(Kokkos::parallel_for( // for each nonzero
1114       Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt k) { Aa(k) *= rv(Aj(k)); }));
1115     PetscCall(VecRestoreKokkosView(rr, &lv));
1116     PetscCall(PetscLogGpuFlops(nz));
1117   }
1118   PetscCall(MatSeqAIJKokkosModifyDevice(A));
1119   PetscCall(PetscLogGpuTimeEnd());
1120   PetscFunctionReturn(PETSC_SUCCESS);
1121 }
1122 
1123 static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A)
1124 {
1125   Mat_SeqAIJKokkos *aijkok;
1126 
1127   PetscFunctionBegin;
1128   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1129   if (aijkok) { /* Only zero the device if data is already there */
1130     KokkosBlas::fill(PetscGetKokkosExecutionSpace(), aijkok->a_dual.view_device(), 0.0);
1131     PetscCall(MatSeqAIJKokkosModifyDevice(A));
1132   } else { /* Might be preallocated but not assembled */
1133     PetscCall(MatZeroEntries_SeqAIJ(A));
1134   }
1135   PetscFunctionReturn(PETSC_SUCCESS);
1136 }
1137 
1138 static PetscErrorCode MatGetDiagonal_SeqAIJKokkos(Mat A, Vec x)
1139 {
1140   Mat_SeqAIJKokkos     *aijkok;
1141   PetscInt              n;
1142   PetscScalarKokkosView xv;
1143 
1144   PetscFunctionBegin;
1145   PetscCall(VecGetLocalSize(x, &n));
1146   PetscCheck(n == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector");
1147   PetscCheck(A->factortype == MAT_FACTOR_NONE, PETSC_COMM_SELF, PETSC_ERR_SUP, "MatGetDiagonal_SeqAIJKokkos not supported on factored matrices");
1148 
1149   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1150   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1151 
1152   const auto &Aa    = aijkok->a_dual.view_device();
1153   const auto &Ai    = aijkok->i_dual.view_device();
1154   const auto &Adiag = aijkok->diag_dual.view_device();
1155 
1156   PetscCall(VecGetKokkosViewWrite(x, &xv));
1157   Kokkos::parallel_for(
1158     Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) {
1159       if (Adiag(i) < Ai(i + 1)) xv(i) = Aa(Adiag(i));
1160       else xv(i) = 0;
1161     });
1162   PetscCall(VecRestoreKokkosViewWrite(x, &xv));
1163   PetscFunctionReturn(PETSC_SUCCESS);
1164 }
1165 
1166 /* Get a Kokkos View from a mat of type MatSeqAIJKokkos */
1167 PetscErrorCode MatSeqAIJGetKokkosView(Mat A, ConstMatScalarKokkosView *kv)
1168 {
1169   Mat_SeqAIJKokkos *aijkok;
1170 
1171   PetscFunctionBegin;
1172   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1173   PetscAssertPointer(kv, 2);
1174   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1175   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1176   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1177   *kv    = aijkok->a_dual.view_device();
1178   PetscFunctionReturn(PETSC_SUCCESS);
1179 }
1180 
1181 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, ConstMatScalarKokkosView *kv)
1182 {
1183   PetscFunctionBegin;
1184   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1185   PetscAssertPointer(kv, 2);
1186   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1187   PetscFunctionReturn(PETSC_SUCCESS);
1188 }
1189 
1190 PetscErrorCode MatSeqAIJGetKokkosView(Mat A, MatScalarKokkosView *kv)
1191 {
1192   Mat_SeqAIJKokkos *aijkok;
1193 
1194   PetscFunctionBegin;
1195   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1196   PetscAssertPointer(kv, 2);
1197   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1198   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1199   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1200   *kv    = aijkok->a_dual.view_device();
1201   PetscFunctionReturn(PETSC_SUCCESS);
1202 }
1203 
1204 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, MatScalarKokkosView *kv)
1205 {
1206   PetscFunctionBegin;
1207   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1208   PetscAssertPointer(kv, 2);
1209   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1210   PetscCall(MatSeqAIJKokkosModifyDevice(A));
1211   PetscFunctionReturn(PETSC_SUCCESS);
1212 }
1213 
1214 PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A, MatScalarKokkosView *kv)
1215 {
1216   Mat_SeqAIJKokkos *aijkok;
1217 
1218   PetscFunctionBegin;
1219   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1220   PetscAssertPointer(kv, 2);
1221   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1222   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1223   *kv    = aijkok->a_dual.view_device();
1224   PetscFunctionReturn(PETSC_SUCCESS);
1225 }
1226 
1227 PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A, MatScalarKokkosView *kv)
1228 {
1229   PetscFunctionBegin;
1230   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1231   PetscAssertPointer(kv, 2);
1232   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1233   PetscCall(MatSeqAIJKokkosModifyDevice(A));
1234   PetscFunctionReturn(PETSC_SUCCESS);
1235 }
1236 
1237 PetscErrorCode MatCreateSeqAIJKokkosWithKokkosViews(MPI_Comm comm, PetscInt m, PetscInt n, Kokkos::View<PetscInt *> &i_d, Kokkos::View<PetscInt *> &j_d, Kokkos::View<PetscScalar *> &a_d, Mat *A)
1238 {
1239   Mat_SeqAIJKokkos *akok;
1240 
1241   PetscFunctionBegin;
1242   auto exec = PetscGetKokkosExecutionSpace();
1243   // Create host copies of the input aij
1244   auto i_h = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), i_d);
1245   auto j_h = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), j_d);
1246   // Don't copy the vals to the host now
1247   auto a_h = Kokkos::create_mirror_view(HostMirrorMemorySpace(), a_d);
1248 
1249   MatScalarKokkosDualView a_dual = MatScalarKokkosDualView(a_d, a_h);
1250   // Note we have modified device data so it will copy lazily
1251   a_dual.modify_device();
1252   MatRowMapKokkosDualView i_dual = MatRowMapKokkosDualView(i_d, i_h);
1253   MatColIdxKokkosDualView j_dual = MatColIdxKokkosDualView(j_d, j_h);
1254 
1255   PetscCallCXX(akok = new Mat_SeqAIJKokkos(m, n, j_dual.extent(0), i_dual, j_dual, a_dual));
1256   PetscCall(MatCreate(comm, A));
1257   PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok));
1258   PetscFunctionReturn(PETSC_SUCCESS);
1259 }
1260 
1261 /* Computes Y += alpha X */
1262 static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y, PetscScalar alpha, Mat X, MatStructure pattern)
1263 {
1264   Mat_SeqAIJ              *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data;
1265   Mat_SeqAIJKokkos        *xkok, *ykok, *zkok;
1266   ConstMatScalarKokkosView Xa;
1267   MatScalarKokkosView      Ya;
1268   auto                     exec = PetscGetKokkosExecutionSpace();
1269 
1270   PetscFunctionBegin;
1271   PetscCheckTypeName(Y, MATSEQAIJKOKKOS);
1272   PetscCheckTypeName(X, MATSEQAIJKOKKOS);
1273   PetscCall(MatSeqAIJKokkosSyncDevice(Y));
1274   PetscCall(MatSeqAIJKokkosSyncDevice(X));
1275   PetscCall(PetscLogGpuTimeBegin());
1276 
1277   if (pattern != SAME_NONZERO_PATTERN && x->nz == y->nz) {
1278     PetscBool e;
1279     PetscCall(PetscArraycmp(x->i, y->i, Y->rmap->n + 1, &e));
1280     if (e) {
1281       PetscCall(PetscArraycmp(x->j, y->j, y->nz, &e));
1282       if (e) pattern = SAME_NONZERO_PATTERN;
1283     }
1284   }
1285 
1286   /* cusparseDcsrgeam2() computes C = alpha A + beta B. If one knew sparsity pattern of C, one can skip
1287     cusparseScsrgeam2_bufferSizeExt() / cusparseXcsrgeam2Nnz(), and directly call cusparseScsrgeam2().
1288     If X is SUBSET_NONZERO_PATTERN of Y, we could take advantage of this cusparse feature. However,
1289     KokkosSparse::spadd(alpha,A,beta,B,C) has symbolic and numeric phases, MatAXPY does not.
1290   */
1291   ykok = static_cast<Mat_SeqAIJKokkos *>(Y->spptr);
1292   xkok = static_cast<Mat_SeqAIJKokkos *>(X->spptr);
1293   Xa   = xkok->a_dual.view_device();
1294   Ya   = ykok->a_dual.view_device();
1295 
1296   if (pattern == SAME_NONZERO_PATTERN) {
1297     KokkosBlas::axpy(exec, alpha, Xa, Ya);
1298     PetscCall(MatSeqAIJKokkosModifyDevice(Y));
1299   } else if (pattern == SUBSET_NONZERO_PATTERN) {
1300     MatRowMapKokkosView Xi = xkok->i_dual.view_device(), Yi = ykok->i_dual.view_device();
1301     MatColIdxKokkosView Xj = xkok->j_dual.view_device(), Yj = ykok->j_dual.view_device();
1302 
1303     Kokkos::parallel_for(
1304       Kokkos::TeamPolicy<>(exec, Y->rmap->n, 1), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
1305         PetscInt i = t.league_rank(); // row i
1306         Kokkos::single(Kokkos::PerTeam(t), [=]() {
1307           // Only one thread works in a team
1308           PetscInt p, q = Yi(i);
1309           for (p = Xi(i); p < Xi(i + 1); p++) {          // For each nonzero on row i of X,
1310             while (Xj(p) != Yj(q) && q < Yi(i + 1)) q++; // find the matching nonzero on row i of Y.
1311             if (Xj(p) == Yj(q)) {                        // Found it
1312               Ya(q) += alpha * Xa(p);
1313               q++;
1314             } else {
1315             // If not found, it indicates the input is wrong (X is not a SUBSET_NONZERO_PATTERN of Y).
1316             // Just insert a NaN at the beginning of row i if it is not empty, to make the result wrong.
1317 #if PETSC_PKG_KOKKOS_VERSION_GE(3, 7, 0)
1318               if (Yi(i) != Yi(i + 1)) Ya(Yi(i)) = Kokkos::ArithTraits<PetscScalar>::nan();
1319 #else
1320               if (Yi(i) != Yi(i + 1)) Ya(Yi(i)) = Kokkos::Experimental::nan("1");
1321 #endif
1322             }
1323           }
1324         });
1325       });
1326     PetscCall(MatSeqAIJKokkosModifyDevice(Y));
1327   } else { // different nonzero patterns
1328     Mat             Z;
1329     KokkosCsrMatrix zcsr;
1330     KernelHandle    kh;
1331     kh.create_spadd_handle(true); // X, Y are sorted
1332     KokkosSparse::spadd_symbolic(&kh, xkok->csrmat, ykok->csrmat, zcsr);
1333     KokkosSparse::spadd_numeric(&kh, alpha, xkok->csrmat, (PetscScalar)1.0, ykok->csrmat, zcsr);
1334     zkok = new Mat_SeqAIJKokkos(zcsr);
1335     PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, zkok, &Z));
1336     PetscCall(MatHeaderReplace(Y, &Z));
1337     kh.destroy_spadd_handle();
1338   }
1339   PetscCall(PetscLogGpuTimeEnd());
1340   PetscCall(PetscLogGpuFlops(xkok->a_dual.extent(0) * 2)); // Because we scaled X and then added it to Y
1341   PetscFunctionReturn(PETSC_SUCCESS);
1342 }
1343 
1344 struct MatCOOStruct_SeqAIJKokkos {
1345   PetscCount           n;
1346   PetscCount           Atot;
1347   PetscInt             nz;
1348   PetscCountKokkosView jmap;
1349   PetscCountKokkosView perm;
1350 
1351   MatCOOStruct_SeqAIJKokkos(const MatCOOStruct_SeqAIJ *coo_h)
1352   {
1353     nz   = coo_h->nz;
1354     n    = coo_h->n;
1355     Atot = coo_h->Atot;
1356     jmap = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), PetscCountKokkosViewHost(coo_h->jmap, nz + 1));
1357     perm = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), PetscCountKokkosViewHost(coo_h->perm, Atot));
1358   }
1359 };
1360 
1361 static PetscErrorCode MatCOOStructDestroy_SeqAIJKokkos(void **data)
1362 {
1363   PetscFunctionBegin;
1364   PetscCallCXX(delete static_cast<MatCOOStruct_SeqAIJKokkos *>(*data));
1365   PetscFunctionReturn(PETSC_SUCCESS);
1366 }
1367 
1368 static PetscErrorCode MatSetPreallocationCOO_SeqAIJKokkos(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
1369 {
1370   Mat_SeqAIJKokkos          *akok;
1371   Mat_SeqAIJ                *aseq;
1372   PetscContainer             container_h;
1373   MatCOOStruct_SeqAIJ       *coo_h;
1374   MatCOOStruct_SeqAIJKokkos *coo_d;
1375 
1376   PetscFunctionBegin;
1377   PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, coo_i, coo_j));
1378   aseq = static_cast<Mat_SeqAIJ *>(mat->data);
1379   akok = static_cast<Mat_SeqAIJKokkos *>(mat->spptr);
1380   delete akok;
1381   mat->spptr = akok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, aseq, mat->nonzerostate + 1, PETSC_FALSE);
1382   PetscCall(MatZeroEntries_SeqAIJKokkos(mat));
1383 
1384   // Copy the COO struct to device
1385   PetscCall(PetscObjectQuery((PetscObject)mat, "__PETSc_MatCOOStruct_Host", (PetscObject *)&container_h));
1386   PetscCall(PetscContainerGetPointer(container_h, (void **)&coo_h));
1387   PetscCallCXX(coo_d = new MatCOOStruct_SeqAIJKokkos(coo_h));
1388 
1389   // Put the COO struct in a container and then attach that to the matrix
1390   PetscCall(PetscObjectContainerCompose((PetscObject)mat, "__PETSc_MatCOOStruct_Device", coo_d, MatCOOStructDestroy_SeqAIJKokkos));
1391   PetscFunctionReturn(PETSC_SUCCESS);
1392 }
1393 
1394 static PetscErrorCode MatSetValuesCOO_SeqAIJKokkos(Mat A, const PetscScalar v[], InsertMode imode)
1395 {
1396   MatScalarKokkosView        Aa;
1397   ConstMatScalarKokkosView   kv;
1398   PetscMemType               memtype;
1399   PetscContainer             container;
1400   MatCOOStruct_SeqAIJKokkos *coo;
1401 
1402   PetscFunctionBegin;
1403   PetscCall(PetscObjectQuery((PetscObject)A, "__PETSc_MatCOOStruct_Device", (PetscObject *)&container));
1404   PetscCall(PetscContainerGetPointer(container, (void **)&coo));
1405 
1406   const auto &n    = coo->n;
1407   const auto &Annz = coo->nz;
1408   const auto &jmap = coo->jmap;
1409   const auto &perm = coo->perm;
1410 
1411   PetscCall(PetscGetMemType(v, &memtype));
1412   if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */
1413     kv = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), ConstMatScalarKokkosViewHost(v, n));
1414   } else {
1415     kv = ConstMatScalarKokkosView(v, n); /* Directly use v[]'s memory */
1416   }
1417 
1418   if (imode == INSERT_VALUES) PetscCall(MatSeqAIJGetKokkosViewWrite(A, &Aa)); /* write matrix values */
1419   else PetscCall(MatSeqAIJGetKokkosView(A, &Aa));                             /* read & write matrix values */
1420 
1421   PetscCall(PetscLogGpuTimeBegin());
1422   Kokkos::parallel_for(
1423     Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, Annz), KOKKOS_LAMBDA(const PetscCount i) {
1424       PetscScalar sum = 0.0;
1425       for (PetscCount k = jmap(i); k < jmap(i + 1); k++) sum += kv(perm(k));
1426       Aa(i) = (imode == INSERT_VALUES ? 0.0 : Aa(i)) + sum;
1427     });
1428   PetscCall(PetscLogGpuTimeEnd());
1429 
1430   if (imode == INSERT_VALUES) PetscCall(MatSeqAIJRestoreKokkosViewWrite(A, &Aa));
1431   else PetscCall(MatSeqAIJRestoreKokkosView(A, &Aa));
1432   PetscFunctionReturn(PETSC_SUCCESS);
1433 }
1434 
1435 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A)
1436 {
1437   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
1438 
1439   PetscFunctionBegin;
1440   A->offloadmask = PETSC_OFFLOAD_KOKKOS; /* We do not really use this flag */
1441   A->boundtocpu  = PETSC_FALSE;
1442 
1443   A->ops->assemblyend               = MatAssemblyEnd_SeqAIJKokkos;
1444   A->ops->destroy                   = MatDestroy_SeqAIJKokkos;
1445   A->ops->duplicate                 = MatDuplicate_SeqAIJKokkos;
1446   A->ops->axpy                      = MatAXPY_SeqAIJKokkos;
1447   A->ops->scale                     = MatScale_SeqAIJKokkos;
1448   A->ops->zeroentries               = MatZeroEntries_SeqAIJKokkos;
1449   A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJKokkos;
1450   A->ops->mult                      = MatMult_SeqAIJKokkos;
1451   A->ops->multadd                   = MatMultAdd_SeqAIJKokkos;
1452   A->ops->multtranspose             = MatMultTranspose_SeqAIJKokkos;
1453   A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJKokkos;
1454   A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJKokkos;
1455   A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos;
1456   A->ops->productnumeric            = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos;
1457   A->ops->transpose                 = MatTranspose_SeqAIJKokkos;
1458   A->ops->setoption                 = MatSetOption_SeqAIJKokkos;
1459   A->ops->getdiagonal               = MatGetDiagonal_SeqAIJKokkos;
1460   A->ops->shift                     = MatShift_SeqAIJKokkos;
1461   A->ops->diagonalset               = MatDiagonalSet_SeqAIJKokkos;
1462   A->ops->diagonalscale             = MatDiagonalScale_SeqAIJKokkos;
1463   a->ops->getarray                  = MatSeqAIJGetArray_SeqAIJKokkos;
1464   a->ops->restorearray              = MatSeqAIJRestoreArray_SeqAIJKokkos;
1465   a->ops->getarrayread              = MatSeqAIJGetArrayRead_SeqAIJKokkos;
1466   a->ops->restorearrayread          = MatSeqAIJRestoreArrayRead_SeqAIJKokkos;
1467   a->ops->getarraywrite             = MatSeqAIJGetArrayWrite_SeqAIJKokkos;
1468   a->ops->restorearraywrite         = MatSeqAIJRestoreArrayWrite_SeqAIJKokkos;
1469   a->ops->getcsrandmemtype          = MatSeqAIJGetCSRAndMemType_SeqAIJKokkos;
1470 
1471   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJKokkos));
1472   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJKokkos));
1473 #if defined(PETSC_HAVE_HYPRE)
1474   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijkokkos_hypre_C", MatConvert_AIJ_HYPRE));
1475 #endif
1476   PetscFunctionReturn(PETSC_SUCCESS);
1477 }
1478 
1479 /*
1480    Extract the (prescribled) diagonal blocks of the matrix and then invert them
1481 
1482   Input Parameters:
1483 +  A       - the MATSEQAIJKOKKOS matrix
1484 .  bs      - block sizes in 'csr' format, i.e., the i-th block has size bs(i+1) - bs(i)
1485 .  bs2     - square of block sizes in 'csr' format, i.e., the i-th block should be stored at offset bs2(i) in diagVal[]
1486 .  blkMap  - map row ids to block ids, i.e., row i belongs to the block blkMap(i)
1487 -  work    - a pre-allocated work buffer (as big as diagVal) for use by this routine
1488 
1489   Output Parameter:
1490 .  diagVal - the (pre-allocated) buffer to store the inverted blocks (each block is stored in column-major order)
1491 */
1492 PETSC_INTERN PetscErrorCode MatInvertVariableBlockDiagonal_SeqAIJKokkos(Mat A, const PetscIntKokkosView &bs, const PetscIntKokkosView &bs2, const PetscIntKokkosView &blkMap, PetscScalarKokkosView &work, PetscScalarKokkosView &diagVal)
1493 {
1494   Mat_SeqAIJKokkos *akok    = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1495   PetscInt          nblocks = bs.extent(0) - 1;
1496 
1497   PetscFunctionBegin;
1498   PetscCall(MatSeqAIJKokkosSyncDevice(A)); // Since we'll access A's value on device
1499 
1500   // Pull out the diagonal blocks of the matrix and then invert the blocks
1501   auto Aa    = akok->a_dual.view_device();
1502   auto Ai    = akok->i_dual.view_device();
1503   auto Aj    = akok->j_dual.view_device();
1504   auto Adiag = akok->diag_dual.view_device();
1505   // TODO: how to tune the team size?
1506 #if defined(KOKKOS_ENABLE_UNIFIED_MEMORY)
1507   auto ts = Kokkos::AUTO();
1508 #else
1509   auto ts = 16; // improved performance 30% over Kokkos::AUTO() with CUDA, but failed with "Kokkos::abort: Requested Team Size is too large!" on CPUs
1510 #endif
1511   PetscCallCXX(Kokkos::parallel_for(
1512     Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), nblocks, ts), KOKKOS_LAMBDA(const KokkosTeamMemberType &teamMember) {
1513       const PetscInt bid    = teamMember.league_rank();                                                   // block id
1514       const PetscInt rstart = bs(bid);                                                                    // this block starts from this row
1515       const PetscInt m      = bs(bid + 1) - bs(bid);                                                      // size of this block
1516       const auto    &B      = Kokkos::View<PetscScalar **, Kokkos::LayoutLeft>(&diagVal(bs2(bid)), m, m); // column-major order
1517       const auto    &W      = PetscScalarKokkosView(&work(bs2(bid)), m * m);
1518 
1519       Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, m), [=](const PetscInt &r) { // r-th row in B
1520         PetscInt i = rstart + r;                                                            // i-th row in A
1521 
1522         if (Ai(i) <= Adiag(i) && Adiag(i) < Ai(i + 1)) { // if the diagonal exists (common case)
1523           PetscInt first = Adiag(i) - r;                 // we start to check nonzeros from here along this row
1524 
1525           for (PetscInt c = 0; c < m; c++) {                   // walk n steps to see what column indices we will meet
1526             if (first + c < Ai(i) || first + c >= Ai(i + 1)) { // this entry (first+c) is out of range of this row, in other words, its value is zero
1527               B(r, c) = 0.0;
1528             } else if (Aj(first + c) == rstart + c) { // this entry is right on the (rstart+c) column
1529               B(r, c) = Aa(first + c);
1530             } else { // this entry does not show up in the CSR
1531               B(r, c) = 0.0;
1532             }
1533           }
1534         } else { // rare case that the diagonal does not exist
1535           const PetscInt begin = Ai(i);
1536           const PetscInt end   = Ai(i + 1);
1537           for (PetscInt c = 0; c < m; c++) B(r, c) = 0.0;
1538           for (PetscInt j = begin; j < end; j++) { // scan the whole row; could use binary search but this is a rare case so we did not.
1539             if (rstart <= Aj(j) && Aj(j) < rstart + m) B(r, Aj(j) - rstart) = Aa(j);
1540             else if (Aj(j) >= rstart + m) break;
1541           }
1542         }
1543       });
1544 
1545       // LU-decompose B (w/o pivoting) and then invert B
1546       KokkosBatched::TeamLU<KokkosTeamMemberType, KokkosBatched::Algo::LU::Unblocked>::invoke(teamMember, B, 0.0);
1547       KokkosBatched::TeamInverseLU<KokkosTeamMemberType, KokkosBatched::Algo::InverseLU::Unblocked>::invoke(teamMember, B, W);
1548     }));
1549   // PetscLogGpuFlops() is done in the caller PCSetUp_VPBJacobi_Kokkos as we don't want to compute the flops in kernels
1550   PetscFunctionReturn(PETSC_SUCCESS);
1551 }
1552 
1553 PETSC_INTERN PetscErrorCode MatSetSeqAIJKokkosWithCSRMatrix(Mat A, Mat_SeqAIJKokkos *akok)
1554 {
1555   Mat_SeqAIJ *aseq;
1556   PetscInt    i, m, n;
1557   auto        exec = PetscGetKokkosExecutionSpace();
1558 
1559   PetscFunctionBegin;
1560   PetscCheck(!A->spptr, PETSC_COMM_SELF, PETSC_ERR_PLIB, "A->spptr is supposed to be empty");
1561 
1562   m = akok->nrows();
1563   n = akok->ncols();
1564   PetscCall(MatSetSizes(A, m, n, m, n));
1565   PetscCall(MatSetType(A, MATSEQAIJKOKKOS));
1566 
1567   /* Set up data structures of A as a MATSEQAIJ */
1568   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(A, MAT_SKIP_ALLOCATION, NULL));
1569   aseq = (Mat_SeqAIJ *)A->data;
1570 
1571   PetscCall(KokkosDualViewSyncHost(akok->i_dual, exec)); /* We always need sync'ed i, j on host */
1572   PetscCall(KokkosDualViewSyncHost(akok->j_dual, exec));
1573 
1574   aseq->i       = akok->i_host_data();
1575   aseq->j       = akok->j_host_data();
1576   aseq->a       = akok->a_host_data();
1577   aseq->nonew   = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/
1578   aseq->free_a  = PETSC_FALSE;
1579   aseq->free_ij = PETSC_FALSE;
1580   aseq->nz      = akok->nnz();
1581   aseq->maxnz   = aseq->nz;
1582 
1583   PetscCall(PetscMalloc1(m, &aseq->imax));
1584   PetscCall(PetscMalloc1(m, &aseq->ilen));
1585   for (i = 0; i < m; i++) aseq->ilen[i] = aseq->imax[i] = aseq->i[i + 1] - aseq->i[i];
1586 
1587   /* It is critical to set the nonzerostate, as we use it to check if sparsity pattern (hence data) has changed on host in MatAssemblyEnd */
1588   akok->nonzerostate = A->nonzerostate;
1589   A->spptr           = akok; /* Set A->spptr before MatAssembly so that A->spptr won't be allocated again there */
1590   PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY));
1591   PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY));
1592   PetscFunctionReturn(PETSC_SUCCESS);
1593 }
1594 
1595 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGetKokkosCsrMatrix(Mat A, KokkosCsrMatrix *csr)
1596 {
1597   PetscFunctionBegin;
1598   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1599   *csr = static_cast<Mat_SeqAIJKokkos *>(A->spptr)->csrmat;
1600   PetscFunctionReturn(PETSC_SUCCESS);
1601 }
1602 
1603 PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithKokkosCsrMatrix(MPI_Comm comm, KokkosCsrMatrix csr, Mat *A)
1604 {
1605   Mat_SeqAIJKokkos *akok;
1606 
1607   PetscFunctionBegin;
1608   PetscCallCXX(akok = new Mat_SeqAIJKokkos(csr));
1609   PetscCall(MatCreate(comm, A));
1610   PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok));
1611   PetscFunctionReturn(PETSC_SUCCESS);
1612 }
1613 
1614 /* Crete a SEQAIJKOKKOS matrix with a Mat_SeqAIJKokkos data structure
1615 
1616    Note we have names like MatSeqAIJSetPreallocationCSR, so I use capitalized CSR
1617  */
1618 PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithCSRMatrix(MPI_Comm comm, Mat_SeqAIJKokkos *akok, Mat *A)
1619 {
1620   PetscFunctionBegin;
1621   PetscCall(MatCreate(comm, A));
1622   PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok));
1623   PetscFunctionReturn(PETSC_SUCCESS);
1624 }
1625 
1626 /*@C
1627   MatCreateSeqAIJKokkos - Creates a sparse matrix in `MATSEQAIJKOKKOS` (compressed row) format
1628   (the default parallel PETSc format). This matrix will ultimately be handled by
1629   Kokkos for calculations.
1630 
1631   Collective
1632 
1633   Input Parameters:
1634 + comm - MPI communicator, set to `PETSC_COMM_SELF`
1635 . m    - number of rows
1636 . n    - number of columns
1637 . nz   - number of nonzeros per row (same for all rows), ignored if `nnz` is provided
1638 - nnz  - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL`
1639 
1640   Output Parameter:
1641 . A - the matrix
1642 
1643   Level: intermediate
1644 
1645   Notes:
1646   It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
1647   MatXXXXSetPreallocation() paradgm instead of this routine directly.
1648   [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]
1649 
1650   The AIJ format, also called
1651   compressed row storage, is fully compatible with standard Fortran
1652   storage.  That is, the stored row and column indices can begin at
1653   either one (as in Fortran) or zero.
1654 
1655   Specify the preallocated storage with either `nz` or `nnz` (not both).
1656   Set `nz` = `PETSC_DEFAULT` and `nnz` = `NULL` for PETSc to control dynamic memory
1657   allocation.
1658 
1659 .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`
1660 @*/
1661 PetscErrorCode MatCreateSeqAIJKokkos(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A)
1662 {
1663   PetscFunctionBegin;
1664   PetscCall(PetscKokkosInitializeCheck());
1665   PetscCall(MatCreate(comm, A));
1666   PetscCall(MatSetSizes(*A, m, n, m, n));
1667   PetscCall(MatSetType(*A, MATSEQAIJKOKKOS));
1668   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz));
1669   PetscFunctionReturn(PETSC_SUCCESS);
1670 }
1671 
1672 // After matrix numeric factorization, there are still steps to do before triangular solve can be called.
1673 // For example, for transpose solve, we might need to compute the transpose matrices if the solver does not support it (such as KK, while cusparse does).
1674 // In cusparse, one has to call cusparseSpSV_analysis() with updated triangular matrix values before calling cusparseSpSV_solve().
1675 // Simiarily, in KK sptrsv_symbolic() has to be called before sptrsv_solve(). We put these steps in MatSeqAIJKokkos{Transpose}SolveCheck.
1676 static PetscErrorCode MatSeqAIJKokkosSolveCheck(Mat A)
1677 {
1678   Mat_SeqAIJKokkosTriFactors *factors   = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1679   const PetscBool             has_lower = factors->iL_d.extent(0) ? PETSC_TRUE : PETSC_FALSE; // false with Choleksy
1680   const PetscBool             has_upper = factors->iU_d.extent(0) ? PETSC_TRUE : PETSC_FALSE; // true with LU and Choleksy
1681 
1682   PetscFunctionBegin;
1683   if (!factors->sptrsv_symbolic_completed) { // If sptrsv_symbolic was not called yet
1684     if (has_upper) PetscCallCXX(sptrsv_symbolic(&factors->khU, factors->iU_d, factors->jU_d, factors->aU_d));
1685     if (has_lower) PetscCallCXX(sptrsv_symbolic(&factors->khL, factors->iL_d, factors->jL_d, factors->aL_d));
1686     factors->sptrsv_symbolic_completed = PETSC_TRUE;
1687   }
1688   PetscFunctionReturn(PETSC_SUCCESS);
1689 }
1690 
1691 static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A)
1692 {
1693   const PetscInt              n         = A->rmap->n;
1694   Mat_SeqAIJKokkosTriFactors *factors   = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1695   const PetscBool             has_lower = factors->iL_d.extent(0) ? PETSC_TRUE : PETSC_FALSE; // false with Choleksy
1696   const PetscBool             has_upper = factors->iU_d.extent(0) ? PETSC_TRUE : PETSC_FALSE; // true with LU or Choleksy
1697 
1698   PetscFunctionBegin;
1699   if (!factors->transpose_updated) {
1700     if (has_upper) {
1701       if (!factors->iUt_d.extent(0)) {                                 // Allocate Ut on device if not yet
1702         factors->iUt_d = MatRowMapKokkosView("factors->iUt_d", n + 1); // KK requires this view to be initialized to 0 to call transpose_matrix
1703         factors->jUt_d = MatColIdxKokkosView(NoInit("factors->jUt_d"), factors->jU_d.extent(0));
1704         factors->aUt_d = MatScalarKokkosView(NoInit("factors->aUt_d"), factors->aU_d.extent(0));
1705       }
1706 
1707       if (factors->iU_h.extent(0)) { // If U is on host (factorization was done on host), we also compute the transpose on host
1708         if (!factors->U) {
1709           Mat_SeqAIJ *seq;
1710 
1711           PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF, n, n, factors->iU_h.data(), factors->jU_h.data(), factors->aU_h.data(), &factors->U));
1712           PetscCall(MatTranspose(factors->U, MAT_INITIAL_MATRIX, &factors->Ut));
1713 
1714           seq            = static_cast<Mat_SeqAIJ *>(factors->Ut->data);
1715           factors->iUt_h = MatRowMapKokkosViewHost(seq->i, n + 1);
1716           factors->jUt_h = MatColIdxKokkosViewHost(seq->j, seq->nz);
1717           factors->aUt_h = MatScalarKokkosViewHost(seq->a, seq->nz);
1718         } else {
1719           PetscCall(MatTranspose(factors->U, MAT_REUSE_MATRIX, &factors->Ut)); // Matrix Ut' data is aliased with {i, j, a}Ut_h
1720         }
1721         // Copy Ut from host to device
1722         PetscCallCXX(Kokkos::deep_copy(factors->iUt_d, factors->iUt_h));
1723         PetscCallCXX(Kokkos::deep_copy(factors->jUt_d, factors->jUt_h));
1724         PetscCallCXX(Kokkos::deep_copy(factors->aUt_d, factors->aUt_h));
1725       } else { // If U was computed on device, we also compute the transpose there
1726         // TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices. We have to sort the indices, until KK provides finer control options.
1727         PetscCallCXX(transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iU_d,
1728                                                                                                                                                                                                                                factors->jU_d, factors->aU_d,
1729                                                                                                                                                                                                                                factors->iUt_d, factors->jUt_d,
1730                                                                                                                                                                                                                                factors->aUt_d));
1731         PetscCallCXX(sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iUt_d, factors->jUt_d, factors->aUt_d));
1732       }
1733       PetscCallCXX(sptrsv_symbolic(&factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d));
1734     }
1735 
1736     // do the same for L with LU
1737     if (has_lower) {
1738       if (!factors->iLt_d.extent(0)) {                                 // Allocate Lt on device if not yet
1739         factors->iLt_d = MatRowMapKokkosView("factors->iLt_d", n + 1); // KK requires this view to be initialized to 0 to call transpose_matrix
1740         factors->jLt_d = MatColIdxKokkosView(NoInit("factors->jLt_d"), factors->jL_d.extent(0));
1741         factors->aLt_d = MatScalarKokkosView(NoInit("factors->aLt_d"), factors->aL_d.extent(0));
1742       }
1743 
1744       if (factors->iL_h.extent(0)) { // If L is on host, we also compute the transpose on host
1745         if (!factors->L) {
1746           Mat_SeqAIJ *seq;
1747 
1748           PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF, n, n, factors->iL_h.data(), factors->jL_h.data(), factors->aL_h.data(), &factors->L));
1749           PetscCall(MatTranspose(factors->L, MAT_INITIAL_MATRIX, &factors->Lt));
1750 
1751           seq            = static_cast<Mat_SeqAIJ *>(factors->Lt->data);
1752           factors->iLt_h = MatRowMapKokkosViewHost(seq->i, n + 1);
1753           factors->jLt_h = MatColIdxKokkosViewHost(seq->j, seq->nz);
1754           factors->aLt_h = MatScalarKokkosViewHost(seq->a, seq->nz);
1755         } else {
1756           PetscCall(MatTranspose(factors->L, MAT_REUSE_MATRIX, &factors->Lt)); // Matrix Lt' data is aliased with {i, j, a}Lt_h
1757         }
1758         // Copy Lt from host to device
1759         PetscCallCXX(Kokkos::deep_copy(factors->iLt_d, factors->iLt_h));
1760         PetscCallCXX(Kokkos::deep_copy(factors->jLt_d, factors->jLt_h));
1761         PetscCallCXX(Kokkos::deep_copy(factors->aLt_d, factors->aLt_h));
1762       } else { // If L was computed on device, we also compute the transpose there
1763         // TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices. We have to sort the indices, until KK provides finer control options.
1764         PetscCallCXX(transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iL_d,
1765                                                                                                                                                                                                                                factors->jL_d, factors->aL_d,
1766                                                                                                                                                                                                                                factors->iLt_d, factors->jLt_d,
1767                                                                                                                                                                                                                                factors->aLt_d));
1768         PetscCallCXX(sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iLt_d, factors->jLt_d, factors->aLt_d));
1769       }
1770       PetscCallCXX(sptrsv_symbolic(&factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d));
1771     }
1772 
1773     factors->transpose_updated = PETSC_TRUE;
1774   }
1775   PetscFunctionReturn(PETSC_SUCCESS);
1776 }
1777 
1778 // Solve Ax = b, with RAR = U^T D U, where R is the row (and col) permutation matrix on A.
1779 // R is represented by rowperm in factors. If R is identity (i.e, no reordering), then rowperm is empty.
1780 static PetscErrorCode MatSolve_SeqAIJKokkos_Cholesky(Mat A, Vec bb, Vec xx)
1781 {
1782   auto                        exec    = PetscGetKokkosExecutionSpace();
1783   Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1784   PetscInt                    m       = A->rmap->n;
1785   PetscScalarKokkosView       D       = factors->D_d;
1786   PetscScalarKokkosView       X, Y, B; // alias
1787   ConstPetscScalarKokkosView  b;
1788   PetscScalarKokkosView       x;
1789   PetscIntKokkosView         &rowperm  = factors->rowperm;
1790   PetscBool                   identity = rowperm.extent(0) ? PETSC_FALSE : PETSC_TRUE;
1791 
1792   PetscFunctionBegin;
1793   PetscCall(PetscLogGpuTimeBegin());
1794   PetscCall(MatSeqAIJKokkosSolveCheck(A));          // for UX = T
1795   PetscCall(MatSeqAIJKokkosTransposeSolveCheck(A)); // for U^T Y = B
1796   PetscCall(VecGetKokkosView(bb, &b));
1797   PetscCall(VecGetKokkosViewWrite(xx, &x));
1798 
1799   // Solve U^T Y = B
1800   if (identity) { // Reorder b with the row permutation
1801     B = PetscScalarKokkosView(const_cast<PetscScalar *>(b.data()), b.extent(0));
1802     Y = factors->workVector;
1803   } else {
1804     B = factors->workVector;
1805     PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { B(i) = b(rowperm(i)); }));
1806     Y = x;
1807   }
1808   PetscCallCXX(sptrsv_solve(exec, &factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d, B, Y));
1809 
1810   // Solve diag(D) Y' = Y.
1811   // Actually just do Y' = Y*D since D is already inverted in MatCholeskyFactorNumeric_SeqAIJ(). It is basically a vector element-wise multiplication.
1812   PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { Y(i) = Y(i) * D(i); }));
1813 
1814   // Solve UX = Y
1815   if (identity) {
1816     X = x;
1817   } else {
1818     X = factors->workVector; // B is not needed anymore
1819   }
1820   PetscCallCXX(sptrsv_solve(exec, &factors->khU, factors->iU_d, factors->jU_d, factors->aU_d, Y, X));
1821 
1822   // Reorder X with the inverse column (row) permutation
1823   if (!identity) {
1824     PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { x(rowperm(i)) = X(i); }));
1825   }
1826 
1827   PetscCall(VecRestoreKokkosView(bb, &b));
1828   PetscCall(VecRestoreKokkosViewWrite(xx, &x));
1829   PetscCall(PetscLogGpuTimeEnd());
1830   PetscFunctionReturn(PETSC_SUCCESS);
1831 }
1832 
1833 // Solve Ax = b, with RAC = LU, where R and C are row and col permutation matrices on A respectively.
1834 // R and C are represented by rowperm and colperm in factors.
1835 // If R or C is identity (i.e, no reordering), then rowperm or colperm is empty.
1836 static PetscErrorCode MatSolve_SeqAIJKokkos_LU(Mat A, Vec bb, Vec xx)
1837 {
1838   auto                        exec    = PetscGetKokkosExecutionSpace();
1839   Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1840   PetscInt                    m       = A->rmap->n;
1841   PetscScalarKokkosView       X, Y, B; // alias
1842   ConstPetscScalarKokkosView  b;
1843   PetscScalarKokkosView       x;
1844   PetscIntKokkosView         &rowperm      = factors->rowperm;
1845   PetscIntKokkosView         &colperm      = factors->colperm;
1846   PetscBool                   row_identity = rowperm.extent(0) ? PETSC_FALSE : PETSC_TRUE;
1847   PetscBool                   col_identity = colperm.extent(0) ? PETSC_FALSE : PETSC_TRUE;
1848 
1849   PetscFunctionBegin;
1850   PetscCall(PetscLogGpuTimeBegin());
1851   PetscCall(MatSeqAIJKokkosSolveCheck(A));
1852   PetscCall(VecGetKokkosView(bb, &b));
1853   PetscCall(VecGetKokkosViewWrite(xx, &x));
1854 
1855   // Solve L Y = B (i.e., L (U C^- x) = R b).  R b indicates applying the row permutation on b.
1856   if (row_identity) {
1857     B = PetscScalarKokkosView(const_cast<PetscScalar *>(b.data()), b.extent(0));
1858     Y = factors->workVector;
1859   } else {
1860     B = factors->workVector;
1861     PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { B(i) = b(rowperm(i)); }));
1862     Y = x;
1863   }
1864   PetscCallCXX(sptrsv_solve(exec, &factors->khL, factors->iL_d, factors->jL_d, factors->aL_d, B, Y));
1865 
1866   // Solve U C^- x = Y
1867   if (col_identity) {
1868     X = x;
1869   } else {
1870     X = factors->workVector;
1871   }
1872   PetscCallCXX(sptrsv_solve(exec, &factors->khU, factors->iU_d, factors->jU_d, factors->aU_d, Y, X));
1873 
1874   // x = C X; Reorder X with the inverse col permutation
1875   if (!col_identity) {
1876     PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { x(colperm(i)) = X(i); }));
1877   }
1878 
1879   PetscCall(VecRestoreKokkosView(bb, &b));
1880   PetscCall(VecRestoreKokkosViewWrite(xx, &x));
1881   PetscCall(PetscLogGpuTimeEnd());
1882   PetscFunctionReturn(PETSC_SUCCESS);
1883 }
1884 
1885 // Solve A^T x = b, with RAC = LU, where R and C are row and col permutation matrices on A respectively.
1886 // R and C are represented by rowperm and colperm in factors.
1887 // If R or C is identity (i.e, no reordering), then rowperm or colperm is empty.
1888 // A = R^-1 L U C^-1, so A^T = C^-T U^T L^T R^-T. But since C^- = C^T, R^- = R^T, we have A^T = C U^T L^T R.
1889 static PetscErrorCode MatSolveTranspose_SeqAIJKokkos_LU(Mat A, Vec bb, Vec xx)
1890 {
1891   auto                        exec    = PetscGetKokkosExecutionSpace();
1892   Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1893   PetscInt                    m       = A->rmap->n;
1894   PetscScalarKokkosView       X, Y, B; // alias
1895   ConstPetscScalarKokkosView  b;
1896   PetscScalarKokkosView       x;
1897   PetscIntKokkosView         &rowperm      = factors->rowperm;
1898   PetscIntKokkosView         &colperm      = factors->colperm;
1899   PetscBool                   row_identity = rowperm.extent(0) ? PETSC_FALSE : PETSC_TRUE;
1900   PetscBool                   col_identity = colperm.extent(0) ? PETSC_FALSE : PETSC_TRUE;
1901 
1902   PetscFunctionBegin;
1903   PetscCall(PetscLogGpuTimeBegin());
1904   PetscCall(MatSeqAIJKokkosTransposeSolveCheck(A)); // Update L^T, U^T if needed, and do sptrsv symbolic for L^T, U^T
1905   PetscCall(VecGetKokkosView(bb, &b));
1906   PetscCall(VecGetKokkosViewWrite(xx, &x));
1907 
1908   // Solve U^T Y = B (i.e., U^T (L^T R x) = C^- b).  Note C^- b = C^T b, which means applying the column permutation on b.
1909   if (col_identity) { // Reorder b with the col permutation
1910     B = PetscScalarKokkosView(const_cast<PetscScalar *>(b.data()), b.extent(0));
1911     Y = factors->workVector;
1912   } else {
1913     B = factors->workVector;
1914     PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { B(i) = b(colperm(i)); }));
1915     Y = x;
1916   }
1917   PetscCallCXX(sptrsv_solve(exec, &factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d, B, Y));
1918 
1919   // Solve L^T X = Y
1920   if (row_identity) {
1921     X = x;
1922   } else {
1923     X = factors->workVector;
1924   }
1925   PetscCallCXX(sptrsv_solve(exec, &factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d, Y, X));
1926 
1927   // x = R^- X = R^T X; Reorder X with the inverse row permutation
1928   if (!row_identity) {
1929     PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { x(rowperm(i)) = X(i); }));
1930   }
1931 
1932   PetscCall(VecRestoreKokkosView(bb, &b));
1933   PetscCall(VecRestoreKokkosViewWrite(xx, &x));
1934   PetscCall(PetscLogGpuTimeEnd());
1935   PetscFunctionReturn(PETSC_SUCCESS);
1936 }
1937 
1938 static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info)
1939 {
1940   PetscFunctionBegin;
1941   PetscCall(MatSeqAIJKokkosSyncHost(A));
1942   PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info));
1943 
1944   if (!info->solveonhost) { // if solve on host, then we don't need to copy L, U to device
1945     Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr;
1946     Mat_SeqAIJ                 *b       = static_cast<Mat_SeqAIJ *>(B->data);
1947     const PetscInt             *Bi = b->i, *Bj = b->j, *Bdiag = b->diag;
1948     const MatScalar            *Ba = b->a;
1949     PetscInt                    m = B->rmap->n, n = B->cmap->n;
1950 
1951     if (factors->iL_h.extent(0) == 0) { // Allocate memory and copy the L, U structure for the first time
1952       // Allocate memory and copy the structure
1953       factors->iL_h = MatRowMapKokkosViewHost(NoInit("iL_h"), m + 1);
1954       factors->jL_h = MatColIdxKokkosViewHost(NoInit("jL_h"), (Bi[m] - Bi[0]) + m); // + the diagonal entries
1955       factors->aL_h = MatScalarKokkosViewHost(NoInit("aL_h"), (Bi[m] - Bi[0]) + m);
1956       factors->iU_h = MatRowMapKokkosViewHost(NoInit("iU_h"), m + 1);
1957       factors->jU_h = MatColIdxKokkosViewHost(NoInit("jU_h"), (Bdiag[0] - Bdiag[m]));
1958       factors->aU_h = MatScalarKokkosViewHost(NoInit("aU_h"), (Bdiag[0] - Bdiag[m]));
1959 
1960       PetscInt *Li = factors->iL_h.data();
1961       PetscInt *Lj = factors->jL_h.data();
1962       PetscInt *Ui = factors->iU_h.data();
1963       PetscInt *Uj = factors->jU_h.data();
1964 
1965       Li[0] = Ui[0] = 0;
1966       for (PetscInt i = 0; i < m; i++) {
1967         PetscInt llen = Bi[i + 1] - Bi[i];       // exclusive of the diagonal entry
1968         PetscInt ulen = Bdiag[i] - Bdiag[i + 1]; // inclusive of the diagonal entry
1969 
1970         PetscArraycpy(Lj + Li[i], Bj + Bi[i], llen); // entries of L on the left of the diagonal
1971         Lj[Li[i] + llen] = i;                        // diagonal entry of L
1972 
1973         Uj[Ui[i]] = i;                                                  // diagonal entry of U
1974         PetscArraycpy(Uj + Ui[i] + 1, Bj + Bdiag[i + 1] + 1, ulen - 1); // entries of U on  the right of the diagonal
1975 
1976         Li[i + 1] = Li[i] + llen + 1;
1977         Ui[i + 1] = Ui[i] + ulen;
1978       }
1979 
1980       factors->iL_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->iL_h);
1981       factors->jL_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->jL_h);
1982       factors->iU_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->iU_h);
1983       factors->jU_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->jU_h);
1984       factors->aL_d = Kokkos::create_mirror_view(DefaultMemorySpace(), factors->aL_h);
1985       factors->aU_d = Kokkos::create_mirror_view(DefaultMemorySpace(), factors->aU_h);
1986 
1987       // Copy row/col permutation to device
1988       IS        rowperm = ((Mat_SeqAIJ *)B->data)->row;
1989       PetscBool row_identity;
1990       PetscCall(ISIdentity(rowperm, &row_identity));
1991       if (!row_identity) {
1992         const PetscInt *ip;
1993 
1994         PetscCall(ISGetIndices(rowperm, &ip));
1995         factors->rowperm = PetscIntKokkosView(NoInit("rowperm"), m);
1996         PetscCallCXX(Kokkos::deep_copy(factors->rowperm, PetscIntKokkosViewHost(const_cast<PetscInt *>(ip), m)));
1997         PetscCall(ISRestoreIndices(rowperm, &ip));
1998         PetscCall(PetscLogCpuToGpu(m * sizeof(PetscInt)));
1999       }
2000 
2001       IS        colperm = ((Mat_SeqAIJ *)B->data)->col;
2002       PetscBool col_identity;
2003       PetscCall(ISIdentity(colperm, &col_identity));
2004       if (!col_identity) {
2005         const PetscInt *ip;
2006 
2007         PetscCall(ISGetIndices(colperm, &ip));
2008         factors->colperm = PetscIntKokkosView(NoInit("colperm"), n);
2009         PetscCallCXX(Kokkos::deep_copy(factors->colperm, PetscIntKokkosViewHost(const_cast<PetscInt *>(ip), n)));
2010         PetscCall(ISRestoreIndices(colperm, &ip));
2011         PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
2012       }
2013 
2014       /* Create sptrsv handles for L, U and their transpose */
2015 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
2016       auto sptrsv_alg = SPTRSVAlgorithm::SPTRSV_CUSPARSE;
2017 #else
2018       auto sptrsv_alg = SPTRSVAlgorithm::SEQLVLSCHD_TP1;
2019 #endif
2020       factors->khL.create_sptrsv_handle(sptrsv_alg, m, true /* L is lower tri */);
2021       factors->khU.create_sptrsv_handle(sptrsv_alg, m, false /* U is not lower tri */);
2022       factors->khLt.create_sptrsv_handle(sptrsv_alg, m, false /* L^T is not lower tri */);
2023       factors->khUt.create_sptrsv_handle(sptrsv_alg, m, true /* U^T is lower tri */);
2024     }
2025 
2026     // Copy the value
2027     for (PetscInt i = 0; i < m; i++) {
2028       PetscInt        llen = Bi[i + 1] - Bi[i];
2029       PetscInt        ulen = Bdiag[i] - Bdiag[i + 1];
2030       const PetscInt *Li   = factors->iL_h.data();
2031       const PetscInt *Ui   = factors->iU_h.data();
2032 
2033       PetscScalar *La = factors->aL_h.data();
2034       PetscScalar *Ua = factors->aU_h.data();
2035 
2036       PetscArraycpy(La + Li[i], Ba + Bi[i], llen); // entries of L
2037       La[Li[i] + llen] = 1.0;                      // diagonal entry
2038 
2039       Ua[Ui[i]] = 1.0 / Ba[Bdiag[i]];                                 // diagonal entry
2040       PetscArraycpy(Ua + Ui[i] + 1, Ba + Bdiag[i + 1] + 1, ulen - 1); // entries of U
2041     }
2042 
2043     PetscCallCXX(Kokkos::deep_copy(factors->aL_d, factors->aL_h));
2044     PetscCallCXX(Kokkos::deep_copy(factors->aU_d, factors->aU_h));
2045     // Once the factors' values have changed, we need to update their transpose and redo sptrsv symbolic
2046     factors->transpose_updated         = PETSC_FALSE;
2047     factors->sptrsv_symbolic_completed = PETSC_FALSE;
2048 
2049     B->ops->solve          = MatSolve_SeqAIJKokkos_LU;
2050     B->ops->solvetranspose = MatSolveTranspose_SeqAIJKokkos_LU;
2051   }
2052 
2053   B->ops->matsolve          = NULL;
2054   B->ops->matsolvetranspose = NULL;
2055   PetscFunctionReturn(PETSC_SUCCESS);
2056 }
2057 
2058 static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos_ILU0(Mat B, Mat A, const MatFactorInfo *info)
2059 {
2060   Mat_SeqAIJKokkos           *aijkok   = (Mat_SeqAIJKokkos *)A->spptr;
2061   Mat_SeqAIJKokkosTriFactors *factors  = (Mat_SeqAIJKokkosTriFactors *)B->spptr;
2062   PetscInt                    fill_lev = info->levels;
2063 
2064   PetscFunctionBegin;
2065   PetscCall(PetscLogGpuTimeBegin());
2066   PetscCheck(!info->factoronhost, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "MatFactorInfo.factoronhost should be false");
2067   PetscCall(MatSeqAIJKokkosSyncDevice(A));
2068 
2069   auto a_d = aijkok->a_dual.view_device();
2070   auto i_d = aijkok->i_dual.view_device();
2071   auto j_d = aijkok->j_dual.view_device();
2072 
2073   PetscCallCXX(spiluk_numeric(&factors->kh, fill_lev, i_d, j_d, a_d, factors->iL_d, factors->jL_d, factors->aL_d, factors->iU_d, factors->jU_d, factors->aU_d));
2074 
2075   B->assembled              = PETSC_TRUE;
2076   B->preallocated           = PETSC_TRUE;
2077   B->ops->solve             = MatSolve_SeqAIJKokkos_LU;
2078   B->ops->solvetranspose    = MatSolveTranspose_SeqAIJKokkos_LU;
2079   B->ops->matsolve          = NULL;
2080   B->ops->matsolvetranspose = NULL;
2081 
2082   /* Once the factors' value changed, we need to update their transpose and sptrsv handle */
2083   factors->transpose_updated         = PETSC_FALSE;
2084   factors->sptrsv_symbolic_completed = PETSC_FALSE;
2085   /* TODO: log flops, but how to know that? */
2086   PetscCall(PetscLogGpuTimeEnd());
2087   PetscFunctionReturn(PETSC_SUCCESS);
2088 }
2089 
2090 // Use KK's spiluk_symbolic() to do ILU0 symbolic factorization, with no row/col reordering
2091 static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos_ILU0(Mat B, Mat A, IS, IS, const MatFactorInfo *info)
2092 {
2093   Mat_SeqAIJKokkos           *aijkok;
2094   Mat_SeqAIJ                 *b;
2095   Mat_SeqAIJKokkosTriFactors *factors  = (Mat_SeqAIJKokkosTriFactors *)B->spptr;
2096   PetscInt                    fill_lev = info->levels;
2097   PetscInt                    nnzA     = ((Mat_SeqAIJ *)A->data)->nz, nnzL, nnzU;
2098   PetscInt                    n        = A->rmap->n;
2099 
2100   PetscFunctionBegin;
2101   PetscCheck(!info->factoronhost, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "MatFactorInfo's factoronhost should be false as we are doing it on device right now");
2102   PetscCall(MatSeqAIJKokkosSyncDevice(A));
2103 
2104   /* Create a spiluk handle and then do symbolic factorization */
2105   nnzL = nnzU = PetscRealIntMultTruncate(info->fill, nnzA);
2106   factors->kh.create_spiluk_handle(SPILUKAlgorithm::SEQLVLSCHD_TP1, n, nnzL, nnzU);
2107 
2108   auto spiluk_handle = factors->kh.get_spiluk_handle();
2109 
2110   Kokkos::realloc(factors->iL_d, n + 1); /* Free old arrays and realloc */
2111   Kokkos::realloc(factors->jL_d, spiluk_handle->get_nnzL());
2112   Kokkos::realloc(factors->iU_d, n + 1);
2113   Kokkos::realloc(factors->jU_d, spiluk_handle->get_nnzU());
2114 
2115   aijkok   = (Mat_SeqAIJKokkos *)A->spptr;
2116   auto i_d = aijkok->i_dual.view_device();
2117   auto j_d = aijkok->j_dual.view_device();
2118   PetscCallCXX(spiluk_symbolic(&factors->kh, fill_lev, i_d, j_d, factors->iL_d, factors->jL_d, factors->iU_d, factors->jU_d));
2119   /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */
2120 
2121   Kokkos::resize(factors->jL_d, spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */
2122   Kokkos::resize(factors->jU_d, spiluk_handle->get_nnzU());
2123   Kokkos::realloc(factors->aL_d, spiluk_handle->get_nnzL()); /* No need to retain old value */
2124   Kokkos::realloc(factors->aU_d, spiluk_handle->get_nnzU());
2125 
2126   /* TODO: add options to select sptrsv algorithms */
2127   /* Create sptrsv handles for L, U and their transpose */
2128 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
2129   auto sptrsv_alg = SPTRSVAlgorithm::SPTRSV_CUSPARSE;
2130 #else
2131   auto sptrsv_alg = SPTRSVAlgorithm::SEQLVLSCHD_TP1;
2132 #endif
2133 
2134   factors->khL.create_sptrsv_handle(sptrsv_alg, n, true /* L is lower tri */);
2135   factors->khU.create_sptrsv_handle(sptrsv_alg, n, false /* U is not lower tri */);
2136   factors->khLt.create_sptrsv_handle(sptrsv_alg, n, false /* L^T is not lower tri */);
2137   factors->khUt.create_sptrsv_handle(sptrsv_alg, n, true /* U^T is lower tri */);
2138 
2139   /* Fill fields of the factor matrix B */
2140   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(B, MAT_SKIP_ALLOCATION, NULL));
2141   b     = (Mat_SeqAIJ *)B->data;
2142   b->nz = b->maxnz          = spiluk_handle->get_nnzL() + spiluk_handle->get_nnzU();
2143   B->info.fill_ratio_given  = info->fill;
2144   B->info.fill_ratio_needed = nnzA > 0 ? ((PetscReal)b->nz) / ((PetscReal)nnzA) : 1.0;
2145 
2146   B->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJKokkos_ILU0;
2147   PetscFunctionReturn(PETSC_SUCCESS);
2148 }
2149 
2150 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
2151 {
2152   PetscFunctionBegin;
2153   PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
2154   PetscCheck(!B->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr");
2155   PetscCallCXX(B->spptr = new Mat_SeqAIJKokkosTriFactors(B->rmap->n));
2156   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos;
2157   PetscFunctionReturn(PETSC_SUCCESS);
2158 }
2159 
2160 static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
2161 {
2162   PetscBool row_identity = PETSC_FALSE, col_identity = PETSC_FALSE;
2163 
2164   PetscFunctionBegin;
2165   if (!info->factoronhost) {
2166     PetscCall(ISIdentity(isrow, &row_identity));
2167     PetscCall(ISIdentity(iscol, &col_identity));
2168   }
2169 
2170   PetscCheck(!B->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr");
2171   PetscCallCXX(B->spptr = new Mat_SeqAIJKokkosTriFactors(B->rmap->n));
2172 
2173   if (!info->factoronhost && !info->levels && row_identity && col_identity) { // if level 0 and no reordering
2174     PetscCall(MatILUFactorSymbolic_SeqAIJKokkos_ILU0(B, A, isrow, iscol, info));
2175   } else {
2176     PetscCall(MatILUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info)); // otherwise, use PETSc's ILU on host
2177     B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos;
2178   }
2179   PetscFunctionReturn(PETSC_SUCCESS);
2180 }
2181 
2182 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info)
2183 {
2184   PetscFunctionBegin;
2185   PetscCall(MatSeqAIJKokkosSyncHost(A));
2186   PetscCall(MatCholeskyFactorNumeric_SeqAIJ(B, A, info));
2187 
2188   if (!info->solveonhost) { // if solve on host, then we don't need to copy L, U to device
2189     Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr;
2190     Mat_SeqAIJ                 *b       = static_cast<Mat_SeqAIJ *>(B->data);
2191     const PetscInt             *Bi = b->i, *Bj = b->j, *Bdiag = b->diag;
2192     const MatScalar            *Ba = b->a;
2193     PetscInt                    m  = B->rmap->n;
2194 
2195     if (factors->iU_h.extent(0) == 0) { // First time of numeric factorization
2196       // Allocate memory and copy the structure
2197       factors->iU_h = PetscIntKokkosViewHost(const_cast<PetscInt *>(Bi), m + 1); // wrap Bi as iU_h
2198       factors->jU_h = MatColIdxKokkosViewHost(NoInit("jU_h"), Bi[m]);
2199       factors->aU_h = MatScalarKokkosViewHost(NoInit("aU_h"), Bi[m]);
2200       factors->D_h  = MatScalarKokkosViewHost(NoInit("D_h"), m);
2201       factors->aU_d = Kokkos::create_mirror_view(DefaultMemorySpace(), factors->aU_h);
2202       factors->D_d  = Kokkos::create_mirror_view(DefaultMemorySpace(), factors->D_h);
2203 
2204       // Build jU_h from the skewed Aj
2205       PetscInt *Uj = factors->jU_h.data();
2206       for (PetscInt i = 0; i < m; i++) {
2207         PetscInt ulen = Bi[i + 1] - Bi[i];
2208         Uj[Bi[i]]     = i;                                              // diagonal entry
2209         PetscCall(PetscArraycpy(Uj + Bi[i] + 1, Bj + Bi[i], ulen - 1)); // entries of U on the right of the diagonal
2210       }
2211 
2212       // Copy iU, jU to device
2213       PetscCallCXX(factors->iU_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->iU_h));
2214       PetscCallCXX(factors->jU_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->jU_h));
2215 
2216       // Copy row/col permutation to device
2217       IS        rowperm = ((Mat_SeqAIJ *)B->data)->row;
2218       PetscBool row_identity;
2219       PetscCall(ISIdentity(rowperm, &row_identity));
2220       if (!row_identity) {
2221         const PetscInt *ip;
2222 
2223         PetscCall(ISGetIndices(rowperm, &ip));
2224         PetscCallCXX(factors->rowperm = PetscIntKokkosView(NoInit("rowperm"), m));
2225         PetscCallCXX(Kokkos::deep_copy(factors->rowperm, PetscIntKokkosViewHost(const_cast<PetscInt *>(ip), m)));
2226         PetscCall(ISRestoreIndices(rowperm, &ip));
2227         PetscCall(PetscLogCpuToGpu(m * sizeof(PetscInt)));
2228       }
2229 
2230       // Create sptrsv handles for U and U^T
2231 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
2232       auto sptrsv_alg = SPTRSVAlgorithm::SPTRSV_CUSPARSE;
2233 #else
2234       auto sptrsv_alg = SPTRSVAlgorithm::SEQLVLSCHD_TP1;
2235 #endif
2236       factors->khU.create_sptrsv_handle(sptrsv_alg, m, false /* U is not lower tri */);
2237       factors->khUt.create_sptrsv_handle(sptrsv_alg, m, true /* U^T is lower tri */);
2238     }
2239     // These pointers were set MatCholeskyFactorNumeric_SeqAIJ(), so we always need to update them
2240     B->ops->solve          = MatSolve_SeqAIJKokkos_Cholesky;
2241     B->ops->solvetranspose = MatSolve_SeqAIJKokkos_Cholesky;
2242 
2243     // Copy the value
2244     PetscScalar *Ua = factors->aU_h.data();
2245     PetscScalar *D  = factors->D_h.data();
2246     for (PetscInt i = 0; i < m; i++) {
2247       D[i]      = Ba[Bdiag[i]];     // actually Aa[Adiag[i]] is the inverse of the diagonal
2248       Ua[Bi[i]] = (PetscScalar)1.0; // set the unit diagonal for U
2249       for (PetscInt k = 0; k < Bi[i + 1] - Bi[i] - 1; k++) Ua[Bi[i] + 1 + k] = -Ba[Bi[i] + k];
2250     }
2251     PetscCallCXX(Kokkos::deep_copy(factors->aU_d, factors->aU_h));
2252     PetscCallCXX(Kokkos::deep_copy(factors->D_d, factors->D_h));
2253 
2254     factors->sptrsv_symbolic_completed = PETSC_FALSE; // When numeric value changed, we must do these again
2255     factors->transpose_updated         = PETSC_FALSE;
2256   }
2257 
2258   B->ops->matsolve          = NULL;
2259   B->ops->matsolvetranspose = NULL;
2260   PetscFunctionReturn(PETSC_SUCCESS);
2261 }
2262 
2263 static PetscErrorCode MatICCFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS perm, const MatFactorInfo *info)
2264 {
2265   PetscFunctionBegin;
2266   if (info->solveonhost) {
2267     // If solve on host, we have to change the type, as eventually we need to call MatSolve_SeqSBAIJ_1_NaturalOrdering() etc.
2268     PetscCall(MatSetType(B, MATSEQSBAIJ));
2269     PetscCall(MatSeqSBAIJSetPreallocation(B, 1, MAT_SKIP_ALLOCATION, NULL));
2270   }
2271 
2272   PetscCall(MatICCFactorSymbolic_SeqAIJ(B, A, perm, info));
2273 
2274   if (!info->solveonhost) {
2275     // If solve on device, B is still a MATSEQAIJKOKKOS, so we are good to allocate B->spptr
2276     PetscCheck(!B->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr");
2277     PetscCallCXX(B->spptr = new Mat_SeqAIJKokkosTriFactors(B->rmap->n));
2278     B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJKokkos;
2279   }
2280   PetscFunctionReturn(PETSC_SUCCESS);
2281 }
2282 
2283 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS perm, const MatFactorInfo *info)
2284 {
2285   PetscFunctionBegin;
2286   if (info->solveonhost) {
2287     // If solve on host, we have to change the type, as eventually we need to call MatSolve_SeqSBAIJ_1_NaturalOrdering() etc.
2288     PetscCall(MatSetType(B, MATSEQSBAIJ));
2289     PetscCall(MatSeqSBAIJSetPreallocation(B, 1, MAT_SKIP_ALLOCATION, NULL));
2290   }
2291 
2292   PetscCall(MatCholeskyFactorSymbolic_SeqAIJ(B, A, perm, info)); // it sets B's two ISes ((Mat_SeqAIJ*)B->data)->{row, col} to perm
2293 
2294   if (!info->solveonhost) {
2295     // If solve on device, B is still a MATSEQAIJKOKKOS, so we are good to allocate B->spptr
2296     PetscCheck(!B->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr");
2297     PetscCallCXX(B->spptr = new Mat_SeqAIJKokkosTriFactors(B->rmap->n));
2298     B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJKokkos;
2299   }
2300   PetscFunctionReturn(PETSC_SUCCESS);
2301 }
2302 
2303 // The _Kokkos suffix means we will use Kokkos as a solver for the SeqAIJKokkos matrix
2304 static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos_Kokkos(Mat A, MatSolverType *type)
2305 {
2306   PetscFunctionBegin;
2307   *type = MATSOLVERKOKKOS;
2308   PetscFunctionReturn(PETSC_SUCCESS);
2309 }
2310 
2311 /*MC
2312   MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices
2313   on a single GPU of type, `MATSEQAIJKOKKOS`, `MATAIJKOKKOS`.
2314 
2315   Level: beginner
2316 
2317 .seealso: [](ch_matrices), `Mat`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJKokkos()`, `MATAIJKOKKOS`, `MatKokkosSetFormat()`, `MatKokkosStorageFormat`, `MatKokkosFormatOperation`
2318 M*/
2319 PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A, MatFactorType ftype, Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */
2320 {
2321   PetscInt n = A->rmap->n;
2322   MPI_Comm comm;
2323 
2324   PetscFunctionBegin;
2325   PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
2326   PetscCall(MatCreate(comm, B));
2327   PetscCall(MatSetSizes(*B, n, n, n, n));
2328   PetscCall(MatSetBlockSizesFromMats(*B, A, A));
2329   (*B)->factortype = ftype;
2330   PetscCall(MatSetType(*B, MATSEQAIJKOKKOS));
2331   PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL));
2332   PetscCheck(!(*B)->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr");
2333 
2334   if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
2335     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJKokkos;
2336     (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos;
2337     PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU]));
2338     PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILU]));
2339     PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILUDT]));
2340   } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
2341     (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJKokkos;
2342     (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJKokkos;
2343     PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]));
2344     PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ICC]));
2345   } else SETERRQ(comm, PETSC_ERR_SUP, "MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]);
2346 
2347   // The factorization can use the ordering provided in MatLUFactorSymbolic(), MatCholeskyFactorSymbolic() etc, though we do it on host
2348   (*B)->canuseordering = PETSC_TRUE;
2349   PetscCall(PetscObjectComposeFunction((PetscObject)*B, "MatFactorGetSolverType_C", MatFactorGetSolverType_SeqAIJKokkos_Kokkos));
2350   PetscFunctionReturn(PETSC_SUCCESS);
2351 }
2352 
2353 PETSC_INTERN PetscErrorCode MatSolverTypeRegister_Kokkos(void)
2354 {
2355   PetscFunctionBegin;
2356   PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_LU, MatGetFactor_SeqAIJKokkos_Kokkos));
2357   PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_CHOLESKY, MatGetFactor_SeqAIJKokkos_Kokkos));
2358   PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_ILU, MatGetFactor_SeqAIJKokkos_Kokkos));
2359   PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_ICC, MatGetFactor_SeqAIJKokkos_Kokkos));
2360   PetscFunctionReturn(PETSC_SUCCESS);
2361 }
2362 
2363 /* Utility to print out a KokkosCsrMatrix for debugging */
2364 PETSC_INTERN PetscErrorCode PrintCsrMatrix(const KokkosCsrMatrix &csrmat)
2365 {
2366   const auto        &iv = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), csrmat.graph.row_map);
2367   const auto        &jv = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), csrmat.graph.entries);
2368   const auto        &av = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), csrmat.values);
2369   const PetscInt    *i  = iv.data();
2370   const PetscInt    *j  = jv.data();
2371   const PetscScalar *a  = av.data();
2372   PetscInt           m = csrmat.numRows(), n = csrmat.numCols(), nnz = csrmat.nnz();
2373 
2374   PetscFunctionBegin;
2375   PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT " x %" PetscInt_FMT " SeqAIJKokkos, with %" PetscInt_FMT " nonzeros\n", m, n, nnz));
2376   for (PetscInt k = 0; k < m; k++) {
2377     PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT ": ", k));
2378     for (PetscInt p = i[k]; p < i[k + 1]; p++) PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT "(%.1f), ", j[p], (double)PetscRealPart(a[p])));
2379     PetscCall(PetscPrintf(PETSC_COMM_SELF, "\n"));
2380   }
2381   PetscFunctionReturn(PETSC_SUCCESS);
2382 }
2383