xref: /petsc/src/mat/impls/aij/seq/kokkos/aijkok.kokkos.cxx (revision 58ad77e8b9ee6fdbdfef97ebcff79a2d98620aab)
1 #include <petsc_kokkos.hpp>
2 #include <petscvec_kokkos.hpp>
3 #include <petscmat_kokkos.hpp>
4 #include <petscpkg_version.h>
5 #include <petsc/private/petscimpl.h>
6 #include <petsc/private/sfimpl.h>
7 #include <petsc/private/kokkosimpl.hpp>
8 #include <petscsys.h>
9 
10 #include <Kokkos_Core.hpp>
11 #include <KokkosBlas.hpp>
12 #include <KokkosSparse_CrsMatrix.hpp>
13 
14 // To suppress compiler warnings:
15 // /path/include/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp:434:63:
16 // warning: 'cusparseStatus_t cusparseDbsrmm(cusparseHandle_t, cusparseDirection_t, cusparseOperation_t,
17 // cusparseOperation_t, int, int, int, int, const double*, cusparseMatDescr_t, const double*, const int*, const int*,
18 // int, const double*, int, const double*, double*, int)' is deprecated: please use cusparseSpMM instead [-Wdeprecated-declarations]
19 PETSC_PRAGMA_DIAGNOSTIC_IGNORED_BEGIN("-Wdeprecated-declarations")
20 #include <KokkosSparse_spmv.hpp>
21 PETSC_PRAGMA_DIAGNOSTIC_IGNORED_END()
22 
23 #include <KokkosSparse_spiluk.hpp>
24 #include <KokkosSparse_sptrsv.hpp>
25 #include <KokkosSparse_spgemm.hpp>
26 #include <KokkosSparse_spadd.hpp>
27 #include <KokkosBatched_LU_Decl.hpp>
28 #include <KokkosBatched_InverseLU_Decl.hpp>
29 
30 #include <../src/mat/impls/aij/seq/kokkos/aijkok.hpp>
31 
32 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_GE(3, 7, 0)
33   #include <KokkosSparse_Utils.hpp>
34 using KokkosSparse::sort_crs_matrix;
35 using KokkosSparse::Impl::transpose_matrix;
36 #else
37   #include <KokkosKernels_Sorting.hpp>
38 using KokkosKernels::sort_crs_matrix;
39 using KokkosKernels::Impl::transpose_matrix;
40 #endif
41 
42 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_GE(4, 6, 0)
43 using KokkosSparse::spiluk_symbolic;
44 using KokkosSparse::spiluk_numeric;
45 using KokkosSparse::sptrsv_symbolic;
46 using KokkosSparse::sptrsv_solve;
47 using KokkosSparse::Experimental::SPTRSVAlgorithm;
48 using KokkosSparse::Experimental::SPILUKAlgorithm;
49 #else
50 using KokkosSparse::Experimental::spiluk_symbolic;
51 using KokkosSparse::Experimental::spiluk_numeric;
52 using KokkosSparse::Experimental::sptrsv_symbolic;
53 using KokkosSparse::Experimental::sptrsv_solve;
54 using KokkosSparse::Experimental::SPTRSVAlgorithm;
55 using KokkosSparse::Experimental::SPILUKAlgorithm;
56 #endif
57 
58 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */
59 
60 /* MatAssemblyEnd_SeqAIJKokkos() happens when we finalized nonzeros of the matrix, either after
61    we assembled the matrix on host, or after we directly produced the matrix data on device (ex., through MatMatMult).
62    In the latter case, it is important to set a_dual's sync state correctly.
63  */
64 static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A, MatAssemblyType mode)
65 {
66   Mat_SeqAIJ       *aijseq;
67   Mat_SeqAIJKokkos *aijkok;
68 
69   PetscFunctionBegin;
70   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS);
71   PetscCall(MatAssemblyEnd_SeqAIJ(A, mode));
72 
73   aijseq = static_cast<Mat_SeqAIJ *>(A->data);
74   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
75 
76   /* If aijkok does not exist, we just copy i, j to device.
77      If aijkok already exists, but the device's nonzero pattern does not match with the host's, we assume the latest data is on host.
78      In both cases, we build a new aijkok structure.
79   */
80   if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { /* aijkok might not exist yet or nonzero pattern has changed */
81     delete aijkok;
82     aijkok   = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aijseq, A->nonzerostate, PETSC_FALSE /*don't copy mat values to device*/);
83     A->spptr = aijkok;
84   } else if (A->rmap->n && aijkok->diag_dual.extent(0) == 0) { // MatProduct might directly produce AIJ on device, but not the diag.
85     MatRowMapKokkosViewHost diag_h(aijseq->diag, A->rmap->n);
86     auto                    diag_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), diag_h);
87     aijkok->diag_dual              = MatRowMapKokkosDualView(diag_d, diag_h);
88   }
89   PetscFunctionReturn(PETSC_SUCCESS);
90 }
91 
92 /* Sync CSR data to device if not yet */
93 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A)
94 {
95   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
96 
97   PetscFunctionBegin;
98   PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Can't sync factorized matrix from host to device");
99   PetscCheck(aijkok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
100   if (aijkok->a_dual.need_sync_device()) {
101     aijkok->a_dual.sync_device();
102     aijkok->transpose_updated = PETSC_FALSE; /* values of the transpose is out-of-date */
103     aijkok->hermitian_updated = PETSC_FALSE;
104   }
105   PetscFunctionReturn(PETSC_SUCCESS);
106 }
107 
108 /* Mark the CSR data on device as modified */
109 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosModifyDevice(Mat A)
110 {
111   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
112 
113   PetscFunctionBegin;
114   PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Not supported for factorized matries");
115   aijkok->a_dual.clear_sync_state();
116   aijkok->a_dual.modify_device();
117   aijkok->transpose_updated = PETSC_FALSE;
118   aijkok->hermitian_updated = PETSC_FALSE;
119   PetscCall(MatSeqAIJInvalidateDiagonal(A));
120   PetscCall(PetscObjectStateIncrease((PetscObject)A));
121   PetscFunctionReturn(PETSC_SUCCESS);
122 }
123 
124 static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A)
125 {
126   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
127   auto              exec   = PetscGetKokkosExecutionSpace();
128 
129   PetscFunctionBegin;
130   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
131   /* We do not expect one needs factors on host  */
132   PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Can't sync factorized matrix from device to host");
133   PetscCheck(aijkok, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing AIJKOK");
134   PetscCall(KokkosDualViewSync<HostMirrorMemorySpace>(aijkok->a_dual, exec));
135   PetscFunctionReturn(PETSC_SUCCESS);
136 }
137 
138 static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A, PetscScalar *array[])
139 {
140   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
141 
142   PetscFunctionBegin;
143   /* aijkok contains valid pointers only if the host's nonzerostate matches with the device's.
144     Calling MatSeqAIJSetPreallocation() or MatSetValues() on host, where aijseq->{i,j,a} might be
145     reallocated, will lead to stale {i,j,a}_dual in aijkok. In both operations, the hosts's nonzerostate
146     must have been updated. The stale aijkok will be rebuilt during MatAssemblyEnd.
147   */
148   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
149     auto exec = PetscGetKokkosExecutionSpace();
150     PetscCallCXX(aijkok->a_dual.sync_host(exec));
151     PetscCallCXX(exec.fence());
152     *array = aijkok->a_dual.view_host().data();
153   } else { /* Happens when calling MatSetValues on a newly created matrix */
154     *array = static_cast<Mat_SeqAIJ *>(A->data)->a;
155   }
156   PetscFunctionReturn(PETSC_SUCCESS);
157 }
158 
159 static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJKokkos(Mat A, PetscScalar *array[])
160 {
161   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
162 
163   PetscFunctionBegin;
164   if (aijkok && A->nonzerostate == aijkok->nonzerostate) aijkok->a_dual.modify_host();
165   PetscFunctionReturn(PETSC_SUCCESS);
166 }
167 
168 static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[])
169 {
170   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
171 
172   PetscFunctionBegin;
173   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
174     auto exec = PetscGetKokkosExecutionSpace();
175     PetscCallCXX(aijkok->a_dual.sync_host(exec));
176     PetscCallCXX(exec.fence());
177     *array = aijkok->a_dual.view_host().data();
178   } else {
179     *array = static_cast<Mat_SeqAIJ *>(A->data)->a;
180   }
181   PetscFunctionReturn(PETSC_SUCCESS);
182 }
183 
184 static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[])
185 {
186   PetscFunctionBegin;
187   *array = NULL;
188   PetscFunctionReturn(PETSC_SUCCESS);
189 }
190 
191 static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[])
192 {
193   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
194 
195   PetscFunctionBegin;
196   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
197     *array = aijkok->a_dual.view_host().data();
198   } else { /* Ex. happens with MatZeroEntries on a preallocated but not assembled matrix */
199     *array = static_cast<Mat_SeqAIJ *>(A->data)->a;
200   }
201   PetscFunctionReturn(PETSC_SUCCESS);
202 }
203 
204 static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[])
205 {
206   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
207 
208   PetscFunctionBegin;
209   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
210     aijkok->a_dual.clear_sync_state();
211     aijkok->a_dual.modify_host();
212   }
213   PetscFunctionReturn(PETSC_SUCCESS);
214 }
215 
216 static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJKokkos(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype)
217 {
218   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
219 
220   PetscFunctionBegin;
221   PetscCheck(aijkok != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "aijkok is NULL");
222 
223   if (i) *i = aijkok->i_device_data();
224   if (j) *j = aijkok->j_device_data();
225   if (a) {
226     aijkok->a_dual.sync_device();
227     *a = aijkok->a_device_data();
228   }
229   if (mtype) *mtype = PETSC_MEMTYPE_KOKKOS;
230   PetscFunctionReturn(PETSC_SUCCESS);
231 }
232 
233 /*
234   Generate the sparsity pattern of a MatSeqAIJKokkos matrix's transpose on device.
235 
236   Input Parameter:
237 .  A       - the MATSEQAIJKOKKOS matrix
238 
239   Output Parameters:
240 +  perm_d - the permutation array on device, which connects Ta(i) = Aa(perm(i))
241 -  T_d    - the transpose on device, whose value array is allocated but not initialized
242 */
243 static PetscErrorCode MatSeqAIJKokkosGenerateTransposeStructure(Mat A, MatRowMapKokkosView &perm_d, KokkosCsrMatrix &T_d)
244 {
245   Mat_SeqAIJ             *aseq = static_cast<Mat_SeqAIJ *>(A->data);
246   PetscInt                nz = aseq->nz, m = A->rmap->N, n = A->cmap->n;
247   const PetscInt         *Ai = aseq->i, *Aj = aseq->j;
248   MatRowMapKokkosViewHost Ti_h(NoInit("Ti"), n + 1);
249   MatRowMapType          *Ti = Ti_h.data();
250   MatColIdxKokkosViewHost Tj_h(NoInit("Tj"), nz);
251   MatRowMapKokkosViewHost perm_h(NoInit("permutation"), nz);
252   PetscInt               *Tj   = Tj_h.data();
253   PetscInt               *perm = perm_h.data();
254   PetscInt               *offset;
255 
256   PetscFunctionBegin;
257   // Populate Ti
258   PetscCallCXX(Kokkos::deep_copy(Ti_h, 0));
259   Ti++;
260   for (PetscInt i = 0; i < nz; i++) Ti[Aj[i]]++;
261   Ti--;
262   for (PetscInt i = 0; i < n; i++) Ti[i + 1] += Ti[i];
263 
264   // Populate Tj and the permutation array
265   PetscCall(PetscCalloc1(n, &offset)); // offset in each T row to fill in its column indices
266   for (PetscInt i = 0; i < m; i++) {
267     for (PetscInt j = Ai[i]; j < Ai[i + 1]; j++) { // A's (i,j) is T's (j,i)
268       PetscInt r    = Aj[j];                       // row r of T
269       PetscInt disp = Ti[r] + offset[r];
270 
271       Tj[disp]   = i; // col i of T
272       perm[disp] = j;
273       offset[r]++;
274     }
275   }
276   PetscCall(PetscFree(offset));
277 
278   // Sort each row of T, along with the permutation array
279   for (PetscInt i = 0; i < n; i++) PetscCall(PetscSortIntWithArray(Ti[i + 1] - Ti[i], Tj + Ti[i], perm + Ti[i]));
280 
281   // Output perm and T on device
282   auto Ti_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), Ti_h);
283   auto Tj_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), Tj_h);
284   PetscCallCXX(T_d = KokkosCsrMatrix("csrmatT", n, m, nz, MatScalarKokkosView("Ta", nz), Ti_d, Tj_d));
285   PetscCallCXX(perm_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), perm_h));
286   PetscFunctionReturn(PETSC_SUCCESS);
287 }
288 
289 // Generate the transpose on device and cache it internally
290 // Note: KK transpose_matrix() does not have support symbolic/numeric transpose, so we do it on our own
291 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGenerateTranspose_Private(Mat A, KokkosCsrMatrix *csrmatT)
292 {
293   Mat_SeqAIJ       *aseq = static_cast<Mat_SeqAIJ *>(A->data);
294   Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
295   PetscInt          nz = aseq->nz, m = A->rmap->N, n = A->cmap->n;
296   KokkosCsrMatrix  &T = akok->csrmatT;
297 
298   PetscFunctionBegin;
299   PetscCheck(akok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
300   PetscCallCXX(akok->a_dual.sync_device()); // Sync A's values since we are going to access them on device
301 
302   const auto &Aa = akok->a_dual.view_device();
303 
304   if (A->symmetric == PETSC_BOOL3_TRUE) {
305     *csrmatT = akok->csrmat;
306   } else {
307     // See if we already have a cached transpose and its value is up to date
308     if (T.numRows() == n && T.numCols() == m) {  // this indicates csrmatT had been generated before, otherwise T has 0 rows/cols after construction
309       if (!akok->transpose_updated) {            // if the value is out of date, update the cached version
310         const auto &perm = akok->transpose_perm; // get the permutation array
311         auto       &Ta   = T.values;
312 
313         PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { Ta(i) = Aa(perm(i)); }));
314       }
315     } else { // Generate T of size n x m for the first time
316       MatRowMapKokkosView perm;
317 
318       PetscCall(MatSeqAIJKokkosGenerateTransposeStructure(A, perm, T));
319       akok->transpose_perm = perm; // cache the perm in this matrix for reuse
320       PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { T.values(i) = Aa(perm(i)); }));
321     }
322     akok->transpose_updated = PETSC_TRUE;
323     *csrmatT                = akok->csrmatT;
324   }
325   PetscFunctionReturn(PETSC_SUCCESS);
326 }
327 
328 // Generate the Hermitian on device and cache it internally
329 static PetscErrorCode MatSeqAIJKokkosGenerateHermitian_Private(Mat A, KokkosCsrMatrix *csrmatH)
330 {
331   Mat_SeqAIJ       *aseq = static_cast<Mat_SeqAIJ *>(A->data);
332   Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
333   PetscInt          nz = aseq->nz, m = A->rmap->N, n = A->cmap->n;
334   KokkosCsrMatrix  &T = akok->csrmatH;
335 
336   PetscFunctionBegin;
337   PetscCheck(akok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
338   PetscCallCXX(akok->a_dual.sync_device()); // Sync A's values since we are going to access them on device
339 
340   const auto &Aa = akok->a_dual.view_device();
341 
342   if (A->hermitian == PETSC_BOOL3_TRUE) {
343     *csrmatH = akok->csrmat;
344   } else {
345     // See if we already have a cached hermitian and its value is up to date
346     if (T.numRows() == n && T.numCols() == m) {  // this indicates csrmatT had been generated before, otherwise T has 0 rows/cols after construction
347       if (!akok->hermitian_updated) {            // if the value is out of date, update the cached version
348         const auto &perm = akok->transpose_perm; // get the permutation array
349         auto       &Ta   = T.values;
350 
351         PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { Ta(i) = PetscConj(Aa(perm(i))); }));
352       }
353     } else { // Generate T of size n x m for the first time
354       MatRowMapKokkosView perm;
355 
356       PetscCall(MatSeqAIJKokkosGenerateTransposeStructure(A, perm, T));
357       akok->transpose_perm = perm; // cache the perm in this matrix for reuse
358       PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { T.values(i) = PetscConj(Aa(perm(i))); }));
359     }
360     akok->hermitian_updated = PETSC_TRUE;
361     *csrmatH                = akok->csrmatH;
362   }
363   PetscFunctionReturn(PETSC_SUCCESS);
364 }
365 
366 /* y = A x */
367 static PetscErrorCode MatMult_SeqAIJKokkos(Mat A, Vec xx, Vec yy)
368 {
369   Mat_SeqAIJKokkos          *aijkok;
370   ConstPetscScalarKokkosView xv;
371   PetscScalarKokkosView      yv;
372 
373   PetscFunctionBegin;
374   PetscCall(PetscLogGpuTimeBegin());
375   PetscCall(MatSeqAIJKokkosSyncDevice(A));
376   PetscCall(VecGetKokkosView(xx, &xv));
377   PetscCall(VecGetKokkosViewWrite(yy, &yv));
378   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
379   PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), "N", 1.0 /*alpha*/, aijkok->csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A x + beta y */
380   PetscCall(VecRestoreKokkosView(xx, &xv));
381   PetscCall(VecRestoreKokkosViewWrite(yy, &yv));
382   /* 2.0*nnz - numRows seems more accurate here but assumes there are no zero-rows. So a little sloppy here. */
383   PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz()));
384   PetscCall(PetscLogGpuTimeEnd());
385   PetscFunctionReturn(PETSC_SUCCESS);
386 }
387 
388 /* y = A^T x */
389 static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy)
390 {
391   Mat_SeqAIJKokkos          *aijkok;
392   const char                *mode;
393   ConstPetscScalarKokkosView xv;
394   PetscScalarKokkosView      yv;
395   KokkosCsrMatrix            csrmat;
396 
397   PetscFunctionBegin;
398   PetscCall(PetscLogGpuTimeBegin());
399   PetscCall(MatSeqAIJKokkosSyncDevice(A));
400   PetscCall(VecGetKokkosView(xx, &xv));
401   PetscCall(VecGetKokkosViewWrite(yy, &yv));
402   if (A->form_explicit_transpose) {
403     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat));
404     mode = "N";
405   } else {
406     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
407     csrmat = aijkok->csrmat;
408     mode   = "T";
409   }
410   PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^T x + beta y */
411   PetscCall(VecRestoreKokkosView(xx, &xv));
412   PetscCall(VecRestoreKokkosViewWrite(yy, &yv));
413   PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
414   PetscCall(PetscLogGpuTimeEnd());
415   PetscFunctionReturn(PETSC_SUCCESS);
416 }
417 
418 /* y = A^H x */
419 static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy)
420 {
421   Mat_SeqAIJKokkos          *aijkok;
422   const char                *mode;
423   ConstPetscScalarKokkosView xv;
424   PetscScalarKokkosView      yv;
425   KokkosCsrMatrix            csrmat;
426 
427   PetscFunctionBegin;
428   PetscCall(PetscLogGpuTimeBegin());
429   PetscCall(MatSeqAIJKokkosSyncDevice(A));
430   PetscCall(VecGetKokkosView(xx, &xv));
431   PetscCall(VecGetKokkosViewWrite(yy, &yv));
432   if (A->form_explicit_transpose) {
433     PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat));
434     mode = "N";
435   } else {
436     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
437     csrmat = aijkok->csrmat;
438     mode   = "C";
439   }
440   PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^H x + beta y */
441   PetscCall(VecRestoreKokkosView(xx, &xv));
442   PetscCall(VecRestoreKokkosViewWrite(yy, &yv));
443   PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
444   PetscCall(PetscLogGpuTimeEnd());
445   PetscFunctionReturn(PETSC_SUCCESS);
446 }
447 
448 /* z = A x + y */
449 static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz)
450 {
451   Mat_SeqAIJKokkos          *aijkok;
452   ConstPetscScalarKokkosView xv;
453   PetscScalarKokkosView      zv;
454 
455   PetscFunctionBegin;
456   PetscCall(PetscLogGpuTimeBegin());
457   PetscCall(MatSeqAIJKokkosSyncDevice(A));
458   if (zz != yy) PetscCall(VecCopy(yy, zz)); // depending on yy's sync flags, zz might get its latest data on host
459   PetscCall(VecGetKokkosView(xx, &xv));
460   PetscCall(VecGetKokkosView(zz, &zv)); // do after VecCopy(yy, zz) to get the latest data on device
461   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
462   PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), "N", 1.0 /*alpha*/, aijkok->csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A x + beta z */
463   PetscCall(VecRestoreKokkosView(xx, &xv));
464   PetscCall(VecRestoreKokkosView(zz, &zv));
465   PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz()));
466   PetscCall(PetscLogGpuTimeEnd());
467   PetscFunctionReturn(PETSC_SUCCESS);
468 }
469 
470 /* z = A^T x + y */
471 static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz)
472 {
473   Mat_SeqAIJKokkos          *aijkok;
474   const char                *mode;
475   ConstPetscScalarKokkosView xv;
476   PetscScalarKokkosView      zv;
477   KokkosCsrMatrix            csrmat;
478 
479   PetscFunctionBegin;
480   PetscCall(PetscLogGpuTimeBegin());
481   PetscCall(MatSeqAIJKokkosSyncDevice(A));
482   if (zz != yy) PetscCall(VecCopy(yy, zz));
483   PetscCall(VecGetKokkosView(xx, &xv));
484   PetscCall(VecGetKokkosView(zz, &zv));
485   if (A->form_explicit_transpose) {
486     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat));
487     mode = "N";
488   } else {
489     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
490     csrmat = aijkok->csrmat;
491     mode   = "T";
492   }
493   PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^T x + beta z */
494   PetscCall(VecRestoreKokkosView(xx, &xv));
495   PetscCall(VecRestoreKokkosView(zz, &zv));
496   PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
497   PetscCall(PetscLogGpuTimeEnd());
498   PetscFunctionReturn(PETSC_SUCCESS);
499 }
500 
501 /* z = A^H x + y */
502 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz)
503 {
504   Mat_SeqAIJKokkos          *aijkok;
505   const char                *mode;
506   ConstPetscScalarKokkosView xv;
507   PetscScalarKokkosView      zv;
508   KokkosCsrMatrix            csrmat;
509 
510   PetscFunctionBegin;
511   PetscCall(PetscLogGpuTimeBegin());
512   PetscCall(MatSeqAIJKokkosSyncDevice(A));
513   if (zz != yy) PetscCall(VecCopy(yy, zz));
514   PetscCall(VecGetKokkosView(xx, &xv));
515   PetscCall(VecGetKokkosView(zz, &zv));
516   if (A->form_explicit_transpose) {
517     PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat));
518     mode = "N";
519   } else {
520     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
521     csrmat = aijkok->csrmat;
522     mode   = "C";
523   }
524   PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^H x + beta z */
525   PetscCall(VecRestoreKokkosView(xx, &xv));
526   PetscCall(VecRestoreKokkosView(zz, &zv));
527   PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
528   PetscCall(PetscLogGpuTimeEnd());
529   PetscFunctionReturn(PETSC_SUCCESS);
530 }
531 
532 static PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A, MatOption op, PetscBool flg)
533 {
534   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
535 
536   PetscFunctionBegin;
537   switch (op) {
538   case MAT_FORM_EXPLICIT_TRANSPOSE:
539     /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
540     if (A->form_explicit_transpose && !flg && aijkok) PetscCall(aijkok->DestroyMatTranspose());
541     A->form_explicit_transpose = flg;
542     break;
543   default:
544     PetscCall(MatSetOption_SeqAIJ(A, op, flg));
545     break;
546   }
547   PetscFunctionReturn(PETSC_SUCCESS);
548 }
549 
550 /* Depending on reuse, either build a new mat, or use the existing mat */
551 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat *newmat)
552 {
553   Mat_SeqAIJ *aseq;
554 
555   PetscFunctionBegin;
556   PetscCall(PetscKokkosInitializeCheck());
557   if (reuse == MAT_INITIAL_MATRIX) {                      /* Build a brand new mat */
558     PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat));  /* the returned newmat is a SeqAIJKokkos */
559   } else if (reuse == MAT_REUSE_MATRIX) {                 /* Reuse the mat created before */
560     PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN)); /* newmat is already a SeqAIJKokkos */
561   } else if (reuse == MAT_INPLACE_MATRIX) {               /* newmat is A */
562     PetscCheck(A == *newmat, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "A != *newmat with MAT_INPLACE_MATRIX");
563     PetscCall(PetscFree(A->defaultvectype));
564     PetscCall(PetscStrallocpy(VECKOKKOS, &A->defaultvectype)); /* Allocate and copy the string */
565     PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATSEQAIJKOKKOS));
566     PetscCall(MatSetOps_SeqAIJKokkos(A));
567     aseq = static_cast<Mat_SeqAIJ *>(A->data);
568     if (A->assembled) { /* Copy i, j (but not values) to device for an assembled matrix if not yet */
569       PetscCheck(!A->spptr, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Expect NULL (Mat_SeqAIJKokkos*)A->spptr");
570       A->spptr = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aseq, A->nonzerostate, PETSC_FALSE);
571     }
572   }
573   PetscFunctionReturn(PETSC_SUCCESS);
574 }
575 
576 /* MatDuplicate always creates a new matrix. MatDuplicate can be called either on an assembled matrix or
577    an unassembled matrix, even though MAT_COPY_VALUES is not allowed for unassembled matrix.
578  */
579 static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A, MatDuplicateOption dupOption, Mat *B)
580 {
581   Mat_SeqAIJ       *bseq;
582   Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr), *bkok;
583   Mat               mat;
584 
585   PetscFunctionBegin;
586   /* Do not copy values on host as A's latest values might be on device. We don't want to do sync blindly */
587   PetscCall(MatDuplicate_SeqAIJ(A, MAT_DO_NOT_COPY_VALUES, B));
588   mat = *B;
589   if (A->assembled) {
590     bseq = static_cast<Mat_SeqAIJ *>(mat->data);
591     bkok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, bseq, mat->nonzerostate, PETSC_FALSE);
592     bkok->a_dual.clear_sync_state(); /* Clear B's sync state as it will be decided below */
593     /* Now copy values to B if needed */
594     if (dupOption == MAT_COPY_VALUES) {
595       if (akok->a_dual.need_sync_device()) {
596         Kokkos::deep_copy(bkok->a_dual.view_host(), akok->a_dual.view_host());
597         bkok->a_dual.modify_host();
598       } else { /* If device has the latest data, we only copy data on device */
599         Kokkos::deep_copy(bkok->a_dual.view_device(), akok->a_dual.view_device());
600         bkok->a_dual.modify_device();
601       }
602     } else { /* MAT_DO_NOT_COPY_VALUES or MAT_SHARE_NONZERO_PATTERN. B's values should be zeroed */
603       /* B's values on host should be already zeroed by MatDuplicate_SeqAIJ() */
604       bkok->a_dual.modify_host();
605     }
606     mat->spptr = bkok;
607   }
608 
609   PetscCall(PetscFree(mat->defaultvectype));
610   PetscCall(PetscStrallocpy(VECKOKKOS, &mat->defaultvectype)); /* Allocate and copy the string */
611   PetscCall(PetscObjectChangeTypeName((PetscObject)mat, MATSEQAIJKOKKOS));
612   PetscCall(MatSetOps_SeqAIJKokkos(mat));
613   PetscFunctionReturn(PETSC_SUCCESS);
614 }
615 
616 static PetscErrorCode MatTranspose_SeqAIJKokkos(Mat A, MatReuse reuse, Mat *B)
617 {
618   Mat               At;
619   KokkosCsrMatrix   internT;
620   Mat_SeqAIJKokkos *atkok, *bkok;
621 
622   PetscFunctionBegin;
623   if (reuse == MAT_REUSE_MATRIX) PetscCall(MatTransposeCheckNonzeroState_Private(A, *B));
624   PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &internT)); /* Generate a transpose internally */
625   if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_INPLACE_MATRIX) {
626     /* Deep copy internT, as we want to isolate the internal transpose */
627     PetscCallCXX(atkok = new Mat_SeqAIJKokkos(KokkosCsrMatrix("csrmat", internT)));
628     PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PetscObjectComm((PetscObject)A), atkok, &At));
629     if (reuse == MAT_INITIAL_MATRIX) *B = At;
630     else PetscCall(MatHeaderReplace(A, &At)); /* Replace A with At inplace */
631   } else {                                    /* MAT_REUSE_MATRIX, just need to copy values to B on device */
632     if ((*B)->assembled) {
633       bkok = static_cast<Mat_SeqAIJKokkos *>((*B)->spptr);
634       PetscCallCXX(Kokkos::deep_copy(bkok->a_dual.view_device(), internT.values));
635       PetscCall(MatSeqAIJKokkosModifyDevice(*B));
636     } else if ((*B)->preallocated) { /* It is ok for B to be only preallocated, as needed in MatTranspose_MPIAIJ */
637       Mat_SeqAIJ             *bseq = static_cast<Mat_SeqAIJ *>((*B)->data);
638       MatScalarKokkosViewHost a_h(bseq->a, internT.nnz()); /* bseq->nz = 0 if unassembled */
639       MatColIdxKokkosViewHost j_h(bseq->j, internT.nnz());
640       PetscCallCXX(Kokkos::deep_copy(a_h, internT.values));
641       PetscCallCXX(Kokkos::deep_copy(j_h, internT.graph.entries));
642     } else SETERRQ(PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "B must be assembled or preallocated");
643   }
644   PetscFunctionReturn(PETSC_SUCCESS);
645 }
646 
647 static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A)
648 {
649   Mat_SeqAIJKokkos *aijkok;
650 
651   PetscFunctionBegin;
652   if (A->factortype == MAT_FACTOR_NONE) {
653     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
654     delete aijkok;
655   } else {
656     delete static_cast<Mat_SeqAIJKokkosTriFactors *>(A->spptr);
657   }
658   A->spptr = NULL;
659   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatFactorGetSolverType_C", NULL));
660   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
661   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
662 #if defined(PETSC_HAVE_HYPRE)
663   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijkokkos_hypre_C", NULL));
664 #endif
665   PetscCall(MatDestroy_SeqAIJ(A));
666   PetscFunctionReturn(PETSC_SUCCESS);
667 }
668 
669 /*MC
670    MATSEQAIJKOKKOS - MATAIJKOKKOS = "(seq)aijkokkos" - A matrix type to be used for sparse matrices with Kokkos
671 
672    A matrix type using Kokkos-Kernels CrsMatrix type for portability across different device types
673 
674    Options Database Key:
675 .  -mat_type aijkokkos - sets the matrix type to `MATSEQAIJKOKKOS` during a call to `MatSetFromOptions()`
676 
677   Level: beginner
678 
679 .seealso: [](ch_matrices), `Mat`, `MatCreateSeqAIJKokkos()`, `MATMPIAIJKOKKOS`
680 M*/
681 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A)
682 {
683   PetscFunctionBegin;
684   PetscCall(PetscKokkosInitializeCheck());
685   PetscCall(MatCreate_SeqAIJ(A));
686   PetscCall(MatConvert_SeqAIJ_SeqAIJKokkos(A, MATSEQAIJKOKKOS, MAT_INPLACE_MATRIX, &A));
687   PetscFunctionReturn(PETSC_SUCCESS);
688 }
689 
690 /* Merge A, B into a matrix C. A is put before B. C's size would be A->rmap->n by (A->cmap->n + B->cmap->n) */
691 PetscErrorCode MatSeqAIJKokkosMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C)
692 {
693   Mat_SeqAIJ         *a, *b;
694   Mat_SeqAIJKokkos   *akok, *bkok, *ckok;
695   MatScalarKokkosView aa, ba, ca;
696   MatRowMapKokkosView ai, bi, ci;
697   MatColIdxKokkosView aj, bj, cj;
698   PetscInt            m, n, nnz, aN;
699 
700   PetscFunctionBegin;
701   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
702   PetscValidHeaderSpecific(B, MAT_CLASSID, 2);
703   PetscAssertPointer(C, 4);
704   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
705   PetscCheckTypeName(B, MATSEQAIJKOKKOS);
706   PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n);
707   PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported");
708 
709   PetscCall(MatSeqAIJKokkosSyncDevice(A));
710   PetscCall(MatSeqAIJKokkosSyncDevice(B));
711   a    = static_cast<Mat_SeqAIJ *>(A->data);
712   b    = static_cast<Mat_SeqAIJ *>(B->data);
713   akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
714   bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
715   aa   = akok->a_dual.view_device();
716   ai   = akok->i_dual.view_device();
717   ba   = bkok->a_dual.view_device();
718   bi   = bkok->i_dual.view_device();
719   m    = A->rmap->n; /* M, N and nnz of C */
720   n    = A->cmap->n + B->cmap->n;
721   nnz  = a->nz + b->nz;
722   aN   = A->cmap->n; /* N of A */
723   if (reuse == MAT_INITIAL_MATRIX) {
724     aj           = akok->j_dual.view_device();
725     bj           = bkok->j_dual.view_device();
726     auto ca_dual = MatScalarKokkosDualView("a", aa.extent(0) + ba.extent(0));
727     auto ci_dual = MatRowMapKokkosDualView("i", ai.extent(0));
728     auto cj_dual = MatColIdxKokkosDualView("j", aj.extent(0) + bj.extent(0));
729     ca           = ca_dual.view_device();
730     ci           = ci_dual.view_device();
731     cj           = cj_dual.view_device();
732 
733     /* Concatenate A and B in parallel using Kokkos hierarchical parallelism */
734     Kokkos::parallel_for(
735       Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
736         PetscInt i       = t.league_rank(); /* row i */
737         PetscInt coffset = ai(i) + bi(i), alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i);
738 
739         Kokkos::single(Kokkos::PerTeam(t), [=]() { /* this side effect only happens once per whole team */
740                                                    ci(i) = coffset;
741                                                    if (i == m - 1) ci(m) = ai(m) + bi(m);
742         });
743 
744         Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) {
745           if (k < alen) {
746             ca(coffset + k) = aa(ai(i) + k);
747             cj(coffset + k) = aj(ai(i) + k);
748           } else {
749             ca(coffset + k) = ba(bi(i) + k - alen);
750             cj(coffset + k) = bj(bi(i) + k - alen) + aN; /* Entries in B get new column indices in C */
751           }
752         });
753       });
754     ca_dual.modify_device();
755     ci_dual.modify_device();
756     cj_dual.modify_device();
757     PetscCallCXX(ckok = new Mat_SeqAIJKokkos(m, n, nnz, ci_dual, cj_dual, ca_dual));
758     PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, ckok, C));
759   } else if (reuse == MAT_REUSE_MATRIX) {
760     PetscValidHeaderSpecific(*C, MAT_CLASSID, 4);
761     PetscCheckTypeName(*C, MATSEQAIJKOKKOS);
762     ckok = static_cast<Mat_SeqAIJKokkos *>((*C)->spptr);
763     ca   = ckok->a_dual.view_device();
764     ci   = ckok->i_dual.view_device();
765 
766     Kokkos::parallel_for(
767       Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
768         PetscInt i    = t.league_rank(); /* row i */
769         PetscInt alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i);
770         Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) {
771           if (k < alen) ca(ci(i) + k) = aa(ai(i) + k);
772           else ca(ci(i) + k) = ba(bi(i) + k - alen);
773         });
774       });
775     PetscCall(MatSeqAIJKokkosModifyDevice(*C));
776   }
777   PetscFunctionReturn(PETSC_SUCCESS);
778 }
779 
780 static PetscErrorCode MatProductDataDestroy_SeqAIJKokkos(void *pdata)
781 {
782   PetscFunctionBegin;
783   delete static_cast<MatProductData_SeqAIJKokkos *>(pdata);
784   PetscFunctionReturn(PETSC_SUCCESS);
785 }
786 
787 static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C)
788 {
789   Mat_Product                 *product = C->product;
790   Mat                          A, B;
791   bool                         transA, transB; /* use bool, since KK needs this type */
792   Mat_SeqAIJKokkos            *akok, *bkok, *ckok;
793   Mat_SeqAIJ                  *c;
794   MatProductData_SeqAIJKokkos *pdata;
795   KokkosCsrMatrix              csrmatA, csrmatB;
796 
797   PetscFunctionBegin;
798   MatCheckProduct(C, 1);
799   PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Product data empty");
800   pdata = static_cast<MatProductData_SeqAIJKokkos *>(C->product->data);
801 
802   // See if numeric has already been done in symbolic (e.g., user calls MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C)).
803   // If yes, skip the numeric, but reset the flag so that next time when user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C),
804   // we still do numeric.
805   if (pdata->reusesym) { // numeric reuses results from symbolic
806     pdata->reusesym = PETSC_FALSE;
807     PetscFunctionReturn(PETSC_SUCCESS);
808   }
809 
810   switch (product->type) {
811   case MATPRODUCT_AB:
812     transA = false;
813     transB = false;
814     break;
815   case MATPRODUCT_AtB:
816     transA = true;
817     transB = false;
818     break;
819   case MATPRODUCT_ABt:
820     transA = false;
821     transB = true;
822     break;
823   default:
824     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]);
825   }
826 
827   A = product->A;
828   B = product->B;
829   PetscCall(MatSeqAIJKokkosSyncDevice(A));
830   PetscCall(MatSeqAIJKokkosSyncDevice(B));
831   akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
832   bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
833   ckok = static_cast<Mat_SeqAIJKokkos *>(C->spptr);
834 
835   PetscCheck(ckok, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Device data structure spptr is empty");
836 
837   csrmatA = akok->csrmat;
838   csrmatB = bkok->csrmat;
839 
840   /* TODO: Once KK spgemm implements transpose, we can get rid of the explicit transpose here */
841   if (transA) {
842     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA));
843     transA = false;
844   }
845 
846   if (transB) {
847     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB));
848     transB = false;
849   }
850   PetscCall(PetscLogGpuTimeBegin());
851   PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, csrmatA, transA, csrmatB, transB, ckok->csrmat));
852 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(4, 0, 0)
853   auto spgemmHandle = pdata->kh.get_spgemm_handle();
854   if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(ckok->csrmat)); /* without sort, mat_tests-ex62_14_seqaijkokkos fails */
855 #endif
856 
857   PetscCall(PetscLogGpuTimeEnd());
858   PetscCall(MatSeqAIJKokkosModifyDevice(C));
859   /* shorter version of MatAssemblyEnd_SeqAIJ */
860   c = (Mat_SeqAIJ *)C->data;
861   PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz));
862   PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n"));
863   PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax));
864   c->reallocs         = 0;
865   C->info.mallocs     = 0;
866   C->info.nz_unneeded = 0;
867   C->assembled = C->was_assembled = PETSC_TRUE;
868   C->num_ass++;
869   PetscFunctionReturn(PETSC_SUCCESS);
870 }
871 
872 static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C)
873 {
874   Mat_Product                 *product = C->product;
875   MatProductType               ptype;
876   Mat                          A, B;
877   bool                         transA, transB;
878   Mat_SeqAIJKokkos            *akok, *bkok, *ckok;
879   MatProductData_SeqAIJKokkos *pdata;
880   MPI_Comm                     comm;
881   KokkosCsrMatrix              csrmatA, csrmatB, csrmatC;
882 
883   PetscFunctionBegin;
884   MatCheckProduct(C, 1);
885   PetscCall(PetscObjectGetComm((PetscObject)C, &comm));
886   PetscCheck(!product->data, comm, PETSC_ERR_PLIB, "Product data not empty");
887   A = product->A;
888   B = product->B;
889   PetscCall(MatSeqAIJKokkosSyncDevice(A));
890   PetscCall(MatSeqAIJKokkosSyncDevice(B));
891   akok    = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
892   bkok    = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
893   csrmatA = akok->csrmat;
894   csrmatB = bkok->csrmat;
895 
896   ptype = product->type;
897   // Take advantage of the symmetry if true
898   if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
899     ptype                                          = MATPRODUCT_AB;
900     product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE;
901   }
902   if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
903     ptype                                          = MATPRODUCT_AB;
904     product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE;
905   }
906 
907   switch (ptype) {
908   case MATPRODUCT_AB:
909     transA = false;
910     transB = false;
911     PetscCall(MatSetBlockSizesFromMats(C, A, B));
912     break;
913   case MATPRODUCT_AtB:
914     transA = true;
915     transB = false;
916     if (A->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, A->cmap->bs));
917     if (B->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->cmap->bs));
918     break;
919   case MATPRODUCT_ABt:
920     transA = false;
921     transB = true;
922     if (A->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, A->rmap->bs));
923     if (B->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->rmap->bs));
924     break;
925   default:
926     SETERRQ(comm, PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]);
927   }
928   PetscCallCXX(product->data = pdata = new MatProductData_SeqAIJKokkos());
929   pdata->reusesym = product->api_user;
930 
931   /* TODO: add command line options to select spgemm algorithms */
932   auto spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_DEFAULT; /* default alg is TPL if enabled, otherwise KK */
933 
934   /* CUDA-10.2's spgemm has bugs. We prefer the SpGEMMreuse APIs introduced in cuda-11.4 */
935 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
936   #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
937   spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_KK;
938   #endif
939 #endif
940   PetscCallCXX(pdata->kh.create_spgemm_handle(spgemm_alg));
941 
942   PetscCall(PetscLogGpuTimeBegin());
943   /* TODO: Get rid of the explicit transpose once KK-spgemm implements the transpose option */
944   if (transA) {
945     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA));
946     transA = false;
947   }
948 
949   if (transB) {
950     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB));
951     transB = false;
952   }
953 
954   PetscCallCXX(KokkosSparse::spgemm_symbolic(pdata->kh, csrmatA, transA, csrmatB, transB, csrmatC));
955   /* spgemm_symbolic() only populates C's rowmap, but not C's column indices.
956     So we have to do a fake spgemm_numeric() here to get csrmatC.j_d setup, before
957     calling new Mat_SeqAIJKokkos().
958     TODO: Remove the fake spgemm_numeric() after KK fixed this problem.
959   */
960   PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, csrmatA, transA, csrmatB, transB, csrmatC));
961 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(4, 0, 0)
962   /* Query if KK outputs a sorted matrix. If not, we need to sort it */
963   auto spgemmHandle = pdata->kh.get_spgemm_handle();
964   if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(csrmatC)); /* sort_option defaults to -1 in KK!*/
965 #endif
966   PetscCall(PetscLogGpuTimeEnd());
967 
968   PetscCallCXX(ckok = new Mat_SeqAIJKokkos(csrmatC));
969   PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(C, ckok));
970   C->product->destroy = MatProductDataDestroy_SeqAIJKokkos;
971   PetscFunctionReturn(PETSC_SUCCESS);
972 }
973 
974 /* handles sparse matrix matrix ops */
975 static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat)
976 {
977   Mat_Product *product = mat->product;
978   PetscBool    Biskok = PETSC_FALSE, Ciskok = PETSC_TRUE;
979 
980   PetscFunctionBegin;
981   MatCheckProduct(mat, 1);
982   PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJKOKKOS, &Biskok));
983   if (product->type == MATPRODUCT_ABC) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJKOKKOS, &Ciskok));
984   if (Biskok && Ciskok) {
985     switch (product->type) {
986     case MATPRODUCT_AB:
987     case MATPRODUCT_AtB:
988     case MATPRODUCT_ABt:
989       mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos;
990       break;
991     case MATPRODUCT_PtAP:
992     case MATPRODUCT_RARt:
993     case MATPRODUCT_ABC:
994       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
995       break;
996     default:
997       break;
998     }
999   } else { /* fallback for AIJ */
1000     PetscCall(MatProductSetFromOptions_SeqAIJ(mat));
1001   }
1002   PetscFunctionReturn(PETSC_SUCCESS);
1003 }
1004 
1005 static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a)
1006 {
1007   Mat_SeqAIJKokkos *aijkok;
1008 
1009   PetscFunctionBegin;
1010   PetscCall(PetscLogGpuTimeBegin());
1011   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1012   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1013   KokkosBlas::scal(PetscGetKokkosExecutionSpace(), aijkok->a_dual.view_device(), a, aijkok->a_dual.view_device());
1014   PetscCall(MatSeqAIJKokkosModifyDevice(A));
1015   PetscCall(PetscLogGpuFlops(aijkok->a_dual.extent(0)));
1016   PetscCall(PetscLogGpuTimeEnd());
1017   PetscFunctionReturn(PETSC_SUCCESS);
1018 }
1019 
1020 // add a to A's diagonal (if A is square) or main diagonal (if A is rectangular)
1021 static PetscErrorCode MatShift_SeqAIJKokkos(Mat A, PetscScalar a)
1022 {
1023   Mat_SeqAIJ *aijseq = static_cast<Mat_SeqAIJ *>(A->data);
1024 
1025   PetscFunctionBegin;
1026   if (A->assembled && aijseq->diagonaldense) { // no missing diagonals
1027     PetscInt n = PetscMin(A->rmap->n, A->cmap->n);
1028 
1029     PetscCall(PetscLogGpuTimeBegin());
1030     PetscCall(MatSeqAIJKokkosSyncDevice(A));
1031     const auto  aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1032     const auto &Aa     = aijkok->a_dual.view_device();
1033     const auto &Adiag  = aijkok->diag_dual.view_device();
1034     PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) { Aa(Adiag(i)) += a; }));
1035     PetscCall(MatSeqAIJKokkosModifyDevice(A));
1036     PetscCall(PetscLogGpuFlops(n));
1037     PetscCall(PetscLogGpuTimeEnd());
1038   } else { // need reassembly, very slow!
1039     PetscCall(MatShift_Basic(A, a));
1040   }
1041   PetscFunctionReturn(PETSC_SUCCESS);
1042 }
1043 
1044 static PetscErrorCode MatDiagonalSet_SeqAIJKokkos(Mat Y, Vec D, InsertMode is)
1045 {
1046   Mat_SeqAIJ *aijseq = static_cast<Mat_SeqAIJ *>(Y->data);
1047 
1048   PetscFunctionBegin;
1049   if (Y->assembled && aijseq->diagonaldense) { // no missing diagonals
1050     ConstPetscScalarKokkosView dv;
1051     PetscInt                   n, nv;
1052 
1053     PetscCall(PetscLogGpuTimeBegin());
1054     PetscCall(MatSeqAIJKokkosSyncDevice(Y));
1055     PetscCall(VecGetKokkosView(D, &dv));
1056     PetscCall(VecGetLocalSize(D, &nv));
1057     n = PetscMin(Y->rmap->n, Y->cmap->n);
1058     PetscCheck(n == nv, PetscObjectComm((PetscObject)Y), PETSC_ERR_ARG_SIZ, "Matrix size and vector size do not match");
1059 
1060     const auto  aijkok = static_cast<Mat_SeqAIJKokkos *>(Y->spptr);
1061     const auto &Aa     = aijkok->a_dual.view_device();
1062     const auto &Adiag  = aijkok->diag_dual.view_device();
1063     PetscCallCXX(Kokkos::parallel_for(
1064       Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) {
1065         if (is == INSERT_VALUES) Aa(Adiag(i)) = dv(i);
1066         else Aa(Adiag(i)) += dv(i);
1067       }));
1068     PetscCall(VecRestoreKokkosView(D, &dv));
1069     PetscCall(MatSeqAIJKokkosModifyDevice(Y));
1070     PetscCall(PetscLogGpuFlops(n));
1071     PetscCall(PetscLogGpuTimeEnd());
1072   } else { // need reassembly, very slow!
1073     PetscCall(MatDiagonalSet_Default(Y, D, is));
1074   }
1075   PetscFunctionReturn(PETSC_SUCCESS);
1076 }
1077 
1078 static PetscErrorCode MatDiagonalScale_SeqAIJKokkos(Mat A, Vec ll, Vec rr)
1079 {
1080   Mat_SeqAIJ                *aijseq = static_cast<Mat_SeqAIJ *>(A->data);
1081   PetscInt                   m = A->rmap->n, n = A->cmap->n, nz = aijseq->nz;
1082   ConstPetscScalarKokkosView lv, rv;
1083 
1084   PetscFunctionBegin;
1085   PetscCall(PetscLogGpuTimeBegin());
1086   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1087   const auto  aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1088   const auto &Aa     = aijkok->a_dual.view_device();
1089   const auto &Ai     = aijkok->i_dual.view_device();
1090   const auto &Aj     = aijkok->j_dual.view_device();
1091   if (ll) {
1092     PetscCall(VecGetLocalSize(ll, &m));
1093     PetscCheck(m == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling vector wrong length");
1094     PetscCall(VecGetKokkosView(ll, &lv));
1095     PetscCallCXX(Kokkos::parallel_for( // for each row
1096       Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
1097         PetscInt i   = t.league_rank(); // row i
1098         PetscInt len = Ai(i + 1) - Ai(i);
1099         // scale entries on the row
1100         Kokkos::parallel_for(Kokkos::TeamThreadRange(t, len), [&](PetscInt j) { Aa(Ai(i) + j) *= lv(i); });
1101       }));
1102     PetscCall(VecRestoreKokkosView(ll, &lv));
1103     PetscCall(PetscLogGpuFlops(nz));
1104   }
1105   if (rr) {
1106     PetscCall(VecGetLocalSize(rr, &n));
1107     PetscCheck(n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Right scaling vector wrong length");
1108     PetscCall(VecGetKokkosView(rr, &rv));
1109     PetscCallCXX(Kokkos::parallel_for( // for each nonzero
1110       Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt k) { Aa(k) *= rv(Aj(k)); }));
1111     PetscCall(VecRestoreKokkosView(rr, &lv));
1112     PetscCall(PetscLogGpuFlops(nz));
1113   }
1114   PetscCall(MatSeqAIJKokkosModifyDevice(A));
1115   PetscCall(PetscLogGpuTimeEnd());
1116   PetscFunctionReturn(PETSC_SUCCESS);
1117 }
1118 
1119 static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A)
1120 {
1121   Mat_SeqAIJKokkos *aijkok;
1122 
1123   PetscFunctionBegin;
1124   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1125   if (aijkok) { /* Only zero the device if data is already there */
1126     KokkosBlas::fill(PetscGetKokkosExecutionSpace(), aijkok->a_dual.view_device(), 0.0);
1127     PetscCall(MatSeqAIJKokkosModifyDevice(A));
1128   } else { /* Might be preallocated but not assembled */
1129     PetscCall(MatZeroEntries_SeqAIJ(A));
1130   }
1131   PetscFunctionReturn(PETSC_SUCCESS);
1132 }
1133 
1134 static PetscErrorCode MatGetDiagonal_SeqAIJKokkos(Mat A, Vec x)
1135 {
1136   Mat_SeqAIJKokkos     *aijkok;
1137   PetscInt              n;
1138   PetscScalarKokkosView xv;
1139 
1140   PetscFunctionBegin;
1141   PetscCall(VecGetLocalSize(x, &n));
1142   PetscCheck(n == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector");
1143   PetscCheck(A->factortype == MAT_FACTOR_NONE, PETSC_COMM_SELF, PETSC_ERR_SUP, "MatGetDiagonal_SeqAIJKokkos not supported on factored matrices");
1144 
1145   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1146   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1147 
1148   const auto &Aa    = aijkok->a_dual.view_device();
1149   const auto &Ai    = aijkok->i_dual.view_device();
1150   const auto &Adiag = aijkok->diag_dual.view_device();
1151 
1152   PetscCall(VecGetKokkosViewWrite(x, &xv));
1153   Kokkos::parallel_for(
1154     Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) {
1155       if (Adiag(i) < Ai(i + 1)) xv(i) = Aa(Adiag(i));
1156       else xv(i) = 0;
1157     });
1158   PetscCall(VecRestoreKokkosViewWrite(x, &xv));
1159   PetscFunctionReturn(PETSC_SUCCESS);
1160 }
1161 
1162 /* Get a Kokkos View from a mat of type MatSeqAIJKokkos */
1163 PetscErrorCode MatSeqAIJGetKokkosView(Mat A, ConstMatScalarKokkosView *kv)
1164 {
1165   Mat_SeqAIJKokkos *aijkok;
1166 
1167   PetscFunctionBegin;
1168   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1169   PetscAssertPointer(kv, 2);
1170   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1171   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1172   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1173   *kv    = aijkok->a_dual.view_device();
1174   PetscFunctionReturn(PETSC_SUCCESS);
1175 }
1176 
1177 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, ConstMatScalarKokkosView *kv)
1178 {
1179   PetscFunctionBegin;
1180   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1181   PetscAssertPointer(kv, 2);
1182   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1183   PetscFunctionReturn(PETSC_SUCCESS);
1184 }
1185 
1186 PetscErrorCode MatSeqAIJGetKokkosView(Mat A, MatScalarKokkosView *kv)
1187 {
1188   Mat_SeqAIJKokkos *aijkok;
1189 
1190   PetscFunctionBegin;
1191   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1192   PetscAssertPointer(kv, 2);
1193   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1194   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1195   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1196   *kv    = aijkok->a_dual.view_device();
1197   PetscFunctionReturn(PETSC_SUCCESS);
1198 }
1199 
1200 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, MatScalarKokkosView *kv)
1201 {
1202   PetscFunctionBegin;
1203   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1204   PetscAssertPointer(kv, 2);
1205   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1206   PetscCall(MatSeqAIJKokkosModifyDevice(A));
1207   PetscFunctionReturn(PETSC_SUCCESS);
1208 }
1209 
1210 PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A, MatScalarKokkosView *kv)
1211 {
1212   Mat_SeqAIJKokkos *aijkok;
1213 
1214   PetscFunctionBegin;
1215   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1216   PetscAssertPointer(kv, 2);
1217   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1218   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1219   *kv    = aijkok->a_dual.view_device();
1220   PetscFunctionReturn(PETSC_SUCCESS);
1221 }
1222 
1223 PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A, MatScalarKokkosView *kv)
1224 {
1225   PetscFunctionBegin;
1226   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1227   PetscAssertPointer(kv, 2);
1228   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1229   PetscCall(MatSeqAIJKokkosModifyDevice(A));
1230   PetscFunctionReturn(PETSC_SUCCESS);
1231 }
1232 
1233 PetscErrorCode MatCreateSeqAIJKokkosWithKokkosViews(MPI_Comm comm, PetscInt m, PetscInt n, Kokkos::View<PetscInt *> &i_d, Kokkos::View<PetscInt *> &j_d, Kokkos::View<PetscScalar *> &a_d, Mat *A)
1234 {
1235   Mat_SeqAIJKokkos *akok;
1236 
1237   PetscFunctionBegin;
1238   // Create host copies of the input aij
1239   auto i_h = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), i_d);
1240   auto j_h = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), j_d);
1241   // Don't copy the vals to the host now
1242   auto a_h = Kokkos::create_mirror_view(HostMirrorMemorySpace(), a_d);
1243 
1244   MatScalarKokkosDualView a_dual = MatScalarKokkosDualView(a_d, a_h);
1245   // Note we have modified device data so it will copy lazily
1246   a_dual.modify_device();
1247   MatRowMapKokkosDualView i_dual = MatRowMapKokkosDualView(i_d, i_h);
1248   MatColIdxKokkosDualView j_dual = MatColIdxKokkosDualView(j_d, j_h);
1249 
1250   PetscCallCXX(akok = new Mat_SeqAIJKokkos(m, n, j_dual.extent(0), i_dual, j_dual, a_dual));
1251   PetscCall(MatCreate(comm, A));
1252   PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok));
1253   PetscFunctionReturn(PETSC_SUCCESS);
1254 }
1255 
1256 /* Computes Y += alpha X */
1257 static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y, PetscScalar alpha, Mat X, MatStructure pattern)
1258 {
1259   Mat_SeqAIJ              *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data;
1260   Mat_SeqAIJKokkos        *xkok, *ykok, *zkok;
1261   ConstMatScalarKokkosView Xa;
1262   MatScalarKokkosView      Ya;
1263   auto                     exec = PetscGetKokkosExecutionSpace();
1264 
1265   PetscFunctionBegin;
1266   PetscCheckTypeName(Y, MATSEQAIJKOKKOS);
1267   PetscCheckTypeName(X, MATSEQAIJKOKKOS);
1268   PetscCall(MatSeqAIJKokkosSyncDevice(Y));
1269   PetscCall(MatSeqAIJKokkosSyncDevice(X));
1270   PetscCall(PetscLogGpuTimeBegin());
1271 
1272   if (pattern != SAME_NONZERO_PATTERN && x->nz == y->nz) {
1273     PetscBool e;
1274     PetscCall(PetscArraycmp(x->i, y->i, Y->rmap->n + 1, &e));
1275     if (e) {
1276       PetscCall(PetscArraycmp(x->j, y->j, y->nz, &e));
1277       if (e) pattern = SAME_NONZERO_PATTERN;
1278     }
1279   }
1280 
1281   /* cusparseDcsrgeam2() computes C = alpha A + beta B. If one knew sparsity pattern of C, one can skip
1282     cusparseScsrgeam2_bufferSizeExt() / cusparseXcsrgeam2Nnz(), and directly call cusparseScsrgeam2().
1283     If X is SUBSET_NONZERO_PATTERN of Y, we could take advantage of this cusparse feature. However,
1284     KokkosSparse::spadd(alpha,A,beta,B,C) has symbolic and numeric phases, MatAXPY does not.
1285   */
1286   ykok = static_cast<Mat_SeqAIJKokkos *>(Y->spptr);
1287   xkok = static_cast<Mat_SeqAIJKokkos *>(X->spptr);
1288   Xa   = xkok->a_dual.view_device();
1289   Ya   = ykok->a_dual.view_device();
1290 
1291   if (pattern == SAME_NONZERO_PATTERN) {
1292     KokkosBlas::axpy(exec, alpha, Xa, Ya);
1293     PetscCall(MatSeqAIJKokkosModifyDevice(Y));
1294   } else if (pattern == SUBSET_NONZERO_PATTERN) {
1295     MatRowMapKokkosView Xi = xkok->i_dual.view_device(), Yi = ykok->i_dual.view_device();
1296     MatColIdxKokkosView Xj = xkok->j_dual.view_device(), Yj = ykok->j_dual.view_device();
1297 
1298     Kokkos::parallel_for(
1299       Kokkos::TeamPolicy<>(exec, Y->rmap->n, 1), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
1300         PetscInt i = t.league_rank(); // row i
1301         Kokkos::single(Kokkos::PerTeam(t), [=]() {
1302           // Only one thread works in a team
1303           PetscInt p, q = Yi(i);
1304           for (p = Xi(i); p < Xi(i + 1); p++) {          // For each nonzero on row i of X,
1305             while (Xj(p) != Yj(q) && q < Yi(i + 1)) q++; // find the matching nonzero on row i of Y.
1306             if (Xj(p) == Yj(q)) {                        // Found it
1307               Ya(q) += alpha * Xa(p);
1308               q++;
1309             } else {
1310             // If not found, it indicates the input is wrong (X is not a SUBSET_NONZERO_PATTERN of Y).
1311             // Just insert a NaN at the beginning of row i if it is not empty, to make the result wrong.
1312 #if PETSC_PKG_KOKKOS_VERSION_GE(3, 7, 0)
1313               if (Yi(i) != Yi(i + 1)) Ya(Yi(i)) = Kokkos::ArithTraits<PetscScalar>::nan();
1314 #else
1315               if (Yi(i) != Yi(i + 1)) Ya(Yi(i)) = Kokkos::Experimental::nan("1");
1316 #endif
1317             }
1318           }
1319         });
1320       });
1321     PetscCall(MatSeqAIJKokkosModifyDevice(Y));
1322   } else { // different nonzero patterns
1323     Mat             Z;
1324     KokkosCsrMatrix zcsr;
1325     KernelHandle    kh;
1326     kh.create_spadd_handle(true); // X, Y are sorted
1327     KokkosSparse::spadd_symbolic(&kh, xkok->csrmat, ykok->csrmat, zcsr);
1328     KokkosSparse::spadd_numeric(&kh, alpha, xkok->csrmat, (PetscScalar)1.0, ykok->csrmat, zcsr);
1329     zkok = new Mat_SeqAIJKokkos(zcsr);
1330     PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, zkok, &Z));
1331     PetscCall(MatHeaderReplace(Y, &Z));
1332     kh.destroy_spadd_handle();
1333   }
1334   PetscCall(PetscLogGpuTimeEnd());
1335   PetscCall(PetscLogGpuFlops(xkok->a_dual.extent(0) * 2)); // Because we scaled X and then added it to Y
1336   PetscFunctionReturn(PETSC_SUCCESS);
1337 }
1338 
1339 struct MatCOOStruct_SeqAIJKokkos {
1340   PetscCount           n;
1341   PetscCount           Atot;
1342   PetscInt             nz;
1343   PetscCountKokkosView jmap;
1344   PetscCountKokkosView perm;
1345 
1346   MatCOOStruct_SeqAIJKokkos(const MatCOOStruct_SeqAIJ *coo_h)
1347   {
1348     nz   = coo_h->nz;
1349     n    = coo_h->n;
1350     Atot = coo_h->Atot;
1351     jmap = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), PetscCountKokkosViewHost(coo_h->jmap, nz + 1));
1352     perm = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), PetscCountKokkosViewHost(coo_h->perm, Atot));
1353   }
1354 };
1355 
1356 static PetscErrorCode MatCOOStructDestroy_SeqAIJKokkos(void **data)
1357 {
1358   PetscFunctionBegin;
1359   PetscCallCXX(delete static_cast<MatCOOStruct_SeqAIJKokkos *>(*data));
1360   PetscFunctionReturn(PETSC_SUCCESS);
1361 }
1362 
1363 static PetscErrorCode MatSetPreallocationCOO_SeqAIJKokkos(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
1364 {
1365   Mat_SeqAIJKokkos          *akok;
1366   Mat_SeqAIJ                *aseq;
1367   PetscContainer             container_h;
1368   MatCOOStruct_SeqAIJ       *coo_h;
1369   MatCOOStruct_SeqAIJKokkos *coo_d;
1370 
1371   PetscFunctionBegin;
1372   PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, coo_i, coo_j));
1373   aseq = static_cast<Mat_SeqAIJ *>(mat->data);
1374   akok = static_cast<Mat_SeqAIJKokkos *>(mat->spptr);
1375   delete akok;
1376   mat->spptr = akok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, aseq, mat->nonzerostate + 1, PETSC_FALSE);
1377   PetscCall(MatZeroEntries_SeqAIJKokkos(mat));
1378 
1379   // Copy the COO struct to device
1380   PetscCall(PetscObjectQuery((PetscObject)mat, "__PETSc_MatCOOStruct_Host", (PetscObject *)&container_h));
1381   PetscCall(PetscContainerGetPointer(container_h, (void **)&coo_h));
1382   PetscCallCXX(coo_d = new MatCOOStruct_SeqAIJKokkos(coo_h));
1383 
1384   // Put the COO struct in a container and then attach that to the matrix
1385   PetscCall(PetscObjectContainerCompose((PetscObject)mat, "__PETSc_MatCOOStruct_Device", coo_d, MatCOOStructDestroy_SeqAIJKokkos));
1386   PetscFunctionReturn(PETSC_SUCCESS);
1387 }
1388 
1389 static PetscErrorCode MatSetValuesCOO_SeqAIJKokkos(Mat A, const PetscScalar v[], InsertMode imode)
1390 {
1391   MatScalarKokkosView        Aa;
1392   ConstMatScalarKokkosView   kv;
1393   PetscMemType               memtype;
1394   PetscContainer             container;
1395   MatCOOStruct_SeqAIJKokkos *coo;
1396 
1397   PetscFunctionBegin;
1398   PetscCall(PetscObjectQuery((PetscObject)A, "__PETSc_MatCOOStruct_Device", (PetscObject *)&container));
1399   PetscCall(PetscContainerGetPointer(container, (void **)&coo));
1400 
1401   const auto &n    = coo->n;
1402   const auto &Annz = coo->nz;
1403   const auto &jmap = coo->jmap;
1404   const auto &perm = coo->perm;
1405 
1406   PetscCall(PetscGetMemType(v, &memtype));
1407   if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */
1408     kv = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), ConstMatScalarKokkosViewHost(v, n));
1409   } else {
1410     kv = ConstMatScalarKokkosView(v, n); /* Directly use v[]'s memory */
1411   }
1412 
1413   if (imode == INSERT_VALUES) PetscCall(MatSeqAIJGetKokkosViewWrite(A, &Aa)); /* write matrix values */
1414   else PetscCall(MatSeqAIJGetKokkosView(A, &Aa));                             /* read & write matrix values */
1415 
1416   PetscCall(PetscLogGpuTimeBegin());
1417   Kokkos::parallel_for(
1418     Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, Annz), KOKKOS_LAMBDA(const PetscCount i) {
1419       PetscScalar sum = 0.0;
1420       for (PetscCount k = jmap(i); k < jmap(i + 1); k++) sum += kv(perm(k));
1421       Aa(i) = (imode == INSERT_VALUES ? 0.0 : Aa(i)) + sum;
1422     });
1423   PetscCall(PetscLogGpuTimeEnd());
1424 
1425   if (imode == INSERT_VALUES) PetscCall(MatSeqAIJRestoreKokkosViewWrite(A, &Aa));
1426   else PetscCall(MatSeqAIJRestoreKokkosView(A, &Aa));
1427   PetscFunctionReturn(PETSC_SUCCESS);
1428 }
1429 
1430 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A)
1431 {
1432   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
1433 
1434   PetscFunctionBegin;
1435   A->offloadmask = PETSC_OFFLOAD_KOKKOS; /* We do not really use this flag */
1436   A->boundtocpu  = PETSC_FALSE;
1437 
1438   A->ops->assemblyend               = MatAssemblyEnd_SeqAIJKokkos;
1439   A->ops->destroy                   = MatDestroy_SeqAIJKokkos;
1440   A->ops->duplicate                 = MatDuplicate_SeqAIJKokkos;
1441   A->ops->axpy                      = MatAXPY_SeqAIJKokkos;
1442   A->ops->scale                     = MatScale_SeqAIJKokkos;
1443   A->ops->zeroentries               = MatZeroEntries_SeqAIJKokkos;
1444   A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJKokkos;
1445   A->ops->mult                      = MatMult_SeqAIJKokkos;
1446   A->ops->multadd                   = MatMultAdd_SeqAIJKokkos;
1447   A->ops->multtranspose             = MatMultTranspose_SeqAIJKokkos;
1448   A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJKokkos;
1449   A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJKokkos;
1450   A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos;
1451   A->ops->productnumeric            = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos;
1452   A->ops->transpose                 = MatTranspose_SeqAIJKokkos;
1453   A->ops->setoption                 = MatSetOption_SeqAIJKokkos;
1454   A->ops->getdiagonal               = MatGetDiagonal_SeqAIJKokkos;
1455   A->ops->shift                     = MatShift_SeqAIJKokkos;
1456   A->ops->diagonalset               = MatDiagonalSet_SeqAIJKokkos;
1457   A->ops->diagonalscale             = MatDiagonalScale_SeqAIJKokkos;
1458   a->ops->getarray                  = MatSeqAIJGetArray_SeqAIJKokkos;
1459   a->ops->restorearray              = MatSeqAIJRestoreArray_SeqAIJKokkos;
1460   a->ops->getarrayread              = MatSeqAIJGetArrayRead_SeqAIJKokkos;
1461   a->ops->restorearrayread          = MatSeqAIJRestoreArrayRead_SeqAIJKokkos;
1462   a->ops->getarraywrite             = MatSeqAIJGetArrayWrite_SeqAIJKokkos;
1463   a->ops->restorearraywrite         = MatSeqAIJRestoreArrayWrite_SeqAIJKokkos;
1464   a->ops->getcsrandmemtype          = MatSeqAIJGetCSRAndMemType_SeqAIJKokkos;
1465 
1466   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJKokkos));
1467   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJKokkos));
1468 #if defined(PETSC_HAVE_HYPRE)
1469   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijkokkos_hypre_C", MatConvert_AIJ_HYPRE));
1470 #endif
1471   PetscFunctionReturn(PETSC_SUCCESS);
1472 }
1473 
1474 /*
1475    Extract the (prescribled) diagonal blocks of the matrix and then invert them
1476 
1477   Input Parameters:
1478 +  A       - the MATSEQAIJKOKKOS matrix
1479 .  bs      - block sizes in 'csr' format, i.e., the i-th block has size bs(i+1) - bs(i)
1480 .  bs2     - square of block sizes in 'csr' format, i.e., the i-th block should be stored at offset bs2(i) in diagVal[]
1481 .  blkMap  - map row ids to block ids, i.e., row i belongs to the block blkMap(i)
1482 -  work    - a pre-allocated work buffer (as big as diagVal) for use by this routine
1483 
1484   Output Parameter:
1485 .  diagVal - the (pre-allocated) buffer to store the inverted blocks (each block is stored in column-major order)
1486 */
1487 PETSC_INTERN PetscErrorCode MatInvertVariableBlockDiagonal_SeqAIJKokkos(Mat A, const PetscIntKokkosView &bs, const PetscIntKokkosView &bs2, const PetscIntKokkosView &blkMap, PetscScalarKokkosView &work, PetscScalarKokkosView &diagVal)
1488 {
1489   Mat_SeqAIJKokkos *akok    = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1490   PetscInt          nblocks = bs.extent(0) - 1;
1491 
1492   PetscFunctionBegin;
1493   PetscCall(MatSeqAIJKokkosSyncDevice(A)); // Since we'll access A's value on device
1494 
1495   // Pull out the diagonal blocks of the matrix and then invert the blocks
1496   auto Aa    = akok->a_dual.view_device();
1497   auto Ai    = akok->i_dual.view_device();
1498   auto Aj    = akok->j_dual.view_device();
1499   auto Adiag = akok->diag_dual.view_device();
1500   // TODO: how to tune the team size?
1501 #if defined(KOKKOS_ENABLE_UNIFIED_MEMORY)
1502   auto ts = Kokkos::AUTO();
1503 #else
1504   auto ts = 16; // improved performance 30% over Kokkos::AUTO() with CUDA, but failed with "Kokkos::abort: Requested Team Size is too large!" on CPUs
1505 #endif
1506   PetscCallCXX(Kokkos::parallel_for(
1507     Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), nblocks, ts), KOKKOS_LAMBDA(const KokkosTeamMemberType &teamMember) {
1508       const PetscInt bid    = teamMember.league_rank();                                                   // block id
1509       const PetscInt rstart = bs(bid);                                                                    // this block starts from this row
1510       const PetscInt m      = bs(bid + 1) - bs(bid);                                                      // size of this block
1511       const auto    &B      = Kokkos::View<PetscScalar **, Kokkos::LayoutLeft>(&diagVal(bs2(bid)), m, m); // column-major order
1512       const auto    &W      = PetscScalarKokkosView(&work(bs2(bid)), m * m);
1513 
1514       Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, m), [=](const PetscInt &r) { // r-th row in B
1515         PetscInt i = rstart + r;                                                            // i-th row in A
1516 
1517         if (Ai(i) <= Adiag(i) && Adiag(i) < Ai(i + 1)) { // if the diagonal exists (common case)
1518           PetscInt first = Adiag(i) - r;                 // we start to check nonzeros from here along this row
1519 
1520           for (PetscInt c = 0; c < m; c++) {                   // walk n steps to see what column indices we will meet
1521             if (first + c < Ai(i) || first + c >= Ai(i + 1)) { // this entry (first+c) is out of range of this row, in other words, its value is zero
1522               B(r, c) = 0.0;
1523             } else if (Aj(first + c) == rstart + c) { // this entry is right on the (rstart+c) column
1524               B(r, c) = Aa(first + c);
1525             } else { // this entry does not show up in the CSR
1526               B(r, c) = 0.0;
1527             }
1528           }
1529         } else { // rare case that the diagonal does not exist
1530           const PetscInt begin = Ai(i);
1531           const PetscInt end   = Ai(i + 1);
1532           for (PetscInt c = 0; c < m; c++) B(r, c) = 0.0;
1533           for (PetscInt j = begin; j < end; j++) { // scan the whole row; could use binary search but this is a rare case so we did not.
1534             if (rstart <= Aj(j) && Aj(j) < rstart + m) B(r, Aj(j) - rstart) = Aa(j);
1535             else if (Aj(j) >= rstart + m) break;
1536           }
1537         }
1538       });
1539 
1540       // LU-decompose B (w/o pivoting) and then invert B
1541       KokkosBatched::TeamLU<KokkosTeamMemberType, KokkosBatched::Algo::LU::Unblocked>::invoke(teamMember, B, 0.0);
1542       KokkosBatched::TeamInverseLU<KokkosTeamMemberType, KokkosBatched::Algo::InverseLU::Unblocked>::invoke(teamMember, B, W);
1543     }));
1544   // PetscLogGpuFlops() is done in the caller PCSetUp_VPBJacobi_Kokkos as we don't want to compute the flops in kernels
1545   PetscFunctionReturn(PETSC_SUCCESS);
1546 }
1547 
1548 PETSC_INTERN PetscErrorCode MatSetSeqAIJKokkosWithCSRMatrix(Mat A, Mat_SeqAIJKokkos *akok)
1549 {
1550   Mat_SeqAIJ *aseq;
1551   PetscInt    i, m, n;
1552   auto        exec = PetscGetKokkosExecutionSpace();
1553 
1554   PetscFunctionBegin;
1555   PetscCheck(!A->spptr, PETSC_COMM_SELF, PETSC_ERR_PLIB, "A->spptr is supposed to be empty");
1556 
1557   m = akok->nrows();
1558   n = akok->ncols();
1559   PetscCall(MatSetSizes(A, m, n, m, n));
1560   PetscCall(MatSetType(A, MATSEQAIJKOKKOS));
1561 
1562   /* Set up data structures of A as a MATSEQAIJ */
1563   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(A, MAT_SKIP_ALLOCATION, NULL));
1564   aseq = (Mat_SeqAIJ *)A->data;
1565 
1566   PetscCallCXX(akok->i_dual.sync_host(exec)); /* We always need sync'ed i, j on host */
1567   PetscCallCXX(akok->j_dual.sync_host(exec));
1568   PetscCallCXX(exec.fence());
1569 
1570   aseq->i       = akok->i_host_data();
1571   aseq->j       = akok->j_host_data();
1572   aseq->a       = akok->a_host_data();
1573   aseq->nonew   = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/
1574   aseq->free_a  = PETSC_FALSE;
1575   aseq->free_ij = PETSC_FALSE;
1576   aseq->nz      = akok->nnz();
1577   aseq->maxnz   = aseq->nz;
1578 
1579   PetscCall(PetscMalloc1(m, &aseq->imax));
1580   PetscCall(PetscMalloc1(m, &aseq->ilen));
1581   for (i = 0; i < m; i++) aseq->ilen[i] = aseq->imax[i] = aseq->i[i + 1] - aseq->i[i];
1582 
1583   /* It is critical to set the nonzerostate, as we use it to check if sparsity pattern (hence data) has changed on host in MatAssemblyEnd */
1584   akok->nonzerostate = A->nonzerostate;
1585   A->spptr           = akok; /* Set A->spptr before MatAssembly so that A->spptr won't be allocated again there */
1586   PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY));
1587   PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY));
1588   PetscFunctionReturn(PETSC_SUCCESS);
1589 }
1590 
1591 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGetKokkosCsrMatrix(Mat A, KokkosCsrMatrix *csr)
1592 {
1593   PetscFunctionBegin;
1594   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1595   *csr = static_cast<Mat_SeqAIJKokkos *>(A->spptr)->csrmat;
1596   PetscFunctionReturn(PETSC_SUCCESS);
1597 }
1598 
1599 PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithKokkosCsrMatrix(MPI_Comm comm, KokkosCsrMatrix csr, Mat *A)
1600 {
1601   Mat_SeqAIJKokkos *akok;
1602 
1603   PetscFunctionBegin;
1604   PetscCallCXX(akok = new Mat_SeqAIJKokkos(csr));
1605   PetscCall(MatCreate(comm, A));
1606   PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok));
1607   PetscFunctionReturn(PETSC_SUCCESS);
1608 }
1609 
1610 /* Crete a SEQAIJKOKKOS matrix with a Mat_SeqAIJKokkos data structure
1611 
1612    Note we have names like MatSeqAIJSetPreallocationCSR, so I use capitalized CSR
1613  */
1614 PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithCSRMatrix(MPI_Comm comm, Mat_SeqAIJKokkos *akok, Mat *A)
1615 {
1616   PetscFunctionBegin;
1617   PetscCall(MatCreate(comm, A));
1618   PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok));
1619   PetscFunctionReturn(PETSC_SUCCESS);
1620 }
1621 
1622 /*@C
1623   MatCreateSeqAIJKokkos - Creates a sparse matrix in `MATSEQAIJKOKKOS` (compressed row) format
1624   (the default parallel PETSc format). This matrix will ultimately be handled by
1625   Kokkos for calculations.
1626 
1627   Collective
1628 
1629   Input Parameters:
1630 + comm - MPI communicator, set to `PETSC_COMM_SELF`
1631 . m    - number of rows
1632 . n    - number of columns
1633 . nz   - number of nonzeros per row (same for all rows), ignored if `nnz` is provided
1634 - nnz  - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL`
1635 
1636   Output Parameter:
1637 . A - the matrix
1638 
1639   Level: intermediate
1640 
1641   Notes:
1642   It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
1643   MatXXXXSetPreallocation() paradgm instead of this routine directly.
1644   [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]
1645 
1646   The AIJ format, also called
1647   compressed row storage, is fully compatible with standard Fortran
1648   storage.  That is, the stored row and column indices can begin at
1649   either one (as in Fortran) or zero.
1650 
1651   Specify the preallocated storage with either `nz` or `nnz` (not both).
1652   Set `nz` = `PETSC_DEFAULT` and `nnz` = `NULL` for PETSc to control dynamic memory
1653   allocation.
1654 
1655 .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`
1656 @*/
1657 PetscErrorCode MatCreateSeqAIJKokkos(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A)
1658 {
1659   PetscFunctionBegin;
1660   PetscCall(PetscKokkosInitializeCheck());
1661   PetscCall(MatCreate(comm, A));
1662   PetscCall(MatSetSizes(*A, m, n, m, n));
1663   PetscCall(MatSetType(*A, MATSEQAIJKOKKOS));
1664   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz));
1665   PetscFunctionReturn(PETSC_SUCCESS);
1666 }
1667 
1668 // After matrix numeric factorization, there are still steps to do before triangular solve can be called.
1669 // For example, for transpose solve, we might need to compute the transpose matrices if the solver does not support it (such as KK, while cusparse does).
1670 // In cusparse, one has to call cusparseSpSV_analysis() with updated triangular matrix values before calling cusparseSpSV_solve().
1671 // Simiarily, in KK sptrsv_symbolic() has to be called before sptrsv_solve(). We put these steps in MatSeqAIJKokkos{Transpose}SolveCheck.
1672 static PetscErrorCode MatSeqAIJKokkosSolveCheck(Mat A)
1673 {
1674   Mat_SeqAIJKokkosTriFactors *factors   = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1675   const PetscBool             has_lower = factors->iL_d.extent(0) ? PETSC_TRUE : PETSC_FALSE; // false with Choleksy
1676   const PetscBool             has_upper = factors->iU_d.extent(0) ? PETSC_TRUE : PETSC_FALSE; // true with LU and Choleksy
1677 
1678   PetscFunctionBegin;
1679   if (!factors->sptrsv_symbolic_completed) { // If sptrsv_symbolic was not called yet
1680     if (has_upper) PetscCallCXX(sptrsv_symbolic(&factors->khU, factors->iU_d, factors->jU_d, factors->aU_d));
1681     if (has_lower) PetscCallCXX(sptrsv_symbolic(&factors->khL, factors->iL_d, factors->jL_d, factors->aL_d));
1682     factors->sptrsv_symbolic_completed = PETSC_TRUE;
1683   }
1684   PetscFunctionReturn(PETSC_SUCCESS);
1685 }
1686 
1687 static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A)
1688 {
1689   const PetscInt              n         = A->rmap->n;
1690   Mat_SeqAIJKokkosTriFactors *factors   = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1691   const PetscBool             has_lower = factors->iL_d.extent(0) ? PETSC_TRUE : PETSC_FALSE; // false with Choleksy
1692   const PetscBool             has_upper = factors->iU_d.extent(0) ? PETSC_TRUE : PETSC_FALSE; // true with LU or Choleksy
1693 
1694   PetscFunctionBegin;
1695   if (!factors->transpose_updated) {
1696     if (has_upper) {
1697       if (!factors->iUt_d.extent(0)) {                                 // Allocate Ut on device if not yet
1698         factors->iUt_d = MatRowMapKokkosView("factors->iUt_d", n + 1); // KK requires this view to be initialized to 0 to call transpose_matrix
1699         factors->jUt_d = MatColIdxKokkosView(NoInit("factors->jUt_d"), factors->jU_d.extent(0));
1700         factors->aUt_d = MatScalarKokkosView(NoInit("factors->aUt_d"), factors->aU_d.extent(0));
1701       }
1702 
1703       if (factors->iU_h.extent(0)) { // If U is on host (factorization was done on host), we also compute the transpose on host
1704         if (!factors->U) {
1705           Mat_SeqAIJ *seq;
1706 
1707           PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF, n, n, factors->iU_h.data(), factors->jU_h.data(), factors->aU_h.data(), &factors->U));
1708           PetscCall(MatTranspose(factors->U, MAT_INITIAL_MATRIX, &factors->Ut));
1709 
1710           seq            = static_cast<Mat_SeqAIJ *>(factors->Ut->data);
1711           factors->iUt_h = MatRowMapKokkosViewHost(seq->i, n + 1);
1712           factors->jUt_h = MatColIdxKokkosViewHost(seq->j, seq->nz);
1713           factors->aUt_h = MatScalarKokkosViewHost(seq->a, seq->nz);
1714         } else {
1715           PetscCall(MatTranspose(factors->U, MAT_REUSE_MATRIX, &factors->Ut)); // Matrix Ut' data is aliased with {i, j, a}Ut_h
1716         }
1717         // Copy Ut from host to device
1718         PetscCallCXX(Kokkos::deep_copy(factors->iUt_d, factors->iUt_h));
1719         PetscCallCXX(Kokkos::deep_copy(factors->jUt_d, factors->jUt_h));
1720         PetscCallCXX(Kokkos::deep_copy(factors->aUt_d, factors->aUt_h));
1721       } else { // If U was computed on device, we also compute the transpose there
1722         // TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices. We have to sort the indices, until KK provides finer control options.
1723         PetscCallCXX(transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iU_d,
1724                                                                                                                                                                                                                                factors->jU_d, factors->aU_d,
1725                                                                                                                                                                                                                                factors->iUt_d, factors->jUt_d,
1726                                                                                                                                                                                                                                factors->aUt_d));
1727         PetscCallCXX(sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iUt_d, factors->jUt_d, factors->aUt_d));
1728       }
1729       PetscCallCXX(sptrsv_symbolic(&factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d));
1730     }
1731 
1732     // do the same for L with LU
1733     if (has_lower) {
1734       if (!factors->iLt_d.extent(0)) {                                 // Allocate Lt on device if not yet
1735         factors->iLt_d = MatRowMapKokkosView("factors->iLt_d", n + 1); // KK requires this view to be initialized to 0 to call transpose_matrix
1736         factors->jLt_d = MatColIdxKokkosView(NoInit("factors->jLt_d"), factors->jL_d.extent(0));
1737         factors->aLt_d = MatScalarKokkosView(NoInit("factors->aLt_d"), factors->aL_d.extent(0));
1738       }
1739 
1740       if (factors->iL_h.extent(0)) { // If L is on host, we also compute the transpose on host
1741         if (!factors->L) {
1742           Mat_SeqAIJ *seq;
1743 
1744           PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF, n, n, factors->iL_h.data(), factors->jL_h.data(), factors->aL_h.data(), &factors->L));
1745           PetscCall(MatTranspose(factors->L, MAT_INITIAL_MATRIX, &factors->Lt));
1746 
1747           seq            = static_cast<Mat_SeqAIJ *>(factors->Lt->data);
1748           factors->iLt_h = MatRowMapKokkosViewHost(seq->i, n + 1);
1749           factors->jLt_h = MatColIdxKokkosViewHost(seq->j, seq->nz);
1750           factors->aLt_h = MatScalarKokkosViewHost(seq->a, seq->nz);
1751         } else {
1752           PetscCall(MatTranspose(factors->L, MAT_REUSE_MATRIX, &factors->Lt)); // Matrix Lt' data is aliased with {i, j, a}Lt_h
1753         }
1754         // Copy Lt from host to device
1755         PetscCallCXX(Kokkos::deep_copy(factors->iLt_d, factors->iLt_h));
1756         PetscCallCXX(Kokkos::deep_copy(factors->jLt_d, factors->jLt_h));
1757         PetscCallCXX(Kokkos::deep_copy(factors->aLt_d, factors->aLt_h));
1758       } else { // If L was computed on device, we also compute the transpose there
1759         // TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices. We have to sort the indices, until KK provides finer control options.
1760         PetscCallCXX(transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iL_d,
1761                                                                                                                                                                                                                                factors->jL_d, factors->aL_d,
1762                                                                                                                                                                                                                                factors->iLt_d, factors->jLt_d,
1763                                                                                                                                                                                                                                factors->aLt_d));
1764         PetscCallCXX(sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iLt_d, factors->jLt_d, factors->aLt_d));
1765       }
1766       PetscCallCXX(sptrsv_symbolic(&factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d));
1767     }
1768 
1769     factors->transpose_updated = PETSC_TRUE;
1770   }
1771   PetscFunctionReturn(PETSC_SUCCESS);
1772 }
1773 
1774 // Solve Ax = b, with RAR = U^T D U, where R is the row (and col) permutation matrix on A.
1775 // R is represented by rowperm in factors. If R is identity (i.e, no reordering), then rowperm is empty.
1776 static PetscErrorCode MatSolve_SeqAIJKokkos_Cholesky(Mat A, Vec bb, Vec xx)
1777 {
1778   auto                        exec    = PetscGetKokkosExecutionSpace();
1779   Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1780   PetscInt                    m       = A->rmap->n;
1781   PetscScalarKokkosView       D       = factors->D_d;
1782   PetscScalarKokkosView       X, Y, B; // alias
1783   ConstPetscScalarKokkosView  b;
1784   PetscScalarKokkosView       x;
1785   PetscIntKokkosView         &rowperm  = factors->rowperm;
1786   PetscBool                   identity = rowperm.extent(0) ? PETSC_FALSE : PETSC_TRUE;
1787 
1788   PetscFunctionBegin;
1789   PetscCall(PetscLogGpuTimeBegin());
1790   PetscCall(MatSeqAIJKokkosSolveCheck(A));          // for UX = T
1791   PetscCall(MatSeqAIJKokkosTransposeSolveCheck(A)); // for U^T Y = B
1792   PetscCall(VecGetKokkosView(bb, &b));
1793   PetscCall(VecGetKokkosViewWrite(xx, &x));
1794 
1795   // Solve U^T Y = B
1796   if (identity) { // Reorder b with the row permutation
1797     B = PetscScalarKokkosView(const_cast<PetscScalar *>(b.data()), b.extent(0));
1798     Y = factors->workVector;
1799   } else {
1800     B = factors->workVector;
1801     PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { B(i) = b(rowperm(i)); }));
1802     Y = x;
1803   }
1804   PetscCallCXX(sptrsv_solve(exec, &factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d, B, Y));
1805 
1806   // Solve diag(D) Y' = Y.
1807   // Actually just do Y' = Y*D since D is already inverted in MatCholeskyFactorNumeric_SeqAIJ(). It is basically a vector element-wise multiplication.
1808   PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { Y(i) = Y(i) * D(i); }));
1809 
1810   // Solve UX = Y
1811   if (identity) {
1812     X = x;
1813   } else {
1814     X = factors->workVector; // B is not needed anymore
1815   }
1816   PetscCallCXX(sptrsv_solve(exec, &factors->khU, factors->iU_d, factors->jU_d, factors->aU_d, Y, X));
1817 
1818   // Reorder X with the inverse column (row) permutation
1819   if (!identity) {
1820     PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { x(rowperm(i)) = X(i); }));
1821   }
1822 
1823   PetscCall(VecRestoreKokkosView(bb, &b));
1824   PetscCall(VecRestoreKokkosViewWrite(xx, &x));
1825   PetscCall(PetscLogGpuTimeEnd());
1826   PetscFunctionReturn(PETSC_SUCCESS);
1827 }
1828 
1829 // Solve Ax = b, with RAC = LU, where R and C are row and col permutation matrices on A respectively.
1830 // R and C are represented by rowperm and colperm in factors.
1831 // If R or C is identity (i.e, no reordering), then rowperm or colperm is empty.
1832 static PetscErrorCode MatSolve_SeqAIJKokkos_LU(Mat A, Vec bb, Vec xx)
1833 {
1834   auto                        exec    = PetscGetKokkosExecutionSpace();
1835   Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1836   PetscInt                    m       = A->rmap->n;
1837   PetscScalarKokkosView       X, Y, B; // alias
1838   ConstPetscScalarKokkosView  b;
1839   PetscScalarKokkosView       x;
1840   PetscIntKokkosView         &rowperm      = factors->rowperm;
1841   PetscIntKokkosView         &colperm      = factors->colperm;
1842   PetscBool                   row_identity = rowperm.extent(0) ? PETSC_FALSE : PETSC_TRUE;
1843   PetscBool                   col_identity = colperm.extent(0) ? PETSC_FALSE : PETSC_TRUE;
1844 
1845   PetscFunctionBegin;
1846   PetscCall(PetscLogGpuTimeBegin());
1847   PetscCall(MatSeqAIJKokkosSolveCheck(A));
1848   PetscCall(VecGetKokkosView(bb, &b));
1849   PetscCall(VecGetKokkosViewWrite(xx, &x));
1850 
1851   // Solve L Y = B (i.e., L (U C^- x) = R b).  R b indicates applying the row permutation on b.
1852   if (row_identity) {
1853     B = PetscScalarKokkosView(const_cast<PetscScalar *>(b.data()), b.extent(0));
1854     Y = factors->workVector;
1855   } else {
1856     B = factors->workVector;
1857     PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { B(i) = b(rowperm(i)); }));
1858     Y = x;
1859   }
1860   PetscCallCXX(sptrsv_solve(exec, &factors->khL, factors->iL_d, factors->jL_d, factors->aL_d, B, Y));
1861 
1862   // Solve U C^- x = Y
1863   if (col_identity) {
1864     X = x;
1865   } else {
1866     X = factors->workVector;
1867   }
1868   PetscCallCXX(sptrsv_solve(exec, &factors->khU, factors->iU_d, factors->jU_d, factors->aU_d, Y, X));
1869 
1870   // x = C X; Reorder X with the inverse col permutation
1871   if (!col_identity) {
1872     PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { x(colperm(i)) = X(i); }));
1873   }
1874 
1875   PetscCall(VecRestoreKokkosView(bb, &b));
1876   PetscCall(VecRestoreKokkosViewWrite(xx, &x));
1877   PetscCall(PetscLogGpuTimeEnd());
1878   PetscFunctionReturn(PETSC_SUCCESS);
1879 }
1880 
1881 // Solve A^T x = b, with RAC = LU, where R and C are row and col permutation matrices on A respectively.
1882 // R and C are represented by rowperm and colperm in factors.
1883 // If R or C is identity (i.e, no reordering), then rowperm or colperm is empty.
1884 // A = R^-1 L U C^-1, so A^T = C^-T U^T L^T R^-T. But since C^- = C^T, R^- = R^T, we have A^T = C U^T L^T R.
1885 static PetscErrorCode MatSolveTranspose_SeqAIJKokkos_LU(Mat A, Vec bb, Vec xx)
1886 {
1887   auto                        exec    = PetscGetKokkosExecutionSpace();
1888   Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1889   PetscInt                    m       = A->rmap->n;
1890   PetscScalarKokkosView       X, Y, B; // alias
1891   ConstPetscScalarKokkosView  b;
1892   PetscScalarKokkosView       x;
1893   PetscIntKokkosView         &rowperm      = factors->rowperm;
1894   PetscIntKokkosView         &colperm      = factors->colperm;
1895   PetscBool                   row_identity = rowperm.extent(0) ? PETSC_FALSE : PETSC_TRUE;
1896   PetscBool                   col_identity = colperm.extent(0) ? PETSC_FALSE : PETSC_TRUE;
1897 
1898   PetscFunctionBegin;
1899   PetscCall(PetscLogGpuTimeBegin());
1900   PetscCall(MatSeqAIJKokkosTransposeSolveCheck(A)); // Update L^T, U^T if needed, and do sptrsv symbolic for L^T, U^T
1901   PetscCall(VecGetKokkosView(bb, &b));
1902   PetscCall(VecGetKokkosViewWrite(xx, &x));
1903 
1904   // Solve U^T Y = B (i.e., U^T (L^T R x) = C^- b).  Note C^- b = C^T b, which means applying the column permutation on b.
1905   if (col_identity) { // Reorder b with the col permutation
1906     B = PetscScalarKokkosView(const_cast<PetscScalar *>(b.data()), b.extent(0));
1907     Y = factors->workVector;
1908   } else {
1909     B = factors->workVector;
1910     PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { B(i) = b(colperm(i)); }));
1911     Y = x;
1912   }
1913   PetscCallCXX(sptrsv_solve(exec, &factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d, B, Y));
1914 
1915   // Solve L^T X = Y
1916   if (row_identity) {
1917     X = x;
1918   } else {
1919     X = factors->workVector;
1920   }
1921   PetscCallCXX(sptrsv_solve(exec, &factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d, Y, X));
1922 
1923   // x = R^- X = R^T X; Reorder X with the inverse row permutation
1924   if (!row_identity) {
1925     PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { x(rowperm(i)) = X(i); }));
1926   }
1927 
1928   PetscCall(VecRestoreKokkosView(bb, &b));
1929   PetscCall(VecRestoreKokkosViewWrite(xx, &x));
1930   PetscCall(PetscLogGpuTimeEnd());
1931   PetscFunctionReturn(PETSC_SUCCESS);
1932 }
1933 
1934 static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info)
1935 {
1936   PetscFunctionBegin;
1937   PetscCall(MatSeqAIJKokkosSyncHost(A));
1938   PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info));
1939 
1940   if (!info->solveonhost) { // if solve on host, then we don't need to copy L, U to device
1941     Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr;
1942     Mat_SeqAIJ                 *b       = static_cast<Mat_SeqAIJ *>(B->data);
1943     const PetscInt             *Bi = b->i, *Bj = b->j, *Bdiag = b->diag;
1944     const MatScalar            *Ba = b->a;
1945     PetscInt                    m = B->rmap->n, n = B->cmap->n;
1946 
1947     if (factors->iL_h.extent(0) == 0) { // Allocate memory and copy the L, U structure for the first time
1948       // Allocate memory and copy the structure
1949       factors->iL_h = MatRowMapKokkosViewHost(NoInit("iL_h"), m + 1);
1950       factors->jL_h = MatColIdxKokkosViewHost(NoInit("jL_h"), (Bi[m] - Bi[0]) + m); // + the diagonal entries
1951       factors->aL_h = MatScalarKokkosViewHost(NoInit("aL_h"), (Bi[m] - Bi[0]) + m);
1952       factors->iU_h = MatRowMapKokkosViewHost(NoInit("iU_h"), m + 1);
1953       factors->jU_h = MatColIdxKokkosViewHost(NoInit("jU_h"), (Bdiag[0] - Bdiag[m]));
1954       factors->aU_h = MatScalarKokkosViewHost(NoInit("aU_h"), (Bdiag[0] - Bdiag[m]));
1955 
1956       PetscInt *Li = factors->iL_h.data();
1957       PetscInt *Lj = factors->jL_h.data();
1958       PetscInt *Ui = factors->iU_h.data();
1959       PetscInt *Uj = factors->jU_h.data();
1960 
1961       Li[0] = Ui[0] = 0;
1962       for (PetscInt i = 0; i < m; i++) {
1963         PetscInt llen = Bi[i + 1] - Bi[i];       // exclusive of the diagonal entry
1964         PetscInt ulen = Bdiag[i] - Bdiag[i + 1]; // inclusive of the diagonal entry
1965 
1966         PetscArraycpy(Lj + Li[i], Bj + Bi[i], llen); // entries of L on the left of the diagonal
1967         Lj[Li[i] + llen] = i;                        // diagonal entry of L
1968 
1969         Uj[Ui[i]] = i;                                                  // diagonal entry of U
1970         PetscArraycpy(Uj + Ui[i] + 1, Bj + Bdiag[i + 1] + 1, ulen - 1); // entries of U on  the right of the diagonal
1971 
1972         Li[i + 1] = Li[i] + llen + 1;
1973         Ui[i + 1] = Ui[i] + ulen;
1974       }
1975 
1976       factors->iL_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->iL_h);
1977       factors->jL_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->jL_h);
1978       factors->iU_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->iU_h);
1979       factors->jU_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->jU_h);
1980       factors->aL_d = Kokkos::create_mirror_view(DefaultMemorySpace(), factors->aL_h);
1981       factors->aU_d = Kokkos::create_mirror_view(DefaultMemorySpace(), factors->aU_h);
1982 
1983       // Copy row/col permutation to device
1984       IS        rowperm = ((Mat_SeqAIJ *)B->data)->row;
1985       PetscBool row_identity;
1986       PetscCall(ISIdentity(rowperm, &row_identity));
1987       if (!row_identity) {
1988         const PetscInt *ip;
1989 
1990         PetscCall(ISGetIndices(rowperm, &ip));
1991         factors->rowperm = PetscIntKokkosView(NoInit("rowperm"), m);
1992         PetscCallCXX(Kokkos::deep_copy(factors->rowperm, PetscIntKokkosViewHost(const_cast<PetscInt *>(ip), m)));
1993         PetscCall(ISRestoreIndices(rowperm, &ip));
1994         PetscCall(PetscLogCpuToGpu(m * sizeof(PetscInt)));
1995       }
1996 
1997       IS        colperm = ((Mat_SeqAIJ *)B->data)->col;
1998       PetscBool col_identity;
1999       PetscCall(ISIdentity(colperm, &col_identity));
2000       if (!col_identity) {
2001         const PetscInt *ip;
2002 
2003         PetscCall(ISGetIndices(colperm, &ip));
2004         factors->colperm = PetscIntKokkosView(NoInit("colperm"), n);
2005         PetscCallCXX(Kokkos::deep_copy(factors->colperm, PetscIntKokkosViewHost(const_cast<PetscInt *>(ip), n)));
2006         PetscCall(ISRestoreIndices(colperm, &ip));
2007         PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
2008       }
2009 
2010       /* Create sptrsv handles for L, U and their transpose */
2011 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
2012       auto sptrsv_alg = SPTRSVAlgorithm::SPTRSV_CUSPARSE;
2013 #else
2014       auto sptrsv_alg = SPTRSVAlgorithm::SEQLVLSCHD_TP1;
2015 #endif
2016       factors->khL.create_sptrsv_handle(sptrsv_alg, m, true /* L is lower tri */);
2017       factors->khU.create_sptrsv_handle(sptrsv_alg, m, false /* U is not lower tri */);
2018       factors->khLt.create_sptrsv_handle(sptrsv_alg, m, false /* L^T is not lower tri */);
2019       factors->khUt.create_sptrsv_handle(sptrsv_alg, m, true /* U^T is lower tri */);
2020     }
2021 
2022     // Copy the value
2023     for (PetscInt i = 0; i < m; i++) {
2024       PetscInt        llen = Bi[i + 1] - Bi[i];
2025       PetscInt        ulen = Bdiag[i] - Bdiag[i + 1];
2026       const PetscInt *Li   = factors->iL_h.data();
2027       const PetscInt *Ui   = factors->iU_h.data();
2028 
2029       PetscScalar *La = factors->aL_h.data();
2030       PetscScalar *Ua = factors->aU_h.data();
2031 
2032       PetscArraycpy(La + Li[i], Ba + Bi[i], llen); // entries of L
2033       La[Li[i] + llen] = 1.0;                      // diagonal entry
2034 
2035       Ua[Ui[i]] = 1.0 / Ba[Bdiag[i]];                                 // diagonal entry
2036       PetscArraycpy(Ua + Ui[i] + 1, Ba + Bdiag[i + 1] + 1, ulen - 1); // entries of U
2037     }
2038 
2039     PetscCallCXX(Kokkos::deep_copy(factors->aL_d, factors->aL_h));
2040     PetscCallCXX(Kokkos::deep_copy(factors->aU_d, factors->aU_h));
2041     // Once the factors' values have changed, we need to update their transpose and redo sptrsv symbolic
2042     factors->transpose_updated         = PETSC_FALSE;
2043     factors->sptrsv_symbolic_completed = PETSC_FALSE;
2044 
2045     B->ops->solve          = MatSolve_SeqAIJKokkos_LU;
2046     B->ops->solvetranspose = MatSolveTranspose_SeqAIJKokkos_LU;
2047   }
2048 
2049   B->ops->matsolve          = NULL;
2050   B->ops->matsolvetranspose = NULL;
2051   PetscFunctionReturn(PETSC_SUCCESS);
2052 }
2053 
2054 static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos_ILU0(Mat B, Mat A, const MatFactorInfo *info)
2055 {
2056   Mat_SeqAIJKokkos           *aijkok   = (Mat_SeqAIJKokkos *)A->spptr;
2057   Mat_SeqAIJKokkosTriFactors *factors  = (Mat_SeqAIJKokkosTriFactors *)B->spptr;
2058   PetscInt                    fill_lev = info->levels;
2059 
2060   PetscFunctionBegin;
2061   PetscCall(PetscLogGpuTimeBegin());
2062   PetscCheck(!info->factoronhost, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "MatFactorInfo.factoronhost should be false");
2063   PetscCall(MatSeqAIJKokkosSyncDevice(A));
2064 
2065   auto a_d = aijkok->a_dual.view_device();
2066   auto i_d = aijkok->i_dual.view_device();
2067   auto j_d = aijkok->j_dual.view_device();
2068 
2069   PetscCallCXX(spiluk_numeric(&factors->kh, fill_lev, i_d, j_d, a_d, factors->iL_d, factors->jL_d, factors->aL_d, factors->iU_d, factors->jU_d, factors->aU_d));
2070 
2071   B->assembled              = PETSC_TRUE;
2072   B->preallocated           = PETSC_TRUE;
2073   B->ops->solve             = MatSolve_SeqAIJKokkos_LU;
2074   B->ops->solvetranspose    = MatSolveTranspose_SeqAIJKokkos_LU;
2075   B->ops->matsolve          = NULL;
2076   B->ops->matsolvetranspose = NULL;
2077 
2078   /* Once the factors' value changed, we need to update their transpose and sptrsv handle */
2079   factors->transpose_updated         = PETSC_FALSE;
2080   factors->sptrsv_symbolic_completed = PETSC_FALSE;
2081   /* TODO: log flops, but how to know that? */
2082   PetscCall(PetscLogGpuTimeEnd());
2083   PetscFunctionReturn(PETSC_SUCCESS);
2084 }
2085 
2086 // Use KK's spiluk_symbolic() to do ILU0 symbolic factorization, with no row/col reordering
2087 static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos_ILU0(Mat B, Mat A, IS, IS, const MatFactorInfo *info)
2088 {
2089   Mat_SeqAIJKokkos           *aijkok;
2090   Mat_SeqAIJ                 *b;
2091   Mat_SeqAIJKokkosTriFactors *factors  = (Mat_SeqAIJKokkosTriFactors *)B->spptr;
2092   PetscInt                    fill_lev = info->levels;
2093   PetscInt                    nnzA     = ((Mat_SeqAIJ *)A->data)->nz, nnzL, nnzU;
2094   PetscInt                    n        = A->rmap->n;
2095 
2096   PetscFunctionBegin;
2097   PetscCheck(!info->factoronhost, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "MatFactorInfo's factoronhost should be false as we are doing it on device right now");
2098   PetscCall(MatSeqAIJKokkosSyncDevice(A));
2099 
2100   /* Create a spiluk handle and then do symbolic factorization */
2101   nnzL = nnzU = PetscRealIntMultTruncate(info->fill, nnzA);
2102   factors->kh.create_spiluk_handle(SPILUKAlgorithm::SEQLVLSCHD_TP1, n, nnzL, nnzU);
2103 
2104   auto spiluk_handle = factors->kh.get_spiluk_handle();
2105 
2106   Kokkos::realloc(factors->iL_d, n + 1); /* Free old arrays and realloc */
2107   Kokkos::realloc(factors->jL_d, spiluk_handle->get_nnzL());
2108   Kokkos::realloc(factors->iU_d, n + 1);
2109   Kokkos::realloc(factors->jU_d, spiluk_handle->get_nnzU());
2110 
2111   aijkok   = (Mat_SeqAIJKokkos *)A->spptr;
2112   auto i_d = aijkok->i_dual.view_device();
2113   auto j_d = aijkok->j_dual.view_device();
2114   PetscCallCXX(spiluk_symbolic(&factors->kh, fill_lev, i_d, j_d, factors->iL_d, factors->jL_d, factors->iU_d, factors->jU_d));
2115   /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */
2116 
2117   Kokkos::resize(factors->jL_d, spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */
2118   Kokkos::resize(factors->jU_d, spiluk_handle->get_nnzU());
2119   Kokkos::realloc(factors->aL_d, spiluk_handle->get_nnzL()); /* No need to retain old value */
2120   Kokkos::realloc(factors->aU_d, spiluk_handle->get_nnzU());
2121 
2122   /* TODO: add options to select sptrsv algorithms */
2123   /* Create sptrsv handles for L, U and their transpose */
2124 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
2125   auto sptrsv_alg = SPTRSVAlgorithm::SPTRSV_CUSPARSE;
2126 #else
2127   auto sptrsv_alg = SPTRSVAlgorithm::SEQLVLSCHD_TP1;
2128 #endif
2129 
2130   factors->khL.create_sptrsv_handle(sptrsv_alg, n, true /* L is lower tri */);
2131   factors->khU.create_sptrsv_handle(sptrsv_alg, n, false /* U is not lower tri */);
2132   factors->khLt.create_sptrsv_handle(sptrsv_alg, n, false /* L^T is not lower tri */);
2133   factors->khUt.create_sptrsv_handle(sptrsv_alg, n, true /* U^T is lower tri */);
2134 
2135   /* Fill fields of the factor matrix B */
2136   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(B, MAT_SKIP_ALLOCATION, NULL));
2137   b     = (Mat_SeqAIJ *)B->data;
2138   b->nz = b->maxnz          = spiluk_handle->get_nnzL() + spiluk_handle->get_nnzU();
2139   B->info.fill_ratio_given  = info->fill;
2140   B->info.fill_ratio_needed = nnzA > 0 ? ((PetscReal)b->nz) / ((PetscReal)nnzA) : 1.0;
2141 
2142   B->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJKokkos_ILU0;
2143   PetscFunctionReturn(PETSC_SUCCESS);
2144 }
2145 
2146 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
2147 {
2148   PetscFunctionBegin;
2149   PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
2150   PetscCheck(!B->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr");
2151   PetscCallCXX(B->spptr = new Mat_SeqAIJKokkosTriFactors(B->rmap->n));
2152   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos;
2153   PetscFunctionReturn(PETSC_SUCCESS);
2154 }
2155 
2156 static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
2157 {
2158   PetscBool row_identity = PETSC_FALSE, col_identity = PETSC_FALSE;
2159 
2160   PetscFunctionBegin;
2161   if (!info->factoronhost) {
2162     PetscCall(ISIdentity(isrow, &row_identity));
2163     PetscCall(ISIdentity(iscol, &col_identity));
2164   }
2165 
2166   PetscCheck(!B->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr");
2167   PetscCallCXX(B->spptr = new Mat_SeqAIJKokkosTriFactors(B->rmap->n));
2168 
2169   if (!info->factoronhost && !info->levels && row_identity && col_identity) { // if level 0 and no reordering
2170     PetscCall(MatILUFactorSymbolic_SeqAIJKokkos_ILU0(B, A, isrow, iscol, info));
2171   } else {
2172     PetscCall(MatILUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info)); // otherwise, use PETSc's ILU on host
2173     B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos;
2174   }
2175   PetscFunctionReturn(PETSC_SUCCESS);
2176 }
2177 
2178 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info)
2179 {
2180   PetscFunctionBegin;
2181   PetscCall(MatSeqAIJKokkosSyncHost(A));
2182   PetscCall(MatCholeskyFactorNumeric_SeqAIJ(B, A, info));
2183 
2184   if (!info->solveonhost) { // if solve on host, then we don't need to copy L, U to device
2185     Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr;
2186     Mat_SeqAIJ                 *b       = static_cast<Mat_SeqAIJ *>(B->data);
2187     const PetscInt             *Bi = b->i, *Bj = b->j, *Bdiag = b->diag;
2188     const MatScalar            *Ba = b->a;
2189     PetscInt                    m  = B->rmap->n;
2190 
2191     if (factors->iU_h.extent(0) == 0) { // First time of numeric factorization
2192       // Allocate memory and copy the structure
2193       factors->iU_h = PetscIntKokkosViewHost(const_cast<PetscInt *>(Bi), m + 1); // wrap Bi as iU_h
2194       factors->jU_h = MatColIdxKokkosViewHost(NoInit("jU_h"), Bi[m]);
2195       factors->aU_h = MatScalarKokkosViewHost(NoInit("aU_h"), Bi[m]);
2196       factors->D_h  = MatScalarKokkosViewHost(NoInit("D_h"), m);
2197       factors->aU_d = Kokkos::create_mirror_view(DefaultMemorySpace(), factors->aU_h);
2198       factors->D_d  = Kokkos::create_mirror_view(DefaultMemorySpace(), factors->D_h);
2199 
2200       // Build jU_h from the skewed Aj
2201       PetscInt *Uj = factors->jU_h.data();
2202       for (PetscInt i = 0; i < m; i++) {
2203         PetscInt ulen = Bi[i + 1] - Bi[i];
2204         Uj[Bi[i]]     = i;                                              // diagonal entry
2205         PetscCall(PetscArraycpy(Uj + Bi[i] + 1, Bj + Bi[i], ulen - 1)); // entries of U on the right of the diagonal
2206       }
2207 
2208       // Copy iU, jU to device
2209       PetscCallCXX(factors->iU_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->iU_h));
2210       PetscCallCXX(factors->jU_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->jU_h));
2211 
2212       // Copy row/col permutation to device
2213       IS        rowperm = ((Mat_SeqAIJ *)B->data)->row;
2214       PetscBool row_identity;
2215       PetscCall(ISIdentity(rowperm, &row_identity));
2216       if (!row_identity) {
2217         const PetscInt *ip;
2218 
2219         PetscCall(ISGetIndices(rowperm, &ip));
2220         PetscCallCXX(factors->rowperm = PetscIntKokkosView(NoInit("rowperm"), m));
2221         PetscCallCXX(Kokkos::deep_copy(factors->rowperm, PetscIntKokkosViewHost(const_cast<PetscInt *>(ip), m)));
2222         PetscCall(ISRestoreIndices(rowperm, &ip));
2223         PetscCall(PetscLogCpuToGpu(m * sizeof(PetscInt)));
2224       }
2225 
2226       // Create sptrsv handles for U and U^T
2227 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
2228       auto sptrsv_alg = SPTRSVAlgorithm::SPTRSV_CUSPARSE;
2229 #else
2230       auto sptrsv_alg = SPTRSVAlgorithm::SEQLVLSCHD_TP1;
2231 #endif
2232       factors->khU.create_sptrsv_handle(sptrsv_alg, m, false /* U is not lower tri */);
2233       factors->khUt.create_sptrsv_handle(sptrsv_alg, m, true /* U^T is lower tri */);
2234     }
2235     // These pointers were set MatCholeskyFactorNumeric_SeqAIJ(), so we always need to update them
2236     B->ops->solve          = MatSolve_SeqAIJKokkos_Cholesky;
2237     B->ops->solvetranspose = MatSolve_SeqAIJKokkos_Cholesky;
2238 
2239     // Copy the value
2240     PetscScalar *Ua = factors->aU_h.data();
2241     PetscScalar *D  = factors->D_h.data();
2242     for (PetscInt i = 0; i < m; i++) {
2243       D[i]      = Ba[Bdiag[i]];     // actually Aa[Adiag[i]] is the inverse of the diagonal
2244       Ua[Bi[i]] = (PetscScalar)1.0; // set the unit diagonal for U
2245       for (PetscInt k = 0; k < Bi[i + 1] - Bi[i] - 1; k++) Ua[Bi[i] + 1 + k] = -Ba[Bi[i] + k];
2246     }
2247     PetscCallCXX(Kokkos::deep_copy(factors->aU_d, factors->aU_h));
2248     PetscCallCXX(Kokkos::deep_copy(factors->D_d, factors->D_h));
2249 
2250     factors->sptrsv_symbolic_completed = PETSC_FALSE; // When numeric value changed, we must do these again
2251     factors->transpose_updated         = PETSC_FALSE;
2252   }
2253 
2254   B->ops->matsolve          = NULL;
2255   B->ops->matsolvetranspose = NULL;
2256   PetscFunctionReturn(PETSC_SUCCESS);
2257 }
2258 
2259 static PetscErrorCode MatICCFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS perm, const MatFactorInfo *info)
2260 {
2261   PetscFunctionBegin;
2262   if (info->solveonhost) {
2263     // If solve on host, we have to change the type, as eventually we need to call MatSolve_SeqSBAIJ_1_NaturalOrdering() etc.
2264     PetscCall(MatSetType(B, MATSEQSBAIJ));
2265     PetscCall(MatSeqSBAIJSetPreallocation(B, 1, MAT_SKIP_ALLOCATION, NULL));
2266   }
2267 
2268   PetscCall(MatICCFactorSymbolic_SeqAIJ(B, A, perm, info));
2269 
2270   if (!info->solveonhost) {
2271     // If solve on device, B is still a MATSEQAIJKOKKOS, so we are good to allocate B->spptr
2272     PetscCheck(!B->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr");
2273     PetscCallCXX(B->spptr = new Mat_SeqAIJKokkosTriFactors(B->rmap->n));
2274     B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJKokkos;
2275   }
2276   PetscFunctionReturn(PETSC_SUCCESS);
2277 }
2278 
2279 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS perm, const MatFactorInfo *info)
2280 {
2281   PetscFunctionBegin;
2282   if (info->solveonhost) {
2283     // If solve on host, we have to change the type, as eventually we need to call MatSolve_SeqSBAIJ_1_NaturalOrdering() etc.
2284     PetscCall(MatSetType(B, MATSEQSBAIJ));
2285     PetscCall(MatSeqSBAIJSetPreallocation(B, 1, MAT_SKIP_ALLOCATION, NULL));
2286   }
2287 
2288   PetscCall(MatCholeskyFactorSymbolic_SeqAIJ(B, A, perm, info)); // it sets B's two ISes ((Mat_SeqAIJ*)B->data)->{row, col} to perm
2289 
2290   if (!info->solveonhost) {
2291     // If solve on device, B is still a MATSEQAIJKOKKOS, so we are good to allocate B->spptr
2292     PetscCheck(!B->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr");
2293     PetscCallCXX(B->spptr = new Mat_SeqAIJKokkosTriFactors(B->rmap->n));
2294     B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJKokkos;
2295   }
2296   PetscFunctionReturn(PETSC_SUCCESS);
2297 }
2298 
2299 // The _Kokkos suffix means we will use Kokkos as a solver for the SeqAIJKokkos matrix
2300 static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos_Kokkos(Mat A, MatSolverType *type)
2301 {
2302   PetscFunctionBegin;
2303   *type = MATSOLVERKOKKOS;
2304   PetscFunctionReturn(PETSC_SUCCESS);
2305 }
2306 
2307 /*MC
2308   MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices
2309   on a single GPU of type, `MATSEQAIJKOKKOS`, `MATAIJKOKKOS`.
2310 
2311   Level: beginner
2312 
2313 .seealso: [](ch_matrices), `Mat`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJKokkos()`, `MATAIJKOKKOS`, `MatKokkosSetFormat()`, `MatKokkosStorageFormat`, `MatKokkosFormatOperation`
2314 M*/
2315 PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A, MatFactorType ftype, Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */
2316 {
2317   PetscInt n = A->rmap->n;
2318   MPI_Comm comm;
2319 
2320   PetscFunctionBegin;
2321   PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
2322   PetscCall(MatCreate(comm, B));
2323   PetscCall(MatSetSizes(*B, n, n, n, n));
2324   PetscCall(MatSetBlockSizesFromMats(*B, A, A));
2325   (*B)->factortype = ftype;
2326   PetscCall(MatSetType(*B, MATSEQAIJKOKKOS));
2327   PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL));
2328   PetscCheck(!(*B)->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr");
2329 
2330   if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
2331     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJKokkos;
2332     (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos;
2333     PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU]));
2334     PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILU]));
2335     PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILUDT]));
2336   } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
2337     (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJKokkos;
2338     (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJKokkos;
2339     PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]));
2340     PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ICC]));
2341   } else SETERRQ(comm, PETSC_ERR_SUP, "MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]);
2342 
2343   // The factorization can use the ordering provided in MatLUFactorSymbolic(), MatCholeskyFactorSymbolic() etc, though we do it on host
2344   (*B)->canuseordering = PETSC_TRUE;
2345   PetscCall(PetscObjectComposeFunction((PetscObject)*B, "MatFactorGetSolverType_C", MatFactorGetSolverType_SeqAIJKokkos_Kokkos));
2346   PetscFunctionReturn(PETSC_SUCCESS);
2347 }
2348 
2349 PETSC_INTERN PetscErrorCode MatSolverTypeRegister_Kokkos(void)
2350 {
2351   PetscFunctionBegin;
2352   PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_LU, MatGetFactor_SeqAIJKokkos_Kokkos));
2353   PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_CHOLESKY, MatGetFactor_SeqAIJKokkos_Kokkos));
2354   PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_ILU, MatGetFactor_SeqAIJKokkos_Kokkos));
2355   PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_ICC, MatGetFactor_SeqAIJKokkos_Kokkos));
2356   PetscFunctionReturn(PETSC_SUCCESS);
2357 }
2358 
2359 /* Utility to print out a KokkosCsrMatrix for debugging */
2360 PETSC_INTERN PetscErrorCode PrintCsrMatrix(const KokkosCsrMatrix &csrmat)
2361 {
2362   const auto        &iv = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), csrmat.graph.row_map);
2363   const auto        &jv = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), csrmat.graph.entries);
2364   const auto        &av = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), csrmat.values);
2365   const PetscInt    *i  = iv.data();
2366   const PetscInt    *j  = jv.data();
2367   const PetscScalar *a  = av.data();
2368   PetscInt           m = csrmat.numRows(), n = csrmat.numCols(), nnz = csrmat.nnz();
2369 
2370   PetscFunctionBegin;
2371   PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT " x %" PetscInt_FMT " SeqAIJKokkos, with %" PetscInt_FMT " nonzeros\n", m, n, nnz));
2372   for (PetscInt k = 0; k < m; k++) {
2373     PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT ": ", k));
2374     for (PetscInt p = i[k]; p < i[k + 1]; p++) PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT "(%.1f), ", j[p], (double)PetscRealPart(a[p])));
2375     PetscCall(PetscPrintf(PETSC_COMM_SELF, "\n"));
2376   }
2377   PetscFunctionReturn(PETSC_SUCCESS);
2378 }
2379