xref: /petsc/src/mat/impls/aij/seq/kokkos/aijkok.kokkos.cxx (revision 7f296bb328fcd4c99f2da7bfe8ba7ed8a4ebceee)
1 #include <petsc_kokkos.hpp>
2 #include <petscvec_kokkos.hpp>
3 #include <petscmat_kokkos.hpp>
4 #include <petscpkg_version.h>
5 #include <petsc/private/petscimpl.h>
6 #include <petsc/private/sfimpl.h>
7 #include <petsc/private/kokkosimpl.hpp>
8 #include <petscsystypes.h>
9 #include <petscerror.h>
10 
11 #include <Kokkos_Core.hpp>
12 #include <KokkosBlas.hpp>
13 #include <KokkosSparse_CrsMatrix.hpp>
14 
15 // To suppress compiler warnings:
16 // /path/include/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp:434:63:
17 // warning: 'cusparseStatus_t cusparseDbsrmm(cusparseHandle_t, cusparseDirection_t, cusparseOperation_t,
18 // cusparseOperation_t, int, int, int, int, const double*, cusparseMatDescr_t, const double*, const int*, const int*,
19 // int, const double*, int, const double*, double*, int)' is deprecated: please use cusparseSpMM instead [-Wdeprecated-declarations]
20 PETSC_PRAGMA_DIAGNOSTIC_IGNORED_BEGIN("-Wdeprecated-declarations")
21 #include <KokkosSparse_spmv.hpp>
22 PETSC_PRAGMA_DIAGNOSTIC_IGNORED_END()
23 
24 #include <KokkosSparse_spiluk.hpp>
25 #include <KokkosSparse_sptrsv.hpp>
26 #include <KokkosSparse_spgemm.hpp>
27 #include <KokkosSparse_spadd.hpp>
28 #include <KokkosBatched_LU_Decl.hpp>
29 #include <KokkosBatched_InverseLU_Decl.hpp>
30 
31 #include <../src/mat/impls/aij/seq/kokkos/aijkok.hpp>
32 
33 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_GE(3, 7, 0)
34   #include <KokkosSparse_Utils.hpp>
35 using KokkosSparse::sort_crs_matrix;
36 using KokkosSparse::Impl::transpose_matrix;
37 #else
38   #include <KokkosKernels_Sorting.hpp>
39 using KokkosKernels::sort_crs_matrix;
40 using KokkosKernels::Impl::transpose_matrix;
41 #endif
42 
43 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_GE(4, 6, 0)
44 using KokkosSparse::spiluk_symbolic;
45 using KokkosSparse::spiluk_numeric;
46 using KokkosSparse::sptrsv_symbolic;
47 using KokkosSparse::sptrsv_solve;
48 using KokkosSparse::Experimental::SPTRSVAlgorithm;
49 using KokkosSparse::Experimental::SPILUKAlgorithm;
50 #else
51 using KokkosSparse::Experimental::spiluk_symbolic;
52 using KokkosSparse::Experimental::spiluk_numeric;
53 using KokkosSparse::Experimental::sptrsv_symbolic;
54 using KokkosSparse::Experimental::sptrsv_solve;
55 using KokkosSparse::Experimental::SPTRSVAlgorithm;
56 using KokkosSparse::Experimental::SPILUKAlgorithm;
57 #endif
58 
59 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */
60 
61 /* MatAssemblyEnd_SeqAIJKokkos() happens when we finalized nonzeros of the matrix, either after
62    we assembled the matrix on host, or after we directly produced the matrix data on device (ex., through MatMatMult).
63    In the latter case, it is important to set a_dual's sync state correctly.
64  */
65 static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A, MatAssemblyType mode)
66 {
67   Mat_SeqAIJ       *aijseq;
68   Mat_SeqAIJKokkos *aijkok;
69 
70   PetscFunctionBegin;
71   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS);
72   PetscCall(MatAssemblyEnd_SeqAIJ(A, mode));
73 
74   aijseq = static_cast<Mat_SeqAIJ *>(A->data);
75   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
76 
77   /* If aijkok does not exist, we just copy i, j to device.
78      If aijkok already exists, but the device's nonzero pattern does not match with the host's, we assume the latest data is on host.
79      In both cases, we build a new aijkok structure.
80   */
81   if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { /* aijkok might not exist yet or nonzero pattern has changed */
82     delete aijkok;
83     aijkok   = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aijseq, A->nonzerostate, PETSC_FALSE /*don't copy mat values to device*/);
84     A->spptr = aijkok;
85   } else if (A->rmap->n && aijkok->diag_dual.extent(0) == 0) { // MatProduct might directly produce AIJ on device, but not the diag.
86     MatRowMapKokkosViewHost diag_h(aijseq->diag, A->rmap->n);
87     auto                    diag_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), diag_h);
88     aijkok->diag_dual              = MatRowMapKokkosDualView(diag_d, diag_h);
89   }
90   PetscFunctionReturn(PETSC_SUCCESS);
91 }
92 
93 /* Sync CSR data to device if not yet */
94 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A)
95 {
96   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
97 
98   PetscFunctionBegin;
99   PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Can't sync factorized matrix from host to device");
100   PetscCheck(aijkok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
101   if (aijkok->a_dual.need_sync_device()) {
102     aijkok->a_dual.sync_device();
103     aijkok->transpose_updated = PETSC_FALSE; /* values of the transpose is out-of-date */
104     aijkok->hermitian_updated = PETSC_FALSE;
105   }
106   PetscFunctionReturn(PETSC_SUCCESS);
107 }
108 
109 /* Mark the CSR data on device as modified */
110 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosModifyDevice(Mat A)
111 {
112   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
113 
114   PetscFunctionBegin;
115   PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Not supported for factorized matries");
116   aijkok->a_dual.clear_sync_state();
117   aijkok->a_dual.modify_device();
118   aijkok->transpose_updated = PETSC_FALSE;
119   aijkok->hermitian_updated = PETSC_FALSE;
120   PetscCall(MatSeqAIJInvalidateDiagonal(A));
121   PetscCall(PetscObjectStateIncrease((PetscObject)A));
122   PetscFunctionReturn(PETSC_SUCCESS);
123 }
124 
125 static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A)
126 {
127   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
128   auto              exec   = PetscGetKokkosExecutionSpace();
129 
130   PetscFunctionBegin;
131   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
132   /* We do not expect one needs factors on host  */
133   PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Can't sync factorized matrix from device to host");
134   PetscCheck(aijkok, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing AIJKOK");
135   PetscCall(KokkosDualViewSync<HostMirrorMemorySpace>(aijkok->a_dual, exec));
136   PetscFunctionReturn(PETSC_SUCCESS);
137 }
138 
139 static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A, PetscScalar *array[])
140 {
141   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
142 
143   PetscFunctionBegin;
144   /* aijkok contains valid pointers only if the host's nonzerostate matches with the device's.
145     Calling MatSeqAIJSetPreallocation() or MatSetValues() on host, where aijseq->{i,j,a} might be
146     reallocated, will lead to stale {i,j,a}_dual in aijkok. In both operations, the hosts's nonzerostate
147     must have been updated. The stale aijkok will be rebuilt during MatAssemblyEnd.
148   */
149   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
150     auto exec = PetscGetKokkosExecutionSpace();
151     PetscCallCXX(aijkok->a_dual.sync_host(exec));
152     PetscCallCXX(exec.fence());
153     *array = aijkok->a_dual.view_host().data();
154   } else { /* Happens when calling MatSetValues on a newly created matrix */
155     *array = static_cast<Mat_SeqAIJ *>(A->data)->a;
156   }
157   PetscFunctionReturn(PETSC_SUCCESS);
158 }
159 
160 static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJKokkos(Mat A, PetscScalar *array[])
161 {
162   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
163 
164   PetscFunctionBegin;
165   if (aijkok && A->nonzerostate == aijkok->nonzerostate) aijkok->a_dual.modify_host();
166   PetscFunctionReturn(PETSC_SUCCESS);
167 }
168 
169 static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[])
170 {
171   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
172 
173   PetscFunctionBegin;
174   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
175     auto exec = PetscGetKokkosExecutionSpace();
176     PetscCallCXX(aijkok->a_dual.sync_host(exec));
177     PetscCallCXX(exec.fence());
178     *array = aijkok->a_dual.view_host().data();
179   } else {
180     *array = static_cast<Mat_SeqAIJ *>(A->data)->a;
181   }
182   PetscFunctionReturn(PETSC_SUCCESS);
183 }
184 
185 static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[])
186 {
187   PetscFunctionBegin;
188   *array = NULL;
189   PetscFunctionReturn(PETSC_SUCCESS);
190 }
191 
192 static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[])
193 {
194   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
195 
196   PetscFunctionBegin;
197   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
198     *array = aijkok->a_dual.view_host().data();
199   } else { /* Ex. happens with MatZeroEntries on a preallocated but not assembled matrix */
200     *array = static_cast<Mat_SeqAIJ *>(A->data)->a;
201   }
202   PetscFunctionReturn(PETSC_SUCCESS);
203 }
204 
205 static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[])
206 {
207   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
208 
209   PetscFunctionBegin;
210   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
211     aijkok->a_dual.clear_sync_state();
212     aijkok->a_dual.modify_host();
213   }
214   PetscFunctionReturn(PETSC_SUCCESS);
215 }
216 
217 static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJKokkos(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype)
218 {
219   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
220 
221   PetscFunctionBegin;
222   PetscCheck(aijkok != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "aijkok is NULL");
223 
224   if (i) *i = aijkok->i_device_data();
225   if (j) *j = aijkok->j_device_data();
226   if (a) {
227     aijkok->a_dual.sync_device();
228     *a = aijkok->a_device_data();
229   }
230   if (mtype) *mtype = PETSC_MEMTYPE_KOKKOS;
231   PetscFunctionReturn(PETSC_SUCCESS);
232 }
233 
234 /*
235   Generate the sparsity pattern of a MatSeqAIJKokkos matrix's transpose on device.
236 
237   Input Parameter:
238 .  A       - the MATSEQAIJKOKKOS matrix
239 
240   Output Parameters:
241 +  perm_d - the permutation array on device, which connects Ta(i) = Aa(perm(i))
242 -  T_d    - the transpose on device, whose value array is allocated but not initialized
243 */
244 static PetscErrorCode MatSeqAIJKokkosGenerateTransposeStructure(Mat A, MatRowMapKokkosView &perm_d, KokkosCsrMatrix &T_d)
245 {
246   Mat_SeqAIJ             *aseq = static_cast<Mat_SeqAIJ *>(A->data);
247   PetscInt                nz = aseq->nz, m = A->rmap->N, n = A->cmap->n;
248   const PetscInt         *Ai = aseq->i, *Aj = aseq->j;
249   MatRowMapKokkosViewHost Ti_h(NoInit("Ti"), n + 1);
250   MatRowMapType          *Ti = Ti_h.data();
251   MatColIdxKokkosViewHost Tj_h(NoInit("Tj"), nz);
252   MatRowMapKokkosViewHost perm_h(NoInit("permutation"), nz);
253   PetscInt               *Tj   = Tj_h.data();
254   PetscInt               *perm = perm_h.data();
255   PetscInt               *offset;
256 
257   PetscFunctionBegin;
258   // Populate Ti
259   PetscCallCXX(Kokkos::deep_copy(Ti_h, 0));
260   Ti++;
261   for (PetscInt i = 0; i < nz; i++) Ti[Aj[i]]++;
262   Ti--;
263   for (PetscInt i = 0; i < n; i++) Ti[i + 1] += Ti[i];
264 
265   // Populate Tj and the permutation array
266   PetscCall(PetscCalloc1(n, &offset)); // offset in each T row to fill in its column indices
267   for (PetscInt i = 0; i < m; i++) {
268     for (PetscInt j = Ai[i]; j < Ai[i + 1]; j++) { // A's (i,j) is T's (j,i)
269       PetscInt r    = Aj[j];                       // row r of T
270       PetscInt disp = Ti[r] + offset[r];
271 
272       Tj[disp]   = i; // col i of T
273       perm[disp] = j;
274       offset[r]++;
275     }
276   }
277   PetscCall(PetscFree(offset));
278 
279   // Sort each row of T, along with the permutation array
280   for (PetscInt i = 0; i < n; i++) PetscCall(PetscSortIntWithArray(Ti[i + 1] - Ti[i], Tj + Ti[i], perm + Ti[i]));
281 
282   // Output perm and T on device
283   auto Ti_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), Ti_h);
284   auto Tj_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), Tj_h);
285   PetscCallCXX(T_d = KokkosCsrMatrix("csrmatT", n, m, nz, MatScalarKokkosView("Ta", nz), Ti_d, Tj_d));
286   PetscCallCXX(perm_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), perm_h));
287   PetscFunctionReturn(PETSC_SUCCESS);
288 }
289 
290 // Generate the transpose on device and cache it internally
291 // Note: KK transpose_matrix() does not have support symbolic/numeric transpose, so we do it on our own
292 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGenerateTranspose_Private(Mat A, KokkosCsrMatrix *csrmatT)
293 {
294   Mat_SeqAIJ       *aseq = static_cast<Mat_SeqAIJ *>(A->data);
295   Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
296   PetscInt          nz = aseq->nz, m = A->rmap->N, n = A->cmap->n;
297   KokkosCsrMatrix  &T = akok->csrmatT;
298 
299   PetscFunctionBegin;
300   PetscCheck(akok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
301   PetscCallCXX(akok->a_dual.sync_device()); // Sync A's values since we are going to access them on device
302 
303   const auto &Aa = akok->a_dual.view_device();
304 
305   if (A->symmetric == PETSC_BOOL3_TRUE) {
306     *csrmatT = akok->csrmat;
307   } else {
308     // See if we already have a cached transpose and its value is up to date
309     if (T.numRows() == n && T.numCols() == m) {  // this indicates csrmatT had been generated before, otherwise T has 0 rows/cols after construction
310       if (!akok->transpose_updated) {            // if the value is out of date, update the cached version
311         const auto &perm = akok->transpose_perm; // get the permutation array
312         auto       &Ta   = T.values;
313 
314         PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { Ta(i) = Aa(perm(i)); }));
315       }
316     } else { // Generate T of size n x m for the first time
317       MatRowMapKokkosView perm;
318 
319       PetscCall(MatSeqAIJKokkosGenerateTransposeStructure(A, perm, T));
320       akok->transpose_perm = perm; // cache the perm in this matrix for reuse
321       PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { T.values(i) = Aa(perm(i)); }));
322     }
323     akok->transpose_updated = PETSC_TRUE;
324     *csrmatT                = akok->csrmatT;
325   }
326   PetscFunctionReturn(PETSC_SUCCESS);
327 }
328 
329 // Generate the Hermitian on device and cache it internally
330 static PetscErrorCode MatSeqAIJKokkosGenerateHermitian_Private(Mat A, KokkosCsrMatrix *csrmatH)
331 {
332   Mat_SeqAIJ       *aseq = static_cast<Mat_SeqAIJ *>(A->data);
333   Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
334   PetscInt          nz = aseq->nz, m = A->rmap->N, n = A->cmap->n;
335   KokkosCsrMatrix  &T = akok->csrmatH;
336 
337   PetscFunctionBegin;
338   PetscCheck(akok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
339   PetscCallCXX(akok->a_dual.sync_device()); // Sync A's values since we are going to access them on device
340 
341   const auto &Aa = akok->a_dual.view_device();
342 
343   if (A->hermitian == PETSC_BOOL3_TRUE) {
344     *csrmatH = akok->csrmat;
345   } else {
346     // See if we already have a cached hermitian and its value is up to date
347     if (T.numRows() == n && T.numCols() == m) {  // this indicates csrmatT had been generated before, otherwise T has 0 rows/cols after construction
348       if (!akok->hermitian_updated) {            // if the value is out of date, update the cached version
349         const auto &perm = akok->transpose_perm; // get the permutation array
350         auto       &Ta   = T.values;
351 
352         PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { Ta(i) = PetscConj(Aa(perm(i))); }));
353       }
354     } else { // Generate T of size n x m for the first time
355       MatRowMapKokkosView perm;
356 
357       PetscCall(MatSeqAIJKokkosGenerateTransposeStructure(A, perm, T));
358       akok->transpose_perm = perm; // cache the perm in this matrix for reuse
359       PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { T.values(i) = PetscConj(Aa(perm(i))); }));
360     }
361     akok->hermitian_updated = PETSC_TRUE;
362     *csrmatH                = akok->csrmatH;
363   }
364   PetscFunctionReturn(PETSC_SUCCESS);
365 }
366 
367 /* y = A x */
368 static PetscErrorCode MatMult_SeqAIJKokkos(Mat A, Vec xx, Vec yy)
369 {
370   Mat_SeqAIJKokkos          *aijkok;
371   ConstPetscScalarKokkosView xv;
372   PetscScalarKokkosView      yv;
373 
374   PetscFunctionBegin;
375   PetscCall(PetscLogGpuTimeBegin());
376   PetscCall(MatSeqAIJKokkosSyncDevice(A));
377   PetscCall(VecGetKokkosView(xx, &xv));
378   PetscCall(VecGetKokkosViewWrite(yy, &yv));
379   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
380   PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), "N", 1.0 /*alpha*/, aijkok->csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A x + beta y */
381   PetscCall(VecRestoreKokkosView(xx, &xv));
382   PetscCall(VecRestoreKokkosViewWrite(yy, &yv));
383   /* 2.0*nnz - numRows seems more accurate here but assumes there are no zero-rows. So a little sloppy here. */
384   PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz()));
385   PetscCall(PetscLogGpuTimeEnd());
386   PetscFunctionReturn(PETSC_SUCCESS);
387 }
388 
389 /* y = A^T x */
390 static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy)
391 {
392   Mat_SeqAIJKokkos          *aijkok;
393   const char                *mode;
394   ConstPetscScalarKokkosView xv;
395   PetscScalarKokkosView      yv;
396   KokkosCsrMatrix            csrmat;
397 
398   PetscFunctionBegin;
399   PetscCall(PetscLogGpuTimeBegin());
400   PetscCall(MatSeqAIJKokkosSyncDevice(A));
401   PetscCall(VecGetKokkosView(xx, &xv));
402   PetscCall(VecGetKokkosViewWrite(yy, &yv));
403   if (A->form_explicit_transpose) {
404     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat));
405     mode = "N";
406   } else {
407     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
408     csrmat = aijkok->csrmat;
409     mode   = "T";
410   }
411   PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^T x + beta y */
412   PetscCall(VecRestoreKokkosView(xx, &xv));
413   PetscCall(VecRestoreKokkosViewWrite(yy, &yv));
414   PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
415   PetscCall(PetscLogGpuTimeEnd());
416   PetscFunctionReturn(PETSC_SUCCESS);
417 }
418 
419 /* y = A^H x */
420 static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy)
421 {
422   Mat_SeqAIJKokkos          *aijkok;
423   const char                *mode;
424   ConstPetscScalarKokkosView xv;
425   PetscScalarKokkosView      yv;
426   KokkosCsrMatrix            csrmat;
427 
428   PetscFunctionBegin;
429   PetscCall(PetscLogGpuTimeBegin());
430   PetscCall(MatSeqAIJKokkosSyncDevice(A));
431   PetscCall(VecGetKokkosView(xx, &xv));
432   PetscCall(VecGetKokkosViewWrite(yy, &yv));
433   if (A->form_explicit_transpose) {
434     PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat));
435     mode = "N";
436   } else {
437     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
438     csrmat = aijkok->csrmat;
439     mode   = "C";
440   }
441   PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^H x + beta y */
442   PetscCall(VecRestoreKokkosView(xx, &xv));
443   PetscCall(VecRestoreKokkosViewWrite(yy, &yv));
444   PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
445   PetscCall(PetscLogGpuTimeEnd());
446   PetscFunctionReturn(PETSC_SUCCESS);
447 }
448 
449 /* z = A x + y */
450 static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz)
451 {
452   Mat_SeqAIJKokkos          *aijkok;
453   ConstPetscScalarKokkosView xv;
454   PetscScalarKokkosView      zv;
455 
456   PetscFunctionBegin;
457   PetscCall(PetscLogGpuTimeBegin());
458   PetscCall(MatSeqAIJKokkosSyncDevice(A));
459   if (zz != yy) PetscCall(VecCopy(yy, zz)); // depending on yy's sync flags, zz might get its latest data on host
460   PetscCall(VecGetKokkosView(xx, &xv));
461   PetscCall(VecGetKokkosView(zz, &zv)); // do after VecCopy(yy, zz) to get the latest data on device
462   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
463   PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), "N", 1.0 /*alpha*/, aijkok->csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A x + beta z */
464   PetscCall(VecRestoreKokkosView(xx, &xv));
465   PetscCall(VecRestoreKokkosView(zz, &zv));
466   PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz()));
467   PetscCall(PetscLogGpuTimeEnd());
468   PetscFunctionReturn(PETSC_SUCCESS);
469 }
470 
471 /* z = A^T x + y */
472 static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz)
473 {
474   Mat_SeqAIJKokkos          *aijkok;
475   const char                *mode;
476   ConstPetscScalarKokkosView xv;
477   PetscScalarKokkosView      zv;
478   KokkosCsrMatrix            csrmat;
479 
480   PetscFunctionBegin;
481   PetscCall(PetscLogGpuTimeBegin());
482   PetscCall(MatSeqAIJKokkosSyncDevice(A));
483   if (zz != yy) PetscCall(VecCopy(yy, zz));
484   PetscCall(VecGetKokkosView(xx, &xv));
485   PetscCall(VecGetKokkosView(zz, &zv));
486   if (A->form_explicit_transpose) {
487     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat));
488     mode = "N";
489   } else {
490     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
491     csrmat = aijkok->csrmat;
492     mode   = "T";
493   }
494   PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^T x + beta z */
495   PetscCall(VecRestoreKokkosView(xx, &xv));
496   PetscCall(VecRestoreKokkosView(zz, &zv));
497   PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
498   PetscCall(PetscLogGpuTimeEnd());
499   PetscFunctionReturn(PETSC_SUCCESS);
500 }
501 
502 /* z = A^H x + y */
503 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz)
504 {
505   Mat_SeqAIJKokkos          *aijkok;
506   const char                *mode;
507   ConstPetscScalarKokkosView xv;
508   PetscScalarKokkosView      zv;
509   KokkosCsrMatrix            csrmat;
510 
511   PetscFunctionBegin;
512   PetscCall(PetscLogGpuTimeBegin());
513   PetscCall(MatSeqAIJKokkosSyncDevice(A));
514   if (zz != yy) PetscCall(VecCopy(yy, zz));
515   PetscCall(VecGetKokkosView(xx, &xv));
516   PetscCall(VecGetKokkosView(zz, &zv));
517   if (A->form_explicit_transpose) {
518     PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat));
519     mode = "N";
520   } else {
521     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
522     csrmat = aijkok->csrmat;
523     mode   = "C";
524   }
525   PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^H x + beta z */
526   PetscCall(VecRestoreKokkosView(xx, &xv));
527   PetscCall(VecRestoreKokkosView(zz, &zv));
528   PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
529   PetscCall(PetscLogGpuTimeEnd());
530   PetscFunctionReturn(PETSC_SUCCESS);
531 }
532 
533 static PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A, MatOption op, PetscBool flg)
534 {
535   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
536 
537   PetscFunctionBegin;
538   switch (op) {
539   case MAT_FORM_EXPLICIT_TRANSPOSE:
540     /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
541     if (A->form_explicit_transpose && !flg && aijkok) PetscCall(aijkok->DestroyMatTranspose());
542     A->form_explicit_transpose = flg;
543     break;
544   default:
545     PetscCall(MatSetOption_SeqAIJ(A, op, flg));
546     break;
547   }
548   PetscFunctionReturn(PETSC_SUCCESS);
549 }
550 
551 /* Depending on reuse, either build a new mat, or use the existing mat */
552 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat *newmat)
553 {
554   Mat_SeqAIJ *aseq;
555 
556   PetscFunctionBegin;
557   PetscCall(PetscKokkosInitializeCheck());
558   if (reuse == MAT_INITIAL_MATRIX) {                      /* Build a brand new mat */
559     PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat));  /* the returned newmat is a SeqAIJKokkos */
560   } else if (reuse == MAT_REUSE_MATRIX) {                 /* Reuse the mat created before */
561     PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN)); /* newmat is already a SeqAIJKokkos */
562   } else if (reuse == MAT_INPLACE_MATRIX) {               /* newmat is A */
563     PetscCheck(A == *newmat, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "A != *newmat with MAT_INPLACE_MATRIX");
564     PetscCall(PetscFree(A->defaultvectype));
565     PetscCall(PetscStrallocpy(VECKOKKOS, &A->defaultvectype)); /* Allocate and copy the string */
566     PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATSEQAIJKOKKOS));
567     PetscCall(MatSetOps_SeqAIJKokkos(A));
568     aseq = static_cast<Mat_SeqAIJ *>(A->data);
569     if (A->assembled) { /* Copy i, j (but not values) to device for an assembled matrix if not yet */
570       PetscCheck(!A->spptr, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Expect NULL (Mat_SeqAIJKokkos*)A->spptr");
571       A->spptr = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aseq, A->nonzerostate, PETSC_FALSE);
572     }
573   }
574   PetscFunctionReturn(PETSC_SUCCESS);
575 }
576 
577 /* MatDuplicate always creates a new matrix. MatDuplicate can be called either on an assembled matrix or
578    an unassembled matrix, even though MAT_COPY_VALUES is not allowed for unassembled matrix.
579  */
580 static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A, MatDuplicateOption dupOption, Mat *B)
581 {
582   Mat_SeqAIJ       *bseq;
583   Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr), *bkok;
584   Mat               mat;
585 
586   PetscFunctionBegin;
587   /* Do not copy values on host as A's latest values might be on device. We don't want to do sync blindly */
588   PetscCall(MatDuplicate_SeqAIJ(A, MAT_DO_NOT_COPY_VALUES, B));
589   mat = *B;
590   if (A->assembled) {
591     bseq = static_cast<Mat_SeqAIJ *>(mat->data);
592     bkok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, bseq, mat->nonzerostate, PETSC_FALSE);
593     bkok->a_dual.clear_sync_state(); /* Clear B's sync state as it will be decided below */
594     /* Now copy values to B if needed */
595     if (dupOption == MAT_COPY_VALUES) {
596       if (akok->a_dual.need_sync_device()) {
597         Kokkos::deep_copy(bkok->a_dual.view_host(), akok->a_dual.view_host());
598         bkok->a_dual.modify_host();
599       } else { /* If device has the latest data, we only copy data on device */
600         Kokkos::deep_copy(bkok->a_dual.view_device(), akok->a_dual.view_device());
601         bkok->a_dual.modify_device();
602       }
603     } else { /* MAT_DO_NOT_COPY_VALUES or MAT_SHARE_NONZERO_PATTERN. B's values should be zeroed */
604       /* B's values on host should be already zeroed by MatDuplicate_SeqAIJ() */
605       bkok->a_dual.modify_host();
606     }
607     mat->spptr = bkok;
608   }
609 
610   PetscCall(PetscFree(mat->defaultvectype));
611   PetscCall(PetscStrallocpy(VECKOKKOS, &mat->defaultvectype)); /* Allocate and copy the string */
612   PetscCall(PetscObjectChangeTypeName((PetscObject)mat, MATSEQAIJKOKKOS));
613   PetscCall(MatSetOps_SeqAIJKokkos(mat));
614   PetscFunctionReturn(PETSC_SUCCESS);
615 }
616 
617 static PetscErrorCode MatTranspose_SeqAIJKokkos(Mat A, MatReuse reuse, Mat *B)
618 {
619   Mat               At;
620   KokkosCsrMatrix   internT;
621   Mat_SeqAIJKokkos *atkok, *bkok;
622 
623   PetscFunctionBegin;
624   if (reuse == MAT_REUSE_MATRIX) PetscCall(MatTransposeCheckNonzeroState_Private(A, *B));
625   PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &internT)); /* Generate a transpose internally */
626   if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_INPLACE_MATRIX) {
627     /* Deep copy internT, as we want to isolate the internal transpose */
628     PetscCallCXX(atkok = new Mat_SeqAIJKokkos(KokkosCsrMatrix("csrmat", internT)));
629     PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PetscObjectComm((PetscObject)A), atkok, &At));
630     if (reuse == MAT_INITIAL_MATRIX) *B = At;
631     else PetscCall(MatHeaderReplace(A, &At)); /* Replace A with At inplace */
632   } else {                                    /* MAT_REUSE_MATRIX, just need to copy values to B on device */
633     if ((*B)->assembled) {
634       bkok = static_cast<Mat_SeqAIJKokkos *>((*B)->spptr);
635       PetscCallCXX(Kokkos::deep_copy(bkok->a_dual.view_device(), internT.values));
636       PetscCall(MatSeqAIJKokkosModifyDevice(*B));
637     } else if ((*B)->preallocated) { /* It is ok for B to be only preallocated, as needed in MatTranspose_MPIAIJ */
638       Mat_SeqAIJ             *bseq = static_cast<Mat_SeqAIJ *>((*B)->data);
639       MatScalarKokkosViewHost a_h(bseq->a, internT.nnz()); /* bseq->nz = 0 if unassembled */
640       MatColIdxKokkosViewHost j_h(bseq->j, internT.nnz());
641       PetscCallCXX(Kokkos::deep_copy(a_h, internT.values));
642       PetscCallCXX(Kokkos::deep_copy(j_h, internT.graph.entries));
643     } else SETERRQ(PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "B must be assembled or preallocated");
644   }
645   PetscFunctionReturn(PETSC_SUCCESS);
646 }
647 
648 static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A)
649 {
650   Mat_SeqAIJKokkos *aijkok;
651 
652   PetscFunctionBegin;
653   if (A->factortype == MAT_FACTOR_NONE) {
654     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
655     delete aijkok;
656   } else {
657     delete static_cast<Mat_SeqAIJKokkosTriFactors *>(A->spptr);
658   }
659   A->spptr = NULL;
660   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatFactorGetSolverType_C", NULL));
661   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
662   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
663 #if defined(PETSC_HAVE_HYPRE)
664   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijkokkos_hypre_C", NULL));
665 #endif
666   PetscCall(MatDestroy_SeqAIJ(A));
667   PetscFunctionReturn(PETSC_SUCCESS);
668 }
669 
670 /*MC
671    MATSEQAIJKOKKOS - MATAIJKOKKOS = "(seq)aijkokkos" - A matrix type to be used for sparse matrices with Kokkos
672 
673    A matrix type using Kokkos-Kernels CrsMatrix type for portability across different device types
674 
675    Options Database Key:
676 .  -mat_type aijkokkos - sets the matrix type to `MATSEQAIJKOKKOS` during a call to `MatSetFromOptions()`
677 
678   Level: beginner
679 
680 .seealso: [](ch_matrices), `Mat`, `MatCreateSeqAIJKokkos()`, `MATMPIAIJKOKKOS`
681 M*/
682 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A)
683 {
684   PetscFunctionBegin;
685   PetscCall(PetscKokkosInitializeCheck());
686   PetscCall(MatCreate_SeqAIJ(A));
687   PetscCall(MatConvert_SeqAIJ_SeqAIJKokkos(A, MATSEQAIJKOKKOS, MAT_INPLACE_MATRIX, &A));
688   PetscFunctionReturn(PETSC_SUCCESS);
689 }
690 
691 /* Merge A, B into a matrix C. A is put before B. C's size would be A->rmap->n by (A->cmap->n + B->cmap->n) */
692 PetscErrorCode MatSeqAIJKokkosMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C)
693 {
694   Mat_SeqAIJ         *a, *b;
695   Mat_SeqAIJKokkos   *akok, *bkok, *ckok;
696   MatScalarKokkosView aa, ba, ca;
697   MatRowMapKokkosView ai, bi, ci;
698   MatColIdxKokkosView aj, bj, cj;
699   PetscInt            m, n, nnz, aN;
700 
701   PetscFunctionBegin;
702   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
703   PetscValidHeaderSpecific(B, MAT_CLASSID, 2);
704   PetscAssertPointer(C, 4);
705   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
706   PetscCheckTypeName(B, MATSEQAIJKOKKOS);
707   PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n);
708   PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported");
709 
710   PetscCall(MatSeqAIJKokkosSyncDevice(A));
711   PetscCall(MatSeqAIJKokkosSyncDevice(B));
712   a    = static_cast<Mat_SeqAIJ *>(A->data);
713   b    = static_cast<Mat_SeqAIJ *>(B->data);
714   akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
715   bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
716   aa   = akok->a_dual.view_device();
717   ai   = akok->i_dual.view_device();
718   ba   = bkok->a_dual.view_device();
719   bi   = bkok->i_dual.view_device();
720   m    = A->rmap->n; /* M, N and nnz of C */
721   n    = A->cmap->n + B->cmap->n;
722   nnz  = a->nz + b->nz;
723   aN   = A->cmap->n; /* N of A */
724   if (reuse == MAT_INITIAL_MATRIX) {
725     aj           = akok->j_dual.view_device();
726     bj           = bkok->j_dual.view_device();
727     auto ca_dual = MatScalarKokkosDualView("a", aa.extent(0) + ba.extent(0));
728     auto ci_dual = MatRowMapKokkosDualView("i", ai.extent(0));
729     auto cj_dual = MatColIdxKokkosDualView("j", aj.extent(0) + bj.extent(0));
730     ca           = ca_dual.view_device();
731     ci           = ci_dual.view_device();
732     cj           = cj_dual.view_device();
733 
734     /* Concatenate A and B in parallel using Kokkos hierarchical parallelism */
735     Kokkos::parallel_for(
736       Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
737         PetscInt i       = t.league_rank(); /* row i */
738         PetscInt coffset = ai(i) + bi(i), alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i);
739 
740         Kokkos::single(Kokkos::PerTeam(t), [=]() { /* this side effect only happens once per whole team */
741                                                    ci(i) = coffset;
742                                                    if (i == m - 1) ci(m) = ai(m) + bi(m);
743         });
744 
745         Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) {
746           if (k < alen) {
747             ca(coffset + k) = aa(ai(i) + k);
748             cj(coffset + k) = aj(ai(i) + k);
749           } else {
750             ca(coffset + k) = ba(bi(i) + k - alen);
751             cj(coffset + k) = bj(bi(i) + k - alen) + aN; /* Entries in B get new column indices in C */
752           }
753         });
754       });
755     ca_dual.modify_device();
756     ci_dual.modify_device();
757     cj_dual.modify_device();
758     PetscCallCXX(ckok = new Mat_SeqAIJKokkos(m, n, nnz, ci_dual, cj_dual, ca_dual));
759     PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, ckok, C));
760   } else if (reuse == MAT_REUSE_MATRIX) {
761     PetscValidHeaderSpecific(*C, MAT_CLASSID, 4);
762     PetscCheckTypeName(*C, MATSEQAIJKOKKOS);
763     ckok = static_cast<Mat_SeqAIJKokkos *>((*C)->spptr);
764     ca   = ckok->a_dual.view_device();
765     ci   = ckok->i_dual.view_device();
766 
767     Kokkos::parallel_for(
768       Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
769         PetscInt i    = t.league_rank(); /* row i */
770         PetscInt alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i);
771         Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) {
772           if (k < alen) ca(ci(i) + k) = aa(ai(i) + k);
773           else ca(ci(i) + k) = ba(bi(i) + k - alen);
774         });
775       });
776     PetscCall(MatSeqAIJKokkosModifyDevice(*C));
777   }
778   PetscFunctionReturn(PETSC_SUCCESS);
779 }
780 
781 static PetscErrorCode MatProductDataDestroy_SeqAIJKokkos(void *pdata)
782 {
783   PetscFunctionBegin;
784   delete static_cast<MatProductData_SeqAIJKokkos *>(pdata);
785   PetscFunctionReturn(PETSC_SUCCESS);
786 }
787 
788 static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C)
789 {
790   Mat_Product                 *product = C->product;
791   Mat                          A, B;
792   bool                         transA, transB; /* use bool, since KK needs this type */
793   Mat_SeqAIJKokkos            *akok, *bkok, *ckok;
794   Mat_SeqAIJ                  *c;
795   MatProductData_SeqAIJKokkos *pdata;
796   KokkosCsrMatrix              csrmatA, csrmatB;
797 
798   PetscFunctionBegin;
799   MatCheckProduct(C, 1);
800   PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Product data empty");
801   pdata = static_cast<MatProductData_SeqAIJKokkos *>(C->product->data);
802 
803   // See if numeric has already been done in symbolic (e.g., user calls MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C)).
804   // If yes, skip the numeric, but reset the flag so that next time when user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C),
805   // we still do numeric.
806   if (pdata->reusesym) { // numeric reuses results from symbolic
807     pdata->reusesym = PETSC_FALSE;
808     PetscFunctionReturn(PETSC_SUCCESS);
809   }
810 
811   switch (product->type) {
812   case MATPRODUCT_AB:
813     transA = false;
814     transB = false;
815     break;
816   case MATPRODUCT_AtB:
817     transA = true;
818     transB = false;
819     break;
820   case MATPRODUCT_ABt:
821     transA = false;
822     transB = true;
823     break;
824   default:
825     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]);
826   }
827 
828   A = product->A;
829   B = product->B;
830   PetscCall(MatSeqAIJKokkosSyncDevice(A));
831   PetscCall(MatSeqAIJKokkosSyncDevice(B));
832   akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
833   bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
834   ckok = static_cast<Mat_SeqAIJKokkos *>(C->spptr);
835 
836   PetscCheck(ckok, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Device data structure spptr is empty");
837 
838   csrmatA = akok->csrmat;
839   csrmatB = bkok->csrmat;
840 
841   /* TODO: Once KK spgemm implements transpose, we can get rid of the explicit transpose here */
842   if (transA) {
843     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA));
844     transA = false;
845   }
846 
847   if (transB) {
848     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB));
849     transB = false;
850   }
851   PetscCall(PetscLogGpuTimeBegin());
852   PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, csrmatA, transA, csrmatB, transB, ckok->csrmat));
853 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(4, 0, 0)
854   auto spgemmHandle = pdata->kh.get_spgemm_handle();
855   if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(ckok->csrmat)); /* without sort, mat_tests-ex62_14_seqaijkokkos fails */
856 #endif
857 
858   PetscCall(PetscLogGpuTimeEnd());
859   PetscCall(MatSeqAIJKokkosModifyDevice(C));
860   /* shorter version of MatAssemblyEnd_SeqAIJ */
861   c = (Mat_SeqAIJ *)C->data;
862   PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz));
863   PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n"));
864   PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax));
865   c->reallocs         = 0;
866   C->info.mallocs     = 0;
867   C->info.nz_unneeded = 0;
868   C->assembled = C->was_assembled = PETSC_TRUE;
869   C->num_ass++;
870   PetscFunctionReturn(PETSC_SUCCESS);
871 }
872 
873 static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C)
874 {
875   Mat_Product                 *product = C->product;
876   MatProductType               ptype;
877   Mat                          A, B;
878   bool                         transA, transB;
879   Mat_SeqAIJKokkos            *akok, *bkok, *ckok;
880   MatProductData_SeqAIJKokkos *pdata;
881   MPI_Comm                     comm;
882   KokkosCsrMatrix              csrmatA, csrmatB, csrmatC;
883 
884   PetscFunctionBegin;
885   MatCheckProduct(C, 1);
886   PetscCall(PetscObjectGetComm((PetscObject)C, &comm));
887   PetscCheck(!product->data, comm, PETSC_ERR_PLIB, "Product data not empty");
888   A = product->A;
889   B = product->B;
890   PetscCall(MatSeqAIJKokkosSyncDevice(A));
891   PetscCall(MatSeqAIJKokkosSyncDevice(B));
892   akok    = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
893   bkok    = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
894   csrmatA = akok->csrmat;
895   csrmatB = bkok->csrmat;
896 
897   ptype = product->type;
898   // Take advantage of the symmetry if true
899   if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
900     ptype                                          = MATPRODUCT_AB;
901     product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE;
902   }
903   if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
904     ptype                                          = MATPRODUCT_AB;
905     product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE;
906   }
907 
908   switch (ptype) {
909   case MATPRODUCT_AB:
910     transA = false;
911     transB = false;
912     PetscCall(MatSetBlockSizesFromMats(C, A, B));
913     break;
914   case MATPRODUCT_AtB:
915     transA = true;
916     transB = false;
917     if (A->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, A->cmap->bs));
918     if (B->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->cmap->bs));
919     break;
920   case MATPRODUCT_ABt:
921     transA = false;
922     transB = true;
923     if (A->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, A->rmap->bs));
924     if (B->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->rmap->bs));
925     break;
926   default:
927     SETERRQ(comm, PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]);
928   }
929   PetscCallCXX(product->data = pdata = new MatProductData_SeqAIJKokkos());
930   pdata->reusesym = product->api_user;
931 
932   /* TODO: add command line options to select spgemm algorithms */
933   auto spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_DEFAULT; /* default alg is TPL if enabled, otherwise KK */
934 
935   /* CUDA-10.2's spgemm has bugs. We prefer the SpGEMMreuse APIs introduced in cuda-11.4 */
936 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
937   #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
938   spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_KK;
939   #endif
940 #endif
941   PetscCallCXX(pdata->kh.create_spgemm_handle(spgemm_alg));
942 
943   PetscCall(PetscLogGpuTimeBegin());
944   /* TODO: Get rid of the explicit transpose once KK-spgemm implements the transpose option */
945   if (transA) {
946     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA));
947     transA = false;
948   }
949 
950   if (transB) {
951     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB));
952     transB = false;
953   }
954 
955   PetscCallCXX(KokkosSparse::spgemm_symbolic(pdata->kh, csrmatA, transA, csrmatB, transB, csrmatC));
956   /* spgemm_symbolic() only populates C's rowmap, but not C's column indices.
957     So we have to do a fake spgemm_numeric() here to get csrmatC.j_d setup, before
958     calling new Mat_SeqAIJKokkos().
959     TODO: Remove the fake spgemm_numeric() after KK fixed this problem.
960   */
961   PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, csrmatA, transA, csrmatB, transB, csrmatC));
962 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(4, 0, 0)
963   /* Query if KK outputs a sorted matrix. If not, we need to sort it */
964   auto spgemmHandle = pdata->kh.get_spgemm_handle();
965   if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(csrmatC)); /* sort_option defaults to -1 in KK!*/
966 #endif
967   PetscCall(PetscLogGpuTimeEnd());
968 
969   PetscCallCXX(ckok = new Mat_SeqAIJKokkos(csrmatC));
970   PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(C, ckok));
971   C->product->destroy = MatProductDataDestroy_SeqAIJKokkos;
972   PetscFunctionReturn(PETSC_SUCCESS);
973 }
974 
975 /* handles sparse matrix matrix ops */
976 static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat)
977 {
978   Mat_Product *product = mat->product;
979   PetscBool    Biskok = PETSC_FALSE, Ciskok = PETSC_TRUE;
980 
981   PetscFunctionBegin;
982   MatCheckProduct(mat, 1);
983   PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJKOKKOS, &Biskok));
984   if (product->type == MATPRODUCT_ABC) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJKOKKOS, &Ciskok));
985   if (Biskok && Ciskok) {
986     switch (product->type) {
987     case MATPRODUCT_AB:
988     case MATPRODUCT_AtB:
989     case MATPRODUCT_ABt:
990       mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos;
991       break;
992     case MATPRODUCT_PtAP:
993     case MATPRODUCT_RARt:
994     case MATPRODUCT_ABC:
995       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
996       break;
997     default:
998       break;
999     }
1000   } else { /* fallback for AIJ */
1001     PetscCall(MatProductSetFromOptions_SeqAIJ(mat));
1002   }
1003   PetscFunctionReturn(PETSC_SUCCESS);
1004 }
1005 
1006 static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a)
1007 {
1008   Mat_SeqAIJKokkos *aijkok;
1009 
1010   PetscFunctionBegin;
1011   PetscCall(PetscLogGpuTimeBegin());
1012   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1013   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1014   KokkosBlas::scal(PetscGetKokkosExecutionSpace(), aijkok->a_dual.view_device(), a, aijkok->a_dual.view_device());
1015   PetscCall(MatSeqAIJKokkosModifyDevice(A));
1016   PetscCall(PetscLogGpuFlops(aijkok->a_dual.extent(0)));
1017   PetscCall(PetscLogGpuTimeEnd());
1018   PetscFunctionReturn(PETSC_SUCCESS);
1019 }
1020 
1021 // add a to A's diagonal (if A is square) or main diagonal (if A is rectangular)
1022 static PetscErrorCode MatShift_SeqAIJKokkos(Mat A, PetscScalar a)
1023 {
1024   Mat_SeqAIJ *aijseq = static_cast<Mat_SeqAIJ *>(A->data);
1025 
1026   PetscFunctionBegin;
1027   if (A->assembled && aijseq->diagonaldense) { // no missing diagonals
1028     PetscInt n = PetscMin(A->rmap->n, A->cmap->n);
1029 
1030     PetscCall(PetscLogGpuTimeBegin());
1031     PetscCall(MatSeqAIJKokkosSyncDevice(A));
1032     const auto  aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1033     const auto &Aa     = aijkok->a_dual.view_device();
1034     const auto &Adiag  = aijkok->diag_dual.view_device();
1035     PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) { Aa(Adiag(i)) += a; }));
1036     PetscCall(MatSeqAIJKokkosModifyDevice(A));
1037     PetscCall(PetscLogGpuFlops(n));
1038     PetscCall(PetscLogGpuTimeEnd());
1039   } else { // need reassembly, very slow!
1040     PetscCall(MatShift_Basic(A, a));
1041   }
1042   PetscFunctionReturn(PETSC_SUCCESS);
1043 }
1044 
1045 static PetscErrorCode MatDiagonalSet_SeqAIJKokkos(Mat Y, Vec D, InsertMode is)
1046 {
1047   Mat_SeqAIJ *aijseq = static_cast<Mat_SeqAIJ *>(Y->data);
1048 
1049   PetscFunctionBegin;
1050   if (Y->assembled && aijseq->diagonaldense) { // no missing diagonals
1051     ConstPetscScalarKokkosView dv;
1052     PetscInt                   n, nv;
1053 
1054     PetscCall(PetscLogGpuTimeBegin());
1055     PetscCall(MatSeqAIJKokkosSyncDevice(Y));
1056     PetscCall(VecGetKokkosView(D, &dv));
1057     PetscCall(VecGetLocalSize(D, &nv));
1058     n = PetscMin(Y->rmap->n, Y->cmap->n);
1059     PetscCheck(n == nv, PetscObjectComm((PetscObject)Y), PETSC_ERR_ARG_SIZ, "Matrix size and vector size do not match");
1060 
1061     const auto  aijkok = static_cast<Mat_SeqAIJKokkos *>(Y->spptr);
1062     const auto &Aa     = aijkok->a_dual.view_device();
1063     const auto &Adiag  = aijkok->diag_dual.view_device();
1064     PetscCallCXX(Kokkos::parallel_for(
1065       Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) {
1066         if (is == INSERT_VALUES) Aa(Adiag(i)) = dv(i);
1067         else Aa(Adiag(i)) += dv(i);
1068       }));
1069     PetscCall(VecRestoreKokkosView(D, &dv));
1070     PetscCall(MatSeqAIJKokkosModifyDevice(Y));
1071     PetscCall(PetscLogGpuFlops(n));
1072     PetscCall(PetscLogGpuTimeEnd());
1073   } else { // need reassembly, very slow!
1074     PetscCall(MatDiagonalSet_Default(Y, D, is));
1075   }
1076   PetscFunctionReturn(PETSC_SUCCESS);
1077 }
1078 
1079 static PetscErrorCode MatDiagonalScale_SeqAIJKokkos(Mat A, Vec ll, Vec rr)
1080 {
1081   Mat_SeqAIJ                *aijseq = static_cast<Mat_SeqAIJ *>(A->data);
1082   PetscInt                   m = A->rmap->n, n = A->cmap->n, nz = aijseq->nz;
1083   ConstPetscScalarKokkosView lv, rv;
1084 
1085   PetscFunctionBegin;
1086   PetscCall(PetscLogGpuTimeBegin());
1087   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1088   const auto  aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1089   const auto &Aa     = aijkok->a_dual.view_device();
1090   const auto &Ai     = aijkok->i_dual.view_device();
1091   const auto &Aj     = aijkok->j_dual.view_device();
1092   if (ll) {
1093     PetscCall(VecGetLocalSize(ll, &m));
1094     PetscCheck(m == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling vector wrong length");
1095     PetscCall(VecGetKokkosView(ll, &lv));
1096     PetscCallCXX(Kokkos::parallel_for( // for each row
1097       Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
1098         PetscInt i   = t.league_rank(); // row i
1099         PetscInt len = Ai(i + 1) - Ai(i);
1100         // scale entries on the row
1101         Kokkos::parallel_for(Kokkos::TeamThreadRange(t, len), [&](PetscInt j) { Aa(Ai(i) + j) *= lv(i); });
1102       }));
1103     PetscCall(VecRestoreKokkosView(ll, &lv));
1104     PetscCall(PetscLogGpuFlops(nz));
1105   }
1106   if (rr) {
1107     PetscCall(VecGetLocalSize(rr, &n));
1108     PetscCheck(n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Right scaling vector wrong length");
1109     PetscCall(VecGetKokkosView(rr, &rv));
1110     PetscCallCXX(Kokkos::parallel_for( // for each nonzero
1111       Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt k) { Aa(k) *= rv(Aj(k)); }));
1112     PetscCall(VecRestoreKokkosView(rr, &lv));
1113     PetscCall(PetscLogGpuFlops(nz));
1114   }
1115   PetscCall(MatSeqAIJKokkosModifyDevice(A));
1116   PetscCall(PetscLogGpuTimeEnd());
1117   PetscFunctionReturn(PETSC_SUCCESS);
1118 }
1119 
1120 static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A)
1121 {
1122   Mat_SeqAIJKokkos *aijkok;
1123 
1124   PetscFunctionBegin;
1125   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1126   if (aijkok) { /* Only zero the device if data is already there */
1127     KokkosBlas::fill(PetscGetKokkosExecutionSpace(), aijkok->a_dual.view_device(), 0.0);
1128     PetscCall(MatSeqAIJKokkosModifyDevice(A));
1129   } else { /* Might be preallocated but not assembled */
1130     PetscCall(MatZeroEntries_SeqAIJ(A));
1131   }
1132   PetscFunctionReturn(PETSC_SUCCESS);
1133 }
1134 
1135 static PetscErrorCode MatGetDiagonal_SeqAIJKokkos(Mat A, Vec x)
1136 {
1137   Mat_SeqAIJKokkos     *aijkok;
1138   PetscInt              n;
1139   PetscScalarKokkosView xv;
1140 
1141   PetscFunctionBegin;
1142   PetscCall(VecGetLocalSize(x, &n));
1143   PetscCheck(n == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector");
1144   PetscCheck(A->factortype == MAT_FACTOR_NONE, PETSC_COMM_SELF, PETSC_ERR_SUP, "MatGetDiagonal_SeqAIJKokkos not supported on factored matrices");
1145 
1146   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1147   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1148 
1149   const auto &Aa    = aijkok->a_dual.view_device();
1150   const auto &Ai    = aijkok->i_dual.view_device();
1151   const auto &Adiag = aijkok->diag_dual.view_device();
1152 
1153   PetscCall(VecGetKokkosViewWrite(x, &xv));
1154   Kokkos::parallel_for(
1155     Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) {
1156       if (Adiag(i) < Ai(i + 1)) xv(i) = Aa(Adiag(i));
1157       else xv(i) = 0;
1158     });
1159   PetscCall(VecRestoreKokkosViewWrite(x, &xv));
1160   PetscFunctionReturn(PETSC_SUCCESS);
1161 }
1162 
1163 /* Get a Kokkos View from a mat of type MatSeqAIJKokkos */
1164 PetscErrorCode MatSeqAIJGetKokkosView(Mat A, ConstMatScalarKokkosView *kv)
1165 {
1166   Mat_SeqAIJKokkos *aijkok;
1167 
1168   PetscFunctionBegin;
1169   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1170   PetscAssertPointer(kv, 2);
1171   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1172   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1173   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1174   *kv    = aijkok->a_dual.view_device();
1175   PetscFunctionReturn(PETSC_SUCCESS);
1176 }
1177 
1178 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, ConstMatScalarKokkosView *kv)
1179 {
1180   PetscFunctionBegin;
1181   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1182   PetscAssertPointer(kv, 2);
1183   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1184   PetscFunctionReturn(PETSC_SUCCESS);
1185 }
1186 
1187 PetscErrorCode MatSeqAIJGetKokkosView(Mat A, MatScalarKokkosView *kv)
1188 {
1189   Mat_SeqAIJKokkos *aijkok;
1190 
1191   PetscFunctionBegin;
1192   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1193   PetscAssertPointer(kv, 2);
1194   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1195   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1196   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1197   *kv    = aijkok->a_dual.view_device();
1198   PetscFunctionReturn(PETSC_SUCCESS);
1199 }
1200 
1201 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, MatScalarKokkosView *kv)
1202 {
1203   PetscFunctionBegin;
1204   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1205   PetscAssertPointer(kv, 2);
1206   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1207   PetscCall(MatSeqAIJKokkosModifyDevice(A));
1208   PetscFunctionReturn(PETSC_SUCCESS);
1209 }
1210 
1211 PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A, MatScalarKokkosView *kv)
1212 {
1213   Mat_SeqAIJKokkos *aijkok;
1214 
1215   PetscFunctionBegin;
1216   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1217   PetscAssertPointer(kv, 2);
1218   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1219   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1220   *kv    = aijkok->a_dual.view_device();
1221   PetscFunctionReturn(PETSC_SUCCESS);
1222 }
1223 
1224 PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A, MatScalarKokkosView *kv)
1225 {
1226   PetscFunctionBegin;
1227   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1228   PetscAssertPointer(kv, 2);
1229   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1230   PetscCall(MatSeqAIJKokkosModifyDevice(A));
1231   PetscFunctionReturn(PETSC_SUCCESS);
1232 }
1233 
1234 PetscErrorCode MatCreateSeqAIJKokkosWithKokkosViews(MPI_Comm comm, PetscInt m, PetscInt n, Kokkos::View<PetscInt *> &i_d, Kokkos::View<PetscInt *> &j_d, Kokkos::View<PetscScalar *> &a_d, Mat *A)
1235 {
1236   Mat_SeqAIJKokkos *akok;
1237 
1238   PetscFunctionBegin;
1239   auto exec = PetscGetKokkosExecutionSpace();
1240   // Create host copies of the input aij
1241   auto i_h = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), i_d);
1242   auto j_h = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), j_d);
1243   // Don't copy the vals to the host now
1244   auto a_h = Kokkos::create_mirror_view(HostMirrorMemorySpace(), a_d);
1245 
1246   MatScalarKokkosDualView a_dual = MatScalarKokkosDualView(a_d, a_h);
1247   // Note we have modified device data so it will copy lazily
1248   a_dual.modify_device();
1249   MatRowMapKokkosDualView i_dual = MatRowMapKokkosDualView(i_d, i_h);
1250   MatColIdxKokkosDualView j_dual = MatColIdxKokkosDualView(j_d, j_h);
1251 
1252   PetscCallCXX(akok = new Mat_SeqAIJKokkos(m, n, j_dual.extent(0), i_dual, j_dual, a_dual));
1253   PetscCall(MatCreate(comm, A));
1254   PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok));
1255   PetscFunctionReturn(PETSC_SUCCESS);
1256 }
1257 
1258 /* Computes Y += alpha X */
1259 static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y, PetscScalar alpha, Mat X, MatStructure pattern)
1260 {
1261   Mat_SeqAIJ              *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data;
1262   Mat_SeqAIJKokkos        *xkok, *ykok, *zkok;
1263   ConstMatScalarKokkosView Xa;
1264   MatScalarKokkosView      Ya;
1265   auto                     exec = PetscGetKokkosExecutionSpace();
1266 
1267   PetscFunctionBegin;
1268   PetscCheckTypeName(Y, MATSEQAIJKOKKOS);
1269   PetscCheckTypeName(X, MATSEQAIJKOKKOS);
1270   PetscCall(MatSeqAIJKokkosSyncDevice(Y));
1271   PetscCall(MatSeqAIJKokkosSyncDevice(X));
1272   PetscCall(PetscLogGpuTimeBegin());
1273 
1274   if (pattern != SAME_NONZERO_PATTERN && x->nz == y->nz) {
1275     PetscBool e;
1276     PetscCall(PetscArraycmp(x->i, y->i, Y->rmap->n + 1, &e));
1277     if (e) {
1278       PetscCall(PetscArraycmp(x->j, y->j, y->nz, &e));
1279       if (e) pattern = SAME_NONZERO_PATTERN;
1280     }
1281   }
1282 
1283   /* cusparseDcsrgeam2() computes C = alpha A + beta B. If one knew sparsity pattern of C, one can skip
1284     cusparseScsrgeam2_bufferSizeExt() / cusparseXcsrgeam2Nnz(), and directly call cusparseScsrgeam2().
1285     If X is SUBSET_NONZERO_PATTERN of Y, we could take advantage of this cusparse feature. However,
1286     KokkosSparse::spadd(alpha,A,beta,B,C) has symbolic and numeric phases, MatAXPY does not.
1287   */
1288   ykok = static_cast<Mat_SeqAIJKokkos *>(Y->spptr);
1289   xkok = static_cast<Mat_SeqAIJKokkos *>(X->spptr);
1290   Xa   = xkok->a_dual.view_device();
1291   Ya   = ykok->a_dual.view_device();
1292 
1293   if (pattern == SAME_NONZERO_PATTERN) {
1294     KokkosBlas::axpy(exec, alpha, Xa, Ya);
1295     PetscCall(MatSeqAIJKokkosModifyDevice(Y));
1296   } else if (pattern == SUBSET_NONZERO_PATTERN) {
1297     MatRowMapKokkosView Xi = xkok->i_dual.view_device(), Yi = ykok->i_dual.view_device();
1298     MatColIdxKokkosView Xj = xkok->j_dual.view_device(), Yj = ykok->j_dual.view_device();
1299 
1300     Kokkos::parallel_for(
1301       Kokkos::TeamPolicy<>(exec, Y->rmap->n, 1), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
1302         PetscInt i = t.league_rank(); // row i
1303         Kokkos::single(Kokkos::PerTeam(t), [=]() {
1304           // Only one thread works in a team
1305           PetscInt p, q = Yi(i);
1306           for (p = Xi(i); p < Xi(i + 1); p++) {          // For each nonzero on row i of X,
1307             while (Xj(p) != Yj(q) && q < Yi(i + 1)) q++; // find the matching nonzero on row i of Y.
1308             if (Xj(p) == Yj(q)) {                        // Found it
1309               Ya(q) += alpha * Xa(p);
1310               q++;
1311             } else {
1312             // If not found, it indicates the input is wrong (X is not a SUBSET_NONZERO_PATTERN of Y).
1313             // Just insert a NaN at the beginning of row i if it is not empty, to make the result wrong.
1314 #if PETSC_PKG_KOKKOS_VERSION_GE(3, 7, 0)
1315               if (Yi(i) != Yi(i + 1)) Ya(Yi(i)) = Kokkos::ArithTraits<PetscScalar>::nan();
1316 #else
1317               if (Yi(i) != Yi(i + 1)) Ya(Yi(i)) = Kokkos::Experimental::nan("1");
1318 #endif
1319             }
1320           }
1321         });
1322       });
1323     PetscCall(MatSeqAIJKokkosModifyDevice(Y));
1324   } else { // different nonzero patterns
1325     Mat             Z;
1326     KokkosCsrMatrix zcsr;
1327     KernelHandle    kh;
1328     kh.create_spadd_handle(true); // X, Y are sorted
1329     KokkosSparse::spadd_symbolic(&kh, xkok->csrmat, ykok->csrmat, zcsr);
1330     KokkosSparse::spadd_numeric(&kh, alpha, xkok->csrmat, (PetscScalar)1.0, ykok->csrmat, zcsr);
1331     zkok = new Mat_SeqAIJKokkos(zcsr);
1332     PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, zkok, &Z));
1333     PetscCall(MatHeaderReplace(Y, &Z));
1334     kh.destroy_spadd_handle();
1335   }
1336   PetscCall(PetscLogGpuTimeEnd());
1337   PetscCall(PetscLogGpuFlops(xkok->a_dual.extent(0) * 2)); // Because we scaled X and then added it to Y
1338   PetscFunctionReturn(PETSC_SUCCESS);
1339 }
1340 
1341 struct MatCOOStruct_SeqAIJKokkos {
1342   PetscCount           n;
1343   PetscCount           Atot;
1344   PetscInt             nz;
1345   PetscCountKokkosView jmap;
1346   PetscCountKokkosView perm;
1347 
1348   MatCOOStruct_SeqAIJKokkos(const MatCOOStruct_SeqAIJ *coo_h)
1349   {
1350     nz   = coo_h->nz;
1351     n    = coo_h->n;
1352     Atot = coo_h->Atot;
1353     jmap = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), PetscCountKokkosViewHost(coo_h->jmap, nz + 1));
1354     perm = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), PetscCountKokkosViewHost(coo_h->perm, Atot));
1355   }
1356 };
1357 
1358 static PetscErrorCode MatCOOStructDestroy_SeqAIJKokkos(void **data)
1359 {
1360   PetscFunctionBegin;
1361   PetscCallCXX(delete static_cast<MatCOOStruct_SeqAIJKokkos *>(*data));
1362   PetscFunctionReturn(PETSC_SUCCESS);
1363 }
1364 
1365 static PetscErrorCode MatSetPreallocationCOO_SeqAIJKokkos(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
1366 {
1367   Mat_SeqAIJKokkos          *akok;
1368   Mat_SeqAIJ                *aseq;
1369   PetscContainer             container_h;
1370   MatCOOStruct_SeqAIJ       *coo_h;
1371   MatCOOStruct_SeqAIJKokkos *coo_d;
1372 
1373   PetscFunctionBegin;
1374   PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, coo_i, coo_j));
1375   aseq = static_cast<Mat_SeqAIJ *>(mat->data);
1376   akok = static_cast<Mat_SeqAIJKokkos *>(mat->spptr);
1377   delete akok;
1378   mat->spptr = akok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, aseq, mat->nonzerostate + 1, PETSC_FALSE);
1379   PetscCall(MatZeroEntries_SeqAIJKokkos(mat));
1380 
1381   // Copy the COO struct to device
1382   PetscCall(PetscObjectQuery((PetscObject)mat, "__PETSc_MatCOOStruct_Host", (PetscObject *)&container_h));
1383   PetscCall(PetscContainerGetPointer(container_h, (void **)&coo_h));
1384   PetscCallCXX(coo_d = new MatCOOStruct_SeqAIJKokkos(coo_h));
1385 
1386   // Put the COO struct in a container and then attach that to the matrix
1387   PetscCall(PetscObjectContainerCompose((PetscObject)mat, "__PETSc_MatCOOStruct_Device", coo_d, MatCOOStructDestroy_SeqAIJKokkos));
1388   PetscFunctionReturn(PETSC_SUCCESS);
1389 }
1390 
1391 static PetscErrorCode MatSetValuesCOO_SeqAIJKokkos(Mat A, const PetscScalar v[], InsertMode imode)
1392 {
1393   MatScalarKokkosView        Aa;
1394   ConstMatScalarKokkosView   kv;
1395   PetscMemType               memtype;
1396   PetscContainer             container;
1397   MatCOOStruct_SeqAIJKokkos *coo;
1398 
1399   PetscFunctionBegin;
1400   PetscCall(PetscObjectQuery((PetscObject)A, "__PETSc_MatCOOStruct_Device", (PetscObject *)&container));
1401   PetscCall(PetscContainerGetPointer(container, (void **)&coo));
1402 
1403   const auto &n    = coo->n;
1404   const auto &Annz = coo->nz;
1405   const auto &jmap = coo->jmap;
1406   const auto &perm = coo->perm;
1407 
1408   PetscCall(PetscGetMemType(v, &memtype));
1409   if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */
1410     kv = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), ConstMatScalarKokkosViewHost(v, n));
1411   } else {
1412     kv = ConstMatScalarKokkosView(v, n); /* Directly use v[]'s memory */
1413   }
1414 
1415   if (imode == INSERT_VALUES) PetscCall(MatSeqAIJGetKokkosViewWrite(A, &Aa)); /* write matrix values */
1416   else PetscCall(MatSeqAIJGetKokkosView(A, &Aa));                             /* read & write matrix values */
1417 
1418   PetscCall(PetscLogGpuTimeBegin());
1419   Kokkos::parallel_for(
1420     Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, Annz), KOKKOS_LAMBDA(const PetscCount i) {
1421       PetscScalar sum = 0.0;
1422       for (PetscCount k = jmap(i); k < jmap(i + 1); k++) sum += kv(perm(k));
1423       Aa(i) = (imode == INSERT_VALUES ? 0.0 : Aa(i)) + sum;
1424     });
1425   PetscCall(PetscLogGpuTimeEnd());
1426 
1427   if (imode == INSERT_VALUES) PetscCall(MatSeqAIJRestoreKokkosViewWrite(A, &Aa));
1428   else PetscCall(MatSeqAIJRestoreKokkosView(A, &Aa));
1429   PetscFunctionReturn(PETSC_SUCCESS);
1430 }
1431 
1432 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A)
1433 {
1434   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
1435 
1436   PetscFunctionBegin;
1437   A->offloadmask = PETSC_OFFLOAD_KOKKOS; /* We do not really use this flag */
1438   A->boundtocpu  = PETSC_FALSE;
1439 
1440   A->ops->assemblyend               = MatAssemblyEnd_SeqAIJKokkos;
1441   A->ops->destroy                   = MatDestroy_SeqAIJKokkos;
1442   A->ops->duplicate                 = MatDuplicate_SeqAIJKokkos;
1443   A->ops->axpy                      = MatAXPY_SeqAIJKokkos;
1444   A->ops->scale                     = MatScale_SeqAIJKokkos;
1445   A->ops->zeroentries               = MatZeroEntries_SeqAIJKokkos;
1446   A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJKokkos;
1447   A->ops->mult                      = MatMult_SeqAIJKokkos;
1448   A->ops->multadd                   = MatMultAdd_SeqAIJKokkos;
1449   A->ops->multtranspose             = MatMultTranspose_SeqAIJKokkos;
1450   A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJKokkos;
1451   A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJKokkos;
1452   A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos;
1453   A->ops->productnumeric            = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos;
1454   A->ops->transpose                 = MatTranspose_SeqAIJKokkos;
1455   A->ops->setoption                 = MatSetOption_SeqAIJKokkos;
1456   A->ops->getdiagonal               = MatGetDiagonal_SeqAIJKokkos;
1457   A->ops->shift                     = MatShift_SeqAIJKokkos;
1458   A->ops->diagonalset               = MatDiagonalSet_SeqAIJKokkos;
1459   A->ops->diagonalscale             = MatDiagonalScale_SeqAIJKokkos;
1460   a->ops->getarray                  = MatSeqAIJGetArray_SeqAIJKokkos;
1461   a->ops->restorearray              = MatSeqAIJRestoreArray_SeqAIJKokkos;
1462   a->ops->getarrayread              = MatSeqAIJGetArrayRead_SeqAIJKokkos;
1463   a->ops->restorearrayread          = MatSeqAIJRestoreArrayRead_SeqAIJKokkos;
1464   a->ops->getarraywrite             = MatSeqAIJGetArrayWrite_SeqAIJKokkos;
1465   a->ops->restorearraywrite         = MatSeqAIJRestoreArrayWrite_SeqAIJKokkos;
1466   a->ops->getcsrandmemtype          = MatSeqAIJGetCSRAndMemType_SeqAIJKokkos;
1467 
1468   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJKokkos));
1469   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJKokkos));
1470 #if defined(PETSC_HAVE_HYPRE)
1471   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijkokkos_hypre_C", MatConvert_AIJ_HYPRE));
1472 #endif
1473   PetscFunctionReturn(PETSC_SUCCESS);
1474 }
1475 
1476 /*
1477    Extract the (prescribled) diagonal blocks of the matrix and then invert them
1478 
1479   Input Parameters:
1480 +  A       - the MATSEQAIJKOKKOS matrix
1481 .  bs      - block sizes in 'csr' format, i.e., the i-th block has size bs(i+1) - bs(i)
1482 .  bs2     - square of block sizes in 'csr' format, i.e., the i-th block should be stored at offset bs2(i) in diagVal[]
1483 .  blkMap  - map row ids to block ids, i.e., row i belongs to the block blkMap(i)
1484 -  work    - a pre-allocated work buffer (as big as diagVal) for use by this routine
1485 
1486   Output Parameter:
1487 .  diagVal - the (pre-allocated) buffer to store the inverted blocks (each block is stored in column-major order)
1488 */
1489 PETSC_INTERN PetscErrorCode MatInvertVariableBlockDiagonal_SeqAIJKokkos(Mat A, const PetscIntKokkosView &bs, const PetscIntKokkosView &bs2, const PetscIntKokkosView &blkMap, PetscScalarKokkosView &work, PetscScalarKokkosView &diagVal)
1490 {
1491   Mat_SeqAIJKokkos *akok    = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1492   PetscInt          nblocks = bs.extent(0) - 1;
1493 
1494   PetscFunctionBegin;
1495   PetscCall(MatSeqAIJKokkosSyncDevice(A)); // Since we'll access A's value on device
1496 
1497   // Pull out the diagonal blocks of the matrix and then invert the blocks
1498   auto Aa    = akok->a_dual.view_device();
1499   auto Ai    = akok->i_dual.view_device();
1500   auto Aj    = akok->j_dual.view_device();
1501   auto Adiag = akok->diag_dual.view_device();
1502   // TODO: how to tune the team size?
1503 #if defined(KOKKOS_ENABLE_UNIFIED_MEMORY)
1504   auto ts = Kokkos::AUTO();
1505 #else
1506   auto ts = 16; // improved performance 30% over Kokkos::AUTO() with CUDA, but failed with "Kokkos::abort: Requested Team Size is too large!" on CPUs
1507 #endif
1508   PetscCallCXX(Kokkos::parallel_for(
1509     Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), nblocks, ts), KOKKOS_LAMBDA(const KokkosTeamMemberType &teamMember) {
1510       const PetscInt bid    = teamMember.league_rank();                                                   // block id
1511       const PetscInt rstart = bs(bid);                                                                    // this block starts from this row
1512       const PetscInt m      = bs(bid + 1) - bs(bid);                                                      // size of this block
1513       const auto    &B      = Kokkos::View<PetscScalar **, Kokkos::LayoutLeft>(&diagVal(bs2(bid)), m, m); // column-major order
1514       const auto    &W      = PetscScalarKokkosView(&work(bs2(bid)), m * m);
1515 
1516       Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, m), [=](const PetscInt &r) { // r-th row in B
1517         PetscInt i = rstart + r;                                                            // i-th row in A
1518 
1519         if (Ai(i) <= Adiag(i) && Adiag(i) < Ai(i + 1)) { // if the diagonal exists (common case)
1520           PetscInt first = Adiag(i) - r;                 // we start to check nonzeros from here along this row
1521 
1522           for (PetscInt c = 0; c < m; c++) {                   // walk n steps to see what column indices we will meet
1523             if (first + c < Ai(i) || first + c >= Ai(i + 1)) { // this entry (first+c) is out of range of this row, in other words, its value is zero
1524               B(r, c) = 0.0;
1525             } else if (Aj(first + c) == rstart + c) { // this entry is right on the (rstart+c) column
1526               B(r, c) = Aa(first + c);
1527             } else { // this entry does not show up in the CSR
1528               B(r, c) = 0.0;
1529             }
1530           }
1531         } else { // rare case that the diagonal does not exist
1532           const PetscInt begin = Ai(i);
1533           const PetscInt end   = Ai(i + 1);
1534           for (PetscInt c = 0; c < m; c++) B(r, c) = 0.0;
1535           for (PetscInt j = begin; j < end; j++) { // scan the whole row; could use binary search but this is a rare case so we did not.
1536             if (rstart <= Aj(j) && Aj(j) < rstart + m) B(r, Aj(j) - rstart) = Aa(j);
1537             else if (Aj(j) >= rstart + m) break;
1538           }
1539         }
1540       });
1541 
1542       // LU-decompose B (w/o pivoting) and then invert B
1543       KokkosBatched::TeamLU<KokkosTeamMemberType, KokkosBatched::Algo::LU::Unblocked>::invoke(teamMember, B, 0.0);
1544       KokkosBatched::TeamInverseLU<KokkosTeamMemberType, KokkosBatched::Algo::InverseLU::Unblocked>::invoke(teamMember, B, W);
1545     }));
1546   // PetscLogGpuFlops() is done in the caller PCSetUp_VPBJacobi_Kokkos as we don't want to compute the flops in kernels
1547   PetscFunctionReturn(PETSC_SUCCESS);
1548 }
1549 
1550 PETSC_INTERN PetscErrorCode MatSetSeqAIJKokkosWithCSRMatrix(Mat A, Mat_SeqAIJKokkos *akok)
1551 {
1552   Mat_SeqAIJ *aseq;
1553   PetscInt    i, m, n;
1554   auto        exec = PetscGetKokkosExecutionSpace();
1555 
1556   PetscFunctionBegin;
1557   PetscCheck(!A->spptr, PETSC_COMM_SELF, PETSC_ERR_PLIB, "A->spptr is supposed to be empty");
1558 
1559   m = akok->nrows();
1560   n = akok->ncols();
1561   PetscCall(MatSetSizes(A, m, n, m, n));
1562   PetscCall(MatSetType(A, MATSEQAIJKOKKOS));
1563 
1564   /* Set up data structures of A as a MATSEQAIJ */
1565   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(A, MAT_SKIP_ALLOCATION, NULL));
1566   aseq = (Mat_SeqAIJ *)A->data;
1567 
1568   PetscCallCXX(akok->i_dual.sync_host(exec)); /* We always need sync'ed i, j on host */
1569   PetscCallCXX(akok->j_dual.sync_host(exec));
1570   PetscCallCXX(exec.fence());
1571 
1572   aseq->i       = akok->i_host_data();
1573   aseq->j       = akok->j_host_data();
1574   aseq->a       = akok->a_host_data();
1575   aseq->nonew   = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/
1576   aseq->free_a  = PETSC_FALSE;
1577   aseq->free_ij = PETSC_FALSE;
1578   aseq->nz      = akok->nnz();
1579   aseq->maxnz   = aseq->nz;
1580 
1581   PetscCall(PetscMalloc1(m, &aseq->imax));
1582   PetscCall(PetscMalloc1(m, &aseq->ilen));
1583   for (i = 0; i < m; i++) aseq->ilen[i] = aseq->imax[i] = aseq->i[i + 1] - aseq->i[i];
1584 
1585   /* It is critical to set the nonzerostate, as we use it to check if sparsity pattern (hence data) has changed on host in MatAssemblyEnd */
1586   akok->nonzerostate = A->nonzerostate;
1587   A->spptr           = akok; /* Set A->spptr before MatAssembly so that A->spptr won't be allocated again there */
1588   PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY));
1589   PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY));
1590   PetscFunctionReturn(PETSC_SUCCESS);
1591 }
1592 
1593 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGetKokkosCsrMatrix(Mat A, KokkosCsrMatrix *csr)
1594 {
1595   PetscFunctionBegin;
1596   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1597   *csr = static_cast<Mat_SeqAIJKokkos *>(A->spptr)->csrmat;
1598   PetscFunctionReturn(PETSC_SUCCESS);
1599 }
1600 
1601 PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithKokkosCsrMatrix(MPI_Comm comm, KokkosCsrMatrix csr, Mat *A)
1602 {
1603   Mat_SeqAIJKokkos *akok;
1604 
1605   PetscFunctionBegin;
1606   PetscCallCXX(akok = new Mat_SeqAIJKokkos(csr));
1607   PetscCall(MatCreate(comm, A));
1608   PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok));
1609   PetscFunctionReturn(PETSC_SUCCESS);
1610 }
1611 
1612 /* Crete a SEQAIJKOKKOS matrix with a Mat_SeqAIJKokkos data structure
1613 
1614    Note we have names like MatSeqAIJSetPreallocationCSR, so I use capitalized CSR
1615  */
1616 PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithCSRMatrix(MPI_Comm comm, Mat_SeqAIJKokkos *akok, Mat *A)
1617 {
1618   PetscFunctionBegin;
1619   PetscCall(MatCreate(comm, A));
1620   PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok));
1621   PetscFunctionReturn(PETSC_SUCCESS);
1622 }
1623 
1624 /*@C
1625   MatCreateSeqAIJKokkos - Creates a sparse matrix in `MATSEQAIJKOKKOS` (compressed row) format
1626   (the default parallel PETSc format). This matrix will ultimately be handled by
1627   Kokkos for calculations.
1628 
1629   Collective
1630 
1631   Input Parameters:
1632 + comm - MPI communicator, set to `PETSC_COMM_SELF`
1633 . m    - number of rows
1634 . n    - number of columns
1635 . nz   - number of nonzeros per row (same for all rows), ignored if `nnz` is provided
1636 - nnz  - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL`
1637 
1638   Output Parameter:
1639 . A - the matrix
1640 
1641   Level: intermediate
1642 
1643   Notes:
1644   It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
1645   MatXXXXSetPreallocation() paradgm instead of this routine directly.
1646   [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]
1647 
1648   The AIJ format, also called
1649   compressed row storage, is fully compatible with standard Fortran
1650   storage.  That is, the stored row and column indices can begin at
1651   either one (as in Fortran) or zero.
1652 
1653   Specify the preallocated storage with either `nz` or `nnz` (not both).
1654   Set `nz` = `PETSC_DEFAULT` and `nnz` = `NULL` for PETSc to control dynamic memory
1655   allocation.
1656 
1657 .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`
1658 @*/
1659 PetscErrorCode MatCreateSeqAIJKokkos(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A)
1660 {
1661   PetscFunctionBegin;
1662   PetscCall(PetscKokkosInitializeCheck());
1663   PetscCall(MatCreate(comm, A));
1664   PetscCall(MatSetSizes(*A, m, n, m, n));
1665   PetscCall(MatSetType(*A, MATSEQAIJKOKKOS));
1666   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz));
1667   PetscFunctionReturn(PETSC_SUCCESS);
1668 }
1669 
1670 // After matrix numeric factorization, there are still steps to do before triangular solve can be called.
1671 // For example, for transpose solve, we might need to compute the transpose matrices if the solver does not support it (such as KK, while cusparse does).
1672 // In cusparse, one has to call cusparseSpSV_analysis() with updated triangular matrix values before calling cusparseSpSV_solve().
1673 // Simiarily, in KK sptrsv_symbolic() has to be called before sptrsv_solve(). We put these steps in MatSeqAIJKokkos{Transpose}SolveCheck.
1674 static PetscErrorCode MatSeqAIJKokkosSolveCheck(Mat A)
1675 {
1676   Mat_SeqAIJKokkosTriFactors *factors   = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1677   const PetscBool             has_lower = factors->iL_d.extent(0) ? PETSC_TRUE : PETSC_FALSE; // false with Choleksy
1678   const PetscBool             has_upper = factors->iU_d.extent(0) ? PETSC_TRUE : PETSC_FALSE; // true with LU and Choleksy
1679 
1680   PetscFunctionBegin;
1681   if (!factors->sptrsv_symbolic_completed) { // If sptrsv_symbolic was not called yet
1682     if (has_upper) PetscCallCXX(sptrsv_symbolic(&factors->khU, factors->iU_d, factors->jU_d, factors->aU_d));
1683     if (has_lower) PetscCallCXX(sptrsv_symbolic(&factors->khL, factors->iL_d, factors->jL_d, factors->aL_d));
1684     factors->sptrsv_symbolic_completed = PETSC_TRUE;
1685   }
1686   PetscFunctionReturn(PETSC_SUCCESS);
1687 }
1688 
1689 static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A)
1690 {
1691   const PetscInt              n         = A->rmap->n;
1692   Mat_SeqAIJKokkosTriFactors *factors   = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1693   const PetscBool             has_lower = factors->iL_d.extent(0) ? PETSC_TRUE : PETSC_FALSE; // false with Choleksy
1694   const PetscBool             has_upper = factors->iU_d.extent(0) ? PETSC_TRUE : PETSC_FALSE; // true with LU or Choleksy
1695 
1696   PetscFunctionBegin;
1697   if (!factors->transpose_updated) {
1698     if (has_upper) {
1699       if (!factors->iUt_d.extent(0)) {                                 // Allocate Ut on device if not yet
1700         factors->iUt_d = MatRowMapKokkosView("factors->iUt_d", n + 1); // KK requires this view to be initialized to 0 to call transpose_matrix
1701         factors->jUt_d = MatColIdxKokkosView(NoInit("factors->jUt_d"), factors->jU_d.extent(0));
1702         factors->aUt_d = MatScalarKokkosView(NoInit("factors->aUt_d"), factors->aU_d.extent(0));
1703       }
1704 
1705       if (factors->iU_h.extent(0)) { // If U is on host (factorization was done on host), we also compute the transpose on host
1706         if (!factors->U) {
1707           Mat_SeqAIJ *seq;
1708 
1709           PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF, n, n, factors->iU_h.data(), factors->jU_h.data(), factors->aU_h.data(), &factors->U));
1710           PetscCall(MatTranspose(factors->U, MAT_INITIAL_MATRIX, &factors->Ut));
1711 
1712           seq            = static_cast<Mat_SeqAIJ *>(factors->Ut->data);
1713           factors->iUt_h = MatRowMapKokkosViewHost(seq->i, n + 1);
1714           factors->jUt_h = MatColIdxKokkosViewHost(seq->j, seq->nz);
1715           factors->aUt_h = MatScalarKokkosViewHost(seq->a, seq->nz);
1716         } else {
1717           PetscCall(MatTranspose(factors->U, MAT_REUSE_MATRIX, &factors->Ut)); // Matrix Ut' data is aliased with {i, j, a}Ut_h
1718         }
1719         // Copy Ut from host to device
1720         PetscCallCXX(Kokkos::deep_copy(factors->iUt_d, factors->iUt_h));
1721         PetscCallCXX(Kokkos::deep_copy(factors->jUt_d, factors->jUt_h));
1722         PetscCallCXX(Kokkos::deep_copy(factors->aUt_d, factors->aUt_h));
1723       } else { // If U was computed on device, we also compute the transpose there
1724         // TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices. We have to sort the indices, until KK provides finer control options.
1725         PetscCallCXX(transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iU_d,
1726                                                                                                                                                                                                                                factors->jU_d, factors->aU_d,
1727                                                                                                                                                                                                                                factors->iUt_d, factors->jUt_d,
1728                                                                                                                                                                                                                                factors->aUt_d));
1729         PetscCallCXX(sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iUt_d, factors->jUt_d, factors->aUt_d));
1730       }
1731       PetscCallCXX(sptrsv_symbolic(&factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d));
1732     }
1733 
1734     // do the same for L with LU
1735     if (has_lower) {
1736       if (!factors->iLt_d.extent(0)) {                                 // Allocate Lt on device if not yet
1737         factors->iLt_d = MatRowMapKokkosView("factors->iLt_d", n + 1); // KK requires this view to be initialized to 0 to call transpose_matrix
1738         factors->jLt_d = MatColIdxKokkosView(NoInit("factors->jLt_d"), factors->jL_d.extent(0));
1739         factors->aLt_d = MatScalarKokkosView(NoInit("factors->aLt_d"), factors->aL_d.extent(0));
1740       }
1741 
1742       if (factors->iL_h.extent(0)) { // If L is on host, we also compute the transpose on host
1743         if (!factors->L) {
1744           Mat_SeqAIJ *seq;
1745 
1746           PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF, n, n, factors->iL_h.data(), factors->jL_h.data(), factors->aL_h.data(), &factors->L));
1747           PetscCall(MatTranspose(factors->L, MAT_INITIAL_MATRIX, &factors->Lt));
1748 
1749           seq            = static_cast<Mat_SeqAIJ *>(factors->Lt->data);
1750           factors->iLt_h = MatRowMapKokkosViewHost(seq->i, n + 1);
1751           factors->jLt_h = MatColIdxKokkosViewHost(seq->j, seq->nz);
1752           factors->aLt_h = MatScalarKokkosViewHost(seq->a, seq->nz);
1753         } else {
1754           PetscCall(MatTranspose(factors->L, MAT_REUSE_MATRIX, &factors->Lt)); // Matrix Lt' data is aliased with {i, j, a}Lt_h
1755         }
1756         // Copy Lt from host to device
1757         PetscCallCXX(Kokkos::deep_copy(factors->iLt_d, factors->iLt_h));
1758         PetscCallCXX(Kokkos::deep_copy(factors->jLt_d, factors->jLt_h));
1759         PetscCallCXX(Kokkos::deep_copy(factors->aLt_d, factors->aLt_h));
1760       } else { // If L was computed on device, we also compute the transpose there
1761         // TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices. We have to sort the indices, until KK provides finer control options.
1762         PetscCallCXX(transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iL_d,
1763                                                                                                                                                                                                                                factors->jL_d, factors->aL_d,
1764                                                                                                                                                                                                                                factors->iLt_d, factors->jLt_d,
1765                                                                                                                                                                                                                                factors->aLt_d));
1766         PetscCallCXX(sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iLt_d, factors->jLt_d, factors->aLt_d));
1767       }
1768       PetscCallCXX(sptrsv_symbolic(&factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d));
1769     }
1770 
1771     factors->transpose_updated = PETSC_TRUE;
1772   }
1773   PetscFunctionReturn(PETSC_SUCCESS);
1774 }
1775 
1776 // Solve Ax = b, with RAR = U^T D U, where R is the row (and col) permutation matrix on A.
1777 // R is represented by rowperm in factors. If R is identity (i.e, no reordering), then rowperm is empty.
1778 static PetscErrorCode MatSolve_SeqAIJKokkos_Cholesky(Mat A, Vec bb, Vec xx)
1779 {
1780   auto                        exec    = PetscGetKokkosExecutionSpace();
1781   Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1782   PetscInt                    m       = A->rmap->n;
1783   PetscScalarKokkosView       D       = factors->D_d;
1784   PetscScalarKokkosView       X, Y, B; // alias
1785   ConstPetscScalarKokkosView  b;
1786   PetscScalarKokkosView       x;
1787   PetscIntKokkosView         &rowperm  = factors->rowperm;
1788   PetscBool                   identity = rowperm.extent(0) ? PETSC_FALSE : PETSC_TRUE;
1789 
1790   PetscFunctionBegin;
1791   PetscCall(PetscLogGpuTimeBegin());
1792   PetscCall(MatSeqAIJKokkosSolveCheck(A));          // for UX = T
1793   PetscCall(MatSeqAIJKokkosTransposeSolveCheck(A)); // for U^T Y = B
1794   PetscCall(VecGetKokkosView(bb, &b));
1795   PetscCall(VecGetKokkosViewWrite(xx, &x));
1796 
1797   // Solve U^T Y = B
1798   if (identity) { // Reorder b with the row permutation
1799     B = PetscScalarKokkosView(const_cast<PetscScalar *>(b.data()), b.extent(0));
1800     Y = factors->workVector;
1801   } else {
1802     B = factors->workVector;
1803     PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { B(i) = b(rowperm(i)); }));
1804     Y = x;
1805   }
1806   PetscCallCXX(sptrsv_solve(exec, &factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d, B, Y));
1807 
1808   // Solve diag(D) Y' = Y.
1809   // Actually just do Y' = Y*D since D is already inverted in MatCholeskyFactorNumeric_SeqAIJ(). It is basically a vector element-wise multiplication.
1810   PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { Y(i) = Y(i) * D(i); }));
1811 
1812   // Solve UX = Y
1813   if (identity) {
1814     X = x;
1815   } else {
1816     X = factors->workVector; // B is not needed anymore
1817   }
1818   PetscCallCXX(sptrsv_solve(exec, &factors->khU, factors->iU_d, factors->jU_d, factors->aU_d, Y, X));
1819 
1820   // Reorder X with the inverse column (row) permutation
1821   if (!identity) {
1822     PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { x(rowperm(i)) = X(i); }));
1823   }
1824 
1825   PetscCall(VecRestoreKokkosView(bb, &b));
1826   PetscCall(VecRestoreKokkosViewWrite(xx, &x));
1827   PetscCall(PetscLogGpuTimeEnd());
1828   PetscFunctionReturn(PETSC_SUCCESS);
1829 }
1830 
1831 // Solve Ax = b, with RAC = LU, where R and C are row and col permutation matrices on A respectively.
1832 // R and C are represented by rowperm and colperm in factors.
1833 // If R or C is identity (i.e, no reordering), then rowperm or colperm is empty.
1834 static PetscErrorCode MatSolve_SeqAIJKokkos_LU(Mat A, Vec bb, Vec xx)
1835 {
1836   auto                        exec    = PetscGetKokkosExecutionSpace();
1837   Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1838   PetscInt                    m       = A->rmap->n;
1839   PetscScalarKokkosView       X, Y, B; // alias
1840   ConstPetscScalarKokkosView  b;
1841   PetscScalarKokkosView       x;
1842   PetscIntKokkosView         &rowperm      = factors->rowperm;
1843   PetscIntKokkosView         &colperm      = factors->colperm;
1844   PetscBool                   row_identity = rowperm.extent(0) ? PETSC_FALSE : PETSC_TRUE;
1845   PetscBool                   col_identity = colperm.extent(0) ? PETSC_FALSE : PETSC_TRUE;
1846 
1847   PetscFunctionBegin;
1848   PetscCall(PetscLogGpuTimeBegin());
1849   PetscCall(MatSeqAIJKokkosSolveCheck(A));
1850   PetscCall(VecGetKokkosView(bb, &b));
1851   PetscCall(VecGetKokkosViewWrite(xx, &x));
1852 
1853   // Solve L Y = B (i.e., L (U C^- x) = R b).  R b indicates applying the row permutation on b.
1854   if (row_identity) {
1855     B = PetscScalarKokkosView(const_cast<PetscScalar *>(b.data()), b.extent(0));
1856     Y = factors->workVector;
1857   } else {
1858     B = factors->workVector;
1859     PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { B(i) = b(rowperm(i)); }));
1860     Y = x;
1861   }
1862   PetscCallCXX(sptrsv_solve(exec, &factors->khL, factors->iL_d, factors->jL_d, factors->aL_d, B, Y));
1863 
1864   // Solve U C^- x = Y
1865   if (col_identity) {
1866     X = x;
1867   } else {
1868     X = factors->workVector;
1869   }
1870   PetscCallCXX(sptrsv_solve(exec, &factors->khU, factors->iU_d, factors->jU_d, factors->aU_d, Y, X));
1871 
1872   // x = C X; Reorder X with the inverse col permutation
1873   if (!col_identity) {
1874     PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { x(colperm(i)) = X(i); }));
1875   }
1876 
1877   PetscCall(VecRestoreKokkosView(bb, &b));
1878   PetscCall(VecRestoreKokkosViewWrite(xx, &x));
1879   PetscCall(PetscLogGpuTimeEnd());
1880   PetscFunctionReturn(PETSC_SUCCESS);
1881 }
1882 
1883 // Solve A^T x = b, with RAC = LU, where R and C are row and col permutation matrices on A respectively.
1884 // R and C are represented by rowperm and colperm in factors.
1885 // If R or C is identity (i.e, no reordering), then rowperm or colperm is empty.
1886 // A = R^-1 L U C^-1, so A^T = C^-T U^T L^T R^-T. But since C^- = C^T, R^- = R^T, we have A^T = C U^T L^T R.
1887 static PetscErrorCode MatSolveTranspose_SeqAIJKokkos_LU(Mat A, Vec bb, Vec xx)
1888 {
1889   auto                        exec    = PetscGetKokkosExecutionSpace();
1890   Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1891   PetscInt                    m       = A->rmap->n;
1892   PetscScalarKokkosView       X, Y, B; // alias
1893   ConstPetscScalarKokkosView  b;
1894   PetscScalarKokkosView       x;
1895   PetscIntKokkosView         &rowperm      = factors->rowperm;
1896   PetscIntKokkosView         &colperm      = factors->colperm;
1897   PetscBool                   row_identity = rowperm.extent(0) ? PETSC_FALSE : PETSC_TRUE;
1898   PetscBool                   col_identity = colperm.extent(0) ? PETSC_FALSE : PETSC_TRUE;
1899 
1900   PetscFunctionBegin;
1901   PetscCall(PetscLogGpuTimeBegin());
1902   PetscCall(MatSeqAIJKokkosTransposeSolveCheck(A)); // Update L^T, U^T if needed, and do sptrsv symbolic for L^T, U^T
1903   PetscCall(VecGetKokkosView(bb, &b));
1904   PetscCall(VecGetKokkosViewWrite(xx, &x));
1905 
1906   // Solve U^T Y = B (i.e., U^T (L^T R x) = C^- b).  Note C^- b = C^T b, which means applying the column permutation on b.
1907   if (col_identity) { // Reorder b with the col permutation
1908     B = PetscScalarKokkosView(const_cast<PetscScalar *>(b.data()), b.extent(0));
1909     Y = factors->workVector;
1910   } else {
1911     B = factors->workVector;
1912     PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { B(i) = b(colperm(i)); }));
1913     Y = x;
1914   }
1915   PetscCallCXX(sptrsv_solve(exec, &factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d, B, Y));
1916 
1917   // Solve L^T X = Y
1918   if (row_identity) {
1919     X = x;
1920   } else {
1921     X = factors->workVector;
1922   }
1923   PetscCallCXX(sptrsv_solve(exec, &factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d, Y, X));
1924 
1925   // x = R^- X = R^T X; Reorder X with the inverse row permutation
1926   if (!row_identity) {
1927     PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { x(rowperm(i)) = X(i); }));
1928   }
1929 
1930   PetscCall(VecRestoreKokkosView(bb, &b));
1931   PetscCall(VecRestoreKokkosViewWrite(xx, &x));
1932   PetscCall(PetscLogGpuTimeEnd());
1933   PetscFunctionReturn(PETSC_SUCCESS);
1934 }
1935 
1936 static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info)
1937 {
1938   PetscFunctionBegin;
1939   PetscCall(MatSeqAIJKokkosSyncHost(A));
1940   PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info));
1941 
1942   if (!info->solveonhost) { // if solve on host, then we don't need to copy L, U to device
1943     Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr;
1944     Mat_SeqAIJ                 *b       = static_cast<Mat_SeqAIJ *>(B->data);
1945     const PetscInt             *Bi = b->i, *Bj = b->j, *Bdiag = b->diag;
1946     const MatScalar            *Ba = b->a;
1947     PetscInt                    m = B->rmap->n, n = B->cmap->n;
1948 
1949     if (factors->iL_h.extent(0) == 0) { // Allocate memory and copy the L, U structure for the first time
1950       // Allocate memory and copy the structure
1951       factors->iL_h = MatRowMapKokkosViewHost(NoInit("iL_h"), m + 1);
1952       factors->jL_h = MatColIdxKokkosViewHost(NoInit("jL_h"), (Bi[m] - Bi[0]) + m); // + the diagonal entries
1953       factors->aL_h = MatScalarKokkosViewHost(NoInit("aL_h"), (Bi[m] - Bi[0]) + m);
1954       factors->iU_h = MatRowMapKokkosViewHost(NoInit("iU_h"), m + 1);
1955       factors->jU_h = MatColIdxKokkosViewHost(NoInit("jU_h"), (Bdiag[0] - Bdiag[m]));
1956       factors->aU_h = MatScalarKokkosViewHost(NoInit("aU_h"), (Bdiag[0] - Bdiag[m]));
1957 
1958       PetscInt *Li = factors->iL_h.data();
1959       PetscInt *Lj = factors->jL_h.data();
1960       PetscInt *Ui = factors->iU_h.data();
1961       PetscInt *Uj = factors->jU_h.data();
1962 
1963       Li[0] = Ui[0] = 0;
1964       for (PetscInt i = 0; i < m; i++) {
1965         PetscInt llen = Bi[i + 1] - Bi[i];       // exclusive of the diagonal entry
1966         PetscInt ulen = Bdiag[i] - Bdiag[i + 1]; // inclusive of the diagonal entry
1967 
1968         PetscArraycpy(Lj + Li[i], Bj + Bi[i], llen); // entries of L on the left of the diagonal
1969         Lj[Li[i] + llen] = i;                        // diagonal entry of L
1970 
1971         Uj[Ui[i]] = i;                                                  // diagonal entry of U
1972         PetscArraycpy(Uj + Ui[i] + 1, Bj + Bdiag[i + 1] + 1, ulen - 1); // entries of U on  the right of the diagonal
1973 
1974         Li[i + 1] = Li[i] + llen + 1;
1975         Ui[i + 1] = Ui[i] + ulen;
1976       }
1977 
1978       factors->iL_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->iL_h);
1979       factors->jL_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->jL_h);
1980       factors->iU_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->iU_h);
1981       factors->jU_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->jU_h);
1982       factors->aL_d = Kokkos::create_mirror_view(DefaultMemorySpace(), factors->aL_h);
1983       factors->aU_d = Kokkos::create_mirror_view(DefaultMemorySpace(), factors->aU_h);
1984 
1985       // Copy row/col permutation to device
1986       IS        rowperm = ((Mat_SeqAIJ *)B->data)->row;
1987       PetscBool row_identity;
1988       PetscCall(ISIdentity(rowperm, &row_identity));
1989       if (!row_identity) {
1990         const PetscInt *ip;
1991 
1992         PetscCall(ISGetIndices(rowperm, &ip));
1993         factors->rowperm = PetscIntKokkosView(NoInit("rowperm"), m);
1994         PetscCallCXX(Kokkos::deep_copy(factors->rowperm, PetscIntKokkosViewHost(const_cast<PetscInt *>(ip), m)));
1995         PetscCall(ISRestoreIndices(rowperm, &ip));
1996         PetscCall(PetscLogCpuToGpu(m * sizeof(PetscInt)));
1997       }
1998 
1999       IS        colperm = ((Mat_SeqAIJ *)B->data)->col;
2000       PetscBool col_identity;
2001       PetscCall(ISIdentity(colperm, &col_identity));
2002       if (!col_identity) {
2003         const PetscInt *ip;
2004 
2005         PetscCall(ISGetIndices(colperm, &ip));
2006         factors->colperm = PetscIntKokkosView(NoInit("colperm"), n);
2007         PetscCallCXX(Kokkos::deep_copy(factors->colperm, PetscIntKokkosViewHost(const_cast<PetscInt *>(ip), n)));
2008         PetscCall(ISRestoreIndices(colperm, &ip));
2009         PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
2010       }
2011 
2012       /* Create sptrsv handles for L, U and their transpose */
2013 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
2014       auto sptrsv_alg = SPTRSVAlgorithm::SPTRSV_CUSPARSE;
2015 #else
2016       auto sptrsv_alg = SPTRSVAlgorithm::SEQLVLSCHD_TP1;
2017 #endif
2018       factors->khL.create_sptrsv_handle(sptrsv_alg, m, true /* L is lower tri */);
2019       factors->khU.create_sptrsv_handle(sptrsv_alg, m, false /* U is not lower tri */);
2020       factors->khLt.create_sptrsv_handle(sptrsv_alg, m, false /* L^T is not lower tri */);
2021       factors->khUt.create_sptrsv_handle(sptrsv_alg, m, true /* U^T is lower tri */);
2022     }
2023 
2024     // Copy the value
2025     for (PetscInt i = 0; i < m; i++) {
2026       PetscInt        llen = Bi[i + 1] - Bi[i];
2027       PetscInt        ulen = Bdiag[i] - Bdiag[i + 1];
2028       const PetscInt *Li   = factors->iL_h.data();
2029       const PetscInt *Ui   = factors->iU_h.data();
2030 
2031       PetscScalar *La = factors->aL_h.data();
2032       PetscScalar *Ua = factors->aU_h.data();
2033 
2034       PetscArraycpy(La + Li[i], Ba + Bi[i], llen); // entries of L
2035       La[Li[i] + llen] = 1.0;                      // diagonal entry
2036 
2037       Ua[Ui[i]] = 1.0 / Ba[Bdiag[i]];                                 // diagonal entry
2038       PetscArraycpy(Ua + Ui[i] + 1, Ba + Bdiag[i + 1] + 1, ulen - 1); // entries of U
2039     }
2040 
2041     PetscCallCXX(Kokkos::deep_copy(factors->aL_d, factors->aL_h));
2042     PetscCallCXX(Kokkos::deep_copy(factors->aU_d, factors->aU_h));
2043     // Once the factors' values have changed, we need to update their transpose and redo sptrsv symbolic
2044     factors->transpose_updated         = PETSC_FALSE;
2045     factors->sptrsv_symbolic_completed = PETSC_FALSE;
2046 
2047     B->ops->solve          = MatSolve_SeqAIJKokkos_LU;
2048     B->ops->solvetranspose = MatSolveTranspose_SeqAIJKokkos_LU;
2049   }
2050 
2051   B->ops->matsolve          = NULL;
2052   B->ops->matsolvetranspose = NULL;
2053   PetscFunctionReturn(PETSC_SUCCESS);
2054 }
2055 
2056 static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos_ILU0(Mat B, Mat A, const MatFactorInfo *info)
2057 {
2058   Mat_SeqAIJKokkos           *aijkok   = (Mat_SeqAIJKokkos *)A->spptr;
2059   Mat_SeqAIJKokkosTriFactors *factors  = (Mat_SeqAIJKokkosTriFactors *)B->spptr;
2060   PetscInt                    fill_lev = info->levels;
2061 
2062   PetscFunctionBegin;
2063   PetscCall(PetscLogGpuTimeBegin());
2064   PetscCheck(!info->factoronhost, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "MatFactorInfo.factoronhost should be false");
2065   PetscCall(MatSeqAIJKokkosSyncDevice(A));
2066 
2067   auto a_d = aijkok->a_dual.view_device();
2068   auto i_d = aijkok->i_dual.view_device();
2069   auto j_d = aijkok->j_dual.view_device();
2070 
2071   PetscCallCXX(spiluk_numeric(&factors->kh, fill_lev, i_d, j_d, a_d, factors->iL_d, factors->jL_d, factors->aL_d, factors->iU_d, factors->jU_d, factors->aU_d));
2072 
2073   B->assembled              = PETSC_TRUE;
2074   B->preallocated           = PETSC_TRUE;
2075   B->ops->solve             = MatSolve_SeqAIJKokkos_LU;
2076   B->ops->solvetranspose    = MatSolveTranspose_SeqAIJKokkos_LU;
2077   B->ops->matsolve          = NULL;
2078   B->ops->matsolvetranspose = NULL;
2079 
2080   /* Once the factors' value changed, we need to update their transpose and sptrsv handle */
2081   factors->transpose_updated         = PETSC_FALSE;
2082   factors->sptrsv_symbolic_completed = PETSC_FALSE;
2083   /* TODO: log flops, but how to know that? */
2084   PetscCall(PetscLogGpuTimeEnd());
2085   PetscFunctionReturn(PETSC_SUCCESS);
2086 }
2087 
2088 // Use KK's spiluk_symbolic() to do ILU0 symbolic factorization, with no row/col reordering
2089 static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos_ILU0(Mat B, Mat A, IS, IS, const MatFactorInfo *info)
2090 {
2091   Mat_SeqAIJKokkos           *aijkok;
2092   Mat_SeqAIJ                 *b;
2093   Mat_SeqAIJKokkosTriFactors *factors  = (Mat_SeqAIJKokkosTriFactors *)B->spptr;
2094   PetscInt                    fill_lev = info->levels;
2095   PetscInt                    nnzA     = ((Mat_SeqAIJ *)A->data)->nz, nnzL, nnzU;
2096   PetscInt                    n        = A->rmap->n;
2097 
2098   PetscFunctionBegin;
2099   PetscCheck(!info->factoronhost, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "MatFactorInfo's factoronhost should be false as we are doing it on device right now");
2100   PetscCall(MatSeqAIJKokkosSyncDevice(A));
2101 
2102   /* Create a spiluk handle and then do symbolic factorization */
2103   nnzL = nnzU = PetscRealIntMultTruncate(info->fill, nnzA);
2104   factors->kh.create_spiluk_handle(SPILUKAlgorithm::SEQLVLSCHD_TP1, n, nnzL, nnzU);
2105 
2106   auto spiluk_handle = factors->kh.get_spiluk_handle();
2107 
2108   Kokkos::realloc(factors->iL_d, n + 1); /* Free old arrays and realloc */
2109   Kokkos::realloc(factors->jL_d, spiluk_handle->get_nnzL());
2110   Kokkos::realloc(factors->iU_d, n + 1);
2111   Kokkos::realloc(factors->jU_d, spiluk_handle->get_nnzU());
2112 
2113   aijkok   = (Mat_SeqAIJKokkos *)A->spptr;
2114   auto i_d = aijkok->i_dual.view_device();
2115   auto j_d = aijkok->j_dual.view_device();
2116   PetscCallCXX(spiluk_symbolic(&factors->kh, fill_lev, i_d, j_d, factors->iL_d, factors->jL_d, factors->iU_d, factors->jU_d));
2117   /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */
2118 
2119   Kokkos::resize(factors->jL_d, spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */
2120   Kokkos::resize(factors->jU_d, spiluk_handle->get_nnzU());
2121   Kokkos::realloc(factors->aL_d, spiluk_handle->get_nnzL()); /* No need to retain old value */
2122   Kokkos::realloc(factors->aU_d, spiluk_handle->get_nnzU());
2123 
2124   /* TODO: add options to select sptrsv algorithms */
2125   /* Create sptrsv handles for L, U and their transpose */
2126 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
2127   auto sptrsv_alg = SPTRSVAlgorithm::SPTRSV_CUSPARSE;
2128 #else
2129   auto sptrsv_alg = SPTRSVAlgorithm::SEQLVLSCHD_TP1;
2130 #endif
2131 
2132   factors->khL.create_sptrsv_handle(sptrsv_alg, n, true /* L is lower tri */);
2133   factors->khU.create_sptrsv_handle(sptrsv_alg, n, false /* U is not lower tri */);
2134   factors->khLt.create_sptrsv_handle(sptrsv_alg, n, false /* L^T is not lower tri */);
2135   factors->khUt.create_sptrsv_handle(sptrsv_alg, n, true /* U^T is lower tri */);
2136 
2137   /* Fill fields of the factor matrix B */
2138   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(B, MAT_SKIP_ALLOCATION, NULL));
2139   b     = (Mat_SeqAIJ *)B->data;
2140   b->nz = b->maxnz          = spiluk_handle->get_nnzL() + spiluk_handle->get_nnzU();
2141   B->info.fill_ratio_given  = info->fill;
2142   B->info.fill_ratio_needed = nnzA > 0 ? ((PetscReal)b->nz) / ((PetscReal)nnzA) : 1.0;
2143 
2144   B->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJKokkos_ILU0;
2145   PetscFunctionReturn(PETSC_SUCCESS);
2146 }
2147 
2148 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
2149 {
2150   PetscFunctionBegin;
2151   PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
2152   PetscCheck(!B->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr");
2153   PetscCallCXX(B->spptr = new Mat_SeqAIJKokkosTriFactors(B->rmap->n));
2154   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos;
2155   PetscFunctionReturn(PETSC_SUCCESS);
2156 }
2157 
2158 static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
2159 {
2160   PetscBool row_identity = PETSC_FALSE, col_identity = PETSC_FALSE;
2161 
2162   PetscFunctionBegin;
2163   if (!info->factoronhost) {
2164     PetscCall(ISIdentity(isrow, &row_identity));
2165     PetscCall(ISIdentity(iscol, &col_identity));
2166   }
2167 
2168   PetscCheck(!B->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr");
2169   PetscCallCXX(B->spptr = new Mat_SeqAIJKokkosTriFactors(B->rmap->n));
2170 
2171   if (!info->factoronhost && !info->levels && row_identity && col_identity) { // if level 0 and no reordering
2172     PetscCall(MatILUFactorSymbolic_SeqAIJKokkos_ILU0(B, A, isrow, iscol, info));
2173   } else {
2174     PetscCall(MatILUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info)); // otherwise, use PETSc's ILU on host
2175     B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos;
2176   }
2177   PetscFunctionReturn(PETSC_SUCCESS);
2178 }
2179 
2180 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info)
2181 {
2182   PetscFunctionBegin;
2183   PetscCall(MatSeqAIJKokkosSyncHost(A));
2184   PetscCall(MatCholeskyFactorNumeric_SeqAIJ(B, A, info));
2185 
2186   if (!info->solveonhost) { // if solve on host, then we don't need to copy L, U to device
2187     Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr;
2188     Mat_SeqAIJ                 *b       = static_cast<Mat_SeqAIJ *>(B->data);
2189     const PetscInt             *Bi = b->i, *Bj = b->j, *Bdiag = b->diag;
2190     const MatScalar            *Ba = b->a;
2191     PetscInt                    m  = B->rmap->n;
2192 
2193     if (factors->iU_h.extent(0) == 0) { // First time of numeric factorization
2194       // Allocate memory and copy the structure
2195       factors->iU_h = PetscIntKokkosViewHost(const_cast<PetscInt *>(Bi), m + 1); // wrap Bi as iU_h
2196       factors->jU_h = MatColIdxKokkosViewHost(NoInit("jU_h"), Bi[m]);
2197       factors->aU_h = MatScalarKokkosViewHost(NoInit("aU_h"), Bi[m]);
2198       factors->D_h  = MatScalarKokkosViewHost(NoInit("D_h"), m);
2199       factors->aU_d = Kokkos::create_mirror_view(DefaultMemorySpace(), factors->aU_h);
2200       factors->D_d  = Kokkos::create_mirror_view(DefaultMemorySpace(), factors->D_h);
2201 
2202       // Build jU_h from the skewed Aj
2203       PetscInt *Uj = factors->jU_h.data();
2204       for (PetscInt i = 0; i < m; i++) {
2205         PetscInt ulen = Bi[i + 1] - Bi[i];
2206         Uj[Bi[i]]     = i;                                              // diagonal entry
2207         PetscCall(PetscArraycpy(Uj + Bi[i] + 1, Bj + Bi[i], ulen - 1)); // entries of U on the right of the diagonal
2208       }
2209 
2210       // Copy iU, jU to device
2211       PetscCallCXX(factors->iU_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->iU_h));
2212       PetscCallCXX(factors->jU_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->jU_h));
2213 
2214       // Copy row/col permutation to device
2215       IS        rowperm = ((Mat_SeqAIJ *)B->data)->row;
2216       PetscBool row_identity;
2217       PetscCall(ISIdentity(rowperm, &row_identity));
2218       if (!row_identity) {
2219         const PetscInt *ip;
2220 
2221         PetscCall(ISGetIndices(rowperm, &ip));
2222         PetscCallCXX(factors->rowperm = PetscIntKokkosView(NoInit("rowperm"), m));
2223         PetscCallCXX(Kokkos::deep_copy(factors->rowperm, PetscIntKokkosViewHost(const_cast<PetscInt *>(ip), m)));
2224         PetscCall(ISRestoreIndices(rowperm, &ip));
2225         PetscCall(PetscLogCpuToGpu(m * sizeof(PetscInt)));
2226       }
2227 
2228       // Create sptrsv handles for U and U^T
2229 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
2230       auto sptrsv_alg = SPTRSVAlgorithm::SPTRSV_CUSPARSE;
2231 #else
2232       auto sptrsv_alg = SPTRSVAlgorithm::SEQLVLSCHD_TP1;
2233 #endif
2234       factors->khU.create_sptrsv_handle(sptrsv_alg, m, false /* U is not lower tri */);
2235       factors->khUt.create_sptrsv_handle(sptrsv_alg, m, true /* U^T is lower tri */);
2236     }
2237     // These pointers were set MatCholeskyFactorNumeric_SeqAIJ(), so we always need to update them
2238     B->ops->solve          = MatSolve_SeqAIJKokkos_Cholesky;
2239     B->ops->solvetranspose = MatSolve_SeqAIJKokkos_Cholesky;
2240 
2241     // Copy the value
2242     PetscScalar *Ua = factors->aU_h.data();
2243     PetscScalar *D  = factors->D_h.data();
2244     for (PetscInt i = 0; i < m; i++) {
2245       D[i]      = Ba[Bdiag[i]];     // actually Aa[Adiag[i]] is the inverse of the diagonal
2246       Ua[Bi[i]] = (PetscScalar)1.0; // set the unit diagonal for U
2247       for (PetscInt k = 0; k < Bi[i + 1] - Bi[i] - 1; k++) Ua[Bi[i] + 1 + k] = -Ba[Bi[i] + k];
2248     }
2249     PetscCallCXX(Kokkos::deep_copy(factors->aU_d, factors->aU_h));
2250     PetscCallCXX(Kokkos::deep_copy(factors->D_d, factors->D_h));
2251 
2252     factors->sptrsv_symbolic_completed = PETSC_FALSE; // When numeric value changed, we must do these again
2253     factors->transpose_updated         = PETSC_FALSE;
2254   }
2255 
2256   B->ops->matsolve          = NULL;
2257   B->ops->matsolvetranspose = NULL;
2258   PetscFunctionReturn(PETSC_SUCCESS);
2259 }
2260 
2261 static PetscErrorCode MatICCFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS perm, const MatFactorInfo *info)
2262 {
2263   PetscFunctionBegin;
2264   if (info->solveonhost) {
2265     // If solve on host, we have to change the type, as eventually we need to call MatSolve_SeqSBAIJ_1_NaturalOrdering() etc.
2266     PetscCall(MatSetType(B, MATSEQSBAIJ));
2267     PetscCall(MatSeqSBAIJSetPreallocation(B, 1, MAT_SKIP_ALLOCATION, NULL));
2268   }
2269 
2270   PetscCall(MatICCFactorSymbolic_SeqAIJ(B, A, perm, info));
2271 
2272   if (!info->solveonhost) {
2273     // If solve on device, B is still a MATSEQAIJKOKKOS, so we are good to allocate B->spptr
2274     PetscCheck(!B->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr");
2275     PetscCallCXX(B->spptr = new Mat_SeqAIJKokkosTriFactors(B->rmap->n));
2276     B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJKokkos;
2277   }
2278   PetscFunctionReturn(PETSC_SUCCESS);
2279 }
2280 
2281 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS perm, const MatFactorInfo *info)
2282 {
2283   PetscFunctionBegin;
2284   if (info->solveonhost) {
2285     // If solve on host, we have to change the type, as eventually we need to call MatSolve_SeqSBAIJ_1_NaturalOrdering() etc.
2286     PetscCall(MatSetType(B, MATSEQSBAIJ));
2287     PetscCall(MatSeqSBAIJSetPreallocation(B, 1, MAT_SKIP_ALLOCATION, NULL));
2288   }
2289 
2290   PetscCall(MatCholeskyFactorSymbolic_SeqAIJ(B, A, perm, info)); // it sets B's two ISes ((Mat_SeqAIJ*)B->data)->{row, col} to perm
2291 
2292   if (!info->solveonhost) {
2293     // If solve on device, B is still a MATSEQAIJKOKKOS, so we are good to allocate B->spptr
2294     PetscCheck(!B->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr");
2295     PetscCallCXX(B->spptr = new Mat_SeqAIJKokkosTriFactors(B->rmap->n));
2296     B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJKokkos;
2297   }
2298   PetscFunctionReturn(PETSC_SUCCESS);
2299 }
2300 
2301 // The _Kokkos suffix means we will use Kokkos as a solver for the SeqAIJKokkos matrix
2302 static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos_Kokkos(Mat A, MatSolverType *type)
2303 {
2304   PetscFunctionBegin;
2305   *type = MATSOLVERKOKKOS;
2306   PetscFunctionReturn(PETSC_SUCCESS);
2307 }
2308 
2309 /*MC
2310   MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices
2311   on a single GPU of type, `MATSEQAIJKOKKOS`, `MATAIJKOKKOS`.
2312 
2313   Level: beginner
2314 
2315 .seealso: [](ch_matrices), `Mat`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJKokkos()`, `MATAIJKOKKOS`, `MatKokkosSetFormat()`, `MatKokkosStorageFormat`, `MatKokkosFormatOperation`
2316 M*/
2317 PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A, MatFactorType ftype, Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */
2318 {
2319   PetscInt n = A->rmap->n;
2320   MPI_Comm comm;
2321 
2322   PetscFunctionBegin;
2323   PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
2324   PetscCall(MatCreate(comm, B));
2325   PetscCall(MatSetSizes(*B, n, n, n, n));
2326   PetscCall(MatSetBlockSizesFromMats(*B, A, A));
2327   (*B)->factortype = ftype;
2328   PetscCall(MatSetType(*B, MATSEQAIJKOKKOS));
2329   PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL));
2330   PetscCheck(!(*B)->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr");
2331 
2332   if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
2333     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJKokkos;
2334     (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos;
2335     PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU]));
2336     PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILU]));
2337     PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILUDT]));
2338   } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
2339     (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJKokkos;
2340     (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJKokkos;
2341     PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]));
2342     PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ICC]));
2343   } else SETERRQ(comm, PETSC_ERR_SUP, "MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]);
2344 
2345   // The factorization can use the ordering provided in MatLUFactorSymbolic(), MatCholeskyFactorSymbolic() etc, though we do it on host
2346   (*B)->canuseordering = PETSC_TRUE;
2347   PetscCall(PetscObjectComposeFunction((PetscObject)*B, "MatFactorGetSolverType_C", MatFactorGetSolverType_SeqAIJKokkos_Kokkos));
2348   PetscFunctionReturn(PETSC_SUCCESS);
2349 }
2350 
2351 PETSC_INTERN PetscErrorCode MatSolverTypeRegister_Kokkos(void)
2352 {
2353   PetscFunctionBegin;
2354   PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_LU, MatGetFactor_SeqAIJKokkos_Kokkos));
2355   PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_CHOLESKY, MatGetFactor_SeqAIJKokkos_Kokkos));
2356   PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_ILU, MatGetFactor_SeqAIJKokkos_Kokkos));
2357   PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_ICC, MatGetFactor_SeqAIJKokkos_Kokkos));
2358   PetscFunctionReturn(PETSC_SUCCESS);
2359 }
2360 
2361 /* Utility to print out a KokkosCsrMatrix for debugging */
2362 PETSC_INTERN PetscErrorCode PrintCsrMatrix(const KokkosCsrMatrix &csrmat)
2363 {
2364   const auto        &iv = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), csrmat.graph.row_map);
2365   const auto        &jv = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), csrmat.graph.entries);
2366   const auto        &av = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), csrmat.values);
2367   const PetscInt    *i  = iv.data();
2368   const PetscInt    *j  = jv.data();
2369   const PetscScalar *a  = av.data();
2370   PetscInt           m = csrmat.numRows(), n = csrmat.numCols(), nnz = csrmat.nnz();
2371 
2372   PetscFunctionBegin;
2373   PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT " x %" PetscInt_FMT " SeqAIJKokkos, with %" PetscInt_FMT " nonzeros\n", m, n, nnz));
2374   for (PetscInt k = 0; k < m; k++) {
2375     PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT ": ", k));
2376     for (PetscInt p = i[k]; p < i[k + 1]; p++) PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT "(%.1f), ", j[p], (double)PetscRealPart(a[p])));
2377     PetscCall(PetscPrintf(PETSC_COMM_SELF, "\n"));
2378   }
2379   PetscFunctionReturn(PETSC_SUCCESS);
2380 }
2381