xref: /petsc/src/mat/impls/aij/seq/kokkos/aijkok.kokkos.cxx (revision 8d8740d487f85fbc55ead29661dd8ca8eb31cb0c)
1 #include <petsc_kokkos.hpp>
2 #include <petscvec_kokkos.hpp>
3 #include <petscmat_kokkos.hpp>
4 #include <petscpkg_version.h>
5 #include <petsc/private/petscimpl.h>
6 #include <petsc/private/sfimpl.h>
7 #include <petsc/private/kokkosimpl.hpp>
8 #include <petscsys.h>
9 
10 #include <Kokkos_Core.hpp>
11 #include <KokkosBlas.hpp>
12 #include <KokkosSparse_CrsMatrix.hpp>
13 
14 // To suppress compiler warnings:
15 // /path/include/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp:434:63:
16 // warning: 'cusparseStatus_t cusparseDbsrmm(cusparseHandle_t, cusparseDirection_t, cusparseOperation_t,
17 // cusparseOperation_t, int, int, int, int, const double*, cusparseMatDescr_t, const double*, const int*, const int*,
18 // int, const double*, int, const double*, double*, int)' is deprecated: please use cusparseSpMM instead [-Wdeprecated-declarations]
19 PETSC_PRAGMA_DIAGNOSTIC_IGNORED_BEGIN("-Wdeprecated-declarations")
20 #include <KokkosSparse_spmv.hpp>
21 PETSC_PRAGMA_DIAGNOSTIC_IGNORED_END()
22 
23 #include <KokkosSparse_spiluk.hpp>
24 #include <KokkosSparse_sptrsv.hpp>
25 #include <KokkosSparse_spgemm.hpp>
26 #include <KokkosSparse_spadd.hpp>
27 #include <KokkosBatched_LU_Decl.hpp>
28 #include <KokkosBatched_InverseLU_Decl.hpp>
29 
30 #include <../src/mat/impls/aij/seq/kokkos/aijkok.hpp>
31 
32 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_GE(3, 7, 0)
33   #include <KokkosSparse_Utils.hpp>
34 using KokkosSparse::sort_crs_matrix;
35 using KokkosSparse::Impl::transpose_matrix;
36 #else
37   #include <KokkosKernels_Sorting.hpp>
38 using KokkosKernels::sort_crs_matrix;
39 using KokkosKernels::Impl::transpose_matrix;
40 #endif
41 
42 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_GE(4, 6, 0)
43 using KokkosSparse::spiluk_symbolic;
44 using KokkosSparse::spiluk_numeric;
45 using KokkosSparse::sptrsv_symbolic;
46 using KokkosSparse::sptrsv_solve;
47 using KokkosSparse::Experimental::SPTRSVAlgorithm;
48 using KokkosSparse::Experimental::SPILUKAlgorithm;
49 #else
50 using KokkosSparse::Experimental::spiluk_symbolic;
51 using KokkosSparse::Experimental::spiluk_numeric;
52 using KokkosSparse::Experimental::sptrsv_symbolic;
53 using KokkosSparse::Experimental::sptrsv_solve;
54 using KokkosSparse::Experimental::SPTRSVAlgorithm;
55 using KokkosSparse::Experimental::SPILUKAlgorithm;
56 #endif
57 
58 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */
59 
60 /* MatAssemblyEnd_SeqAIJKokkos() happens when we finalized nonzeros of the matrix, either after
61    we assembled the matrix on host, or after we directly produced the matrix data on device (ex., through MatMatMult).
62    In the latter case, it is important to set a_dual's sync state correctly.
63  */
64 static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A, MatAssemblyType mode)
65 {
66   Mat_SeqAIJ       *aijseq;
67   Mat_SeqAIJKokkos *aijkok;
68 
69   PetscFunctionBegin;
70   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS);
71   PetscCall(MatAssemblyEnd_SeqAIJ(A, mode));
72 
73   aijseq = static_cast<Mat_SeqAIJ *>(A->data);
74   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
75 
76   /* If aijkok does not exist, we just copy i, j to device.
77      If aijkok already exists, but the device's nonzero pattern does not match with the host's, we assume the latest data is on host.
78      In both cases, we build a new aijkok structure.
79   */
80   if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { /* aijkok might not exist yet or nonzero pattern has changed */
81     delete aijkok;
82     aijkok   = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aijseq, A->nonzerostate, PETSC_FALSE /*don't copy mat values to device*/);
83     A->spptr = aijkok;
84   } else if (A->rmap->n && aijkok->diag_dual.extent(0) == 0) { // MatProduct might directly produce AIJ on device, but not the diag.
85     MatRowMapKokkosViewHost diag_h(aijseq->diag, A->rmap->n);
86     auto                    diag_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), diag_h);
87     aijkok->diag_dual              = MatRowMapKokkosDualView(diag_d, diag_h);
88   }
89   PetscFunctionReturn(PETSC_SUCCESS);
90 }
91 
92 /* Sync CSR data to device if not yet */
93 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A)
94 {
95   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
96 
97   PetscFunctionBegin;
98   PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Can't sync factorized matrix from host to device");
99   PetscCheck(aijkok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
100   if (aijkok->a_dual.need_sync_device()) {
101     PetscCall(KokkosDualViewSyncDevice(aijkok->a_dual, PetscGetKokkosExecutionSpace()));
102     aijkok->transpose_updated = PETSC_FALSE; /* values of the transpose is out-of-date */
103     aijkok->hermitian_updated = PETSC_FALSE;
104   }
105   PetscFunctionReturn(PETSC_SUCCESS);
106 }
107 
108 /* Mark the CSR data on device as modified */
109 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosModifyDevice(Mat A)
110 {
111   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
112 
113   PetscFunctionBegin;
114   PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Not supported for factorized matries");
115   aijkok->a_dual.clear_sync_state();
116   aijkok->a_dual.modify_device();
117   aijkok->transpose_updated = PETSC_FALSE;
118   aijkok->hermitian_updated = PETSC_FALSE;
119   PetscCall(MatSeqAIJInvalidateDiagonal(A));
120   PetscCall(PetscObjectStateIncrease((PetscObject)A));
121   PetscFunctionReturn(PETSC_SUCCESS);
122 }
123 
124 static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A)
125 {
126   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
127   auto              exec   = PetscGetKokkosExecutionSpace();
128 
129   PetscFunctionBegin;
130   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
131   /* We do not expect one needs factors on host  */
132   PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Can't sync factorized matrix from device to host");
133   PetscCheck(aijkok, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing AIJKOK");
134   PetscCall(KokkosDualViewSyncHost(aijkok->a_dual, exec));
135   PetscFunctionReturn(PETSC_SUCCESS);
136 }
137 
138 static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A, PetscScalar *array[])
139 {
140   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
141 
142   PetscFunctionBegin;
143   /* aijkok contains valid pointers only if the host's nonzerostate matches with the device's.
144     Calling MatSeqAIJSetPreallocation() or MatSetValues() on host, where aijseq->{i,j,a} might be
145     reallocated, will lead to stale {i,j,a}_dual in aijkok. In both operations, the hosts's nonzerostate
146     must have been updated. The stale aijkok will be rebuilt during MatAssemblyEnd.
147   */
148   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
149     PetscCall(KokkosDualViewSyncHost(aijkok->a_dual, PetscGetKokkosExecutionSpace()));
150     *array = aijkok->a_dual.view_host().data();
151   } else { /* Happens when calling MatSetValues on a newly created matrix */
152     *array = static_cast<Mat_SeqAIJ *>(A->data)->a;
153   }
154   PetscFunctionReturn(PETSC_SUCCESS);
155 }
156 
157 static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJKokkos(Mat A, PetscScalar *array[])
158 {
159   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
160 
161   PetscFunctionBegin;
162   if (aijkok && A->nonzerostate == aijkok->nonzerostate) aijkok->a_dual.modify_host();
163   PetscFunctionReturn(PETSC_SUCCESS);
164 }
165 
166 static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[])
167 {
168   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
169 
170   PetscFunctionBegin;
171   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
172     PetscCall(KokkosDualViewSyncHost(aijkok->a_dual, PetscGetKokkosExecutionSpace()));
173     *array = aijkok->a_dual.view_host().data();
174   } else {
175     *array = static_cast<Mat_SeqAIJ *>(A->data)->a;
176   }
177   PetscFunctionReturn(PETSC_SUCCESS);
178 }
179 
180 static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[])
181 {
182   PetscFunctionBegin;
183   *array = NULL;
184   PetscFunctionReturn(PETSC_SUCCESS);
185 }
186 
187 static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[])
188 {
189   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
190 
191   PetscFunctionBegin;
192   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
193     *array = aijkok->a_dual.view_host().data();
194   } else { /* Ex. happens with MatZeroEntries on a preallocated but not assembled matrix */
195     *array = static_cast<Mat_SeqAIJ *>(A->data)->a;
196   }
197   PetscFunctionReturn(PETSC_SUCCESS);
198 }
199 
200 static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[])
201 {
202   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
203 
204   PetscFunctionBegin;
205   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
206     aijkok->a_dual.clear_sync_state();
207     aijkok->a_dual.modify_host();
208   }
209   PetscFunctionReturn(PETSC_SUCCESS);
210 }
211 
212 static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJKokkos(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype)
213 {
214   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
215 
216   PetscFunctionBegin;
217   PetscCheck(aijkok != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "aijkok is NULL");
218 
219   if (i) *i = aijkok->i_device_data();
220   if (j) *j = aijkok->j_device_data();
221   if (a) {
222     PetscCall(KokkosDualViewSyncDevice(aijkok->a_dual, PetscGetKokkosExecutionSpace()));
223     *a = aijkok->a_device_data();
224   }
225   if (mtype) *mtype = PETSC_MEMTYPE_KOKKOS;
226   PetscFunctionReturn(PETSC_SUCCESS);
227 }
228 
229 /*
230   Generate the sparsity pattern of a MatSeqAIJKokkos matrix's transpose on device.
231 
232   Input Parameter:
233 .  A       - the MATSEQAIJKOKKOS matrix
234 
235   Output Parameters:
236 +  perm_d - the permutation array on device, which connects Ta(i) = Aa(perm(i))
237 -  T_d    - the transpose on device, whose value array is allocated but not initialized
238 */
239 static PetscErrorCode MatSeqAIJKokkosGenerateTransposeStructure(Mat A, MatRowMapKokkosView &perm_d, KokkosCsrMatrix &T_d)
240 {
241   Mat_SeqAIJ             *aseq = static_cast<Mat_SeqAIJ *>(A->data);
242   PetscInt                nz = aseq->nz, m = A->rmap->N, n = A->cmap->n;
243   const PetscInt         *Ai = aseq->i, *Aj = aseq->j;
244   MatRowMapKokkosViewHost Ti_h(NoInit("Ti"), n + 1);
245   MatRowMapType          *Ti = Ti_h.data();
246   MatColIdxKokkosViewHost Tj_h(NoInit("Tj"), nz);
247   MatRowMapKokkosViewHost perm_h(NoInit("permutation"), nz);
248   PetscInt               *Tj   = Tj_h.data();
249   PetscInt               *perm = perm_h.data();
250   PetscInt               *offset;
251 
252   PetscFunctionBegin;
253   // Populate Ti
254   PetscCallCXX(Kokkos::deep_copy(Ti_h, 0));
255   Ti++;
256   for (PetscInt i = 0; i < nz; i++) Ti[Aj[i]]++;
257   Ti--;
258   for (PetscInt i = 0; i < n; i++) Ti[i + 1] += Ti[i];
259 
260   // Populate Tj and the permutation array
261   PetscCall(PetscCalloc1(n, &offset)); // offset in each T row to fill in its column indices
262   for (PetscInt i = 0; i < m; i++) {
263     for (PetscInt j = Ai[i]; j < Ai[i + 1]; j++) { // A's (i,j) is T's (j,i)
264       PetscInt r    = Aj[j];                       // row r of T
265       PetscInt disp = Ti[r] + offset[r];
266 
267       Tj[disp]   = i; // col i of T
268       perm[disp] = j;
269       offset[r]++;
270     }
271   }
272   PetscCall(PetscFree(offset));
273 
274   // Sort each row of T, along with the permutation array
275   for (PetscInt i = 0; i < n; i++) PetscCall(PetscSortIntWithArray(Ti[i + 1] - Ti[i], Tj + Ti[i], perm + Ti[i]));
276 
277   // Output perm and T on device
278   auto Ti_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), Ti_h);
279   auto Tj_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), Tj_h);
280   PetscCallCXX(T_d = KokkosCsrMatrix("csrmatT", n, m, nz, MatScalarKokkosView("Ta", nz), Ti_d, Tj_d));
281   PetscCallCXX(perm_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), perm_h));
282   PetscFunctionReturn(PETSC_SUCCESS);
283 }
284 
285 // Generate the transpose on device and cache it internally
286 // Note: KK transpose_matrix() does not have support symbolic/numeric transpose, so we do it on our own
287 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGenerateTranspose_Private(Mat A, KokkosCsrMatrix *csrmatT)
288 {
289   Mat_SeqAIJ       *aseq = static_cast<Mat_SeqAIJ *>(A->data);
290   Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
291   PetscInt          nz = aseq->nz, m = A->rmap->N, n = A->cmap->n;
292   KokkosCsrMatrix  &T = akok->csrmatT;
293 
294   PetscFunctionBegin;
295   PetscCheck(akok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
296   PetscCall(KokkosDualViewSyncDevice(akok->a_dual, PetscGetKokkosExecutionSpace())); // Sync A's values since we are going to access them on device
297 
298   const auto &Aa = akok->a_dual.view_device();
299 
300   if (A->symmetric == PETSC_BOOL3_TRUE) {
301     *csrmatT = akok->csrmat;
302   } else {
303     // See if we already have a cached transpose and its value is up to date
304     if (T.numRows() == n && T.numCols() == m) {  // this indicates csrmatT had been generated before, otherwise T has 0 rows/cols after construction
305       if (!akok->transpose_updated) {            // if the value is out of date, update the cached version
306         const auto &perm = akok->transpose_perm; // get the permutation array
307         auto       &Ta   = T.values;
308 
309         PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { Ta(i) = Aa(perm(i)); }));
310       }
311     } else { // Generate T of size n x m for the first time
312       MatRowMapKokkosView perm;
313 
314       PetscCall(MatSeqAIJKokkosGenerateTransposeStructure(A, perm, T));
315       akok->transpose_perm = perm; // cache the perm in this matrix for reuse
316       PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { T.values(i) = Aa(perm(i)); }));
317     }
318     akok->transpose_updated = PETSC_TRUE;
319     *csrmatT                = akok->csrmatT;
320   }
321   PetscFunctionReturn(PETSC_SUCCESS);
322 }
323 
324 // Generate the Hermitian on device and cache it internally
325 static PetscErrorCode MatSeqAIJKokkosGenerateHermitian_Private(Mat A, KokkosCsrMatrix *csrmatH)
326 {
327   Mat_SeqAIJ       *aseq = static_cast<Mat_SeqAIJ *>(A->data);
328   Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
329   PetscInt          nz = aseq->nz, m = A->rmap->N, n = A->cmap->n;
330   KokkosCsrMatrix  &T = akok->csrmatH;
331 
332   PetscFunctionBegin;
333   PetscCheck(akok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
334   PetscCall(KokkosDualViewSyncDevice(akok->a_dual, PetscGetKokkosExecutionSpace())); // Sync A's values since we are going to access them on device
335 
336   const auto &Aa = akok->a_dual.view_device();
337 
338   if (A->hermitian == PETSC_BOOL3_TRUE) {
339     *csrmatH = akok->csrmat;
340   } else {
341     // See if we already have a cached hermitian and its value is up to date
342     if (T.numRows() == n && T.numCols() == m) {  // this indicates csrmatT had been generated before, otherwise T has 0 rows/cols after construction
343       if (!akok->hermitian_updated) {            // if the value is out of date, update the cached version
344         const auto &perm = akok->transpose_perm; // get the permutation array
345         auto       &Ta   = T.values;
346 
347         PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { Ta(i) = PetscConj(Aa(perm(i))); }));
348       }
349     } else { // Generate T of size n x m for the first time
350       MatRowMapKokkosView perm;
351 
352       PetscCall(MatSeqAIJKokkosGenerateTransposeStructure(A, perm, T));
353       akok->transpose_perm = perm; // cache the perm in this matrix for reuse
354       PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { T.values(i) = PetscConj(Aa(perm(i))); }));
355     }
356     akok->hermitian_updated = PETSC_TRUE;
357     *csrmatH                = akok->csrmatH;
358   }
359   PetscFunctionReturn(PETSC_SUCCESS);
360 }
361 
362 /* y = A x */
363 static PetscErrorCode MatMult_SeqAIJKokkos(Mat A, Vec xx, Vec yy)
364 {
365   Mat_SeqAIJKokkos          *aijkok;
366   ConstPetscScalarKokkosView xv;
367   PetscScalarKokkosView      yv;
368 
369   PetscFunctionBegin;
370   PetscCall(PetscLogGpuTimeBegin());
371   PetscCall(MatSeqAIJKokkosSyncDevice(A));
372   PetscCall(VecGetKokkosView(xx, &xv));
373   PetscCall(VecGetKokkosViewWrite(yy, &yv));
374   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
375   PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), "N", 1.0 /*alpha*/, aijkok->csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A x + beta y */
376   PetscCall(VecRestoreKokkosView(xx, &xv));
377   PetscCall(VecRestoreKokkosViewWrite(yy, &yv));
378   /* 2.0*nnz - numRows seems more accurate here but assumes there are no zero-rows. So a little sloppy here. */
379   PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz()));
380   PetscCall(PetscLogGpuTimeEnd());
381   PetscFunctionReturn(PETSC_SUCCESS);
382 }
383 
384 /* y = A^T x */
385 static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy)
386 {
387   Mat_SeqAIJKokkos          *aijkok;
388   const char                *mode;
389   ConstPetscScalarKokkosView xv;
390   PetscScalarKokkosView      yv;
391   KokkosCsrMatrix            csrmat;
392 
393   PetscFunctionBegin;
394   PetscCall(PetscLogGpuTimeBegin());
395   PetscCall(MatSeqAIJKokkosSyncDevice(A));
396   PetscCall(VecGetKokkosView(xx, &xv));
397   PetscCall(VecGetKokkosViewWrite(yy, &yv));
398   if (A->form_explicit_transpose) {
399     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat));
400     mode = "N";
401   } else {
402     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
403     csrmat = aijkok->csrmat;
404     mode   = "T";
405   }
406   PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^T x + beta y */
407   PetscCall(VecRestoreKokkosView(xx, &xv));
408   PetscCall(VecRestoreKokkosViewWrite(yy, &yv));
409   PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
410   PetscCall(PetscLogGpuTimeEnd());
411   PetscFunctionReturn(PETSC_SUCCESS);
412 }
413 
414 /* y = A^H x */
415 static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy)
416 {
417   Mat_SeqAIJKokkos          *aijkok;
418   const char                *mode;
419   ConstPetscScalarKokkosView xv;
420   PetscScalarKokkosView      yv;
421   KokkosCsrMatrix            csrmat;
422 
423   PetscFunctionBegin;
424   PetscCall(PetscLogGpuTimeBegin());
425   PetscCall(MatSeqAIJKokkosSyncDevice(A));
426   PetscCall(VecGetKokkosView(xx, &xv));
427   PetscCall(VecGetKokkosViewWrite(yy, &yv));
428   if (A->form_explicit_transpose) {
429     PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat));
430     mode = "N";
431   } else {
432     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
433     csrmat = aijkok->csrmat;
434     mode   = "C";
435   }
436   PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^H x + beta y */
437   PetscCall(VecRestoreKokkosView(xx, &xv));
438   PetscCall(VecRestoreKokkosViewWrite(yy, &yv));
439   PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
440   PetscCall(PetscLogGpuTimeEnd());
441   PetscFunctionReturn(PETSC_SUCCESS);
442 }
443 
444 /* z = A x + y */
445 static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz)
446 {
447   Mat_SeqAIJKokkos          *aijkok;
448   ConstPetscScalarKokkosView xv;
449   PetscScalarKokkosView      zv;
450 
451   PetscFunctionBegin;
452   PetscCall(PetscLogGpuTimeBegin());
453   PetscCall(MatSeqAIJKokkosSyncDevice(A));
454   if (zz != yy) PetscCall(VecCopy(yy, zz)); // depending on yy's sync flags, zz might get its latest data on host
455   PetscCall(VecGetKokkosView(xx, &xv));
456   PetscCall(VecGetKokkosView(zz, &zv)); // do after VecCopy(yy, zz) to get the latest data on device
457   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
458   PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), "N", 1.0 /*alpha*/, aijkok->csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A x + beta z */
459   PetscCall(VecRestoreKokkosView(xx, &xv));
460   PetscCall(VecRestoreKokkosView(zz, &zv));
461   PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz()));
462   PetscCall(PetscLogGpuTimeEnd());
463   PetscFunctionReturn(PETSC_SUCCESS);
464 }
465 
466 /* z = A^T x + y */
467 static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz)
468 {
469   Mat_SeqAIJKokkos          *aijkok;
470   const char                *mode;
471   ConstPetscScalarKokkosView xv;
472   PetscScalarKokkosView      zv;
473   KokkosCsrMatrix            csrmat;
474 
475   PetscFunctionBegin;
476   PetscCall(PetscLogGpuTimeBegin());
477   PetscCall(MatSeqAIJKokkosSyncDevice(A));
478   if (zz != yy) PetscCall(VecCopy(yy, zz));
479   PetscCall(VecGetKokkosView(xx, &xv));
480   PetscCall(VecGetKokkosView(zz, &zv));
481   if (A->form_explicit_transpose) {
482     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat));
483     mode = "N";
484   } else {
485     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
486     csrmat = aijkok->csrmat;
487     mode   = "T";
488   }
489   PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^T x + beta z */
490   PetscCall(VecRestoreKokkosView(xx, &xv));
491   PetscCall(VecRestoreKokkosView(zz, &zv));
492   PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
493   PetscCall(PetscLogGpuTimeEnd());
494   PetscFunctionReturn(PETSC_SUCCESS);
495 }
496 
497 /* z = A^H x + y */
498 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz)
499 {
500   Mat_SeqAIJKokkos          *aijkok;
501   const char                *mode;
502   ConstPetscScalarKokkosView xv;
503   PetscScalarKokkosView      zv;
504   KokkosCsrMatrix            csrmat;
505 
506   PetscFunctionBegin;
507   PetscCall(PetscLogGpuTimeBegin());
508   PetscCall(MatSeqAIJKokkosSyncDevice(A));
509   if (zz != yy) PetscCall(VecCopy(yy, zz));
510   PetscCall(VecGetKokkosView(xx, &xv));
511   PetscCall(VecGetKokkosView(zz, &zv));
512   if (A->form_explicit_transpose) {
513     PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat));
514     mode = "N";
515   } else {
516     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
517     csrmat = aijkok->csrmat;
518     mode   = "C";
519   }
520   PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^H x + beta z */
521   PetscCall(VecRestoreKokkosView(xx, &xv));
522   PetscCall(VecRestoreKokkosView(zz, &zv));
523   PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
524   PetscCall(PetscLogGpuTimeEnd());
525   PetscFunctionReturn(PETSC_SUCCESS);
526 }
527 
528 static PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A, MatOption op, PetscBool flg)
529 {
530   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
531 
532   PetscFunctionBegin;
533   switch (op) {
534   case MAT_FORM_EXPLICIT_TRANSPOSE:
535     /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
536     if (A->form_explicit_transpose && !flg && aijkok) PetscCall(aijkok->DestroyMatTranspose());
537     A->form_explicit_transpose = flg;
538     break;
539   default:
540     PetscCall(MatSetOption_SeqAIJ(A, op, flg));
541     break;
542   }
543   PetscFunctionReturn(PETSC_SUCCESS);
544 }
545 
546 /* Depending on reuse, either build a new mat, or use the existing mat */
547 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat *newmat)
548 {
549   Mat_SeqAIJ *aseq;
550 
551   PetscFunctionBegin;
552   PetscCall(PetscKokkosInitializeCheck());
553   if (reuse == MAT_INITIAL_MATRIX) {                      /* Build a brand new mat */
554     PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat));  /* the returned newmat is a SeqAIJKokkos */
555   } else if (reuse == MAT_REUSE_MATRIX) {                 /* Reuse the mat created before */
556     PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN)); /* newmat is already a SeqAIJKokkos */
557   } else if (reuse == MAT_INPLACE_MATRIX) {               /* newmat is A */
558     PetscCheck(A == *newmat, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "A != *newmat with MAT_INPLACE_MATRIX");
559     PetscCall(PetscFree(A->defaultvectype));
560     PetscCall(PetscStrallocpy(VECKOKKOS, &A->defaultvectype)); /* Allocate and copy the string */
561     PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATSEQAIJKOKKOS));
562     PetscCall(MatSetOps_SeqAIJKokkos(A));
563     aseq = static_cast<Mat_SeqAIJ *>(A->data);
564     if (A->assembled) { /* Copy i, j (but not values) to device for an assembled matrix if not yet */
565       PetscCheck(!A->spptr, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Expect NULL (Mat_SeqAIJKokkos*)A->spptr");
566       A->spptr = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aseq, A->nonzerostate, PETSC_FALSE);
567     }
568   }
569   PetscFunctionReturn(PETSC_SUCCESS);
570 }
571 
572 /* MatDuplicate always creates a new matrix. MatDuplicate can be called either on an assembled matrix or
573    an unassembled matrix, even though MAT_COPY_VALUES is not allowed for unassembled matrix.
574  */
575 static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A, MatDuplicateOption dupOption, Mat *B)
576 {
577   Mat_SeqAIJ       *bseq;
578   Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr), *bkok;
579   Mat               mat;
580 
581   PetscFunctionBegin;
582   /* Do not copy values on host as A's latest values might be on device. We don't want to do sync blindly */
583   PetscCall(MatDuplicate_SeqAIJ(A, MAT_DO_NOT_COPY_VALUES, B));
584   mat = *B;
585   if (A->assembled) {
586     bseq = static_cast<Mat_SeqAIJ *>(mat->data);
587     bkok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, bseq, mat->nonzerostate, PETSC_FALSE);
588     bkok->a_dual.clear_sync_state(); /* Clear B's sync state as it will be decided below */
589     /* Now copy values to B if needed */
590     if (dupOption == MAT_COPY_VALUES) {
591       if (akok->a_dual.need_sync_device()) {
592         Kokkos::deep_copy(bkok->a_dual.view_host(), akok->a_dual.view_host());
593         bkok->a_dual.modify_host();
594       } else { /* If device has the latest data, we only copy data on device */
595         Kokkos::deep_copy(bkok->a_dual.view_device(), akok->a_dual.view_device());
596         bkok->a_dual.modify_device();
597       }
598     } else { /* MAT_DO_NOT_COPY_VALUES or MAT_SHARE_NONZERO_PATTERN. B's values should be zeroed */
599       /* B's values on host should be already zeroed by MatDuplicate_SeqAIJ() */
600       bkok->a_dual.modify_host();
601     }
602     mat->spptr = bkok;
603   }
604 
605   PetscCall(PetscFree(mat->defaultvectype));
606   PetscCall(PetscStrallocpy(VECKOKKOS, &mat->defaultvectype)); /* Allocate and copy the string */
607   PetscCall(PetscObjectChangeTypeName((PetscObject)mat, MATSEQAIJKOKKOS));
608   PetscCall(MatSetOps_SeqAIJKokkos(mat));
609   PetscFunctionReturn(PETSC_SUCCESS);
610 }
611 
612 static PetscErrorCode MatTranspose_SeqAIJKokkos(Mat A, MatReuse reuse, Mat *B)
613 {
614   Mat               At;
615   KokkosCsrMatrix   internT;
616   Mat_SeqAIJKokkos *atkok, *bkok;
617 
618   PetscFunctionBegin;
619   if (reuse == MAT_REUSE_MATRIX) PetscCall(MatTransposeCheckNonzeroState_Private(A, *B));
620   PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &internT)); /* Generate a transpose internally */
621   if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_INPLACE_MATRIX) {
622     /* Deep copy internT, as we want to isolate the internal transpose */
623     PetscCallCXX(atkok = new Mat_SeqAIJKokkos(KokkosCsrMatrix("csrmat", internT)));
624     PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PetscObjectComm((PetscObject)A), atkok, &At));
625     if (reuse == MAT_INITIAL_MATRIX) *B = At;
626     else PetscCall(MatHeaderReplace(A, &At)); /* Replace A with At inplace */
627   } else {                                    /* MAT_REUSE_MATRIX, just need to copy values to B on device */
628     if ((*B)->assembled) {
629       bkok = static_cast<Mat_SeqAIJKokkos *>((*B)->spptr);
630       PetscCallCXX(Kokkos::deep_copy(bkok->a_dual.view_device(), internT.values));
631       PetscCall(MatSeqAIJKokkosModifyDevice(*B));
632     } else if ((*B)->preallocated) { /* It is ok for B to be only preallocated, as needed in MatTranspose_MPIAIJ */
633       Mat_SeqAIJ             *bseq = static_cast<Mat_SeqAIJ *>((*B)->data);
634       MatScalarKokkosViewHost a_h(bseq->a, internT.nnz()); /* bseq->nz = 0 if unassembled */
635       MatColIdxKokkosViewHost j_h(bseq->j, internT.nnz());
636       PetscCallCXX(Kokkos::deep_copy(a_h, internT.values));
637       PetscCallCXX(Kokkos::deep_copy(j_h, internT.graph.entries));
638     } else SETERRQ(PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "B must be assembled or preallocated");
639   }
640   PetscFunctionReturn(PETSC_SUCCESS);
641 }
642 
643 static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A)
644 {
645   Mat_SeqAIJKokkos *aijkok;
646 
647   PetscFunctionBegin;
648   if (A->factortype == MAT_FACTOR_NONE) {
649     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
650     delete aijkok;
651   } else {
652     delete static_cast<Mat_SeqAIJKokkosTriFactors *>(A->spptr);
653   }
654   A->spptr = NULL;
655   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatFactorGetSolverType_C", NULL));
656   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
657   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
658 #if defined(PETSC_HAVE_HYPRE)
659   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijkokkos_hypre_C", NULL));
660 #endif
661   PetscCall(MatDestroy_SeqAIJ(A));
662   PetscFunctionReturn(PETSC_SUCCESS);
663 }
664 
665 /*MC
666    MATSEQAIJKOKKOS - MATAIJKOKKOS = "(seq)aijkokkos" - A matrix type to be used for sparse matrices with Kokkos
667 
668    A matrix type using Kokkos-Kernels CrsMatrix type for portability across different device types
669 
670    Options Database Key:
671 .  -mat_type aijkokkos - sets the matrix type to `MATSEQAIJKOKKOS` during a call to `MatSetFromOptions()`
672 
673   Level: beginner
674 
675 .seealso: [](ch_matrices), `Mat`, `MatCreateSeqAIJKokkos()`, `MATMPIAIJKOKKOS`
676 M*/
677 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A)
678 {
679   PetscFunctionBegin;
680   PetscCall(PetscKokkosInitializeCheck());
681   PetscCall(MatCreate_SeqAIJ(A));
682   PetscCall(MatConvert_SeqAIJ_SeqAIJKokkos(A, MATSEQAIJKOKKOS, MAT_INPLACE_MATRIX, &A));
683   PetscFunctionReturn(PETSC_SUCCESS);
684 }
685 
686 /* Merge A, B into a matrix C. A is put before B. C's size would be A->rmap->n by (A->cmap->n + B->cmap->n) */
687 PetscErrorCode MatSeqAIJKokkosMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C)
688 {
689   Mat_SeqAIJ         *a, *b;
690   Mat_SeqAIJKokkos   *akok, *bkok, *ckok;
691   MatScalarKokkosView aa, ba, ca;
692   MatRowMapKokkosView ai, bi, ci;
693   MatColIdxKokkosView aj, bj, cj;
694   PetscInt            m, n, nnz, aN;
695 
696   PetscFunctionBegin;
697   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
698   PetscValidHeaderSpecific(B, MAT_CLASSID, 2);
699   PetscAssertPointer(C, 4);
700   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
701   PetscCheckTypeName(B, MATSEQAIJKOKKOS);
702   PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n);
703   PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported");
704 
705   PetscCall(MatSeqAIJKokkosSyncDevice(A));
706   PetscCall(MatSeqAIJKokkosSyncDevice(B));
707   a    = static_cast<Mat_SeqAIJ *>(A->data);
708   b    = static_cast<Mat_SeqAIJ *>(B->data);
709   akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
710   bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
711   aa   = akok->a_dual.view_device();
712   ai   = akok->i_dual.view_device();
713   ba   = bkok->a_dual.view_device();
714   bi   = bkok->i_dual.view_device();
715   m    = A->rmap->n; /* M, N and nnz of C */
716   n    = A->cmap->n + B->cmap->n;
717   nnz  = a->nz + b->nz;
718   aN   = A->cmap->n; /* N of A */
719   if (reuse == MAT_INITIAL_MATRIX) {
720     aj           = akok->j_dual.view_device();
721     bj           = bkok->j_dual.view_device();
722     auto ca_dual = MatScalarKokkosDualView("a", aa.extent(0) + ba.extent(0));
723     auto ci_dual = MatRowMapKokkosDualView("i", ai.extent(0));
724     auto cj_dual = MatColIdxKokkosDualView("j", aj.extent(0) + bj.extent(0));
725     ca           = ca_dual.view_device();
726     ci           = ci_dual.view_device();
727     cj           = cj_dual.view_device();
728 
729     /* Concatenate A and B in parallel using Kokkos hierarchical parallelism */
730     Kokkos::parallel_for(
731       Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
732         PetscInt i       = t.league_rank(); /* row i */
733         PetscInt coffset = ai(i) + bi(i), alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i);
734 
735         Kokkos::single(Kokkos::PerTeam(t), [=]() { /* this side effect only happens once per whole team */
736                                                    ci(i) = coffset;
737                                                    if (i == m - 1) ci(m) = ai(m) + bi(m);
738         });
739 
740         Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) {
741           if (k < alen) {
742             ca(coffset + k) = aa(ai(i) + k);
743             cj(coffset + k) = aj(ai(i) + k);
744           } else {
745             ca(coffset + k) = ba(bi(i) + k - alen);
746             cj(coffset + k) = bj(bi(i) + k - alen) + aN; /* Entries in B get new column indices in C */
747           }
748         });
749       });
750     ca_dual.modify_device();
751     ci_dual.modify_device();
752     cj_dual.modify_device();
753     PetscCallCXX(ckok = new Mat_SeqAIJKokkos(m, n, nnz, ci_dual, cj_dual, ca_dual));
754     PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, ckok, C));
755   } else if (reuse == MAT_REUSE_MATRIX) {
756     PetscValidHeaderSpecific(*C, MAT_CLASSID, 4);
757     PetscCheckTypeName(*C, MATSEQAIJKOKKOS);
758     ckok = static_cast<Mat_SeqAIJKokkos *>((*C)->spptr);
759     ca   = ckok->a_dual.view_device();
760     ci   = ckok->i_dual.view_device();
761 
762     Kokkos::parallel_for(
763       Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
764         PetscInt i    = t.league_rank(); /* row i */
765         PetscInt alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i);
766         Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) {
767           if (k < alen) ca(ci(i) + k) = aa(ai(i) + k);
768           else ca(ci(i) + k) = ba(bi(i) + k - alen);
769         });
770       });
771     PetscCall(MatSeqAIJKokkosModifyDevice(*C));
772   }
773   PetscFunctionReturn(PETSC_SUCCESS);
774 }
775 
776 static PetscErrorCode MatProductDataDestroy_SeqAIJKokkos(void *pdata)
777 {
778   PetscFunctionBegin;
779   delete static_cast<MatProductData_SeqAIJKokkos *>(pdata);
780   PetscFunctionReturn(PETSC_SUCCESS);
781 }
782 
783 static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C)
784 {
785   Mat_Product                 *product = C->product;
786   Mat                          A, B;
787   bool                         transA, transB; /* use bool, since KK needs this type */
788   Mat_SeqAIJKokkos            *akok, *bkok, *ckok;
789   Mat_SeqAIJ                  *c;
790   MatProductData_SeqAIJKokkos *pdata;
791   KokkosCsrMatrix              csrmatA, csrmatB;
792 
793   PetscFunctionBegin;
794   MatCheckProduct(C, 1);
795   PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Product data empty");
796   pdata = static_cast<MatProductData_SeqAIJKokkos *>(C->product->data);
797 
798   // See if numeric has already been done in symbolic (e.g., user calls MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C)).
799   // If yes, skip the numeric, but reset the flag so that next time when user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C),
800   // we still do numeric.
801   if (pdata->reusesym) { // numeric reuses results from symbolic
802     pdata->reusesym = PETSC_FALSE;
803     PetscFunctionReturn(PETSC_SUCCESS);
804   }
805 
806   switch (product->type) {
807   case MATPRODUCT_AB:
808     transA = false;
809     transB = false;
810     break;
811   case MATPRODUCT_AtB:
812     transA = true;
813     transB = false;
814     break;
815   case MATPRODUCT_ABt:
816     transA = false;
817     transB = true;
818     break;
819   default:
820     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]);
821   }
822 
823   A = product->A;
824   B = product->B;
825   PetscCall(MatSeqAIJKokkosSyncDevice(A));
826   PetscCall(MatSeqAIJKokkosSyncDevice(B));
827   akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
828   bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
829   ckok = static_cast<Mat_SeqAIJKokkos *>(C->spptr);
830 
831   PetscCheck(ckok, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Device data structure spptr is empty");
832 
833   csrmatA = akok->csrmat;
834   csrmatB = bkok->csrmat;
835 
836   /* TODO: Once KK spgemm implements transpose, we can get rid of the explicit transpose here */
837   if (transA) {
838     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA));
839     transA = false;
840   }
841 
842   if (transB) {
843     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB));
844     transB = false;
845   }
846   PetscCall(PetscLogGpuTimeBegin());
847   PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, csrmatA, transA, csrmatB, transB, ckok->csrmat));
848 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(4, 0, 0)
849   auto spgemmHandle = pdata->kh.get_spgemm_handle();
850   if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(ckok->csrmat)); /* without sort, mat_tests-ex62_14_seqaijkokkos fails */
851 #endif
852 
853   PetscCall(PetscLogGpuTimeEnd());
854   PetscCall(MatSeqAIJKokkosModifyDevice(C));
855   /* shorter version of MatAssemblyEnd_SeqAIJ */
856   c = (Mat_SeqAIJ *)C->data;
857   PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz));
858   PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n"));
859   PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax));
860   c->reallocs         = 0;
861   C->info.mallocs     = 0;
862   C->info.nz_unneeded = 0;
863   C->assembled = C->was_assembled = PETSC_TRUE;
864   C->num_ass++;
865   PetscFunctionReturn(PETSC_SUCCESS);
866 }
867 
868 static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C)
869 {
870   Mat_Product                 *product = C->product;
871   MatProductType               ptype;
872   Mat                          A, B;
873   bool                         transA, transB;
874   Mat_SeqAIJKokkos            *akok, *bkok, *ckok;
875   MatProductData_SeqAIJKokkos *pdata;
876   MPI_Comm                     comm;
877   KokkosCsrMatrix              csrmatA, csrmatB, csrmatC;
878 
879   PetscFunctionBegin;
880   MatCheckProduct(C, 1);
881   PetscCall(PetscObjectGetComm((PetscObject)C, &comm));
882   PetscCheck(!product->data, comm, PETSC_ERR_PLIB, "Product data not empty");
883   A = product->A;
884   B = product->B;
885   PetscCall(MatSeqAIJKokkosSyncDevice(A));
886   PetscCall(MatSeqAIJKokkosSyncDevice(B));
887   akok    = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
888   bkok    = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
889   csrmatA = akok->csrmat;
890   csrmatB = bkok->csrmat;
891 
892   ptype = product->type;
893   // Take advantage of the symmetry if true
894   if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
895     ptype                                          = MATPRODUCT_AB;
896     product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE;
897   }
898   if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
899     ptype                                          = MATPRODUCT_AB;
900     product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE;
901   }
902 
903   switch (ptype) {
904   case MATPRODUCT_AB:
905     transA = false;
906     transB = false;
907     PetscCall(MatSetBlockSizesFromMats(C, A, B));
908     break;
909   case MATPRODUCT_AtB:
910     transA = true;
911     transB = false;
912     if (A->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, A->cmap->bs));
913     if (B->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->cmap->bs));
914     break;
915   case MATPRODUCT_ABt:
916     transA = false;
917     transB = true;
918     if (A->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, A->rmap->bs));
919     if (B->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->rmap->bs));
920     break;
921   default:
922     SETERRQ(comm, PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]);
923   }
924   PetscCallCXX(product->data = pdata = new MatProductData_SeqAIJKokkos());
925   pdata->reusesym = product->api_user;
926 
927   /* TODO: add command line options to select spgemm algorithms */
928   auto spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_DEFAULT; /* default alg is TPL if enabled, otherwise KK */
929 
930   /* CUDA-10.2's spgemm has bugs. We prefer the SpGEMMreuse APIs introduced in cuda-11.4 */
931 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
932   #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
933   spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_KK;
934   #endif
935 #endif
936   PetscCallCXX(pdata->kh.create_spgemm_handle(spgemm_alg));
937 
938   PetscCall(PetscLogGpuTimeBegin());
939   /* TODO: Get rid of the explicit transpose once KK-spgemm implements the transpose option */
940   if (transA) {
941     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA));
942     transA = false;
943   }
944 
945   if (transB) {
946     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB));
947     transB = false;
948   }
949 
950   PetscCallCXX(KokkosSparse::spgemm_symbolic(pdata->kh, csrmatA, transA, csrmatB, transB, csrmatC));
951   /* spgemm_symbolic() only populates C's rowmap, but not C's column indices.
952     So we have to do a fake spgemm_numeric() here to get csrmatC.j_d setup, before
953     calling new Mat_SeqAIJKokkos().
954     TODO: Remove the fake spgemm_numeric() after KK fixed this problem.
955   */
956   PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, csrmatA, transA, csrmatB, transB, csrmatC));
957 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(4, 0, 0)
958   /* Query if KK outputs a sorted matrix. If not, we need to sort it */
959   auto spgemmHandle = pdata->kh.get_spgemm_handle();
960   if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(csrmatC)); /* sort_option defaults to -1 in KK!*/
961 #endif
962   PetscCall(PetscLogGpuTimeEnd());
963 
964   PetscCallCXX(ckok = new Mat_SeqAIJKokkos(csrmatC));
965   PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(C, ckok));
966   C->product->destroy = MatProductDataDestroy_SeqAIJKokkos;
967   PetscFunctionReturn(PETSC_SUCCESS);
968 }
969 
970 /* handles sparse matrix matrix ops */
971 static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat)
972 {
973   Mat_Product *product = mat->product;
974   PetscBool    Biskok = PETSC_FALSE, Ciskok = PETSC_TRUE;
975 
976   PetscFunctionBegin;
977   MatCheckProduct(mat, 1);
978   PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJKOKKOS, &Biskok));
979   if (product->type == MATPRODUCT_ABC) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJKOKKOS, &Ciskok));
980   if (Biskok && Ciskok) {
981     switch (product->type) {
982     case MATPRODUCT_AB:
983     case MATPRODUCT_AtB:
984     case MATPRODUCT_ABt:
985       mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos;
986       break;
987     case MATPRODUCT_PtAP:
988     case MATPRODUCT_RARt:
989     case MATPRODUCT_ABC:
990       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
991       break;
992     default:
993       break;
994     }
995   } else { /* fallback for AIJ */
996     PetscCall(MatProductSetFromOptions_SeqAIJ(mat));
997   }
998   PetscFunctionReturn(PETSC_SUCCESS);
999 }
1000 
1001 static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a)
1002 {
1003   Mat_SeqAIJKokkos *aijkok;
1004 
1005   PetscFunctionBegin;
1006   PetscCall(PetscLogGpuTimeBegin());
1007   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1008   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1009   KokkosBlas::scal(PetscGetKokkosExecutionSpace(), aijkok->a_dual.view_device(), a, aijkok->a_dual.view_device());
1010   PetscCall(MatSeqAIJKokkosModifyDevice(A));
1011   PetscCall(PetscLogGpuFlops(aijkok->a_dual.extent(0)));
1012   PetscCall(PetscLogGpuTimeEnd());
1013   PetscFunctionReturn(PETSC_SUCCESS);
1014 }
1015 
1016 // add a to A's diagonal (if A is square) or main diagonal (if A is rectangular)
1017 static PetscErrorCode MatShift_SeqAIJKokkos(Mat A, PetscScalar a)
1018 {
1019   Mat_SeqAIJ *aijseq = static_cast<Mat_SeqAIJ *>(A->data);
1020 
1021   PetscFunctionBegin;
1022   if (A->assembled && aijseq->diagonaldense) { // no missing diagonals
1023     PetscInt n = PetscMin(A->rmap->n, A->cmap->n);
1024 
1025     PetscCall(PetscLogGpuTimeBegin());
1026     PetscCall(MatSeqAIJKokkosSyncDevice(A));
1027     const auto  aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1028     const auto &Aa     = aijkok->a_dual.view_device();
1029     const auto &Adiag  = aijkok->diag_dual.view_device();
1030     PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) { Aa(Adiag(i)) += a; }));
1031     PetscCall(MatSeqAIJKokkosModifyDevice(A));
1032     PetscCall(PetscLogGpuFlops(n));
1033     PetscCall(PetscLogGpuTimeEnd());
1034   } else { // need reassembly, very slow!
1035     PetscCall(MatShift_Basic(A, a));
1036   }
1037   PetscFunctionReturn(PETSC_SUCCESS);
1038 }
1039 
1040 static PetscErrorCode MatDiagonalSet_SeqAIJKokkos(Mat Y, Vec D, InsertMode is)
1041 {
1042   Mat_SeqAIJ *aijseq = static_cast<Mat_SeqAIJ *>(Y->data);
1043 
1044   PetscFunctionBegin;
1045   if (Y->assembled && aijseq->diagonaldense) { // no missing diagonals
1046     ConstPetscScalarKokkosView dv;
1047     PetscInt                   n, nv;
1048 
1049     PetscCall(PetscLogGpuTimeBegin());
1050     PetscCall(MatSeqAIJKokkosSyncDevice(Y));
1051     PetscCall(VecGetKokkosView(D, &dv));
1052     PetscCall(VecGetLocalSize(D, &nv));
1053     n = PetscMin(Y->rmap->n, Y->cmap->n);
1054     PetscCheck(n == nv, PetscObjectComm((PetscObject)Y), PETSC_ERR_ARG_SIZ, "Matrix size and vector size do not match");
1055 
1056     const auto  aijkok = static_cast<Mat_SeqAIJKokkos *>(Y->spptr);
1057     const auto &Aa     = aijkok->a_dual.view_device();
1058     const auto &Adiag  = aijkok->diag_dual.view_device();
1059     PetscCallCXX(Kokkos::parallel_for(
1060       Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) {
1061         if (is == INSERT_VALUES) Aa(Adiag(i)) = dv(i);
1062         else Aa(Adiag(i)) += dv(i);
1063       }));
1064     PetscCall(VecRestoreKokkosView(D, &dv));
1065     PetscCall(MatSeqAIJKokkosModifyDevice(Y));
1066     PetscCall(PetscLogGpuFlops(n));
1067     PetscCall(PetscLogGpuTimeEnd());
1068   } else { // need reassembly, very slow!
1069     PetscCall(MatDiagonalSet_Default(Y, D, is));
1070   }
1071   PetscFunctionReturn(PETSC_SUCCESS);
1072 }
1073 
1074 static PetscErrorCode MatDiagonalScale_SeqAIJKokkos(Mat A, Vec ll, Vec rr)
1075 {
1076   Mat_SeqAIJ                *aijseq = static_cast<Mat_SeqAIJ *>(A->data);
1077   PetscInt                   m = A->rmap->n, n = A->cmap->n, nz = aijseq->nz;
1078   ConstPetscScalarKokkosView lv, rv;
1079 
1080   PetscFunctionBegin;
1081   PetscCall(PetscLogGpuTimeBegin());
1082   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1083   const auto  aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1084   const auto &Aa     = aijkok->a_dual.view_device();
1085   const auto &Ai     = aijkok->i_dual.view_device();
1086   const auto &Aj     = aijkok->j_dual.view_device();
1087   if (ll) {
1088     PetscCall(VecGetLocalSize(ll, &m));
1089     PetscCheck(m == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling vector wrong length");
1090     PetscCall(VecGetKokkosView(ll, &lv));
1091     PetscCallCXX(Kokkos::parallel_for( // for each row
1092       Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
1093         PetscInt i   = t.league_rank(); // row i
1094         PetscInt len = Ai(i + 1) - Ai(i);
1095         // scale entries on the row
1096         Kokkos::parallel_for(Kokkos::TeamThreadRange(t, len), [&](PetscInt j) { Aa(Ai(i) + j) *= lv(i); });
1097       }));
1098     PetscCall(VecRestoreKokkosView(ll, &lv));
1099     PetscCall(PetscLogGpuFlops(nz));
1100   }
1101   if (rr) {
1102     PetscCall(VecGetLocalSize(rr, &n));
1103     PetscCheck(n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Right scaling vector wrong length");
1104     PetscCall(VecGetKokkosView(rr, &rv));
1105     PetscCallCXX(Kokkos::parallel_for( // for each nonzero
1106       Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt k) { Aa(k) *= rv(Aj(k)); }));
1107     PetscCall(VecRestoreKokkosView(rr, &lv));
1108     PetscCall(PetscLogGpuFlops(nz));
1109   }
1110   PetscCall(MatSeqAIJKokkosModifyDevice(A));
1111   PetscCall(PetscLogGpuTimeEnd());
1112   PetscFunctionReturn(PETSC_SUCCESS);
1113 }
1114 
1115 static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A)
1116 {
1117   Mat_SeqAIJKokkos *aijkok;
1118 
1119   PetscFunctionBegin;
1120   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1121   if (aijkok) { /* Only zero the device if data is already there */
1122     KokkosBlas::fill(PetscGetKokkosExecutionSpace(), aijkok->a_dual.view_device(), 0.0);
1123     PetscCall(MatSeqAIJKokkosModifyDevice(A));
1124   } else { /* Might be preallocated but not assembled */
1125     PetscCall(MatZeroEntries_SeqAIJ(A));
1126   }
1127   PetscFunctionReturn(PETSC_SUCCESS);
1128 }
1129 
1130 static PetscErrorCode MatGetDiagonal_SeqAIJKokkos(Mat A, Vec x)
1131 {
1132   Mat_SeqAIJKokkos     *aijkok;
1133   PetscInt              n;
1134   PetscScalarKokkosView xv;
1135 
1136   PetscFunctionBegin;
1137   PetscCall(VecGetLocalSize(x, &n));
1138   PetscCheck(n == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector");
1139   PetscCheck(A->factortype == MAT_FACTOR_NONE, PETSC_COMM_SELF, PETSC_ERR_SUP, "MatGetDiagonal_SeqAIJKokkos not supported on factored matrices");
1140 
1141   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1142   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1143 
1144   const auto &Aa    = aijkok->a_dual.view_device();
1145   const auto &Ai    = aijkok->i_dual.view_device();
1146   const auto &Adiag = aijkok->diag_dual.view_device();
1147 
1148   PetscCall(VecGetKokkosViewWrite(x, &xv));
1149   Kokkos::parallel_for(
1150     Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) {
1151       if (Adiag(i) < Ai(i + 1)) xv(i) = Aa(Adiag(i));
1152       else xv(i) = 0;
1153     });
1154   PetscCall(VecRestoreKokkosViewWrite(x, &xv));
1155   PetscFunctionReturn(PETSC_SUCCESS);
1156 }
1157 
1158 /* Get a Kokkos View from a mat of type MatSeqAIJKokkos */
1159 PetscErrorCode MatSeqAIJGetKokkosView(Mat A, ConstMatScalarKokkosView *kv)
1160 {
1161   Mat_SeqAIJKokkos *aijkok;
1162 
1163   PetscFunctionBegin;
1164   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1165   PetscAssertPointer(kv, 2);
1166   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1167   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1168   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1169   *kv    = aijkok->a_dual.view_device();
1170   PetscFunctionReturn(PETSC_SUCCESS);
1171 }
1172 
1173 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, ConstMatScalarKokkosView *kv)
1174 {
1175   PetscFunctionBegin;
1176   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1177   PetscAssertPointer(kv, 2);
1178   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1179   PetscFunctionReturn(PETSC_SUCCESS);
1180 }
1181 
1182 PetscErrorCode MatSeqAIJGetKokkosView(Mat A, MatScalarKokkosView *kv)
1183 {
1184   Mat_SeqAIJKokkos *aijkok;
1185 
1186   PetscFunctionBegin;
1187   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1188   PetscAssertPointer(kv, 2);
1189   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1190   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1191   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1192   *kv    = aijkok->a_dual.view_device();
1193   PetscFunctionReturn(PETSC_SUCCESS);
1194 }
1195 
1196 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, MatScalarKokkosView *kv)
1197 {
1198   PetscFunctionBegin;
1199   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1200   PetscAssertPointer(kv, 2);
1201   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1202   PetscCall(MatSeqAIJKokkosModifyDevice(A));
1203   PetscFunctionReturn(PETSC_SUCCESS);
1204 }
1205 
1206 PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A, MatScalarKokkosView *kv)
1207 {
1208   Mat_SeqAIJKokkos *aijkok;
1209 
1210   PetscFunctionBegin;
1211   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1212   PetscAssertPointer(kv, 2);
1213   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1214   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1215   *kv    = aijkok->a_dual.view_device();
1216   PetscFunctionReturn(PETSC_SUCCESS);
1217 }
1218 
1219 PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A, MatScalarKokkosView *kv)
1220 {
1221   PetscFunctionBegin;
1222   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1223   PetscAssertPointer(kv, 2);
1224   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1225   PetscCall(MatSeqAIJKokkosModifyDevice(A));
1226   PetscFunctionReturn(PETSC_SUCCESS);
1227 }
1228 
1229 PetscErrorCode MatCreateSeqAIJKokkosWithKokkosViews(MPI_Comm comm, PetscInt m, PetscInt n, Kokkos::View<PetscInt *> &i_d, Kokkos::View<PetscInt *> &j_d, Kokkos::View<PetscScalar *> &a_d, Mat *A)
1230 {
1231   Mat_SeqAIJKokkos *akok;
1232 
1233   PetscFunctionBegin;
1234   auto exec = PetscGetKokkosExecutionSpace();
1235   // Create host copies of the input aij
1236   auto i_h = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), i_d);
1237   auto j_h = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), j_d);
1238   // Don't copy the vals to the host now
1239   auto a_h = Kokkos::create_mirror_view(HostMirrorMemorySpace(), a_d);
1240 
1241   MatScalarKokkosDualView a_dual = MatScalarKokkosDualView(a_d, a_h);
1242   // Note we have modified device data so it will copy lazily
1243   a_dual.modify_device();
1244   MatRowMapKokkosDualView i_dual = MatRowMapKokkosDualView(i_d, i_h);
1245   MatColIdxKokkosDualView j_dual = MatColIdxKokkosDualView(j_d, j_h);
1246 
1247   PetscCallCXX(akok = new Mat_SeqAIJKokkos(m, n, j_dual.extent(0), i_dual, j_dual, a_dual));
1248   PetscCall(MatCreate(comm, A));
1249   PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok));
1250   PetscFunctionReturn(PETSC_SUCCESS);
1251 }
1252 
1253 /* Computes Y += alpha X */
1254 static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y, PetscScalar alpha, Mat X, MatStructure pattern)
1255 {
1256   Mat_SeqAIJ              *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data;
1257   Mat_SeqAIJKokkos        *xkok, *ykok, *zkok;
1258   ConstMatScalarKokkosView Xa;
1259   MatScalarKokkosView      Ya;
1260   auto                     exec = PetscGetKokkosExecutionSpace();
1261 
1262   PetscFunctionBegin;
1263   PetscCheckTypeName(Y, MATSEQAIJKOKKOS);
1264   PetscCheckTypeName(X, MATSEQAIJKOKKOS);
1265   PetscCall(MatSeqAIJKokkosSyncDevice(Y));
1266   PetscCall(MatSeqAIJKokkosSyncDevice(X));
1267   PetscCall(PetscLogGpuTimeBegin());
1268 
1269   if (pattern != SAME_NONZERO_PATTERN && x->nz == y->nz) {
1270     PetscBool e;
1271     PetscCall(PetscArraycmp(x->i, y->i, Y->rmap->n + 1, &e));
1272     if (e) {
1273       PetscCall(PetscArraycmp(x->j, y->j, y->nz, &e));
1274       if (e) pattern = SAME_NONZERO_PATTERN;
1275     }
1276   }
1277 
1278   /* cusparseDcsrgeam2() computes C = alpha A + beta B. If one knew sparsity pattern of C, one can skip
1279     cusparseScsrgeam2_bufferSizeExt() / cusparseXcsrgeam2Nnz(), and directly call cusparseScsrgeam2().
1280     If X is SUBSET_NONZERO_PATTERN of Y, we could take advantage of this cusparse feature. However,
1281     KokkosSparse::spadd(alpha,A,beta,B,C) has symbolic and numeric phases, MatAXPY does not.
1282   */
1283   ykok = static_cast<Mat_SeqAIJKokkos *>(Y->spptr);
1284   xkok = static_cast<Mat_SeqAIJKokkos *>(X->spptr);
1285   Xa   = xkok->a_dual.view_device();
1286   Ya   = ykok->a_dual.view_device();
1287 
1288   if (pattern == SAME_NONZERO_PATTERN) {
1289     KokkosBlas::axpy(exec, alpha, Xa, Ya);
1290     PetscCall(MatSeqAIJKokkosModifyDevice(Y));
1291   } else if (pattern == SUBSET_NONZERO_PATTERN) {
1292     MatRowMapKokkosView Xi = xkok->i_dual.view_device(), Yi = ykok->i_dual.view_device();
1293     MatColIdxKokkosView Xj = xkok->j_dual.view_device(), Yj = ykok->j_dual.view_device();
1294 
1295     Kokkos::parallel_for(
1296       Kokkos::TeamPolicy<>(exec, Y->rmap->n, 1), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
1297         PetscInt i = t.league_rank(); // row i
1298         Kokkos::single(Kokkos::PerTeam(t), [=]() {
1299           // Only one thread works in a team
1300           PetscInt p, q = Yi(i);
1301           for (p = Xi(i); p < Xi(i + 1); p++) {          // For each nonzero on row i of X,
1302             while (Xj(p) != Yj(q) && q < Yi(i + 1)) q++; // find the matching nonzero on row i of Y.
1303             if (Xj(p) == Yj(q)) {                        // Found it
1304               Ya(q) += alpha * Xa(p);
1305               q++;
1306             } else {
1307             // If not found, it indicates the input is wrong (X is not a SUBSET_NONZERO_PATTERN of Y).
1308             // Just insert a NaN at the beginning of row i if it is not empty, to make the result wrong.
1309 #if PETSC_PKG_KOKKOS_VERSION_GE(3, 7, 0)
1310               if (Yi(i) != Yi(i + 1)) Ya(Yi(i)) = Kokkos::ArithTraits<PetscScalar>::nan();
1311 #else
1312               if (Yi(i) != Yi(i + 1)) Ya(Yi(i)) = Kokkos::Experimental::nan("1");
1313 #endif
1314             }
1315           }
1316         });
1317       });
1318     PetscCall(MatSeqAIJKokkosModifyDevice(Y));
1319   } else { // different nonzero patterns
1320     Mat             Z;
1321     KokkosCsrMatrix zcsr;
1322     KernelHandle    kh;
1323     kh.create_spadd_handle(true); // X, Y are sorted
1324     KokkosSparse::spadd_symbolic(&kh, xkok->csrmat, ykok->csrmat, zcsr);
1325     KokkosSparse::spadd_numeric(&kh, alpha, xkok->csrmat, (PetscScalar)1.0, ykok->csrmat, zcsr);
1326     zkok = new Mat_SeqAIJKokkos(zcsr);
1327     PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, zkok, &Z));
1328     PetscCall(MatHeaderReplace(Y, &Z));
1329     kh.destroy_spadd_handle();
1330   }
1331   PetscCall(PetscLogGpuTimeEnd());
1332   PetscCall(PetscLogGpuFlops(xkok->a_dual.extent(0) * 2)); // Because we scaled X and then added it to Y
1333   PetscFunctionReturn(PETSC_SUCCESS);
1334 }
1335 
1336 struct MatCOOStruct_SeqAIJKokkos {
1337   PetscCount           n;
1338   PetscCount           Atot;
1339   PetscInt             nz;
1340   PetscCountKokkosView jmap;
1341   PetscCountKokkosView perm;
1342 
1343   MatCOOStruct_SeqAIJKokkos(const MatCOOStruct_SeqAIJ *coo_h)
1344   {
1345     nz   = coo_h->nz;
1346     n    = coo_h->n;
1347     Atot = coo_h->Atot;
1348     jmap = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), PetscCountKokkosViewHost(coo_h->jmap, nz + 1));
1349     perm = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), PetscCountKokkosViewHost(coo_h->perm, Atot));
1350   }
1351 };
1352 
1353 static PetscErrorCode MatCOOStructDestroy_SeqAIJKokkos(void **data)
1354 {
1355   PetscFunctionBegin;
1356   PetscCallCXX(delete static_cast<MatCOOStruct_SeqAIJKokkos *>(*data));
1357   PetscFunctionReturn(PETSC_SUCCESS);
1358 }
1359 
1360 static PetscErrorCode MatSetPreallocationCOO_SeqAIJKokkos(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
1361 {
1362   Mat_SeqAIJKokkos          *akok;
1363   Mat_SeqAIJ                *aseq;
1364   PetscContainer             container_h;
1365   MatCOOStruct_SeqAIJ       *coo_h;
1366   MatCOOStruct_SeqAIJKokkos *coo_d;
1367 
1368   PetscFunctionBegin;
1369   PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, coo_i, coo_j));
1370   aseq = static_cast<Mat_SeqAIJ *>(mat->data);
1371   akok = static_cast<Mat_SeqAIJKokkos *>(mat->spptr);
1372   delete akok;
1373   mat->spptr = akok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, aseq, mat->nonzerostate + 1, PETSC_FALSE);
1374   PetscCall(MatZeroEntries_SeqAIJKokkos(mat));
1375 
1376   // Copy the COO struct to device
1377   PetscCall(PetscObjectQuery((PetscObject)mat, "__PETSc_MatCOOStruct_Host", (PetscObject *)&container_h));
1378   PetscCall(PetscContainerGetPointer(container_h, (void **)&coo_h));
1379   PetscCallCXX(coo_d = new MatCOOStruct_SeqAIJKokkos(coo_h));
1380 
1381   // Put the COO struct in a container and then attach that to the matrix
1382   PetscCall(PetscObjectContainerCompose((PetscObject)mat, "__PETSc_MatCOOStruct_Device", coo_d, MatCOOStructDestroy_SeqAIJKokkos));
1383   PetscFunctionReturn(PETSC_SUCCESS);
1384 }
1385 
1386 static PetscErrorCode MatSetValuesCOO_SeqAIJKokkos(Mat A, const PetscScalar v[], InsertMode imode)
1387 {
1388   MatScalarKokkosView        Aa;
1389   ConstMatScalarKokkosView   kv;
1390   PetscMemType               memtype;
1391   PetscContainer             container;
1392   MatCOOStruct_SeqAIJKokkos *coo;
1393 
1394   PetscFunctionBegin;
1395   PetscCall(PetscObjectQuery((PetscObject)A, "__PETSc_MatCOOStruct_Device", (PetscObject *)&container));
1396   PetscCall(PetscContainerGetPointer(container, (void **)&coo));
1397 
1398   const auto &n    = coo->n;
1399   const auto &Annz = coo->nz;
1400   const auto &jmap = coo->jmap;
1401   const auto &perm = coo->perm;
1402 
1403   PetscCall(PetscGetMemType(v, &memtype));
1404   if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */
1405     kv = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), ConstMatScalarKokkosViewHost(v, n));
1406   } else {
1407     kv = ConstMatScalarKokkosView(v, n); /* Directly use v[]'s memory */
1408   }
1409 
1410   if (imode == INSERT_VALUES) PetscCall(MatSeqAIJGetKokkosViewWrite(A, &Aa)); /* write matrix values */
1411   else PetscCall(MatSeqAIJGetKokkosView(A, &Aa));                             /* read & write matrix values */
1412 
1413   PetscCall(PetscLogGpuTimeBegin());
1414   Kokkos::parallel_for(
1415     Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, Annz), KOKKOS_LAMBDA(const PetscCount i) {
1416       PetscScalar sum = 0.0;
1417       for (PetscCount k = jmap(i); k < jmap(i + 1); k++) sum += kv(perm(k));
1418       Aa(i) = (imode == INSERT_VALUES ? 0.0 : Aa(i)) + sum;
1419     });
1420   PetscCall(PetscLogGpuTimeEnd());
1421 
1422   if (imode == INSERT_VALUES) PetscCall(MatSeqAIJRestoreKokkosViewWrite(A, &Aa));
1423   else PetscCall(MatSeqAIJRestoreKokkosView(A, &Aa));
1424   PetscFunctionReturn(PETSC_SUCCESS);
1425 }
1426 
1427 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A)
1428 {
1429   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
1430 
1431   PetscFunctionBegin;
1432   A->offloadmask = PETSC_OFFLOAD_KOKKOS; /* We do not really use this flag */
1433   A->boundtocpu  = PETSC_FALSE;
1434 
1435   A->ops->assemblyend               = MatAssemblyEnd_SeqAIJKokkos;
1436   A->ops->destroy                   = MatDestroy_SeqAIJKokkos;
1437   A->ops->duplicate                 = MatDuplicate_SeqAIJKokkos;
1438   A->ops->axpy                      = MatAXPY_SeqAIJKokkos;
1439   A->ops->scale                     = MatScale_SeqAIJKokkos;
1440   A->ops->zeroentries               = MatZeroEntries_SeqAIJKokkos;
1441   A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJKokkos;
1442   A->ops->mult                      = MatMult_SeqAIJKokkos;
1443   A->ops->multadd                   = MatMultAdd_SeqAIJKokkos;
1444   A->ops->multtranspose             = MatMultTranspose_SeqAIJKokkos;
1445   A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJKokkos;
1446   A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJKokkos;
1447   A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos;
1448   A->ops->productnumeric            = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos;
1449   A->ops->transpose                 = MatTranspose_SeqAIJKokkos;
1450   A->ops->setoption                 = MatSetOption_SeqAIJKokkos;
1451   A->ops->getdiagonal               = MatGetDiagonal_SeqAIJKokkos;
1452   A->ops->shift                     = MatShift_SeqAIJKokkos;
1453   A->ops->diagonalset               = MatDiagonalSet_SeqAIJKokkos;
1454   A->ops->diagonalscale             = MatDiagonalScale_SeqAIJKokkos;
1455   a->ops->getarray                  = MatSeqAIJGetArray_SeqAIJKokkos;
1456   a->ops->restorearray              = MatSeqAIJRestoreArray_SeqAIJKokkos;
1457   a->ops->getarrayread              = MatSeqAIJGetArrayRead_SeqAIJKokkos;
1458   a->ops->restorearrayread          = MatSeqAIJRestoreArrayRead_SeqAIJKokkos;
1459   a->ops->getarraywrite             = MatSeqAIJGetArrayWrite_SeqAIJKokkos;
1460   a->ops->restorearraywrite         = MatSeqAIJRestoreArrayWrite_SeqAIJKokkos;
1461   a->ops->getcsrandmemtype          = MatSeqAIJGetCSRAndMemType_SeqAIJKokkos;
1462 
1463   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJKokkos));
1464   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJKokkos));
1465 #if defined(PETSC_HAVE_HYPRE)
1466   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijkokkos_hypre_C", MatConvert_AIJ_HYPRE));
1467 #endif
1468   PetscFunctionReturn(PETSC_SUCCESS);
1469 }
1470 
1471 /*
1472    Extract the (prescribled) diagonal blocks of the matrix and then invert them
1473 
1474   Input Parameters:
1475 +  A       - the MATSEQAIJKOKKOS matrix
1476 .  bs      - block sizes in 'csr' format, i.e., the i-th block has size bs(i+1) - bs(i)
1477 .  bs2     - square of block sizes in 'csr' format, i.e., the i-th block should be stored at offset bs2(i) in diagVal[]
1478 .  blkMap  - map row ids to block ids, i.e., row i belongs to the block blkMap(i)
1479 -  work    - a pre-allocated work buffer (as big as diagVal) for use by this routine
1480 
1481   Output Parameter:
1482 .  diagVal - the (pre-allocated) buffer to store the inverted blocks (each block is stored in column-major order)
1483 */
1484 PETSC_INTERN PetscErrorCode MatInvertVariableBlockDiagonal_SeqAIJKokkos(Mat A, const PetscIntKokkosView &bs, const PetscIntKokkosView &bs2, const PetscIntKokkosView &blkMap, PetscScalarKokkosView &work, PetscScalarKokkosView &diagVal)
1485 {
1486   Mat_SeqAIJKokkos *akok    = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1487   PetscInt          nblocks = bs.extent(0) - 1;
1488 
1489   PetscFunctionBegin;
1490   PetscCall(MatSeqAIJKokkosSyncDevice(A)); // Since we'll access A's value on device
1491 
1492   // Pull out the diagonal blocks of the matrix and then invert the blocks
1493   auto Aa    = akok->a_dual.view_device();
1494   auto Ai    = akok->i_dual.view_device();
1495   auto Aj    = akok->j_dual.view_device();
1496   auto Adiag = akok->diag_dual.view_device();
1497   // TODO: how to tune the team size?
1498 #if defined(KOKKOS_ENABLE_UNIFIED_MEMORY)
1499   auto ts = Kokkos::AUTO();
1500 #else
1501   auto ts = 16; // improved performance 30% over Kokkos::AUTO() with CUDA, but failed with "Kokkos::abort: Requested Team Size is too large!" on CPUs
1502 #endif
1503   PetscCallCXX(Kokkos::parallel_for(
1504     Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), nblocks, ts), KOKKOS_LAMBDA(const KokkosTeamMemberType &teamMember) {
1505       const PetscInt bid    = teamMember.league_rank();                                                   // block id
1506       const PetscInt rstart = bs(bid);                                                                    // this block starts from this row
1507       const PetscInt m      = bs(bid + 1) - bs(bid);                                                      // size of this block
1508       const auto    &B      = Kokkos::View<PetscScalar **, Kokkos::LayoutLeft>(&diagVal(bs2(bid)), m, m); // column-major order
1509       const auto    &W      = PetscScalarKokkosView(&work(bs2(bid)), m * m);
1510 
1511       Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, m), [=](const PetscInt &r) { // r-th row in B
1512         PetscInt i = rstart + r;                                                            // i-th row in A
1513 
1514         if (Ai(i) <= Adiag(i) && Adiag(i) < Ai(i + 1)) { // if the diagonal exists (common case)
1515           PetscInt first = Adiag(i) - r;                 // we start to check nonzeros from here along this row
1516 
1517           for (PetscInt c = 0; c < m; c++) {                   // walk n steps to see what column indices we will meet
1518             if (first + c < Ai(i) || first + c >= Ai(i + 1)) { // this entry (first+c) is out of range of this row, in other words, its value is zero
1519               B(r, c) = 0.0;
1520             } else if (Aj(first + c) == rstart + c) { // this entry is right on the (rstart+c) column
1521               B(r, c) = Aa(first + c);
1522             } else { // this entry does not show up in the CSR
1523               B(r, c) = 0.0;
1524             }
1525           }
1526         } else { // rare case that the diagonal does not exist
1527           const PetscInt begin = Ai(i);
1528           const PetscInt end   = Ai(i + 1);
1529           for (PetscInt c = 0; c < m; c++) B(r, c) = 0.0;
1530           for (PetscInt j = begin; j < end; j++) { // scan the whole row; could use binary search but this is a rare case so we did not.
1531             if (rstart <= Aj(j) && Aj(j) < rstart + m) B(r, Aj(j) - rstart) = Aa(j);
1532             else if (Aj(j) >= rstart + m) break;
1533           }
1534         }
1535       });
1536 
1537       // LU-decompose B (w/o pivoting) and then invert B
1538       KokkosBatched::TeamLU<KokkosTeamMemberType, KokkosBatched::Algo::LU::Unblocked>::invoke(teamMember, B, 0.0);
1539       KokkosBatched::TeamInverseLU<KokkosTeamMemberType, KokkosBatched::Algo::InverseLU::Unblocked>::invoke(teamMember, B, W);
1540     }));
1541   // PetscLogGpuFlops() is done in the caller PCSetUp_VPBJacobi_Kokkos as we don't want to compute the flops in kernels
1542   PetscFunctionReturn(PETSC_SUCCESS);
1543 }
1544 
1545 PETSC_INTERN PetscErrorCode MatSetSeqAIJKokkosWithCSRMatrix(Mat A, Mat_SeqAIJKokkos *akok)
1546 {
1547   Mat_SeqAIJ *aseq;
1548   PetscInt    i, m, n;
1549   auto        exec = PetscGetKokkosExecutionSpace();
1550 
1551   PetscFunctionBegin;
1552   PetscCheck(!A->spptr, PETSC_COMM_SELF, PETSC_ERR_PLIB, "A->spptr is supposed to be empty");
1553 
1554   m = akok->nrows();
1555   n = akok->ncols();
1556   PetscCall(MatSetSizes(A, m, n, m, n));
1557   PetscCall(MatSetType(A, MATSEQAIJKOKKOS));
1558 
1559   /* Set up data structures of A as a MATSEQAIJ */
1560   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(A, MAT_SKIP_ALLOCATION, NULL));
1561   aseq = (Mat_SeqAIJ *)A->data;
1562 
1563   PetscCall(KokkosDualViewSyncHost(akok->i_dual, exec)); /* We always need sync'ed i, j on host */
1564   PetscCall(KokkosDualViewSyncHost(akok->j_dual, exec));
1565 
1566   aseq->i       = akok->i_host_data();
1567   aseq->j       = akok->j_host_data();
1568   aseq->a       = akok->a_host_data();
1569   aseq->nonew   = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/
1570   aseq->free_a  = PETSC_FALSE;
1571   aseq->free_ij = PETSC_FALSE;
1572   aseq->nz      = akok->nnz();
1573   aseq->maxnz   = aseq->nz;
1574 
1575   PetscCall(PetscMalloc1(m, &aseq->imax));
1576   PetscCall(PetscMalloc1(m, &aseq->ilen));
1577   for (i = 0; i < m; i++) aseq->ilen[i] = aseq->imax[i] = aseq->i[i + 1] - aseq->i[i];
1578 
1579   /* It is critical to set the nonzerostate, as we use it to check if sparsity pattern (hence data) has changed on host in MatAssemblyEnd */
1580   akok->nonzerostate = A->nonzerostate;
1581   A->spptr           = akok; /* Set A->spptr before MatAssembly so that A->spptr won't be allocated again there */
1582   PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY));
1583   PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY));
1584   PetscFunctionReturn(PETSC_SUCCESS);
1585 }
1586 
1587 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGetKokkosCsrMatrix(Mat A, KokkosCsrMatrix *csr)
1588 {
1589   PetscFunctionBegin;
1590   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1591   *csr = static_cast<Mat_SeqAIJKokkos *>(A->spptr)->csrmat;
1592   PetscFunctionReturn(PETSC_SUCCESS);
1593 }
1594 
1595 PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithKokkosCsrMatrix(MPI_Comm comm, KokkosCsrMatrix csr, Mat *A)
1596 {
1597   Mat_SeqAIJKokkos *akok;
1598 
1599   PetscFunctionBegin;
1600   PetscCallCXX(akok = new Mat_SeqAIJKokkos(csr));
1601   PetscCall(MatCreate(comm, A));
1602   PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok));
1603   PetscFunctionReturn(PETSC_SUCCESS);
1604 }
1605 
1606 /* Crete a SEQAIJKOKKOS matrix with a Mat_SeqAIJKokkos data structure
1607 
1608    Note we have names like MatSeqAIJSetPreallocationCSR, so I use capitalized CSR
1609  */
1610 PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithCSRMatrix(MPI_Comm comm, Mat_SeqAIJKokkos *akok, Mat *A)
1611 {
1612   PetscFunctionBegin;
1613   PetscCall(MatCreate(comm, A));
1614   PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok));
1615   PetscFunctionReturn(PETSC_SUCCESS);
1616 }
1617 
1618 /*@C
1619   MatCreateSeqAIJKokkos - Creates a sparse matrix in `MATSEQAIJKOKKOS` (compressed row) format
1620   (the default parallel PETSc format). This matrix will ultimately be handled by
1621   Kokkos for calculations.
1622 
1623   Collective
1624 
1625   Input Parameters:
1626 + comm - MPI communicator, set to `PETSC_COMM_SELF`
1627 . m    - number of rows
1628 . n    - number of columns
1629 . nz   - number of nonzeros per row (same for all rows), ignored if `nnz` is provided
1630 - nnz  - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL`
1631 
1632   Output Parameter:
1633 . A - the matrix
1634 
1635   Level: intermediate
1636 
1637   Notes:
1638   It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
1639   MatXXXXSetPreallocation() paradgm instead of this routine directly.
1640   [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]
1641 
1642   The AIJ format, also called
1643   compressed row storage, is fully compatible with standard Fortran
1644   storage.  That is, the stored row and column indices can begin at
1645   either one (as in Fortran) or zero.
1646 
1647   Specify the preallocated storage with either `nz` or `nnz` (not both).
1648   Set `nz` = `PETSC_DEFAULT` and `nnz` = `NULL` for PETSc to control dynamic memory
1649   allocation.
1650 
1651 .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`
1652 @*/
1653 PetscErrorCode MatCreateSeqAIJKokkos(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A)
1654 {
1655   PetscFunctionBegin;
1656   PetscCall(PetscKokkosInitializeCheck());
1657   PetscCall(MatCreate(comm, A));
1658   PetscCall(MatSetSizes(*A, m, n, m, n));
1659   PetscCall(MatSetType(*A, MATSEQAIJKOKKOS));
1660   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz));
1661   PetscFunctionReturn(PETSC_SUCCESS);
1662 }
1663 
1664 // After matrix numeric factorization, there are still steps to do before triangular solve can be called.
1665 // For example, for transpose solve, we might need to compute the transpose matrices if the solver does not support it (such as KK, while cusparse does).
1666 // In cusparse, one has to call cusparseSpSV_analysis() with updated triangular matrix values before calling cusparseSpSV_solve().
1667 // Simiarily, in KK sptrsv_symbolic() has to be called before sptrsv_solve(). We put these steps in MatSeqAIJKokkos{Transpose}SolveCheck.
1668 static PetscErrorCode MatSeqAIJKokkosSolveCheck(Mat A)
1669 {
1670   Mat_SeqAIJKokkosTriFactors *factors   = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1671   const PetscBool             has_lower = factors->iL_d.extent(0) ? PETSC_TRUE : PETSC_FALSE; // false with Choleksy
1672   const PetscBool             has_upper = factors->iU_d.extent(0) ? PETSC_TRUE : PETSC_FALSE; // true with LU and Choleksy
1673 
1674   PetscFunctionBegin;
1675   if (!factors->sptrsv_symbolic_completed) { // If sptrsv_symbolic was not called yet
1676     if (has_upper) PetscCallCXX(sptrsv_symbolic(&factors->khU, factors->iU_d, factors->jU_d, factors->aU_d));
1677     if (has_lower) PetscCallCXX(sptrsv_symbolic(&factors->khL, factors->iL_d, factors->jL_d, factors->aL_d));
1678     factors->sptrsv_symbolic_completed = PETSC_TRUE;
1679   }
1680   PetscFunctionReturn(PETSC_SUCCESS);
1681 }
1682 
1683 static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A)
1684 {
1685   const PetscInt              n         = A->rmap->n;
1686   Mat_SeqAIJKokkosTriFactors *factors   = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1687   const PetscBool             has_lower = factors->iL_d.extent(0) ? PETSC_TRUE : PETSC_FALSE; // false with Choleksy
1688   const PetscBool             has_upper = factors->iU_d.extent(0) ? PETSC_TRUE : PETSC_FALSE; // true with LU or Choleksy
1689 
1690   PetscFunctionBegin;
1691   if (!factors->transpose_updated) {
1692     if (has_upper) {
1693       if (!factors->iUt_d.extent(0)) {                                 // Allocate Ut on device if not yet
1694         factors->iUt_d = MatRowMapKokkosView("factors->iUt_d", n + 1); // KK requires this view to be initialized to 0 to call transpose_matrix
1695         factors->jUt_d = MatColIdxKokkosView(NoInit("factors->jUt_d"), factors->jU_d.extent(0));
1696         factors->aUt_d = MatScalarKokkosView(NoInit("factors->aUt_d"), factors->aU_d.extent(0));
1697       }
1698 
1699       if (factors->iU_h.extent(0)) { // If U is on host (factorization was done on host), we also compute the transpose on host
1700         if (!factors->U) {
1701           Mat_SeqAIJ *seq;
1702 
1703           PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF, n, n, factors->iU_h.data(), factors->jU_h.data(), factors->aU_h.data(), &factors->U));
1704           PetscCall(MatTranspose(factors->U, MAT_INITIAL_MATRIX, &factors->Ut));
1705 
1706           seq            = static_cast<Mat_SeqAIJ *>(factors->Ut->data);
1707           factors->iUt_h = MatRowMapKokkosViewHost(seq->i, n + 1);
1708           factors->jUt_h = MatColIdxKokkosViewHost(seq->j, seq->nz);
1709           factors->aUt_h = MatScalarKokkosViewHost(seq->a, seq->nz);
1710         } else {
1711           PetscCall(MatTranspose(factors->U, MAT_REUSE_MATRIX, &factors->Ut)); // Matrix Ut' data is aliased with {i, j, a}Ut_h
1712         }
1713         // Copy Ut from host to device
1714         PetscCallCXX(Kokkos::deep_copy(factors->iUt_d, factors->iUt_h));
1715         PetscCallCXX(Kokkos::deep_copy(factors->jUt_d, factors->jUt_h));
1716         PetscCallCXX(Kokkos::deep_copy(factors->aUt_d, factors->aUt_h));
1717       } else { // If U was computed on device, we also compute the transpose there
1718         // TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices. We have to sort the indices, until KK provides finer control options.
1719         PetscCallCXX(transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iU_d,
1720                                                                                                                                                                                                                                factors->jU_d, factors->aU_d,
1721                                                                                                                                                                                                                                factors->iUt_d, factors->jUt_d,
1722                                                                                                                                                                                                                                factors->aUt_d));
1723         PetscCallCXX(sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iUt_d, factors->jUt_d, factors->aUt_d));
1724       }
1725       PetscCallCXX(sptrsv_symbolic(&factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d));
1726     }
1727 
1728     // do the same for L with LU
1729     if (has_lower) {
1730       if (!factors->iLt_d.extent(0)) {                                 // Allocate Lt on device if not yet
1731         factors->iLt_d = MatRowMapKokkosView("factors->iLt_d", n + 1); // KK requires this view to be initialized to 0 to call transpose_matrix
1732         factors->jLt_d = MatColIdxKokkosView(NoInit("factors->jLt_d"), factors->jL_d.extent(0));
1733         factors->aLt_d = MatScalarKokkosView(NoInit("factors->aLt_d"), factors->aL_d.extent(0));
1734       }
1735 
1736       if (factors->iL_h.extent(0)) { // If L is on host, we also compute the transpose on host
1737         if (!factors->L) {
1738           Mat_SeqAIJ *seq;
1739 
1740           PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF, n, n, factors->iL_h.data(), factors->jL_h.data(), factors->aL_h.data(), &factors->L));
1741           PetscCall(MatTranspose(factors->L, MAT_INITIAL_MATRIX, &factors->Lt));
1742 
1743           seq            = static_cast<Mat_SeqAIJ *>(factors->Lt->data);
1744           factors->iLt_h = MatRowMapKokkosViewHost(seq->i, n + 1);
1745           factors->jLt_h = MatColIdxKokkosViewHost(seq->j, seq->nz);
1746           factors->aLt_h = MatScalarKokkosViewHost(seq->a, seq->nz);
1747         } else {
1748           PetscCall(MatTranspose(factors->L, MAT_REUSE_MATRIX, &factors->Lt)); // Matrix Lt' data is aliased with {i, j, a}Lt_h
1749         }
1750         // Copy Lt from host to device
1751         PetscCallCXX(Kokkos::deep_copy(factors->iLt_d, factors->iLt_h));
1752         PetscCallCXX(Kokkos::deep_copy(factors->jLt_d, factors->jLt_h));
1753         PetscCallCXX(Kokkos::deep_copy(factors->aLt_d, factors->aLt_h));
1754       } else { // If L was computed on device, we also compute the transpose there
1755         // TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices. We have to sort the indices, until KK provides finer control options.
1756         PetscCallCXX(transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iL_d,
1757                                                                                                                                                                                                                                factors->jL_d, factors->aL_d,
1758                                                                                                                                                                                                                                factors->iLt_d, factors->jLt_d,
1759                                                                                                                                                                                                                                factors->aLt_d));
1760         PetscCallCXX(sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iLt_d, factors->jLt_d, factors->aLt_d));
1761       }
1762       PetscCallCXX(sptrsv_symbolic(&factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d));
1763     }
1764 
1765     factors->transpose_updated = PETSC_TRUE;
1766   }
1767   PetscFunctionReturn(PETSC_SUCCESS);
1768 }
1769 
1770 // Solve Ax = b, with RAR = U^T D U, where R is the row (and col) permutation matrix on A.
1771 // R is represented by rowperm in factors. If R is identity (i.e, no reordering), then rowperm is empty.
1772 static PetscErrorCode MatSolve_SeqAIJKokkos_Cholesky(Mat A, Vec bb, Vec xx)
1773 {
1774   auto                        exec    = PetscGetKokkosExecutionSpace();
1775   Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1776   PetscInt                    m       = A->rmap->n;
1777   PetscScalarKokkosView       D       = factors->D_d;
1778   PetscScalarKokkosView       X, Y, B; // alias
1779   ConstPetscScalarKokkosView  b;
1780   PetscScalarKokkosView       x;
1781   PetscIntKokkosView         &rowperm  = factors->rowperm;
1782   PetscBool                   identity = rowperm.extent(0) ? PETSC_FALSE : PETSC_TRUE;
1783 
1784   PetscFunctionBegin;
1785   PetscCall(PetscLogGpuTimeBegin());
1786   PetscCall(MatSeqAIJKokkosSolveCheck(A));          // for UX = T
1787   PetscCall(MatSeqAIJKokkosTransposeSolveCheck(A)); // for U^T Y = B
1788   PetscCall(VecGetKokkosView(bb, &b));
1789   PetscCall(VecGetKokkosViewWrite(xx, &x));
1790 
1791   // Solve U^T Y = B
1792   if (identity) { // Reorder b with the row permutation
1793     B = PetscScalarKokkosView(const_cast<PetscScalar *>(b.data()), b.extent(0));
1794     Y = factors->workVector;
1795   } else {
1796     B = factors->workVector;
1797     PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { B(i) = b(rowperm(i)); }));
1798     Y = x;
1799   }
1800   PetscCallCXX(sptrsv_solve(exec, &factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d, B, Y));
1801 
1802   // Solve diag(D) Y' = Y.
1803   // Actually just do Y' = Y*D since D is already inverted in MatCholeskyFactorNumeric_SeqAIJ(). It is basically a vector element-wise multiplication.
1804   PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { Y(i) = Y(i) * D(i); }));
1805 
1806   // Solve UX = Y
1807   if (identity) {
1808     X = x;
1809   } else {
1810     X = factors->workVector; // B is not needed anymore
1811   }
1812   PetscCallCXX(sptrsv_solve(exec, &factors->khU, factors->iU_d, factors->jU_d, factors->aU_d, Y, X));
1813 
1814   // Reorder X with the inverse column (row) permutation
1815   if (!identity) {
1816     PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { x(rowperm(i)) = X(i); }));
1817   }
1818 
1819   PetscCall(VecRestoreKokkosView(bb, &b));
1820   PetscCall(VecRestoreKokkosViewWrite(xx, &x));
1821   PetscCall(PetscLogGpuTimeEnd());
1822   PetscFunctionReturn(PETSC_SUCCESS);
1823 }
1824 
1825 // Solve Ax = b, with RAC = LU, where R and C are row and col permutation matrices on A respectively.
1826 // R and C are represented by rowperm and colperm in factors.
1827 // If R or C is identity (i.e, no reordering), then rowperm or colperm is empty.
1828 static PetscErrorCode MatSolve_SeqAIJKokkos_LU(Mat A, Vec bb, Vec xx)
1829 {
1830   auto                        exec    = PetscGetKokkosExecutionSpace();
1831   Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1832   PetscInt                    m       = A->rmap->n;
1833   PetscScalarKokkosView       X, Y, B; // alias
1834   ConstPetscScalarKokkosView  b;
1835   PetscScalarKokkosView       x;
1836   PetscIntKokkosView         &rowperm      = factors->rowperm;
1837   PetscIntKokkosView         &colperm      = factors->colperm;
1838   PetscBool                   row_identity = rowperm.extent(0) ? PETSC_FALSE : PETSC_TRUE;
1839   PetscBool                   col_identity = colperm.extent(0) ? PETSC_FALSE : PETSC_TRUE;
1840 
1841   PetscFunctionBegin;
1842   PetscCall(PetscLogGpuTimeBegin());
1843   PetscCall(MatSeqAIJKokkosSolveCheck(A));
1844   PetscCall(VecGetKokkosView(bb, &b));
1845   PetscCall(VecGetKokkosViewWrite(xx, &x));
1846 
1847   // Solve L Y = B (i.e., L (U C^- x) = R b).  R b indicates applying the row permutation on b.
1848   if (row_identity) {
1849     B = PetscScalarKokkosView(const_cast<PetscScalar *>(b.data()), b.extent(0));
1850     Y = factors->workVector;
1851   } else {
1852     B = factors->workVector;
1853     PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { B(i) = b(rowperm(i)); }));
1854     Y = x;
1855   }
1856   PetscCallCXX(sptrsv_solve(exec, &factors->khL, factors->iL_d, factors->jL_d, factors->aL_d, B, Y));
1857 
1858   // Solve U C^- x = Y
1859   if (col_identity) {
1860     X = x;
1861   } else {
1862     X = factors->workVector;
1863   }
1864   PetscCallCXX(sptrsv_solve(exec, &factors->khU, factors->iU_d, factors->jU_d, factors->aU_d, Y, X));
1865 
1866   // x = C X; Reorder X with the inverse col permutation
1867   if (!col_identity) {
1868     PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { x(colperm(i)) = X(i); }));
1869   }
1870 
1871   PetscCall(VecRestoreKokkosView(bb, &b));
1872   PetscCall(VecRestoreKokkosViewWrite(xx, &x));
1873   PetscCall(PetscLogGpuTimeEnd());
1874   PetscFunctionReturn(PETSC_SUCCESS);
1875 }
1876 
1877 // Solve A^T x = b, with RAC = LU, where R and C are row and col permutation matrices on A respectively.
1878 // R and C are represented by rowperm and colperm in factors.
1879 // If R or C is identity (i.e, no reordering), then rowperm or colperm is empty.
1880 // A = R^-1 L U C^-1, so A^T = C^-T U^T L^T R^-T. But since C^- = C^T, R^- = R^T, we have A^T = C U^T L^T R.
1881 static PetscErrorCode MatSolveTranspose_SeqAIJKokkos_LU(Mat A, Vec bb, Vec xx)
1882 {
1883   auto                        exec    = PetscGetKokkosExecutionSpace();
1884   Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1885   PetscInt                    m       = A->rmap->n;
1886   PetscScalarKokkosView       X, Y, B; // alias
1887   ConstPetscScalarKokkosView  b;
1888   PetscScalarKokkosView       x;
1889   PetscIntKokkosView         &rowperm      = factors->rowperm;
1890   PetscIntKokkosView         &colperm      = factors->colperm;
1891   PetscBool                   row_identity = rowperm.extent(0) ? PETSC_FALSE : PETSC_TRUE;
1892   PetscBool                   col_identity = colperm.extent(0) ? PETSC_FALSE : PETSC_TRUE;
1893 
1894   PetscFunctionBegin;
1895   PetscCall(PetscLogGpuTimeBegin());
1896   PetscCall(MatSeqAIJKokkosTransposeSolveCheck(A)); // Update L^T, U^T if needed, and do sptrsv symbolic for L^T, U^T
1897   PetscCall(VecGetKokkosView(bb, &b));
1898   PetscCall(VecGetKokkosViewWrite(xx, &x));
1899 
1900   // Solve U^T Y = B (i.e., U^T (L^T R x) = C^- b).  Note C^- b = C^T b, which means applying the column permutation on b.
1901   if (col_identity) { // Reorder b with the col permutation
1902     B = PetscScalarKokkosView(const_cast<PetscScalar *>(b.data()), b.extent(0));
1903     Y = factors->workVector;
1904   } else {
1905     B = factors->workVector;
1906     PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { B(i) = b(colperm(i)); }));
1907     Y = x;
1908   }
1909   PetscCallCXX(sptrsv_solve(exec, &factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d, B, Y));
1910 
1911   // Solve L^T X = Y
1912   if (row_identity) {
1913     X = x;
1914   } else {
1915     X = factors->workVector;
1916   }
1917   PetscCallCXX(sptrsv_solve(exec, &factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d, Y, X));
1918 
1919   // x = R^- X = R^T X; Reorder X with the inverse row permutation
1920   if (!row_identity) {
1921     PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { x(rowperm(i)) = X(i); }));
1922   }
1923 
1924   PetscCall(VecRestoreKokkosView(bb, &b));
1925   PetscCall(VecRestoreKokkosViewWrite(xx, &x));
1926   PetscCall(PetscLogGpuTimeEnd());
1927   PetscFunctionReturn(PETSC_SUCCESS);
1928 }
1929 
1930 static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info)
1931 {
1932   PetscFunctionBegin;
1933   PetscCall(MatSeqAIJKokkosSyncHost(A));
1934   PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info));
1935 
1936   if (!info->solveonhost) { // if solve on host, then we don't need to copy L, U to device
1937     Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr;
1938     Mat_SeqAIJ                 *b       = static_cast<Mat_SeqAIJ *>(B->data);
1939     const PetscInt             *Bi = b->i, *Bj = b->j, *Bdiag = b->diag;
1940     const MatScalar            *Ba = b->a;
1941     PetscInt                    m = B->rmap->n, n = B->cmap->n;
1942 
1943     if (factors->iL_h.extent(0) == 0) { // Allocate memory and copy the L, U structure for the first time
1944       // Allocate memory and copy the structure
1945       factors->iL_h = MatRowMapKokkosViewHost(NoInit("iL_h"), m + 1);
1946       factors->jL_h = MatColIdxKokkosViewHost(NoInit("jL_h"), (Bi[m] - Bi[0]) + m); // + the diagonal entries
1947       factors->aL_h = MatScalarKokkosViewHost(NoInit("aL_h"), (Bi[m] - Bi[0]) + m);
1948       factors->iU_h = MatRowMapKokkosViewHost(NoInit("iU_h"), m + 1);
1949       factors->jU_h = MatColIdxKokkosViewHost(NoInit("jU_h"), (Bdiag[0] - Bdiag[m]));
1950       factors->aU_h = MatScalarKokkosViewHost(NoInit("aU_h"), (Bdiag[0] - Bdiag[m]));
1951 
1952       PetscInt *Li = factors->iL_h.data();
1953       PetscInt *Lj = factors->jL_h.data();
1954       PetscInt *Ui = factors->iU_h.data();
1955       PetscInt *Uj = factors->jU_h.data();
1956 
1957       Li[0] = Ui[0] = 0;
1958       for (PetscInt i = 0; i < m; i++) {
1959         PetscInt llen = Bi[i + 1] - Bi[i];       // exclusive of the diagonal entry
1960         PetscInt ulen = Bdiag[i] - Bdiag[i + 1]; // inclusive of the diagonal entry
1961 
1962         PetscArraycpy(Lj + Li[i], Bj + Bi[i], llen); // entries of L on the left of the diagonal
1963         Lj[Li[i] + llen] = i;                        // diagonal entry of L
1964 
1965         Uj[Ui[i]] = i;                                                  // diagonal entry of U
1966         PetscArraycpy(Uj + Ui[i] + 1, Bj + Bdiag[i + 1] + 1, ulen - 1); // entries of U on  the right of the diagonal
1967 
1968         Li[i + 1] = Li[i] + llen + 1;
1969         Ui[i + 1] = Ui[i] + ulen;
1970       }
1971 
1972       factors->iL_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->iL_h);
1973       factors->jL_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->jL_h);
1974       factors->iU_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->iU_h);
1975       factors->jU_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->jU_h);
1976       factors->aL_d = Kokkos::create_mirror_view(DefaultMemorySpace(), factors->aL_h);
1977       factors->aU_d = Kokkos::create_mirror_view(DefaultMemorySpace(), factors->aU_h);
1978 
1979       // Copy row/col permutation to device
1980       IS        rowperm = ((Mat_SeqAIJ *)B->data)->row;
1981       PetscBool row_identity;
1982       PetscCall(ISIdentity(rowperm, &row_identity));
1983       if (!row_identity) {
1984         const PetscInt *ip;
1985 
1986         PetscCall(ISGetIndices(rowperm, &ip));
1987         factors->rowperm = PetscIntKokkosView(NoInit("rowperm"), m);
1988         PetscCallCXX(Kokkos::deep_copy(factors->rowperm, PetscIntKokkosViewHost(const_cast<PetscInt *>(ip), m)));
1989         PetscCall(ISRestoreIndices(rowperm, &ip));
1990         PetscCall(PetscLogCpuToGpu(m * sizeof(PetscInt)));
1991       }
1992 
1993       IS        colperm = ((Mat_SeqAIJ *)B->data)->col;
1994       PetscBool col_identity;
1995       PetscCall(ISIdentity(colperm, &col_identity));
1996       if (!col_identity) {
1997         const PetscInt *ip;
1998 
1999         PetscCall(ISGetIndices(colperm, &ip));
2000         factors->colperm = PetscIntKokkosView(NoInit("colperm"), n);
2001         PetscCallCXX(Kokkos::deep_copy(factors->colperm, PetscIntKokkosViewHost(const_cast<PetscInt *>(ip), n)));
2002         PetscCall(ISRestoreIndices(colperm, &ip));
2003         PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
2004       }
2005 
2006       /* Create sptrsv handles for L, U and their transpose */
2007 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
2008       auto sptrsv_alg = SPTRSVAlgorithm::SPTRSV_CUSPARSE;
2009 #else
2010       auto sptrsv_alg = SPTRSVAlgorithm::SEQLVLSCHD_TP1;
2011 #endif
2012       factors->khL.create_sptrsv_handle(sptrsv_alg, m, true /* L is lower tri */);
2013       factors->khU.create_sptrsv_handle(sptrsv_alg, m, false /* U is not lower tri */);
2014       factors->khLt.create_sptrsv_handle(sptrsv_alg, m, false /* L^T is not lower tri */);
2015       factors->khUt.create_sptrsv_handle(sptrsv_alg, m, true /* U^T is lower tri */);
2016     }
2017 
2018     // Copy the value
2019     for (PetscInt i = 0; i < m; i++) {
2020       PetscInt        llen = Bi[i + 1] - Bi[i];
2021       PetscInt        ulen = Bdiag[i] - Bdiag[i + 1];
2022       const PetscInt *Li   = factors->iL_h.data();
2023       const PetscInt *Ui   = factors->iU_h.data();
2024 
2025       PetscScalar *La = factors->aL_h.data();
2026       PetscScalar *Ua = factors->aU_h.data();
2027 
2028       PetscArraycpy(La + Li[i], Ba + Bi[i], llen); // entries of L
2029       La[Li[i] + llen] = 1.0;                      // diagonal entry
2030 
2031       Ua[Ui[i]] = 1.0 / Ba[Bdiag[i]];                                 // diagonal entry
2032       PetscArraycpy(Ua + Ui[i] + 1, Ba + Bdiag[i + 1] + 1, ulen - 1); // entries of U
2033     }
2034 
2035     PetscCallCXX(Kokkos::deep_copy(factors->aL_d, factors->aL_h));
2036     PetscCallCXX(Kokkos::deep_copy(factors->aU_d, factors->aU_h));
2037     // Once the factors' values have changed, we need to update their transpose and redo sptrsv symbolic
2038     factors->transpose_updated         = PETSC_FALSE;
2039     factors->sptrsv_symbolic_completed = PETSC_FALSE;
2040 
2041     B->ops->solve          = MatSolve_SeqAIJKokkos_LU;
2042     B->ops->solvetranspose = MatSolveTranspose_SeqAIJKokkos_LU;
2043   }
2044 
2045   B->ops->matsolve          = NULL;
2046   B->ops->matsolvetranspose = NULL;
2047   PetscFunctionReturn(PETSC_SUCCESS);
2048 }
2049 
2050 static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos_ILU0(Mat B, Mat A, const MatFactorInfo *info)
2051 {
2052   Mat_SeqAIJKokkos           *aijkok   = (Mat_SeqAIJKokkos *)A->spptr;
2053   Mat_SeqAIJKokkosTriFactors *factors  = (Mat_SeqAIJKokkosTriFactors *)B->spptr;
2054   PetscInt                    fill_lev = info->levels;
2055 
2056   PetscFunctionBegin;
2057   PetscCall(PetscLogGpuTimeBegin());
2058   PetscCheck(!info->factoronhost, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "MatFactorInfo.factoronhost should be false");
2059   PetscCall(MatSeqAIJKokkosSyncDevice(A));
2060 
2061   auto a_d = aijkok->a_dual.view_device();
2062   auto i_d = aijkok->i_dual.view_device();
2063   auto j_d = aijkok->j_dual.view_device();
2064 
2065   PetscCallCXX(spiluk_numeric(&factors->kh, fill_lev, i_d, j_d, a_d, factors->iL_d, factors->jL_d, factors->aL_d, factors->iU_d, factors->jU_d, factors->aU_d));
2066 
2067   B->assembled              = PETSC_TRUE;
2068   B->preallocated           = PETSC_TRUE;
2069   B->ops->solve             = MatSolve_SeqAIJKokkos_LU;
2070   B->ops->solvetranspose    = MatSolveTranspose_SeqAIJKokkos_LU;
2071   B->ops->matsolve          = NULL;
2072   B->ops->matsolvetranspose = NULL;
2073 
2074   /* Once the factors' value changed, we need to update their transpose and sptrsv handle */
2075   factors->transpose_updated         = PETSC_FALSE;
2076   factors->sptrsv_symbolic_completed = PETSC_FALSE;
2077   /* TODO: log flops, but how to know that? */
2078   PetscCall(PetscLogGpuTimeEnd());
2079   PetscFunctionReturn(PETSC_SUCCESS);
2080 }
2081 
2082 // Use KK's spiluk_symbolic() to do ILU0 symbolic factorization, with no row/col reordering
2083 static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos_ILU0(Mat B, Mat A, IS, IS, const MatFactorInfo *info)
2084 {
2085   Mat_SeqAIJKokkos           *aijkok;
2086   Mat_SeqAIJ                 *b;
2087   Mat_SeqAIJKokkosTriFactors *factors  = (Mat_SeqAIJKokkosTriFactors *)B->spptr;
2088   PetscInt                    fill_lev = info->levels;
2089   PetscInt                    nnzA     = ((Mat_SeqAIJ *)A->data)->nz, nnzL, nnzU;
2090   PetscInt                    n        = A->rmap->n;
2091 
2092   PetscFunctionBegin;
2093   PetscCheck(!info->factoronhost, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "MatFactorInfo's factoronhost should be false as we are doing it on device right now");
2094   PetscCall(MatSeqAIJKokkosSyncDevice(A));
2095 
2096   /* Create a spiluk handle and then do symbolic factorization */
2097   nnzL = nnzU = PetscRealIntMultTruncate(info->fill, nnzA);
2098   factors->kh.create_spiluk_handle(SPILUKAlgorithm::SEQLVLSCHD_TP1, n, nnzL, nnzU);
2099 
2100   auto spiluk_handle = factors->kh.get_spiluk_handle();
2101 
2102   Kokkos::realloc(factors->iL_d, n + 1); /* Free old arrays and realloc */
2103   Kokkos::realloc(factors->jL_d, spiluk_handle->get_nnzL());
2104   Kokkos::realloc(factors->iU_d, n + 1);
2105   Kokkos::realloc(factors->jU_d, spiluk_handle->get_nnzU());
2106 
2107   aijkok   = (Mat_SeqAIJKokkos *)A->spptr;
2108   auto i_d = aijkok->i_dual.view_device();
2109   auto j_d = aijkok->j_dual.view_device();
2110   PetscCallCXX(spiluk_symbolic(&factors->kh, fill_lev, i_d, j_d, factors->iL_d, factors->jL_d, factors->iU_d, factors->jU_d));
2111   /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */
2112 
2113   Kokkos::resize(factors->jL_d, spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */
2114   Kokkos::resize(factors->jU_d, spiluk_handle->get_nnzU());
2115   Kokkos::realloc(factors->aL_d, spiluk_handle->get_nnzL()); /* No need to retain old value */
2116   Kokkos::realloc(factors->aU_d, spiluk_handle->get_nnzU());
2117 
2118   /* TODO: add options to select sptrsv algorithms */
2119   /* Create sptrsv handles for L, U and their transpose */
2120 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
2121   auto sptrsv_alg = SPTRSVAlgorithm::SPTRSV_CUSPARSE;
2122 #else
2123   auto sptrsv_alg = SPTRSVAlgorithm::SEQLVLSCHD_TP1;
2124 #endif
2125 
2126   factors->khL.create_sptrsv_handle(sptrsv_alg, n, true /* L is lower tri */);
2127   factors->khU.create_sptrsv_handle(sptrsv_alg, n, false /* U is not lower tri */);
2128   factors->khLt.create_sptrsv_handle(sptrsv_alg, n, false /* L^T is not lower tri */);
2129   factors->khUt.create_sptrsv_handle(sptrsv_alg, n, true /* U^T is lower tri */);
2130 
2131   /* Fill fields of the factor matrix B */
2132   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(B, MAT_SKIP_ALLOCATION, NULL));
2133   b     = (Mat_SeqAIJ *)B->data;
2134   b->nz = b->maxnz          = spiluk_handle->get_nnzL() + spiluk_handle->get_nnzU();
2135   B->info.fill_ratio_given  = info->fill;
2136   B->info.fill_ratio_needed = nnzA > 0 ? ((PetscReal)b->nz) / ((PetscReal)nnzA) : 1.0;
2137 
2138   B->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJKokkos_ILU0;
2139   PetscFunctionReturn(PETSC_SUCCESS);
2140 }
2141 
2142 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
2143 {
2144   PetscFunctionBegin;
2145   PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
2146   PetscCheck(!B->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr");
2147   PetscCallCXX(B->spptr = new Mat_SeqAIJKokkosTriFactors(B->rmap->n));
2148   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos;
2149   PetscFunctionReturn(PETSC_SUCCESS);
2150 }
2151 
2152 static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
2153 {
2154   PetscBool row_identity = PETSC_FALSE, col_identity = PETSC_FALSE;
2155 
2156   PetscFunctionBegin;
2157   if (!info->factoronhost) {
2158     PetscCall(ISIdentity(isrow, &row_identity));
2159     PetscCall(ISIdentity(iscol, &col_identity));
2160   }
2161 
2162   PetscCheck(!B->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr");
2163   PetscCallCXX(B->spptr = new Mat_SeqAIJKokkosTriFactors(B->rmap->n));
2164 
2165   if (!info->factoronhost && !info->levels && row_identity && col_identity) { // if level 0 and no reordering
2166     PetscCall(MatILUFactorSymbolic_SeqAIJKokkos_ILU0(B, A, isrow, iscol, info));
2167   } else {
2168     PetscCall(MatILUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info)); // otherwise, use PETSc's ILU on host
2169     B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos;
2170   }
2171   PetscFunctionReturn(PETSC_SUCCESS);
2172 }
2173 
2174 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info)
2175 {
2176   PetscFunctionBegin;
2177   PetscCall(MatSeqAIJKokkosSyncHost(A));
2178   PetscCall(MatCholeskyFactorNumeric_SeqAIJ(B, A, info));
2179 
2180   if (!info->solveonhost) { // if solve on host, then we don't need to copy L, U to device
2181     Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr;
2182     Mat_SeqAIJ                 *b       = static_cast<Mat_SeqAIJ *>(B->data);
2183     const PetscInt             *Bi = b->i, *Bj = b->j, *Bdiag = b->diag;
2184     const MatScalar            *Ba = b->a;
2185     PetscInt                    m  = B->rmap->n;
2186 
2187     if (factors->iU_h.extent(0) == 0) { // First time of numeric factorization
2188       // Allocate memory and copy the structure
2189       factors->iU_h = PetscIntKokkosViewHost(const_cast<PetscInt *>(Bi), m + 1); // wrap Bi as iU_h
2190       factors->jU_h = MatColIdxKokkosViewHost(NoInit("jU_h"), Bi[m]);
2191       factors->aU_h = MatScalarKokkosViewHost(NoInit("aU_h"), Bi[m]);
2192       factors->D_h  = MatScalarKokkosViewHost(NoInit("D_h"), m);
2193       factors->aU_d = Kokkos::create_mirror_view(DefaultMemorySpace(), factors->aU_h);
2194       factors->D_d  = Kokkos::create_mirror_view(DefaultMemorySpace(), factors->D_h);
2195 
2196       // Build jU_h from the skewed Aj
2197       PetscInt *Uj = factors->jU_h.data();
2198       for (PetscInt i = 0; i < m; i++) {
2199         PetscInt ulen = Bi[i + 1] - Bi[i];
2200         Uj[Bi[i]]     = i;                                              // diagonal entry
2201         PetscCall(PetscArraycpy(Uj + Bi[i] + 1, Bj + Bi[i], ulen - 1)); // entries of U on the right of the diagonal
2202       }
2203 
2204       // Copy iU, jU to device
2205       PetscCallCXX(factors->iU_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->iU_h));
2206       PetscCallCXX(factors->jU_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->jU_h));
2207 
2208       // Copy row/col permutation to device
2209       IS        rowperm = ((Mat_SeqAIJ *)B->data)->row;
2210       PetscBool row_identity;
2211       PetscCall(ISIdentity(rowperm, &row_identity));
2212       if (!row_identity) {
2213         const PetscInt *ip;
2214 
2215         PetscCall(ISGetIndices(rowperm, &ip));
2216         PetscCallCXX(factors->rowperm = PetscIntKokkosView(NoInit("rowperm"), m));
2217         PetscCallCXX(Kokkos::deep_copy(factors->rowperm, PetscIntKokkosViewHost(const_cast<PetscInt *>(ip), m)));
2218         PetscCall(ISRestoreIndices(rowperm, &ip));
2219         PetscCall(PetscLogCpuToGpu(m * sizeof(PetscInt)));
2220       }
2221 
2222       // Create sptrsv handles for U and U^T
2223 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
2224       auto sptrsv_alg = SPTRSVAlgorithm::SPTRSV_CUSPARSE;
2225 #else
2226       auto sptrsv_alg = SPTRSVAlgorithm::SEQLVLSCHD_TP1;
2227 #endif
2228       factors->khU.create_sptrsv_handle(sptrsv_alg, m, false /* U is not lower tri */);
2229       factors->khUt.create_sptrsv_handle(sptrsv_alg, m, true /* U^T is lower tri */);
2230     }
2231     // These pointers were set MatCholeskyFactorNumeric_SeqAIJ(), so we always need to update them
2232     B->ops->solve          = MatSolve_SeqAIJKokkos_Cholesky;
2233     B->ops->solvetranspose = MatSolve_SeqAIJKokkos_Cholesky;
2234 
2235     // Copy the value
2236     PetscScalar *Ua = factors->aU_h.data();
2237     PetscScalar *D  = factors->D_h.data();
2238     for (PetscInt i = 0; i < m; i++) {
2239       D[i]      = Ba[Bdiag[i]];     // actually Aa[Adiag[i]] is the inverse of the diagonal
2240       Ua[Bi[i]] = (PetscScalar)1.0; // set the unit diagonal for U
2241       for (PetscInt k = 0; k < Bi[i + 1] - Bi[i] - 1; k++) Ua[Bi[i] + 1 + k] = -Ba[Bi[i] + k];
2242     }
2243     PetscCallCXX(Kokkos::deep_copy(factors->aU_d, factors->aU_h));
2244     PetscCallCXX(Kokkos::deep_copy(factors->D_d, factors->D_h));
2245 
2246     factors->sptrsv_symbolic_completed = PETSC_FALSE; // When numeric value changed, we must do these again
2247     factors->transpose_updated         = PETSC_FALSE;
2248   }
2249 
2250   B->ops->matsolve          = NULL;
2251   B->ops->matsolvetranspose = NULL;
2252   PetscFunctionReturn(PETSC_SUCCESS);
2253 }
2254 
2255 static PetscErrorCode MatICCFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS perm, const MatFactorInfo *info)
2256 {
2257   PetscFunctionBegin;
2258   if (info->solveonhost) {
2259     // If solve on host, we have to change the type, as eventually we need to call MatSolve_SeqSBAIJ_1_NaturalOrdering() etc.
2260     PetscCall(MatSetType(B, MATSEQSBAIJ));
2261     PetscCall(MatSeqSBAIJSetPreallocation(B, 1, MAT_SKIP_ALLOCATION, NULL));
2262   }
2263 
2264   PetscCall(MatICCFactorSymbolic_SeqAIJ(B, A, perm, info));
2265 
2266   if (!info->solveonhost) {
2267     // If solve on device, B is still a MATSEQAIJKOKKOS, so we are good to allocate B->spptr
2268     PetscCheck(!B->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr");
2269     PetscCallCXX(B->spptr = new Mat_SeqAIJKokkosTriFactors(B->rmap->n));
2270     B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJKokkos;
2271   }
2272   PetscFunctionReturn(PETSC_SUCCESS);
2273 }
2274 
2275 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS perm, const MatFactorInfo *info)
2276 {
2277   PetscFunctionBegin;
2278   if (info->solveonhost) {
2279     // If solve on host, we have to change the type, as eventually we need to call MatSolve_SeqSBAIJ_1_NaturalOrdering() etc.
2280     PetscCall(MatSetType(B, MATSEQSBAIJ));
2281     PetscCall(MatSeqSBAIJSetPreallocation(B, 1, MAT_SKIP_ALLOCATION, NULL));
2282   }
2283 
2284   PetscCall(MatCholeskyFactorSymbolic_SeqAIJ(B, A, perm, info)); // it sets B's two ISes ((Mat_SeqAIJ*)B->data)->{row, col} to perm
2285 
2286   if (!info->solveonhost) {
2287     // If solve on device, B is still a MATSEQAIJKOKKOS, so we are good to allocate B->spptr
2288     PetscCheck(!B->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr");
2289     PetscCallCXX(B->spptr = new Mat_SeqAIJKokkosTriFactors(B->rmap->n));
2290     B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJKokkos;
2291   }
2292   PetscFunctionReturn(PETSC_SUCCESS);
2293 }
2294 
2295 // The _Kokkos suffix means we will use Kokkos as a solver for the SeqAIJKokkos matrix
2296 static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos_Kokkos(Mat A, MatSolverType *type)
2297 {
2298   PetscFunctionBegin;
2299   *type = MATSOLVERKOKKOS;
2300   PetscFunctionReturn(PETSC_SUCCESS);
2301 }
2302 
2303 /*MC
2304   MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices
2305   on a single GPU of type, `MATSEQAIJKOKKOS`, `MATAIJKOKKOS`.
2306 
2307   Level: beginner
2308 
2309 .seealso: [](ch_matrices), `Mat`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJKokkos()`, `MATAIJKOKKOS`, `MatKokkosSetFormat()`, `MatKokkosStorageFormat`, `MatKokkosFormatOperation`
2310 M*/
2311 PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A, MatFactorType ftype, Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */
2312 {
2313   PetscInt n = A->rmap->n;
2314   MPI_Comm comm;
2315 
2316   PetscFunctionBegin;
2317   PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
2318   PetscCall(MatCreate(comm, B));
2319   PetscCall(MatSetSizes(*B, n, n, n, n));
2320   PetscCall(MatSetBlockSizesFromMats(*B, A, A));
2321   (*B)->factortype = ftype;
2322   PetscCall(MatSetType(*B, MATSEQAIJKOKKOS));
2323   PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL));
2324   PetscCheck(!(*B)->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr");
2325 
2326   if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
2327     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJKokkos;
2328     (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos;
2329     PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU]));
2330     PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILU]));
2331     PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILUDT]));
2332   } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
2333     (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJKokkos;
2334     (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJKokkos;
2335     PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]));
2336     PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ICC]));
2337   } else SETERRQ(comm, PETSC_ERR_SUP, "MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]);
2338 
2339   // The factorization can use the ordering provided in MatLUFactorSymbolic(), MatCholeskyFactorSymbolic() etc, though we do it on host
2340   (*B)->canuseordering = PETSC_TRUE;
2341   PetscCall(PetscObjectComposeFunction((PetscObject)*B, "MatFactorGetSolverType_C", MatFactorGetSolverType_SeqAIJKokkos_Kokkos));
2342   PetscFunctionReturn(PETSC_SUCCESS);
2343 }
2344 
2345 PETSC_INTERN PetscErrorCode MatSolverTypeRegister_Kokkos(void)
2346 {
2347   PetscFunctionBegin;
2348   PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_LU, MatGetFactor_SeqAIJKokkos_Kokkos));
2349   PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_CHOLESKY, MatGetFactor_SeqAIJKokkos_Kokkos));
2350   PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_ILU, MatGetFactor_SeqAIJKokkos_Kokkos));
2351   PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_ICC, MatGetFactor_SeqAIJKokkos_Kokkos));
2352   PetscFunctionReturn(PETSC_SUCCESS);
2353 }
2354 
2355 /* Utility to print out a KokkosCsrMatrix for debugging */
2356 PETSC_INTERN PetscErrorCode PrintCsrMatrix(const KokkosCsrMatrix &csrmat)
2357 {
2358   const auto        &iv = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), csrmat.graph.row_map);
2359   const auto        &jv = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), csrmat.graph.entries);
2360   const auto        &av = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), csrmat.values);
2361   const PetscInt    *i  = iv.data();
2362   const PetscInt    *j  = jv.data();
2363   const PetscScalar *a  = av.data();
2364   PetscInt           m = csrmat.numRows(), n = csrmat.numCols(), nnz = csrmat.nnz();
2365 
2366   PetscFunctionBegin;
2367   PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT " x %" PetscInt_FMT " SeqAIJKokkos, with %" PetscInt_FMT " nonzeros\n", m, n, nnz));
2368   for (PetscInt k = 0; k < m; k++) {
2369     PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT ": ", k));
2370     for (PetscInt p = i[k]; p < i[k + 1]; p++) PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT "(%.1f), ", j[p], (double)PetscRealPart(a[p])));
2371     PetscCall(PetscPrintf(PETSC_COMM_SELF, "\n"));
2372   }
2373   PetscFunctionReturn(PETSC_SUCCESS);
2374 }
2375