xref: /petsc/src/mat/impls/aij/seq/kokkos/aijkok.kokkos.cxx (revision 2cdf5ea42bccd4e651ec69c5d7cf37657be83b41)
1 #include <petsc_kokkos.hpp>
2 #include <petscvec_kokkos.hpp>
3 #include <petscmat_kokkos.hpp>
4 #include <petscpkg_version.h>
5 #include <petsc/private/petscimpl.h>
6 #include <petsc/private/sfimpl.h>
7 #include <petsc/private/kokkosimpl.hpp>
8 #include <petscsys.h>
9 
10 #include <Kokkos_Core.hpp>
11 #include <KokkosBlas.hpp>
12 #include <KokkosSparse_CrsMatrix.hpp>
13 
14 // To suppress compiler warnings:
15 // /path/include/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp:434:63:
16 // warning: 'cusparseStatus_t cusparseDbsrmm(cusparseHandle_t, cusparseDirection_t, cusparseOperation_t,
17 // cusparseOperation_t, int, int, int, int, const double*, cusparseMatDescr_t, const double*, const int*, const int*,
18 // int, const double*, int, const double*, double*, int)' is deprecated: please use cusparseSpMM instead [-Wdeprecated-declarations]
19 PETSC_PRAGMA_DIAGNOSTIC_IGNORED_BEGIN("-Wdeprecated-declarations")
20 #include <KokkosSparse_spmv.hpp>
21 PETSC_PRAGMA_DIAGNOSTIC_IGNORED_END()
22 
23 #include <KokkosSparse_spiluk.hpp>
24 #include <KokkosSparse_sptrsv.hpp>
25 #include <KokkosSparse_spgemm.hpp>
26 #include <KokkosSparse_spadd.hpp>
27 #include <KokkosBatched_LU_Decl.hpp>
28 #include <KokkosBatched_InverseLU_Decl.hpp>
29 
30 #include <../src/mat/impls/aij/seq/kokkos/aijkok.hpp>
31 
32 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_GE(3, 7, 0)
33   #include <KokkosSparse_Utils.hpp>
34 using KokkosSparse::sort_crs_matrix;
35 using KokkosSparse::Impl::transpose_matrix;
36 #else
37   #include <KokkosKernels_Sorting.hpp>
38 using KokkosKernels::sort_crs_matrix;
39 using KokkosKernels::Impl::transpose_matrix;
40 #endif
41 
42 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_GE(4, 6, 0)
43 using KokkosSparse::spiluk_symbolic;
44 using KokkosSparse::spiluk_numeric;
45 using KokkosSparse::sptrsv_symbolic;
46 using KokkosSparse::sptrsv_solve;
47 using KokkosSparse::Experimental::SPTRSVAlgorithm;
48 using KokkosSparse::Experimental::SPILUKAlgorithm;
49 #else
50 using KokkosSparse::Experimental::spiluk_symbolic;
51 using KokkosSparse::Experimental::spiluk_numeric;
52 using KokkosSparse::Experimental::sptrsv_symbolic;
53 using KokkosSparse::Experimental::sptrsv_solve;
54 using KokkosSparse::Experimental::SPTRSVAlgorithm;
55 using KokkosSparse::Experimental::SPILUKAlgorithm;
56 #endif
57 
58 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */
59 
60 /* MatAssemblyEnd_SeqAIJKokkos() happens when we finalized nonzeros of the matrix, either after
61    we assembled the matrix on host, or after we directly produced the matrix data on device (ex., through MatMatMult).
62    In the latter case, it is important to set a_dual's sync state correctly.
63  */
64 static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A, MatAssemblyType mode)
65 {
66   Mat_SeqAIJ       *aijseq;
67   Mat_SeqAIJKokkos *aijkok;
68 
69   PetscFunctionBegin;
70   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS);
71   PetscCall(MatAssemblyEnd_SeqAIJ(A, mode));
72 
73   aijseq = static_cast<Mat_SeqAIJ *>(A->data);
74   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
75 
76   /* If aijkok does not exist, we just copy i, j to device.
77      If aijkok already exists, but the device's nonzero pattern does not match with the host's, we assume the latest data is on host.
78      In both cases, we build a new aijkok structure.
79   */
80   if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { /* aijkok might not exist yet or nonzero pattern has changed */
81     delete aijkok;
82     aijkok   = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aijseq, A->nonzerostate, PETSC_FALSE /*don't copy mat values to device*/);
83     A->spptr = aijkok;
84   } else if (A->rmap->n && aijkok->diag_dual.extent(0) == 0) { // MatProduct might directly produce AIJ on device, but not the diag.
85     MatRowMapKokkosViewHost diag_h(aijseq->diag, A->rmap->n);
86     auto                    diag_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), diag_h);
87     aijkok->diag_dual              = MatRowMapKokkosDualView(diag_d, diag_h);
88   }
89   PetscFunctionReturn(PETSC_SUCCESS);
90 }
91 
92 /* Sync CSR data to device if not yet */
93 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A)
94 {
95   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
96 
97   PetscFunctionBegin;
98   PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Can't sync factorized matrix from host to device");
99   PetscCheck(aijkok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
100   if (aijkok->a_dual.need_sync_device()) {
101     aijkok->a_dual.sync_device();
102     aijkok->transpose_updated = PETSC_FALSE; /* values of the transpose is out-of-date */
103     aijkok->hermitian_updated = PETSC_FALSE;
104   }
105   PetscFunctionReturn(PETSC_SUCCESS);
106 }
107 
108 /* Mark the CSR data on device as modified */
109 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosModifyDevice(Mat A)
110 {
111   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
112 
113   PetscFunctionBegin;
114   PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Not supported for factorized matries");
115   aijkok->a_dual.clear_sync_state();
116   aijkok->a_dual.modify_device();
117   aijkok->transpose_updated = PETSC_FALSE;
118   aijkok->hermitian_updated = PETSC_FALSE;
119   PetscCall(MatSeqAIJInvalidateDiagonal(A));
120   PetscCall(PetscObjectStateIncrease((PetscObject)A));
121   PetscFunctionReturn(PETSC_SUCCESS);
122 }
123 
124 static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A)
125 {
126   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
127   auto              exec   = PetscGetKokkosExecutionSpace();
128 
129   PetscFunctionBegin;
130   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
131   /* We do not expect one needs factors on host  */
132   PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Can't sync factorized matrix from device to host");
133   PetscCheck(aijkok, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing AIJKOK");
134   PetscCall(KokkosDualViewSync<HostMirrorMemorySpace>(aijkok->a_dual, exec));
135   PetscFunctionReturn(PETSC_SUCCESS);
136 }
137 
138 static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A, PetscScalar *array[])
139 {
140   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
141 
142   PetscFunctionBegin;
143   /* aijkok contains valid pointers only if the host's nonzerostate matches with the device's.
144     Calling MatSeqAIJSetPreallocation() or MatSetValues() on host, where aijseq->{i,j,a} might be
145     reallocated, will lead to stale {i,j,a}_dual in aijkok. In both operations, the hosts's nonzerostate
146     must have been updated. The stale aijkok will be rebuilt during MatAssemblyEnd.
147   */
148   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
149     auto exec = PetscGetKokkosExecutionSpace();
150     PetscCallCXX(aijkok->a_dual.sync_host(exec));
151     PetscCallCXX(exec.fence());
152     *array = aijkok->a_dual.view_host().data();
153   } else { /* Happens when calling MatSetValues on a newly created matrix */
154     *array = static_cast<Mat_SeqAIJ *>(A->data)->a;
155   }
156   PetscFunctionReturn(PETSC_SUCCESS);
157 }
158 
159 static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJKokkos(Mat A, PetscScalar *array[])
160 {
161   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
162 
163   PetscFunctionBegin;
164   if (aijkok && A->nonzerostate == aijkok->nonzerostate) aijkok->a_dual.modify_host();
165   PetscFunctionReturn(PETSC_SUCCESS);
166 }
167 
168 static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[])
169 {
170   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
171 
172   PetscFunctionBegin;
173   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
174     auto exec = PetscGetKokkosExecutionSpace();
175     PetscCallCXX(aijkok->a_dual.sync_host(exec));
176     PetscCallCXX(exec.fence());
177     *array = aijkok->a_dual.view_host().data();
178   } else {
179     *array = static_cast<Mat_SeqAIJ *>(A->data)->a;
180   }
181   PetscFunctionReturn(PETSC_SUCCESS);
182 }
183 
184 static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[])
185 {
186   PetscFunctionBegin;
187   *array = NULL;
188   PetscFunctionReturn(PETSC_SUCCESS);
189 }
190 
191 static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[])
192 {
193   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
194 
195   PetscFunctionBegin;
196   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
197     *array = aijkok->a_dual.view_host().data();
198   } else { /* Ex. happens with MatZeroEntries on a preallocated but not assembled matrix */
199     *array = static_cast<Mat_SeqAIJ *>(A->data)->a;
200   }
201   PetscFunctionReturn(PETSC_SUCCESS);
202 }
203 
204 static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[])
205 {
206   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
207 
208   PetscFunctionBegin;
209   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
210     aijkok->a_dual.clear_sync_state();
211     aijkok->a_dual.modify_host();
212   }
213   PetscFunctionReturn(PETSC_SUCCESS);
214 }
215 
216 static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJKokkos(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype)
217 {
218   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
219 
220   PetscFunctionBegin;
221   PetscCheck(aijkok != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "aijkok is NULL");
222 
223   if (i) *i = aijkok->i_device_data();
224   if (j) *j = aijkok->j_device_data();
225   if (a) {
226     aijkok->a_dual.sync_device();
227     *a = aijkok->a_device_data();
228   }
229   if (mtype) *mtype = PETSC_MEMTYPE_KOKKOS;
230   PetscFunctionReturn(PETSC_SUCCESS);
231 }
232 
233 /*
234   Generate the sparsity pattern of a MatSeqAIJKokkos matrix's transpose on device.
235 
236   Input Parameter:
237 .  A       - the MATSEQAIJKOKKOS matrix
238 
239   Output Parameters:
240 +  perm_d - the permutation array on device, which connects Ta(i) = Aa(perm(i))
241 -  T_d    - the transpose on device, whose value array is allocated but not initialized
242 */
243 static PetscErrorCode MatSeqAIJKokkosGenerateTransposeStructure(Mat A, MatRowMapKokkosView &perm_d, KokkosCsrMatrix &T_d)
244 {
245   Mat_SeqAIJ             *aseq = static_cast<Mat_SeqAIJ *>(A->data);
246   PetscInt                nz = aseq->nz, m = A->rmap->N, n = A->cmap->n;
247   const PetscInt         *Ai = aseq->i, *Aj = aseq->j;
248   MatRowMapKokkosViewHost Ti_h(NoInit("Ti"), n + 1);
249   MatRowMapType          *Ti = Ti_h.data();
250   MatColIdxKokkosViewHost Tj_h(NoInit("Tj"), nz);
251   MatRowMapKokkosViewHost perm_h(NoInit("permutation"), nz);
252   PetscInt               *Tj   = Tj_h.data();
253   PetscInt               *perm = perm_h.data();
254   PetscInt               *offset;
255 
256   PetscFunctionBegin;
257   // Populate Ti
258   PetscCallCXX(Kokkos::deep_copy(Ti_h, 0));
259   Ti++;
260   for (PetscInt i = 0; i < nz; i++) Ti[Aj[i]]++;
261   Ti--;
262   for (PetscInt i = 0; i < n; i++) Ti[i + 1] += Ti[i];
263 
264   // Populate Tj and the permutation array
265   PetscCall(PetscCalloc1(n, &offset)); // offset in each T row to fill in its column indices
266   for (PetscInt i = 0; i < m; i++) {
267     for (PetscInt j = Ai[i]; j < Ai[i + 1]; j++) { // A's (i,j) is T's (j,i)
268       PetscInt r    = Aj[j];                       // row r of T
269       PetscInt disp = Ti[r] + offset[r];
270 
271       Tj[disp]   = i; // col i of T
272       perm[disp] = j;
273       offset[r]++;
274     }
275   }
276   PetscCall(PetscFree(offset));
277 
278   // Sort each row of T, along with the permutation array
279   for (PetscInt i = 0; i < n; i++) PetscCall(PetscSortIntWithArray(Ti[i + 1] - Ti[i], Tj + Ti[i], perm + Ti[i]));
280 
281   // Output perm and T on device
282   auto Ti_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), Ti_h);
283   auto Tj_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), Tj_h);
284   PetscCallCXX(T_d = KokkosCsrMatrix("csrmatT", n, m, nz, MatScalarKokkosView("Ta", nz), Ti_d, Tj_d));
285   PetscCallCXX(perm_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), perm_h));
286   PetscFunctionReturn(PETSC_SUCCESS);
287 }
288 
289 // Generate the transpose on device and cache it internally
290 // Note: KK transpose_matrix() does not have support symbolic/numeric transpose, so we do it on our own
291 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGenerateTranspose_Private(Mat A, KokkosCsrMatrix *csrmatT)
292 {
293   Mat_SeqAIJ       *aseq = static_cast<Mat_SeqAIJ *>(A->data);
294   Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
295   PetscInt          nz = aseq->nz, m = A->rmap->N, n = A->cmap->n;
296   KokkosCsrMatrix  &T = akok->csrmatT;
297 
298   PetscFunctionBegin;
299   PetscCheck(akok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
300   PetscCallCXX(akok->a_dual.sync_device()); // Sync A's values since we are going to access them on device
301 
302   const auto &Aa = akok->a_dual.view_device();
303 
304   if (A->symmetric == PETSC_BOOL3_TRUE) {
305     *csrmatT = akok->csrmat;
306   } else {
307     // See if we already have a cached transpose and its value is up to date
308     if (T.numRows() == n && T.numCols() == m) {  // this indicates csrmatT had been generated before, otherwise T has 0 rows/cols after construction
309       if (!akok->transpose_updated) {            // if the value is out of date, update the cached version
310         const auto &perm = akok->transpose_perm; // get the permutation array
311         auto       &Ta   = T.values;
312 
313         PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { Ta(i) = Aa(perm(i)); }));
314       }
315     } else { // Generate T of size n x m for the first time
316       MatRowMapKokkosView perm;
317 
318       PetscCall(MatSeqAIJKokkosGenerateTransposeStructure(A, perm, T));
319       akok->transpose_perm = perm; // cache the perm in this matrix for reuse
320       PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { T.values(i) = Aa(perm(i)); }));
321     }
322     akok->transpose_updated = PETSC_TRUE;
323     *csrmatT                = akok->csrmatT;
324   }
325   PetscFunctionReturn(PETSC_SUCCESS);
326 }
327 
328 // Generate the Hermitian on device and cache it internally
329 static PetscErrorCode MatSeqAIJKokkosGenerateHermitian_Private(Mat A, KokkosCsrMatrix *csrmatH)
330 {
331   Mat_SeqAIJ       *aseq = static_cast<Mat_SeqAIJ *>(A->data);
332   Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
333   PetscInt          nz = aseq->nz, m = A->rmap->N, n = A->cmap->n;
334   KokkosCsrMatrix  &T = akok->csrmatH;
335 
336   PetscFunctionBegin;
337   PetscCheck(akok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
338   PetscCallCXX(akok->a_dual.sync_device()); // Sync A's values since we are going to access them on device
339 
340   const auto &Aa = akok->a_dual.view_device();
341 
342   if (A->hermitian == PETSC_BOOL3_TRUE) {
343     *csrmatH = akok->csrmat;
344   } else {
345     // See if we already have a cached hermitian and its value is up to date
346     if (T.numRows() == n && T.numCols() == m) {  // this indicates csrmatT had been generated before, otherwise T has 0 rows/cols after construction
347       if (!akok->hermitian_updated) {            // if the value is out of date, update the cached version
348         const auto &perm = akok->transpose_perm; // get the permutation array
349         auto       &Ta   = T.values;
350 
351         PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { Ta(i) = PetscConj(Aa(perm(i))); }));
352       }
353     } else { // Generate T of size n x m for the first time
354       MatRowMapKokkosView perm;
355 
356       PetscCall(MatSeqAIJKokkosGenerateTransposeStructure(A, perm, T));
357       akok->transpose_perm = perm; // cache the perm in this matrix for reuse
358       PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { T.values(i) = PetscConj(Aa(perm(i))); }));
359     }
360     akok->hermitian_updated = PETSC_TRUE;
361     *csrmatH                = akok->csrmatH;
362   }
363   PetscFunctionReturn(PETSC_SUCCESS);
364 }
365 
366 /* y = A x */
367 static PetscErrorCode MatMult_SeqAIJKokkos(Mat A, Vec xx, Vec yy)
368 {
369   Mat_SeqAIJKokkos          *aijkok;
370   ConstPetscScalarKokkosView xv;
371   PetscScalarKokkosView      yv;
372 
373   PetscFunctionBegin;
374   PetscCall(PetscLogGpuTimeBegin());
375   PetscCall(MatSeqAIJKokkosSyncDevice(A));
376   PetscCall(VecGetKokkosView(xx, &xv));
377   PetscCall(VecGetKokkosViewWrite(yy, &yv));
378   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
379   PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), "N", 1.0 /*alpha*/, aijkok->csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A x + beta y */
380   PetscCall(VecRestoreKokkosView(xx, &xv));
381   PetscCall(VecRestoreKokkosViewWrite(yy, &yv));
382   /* 2.0*nnz - numRows seems more accurate here but assumes there are no zero-rows. So a little sloppy here. */
383   PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz()));
384   PetscCall(PetscLogGpuTimeEnd());
385   PetscFunctionReturn(PETSC_SUCCESS);
386 }
387 
388 /* y = A^T x */
389 static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy)
390 {
391   Mat_SeqAIJKokkos          *aijkok;
392   const char                *mode;
393   ConstPetscScalarKokkosView xv;
394   PetscScalarKokkosView      yv;
395   KokkosCsrMatrix            csrmat;
396 
397   PetscFunctionBegin;
398   PetscCall(PetscLogGpuTimeBegin());
399   PetscCall(MatSeqAIJKokkosSyncDevice(A));
400   PetscCall(VecGetKokkosView(xx, &xv));
401   PetscCall(VecGetKokkosViewWrite(yy, &yv));
402   if (A->form_explicit_transpose) {
403     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat));
404     mode = "N";
405   } else {
406     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
407     csrmat = aijkok->csrmat;
408     mode   = "T";
409   }
410   PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^T x + beta y */
411   PetscCall(VecRestoreKokkosView(xx, &xv));
412   PetscCall(VecRestoreKokkosViewWrite(yy, &yv));
413   PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
414   PetscCall(PetscLogGpuTimeEnd());
415   PetscFunctionReturn(PETSC_SUCCESS);
416 }
417 
418 /* y = A^H x */
419 static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy)
420 {
421   Mat_SeqAIJKokkos          *aijkok;
422   const char                *mode;
423   ConstPetscScalarKokkosView xv;
424   PetscScalarKokkosView      yv;
425   KokkosCsrMatrix            csrmat;
426 
427   PetscFunctionBegin;
428   PetscCall(PetscLogGpuTimeBegin());
429   PetscCall(MatSeqAIJKokkosSyncDevice(A));
430   PetscCall(VecGetKokkosView(xx, &xv));
431   PetscCall(VecGetKokkosViewWrite(yy, &yv));
432   if (A->form_explicit_transpose) {
433     PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat));
434     mode = "N";
435   } else {
436     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
437     csrmat = aijkok->csrmat;
438     mode   = "C";
439   }
440   PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^H x + beta y */
441   PetscCall(VecRestoreKokkosView(xx, &xv));
442   PetscCall(VecRestoreKokkosViewWrite(yy, &yv));
443   PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
444   PetscCall(PetscLogGpuTimeEnd());
445   PetscFunctionReturn(PETSC_SUCCESS);
446 }
447 
448 /* z = A x + y */
449 static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz)
450 {
451   Mat_SeqAIJKokkos          *aijkok;
452   ConstPetscScalarKokkosView xv;
453   PetscScalarKokkosView      zv;
454 
455   PetscFunctionBegin;
456   PetscCall(PetscLogGpuTimeBegin());
457   PetscCall(MatSeqAIJKokkosSyncDevice(A));
458   if (zz != yy) PetscCall(VecCopy(yy, zz)); // depending on yy's sync flags, zz might get its latest data on host
459   PetscCall(VecGetKokkosView(xx, &xv));
460   PetscCall(VecGetKokkosView(zz, &zv)); // do after VecCopy(yy, zz) to get the latest data on device
461   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
462   PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), "N", 1.0 /*alpha*/, aijkok->csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A x + beta z */
463   PetscCall(VecRestoreKokkosView(xx, &xv));
464   PetscCall(VecRestoreKokkosView(zz, &zv));
465   PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz()));
466   PetscCall(PetscLogGpuTimeEnd());
467   PetscFunctionReturn(PETSC_SUCCESS);
468 }
469 
470 /* z = A^T x + y */
471 static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz)
472 {
473   Mat_SeqAIJKokkos          *aijkok;
474   const char                *mode;
475   ConstPetscScalarKokkosView xv;
476   PetscScalarKokkosView      zv;
477   KokkosCsrMatrix            csrmat;
478 
479   PetscFunctionBegin;
480   PetscCall(PetscLogGpuTimeBegin());
481   PetscCall(MatSeqAIJKokkosSyncDevice(A));
482   if (zz != yy) PetscCall(VecCopy(yy, zz));
483   PetscCall(VecGetKokkosView(xx, &xv));
484   PetscCall(VecGetKokkosView(zz, &zv));
485   if (A->form_explicit_transpose) {
486     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat));
487     mode = "N";
488   } else {
489     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
490     csrmat = aijkok->csrmat;
491     mode   = "T";
492   }
493   PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^T x + beta z */
494   PetscCall(VecRestoreKokkosView(xx, &xv));
495   PetscCall(VecRestoreKokkosView(zz, &zv));
496   PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
497   PetscCall(PetscLogGpuTimeEnd());
498   PetscFunctionReturn(PETSC_SUCCESS);
499 }
500 
501 /* z = A^H x + y */
502 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz)
503 {
504   Mat_SeqAIJKokkos          *aijkok;
505   const char                *mode;
506   ConstPetscScalarKokkosView xv;
507   PetscScalarKokkosView      zv;
508   KokkosCsrMatrix            csrmat;
509 
510   PetscFunctionBegin;
511   PetscCall(PetscLogGpuTimeBegin());
512   PetscCall(MatSeqAIJKokkosSyncDevice(A));
513   if (zz != yy) PetscCall(VecCopy(yy, zz));
514   PetscCall(VecGetKokkosView(xx, &xv));
515   PetscCall(VecGetKokkosView(zz, &zv));
516   if (A->form_explicit_transpose) {
517     PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat));
518     mode = "N";
519   } else {
520     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
521     csrmat = aijkok->csrmat;
522     mode   = "C";
523   }
524   PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^H x + beta z */
525   PetscCall(VecRestoreKokkosView(xx, &xv));
526   PetscCall(VecRestoreKokkosView(zz, &zv));
527   PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
528   PetscCall(PetscLogGpuTimeEnd());
529   PetscFunctionReturn(PETSC_SUCCESS);
530 }
531 
532 static PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A, MatOption op, PetscBool flg)
533 {
534   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
535 
536   PetscFunctionBegin;
537   switch (op) {
538   case MAT_FORM_EXPLICIT_TRANSPOSE:
539     /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
540     if (A->form_explicit_transpose && !flg && aijkok) PetscCall(aijkok->DestroyMatTranspose());
541     A->form_explicit_transpose = flg;
542     break;
543   default:
544     PetscCall(MatSetOption_SeqAIJ(A, op, flg));
545     break;
546   }
547   PetscFunctionReturn(PETSC_SUCCESS);
548 }
549 
550 /* Depending on reuse, either build a new mat, or use the existing mat */
551 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat *newmat)
552 {
553   Mat_SeqAIJ *aseq;
554 
555   PetscFunctionBegin;
556   PetscCall(PetscKokkosInitializeCheck());
557   if (reuse == MAT_INITIAL_MATRIX) {                      /* Build a brand new mat */
558     PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat));  /* the returned newmat is a SeqAIJKokkos */
559   } else if (reuse == MAT_REUSE_MATRIX) {                 /* Reuse the mat created before */
560     PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN)); /* newmat is already a SeqAIJKokkos */
561   } else if (reuse == MAT_INPLACE_MATRIX) {               /* newmat is A */
562     PetscCheck(A == *newmat, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "A != *newmat with MAT_INPLACE_MATRIX");
563     PetscCall(PetscFree(A->defaultvectype));
564     PetscCall(PetscStrallocpy(VECKOKKOS, &A->defaultvectype)); /* Allocate and copy the string */
565     PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATSEQAIJKOKKOS));
566     PetscCall(MatSetOps_SeqAIJKokkos(A));
567     aseq = static_cast<Mat_SeqAIJ *>(A->data);
568     if (A->assembled) { /* Copy i, j (but not values) to device for an assembled matrix if not yet */
569       PetscCheck(!A->spptr, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Expect NULL (Mat_SeqAIJKokkos*)A->spptr");
570       A->spptr = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aseq, A->nonzerostate, PETSC_FALSE);
571     }
572   }
573   PetscFunctionReturn(PETSC_SUCCESS);
574 }
575 
576 /* MatDuplicate always creates a new matrix. MatDuplicate can be called either on an assembled matrix or
577    an unassembled matrix, even though MAT_COPY_VALUES is not allowed for unassembled matrix.
578  */
579 static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A, MatDuplicateOption dupOption, Mat *B)
580 {
581   Mat_SeqAIJ       *bseq;
582   Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr), *bkok;
583   Mat               mat;
584 
585   PetscFunctionBegin;
586   /* Do not copy values on host as A's latest values might be on device. We don't want to do sync blindly */
587   PetscCall(MatDuplicate_SeqAIJ(A, MAT_DO_NOT_COPY_VALUES, B));
588   mat = *B;
589   if (A->assembled) {
590     bseq = static_cast<Mat_SeqAIJ *>(mat->data);
591     bkok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, bseq, mat->nonzerostate, PETSC_FALSE);
592     bkok->a_dual.clear_sync_state(); /* Clear B's sync state as it will be decided below */
593     /* Now copy values to B if needed */
594     if (dupOption == MAT_COPY_VALUES) {
595       if (akok->a_dual.need_sync_device()) {
596         Kokkos::deep_copy(bkok->a_dual.view_host(), akok->a_dual.view_host());
597         bkok->a_dual.modify_host();
598       } else { /* If device has the latest data, we only copy data on device */
599         Kokkos::deep_copy(bkok->a_dual.view_device(), akok->a_dual.view_device());
600         bkok->a_dual.modify_device();
601       }
602     } else { /* MAT_DO_NOT_COPY_VALUES or MAT_SHARE_NONZERO_PATTERN. B's values should be zeroed */
603       /* B's values on host should be already zeroed by MatDuplicate_SeqAIJ() */
604       bkok->a_dual.modify_host();
605     }
606     mat->spptr = bkok;
607   }
608 
609   PetscCall(PetscFree(mat->defaultvectype));
610   PetscCall(PetscStrallocpy(VECKOKKOS, &mat->defaultvectype)); /* Allocate and copy the string */
611   PetscCall(PetscObjectChangeTypeName((PetscObject)mat, MATSEQAIJKOKKOS));
612   PetscCall(MatSetOps_SeqAIJKokkos(mat));
613   PetscFunctionReturn(PETSC_SUCCESS);
614 }
615 
616 static PetscErrorCode MatTranspose_SeqAIJKokkos(Mat A, MatReuse reuse, Mat *B)
617 {
618   Mat               At;
619   KokkosCsrMatrix   internT;
620   Mat_SeqAIJKokkos *atkok, *bkok;
621 
622   PetscFunctionBegin;
623   if (reuse == MAT_REUSE_MATRIX) PetscCall(MatTransposeCheckNonzeroState_Private(A, *B));
624   PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &internT)); /* Generate a transpose internally */
625   if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_INPLACE_MATRIX) {
626     /* Deep copy internT, as we want to isolate the internal transpose */
627     PetscCallCXX(atkok = new Mat_SeqAIJKokkos(KokkosCsrMatrix("csrmat", internT)));
628     PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PetscObjectComm((PetscObject)A), atkok, &At));
629     if (reuse == MAT_INITIAL_MATRIX) *B = At;
630     else PetscCall(MatHeaderReplace(A, &At)); /* Replace A with At inplace */
631   } else {                                    /* MAT_REUSE_MATRIX, just need to copy values to B on device */
632     if ((*B)->assembled) {
633       bkok = static_cast<Mat_SeqAIJKokkos *>((*B)->spptr);
634       PetscCallCXX(Kokkos::deep_copy(bkok->a_dual.view_device(), internT.values));
635       PetscCall(MatSeqAIJKokkosModifyDevice(*B));
636     } else if ((*B)->preallocated) { /* It is ok for B to be only preallocated, as needed in MatTranspose_MPIAIJ */
637       Mat_SeqAIJ             *bseq = static_cast<Mat_SeqAIJ *>((*B)->data);
638       MatScalarKokkosViewHost a_h(bseq->a, internT.nnz()); /* bseq->nz = 0 if unassembled */
639       MatColIdxKokkosViewHost j_h(bseq->j, internT.nnz());
640       PetscCallCXX(Kokkos::deep_copy(a_h, internT.values));
641       PetscCallCXX(Kokkos::deep_copy(j_h, internT.graph.entries));
642     } else SETERRQ(PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "B must be assembled or preallocated");
643   }
644   PetscFunctionReturn(PETSC_SUCCESS);
645 }
646 
647 static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A)
648 {
649   Mat_SeqAIJKokkos *aijkok;
650 
651   PetscFunctionBegin;
652   if (A->factortype == MAT_FACTOR_NONE) {
653     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
654     delete aijkok;
655   } else {
656     delete static_cast<Mat_SeqAIJKokkosTriFactors *>(A->spptr);
657   }
658   A->spptr = NULL;
659   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatFactorGetSolverType_C", NULL));
660   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
661   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
662 #if defined(PETSC_HAVE_HYPRE)
663   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijkokkos_hypre_C", NULL));
664 #endif
665   PetscCall(MatDestroy_SeqAIJ(A));
666   PetscFunctionReturn(PETSC_SUCCESS);
667 }
668 
669 /*MC
670    MATSEQAIJKOKKOS - MATAIJKOKKOS = "(seq)aijkokkos" - A matrix type to be used for sparse matrices with Kokkos
671 
672    A matrix type using Kokkos-Kernels CrsMatrix type for portability across different device types
673 
674    Options Database Key:
675 .  -mat_type aijkokkos - sets the matrix type to `MATSEQAIJKOKKOS` during a call to `MatSetFromOptions()`
676 
677   Level: beginner
678 
679 .seealso: [](ch_matrices), `Mat`, `MatCreateSeqAIJKokkos()`, `MATMPIAIJKOKKOS`
680 M*/
681 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A)
682 {
683   PetscFunctionBegin;
684   PetscCall(PetscKokkosInitializeCheck());
685   PetscCall(MatCreate_SeqAIJ(A));
686   PetscCall(MatConvert_SeqAIJ_SeqAIJKokkos(A, MATSEQAIJKOKKOS, MAT_INPLACE_MATRIX, &A));
687   PetscFunctionReturn(PETSC_SUCCESS);
688 }
689 
690 /* Merge A, B into a matrix C. A is put before B. C's size would be A->rmap->n by (A->cmap->n + B->cmap->n) */
691 PetscErrorCode MatSeqAIJKokkosMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C)
692 {
693   Mat_SeqAIJ         *a, *b;
694   Mat_SeqAIJKokkos   *akok, *bkok, *ckok;
695   MatScalarKokkosView aa, ba, ca;
696   MatRowMapKokkosView ai, bi, ci;
697   MatColIdxKokkosView aj, bj, cj;
698   PetscInt            m, n, nnz, aN;
699 
700   PetscFunctionBegin;
701   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
702   PetscValidHeaderSpecific(B, MAT_CLASSID, 2);
703   PetscAssertPointer(C, 4);
704   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
705   PetscCheckTypeName(B, MATSEQAIJKOKKOS);
706   PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n);
707   PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported");
708 
709   PetscCall(MatSeqAIJKokkosSyncDevice(A));
710   PetscCall(MatSeqAIJKokkosSyncDevice(B));
711   a    = static_cast<Mat_SeqAIJ *>(A->data);
712   b    = static_cast<Mat_SeqAIJ *>(B->data);
713   akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
714   bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
715   aa   = akok->a_dual.view_device();
716   ai   = akok->i_dual.view_device();
717   ba   = bkok->a_dual.view_device();
718   bi   = bkok->i_dual.view_device();
719   m    = A->rmap->n; /* M, N and nnz of C */
720   n    = A->cmap->n + B->cmap->n;
721   nnz  = a->nz + b->nz;
722   aN   = A->cmap->n; /* N of A */
723   if (reuse == MAT_INITIAL_MATRIX) {
724     aj           = akok->j_dual.view_device();
725     bj           = bkok->j_dual.view_device();
726     auto ca_dual = MatScalarKokkosDualView("a", aa.extent(0) + ba.extent(0));
727     auto ci_dual = MatRowMapKokkosDualView("i", ai.extent(0));
728     auto cj_dual = MatColIdxKokkosDualView("j", aj.extent(0) + bj.extent(0));
729     ca           = ca_dual.view_device();
730     ci           = ci_dual.view_device();
731     cj           = cj_dual.view_device();
732 
733     /* Concatenate A and B in parallel using Kokkos hierarchical parallelism */
734     Kokkos::parallel_for(
735       Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
736         PetscInt i       = t.league_rank(); /* row i */
737         PetscInt coffset = ai(i) + bi(i), alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i);
738 
739         Kokkos::single(Kokkos::PerTeam(t), [=]() { /* this side effect only happens once per whole team */
740                                                    ci(i) = coffset;
741                                                    if (i == m - 1) ci(m) = ai(m) + bi(m);
742         });
743 
744         Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) {
745           if (k < alen) {
746             ca(coffset + k) = aa(ai(i) + k);
747             cj(coffset + k) = aj(ai(i) + k);
748           } else {
749             ca(coffset + k) = ba(bi(i) + k - alen);
750             cj(coffset + k) = bj(bi(i) + k - alen) + aN; /* Entries in B get new column indices in C */
751           }
752         });
753       });
754     ca_dual.modify_device();
755     ci_dual.modify_device();
756     cj_dual.modify_device();
757     PetscCallCXX(ckok = new Mat_SeqAIJKokkos(m, n, nnz, ci_dual, cj_dual, ca_dual));
758     PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, ckok, C));
759   } else if (reuse == MAT_REUSE_MATRIX) {
760     PetscValidHeaderSpecific(*C, MAT_CLASSID, 4);
761     PetscCheckTypeName(*C, MATSEQAIJKOKKOS);
762     ckok = static_cast<Mat_SeqAIJKokkos *>((*C)->spptr);
763     ca   = ckok->a_dual.view_device();
764     ci   = ckok->i_dual.view_device();
765 
766     Kokkos::parallel_for(
767       Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
768         PetscInt i    = t.league_rank(); /* row i */
769         PetscInt alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i);
770         Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) {
771           if (k < alen) ca(ci(i) + k) = aa(ai(i) + k);
772           else ca(ci(i) + k) = ba(bi(i) + k - alen);
773         });
774       });
775     PetscCall(MatSeqAIJKokkosModifyDevice(*C));
776   }
777   PetscFunctionReturn(PETSC_SUCCESS);
778 }
779 
780 static PetscErrorCode MatProductDataDestroy_SeqAIJKokkos(void *pdata)
781 {
782   PetscFunctionBegin;
783   delete static_cast<MatProductData_SeqAIJKokkos *>(pdata);
784   PetscFunctionReturn(PETSC_SUCCESS);
785 }
786 
787 static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C)
788 {
789   Mat_Product                 *product = C->product;
790   Mat                          A, B;
791   bool                         transA, transB; /* use bool, since KK needs this type */
792   Mat_SeqAIJKokkos            *akok, *bkok, *ckok;
793   Mat_SeqAIJ                  *c;
794   MatProductData_SeqAIJKokkos *pdata;
795   KokkosCsrMatrix              csrmatA, csrmatB;
796 
797   PetscFunctionBegin;
798   MatCheckProduct(C, 1);
799   PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Product data empty");
800   pdata = static_cast<MatProductData_SeqAIJKokkos *>(C->product->data);
801 
802   // See if numeric has already been done in symbolic (e.g., user calls MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C)).
803   // If yes, skip the numeric, but reset the flag so that next time when user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C),
804   // we still do numeric.
805   if (pdata->reusesym) { // numeric reuses results from symbolic
806     pdata->reusesym = PETSC_FALSE;
807     PetscFunctionReturn(PETSC_SUCCESS);
808   }
809 
810   switch (product->type) {
811   case MATPRODUCT_AB:
812     transA = false;
813     transB = false;
814     break;
815   case MATPRODUCT_AtB:
816     transA = true;
817     transB = false;
818     break;
819   case MATPRODUCT_ABt:
820     transA = false;
821     transB = true;
822     break;
823   default:
824     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]);
825   }
826 
827   A = product->A;
828   B = product->B;
829   PetscCall(MatSeqAIJKokkosSyncDevice(A));
830   PetscCall(MatSeqAIJKokkosSyncDevice(B));
831   akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
832   bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
833   ckok = static_cast<Mat_SeqAIJKokkos *>(C->spptr);
834 
835   PetscCheck(ckok, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Device data structure spptr is empty");
836 
837   csrmatA = akok->csrmat;
838   csrmatB = bkok->csrmat;
839 
840   /* TODO: Once KK spgemm implements transpose, we can get rid of the explicit transpose here */
841   if (transA) {
842     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA));
843     transA = false;
844   }
845 
846   if (transB) {
847     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB));
848     transB = false;
849   }
850   PetscCall(PetscLogGpuTimeBegin());
851   PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, csrmatA, transA, csrmatB, transB, ckok->csrmat));
852 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(4, 0, 0)
853   auto spgemmHandle = pdata->kh.get_spgemm_handle();
854   if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(ckok->csrmat)); /* without sort, mat_tests-ex62_14_seqaijkokkos fails */
855 #endif
856 
857   PetscCall(PetscLogGpuTimeEnd());
858   PetscCall(MatSeqAIJKokkosModifyDevice(C));
859   /* shorter version of MatAssemblyEnd_SeqAIJ */
860   c = (Mat_SeqAIJ *)C->data;
861   PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz));
862   PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n"));
863   PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax));
864   c->reallocs         = 0;
865   C->info.mallocs     = 0;
866   C->info.nz_unneeded = 0;
867   C->assembled = C->was_assembled = PETSC_TRUE;
868   C->num_ass++;
869   PetscFunctionReturn(PETSC_SUCCESS);
870 }
871 
872 static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C)
873 {
874   Mat_Product                 *product = C->product;
875   MatProductType               ptype;
876   Mat                          A, B;
877   bool                         transA, transB;
878   Mat_SeqAIJKokkos            *akok, *bkok, *ckok;
879   MatProductData_SeqAIJKokkos *pdata;
880   MPI_Comm                     comm;
881   KokkosCsrMatrix              csrmatA, csrmatB, csrmatC;
882 
883   PetscFunctionBegin;
884   MatCheckProduct(C, 1);
885   PetscCall(PetscObjectGetComm((PetscObject)C, &comm));
886   PetscCheck(!product->data, comm, PETSC_ERR_PLIB, "Product data not empty");
887   A = product->A;
888   B = product->B;
889   PetscCall(MatSeqAIJKokkosSyncDevice(A));
890   PetscCall(MatSeqAIJKokkosSyncDevice(B));
891   akok    = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
892   bkok    = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
893   csrmatA = akok->csrmat;
894   csrmatB = bkok->csrmat;
895 
896   ptype = product->type;
897   // Take advantage of the symmetry if true
898   if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
899     ptype                                          = MATPRODUCT_AB;
900     product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE;
901   }
902   if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
903     ptype                                          = MATPRODUCT_AB;
904     product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE;
905   }
906 
907   switch (ptype) {
908   case MATPRODUCT_AB:
909     transA = false;
910     transB = false;
911     PetscCall(MatSetBlockSizesFromMats(C, A, B));
912     break;
913   case MATPRODUCT_AtB:
914     transA = true;
915     transB = false;
916     if (A->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, A->cmap->bs));
917     if (B->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->cmap->bs));
918     break;
919   case MATPRODUCT_ABt:
920     transA = false;
921     transB = true;
922     if (A->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, A->rmap->bs));
923     if (B->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->rmap->bs));
924     break;
925   default:
926     SETERRQ(comm, PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]);
927   }
928   PetscCallCXX(product->data = pdata = new MatProductData_SeqAIJKokkos());
929   pdata->reusesym = product->api_user;
930 
931   /* TODO: add command line options to select spgemm algorithms */
932   auto spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_DEFAULT; /* default alg is TPL if enabled, otherwise KK */
933 
934   /* CUDA-10.2's spgemm has bugs. We prefer the SpGEMMreuse APIs introduced in cuda-11.4 */
935 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
936   #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
937   spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_KK;
938   #endif
939 #endif
940   PetscCallCXX(pdata->kh.create_spgemm_handle(spgemm_alg));
941 
942   PetscCall(PetscLogGpuTimeBegin());
943   /* TODO: Get rid of the explicit transpose once KK-spgemm implements the transpose option */
944   if (transA) {
945     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA));
946     transA = false;
947   }
948 
949   if (transB) {
950     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB));
951     transB = false;
952   }
953 
954   PetscCallCXX(KokkosSparse::spgemm_symbolic(pdata->kh, csrmatA, transA, csrmatB, transB, csrmatC));
955   /* spgemm_symbolic() only populates C's rowmap, but not C's column indices.
956     So we have to do a fake spgemm_numeric() here to get csrmatC.j_d setup, before
957     calling new Mat_SeqAIJKokkos().
958     TODO: Remove the fake spgemm_numeric() after KK fixed this problem.
959   */
960   PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, csrmatA, transA, csrmatB, transB, csrmatC));
961 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(4, 0, 0)
962   /* Query if KK outputs a sorted matrix. If not, we need to sort it */
963   auto spgemmHandle = pdata->kh.get_spgemm_handle();
964   if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(csrmatC)); /* sort_option defaults to -1 in KK!*/
965 #endif
966   PetscCall(PetscLogGpuTimeEnd());
967 
968   PetscCallCXX(ckok = new Mat_SeqAIJKokkos(csrmatC));
969   PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(C, ckok));
970   C->product->destroy = MatProductDataDestroy_SeqAIJKokkos;
971   PetscFunctionReturn(PETSC_SUCCESS);
972 }
973 
974 /* handles sparse matrix matrix ops */
975 static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat)
976 {
977   Mat_Product *product = mat->product;
978   PetscBool    Biskok = PETSC_FALSE, Ciskok = PETSC_TRUE;
979 
980   PetscFunctionBegin;
981   MatCheckProduct(mat, 1);
982   PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJKOKKOS, &Biskok));
983   if (product->type == MATPRODUCT_ABC) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJKOKKOS, &Ciskok));
984   if (Biskok && Ciskok) {
985     switch (product->type) {
986     case MATPRODUCT_AB:
987     case MATPRODUCT_AtB:
988     case MATPRODUCT_ABt:
989       mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos;
990       break;
991     case MATPRODUCT_PtAP:
992     case MATPRODUCT_RARt:
993     case MATPRODUCT_ABC:
994       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
995       break;
996     default:
997       break;
998     }
999   } else { /* fallback for AIJ */
1000     PetscCall(MatProductSetFromOptions_SeqAIJ(mat));
1001   }
1002   PetscFunctionReturn(PETSC_SUCCESS);
1003 }
1004 
1005 static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a)
1006 {
1007   Mat_SeqAIJKokkos *aijkok;
1008 
1009   PetscFunctionBegin;
1010   PetscCall(PetscLogGpuTimeBegin());
1011   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1012   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1013   KokkosBlas::scal(PetscGetKokkosExecutionSpace(), aijkok->a_dual.view_device(), a, aijkok->a_dual.view_device());
1014   PetscCall(MatSeqAIJKokkosModifyDevice(A));
1015   PetscCall(PetscLogGpuFlops(aijkok->a_dual.extent(0)));
1016   PetscCall(PetscLogGpuTimeEnd());
1017   PetscFunctionReturn(PETSC_SUCCESS);
1018 }
1019 
1020 // add a to A's diagonal (if A is square) or main diagonal (if A is rectangular)
1021 static PetscErrorCode MatShift_SeqAIJKokkos(Mat A, PetscScalar a)
1022 {
1023   Mat_SeqAIJ *aijseq = static_cast<Mat_SeqAIJ *>(A->data);
1024 
1025   PetscFunctionBegin;
1026   if (A->assembled && aijseq->diagonaldense) { // no missing diagonals
1027     PetscInt n = PetscMin(A->rmap->n, A->cmap->n);
1028 
1029     PetscCall(PetscLogGpuTimeBegin());
1030     PetscCall(MatSeqAIJKokkosSyncDevice(A));
1031     const auto  aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1032     const auto &Aa     = aijkok->a_dual.view_device();
1033     const auto &Adiag  = aijkok->diag_dual.view_device();
1034     PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) { Aa(Adiag(i)) += a; }));
1035     PetscCall(MatSeqAIJKokkosModifyDevice(A));
1036     PetscCall(PetscLogGpuFlops(n));
1037     PetscCall(PetscLogGpuTimeEnd());
1038   } else { // need reassembly, very slow!
1039     PetscCall(MatShift_Basic(A, a));
1040   }
1041   PetscFunctionReturn(PETSC_SUCCESS);
1042 }
1043 
1044 static PetscErrorCode MatDiagonalSet_SeqAIJKokkos(Mat Y, Vec D, InsertMode is)
1045 {
1046   Mat_SeqAIJ *aijseq = static_cast<Mat_SeqAIJ *>(Y->data);
1047 
1048   PetscFunctionBegin;
1049   if (Y->assembled && aijseq->diagonaldense) { // no missing diagonals
1050     ConstPetscScalarKokkosView dv;
1051     PetscInt                   n, nv;
1052 
1053     PetscCall(PetscLogGpuTimeBegin());
1054     PetscCall(MatSeqAIJKokkosSyncDevice(Y));
1055     PetscCall(VecGetKokkosView(D, &dv));
1056     PetscCall(VecGetLocalSize(D, &nv));
1057     n = PetscMin(Y->rmap->n, Y->cmap->n);
1058     PetscCheck(n == nv, PetscObjectComm((PetscObject)Y), PETSC_ERR_ARG_SIZ, "Matrix size and vector size do not match");
1059 
1060     const auto  aijkok = static_cast<Mat_SeqAIJKokkos *>(Y->spptr);
1061     const auto &Aa     = aijkok->a_dual.view_device();
1062     const auto &Adiag  = aijkok->diag_dual.view_device();
1063     PetscCallCXX(Kokkos::parallel_for(
1064       Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) {
1065         if (is == INSERT_VALUES) Aa(Adiag(i)) = dv(i);
1066         else Aa(Adiag(i)) += dv(i);
1067       }));
1068     PetscCall(VecRestoreKokkosView(D, &dv));
1069     PetscCall(MatSeqAIJKokkosModifyDevice(Y));
1070     PetscCall(PetscLogGpuFlops(n));
1071     PetscCall(PetscLogGpuTimeEnd());
1072   } else { // need reassembly, very slow!
1073     PetscCall(MatDiagonalSet_Default(Y, D, is));
1074   }
1075   PetscFunctionReturn(PETSC_SUCCESS);
1076 }
1077 
1078 static PetscErrorCode MatDiagonalScale_SeqAIJKokkos(Mat A, Vec ll, Vec rr)
1079 {
1080   Mat_SeqAIJ                *aijseq = static_cast<Mat_SeqAIJ *>(A->data);
1081   PetscInt                   m = A->rmap->n, n = A->cmap->n, nz = aijseq->nz;
1082   ConstPetscScalarKokkosView lv, rv;
1083 
1084   PetscFunctionBegin;
1085   PetscCall(PetscLogGpuTimeBegin());
1086   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1087   const auto  aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1088   const auto &Aa     = aijkok->a_dual.view_device();
1089   const auto &Ai     = aijkok->i_dual.view_device();
1090   const auto &Aj     = aijkok->j_dual.view_device();
1091   if (ll) {
1092     PetscCall(VecGetLocalSize(ll, &m));
1093     PetscCheck(m == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling vector wrong length");
1094     PetscCall(VecGetKokkosView(ll, &lv));
1095     PetscCallCXX(Kokkos::parallel_for( // for each row
1096       Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
1097         PetscInt i   = t.league_rank(); // row i
1098         PetscInt len = Ai(i + 1) - Ai(i);
1099         // scale entries on the row
1100         Kokkos::parallel_for(Kokkos::TeamThreadRange(t, len), [&](PetscInt j) { Aa(Ai(i) + j) *= lv(i); });
1101       }));
1102     PetscCall(VecRestoreKokkosView(ll, &lv));
1103     PetscCall(PetscLogGpuFlops(nz));
1104   }
1105   if (rr) {
1106     PetscCall(VecGetLocalSize(rr, &n));
1107     PetscCheck(n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Right scaling vector wrong length");
1108     PetscCall(VecGetKokkosView(rr, &rv));
1109     PetscCallCXX(Kokkos::parallel_for( // for each nonzero
1110       Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt k) { Aa(k) *= rv(Aj(k)); }));
1111     PetscCall(VecRestoreKokkosView(rr, &lv));
1112     PetscCall(PetscLogGpuFlops(nz));
1113   }
1114   PetscCall(MatSeqAIJKokkosModifyDevice(A));
1115   PetscCall(PetscLogGpuTimeEnd());
1116   PetscFunctionReturn(PETSC_SUCCESS);
1117 }
1118 
1119 static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A)
1120 {
1121   Mat_SeqAIJKokkos *aijkok;
1122 
1123   PetscFunctionBegin;
1124   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1125   if (aijkok) { /* Only zero the device if data is already there */
1126     KokkosBlas::fill(PetscGetKokkosExecutionSpace(), aijkok->a_dual.view_device(), 0.0);
1127     PetscCall(MatSeqAIJKokkosModifyDevice(A));
1128   } else { /* Might be preallocated but not assembled */
1129     PetscCall(MatZeroEntries_SeqAIJ(A));
1130   }
1131   PetscFunctionReturn(PETSC_SUCCESS);
1132 }
1133 
1134 static PetscErrorCode MatGetDiagonal_SeqAIJKokkos(Mat A, Vec x)
1135 {
1136   Mat_SeqAIJKokkos     *aijkok;
1137   PetscInt              n;
1138   PetscScalarKokkosView xv;
1139 
1140   PetscFunctionBegin;
1141   PetscCall(VecGetLocalSize(x, &n));
1142   PetscCheck(n == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector");
1143   PetscCheck(A->factortype == MAT_FACTOR_NONE, PETSC_COMM_SELF, PETSC_ERR_SUP, "MatGetDiagonal_SeqAIJKokkos not supported on factored matrices");
1144 
1145   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1146   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1147 
1148   const auto &Aa    = aijkok->a_dual.view_device();
1149   const auto &Ai    = aijkok->i_dual.view_device();
1150   const auto &Adiag = aijkok->diag_dual.view_device();
1151 
1152   PetscCall(VecGetKokkosViewWrite(x, &xv));
1153   Kokkos::parallel_for(
1154     Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) {
1155       if (Adiag(i) < Ai(i + 1)) xv(i) = Aa(Adiag(i));
1156       else xv(i) = 0;
1157     });
1158   PetscCall(VecRestoreKokkosViewWrite(x, &xv));
1159   PetscFunctionReturn(PETSC_SUCCESS);
1160 }
1161 
1162 /* Get a Kokkos View from a mat of type MatSeqAIJKokkos */
1163 PetscErrorCode MatSeqAIJGetKokkosView(Mat A, ConstMatScalarKokkosView *kv)
1164 {
1165   Mat_SeqAIJKokkos *aijkok;
1166 
1167   PetscFunctionBegin;
1168   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1169   PetscAssertPointer(kv, 2);
1170   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1171   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1172   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1173   *kv    = aijkok->a_dual.view_device();
1174   PetscFunctionReturn(PETSC_SUCCESS);
1175 }
1176 
1177 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, ConstMatScalarKokkosView *kv)
1178 {
1179   PetscFunctionBegin;
1180   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1181   PetscAssertPointer(kv, 2);
1182   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1183   PetscFunctionReturn(PETSC_SUCCESS);
1184 }
1185 
1186 PetscErrorCode MatSeqAIJGetKokkosView(Mat A, MatScalarKokkosView *kv)
1187 {
1188   Mat_SeqAIJKokkos *aijkok;
1189 
1190   PetscFunctionBegin;
1191   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1192   PetscAssertPointer(kv, 2);
1193   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1194   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1195   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1196   *kv    = aijkok->a_dual.view_device();
1197   PetscFunctionReturn(PETSC_SUCCESS);
1198 }
1199 
1200 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, MatScalarKokkosView *kv)
1201 {
1202   PetscFunctionBegin;
1203   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1204   PetscAssertPointer(kv, 2);
1205   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1206   PetscCall(MatSeqAIJKokkosModifyDevice(A));
1207   PetscFunctionReturn(PETSC_SUCCESS);
1208 }
1209 
1210 PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A, MatScalarKokkosView *kv)
1211 {
1212   Mat_SeqAIJKokkos *aijkok;
1213 
1214   PetscFunctionBegin;
1215   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1216   PetscAssertPointer(kv, 2);
1217   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1218   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1219   *kv    = aijkok->a_dual.view_device();
1220   PetscFunctionReturn(PETSC_SUCCESS);
1221 }
1222 
1223 PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A, MatScalarKokkosView *kv)
1224 {
1225   PetscFunctionBegin;
1226   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1227   PetscAssertPointer(kv, 2);
1228   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1229   PetscCall(MatSeqAIJKokkosModifyDevice(A));
1230   PetscFunctionReturn(PETSC_SUCCESS);
1231 }
1232 
1233 PetscErrorCode MatCreateSeqAIJKokkosWithKokkosViews(MPI_Comm comm, PetscInt m, PetscInt n, Kokkos::View<PetscInt *> &i_d, Kokkos::View<PetscInt *> &j_d, Kokkos::View<PetscScalar *> &a_d, Mat *A)
1234 {
1235   Mat_SeqAIJKokkos *akok;
1236 
1237   PetscFunctionBegin;
1238   auto exec = PetscGetKokkosExecutionSpace();
1239   // Create host copies of the input aij
1240   auto i_h = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), i_d);
1241   auto j_h = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), j_d);
1242   // Don't copy the vals to the host now
1243   auto a_h = Kokkos::create_mirror_view(HostMirrorMemorySpace(), a_d);
1244 
1245   MatScalarKokkosDualView a_dual = MatScalarKokkosDualView(a_d, a_h);
1246   // Note we have modified device data so it will copy lazily
1247   a_dual.modify_device();
1248   MatRowMapKokkosDualView i_dual = MatRowMapKokkosDualView(i_d, i_h);
1249   MatColIdxKokkosDualView j_dual = MatColIdxKokkosDualView(j_d, j_h);
1250 
1251   PetscCallCXX(akok = new Mat_SeqAIJKokkos(m, n, j_dual.extent(0), i_dual, j_dual, a_dual));
1252   PetscCall(MatCreate(comm, A));
1253   PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok));
1254   PetscFunctionReturn(PETSC_SUCCESS);
1255 }
1256 
1257 /* Computes Y += alpha X */
1258 static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y, PetscScalar alpha, Mat X, MatStructure pattern)
1259 {
1260   Mat_SeqAIJ              *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data;
1261   Mat_SeqAIJKokkos        *xkok, *ykok, *zkok;
1262   ConstMatScalarKokkosView Xa;
1263   MatScalarKokkosView      Ya;
1264   auto                     exec = PetscGetKokkosExecutionSpace();
1265 
1266   PetscFunctionBegin;
1267   PetscCheckTypeName(Y, MATSEQAIJKOKKOS);
1268   PetscCheckTypeName(X, MATSEQAIJKOKKOS);
1269   PetscCall(MatSeqAIJKokkosSyncDevice(Y));
1270   PetscCall(MatSeqAIJKokkosSyncDevice(X));
1271   PetscCall(PetscLogGpuTimeBegin());
1272 
1273   if (pattern != SAME_NONZERO_PATTERN && x->nz == y->nz) {
1274     PetscBool e;
1275     PetscCall(PetscArraycmp(x->i, y->i, Y->rmap->n + 1, &e));
1276     if (e) {
1277       PetscCall(PetscArraycmp(x->j, y->j, y->nz, &e));
1278       if (e) pattern = SAME_NONZERO_PATTERN;
1279     }
1280   }
1281 
1282   /* cusparseDcsrgeam2() computes C = alpha A + beta B. If one knew sparsity pattern of C, one can skip
1283     cusparseScsrgeam2_bufferSizeExt() / cusparseXcsrgeam2Nnz(), and directly call cusparseScsrgeam2().
1284     If X is SUBSET_NONZERO_PATTERN of Y, we could take advantage of this cusparse feature. However,
1285     KokkosSparse::spadd(alpha,A,beta,B,C) has symbolic and numeric phases, MatAXPY does not.
1286   */
1287   ykok = static_cast<Mat_SeqAIJKokkos *>(Y->spptr);
1288   xkok = static_cast<Mat_SeqAIJKokkos *>(X->spptr);
1289   Xa   = xkok->a_dual.view_device();
1290   Ya   = ykok->a_dual.view_device();
1291 
1292   if (pattern == SAME_NONZERO_PATTERN) {
1293     KokkosBlas::axpy(exec, alpha, Xa, Ya);
1294     PetscCall(MatSeqAIJKokkosModifyDevice(Y));
1295   } else if (pattern == SUBSET_NONZERO_PATTERN) {
1296     MatRowMapKokkosView Xi = xkok->i_dual.view_device(), Yi = ykok->i_dual.view_device();
1297     MatColIdxKokkosView Xj = xkok->j_dual.view_device(), Yj = ykok->j_dual.view_device();
1298 
1299     Kokkos::parallel_for(
1300       Kokkos::TeamPolicy<>(exec, Y->rmap->n, 1), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
1301         PetscInt i = t.league_rank(); // row i
1302         Kokkos::single(Kokkos::PerTeam(t), [=]() {
1303           // Only one thread works in a team
1304           PetscInt p, q = Yi(i);
1305           for (p = Xi(i); p < Xi(i + 1); p++) {          // For each nonzero on row i of X,
1306             while (Xj(p) != Yj(q) && q < Yi(i + 1)) q++; // find the matching nonzero on row i of Y.
1307             if (Xj(p) == Yj(q)) {                        // Found it
1308               Ya(q) += alpha * Xa(p);
1309               q++;
1310             } else {
1311             // If not found, it indicates the input is wrong (X is not a SUBSET_NONZERO_PATTERN of Y).
1312             // Just insert a NaN at the beginning of row i if it is not empty, to make the result wrong.
1313 #if PETSC_PKG_KOKKOS_VERSION_GE(3, 7, 0)
1314               if (Yi(i) != Yi(i + 1)) Ya(Yi(i)) = Kokkos::ArithTraits<PetscScalar>::nan();
1315 #else
1316               if (Yi(i) != Yi(i + 1)) Ya(Yi(i)) = Kokkos::Experimental::nan("1");
1317 #endif
1318             }
1319           }
1320         });
1321       });
1322     PetscCall(MatSeqAIJKokkosModifyDevice(Y));
1323   } else { // different nonzero patterns
1324     Mat             Z;
1325     KokkosCsrMatrix zcsr;
1326     KernelHandle    kh;
1327     kh.create_spadd_handle(true); // X, Y are sorted
1328     KokkosSparse::spadd_symbolic(&kh, xkok->csrmat, ykok->csrmat, zcsr);
1329     KokkosSparse::spadd_numeric(&kh, alpha, xkok->csrmat, (PetscScalar)1.0, ykok->csrmat, zcsr);
1330     zkok = new Mat_SeqAIJKokkos(zcsr);
1331     PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, zkok, &Z));
1332     PetscCall(MatHeaderReplace(Y, &Z));
1333     kh.destroy_spadd_handle();
1334   }
1335   PetscCall(PetscLogGpuTimeEnd());
1336   PetscCall(PetscLogGpuFlops(xkok->a_dual.extent(0) * 2)); // Because we scaled X and then added it to Y
1337   PetscFunctionReturn(PETSC_SUCCESS);
1338 }
1339 
1340 struct MatCOOStruct_SeqAIJKokkos {
1341   PetscCount           n;
1342   PetscCount           Atot;
1343   PetscInt             nz;
1344   PetscCountKokkosView jmap;
1345   PetscCountKokkosView perm;
1346 
1347   MatCOOStruct_SeqAIJKokkos(const MatCOOStruct_SeqAIJ *coo_h)
1348   {
1349     nz   = coo_h->nz;
1350     n    = coo_h->n;
1351     Atot = coo_h->Atot;
1352     jmap = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), PetscCountKokkosViewHost(coo_h->jmap, nz + 1));
1353     perm = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), PetscCountKokkosViewHost(coo_h->perm, Atot));
1354   }
1355 };
1356 
1357 static PetscErrorCode MatCOOStructDestroy_SeqAIJKokkos(void **data)
1358 {
1359   PetscFunctionBegin;
1360   PetscCallCXX(delete static_cast<MatCOOStruct_SeqAIJKokkos *>(*data));
1361   PetscFunctionReturn(PETSC_SUCCESS);
1362 }
1363 
1364 static PetscErrorCode MatSetPreallocationCOO_SeqAIJKokkos(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
1365 {
1366   Mat_SeqAIJKokkos          *akok;
1367   Mat_SeqAIJ                *aseq;
1368   PetscContainer             container_h;
1369   MatCOOStruct_SeqAIJ       *coo_h;
1370   MatCOOStruct_SeqAIJKokkos *coo_d;
1371 
1372   PetscFunctionBegin;
1373   PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, coo_i, coo_j));
1374   aseq = static_cast<Mat_SeqAIJ *>(mat->data);
1375   akok = static_cast<Mat_SeqAIJKokkos *>(mat->spptr);
1376   delete akok;
1377   mat->spptr = akok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, aseq, mat->nonzerostate + 1, PETSC_FALSE);
1378   PetscCall(MatZeroEntries_SeqAIJKokkos(mat));
1379 
1380   // Copy the COO struct to device
1381   PetscCall(PetscObjectQuery((PetscObject)mat, "__PETSc_MatCOOStruct_Host", (PetscObject *)&container_h));
1382   PetscCall(PetscContainerGetPointer(container_h, (void **)&coo_h));
1383   PetscCallCXX(coo_d = new MatCOOStruct_SeqAIJKokkos(coo_h));
1384 
1385   // Put the COO struct in a container and then attach that to the matrix
1386   PetscCall(PetscObjectContainerCompose((PetscObject)mat, "__PETSc_MatCOOStruct_Device", coo_d, MatCOOStructDestroy_SeqAIJKokkos));
1387   PetscFunctionReturn(PETSC_SUCCESS);
1388 }
1389 
1390 static PetscErrorCode MatSetValuesCOO_SeqAIJKokkos(Mat A, const PetscScalar v[], InsertMode imode)
1391 {
1392   MatScalarKokkosView        Aa;
1393   ConstMatScalarKokkosView   kv;
1394   PetscMemType               memtype;
1395   PetscContainer             container;
1396   MatCOOStruct_SeqAIJKokkos *coo;
1397 
1398   PetscFunctionBegin;
1399   PetscCall(PetscObjectQuery((PetscObject)A, "__PETSc_MatCOOStruct_Device", (PetscObject *)&container));
1400   PetscCall(PetscContainerGetPointer(container, (void **)&coo));
1401 
1402   const auto &n    = coo->n;
1403   const auto &Annz = coo->nz;
1404   const auto &jmap = coo->jmap;
1405   const auto &perm = coo->perm;
1406 
1407   PetscCall(PetscGetMemType(v, &memtype));
1408   if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */
1409     kv = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), ConstMatScalarKokkosViewHost(v, n));
1410   } else {
1411     kv = ConstMatScalarKokkosView(v, n); /* Directly use v[]'s memory */
1412   }
1413 
1414   if (imode == INSERT_VALUES) PetscCall(MatSeqAIJGetKokkosViewWrite(A, &Aa)); /* write matrix values */
1415   else PetscCall(MatSeqAIJGetKokkosView(A, &Aa));                             /* read & write matrix values */
1416 
1417   PetscCall(PetscLogGpuTimeBegin());
1418   Kokkos::parallel_for(
1419     Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, Annz), KOKKOS_LAMBDA(const PetscCount i) {
1420       PetscScalar sum = 0.0;
1421       for (PetscCount k = jmap(i); k < jmap(i + 1); k++) sum += kv(perm(k));
1422       Aa(i) = (imode == INSERT_VALUES ? 0.0 : Aa(i)) + sum;
1423     });
1424   PetscCall(PetscLogGpuTimeEnd());
1425 
1426   if (imode == INSERT_VALUES) PetscCall(MatSeqAIJRestoreKokkosViewWrite(A, &Aa));
1427   else PetscCall(MatSeqAIJRestoreKokkosView(A, &Aa));
1428   PetscFunctionReturn(PETSC_SUCCESS);
1429 }
1430 
1431 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A)
1432 {
1433   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
1434 
1435   PetscFunctionBegin;
1436   A->offloadmask = PETSC_OFFLOAD_KOKKOS; /* We do not really use this flag */
1437   A->boundtocpu  = PETSC_FALSE;
1438 
1439   A->ops->assemblyend               = MatAssemblyEnd_SeqAIJKokkos;
1440   A->ops->destroy                   = MatDestroy_SeqAIJKokkos;
1441   A->ops->duplicate                 = MatDuplicate_SeqAIJKokkos;
1442   A->ops->axpy                      = MatAXPY_SeqAIJKokkos;
1443   A->ops->scale                     = MatScale_SeqAIJKokkos;
1444   A->ops->zeroentries               = MatZeroEntries_SeqAIJKokkos;
1445   A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJKokkos;
1446   A->ops->mult                      = MatMult_SeqAIJKokkos;
1447   A->ops->multadd                   = MatMultAdd_SeqAIJKokkos;
1448   A->ops->multtranspose             = MatMultTranspose_SeqAIJKokkos;
1449   A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJKokkos;
1450   A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJKokkos;
1451   A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos;
1452   A->ops->productnumeric            = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos;
1453   A->ops->transpose                 = MatTranspose_SeqAIJKokkos;
1454   A->ops->setoption                 = MatSetOption_SeqAIJKokkos;
1455   A->ops->getdiagonal               = MatGetDiagonal_SeqAIJKokkos;
1456   A->ops->shift                     = MatShift_SeqAIJKokkos;
1457   A->ops->diagonalset               = MatDiagonalSet_SeqAIJKokkos;
1458   A->ops->diagonalscale             = MatDiagonalScale_SeqAIJKokkos;
1459   a->ops->getarray                  = MatSeqAIJGetArray_SeqAIJKokkos;
1460   a->ops->restorearray              = MatSeqAIJRestoreArray_SeqAIJKokkos;
1461   a->ops->getarrayread              = MatSeqAIJGetArrayRead_SeqAIJKokkos;
1462   a->ops->restorearrayread          = MatSeqAIJRestoreArrayRead_SeqAIJKokkos;
1463   a->ops->getarraywrite             = MatSeqAIJGetArrayWrite_SeqAIJKokkos;
1464   a->ops->restorearraywrite         = MatSeqAIJRestoreArrayWrite_SeqAIJKokkos;
1465   a->ops->getcsrandmemtype          = MatSeqAIJGetCSRAndMemType_SeqAIJKokkos;
1466 
1467   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJKokkos));
1468   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJKokkos));
1469 #if defined(PETSC_HAVE_HYPRE)
1470   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijkokkos_hypre_C", MatConvert_AIJ_HYPRE));
1471 #endif
1472   PetscFunctionReturn(PETSC_SUCCESS);
1473 }
1474 
1475 /*
1476    Extract the (prescribled) diagonal blocks of the matrix and then invert them
1477 
1478   Input Parameters:
1479 +  A       - the MATSEQAIJKOKKOS matrix
1480 .  bs      - block sizes in 'csr' format, i.e., the i-th block has size bs(i+1) - bs(i)
1481 .  bs2     - square of block sizes in 'csr' format, i.e., the i-th block should be stored at offset bs2(i) in diagVal[]
1482 .  blkMap  - map row ids to block ids, i.e., row i belongs to the block blkMap(i)
1483 -  work    - a pre-allocated work buffer (as big as diagVal) for use by this routine
1484 
1485   Output Parameter:
1486 .  diagVal - the (pre-allocated) buffer to store the inverted blocks (each block is stored in column-major order)
1487 */
1488 PETSC_INTERN PetscErrorCode MatInvertVariableBlockDiagonal_SeqAIJKokkos(Mat A, const PetscIntKokkosView &bs, const PetscIntKokkosView &bs2, const PetscIntKokkosView &blkMap, PetscScalarKokkosView &work, PetscScalarKokkosView &diagVal)
1489 {
1490   Mat_SeqAIJKokkos *akok    = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1491   PetscInt          nblocks = bs.extent(0) - 1;
1492 
1493   PetscFunctionBegin;
1494   PetscCall(MatSeqAIJKokkosSyncDevice(A)); // Since we'll access A's value on device
1495 
1496   // Pull out the diagonal blocks of the matrix and then invert the blocks
1497   auto Aa    = akok->a_dual.view_device();
1498   auto Ai    = akok->i_dual.view_device();
1499   auto Aj    = akok->j_dual.view_device();
1500   auto Adiag = akok->diag_dual.view_device();
1501   // TODO: how to tune the team size?
1502 #if defined(KOKKOS_ENABLE_UNIFIED_MEMORY)
1503   auto ts = Kokkos::AUTO();
1504 #else
1505   auto ts = 16; // improved performance 30% over Kokkos::AUTO() with CUDA, but failed with "Kokkos::abort: Requested Team Size is too large!" on CPUs
1506 #endif
1507   PetscCallCXX(Kokkos::parallel_for(
1508     Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), nblocks, ts), KOKKOS_LAMBDA(const KokkosTeamMemberType &teamMember) {
1509       const PetscInt bid    = teamMember.league_rank();                                                   // block id
1510       const PetscInt rstart = bs(bid);                                                                    // this block starts from this row
1511       const PetscInt m      = bs(bid + 1) - bs(bid);                                                      // size of this block
1512       const auto    &B      = Kokkos::View<PetscScalar **, Kokkos::LayoutLeft>(&diagVal(bs2(bid)), m, m); // column-major order
1513       const auto    &W      = PetscScalarKokkosView(&work(bs2(bid)), m * m);
1514 
1515       Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, m), [=](const PetscInt &r) { // r-th row in B
1516         PetscInt i = rstart + r;                                                            // i-th row in A
1517 
1518         if (Ai(i) <= Adiag(i) && Adiag(i) < Ai(i + 1)) { // if the diagonal exists (common case)
1519           PetscInt first = Adiag(i) - r;                 // we start to check nonzeros from here along this row
1520 
1521           for (PetscInt c = 0; c < m; c++) {                   // walk n steps to see what column indices we will meet
1522             if (first + c < Ai(i) || first + c >= Ai(i + 1)) { // this entry (first+c) is out of range of this row, in other words, its value is zero
1523               B(r, c) = 0.0;
1524             } else if (Aj(first + c) == rstart + c) { // this entry is right on the (rstart+c) column
1525               B(r, c) = Aa(first + c);
1526             } else { // this entry does not show up in the CSR
1527               B(r, c) = 0.0;
1528             }
1529           }
1530         } else { // rare case that the diagonal does not exist
1531           const PetscInt begin = Ai(i);
1532           const PetscInt end   = Ai(i + 1);
1533           for (PetscInt c = 0; c < m; c++) B(r, c) = 0.0;
1534           for (PetscInt j = begin; j < end; j++) { // scan the whole row; could use binary search but this is a rare case so we did not.
1535             if (rstart <= Aj(j) && Aj(j) < rstart + m) B(r, Aj(j) - rstart) = Aa(j);
1536             else if (Aj(j) >= rstart + m) break;
1537           }
1538         }
1539       });
1540 
1541       // LU-decompose B (w/o pivoting) and then invert B
1542       KokkosBatched::TeamLU<KokkosTeamMemberType, KokkosBatched::Algo::LU::Unblocked>::invoke(teamMember, B, 0.0);
1543       KokkosBatched::TeamInverseLU<KokkosTeamMemberType, KokkosBatched::Algo::InverseLU::Unblocked>::invoke(teamMember, B, W);
1544     }));
1545   // PetscLogGpuFlops() is done in the caller PCSetUp_VPBJacobi_Kokkos as we don't want to compute the flops in kernels
1546   PetscFunctionReturn(PETSC_SUCCESS);
1547 }
1548 
1549 PETSC_INTERN PetscErrorCode MatSetSeqAIJKokkosWithCSRMatrix(Mat A, Mat_SeqAIJKokkos *akok)
1550 {
1551   Mat_SeqAIJ *aseq;
1552   PetscInt    i, m, n;
1553   auto        exec = PetscGetKokkosExecutionSpace();
1554 
1555   PetscFunctionBegin;
1556   PetscCheck(!A->spptr, PETSC_COMM_SELF, PETSC_ERR_PLIB, "A->spptr is supposed to be empty");
1557 
1558   m = akok->nrows();
1559   n = akok->ncols();
1560   PetscCall(MatSetSizes(A, m, n, m, n));
1561   PetscCall(MatSetType(A, MATSEQAIJKOKKOS));
1562 
1563   /* Set up data structures of A as a MATSEQAIJ */
1564   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(A, MAT_SKIP_ALLOCATION, NULL));
1565   aseq = (Mat_SeqAIJ *)A->data;
1566 
1567   PetscCallCXX(akok->i_dual.sync_host(exec)); /* We always need sync'ed i, j on host */
1568   PetscCallCXX(akok->j_dual.sync_host(exec));
1569   PetscCallCXX(exec.fence());
1570 
1571   aseq->i       = akok->i_host_data();
1572   aseq->j       = akok->j_host_data();
1573   aseq->a       = akok->a_host_data();
1574   aseq->nonew   = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/
1575   aseq->free_a  = PETSC_FALSE;
1576   aseq->free_ij = PETSC_FALSE;
1577   aseq->nz      = akok->nnz();
1578   aseq->maxnz   = aseq->nz;
1579 
1580   PetscCall(PetscMalloc1(m, &aseq->imax));
1581   PetscCall(PetscMalloc1(m, &aseq->ilen));
1582   for (i = 0; i < m; i++) aseq->ilen[i] = aseq->imax[i] = aseq->i[i + 1] - aseq->i[i];
1583 
1584   /* It is critical to set the nonzerostate, as we use it to check if sparsity pattern (hence data) has changed on host in MatAssemblyEnd */
1585   akok->nonzerostate = A->nonzerostate;
1586   A->spptr           = akok; /* Set A->spptr before MatAssembly so that A->spptr won't be allocated again there */
1587   PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY));
1588   PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY));
1589   PetscFunctionReturn(PETSC_SUCCESS);
1590 }
1591 
1592 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGetKokkosCsrMatrix(Mat A, KokkosCsrMatrix *csr)
1593 {
1594   PetscFunctionBegin;
1595   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1596   *csr = static_cast<Mat_SeqAIJKokkos *>(A->spptr)->csrmat;
1597   PetscFunctionReturn(PETSC_SUCCESS);
1598 }
1599 
1600 PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithKokkosCsrMatrix(MPI_Comm comm, KokkosCsrMatrix csr, Mat *A)
1601 {
1602   Mat_SeqAIJKokkos *akok;
1603 
1604   PetscFunctionBegin;
1605   PetscCallCXX(akok = new Mat_SeqAIJKokkos(csr));
1606   PetscCall(MatCreate(comm, A));
1607   PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok));
1608   PetscFunctionReturn(PETSC_SUCCESS);
1609 }
1610 
1611 /* Crete a SEQAIJKOKKOS matrix with a Mat_SeqAIJKokkos data structure
1612 
1613    Note we have names like MatSeqAIJSetPreallocationCSR, so I use capitalized CSR
1614  */
1615 PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithCSRMatrix(MPI_Comm comm, Mat_SeqAIJKokkos *akok, Mat *A)
1616 {
1617   PetscFunctionBegin;
1618   PetscCall(MatCreate(comm, A));
1619   PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok));
1620   PetscFunctionReturn(PETSC_SUCCESS);
1621 }
1622 
1623 /*@C
1624   MatCreateSeqAIJKokkos - Creates a sparse matrix in `MATSEQAIJKOKKOS` (compressed row) format
1625   (the default parallel PETSc format). This matrix will ultimately be handled by
1626   Kokkos for calculations.
1627 
1628   Collective
1629 
1630   Input Parameters:
1631 + comm - MPI communicator, set to `PETSC_COMM_SELF`
1632 . m    - number of rows
1633 . n    - number of columns
1634 . nz   - number of nonzeros per row (same for all rows), ignored if `nnz` is provided
1635 - nnz  - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL`
1636 
1637   Output Parameter:
1638 . A - the matrix
1639 
1640   Level: intermediate
1641 
1642   Notes:
1643   It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
1644   MatXXXXSetPreallocation() paradgm instead of this routine directly.
1645   [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]
1646 
1647   The AIJ format, also called
1648   compressed row storage, is fully compatible with standard Fortran
1649   storage.  That is, the stored row and column indices can begin at
1650   either one (as in Fortran) or zero.
1651 
1652   Specify the preallocated storage with either `nz` or `nnz` (not both).
1653   Set `nz` = `PETSC_DEFAULT` and `nnz` = `NULL` for PETSc to control dynamic memory
1654   allocation.
1655 
1656 .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`
1657 @*/
1658 PetscErrorCode MatCreateSeqAIJKokkos(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A)
1659 {
1660   PetscFunctionBegin;
1661   PetscCall(PetscKokkosInitializeCheck());
1662   PetscCall(MatCreate(comm, A));
1663   PetscCall(MatSetSizes(*A, m, n, m, n));
1664   PetscCall(MatSetType(*A, MATSEQAIJKOKKOS));
1665   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz));
1666   PetscFunctionReturn(PETSC_SUCCESS);
1667 }
1668 
1669 // After matrix numeric factorization, there are still steps to do before triangular solve can be called.
1670 // For example, for transpose solve, we might need to compute the transpose matrices if the solver does not support it (such as KK, while cusparse does).
1671 // In cusparse, one has to call cusparseSpSV_analysis() with updated triangular matrix values before calling cusparseSpSV_solve().
1672 // Simiarily, in KK sptrsv_symbolic() has to be called before sptrsv_solve(). We put these steps in MatSeqAIJKokkos{Transpose}SolveCheck.
1673 static PetscErrorCode MatSeqAIJKokkosSolveCheck(Mat A)
1674 {
1675   Mat_SeqAIJKokkosTriFactors *factors   = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1676   const PetscBool             has_lower = factors->iL_d.extent(0) ? PETSC_TRUE : PETSC_FALSE; // false with Choleksy
1677   const PetscBool             has_upper = factors->iU_d.extent(0) ? PETSC_TRUE : PETSC_FALSE; // true with LU and Choleksy
1678 
1679   PetscFunctionBegin;
1680   if (!factors->sptrsv_symbolic_completed) { // If sptrsv_symbolic was not called yet
1681     if (has_upper) PetscCallCXX(sptrsv_symbolic(&factors->khU, factors->iU_d, factors->jU_d, factors->aU_d));
1682     if (has_lower) PetscCallCXX(sptrsv_symbolic(&factors->khL, factors->iL_d, factors->jL_d, factors->aL_d));
1683     factors->sptrsv_symbolic_completed = PETSC_TRUE;
1684   }
1685   PetscFunctionReturn(PETSC_SUCCESS);
1686 }
1687 
1688 static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A)
1689 {
1690   const PetscInt              n         = A->rmap->n;
1691   Mat_SeqAIJKokkosTriFactors *factors   = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1692   const PetscBool             has_lower = factors->iL_d.extent(0) ? PETSC_TRUE : PETSC_FALSE; // false with Choleksy
1693   const PetscBool             has_upper = factors->iU_d.extent(0) ? PETSC_TRUE : PETSC_FALSE; // true with LU or Choleksy
1694 
1695   PetscFunctionBegin;
1696   if (!factors->transpose_updated) {
1697     if (has_upper) {
1698       if (!factors->iUt_d.extent(0)) {                                 // Allocate Ut on device if not yet
1699         factors->iUt_d = MatRowMapKokkosView("factors->iUt_d", n + 1); // KK requires this view to be initialized to 0 to call transpose_matrix
1700         factors->jUt_d = MatColIdxKokkosView(NoInit("factors->jUt_d"), factors->jU_d.extent(0));
1701         factors->aUt_d = MatScalarKokkosView(NoInit("factors->aUt_d"), factors->aU_d.extent(0));
1702       }
1703 
1704       if (factors->iU_h.extent(0)) { // If U is on host (factorization was done on host), we also compute the transpose on host
1705         if (!factors->U) {
1706           Mat_SeqAIJ *seq;
1707 
1708           PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF, n, n, factors->iU_h.data(), factors->jU_h.data(), factors->aU_h.data(), &factors->U));
1709           PetscCall(MatTranspose(factors->U, MAT_INITIAL_MATRIX, &factors->Ut));
1710 
1711           seq            = static_cast<Mat_SeqAIJ *>(factors->Ut->data);
1712           factors->iUt_h = MatRowMapKokkosViewHost(seq->i, n + 1);
1713           factors->jUt_h = MatColIdxKokkosViewHost(seq->j, seq->nz);
1714           factors->aUt_h = MatScalarKokkosViewHost(seq->a, seq->nz);
1715         } else {
1716           PetscCall(MatTranspose(factors->U, MAT_REUSE_MATRIX, &factors->Ut)); // Matrix Ut' data is aliased with {i, j, a}Ut_h
1717         }
1718         // Copy Ut from host to device
1719         PetscCallCXX(Kokkos::deep_copy(factors->iUt_d, factors->iUt_h));
1720         PetscCallCXX(Kokkos::deep_copy(factors->jUt_d, factors->jUt_h));
1721         PetscCallCXX(Kokkos::deep_copy(factors->aUt_d, factors->aUt_h));
1722       } else { // If U was computed on device, we also compute the transpose there
1723         // TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices. We have to sort the indices, until KK provides finer control options.
1724         PetscCallCXX(transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iU_d,
1725                                                                                                                                                                                                                                factors->jU_d, factors->aU_d,
1726                                                                                                                                                                                                                                factors->iUt_d, factors->jUt_d,
1727                                                                                                                                                                                                                                factors->aUt_d));
1728         PetscCallCXX(sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iUt_d, factors->jUt_d, factors->aUt_d));
1729       }
1730       PetscCallCXX(sptrsv_symbolic(&factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d));
1731     }
1732 
1733     // do the same for L with LU
1734     if (has_lower) {
1735       if (!factors->iLt_d.extent(0)) {                                 // Allocate Lt on device if not yet
1736         factors->iLt_d = MatRowMapKokkosView("factors->iLt_d", n + 1); // KK requires this view to be initialized to 0 to call transpose_matrix
1737         factors->jLt_d = MatColIdxKokkosView(NoInit("factors->jLt_d"), factors->jL_d.extent(0));
1738         factors->aLt_d = MatScalarKokkosView(NoInit("factors->aLt_d"), factors->aL_d.extent(0));
1739       }
1740 
1741       if (factors->iL_h.extent(0)) { // If L is on host, we also compute the transpose on host
1742         if (!factors->L) {
1743           Mat_SeqAIJ *seq;
1744 
1745           PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF, n, n, factors->iL_h.data(), factors->jL_h.data(), factors->aL_h.data(), &factors->L));
1746           PetscCall(MatTranspose(factors->L, MAT_INITIAL_MATRIX, &factors->Lt));
1747 
1748           seq            = static_cast<Mat_SeqAIJ *>(factors->Lt->data);
1749           factors->iLt_h = MatRowMapKokkosViewHost(seq->i, n + 1);
1750           factors->jLt_h = MatColIdxKokkosViewHost(seq->j, seq->nz);
1751           factors->aLt_h = MatScalarKokkosViewHost(seq->a, seq->nz);
1752         } else {
1753           PetscCall(MatTranspose(factors->L, MAT_REUSE_MATRIX, &factors->Lt)); // Matrix Lt' data is aliased with {i, j, a}Lt_h
1754         }
1755         // Copy Lt from host to device
1756         PetscCallCXX(Kokkos::deep_copy(factors->iLt_d, factors->iLt_h));
1757         PetscCallCXX(Kokkos::deep_copy(factors->jLt_d, factors->jLt_h));
1758         PetscCallCXX(Kokkos::deep_copy(factors->aLt_d, factors->aLt_h));
1759       } else { // If L was computed on device, we also compute the transpose there
1760         // TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices. We have to sort the indices, until KK provides finer control options.
1761         PetscCallCXX(transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iL_d,
1762                                                                                                                                                                                                                                factors->jL_d, factors->aL_d,
1763                                                                                                                                                                                                                                factors->iLt_d, factors->jLt_d,
1764                                                                                                                                                                                                                                factors->aLt_d));
1765         PetscCallCXX(sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iLt_d, factors->jLt_d, factors->aLt_d));
1766       }
1767       PetscCallCXX(sptrsv_symbolic(&factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d));
1768     }
1769 
1770     factors->transpose_updated = PETSC_TRUE;
1771   }
1772   PetscFunctionReturn(PETSC_SUCCESS);
1773 }
1774 
1775 // Solve Ax = b, with RAR = U^T D U, where R is the row (and col) permutation matrix on A.
1776 // R is represented by rowperm in factors. If R is identity (i.e, no reordering), then rowperm is empty.
1777 static PetscErrorCode MatSolve_SeqAIJKokkos_Cholesky(Mat A, Vec bb, Vec xx)
1778 {
1779   auto                        exec    = PetscGetKokkosExecutionSpace();
1780   Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1781   PetscInt                    m       = A->rmap->n;
1782   PetscScalarKokkosView       D       = factors->D_d;
1783   PetscScalarKokkosView       X, Y, B; // alias
1784   ConstPetscScalarKokkosView  b;
1785   PetscScalarKokkosView       x;
1786   PetscIntKokkosView         &rowperm  = factors->rowperm;
1787   PetscBool                   identity = rowperm.extent(0) ? PETSC_FALSE : PETSC_TRUE;
1788 
1789   PetscFunctionBegin;
1790   PetscCall(PetscLogGpuTimeBegin());
1791   PetscCall(MatSeqAIJKokkosSolveCheck(A));          // for UX = T
1792   PetscCall(MatSeqAIJKokkosTransposeSolveCheck(A)); // for U^T Y = B
1793   PetscCall(VecGetKokkosView(bb, &b));
1794   PetscCall(VecGetKokkosViewWrite(xx, &x));
1795 
1796   // Solve U^T Y = B
1797   if (identity) { // Reorder b with the row permutation
1798     B = PetscScalarKokkosView(const_cast<PetscScalar *>(b.data()), b.extent(0));
1799     Y = factors->workVector;
1800   } else {
1801     B = factors->workVector;
1802     PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { B(i) = b(rowperm(i)); }));
1803     Y = x;
1804   }
1805   PetscCallCXX(sptrsv_solve(exec, &factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d, B, Y));
1806 
1807   // Solve diag(D) Y' = Y.
1808   // Actually just do Y' = Y*D since D is already inverted in MatCholeskyFactorNumeric_SeqAIJ(). It is basically a vector element-wise multiplication.
1809   PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { Y(i) = Y(i) * D(i); }));
1810 
1811   // Solve UX = Y
1812   if (identity) {
1813     X = x;
1814   } else {
1815     X = factors->workVector; // B is not needed anymore
1816   }
1817   PetscCallCXX(sptrsv_solve(exec, &factors->khU, factors->iU_d, factors->jU_d, factors->aU_d, Y, X));
1818 
1819   // Reorder X with the inverse column (row) permutation
1820   if (!identity) {
1821     PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { x(rowperm(i)) = X(i); }));
1822   }
1823 
1824   PetscCall(VecRestoreKokkosView(bb, &b));
1825   PetscCall(VecRestoreKokkosViewWrite(xx, &x));
1826   PetscCall(PetscLogGpuTimeEnd());
1827   PetscFunctionReturn(PETSC_SUCCESS);
1828 }
1829 
1830 // Solve Ax = b, with RAC = LU, where R and C are row and col permutation matrices on A respectively.
1831 // R and C are represented by rowperm and colperm in factors.
1832 // If R or C is identity (i.e, no reordering), then rowperm or colperm is empty.
1833 static PetscErrorCode MatSolve_SeqAIJKokkos_LU(Mat A, Vec bb, Vec xx)
1834 {
1835   auto                        exec    = PetscGetKokkosExecutionSpace();
1836   Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1837   PetscInt                    m       = A->rmap->n;
1838   PetscScalarKokkosView       X, Y, B; // alias
1839   ConstPetscScalarKokkosView  b;
1840   PetscScalarKokkosView       x;
1841   PetscIntKokkosView         &rowperm      = factors->rowperm;
1842   PetscIntKokkosView         &colperm      = factors->colperm;
1843   PetscBool                   row_identity = rowperm.extent(0) ? PETSC_FALSE : PETSC_TRUE;
1844   PetscBool                   col_identity = colperm.extent(0) ? PETSC_FALSE : PETSC_TRUE;
1845 
1846   PetscFunctionBegin;
1847   PetscCall(PetscLogGpuTimeBegin());
1848   PetscCall(MatSeqAIJKokkosSolveCheck(A));
1849   PetscCall(VecGetKokkosView(bb, &b));
1850   PetscCall(VecGetKokkosViewWrite(xx, &x));
1851 
1852   // Solve L Y = B (i.e., L (U C^- x) = R b).  R b indicates applying the row permutation on b.
1853   if (row_identity) {
1854     B = PetscScalarKokkosView(const_cast<PetscScalar *>(b.data()), b.extent(0));
1855     Y = factors->workVector;
1856   } else {
1857     B = factors->workVector;
1858     PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { B(i) = b(rowperm(i)); }));
1859     Y = x;
1860   }
1861   PetscCallCXX(sptrsv_solve(exec, &factors->khL, factors->iL_d, factors->jL_d, factors->aL_d, B, Y));
1862 
1863   // Solve U C^- x = Y
1864   if (col_identity) {
1865     X = x;
1866   } else {
1867     X = factors->workVector;
1868   }
1869   PetscCallCXX(sptrsv_solve(exec, &factors->khU, factors->iU_d, factors->jU_d, factors->aU_d, Y, X));
1870 
1871   // x = C X; Reorder X with the inverse col permutation
1872   if (!col_identity) {
1873     PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { x(colperm(i)) = X(i); }));
1874   }
1875 
1876   PetscCall(VecRestoreKokkosView(bb, &b));
1877   PetscCall(VecRestoreKokkosViewWrite(xx, &x));
1878   PetscCall(PetscLogGpuTimeEnd());
1879   PetscFunctionReturn(PETSC_SUCCESS);
1880 }
1881 
1882 // Solve A^T x = b, with RAC = LU, where R and C are row and col permutation matrices on A respectively.
1883 // R and C are represented by rowperm and colperm in factors.
1884 // If R or C is identity (i.e, no reordering), then rowperm or colperm is empty.
1885 // A = R^-1 L U C^-1, so A^T = C^-T U^T L^T R^-T. But since C^- = C^T, R^- = R^T, we have A^T = C U^T L^T R.
1886 static PetscErrorCode MatSolveTranspose_SeqAIJKokkos_LU(Mat A, Vec bb, Vec xx)
1887 {
1888   auto                        exec    = PetscGetKokkosExecutionSpace();
1889   Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1890   PetscInt                    m       = A->rmap->n;
1891   PetscScalarKokkosView       X, Y, B; // alias
1892   ConstPetscScalarKokkosView  b;
1893   PetscScalarKokkosView       x;
1894   PetscIntKokkosView         &rowperm      = factors->rowperm;
1895   PetscIntKokkosView         &colperm      = factors->colperm;
1896   PetscBool                   row_identity = rowperm.extent(0) ? PETSC_FALSE : PETSC_TRUE;
1897   PetscBool                   col_identity = colperm.extent(0) ? PETSC_FALSE : PETSC_TRUE;
1898 
1899   PetscFunctionBegin;
1900   PetscCall(PetscLogGpuTimeBegin());
1901   PetscCall(MatSeqAIJKokkosTransposeSolveCheck(A)); // Update L^T, U^T if needed, and do sptrsv symbolic for L^T, U^T
1902   PetscCall(VecGetKokkosView(bb, &b));
1903   PetscCall(VecGetKokkosViewWrite(xx, &x));
1904 
1905   // Solve U^T Y = B (i.e., U^T (L^T R x) = C^- b).  Note C^- b = C^T b, which means applying the column permutation on b.
1906   if (col_identity) { // Reorder b with the col permutation
1907     B = PetscScalarKokkosView(const_cast<PetscScalar *>(b.data()), b.extent(0));
1908     Y = factors->workVector;
1909   } else {
1910     B = factors->workVector;
1911     PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { B(i) = b(colperm(i)); }));
1912     Y = x;
1913   }
1914   PetscCallCXX(sptrsv_solve(exec, &factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d, B, Y));
1915 
1916   // Solve L^T X = Y
1917   if (row_identity) {
1918     X = x;
1919   } else {
1920     X = factors->workVector;
1921   }
1922   PetscCallCXX(sptrsv_solve(exec, &factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d, Y, X));
1923 
1924   // x = R^- X = R^T X; Reorder X with the inverse row permutation
1925   if (!row_identity) {
1926     PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { x(rowperm(i)) = X(i); }));
1927   }
1928 
1929   PetscCall(VecRestoreKokkosView(bb, &b));
1930   PetscCall(VecRestoreKokkosViewWrite(xx, &x));
1931   PetscCall(PetscLogGpuTimeEnd());
1932   PetscFunctionReturn(PETSC_SUCCESS);
1933 }
1934 
1935 static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info)
1936 {
1937   PetscFunctionBegin;
1938   PetscCall(MatSeqAIJKokkosSyncHost(A));
1939   PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info));
1940 
1941   if (!info->solveonhost) { // if solve on host, then we don't need to copy L, U to device
1942     Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr;
1943     Mat_SeqAIJ                 *b       = static_cast<Mat_SeqAIJ *>(B->data);
1944     const PetscInt             *Bi = b->i, *Bj = b->j, *Bdiag = b->diag;
1945     const MatScalar            *Ba = b->a;
1946     PetscInt                    m = B->rmap->n, n = B->cmap->n;
1947 
1948     if (factors->iL_h.extent(0) == 0) { // Allocate memory and copy the L, U structure for the first time
1949       // Allocate memory and copy the structure
1950       factors->iL_h = MatRowMapKokkosViewHost(NoInit("iL_h"), m + 1);
1951       factors->jL_h = MatColIdxKokkosViewHost(NoInit("jL_h"), (Bi[m] - Bi[0]) + m); // + the diagonal entries
1952       factors->aL_h = MatScalarKokkosViewHost(NoInit("aL_h"), (Bi[m] - Bi[0]) + m);
1953       factors->iU_h = MatRowMapKokkosViewHost(NoInit("iU_h"), m + 1);
1954       factors->jU_h = MatColIdxKokkosViewHost(NoInit("jU_h"), (Bdiag[0] - Bdiag[m]));
1955       factors->aU_h = MatScalarKokkosViewHost(NoInit("aU_h"), (Bdiag[0] - Bdiag[m]));
1956 
1957       PetscInt *Li = factors->iL_h.data();
1958       PetscInt *Lj = factors->jL_h.data();
1959       PetscInt *Ui = factors->iU_h.data();
1960       PetscInt *Uj = factors->jU_h.data();
1961 
1962       Li[0] = Ui[0] = 0;
1963       for (PetscInt i = 0; i < m; i++) {
1964         PetscInt llen = Bi[i + 1] - Bi[i];       // exclusive of the diagonal entry
1965         PetscInt ulen = Bdiag[i] - Bdiag[i + 1]; // inclusive of the diagonal entry
1966 
1967         PetscArraycpy(Lj + Li[i], Bj + Bi[i], llen); // entries of L on the left of the diagonal
1968         Lj[Li[i] + llen] = i;                        // diagonal entry of L
1969 
1970         Uj[Ui[i]] = i;                                                  // diagonal entry of U
1971         PetscArraycpy(Uj + Ui[i] + 1, Bj + Bdiag[i + 1] + 1, ulen - 1); // entries of U on  the right of the diagonal
1972 
1973         Li[i + 1] = Li[i] + llen + 1;
1974         Ui[i + 1] = Ui[i] + ulen;
1975       }
1976 
1977       factors->iL_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->iL_h);
1978       factors->jL_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->jL_h);
1979       factors->iU_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->iU_h);
1980       factors->jU_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->jU_h);
1981       factors->aL_d = Kokkos::create_mirror_view(DefaultMemorySpace(), factors->aL_h);
1982       factors->aU_d = Kokkos::create_mirror_view(DefaultMemorySpace(), factors->aU_h);
1983 
1984       // Copy row/col permutation to device
1985       IS        rowperm = ((Mat_SeqAIJ *)B->data)->row;
1986       PetscBool row_identity;
1987       PetscCall(ISIdentity(rowperm, &row_identity));
1988       if (!row_identity) {
1989         const PetscInt *ip;
1990 
1991         PetscCall(ISGetIndices(rowperm, &ip));
1992         factors->rowperm = PetscIntKokkosView(NoInit("rowperm"), m);
1993         PetscCallCXX(Kokkos::deep_copy(factors->rowperm, PetscIntKokkosViewHost(const_cast<PetscInt *>(ip), m)));
1994         PetscCall(ISRestoreIndices(rowperm, &ip));
1995         PetscCall(PetscLogCpuToGpu(m * sizeof(PetscInt)));
1996       }
1997 
1998       IS        colperm = ((Mat_SeqAIJ *)B->data)->col;
1999       PetscBool col_identity;
2000       PetscCall(ISIdentity(colperm, &col_identity));
2001       if (!col_identity) {
2002         const PetscInt *ip;
2003 
2004         PetscCall(ISGetIndices(colperm, &ip));
2005         factors->colperm = PetscIntKokkosView(NoInit("colperm"), n);
2006         PetscCallCXX(Kokkos::deep_copy(factors->colperm, PetscIntKokkosViewHost(const_cast<PetscInt *>(ip), n)));
2007         PetscCall(ISRestoreIndices(colperm, &ip));
2008         PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
2009       }
2010 
2011       /* Create sptrsv handles for L, U and their transpose */
2012 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
2013       auto sptrsv_alg = SPTRSVAlgorithm::SPTRSV_CUSPARSE;
2014 #else
2015       auto sptrsv_alg = SPTRSVAlgorithm::SEQLVLSCHD_TP1;
2016 #endif
2017       factors->khL.create_sptrsv_handle(sptrsv_alg, m, true /* L is lower tri */);
2018       factors->khU.create_sptrsv_handle(sptrsv_alg, m, false /* U is not lower tri */);
2019       factors->khLt.create_sptrsv_handle(sptrsv_alg, m, false /* L^T is not lower tri */);
2020       factors->khUt.create_sptrsv_handle(sptrsv_alg, m, true /* U^T is lower tri */);
2021     }
2022 
2023     // Copy the value
2024     for (PetscInt i = 0; i < m; i++) {
2025       PetscInt        llen = Bi[i + 1] - Bi[i];
2026       PetscInt        ulen = Bdiag[i] - Bdiag[i + 1];
2027       const PetscInt *Li   = factors->iL_h.data();
2028       const PetscInt *Ui   = factors->iU_h.data();
2029 
2030       PetscScalar *La = factors->aL_h.data();
2031       PetscScalar *Ua = factors->aU_h.data();
2032 
2033       PetscArraycpy(La + Li[i], Ba + Bi[i], llen); // entries of L
2034       La[Li[i] + llen] = 1.0;                      // diagonal entry
2035 
2036       Ua[Ui[i]] = 1.0 / Ba[Bdiag[i]];                                 // diagonal entry
2037       PetscArraycpy(Ua + Ui[i] + 1, Ba + Bdiag[i + 1] + 1, ulen - 1); // entries of U
2038     }
2039 
2040     PetscCallCXX(Kokkos::deep_copy(factors->aL_d, factors->aL_h));
2041     PetscCallCXX(Kokkos::deep_copy(factors->aU_d, factors->aU_h));
2042     // Once the factors' values have changed, we need to update their transpose and redo sptrsv symbolic
2043     factors->transpose_updated         = PETSC_FALSE;
2044     factors->sptrsv_symbolic_completed = PETSC_FALSE;
2045 
2046     B->ops->solve          = MatSolve_SeqAIJKokkos_LU;
2047     B->ops->solvetranspose = MatSolveTranspose_SeqAIJKokkos_LU;
2048   }
2049 
2050   B->ops->matsolve          = NULL;
2051   B->ops->matsolvetranspose = NULL;
2052   PetscFunctionReturn(PETSC_SUCCESS);
2053 }
2054 
2055 static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos_ILU0(Mat B, Mat A, const MatFactorInfo *info)
2056 {
2057   Mat_SeqAIJKokkos           *aijkok   = (Mat_SeqAIJKokkos *)A->spptr;
2058   Mat_SeqAIJKokkosTriFactors *factors  = (Mat_SeqAIJKokkosTriFactors *)B->spptr;
2059   PetscInt                    fill_lev = info->levels;
2060 
2061   PetscFunctionBegin;
2062   PetscCall(PetscLogGpuTimeBegin());
2063   PetscCheck(!info->factoronhost, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "MatFactorInfo.factoronhost should be false");
2064   PetscCall(MatSeqAIJKokkosSyncDevice(A));
2065 
2066   auto a_d = aijkok->a_dual.view_device();
2067   auto i_d = aijkok->i_dual.view_device();
2068   auto j_d = aijkok->j_dual.view_device();
2069 
2070   PetscCallCXX(spiluk_numeric(&factors->kh, fill_lev, i_d, j_d, a_d, factors->iL_d, factors->jL_d, factors->aL_d, factors->iU_d, factors->jU_d, factors->aU_d));
2071 
2072   B->assembled              = PETSC_TRUE;
2073   B->preallocated           = PETSC_TRUE;
2074   B->ops->solve             = MatSolve_SeqAIJKokkos_LU;
2075   B->ops->solvetranspose    = MatSolveTranspose_SeqAIJKokkos_LU;
2076   B->ops->matsolve          = NULL;
2077   B->ops->matsolvetranspose = NULL;
2078 
2079   /* Once the factors' value changed, we need to update their transpose and sptrsv handle */
2080   factors->transpose_updated         = PETSC_FALSE;
2081   factors->sptrsv_symbolic_completed = PETSC_FALSE;
2082   /* TODO: log flops, but how to know that? */
2083   PetscCall(PetscLogGpuTimeEnd());
2084   PetscFunctionReturn(PETSC_SUCCESS);
2085 }
2086 
2087 // Use KK's spiluk_symbolic() to do ILU0 symbolic factorization, with no row/col reordering
2088 static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos_ILU0(Mat B, Mat A, IS, IS, const MatFactorInfo *info)
2089 {
2090   Mat_SeqAIJKokkos           *aijkok;
2091   Mat_SeqAIJ                 *b;
2092   Mat_SeqAIJKokkosTriFactors *factors  = (Mat_SeqAIJKokkosTriFactors *)B->spptr;
2093   PetscInt                    fill_lev = info->levels;
2094   PetscInt                    nnzA     = ((Mat_SeqAIJ *)A->data)->nz, nnzL, nnzU;
2095   PetscInt                    n        = A->rmap->n;
2096 
2097   PetscFunctionBegin;
2098   PetscCheck(!info->factoronhost, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "MatFactorInfo's factoronhost should be false as we are doing it on device right now");
2099   PetscCall(MatSeqAIJKokkosSyncDevice(A));
2100 
2101   /* Create a spiluk handle and then do symbolic factorization */
2102   nnzL = nnzU = PetscRealIntMultTruncate(info->fill, nnzA);
2103   factors->kh.create_spiluk_handle(SPILUKAlgorithm::SEQLVLSCHD_TP1, n, nnzL, nnzU);
2104 
2105   auto spiluk_handle = factors->kh.get_spiluk_handle();
2106 
2107   Kokkos::realloc(factors->iL_d, n + 1); /* Free old arrays and realloc */
2108   Kokkos::realloc(factors->jL_d, spiluk_handle->get_nnzL());
2109   Kokkos::realloc(factors->iU_d, n + 1);
2110   Kokkos::realloc(factors->jU_d, spiluk_handle->get_nnzU());
2111 
2112   aijkok   = (Mat_SeqAIJKokkos *)A->spptr;
2113   auto i_d = aijkok->i_dual.view_device();
2114   auto j_d = aijkok->j_dual.view_device();
2115   PetscCallCXX(spiluk_symbolic(&factors->kh, fill_lev, i_d, j_d, factors->iL_d, factors->jL_d, factors->iU_d, factors->jU_d));
2116   /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */
2117 
2118   Kokkos::resize(factors->jL_d, spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */
2119   Kokkos::resize(factors->jU_d, spiluk_handle->get_nnzU());
2120   Kokkos::realloc(factors->aL_d, spiluk_handle->get_nnzL()); /* No need to retain old value */
2121   Kokkos::realloc(factors->aU_d, spiluk_handle->get_nnzU());
2122 
2123   /* TODO: add options to select sptrsv algorithms */
2124   /* Create sptrsv handles for L, U and their transpose */
2125 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
2126   auto sptrsv_alg = SPTRSVAlgorithm::SPTRSV_CUSPARSE;
2127 #else
2128   auto sptrsv_alg = SPTRSVAlgorithm::SEQLVLSCHD_TP1;
2129 #endif
2130 
2131   factors->khL.create_sptrsv_handle(sptrsv_alg, n, true /* L is lower tri */);
2132   factors->khU.create_sptrsv_handle(sptrsv_alg, n, false /* U is not lower tri */);
2133   factors->khLt.create_sptrsv_handle(sptrsv_alg, n, false /* L^T is not lower tri */);
2134   factors->khUt.create_sptrsv_handle(sptrsv_alg, n, true /* U^T is lower tri */);
2135 
2136   /* Fill fields of the factor matrix B */
2137   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(B, MAT_SKIP_ALLOCATION, NULL));
2138   b     = (Mat_SeqAIJ *)B->data;
2139   b->nz = b->maxnz          = spiluk_handle->get_nnzL() + spiluk_handle->get_nnzU();
2140   B->info.fill_ratio_given  = info->fill;
2141   B->info.fill_ratio_needed = nnzA > 0 ? ((PetscReal)b->nz) / ((PetscReal)nnzA) : 1.0;
2142 
2143   B->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJKokkos_ILU0;
2144   PetscFunctionReturn(PETSC_SUCCESS);
2145 }
2146 
2147 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
2148 {
2149   PetscFunctionBegin;
2150   PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
2151   PetscCheck(!B->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr");
2152   PetscCallCXX(B->spptr = new Mat_SeqAIJKokkosTriFactors(B->rmap->n));
2153   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos;
2154   PetscFunctionReturn(PETSC_SUCCESS);
2155 }
2156 
2157 static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
2158 {
2159   PetscBool row_identity = PETSC_FALSE, col_identity = PETSC_FALSE;
2160 
2161   PetscFunctionBegin;
2162   if (!info->factoronhost) {
2163     PetscCall(ISIdentity(isrow, &row_identity));
2164     PetscCall(ISIdentity(iscol, &col_identity));
2165   }
2166 
2167   PetscCheck(!B->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr");
2168   PetscCallCXX(B->spptr = new Mat_SeqAIJKokkosTriFactors(B->rmap->n));
2169 
2170   if (!info->factoronhost && !info->levels && row_identity && col_identity) { // if level 0 and no reordering
2171     PetscCall(MatILUFactorSymbolic_SeqAIJKokkos_ILU0(B, A, isrow, iscol, info));
2172   } else {
2173     PetscCall(MatILUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info)); // otherwise, use PETSc's ILU on host
2174     B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos;
2175   }
2176   PetscFunctionReturn(PETSC_SUCCESS);
2177 }
2178 
2179 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info)
2180 {
2181   PetscFunctionBegin;
2182   PetscCall(MatSeqAIJKokkosSyncHost(A));
2183   PetscCall(MatCholeskyFactorNumeric_SeqAIJ(B, A, info));
2184 
2185   if (!info->solveonhost) { // if solve on host, then we don't need to copy L, U to device
2186     Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr;
2187     Mat_SeqAIJ                 *b       = static_cast<Mat_SeqAIJ *>(B->data);
2188     const PetscInt             *Bi = b->i, *Bj = b->j, *Bdiag = b->diag;
2189     const MatScalar            *Ba = b->a;
2190     PetscInt                    m  = B->rmap->n;
2191 
2192     if (factors->iU_h.extent(0) == 0) { // First time of numeric factorization
2193       // Allocate memory and copy the structure
2194       factors->iU_h = PetscIntKokkosViewHost(const_cast<PetscInt *>(Bi), m + 1); // wrap Bi as iU_h
2195       factors->jU_h = MatColIdxKokkosViewHost(NoInit("jU_h"), Bi[m]);
2196       factors->aU_h = MatScalarKokkosViewHost(NoInit("aU_h"), Bi[m]);
2197       factors->D_h  = MatScalarKokkosViewHost(NoInit("D_h"), m);
2198       factors->aU_d = Kokkos::create_mirror_view(DefaultMemorySpace(), factors->aU_h);
2199       factors->D_d  = Kokkos::create_mirror_view(DefaultMemorySpace(), factors->D_h);
2200 
2201       // Build jU_h from the skewed Aj
2202       PetscInt *Uj = factors->jU_h.data();
2203       for (PetscInt i = 0; i < m; i++) {
2204         PetscInt ulen = Bi[i + 1] - Bi[i];
2205         Uj[Bi[i]]     = i;                                              // diagonal entry
2206         PetscCall(PetscArraycpy(Uj + Bi[i] + 1, Bj + Bi[i], ulen - 1)); // entries of U on the right of the diagonal
2207       }
2208 
2209       // Copy iU, jU to device
2210       PetscCallCXX(factors->iU_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->iU_h));
2211       PetscCallCXX(factors->jU_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->jU_h));
2212 
2213       // Copy row/col permutation to device
2214       IS        rowperm = ((Mat_SeqAIJ *)B->data)->row;
2215       PetscBool row_identity;
2216       PetscCall(ISIdentity(rowperm, &row_identity));
2217       if (!row_identity) {
2218         const PetscInt *ip;
2219 
2220         PetscCall(ISGetIndices(rowperm, &ip));
2221         PetscCallCXX(factors->rowperm = PetscIntKokkosView(NoInit("rowperm"), m));
2222         PetscCallCXX(Kokkos::deep_copy(factors->rowperm, PetscIntKokkosViewHost(const_cast<PetscInt *>(ip), m)));
2223         PetscCall(ISRestoreIndices(rowperm, &ip));
2224         PetscCall(PetscLogCpuToGpu(m * sizeof(PetscInt)));
2225       }
2226 
2227       // Create sptrsv handles for U and U^T
2228 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
2229       auto sptrsv_alg = SPTRSVAlgorithm::SPTRSV_CUSPARSE;
2230 #else
2231       auto sptrsv_alg = SPTRSVAlgorithm::SEQLVLSCHD_TP1;
2232 #endif
2233       factors->khU.create_sptrsv_handle(sptrsv_alg, m, false /* U is not lower tri */);
2234       factors->khUt.create_sptrsv_handle(sptrsv_alg, m, true /* U^T is lower tri */);
2235     }
2236     // These pointers were set MatCholeskyFactorNumeric_SeqAIJ(), so we always need to update them
2237     B->ops->solve          = MatSolve_SeqAIJKokkos_Cholesky;
2238     B->ops->solvetranspose = MatSolve_SeqAIJKokkos_Cholesky;
2239 
2240     // Copy the value
2241     PetscScalar *Ua = factors->aU_h.data();
2242     PetscScalar *D  = factors->D_h.data();
2243     for (PetscInt i = 0; i < m; i++) {
2244       D[i]      = Ba[Bdiag[i]];     // actually Aa[Adiag[i]] is the inverse of the diagonal
2245       Ua[Bi[i]] = (PetscScalar)1.0; // set the unit diagonal for U
2246       for (PetscInt k = 0; k < Bi[i + 1] - Bi[i] - 1; k++) Ua[Bi[i] + 1 + k] = -Ba[Bi[i] + k];
2247     }
2248     PetscCallCXX(Kokkos::deep_copy(factors->aU_d, factors->aU_h));
2249     PetscCallCXX(Kokkos::deep_copy(factors->D_d, factors->D_h));
2250 
2251     factors->sptrsv_symbolic_completed = PETSC_FALSE; // When numeric value changed, we must do these again
2252     factors->transpose_updated         = PETSC_FALSE;
2253   }
2254 
2255   B->ops->matsolve          = NULL;
2256   B->ops->matsolvetranspose = NULL;
2257   PetscFunctionReturn(PETSC_SUCCESS);
2258 }
2259 
2260 static PetscErrorCode MatICCFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS perm, const MatFactorInfo *info)
2261 {
2262   PetscFunctionBegin;
2263   if (info->solveonhost) {
2264     // If solve on host, we have to change the type, as eventually we need to call MatSolve_SeqSBAIJ_1_NaturalOrdering() etc.
2265     PetscCall(MatSetType(B, MATSEQSBAIJ));
2266     PetscCall(MatSeqSBAIJSetPreallocation(B, 1, MAT_SKIP_ALLOCATION, NULL));
2267   }
2268 
2269   PetscCall(MatICCFactorSymbolic_SeqAIJ(B, A, perm, info));
2270 
2271   if (!info->solveonhost) {
2272     // If solve on device, B is still a MATSEQAIJKOKKOS, so we are good to allocate B->spptr
2273     PetscCheck(!B->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr");
2274     PetscCallCXX(B->spptr = new Mat_SeqAIJKokkosTriFactors(B->rmap->n));
2275     B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJKokkos;
2276   }
2277   PetscFunctionReturn(PETSC_SUCCESS);
2278 }
2279 
2280 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS perm, const MatFactorInfo *info)
2281 {
2282   PetscFunctionBegin;
2283   if (info->solveonhost) {
2284     // If solve on host, we have to change the type, as eventually we need to call MatSolve_SeqSBAIJ_1_NaturalOrdering() etc.
2285     PetscCall(MatSetType(B, MATSEQSBAIJ));
2286     PetscCall(MatSeqSBAIJSetPreallocation(B, 1, MAT_SKIP_ALLOCATION, NULL));
2287   }
2288 
2289   PetscCall(MatCholeskyFactorSymbolic_SeqAIJ(B, A, perm, info)); // it sets B's two ISes ((Mat_SeqAIJ*)B->data)->{row, col} to perm
2290 
2291   if (!info->solveonhost) {
2292     // If solve on device, B is still a MATSEQAIJKOKKOS, so we are good to allocate B->spptr
2293     PetscCheck(!B->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr");
2294     PetscCallCXX(B->spptr = new Mat_SeqAIJKokkosTriFactors(B->rmap->n));
2295     B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJKokkos;
2296   }
2297   PetscFunctionReturn(PETSC_SUCCESS);
2298 }
2299 
2300 // The _Kokkos suffix means we will use Kokkos as a solver for the SeqAIJKokkos matrix
2301 static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos_Kokkos(Mat A, MatSolverType *type)
2302 {
2303   PetscFunctionBegin;
2304   *type = MATSOLVERKOKKOS;
2305   PetscFunctionReturn(PETSC_SUCCESS);
2306 }
2307 
2308 /*MC
2309   MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices
2310   on a single GPU of type, `MATSEQAIJKOKKOS`, `MATAIJKOKKOS`.
2311 
2312   Level: beginner
2313 
2314 .seealso: [](ch_matrices), `Mat`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJKokkos()`, `MATAIJKOKKOS`, `MatKokkosSetFormat()`, `MatKokkosStorageFormat`, `MatKokkosFormatOperation`
2315 M*/
2316 PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A, MatFactorType ftype, Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */
2317 {
2318   PetscInt n = A->rmap->n;
2319   MPI_Comm comm;
2320 
2321   PetscFunctionBegin;
2322   PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
2323   PetscCall(MatCreate(comm, B));
2324   PetscCall(MatSetSizes(*B, n, n, n, n));
2325   PetscCall(MatSetBlockSizesFromMats(*B, A, A));
2326   (*B)->factortype = ftype;
2327   PetscCall(MatSetType(*B, MATSEQAIJKOKKOS));
2328   PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL));
2329   PetscCheck(!(*B)->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr");
2330 
2331   if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
2332     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJKokkos;
2333     (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos;
2334     PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU]));
2335     PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILU]));
2336     PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILUDT]));
2337   } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
2338     (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJKokkos;
2339     (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJKokkos;
2340     PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]));
2341     PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ICC]));
2342   } else SETERRQ(comm, PETSC_ERR_SUP, "MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]);
2343 
2344   // The factorization can use the ordering provided in MatLUFactorSymbolic(), MatCholeskyFactorSymbolic() etc, though we do it on host
2345   (*B)->canuseordering = PETSC_TRUE;
2346   PetscCall(PetscObjectComposeFunction((PetscObject)*B, "MatFactorGetSolverType_C", MatFactorGetSolverType_SeqAIJKokkos_Kokkos));
2347   PetscFunctionReturn(PETSC_SUCCESS);
2348 }
2349 
2350 PETSC_INTERN PetscErrorCode MatSolverTypeRegister_Kokkos(void)
2351 {
2352   PetscFunctionBegin;
2353   PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_LU, MatGetFactor_SeqAIJKokkos_Kokkos));
2354   PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_CHOLESKY, MatGetFactor_SeqAIJKokkos_Kokkos));
2355   PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_ILU, MatGetFactor_SeqAIJKokkos_Kokkos));
2356   PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_ICC, MatGetFactor_SeqAIJKokkos_Kokkos));
2357   PetscFunctionReturn(PETSC_SUCCESS);
2358 }
2359 
2360 /* Utility to print out a KokkosCsrMatrix for debugging */
2361 PETSC_INTERN PetscErrorCode PrintCsrMatrix(const KokkosCsrMatrix &csrmat)
2362 {
2363   const auto        &iv = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), csrmat.graph.row_map);
2364   const auto        &jv = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), csrmat.graph.entries);
2365   const auto        &av = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), csrmat.values);
2366   const PetscInt    *i  = iv.data();
2367   const PetscInt    *j  = jv.data();
2368   const PetscScalar *a  = av.data();
2369   PetscInt           m = csrmat.numRows(), n = csrmat.numCols(), nnz = csrmat.nnz();
2370 
2371   PetscFunctionBegin;
2372   PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT " x %" PetscInt_FMT " SeqAIJKokkos, with %" PetscInt_FMT " nonzeros\n", m, n, nnz));
2373   for (PetscInt k = 0; k < m; k++) {
2374     PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT ": ", k));
2375     for (PetscInt p = i[k]; p < i[k + 1]; p++) PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT "(%.1f), ", j[p], (double)PetscRealPart(a[p])));
2376     PetscCall(PetscPrintf(PETSC_COMM_SELF, "\n"));
2377   }
2378   PetscFunctionReturn(PETSC_SUCCESS);
2379 }
2380