xref: /petsc/src/mat/impls/aij/seq/kokkos/aijkok.kokkos.cxx (revision c36a59bee2e66f2e80e3e9e7603aedea4197893c)
1 #include <petscvec_kokkos.hpp>
2 #include <petscpkg_version.h>
3 #include <petsc/private/petscimpl.h>
4 #include <petsc/private/sfimpl.h>
5 #include <petscsystypes.h>
6 #include <petscerror.h>
7 
8 #include <Kokkos_Core.hpp>
9 #include <KokkosBlas.hpp>
10 #include <KokkosSparse_CrsMatrix.hpp>
11 #include <KokkosSparse_spmv.hpp>
12 #include <KokkosSparse_spiluk.hpp>
13 #include <KokkosSparse_sptrsv.hpp>
14 #include <KokkosSparse_spgemm.hpp>
15 #include <KokkosSparse_spadd.hpp>
16 
17 #include <../src/mat/impls/aij/seq/kokkos/aijkok.hpp>
18 
19 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_GE(3, 7, 0)
20   #include <KokkosSparse_Utils.hpp>
21 using KokkosSparse::sort_crs_matrix;
22 using KokkosSparse::Impl::transpose_matrix;
23 #else
24   #include <KokkosKernels_Sorting.hpp>
25 using KokkosKernels::sort_crs_matrix;
26 using KokkosKernels::Impl::transpose_matrix;
27 #endif
28 
29 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */
30 
31 /* MatAssemblyEnd_SeqAIJKokkos() happens when we finalized nonzeros of the matrix, either after
32    we assembled the matrix on host, or after we directly produced the matrix data on device (ex., through MatMatMult).
33    In the latter case, it is important to set a_dual's sync state correctly.
34  */
35 static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A, MatAssemblyType mode)
36 {
37   Mat_SeqAIJ       *aijseq;
38   Mat_SeqAIJKokkos *aijkok;
39 
40   PetscFunctionBegin;
41   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS);
42   PetscCall(MatAssemblyEnd_SeqAIJ(A, mode));
43 
44   aijseq = static_cast<Mat_SeqAIJ *>(A->data);
45   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
46 
47   /* If aijkok does not exist, we just copy i, j to device.
48      If aijkok already exists, but the device's nonzero pattern does not match with the host's, we assume the latest data is on host.
49      In both cases, we build a new aijkok structure.
50   */
51   if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { /* aijkok might not exist yet or nonzero pattern has changed */
52     delete aijkok;
53     aijkok   = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aijseq->nz, aijseq->i, aijseq->j, aijseq->a, A->nonzerostate, PETSC_FALSE /*don't copy mat values to device*/);
54     A->spptr = aijkok;
55   }
56 
57   PetscFunctionReturn(PETSC_SUCCESS);
58 }
59 
60 /* Sync CSR data to device if not yet */
61 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A)
62 {
63   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
64 
65   PetscFunctionBegin;
66   PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Can't sync factorized matrix from host to device");
67   PetscCheck(aijkok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
68   if (aijkok->a_dual.need_sync_device()) {
69     aijkok->a_dual.sync_device();
70     aijkok->transpose_updated = PETSC_FALSE; /* values of the transpose is out-of-date */
71     aijkok->hermitian_updated = PETSC_FALSE;
72   }
73   PetscFunctionReturn(PETSC_SUCCESS);
74 }
75 
76 /* Mark the CSR data on device as modified */
77 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosModifyDevice(Mat A)
78 {
79   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
80 
81   PetscFunctionBegin;
82   PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Not supported for factorized matries");
83   aijkok->a_dual.clear_sync_state();
84   aijkok->a_dual.modify_device();
85   aijkok->transpose_updated = PETSC_FALSE;
86   aijkok->hermitian_updated = PETSC_FALSE;
87   PetscCall(MatSeqAIJInvalidateDiagonal(A));
88   PetscCall(PetscObjectStateIncrease((PetscObject)A));
89   PetscFunctionReturn(PETSC_SUCCESS);
90 }
91 
92 static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A)
93 {
94   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
95 
96   PetscFunctionBegin;
97   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
98   /* We do not expect one needs factors on host  */
99   PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Can't sync factorized matrix from device to host");
100   PetscCheck(aijkok, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing AIJKOK");
101   aijkok->a_dual.sync_host();
102   PetscFunctionReturn(PETSC_SUCCESS);
103 }
104 
105 static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A, PetscScalar *array[])
106 {
107   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
108 
109   PetscFunctionBegin;
110   /* aijkok contains valid pointers only if the host's nonzerostate matches with the device's.
111     Calling MatSeqAIJSetPreallocation() or MatSetValues() on host, where aijseq->{i,j,a} might be
112     reallocated, will lead to stale {i,j,a}_dual in aijkok. In both operations, the hosts's nonzerostate
113     must have been updated. The stale aijkok will be rebuilt during MatAssemblyEnd.
114   */
115   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
116     aijkok->a_dual.sync_host();
117     *array = aijkok->a_dual.view_host().data();
118   } else { /* Happens when calling MatSetValues on a newly created matrix */
119     *array = static_cast<Mat_SeqAIJ *>(A->data)->a;
120   }
121   PetscFunctionReturn(PETSC_SUCCESS);
122 }
123 
124 static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJKokkos(Mat A, PetscScalar *array[])
125 {
126   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
127 
128   PetscFunctionBegin;
129   if (aijkok && A->nonzerostate == aijkok->nonzerostate) aijkok->a_dual.modify_host();
130   PetscFunctionReturn(PETSC_SUCCESS);
131 }
132 
133 static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[])
134 {
135   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
136 
137   PetscFunctionBegin;
138   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
139     aijkok->a_dual.sync_host();
140     *array = aijkok->a_dual.view_host().data();
141   } else {
142     *array = static_cast<Mat_SeqAIJ *>(A->data)->a;
143   }
144   PetscFunctionReturn(PETSC_SUCCESS);
145 }
146 
147 static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[])
148 {
149   PetscFunctionBegin;
150   *array = NULL;
151   PetscFunctionReturn(PETSC_SUCCESS);
152 }
153 
154 static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[])
155 {
156   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
157 
158   PetscFunctionBegin;
159   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
160     *array = aijkok->a_dual.view_host().data();
161   } else { /* Ex. happens with MatZeroEntries on a preallocated but not assembled matrix */
162     *array = static_cast<Mat_SeqAIJ *>(A->data)->a;
163   }
164   PetscFunctionReturn(PETSC_SUCCESS);
165 }
166 
167 static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[])
168 {
169   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
170 
171   PetscFunctionBegin;
172   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
173     aijkok->a_dual.clear_sync_state();
174     aijkok->a_dual.modify_host();
175   }
176   PetscFunctionReturn(PETSC_SUCCESS);
177 }
178 
179 static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJKokkos(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype)
180 {
181   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
182 
183   PetscFunctionBegin;
184   PetscCheck(aijkok != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "aijkok is NULL");
185 
186   if (i) *i = aijkok->i_device_data();
187   if (j) *j = aijkok->j_device_data();
188   if (a) {
189     aijkok->a_dual.sync_device();
190     *a = aijkok->a_device_data();
191   }
192   if (mtype) *mtype = PETSC_MEMTYPE_KOKKOS;
193   PetscFunctionReturn(PETSC_SUCCESS);
194 }
195 
196 /*
197   Generate the sparsity pattern of a MatSeqAIJKokkos matrix's transpose on device.
198 
199   Input Parameter:
200 .  A       - the MATSEQAIJKOKKOS matrix
201 
202   Output Parameters:
203 +  perm_d - the permutation array on device, which connects Ta(i) = Aa(perm(i))
204 -  T_d    - the transpose on device, whose value array is allocated but not initialized
205 */
206 static PetscErrorCode MatSeqAIJKokkosGenerateTransposeStructure(Mat A, MatRowMapKokkosView &perm_d, KokkosCsrMatrix &T_d)
207 {
208   Mat_SeqAIJ             *aseq = static_cast<Mat_SeqAIJ *>(A->data);
209   PetscInt                nz = aseq->nz, m = A->rmap->N, n = A->cmap->n;
210   const PetscInt         *Ai = aseq->i, *Aj = aseq->j;
211   MatRowMapKokkosViewHost Ti_h("Ti", n + 1);
212   MatRowMapType          *Ti = Ti_h.data();
213   MatColIdxKokkosViewHost Tj_h("Tj", nz);
214   MatRowMapKokkosViewHost perm_h("permutation", nz);
215   PetscInt               *Tj   = Tj_h.data();
216   PetscInt               *perm = perm_h.data();
217   PetscInt               *offset;
218 
219   PetscFunctionBegin;
220   // Populate Ti
221   PetscCallCXX(Kokkos::deep_copy(Ti_h, 0));
222   Ti++;
223   for (PetscInt i = 0; i < nz; i++) Ti[Aj[i]]++;
224   Ti--;
225   for (PetscInt i = 0; i < n; i++) Ti[i + 1] += Ti[i];
226 
227   // Populate Tj and the permutation array
228   PetscCall(PetscCalloc1(n, &offset)); // offset in each T row to fill in its column indices
229   for (PetscInt i = 0; i < m; i++) {
230     for (PetscInt j = Ai[i]; j < Ai[i + 1]; j++) { // A's (i,j) is T's (j,i)
231       PetscInt r    = Aj[j];                       // row r of T
232       PetscInt disp = Ti[r] + offset[r];
233 
234       Tj[disp]   = i; // col i of T
235       perm[disp] = j;
236       offset[r]++;
237     }
238   }
239   PetscCall(PetscFree(offset));
240 
241   // Sort each row of T, along with the permutation array
242   for (PetscInt i = 0; i < n; i++) PetscCall(PetscSortIntWithArray(Ti[i + 1] - Ti[i], Tj + Ti[i], perm + Ti[i]));
243 
244   // Output perm and T on device
245   auto Ti_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), Ti_h);
246   auto Tj_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), Tj_h);
247   PetscCallCXX(T_d = KokkosCsrMatrix("csrmatT", n, m, nz, MatScalarKokkosView("Ta", nz), Ti_d, Tj_d));
248   PetscCallCXX(perm_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), perm_h));
249   PetscFunctionReturn(PETSC_SUCCESS);
250 }
251 
252 // Generate the transpose on device and cache it internally
253 // Note: KK transpose_matrix() does not have support symbolic/numeric transpose, so we do it on our own
254 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGenerateTranspose_Private(Mat A, KokkosCsrMatrix *csrmatT)
255 {
256   Mat_SeqAIJ       *aseq = static_cast<Mat_SeqAIJ *>(A->data);
257   Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
258   PetscInt          nz = aseq->nz, m = A->rmap->N, n = A->cmap->n;
259   KokkosCsrMatrix  &T = akok->csrmatT;
260 
261   PetscFunctionBegin;
262   PetscCheck(akok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
263   PetscCallCXX(akok->a_dual.sync_device()); // Sync A's valeus since we are going to access them on device
264 
265   const auto &Aa = akok->a_dual.view_device();
266 
267   if (A->symmetric == PETSC_BOOL3_TRUE) {
268     *csrmatT = akok->csrmat;
269   } else {
270     // See if we already have a cached transpose and its value is up to date
271     if (T.numRows() == n && T.numCols() == m) {  // this indicates csrmatT had been generated before, otherwise T has 0 rows/cols after construction
272       if (!akok->transpose_updated) {            // if the value is out of date, update the cached version
273         const auto &perm = akok->transpose_perm; // get the permutation array
274         auto       &Ta   = T.values;
275 
276         PetscCallCXX(Kokkos::parallel_for(
277           nz, KOKKOS_LAMBDA(const PetscInt i) { Ta(i) = Aa(perm(i)); }));
278       }
279     } else { // Generate T of size n x m for the first time
280       MatRowMapKokkosView perm;
281 
282       PetscCall(MatSeqAIJKokkosGenerateTransposeStructure(A, perm, T));
283       akok->transpose_perm = perm; // cache the perm in this matrix for reuse
284       PetscCallCXX(Kokkos::parallel_for(
285         nz, KOKKOS_LAMBDA(const PetscInt i) { T.values(i) = Aa(perm(i)); }));
286     }
287     akok->transpose_updated = PETSC_TRUE;
288     *csrmatT                = akok->csrmatT;
289   }
290   PetscFunctionReturn(PETSC_SUCCESS);
291 }
292 
293 // Generate the Hermitian on device and cache it internally
294 static PetscErrorCode MatSeqAIJKokkosGenerateHermitian_Private(Mat A, KokkosCsrMatrix *csrmatH)
295 {
296   Mat_SeqAIJ       *aseq = static_cast<Mat_SeqAIJ *>(A->data);
297   Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
298   PetscInt          nz = aseq->nz, m = A->rmap->N, n = A->cmap->n;
299   KokkosCsrMatrix  &T = akok->csrmatH;
300 
301   PetscFunctionBegin;
302   PetscCheck(akok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
303   PetscCallCXX(akok->a_dual.sync_device()); // Sync A's values since we are going to access them on device
304 
305   const auto &Aa = akok->a_dual.view_device();
306 
307   if (A->hermitian == PETSC_BOOL3_TRUE) {
308     *csrmatH = akok->csrmat;
309   } else {
310     // See if we already have a cached hermitian and its value is up to date
311     if (T.numRows() == n && T.numCols() == m) {  // this indicates csrmatT had been generated before, otherwise T has 0 rows/cols after construction
312       if (!akok->hermitian_updated) {            // if the value is out of date, update the cached version
313         const auto &perm = akok->transpose_perm; // get the permutation array
314         auto       &Ta   = T.values;
315 
316         PetscCallCXX(Kokkos::parallel_for(
317           nz, KOKKOS_LAMBDA(const PetscInt i) { Ta(i) = PetscConj(Aa(perm(i))); }));
318       }
319     } else { // Generate T of size n x m for the first time
320       MatRowMapKokkosView perm;
321 
322       PetscCall(MatSeqAIJKokkosGenerateTransposeStructure(A, perm, T));
323       akok->transpose_perm = perm; // cache the perm in this matrix for reuse
324       PetscCallCXX(Kokkos::parallel_for(
325         nz, KOKKOS_LAMBDA(const PetscInt i) { T.values(i) = PetscConj(Aa(perm(i))); }));
326     }
327     akok->hermitian_updated = PETSC_TRUE;
328     *csrmatH                = akok->csrmatH;
329   }
330   PetscFunctionReturn(PETSC_SUCCESS);
331 }
332 
333 /* y = A x */
334 static PetscErrorCode MatMult_SeqAIJKokkos(Mat A, Vec xx, Vec yy)
335 {
336   Mat_SeqAIJKokkos          *aijkok;
337   ConstPetscScalarKokkosView xv;
338   PetscScalarKokkosView      yv;
339 
340   PetscFunctionBegin;
341   PetscCall(PetscLogGpuTimeBegin());
342   PetscCall(MatSeqAIJKokkosSyncDevice(A));
343   PetscCall(VecGetKokkosView(xx, &xv));
344   PetscCall(VecGetKokkosViewWrite(yy, &yv));
345   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
346   PetscCallCXX(KokkosSparse::spmv("N", 1.0 /*alpha*/, aijkok->csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A x + beta y */
347   PetscCall(VecRestoreKokkosView(xx, &xv));
348   PetscCall(VecRestoreKokkosViewWrite(yy, &yv));
349   /* 2.0*nnz - numRows seems more accurate here but assumes there are no zero-rows. So a little sloppy here. */
350   PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz()));
351   PetscCall(PetscLogGpuTimeEnd());
352   PetscFunctionReturn(PETSC_SUCCESS);
353 }
354 
355 /* y = A^T x */
356 static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy)
357 {
358   Mat_SeqAIJKokkos          *aijkok;
359   const char                *mode;
360   ConstPetscScalarKokkosView xv;
361   PetscScalarKokkosView      yv;
362   KokkosCsrMatrix            csrmat;
363 
364   PetscFunctionBegin;
365   PetscCall(PetscLogGpuTimeBegin());
366   PetscCall(MatSeqAIJKokkosSyncDevice(A));
367   PetscCall(VecGetKokkosView(xx, &xv));
368   PetscCall(VecGetKokkosViewWrite(yy, &yv));
369   if (A->form_explicit_transpose) {
370     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat));
371     mode = "N";
372   } else {
373     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
374     csrmat = aijkok->csrmat;
375     mode   = "T";
376   }
377   PetscCallCXX(KokkosSparse::spmv(mode, 1.0 /*alpha*/, csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^T x + beta y */
378   PetscCall(VecRestoreKokkosView(xx, &xv));
379   PetscCall(VecRestoreKokkosViewWrite(yy, &yv));
380   PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
381   PetscCall(PetscLogGpuTimeEnd());
382   PetscFunctionReturn(PETSC_SUCCESS);
383 }
384 
385 /* y = A^H x */
386 static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy)
387 {
388   Mat_SeqAIJKokkos          *aijkok;
389   const char                *mode;
390   ConstPetscScalarKokkosView xv;
391   PetscScalarKokkosView      yv;
392   KokkosCsrMatrix            csrmat;
393 
394   PetscFunctionBegin;
395   PetscCall(PetscLogGpuTimeBegin());
396   PetscCall(MatSeqAIJKokkosSyncDevice(A));
397   PetscCall(VecGetKokkosView(xx, &xv));
398   PetscCall(VecGetKokkosViewWrite(yy, &yv));
399   if (A->form_explicit_transpose) {
400     PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat));
401     mode = "N";
402   } else {
403     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
404     csrmat = aijkok->csrmat;
405     mode   = "C";
406   }
407   PetscCallCXX(KokkosSparse::spmv(mode, 1.0 /*alpha*/, csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^H x + beta y */
408   PetscCall(VecRestoreKokkosView(xx, &xv));
409   PetscCall(VecRestoreKokkosViewWrite(yy, &yv));
410   PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
411   PetscCall(PetscLogGpuTimeEnd());
412   PetscFunctionReturn(PETSC_SUCCESS);
413 }
414 
415 /* z = A x + y */
416 static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz)
417 {
418   Mat_SeqAIJKokkos          *aijkok;
419   ConstPetscScalarKokkosView xv, yv;
420   PetscScalarKokkosView      zv;
421 
422   PetscFunctionBegin;
423   PetscCall(PetscLogGpuTimeBegin());
424   PetscCall(MatSeqAIJKokkosSyncDevice(A));
425   PetscCall(VecGetKokkosView(xx, &xv));
426   PetscCall(VecGetKokkosView(yy, &yv));
427   PetscCall(VecGetKokkosViewWrite(zz, &zv));
428   if (zz != yy) Kokkos::deep_copy(zv, yv);
429   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
430   PetscCallCXX(KokkosSparse::spmv("N", 1.0 /*alpha*/, aijkok->csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A x + beta z */
431   PetscCall(VecRestoreKokkosView(xx, &xv));
432   PetscCall(VecRestoreKokkosView(yy, &yv));
433   PetscCall(VecRestoreKokkosViewWrite(zz, &zv));
434   PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz()));
435   PetscCall(PetscLogGpuTimeEnd());
436   PetscFunctionReturn(PETSC_SUCCESS);
437 }
438 
439 /* z = A^T x + y */
440 static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz)
441 {
442   Mat_SeqAIJKokkos          *aijkok;
443   const char                *mode;
444   ConstPetscScalarKokkosView xv, yv;
445   PetscScalarKokkosView      zv;
446   KokkosCsrMatrix            csrmat;
447 
448   PetscFunctionBegin;
449   PetscCall(PetscLogGpuTimeBegin());
450   PetscCall(MatSeqAIJKokkosSyncDevice(A));
451   PetscCall(VecGetKokkosView(xx, &xv));
452   PetscCall(VecGetKokkosView(yy, &yv));
453   PetscCall(VecGetKokkosViewWrite(zz, &zv));
454   if (zz != yy) Kokkos::deep_copy(zv, yv);
455   if (A->form_explicit_transpose) {
456     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat));
457     mode = "N";
458   } else {
459     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
460     csrmat = aijkok->csrmat;
461     mode   = "T";
462   }
463   PetscCallCXX(KokkosSparse::spmv(mode, 1.0 /*alpha*/, csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^T x + beta z */
464   PetscCall(VecRestoreKokkosView(xx, &xv));
465   PetscCall(VecRestoreKokkosView(yy, &yv));
466   PetscCall(VecRestoreKokkosViewWrite(zz, &zv));
467   PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
468   PetscCall(PetscLogGpuTimeEnd());
469   PetscFunctionReturn(PETSC_SUCCESS);
470 }
471 
472 /* z = A^H x + y */
473 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz)
474 {
475   Mat_SeqAIJKokkos          *aijkok;
476   const char                *mode;
477   ConstPetscScalarKokkosView xv, yv;
478   PetscScalarKokkosView      zv;
479   KokkosCsrMatrix            csrmat;
480 
481   PetscFunctionBegin;
482   PetscCall(PetscLogGpuTimeBegin());
483   PetscCall(MatSeqAIJKokkosSyncDevice(A));
484   PetscCall(VecGetKokkosView(xx, &xv));
485   PetscCall(VecGetKokkosView(yy, &yv));
486   PetscCall(VecGetKokkosViewWrite(zz, &zv));
487   if (zz != yy) Kokkos::deep_copy(zv, yv);
488   if (A->form_explicit_transpose) {
489     PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat));
490     mode = "N";
491   } else {
492     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
493     csrmat = aijkok->csrmat;
494     mode   = "C";
495   }
496   PetscCallCXX(KokkosSparse::spmv(mode, 1.0 /*alpha*/, csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^H x + beta z */
497   PetscCall(VecRestoreKokkosView(xx, &xv));
498   PetscCall(VecRestoreKokkosView(yy, &yv));
499   PetscCall(VecRestoreKokkosViewWrite(zz, &zv));
500   PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
501   PetscCall(PetscLogGpuTimeEnd());
502   PetscFunctionReturn(PETSC_SUCCESS);
503 }
504 
505 PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A, MatOption op, PetscBool flg)
506 {
507   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
508 
509   PetscFunctionBegin;
510   switch (op) {
511   case MAT_FORM_EXPLICIT_TRANSPOSE:
512     /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
513     if (A->form_explicit_transpose && !flg && aijkok) PetscCall(aijkok->DestroyMatTranspose());
514     A->form_explicit_transpose = flg;
515     break;
516   default:
517     PetscCall(MatSetOption_SeqAIJ(A, op, flg));
518     break;
519   }
520   PetscFunctionReturn(PETSC_SUCCESS);
521 }
522 
523 /* Depending on reuse, either build a new mat, or use the existing mat */
524 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat *newmat)
525 {
526   Mat_SeqAIJ *aseq;
527 
528   PetscFunctionBegin;
529   PetscCall(PetscKokkosInitializeCheck());
530   if (reuse == MAT_INITIAL_MATRIX) {                      /* Build a brand new mat */
531     PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat));  /* the returned newmat is a SeqAIJKokkos */
532   } else if (reuse == MAT_REUSE_MATRIX) {                 /* Reuse the mat created before */
533     PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN)); /* newmat is already a SeqAIJKokkos */
534   } else if (reuse == MAT_INPLACE_MATRIX) {               /* newmat is A */
535     PetscCheck(A == *newmat, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "A != *newmat with MAT_INPLACE_MATRIX");
536     PetscCall(PetscFree(A->defaultvectype));
537     PetscCall(PetscStrallocpy(VECKOKKOS, &A->defaultvectype)); /* Allocate and copy the string */
538     PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATSEQAIJKOKKOS));
539     PetscCall(MatSetOps_SeqAIJKokkos(A));
540     aseq = static_cast<Mat_SeqAIJ *>(A->data);
541     if (A->assembled) { /* Copy i, j (but not values) to device for an assembled matrix if not yet */
542       PetscCheck(!A->spptr, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Expect NULL (Mat_SeqAIJKokkos*)A->spptr");
543       A->spptr = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aseq->nz, aseq->i, aseq->j, aseq->a, A->nonzerostate, PETSC_FALSE);
544     }
545   }
546   PetscFunctionReturn(PETSC_SUCCESS);
547 }
548 
549 /* MatDuplicate always creates a new matrix. MatDuplicate can be called either on an assembled matrix or
550    an unassembled matrix, even though MAT_COPY_VALUES is not allowed for unassembled matrix.
551  */
552 static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A, MatDuplicateOption dupOption, Mat *B)
553 {
554   Mat_SeqAIJ       *bseq;
555   Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr), *bkok;
556   Mat               mat;
557 
558   PetscFunctionBegin;
559   /* Do not copy values on host as A's latest values might be on device. We don't want to do sync blindly */
560   PetscCall(MatDuplicate_SeqAIJ(A, MAT_DO_NOT_COPY_VALUES, B));
561   mat = *B;
562   if (A->assembled) {
563     bseq = static_cast<Mat_SeqAIJ *>(mat->data);
564     bkok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, bseq->nz, bseq->i, bseq->j, bseq->a, mat->nonzerostate, PETSC_FALSE);
565     bkok->a_dual.clear_sync_state(); /* Clear B's sync state as it will be decided below */
566     /* Now copy values to B if needed */
567     if (dupOption == MAT_COPY_VALUES) {
568       if (akok->a_dual.need_sync_device()) {
569         Kokkos::deep_copy(bkok->a_dual.view_host(), akok->a_dual.view_host());
570         bkok->a_dual.modify_host();
571       } else { /* If device has the latest data, we only copy data on device */
572         Kokkos::deep_copy(bkok->a_dual.view_device(), akok->a_dual.view_device());
573         bkok->a_dual.modify_device();
574       }
575     } else { /* MAT_DO_NOT_COPY_VALUES or MAT_SHARE_NONZERO_PATTERN. B's values should be zeroed */
576       /* B's values on host should be already zeroed by MatDuplicate_SeqAIJ() */
577       bkok->a_dual.modify_host();
578     }
579     mat->spptr = bkok;
580   }
581 
582   PetscCall(PetscFree(mat->defaultvectype));
583   PetscCall(PetscStrallocpy(VECKOKKOS, &mat->defaultvectype)); /* Allocate and copy the string */
584   PetscCall(PetscObjectChangeTypeName((PetscObject)mat, MATSEQAIJKOKKOS));
585   PetscCall(MatSetOps_SeqAIJKokkos(mat));
586   PetscFunctionReturn(PETSC_SUCCESS);
587 }
588 
589 static PetscErrorCode MatTranspose_SeqAIJKokkos(Mat A, MatReuse reuse, Mat *B)
590 {
591   Mat               At;
592   KokkosCsrMatrix   internT;
593   Mat_SeqAIJKokkos *atkok, *bkok;
594 
595   PetscFunctionBegin;
596   if (reuse == MAT_REUSE_MATRIX) PetscCall(MatTransposeCheckNonzeroState_Private(A, *B));
597   PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &internT)); /* Generate a transpose internally */
598   if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_INPLACE_MATRIX) {
599     /* Deep copy internT, as we want to isolate the internal transpose */
600     PetscCallCXX(atkok = new Mat_SeqAIJKokkos(KokkosCsrMatrix("csrmat", internT)));
601     PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PetscObjectComm((PetscObject)A), atkok, &At));
602     if (reuse == MAT_INITIAL_MATRIX) *B = At;
603     else PetscCall(MatHeaderReplace(A, &At)); /* Replace A with At inplace */
604   } else {                                    /* MAT_REUSE_MATRIX, just need to copy values to B on device */
605     if ((*B)->assembled) {
606       bkok = static_cast<Mat_SeqAIJKokkos *>((*B)->spptr);
607       PetscCallCXX(Kokkos::deep_copy(bkok->a_dual.view_device(), internT.values));
608       PetscCall(MatSeqAIJKokkosModifyDevice(*B));
609     } else if ((*B)->preallocated) { /* It is ok for B to be only preallocated, as needed in MatTranspose_MPIAIJ */
610       Mat_SeqAIJ             *bseq = static_cast<Mat_SeqAIJ *>((*B)->data);
611       MatScalarKokkosViewHost a_h(bseq->a, internT.nnz()); /* bseq->nz = 0 if unassembled */
612       MatColIdxKokkosViewHost j_h(bseq->j, internT.nnz());
613       PetscCallCXX(Kokkos::deep_copy(a_h, internT.values));
614       PetscCallCXX(Kokkos::deep_copy(j_h, internT.graph.entries));
615     } else SETERRQ(PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "B must be assembled or preallocated");
616   }
617   PetscFunctionReturn(PETSC_SUCCESS);
618 }
619 
620 static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A)
621 {
622   Mat_SeqAIJKokkos *aijkok;
623 
624   PetscFunctionBegin;
625   if (A->factortype == MAT_FACTOR_NONE) {
626     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
627     delete aijkok;
628   } else {
629     delete static_cast<Mat_SeqAIJKokkosTriFactors *>(A->spptr);
630   }
631   A->spptr = NULL;
632   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatFactorGetSolverType_C", NULL));
633   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
634   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
635   PetscCall(MatDestroy_SeqAIJ(A));
636   PetscFunctionReturn(PETSC_SUCCESS);
637 }
638 
639 /*MC
640    MATSEQAIJKOKKOS - MATAIJKOKKOS = "(seq)aijkokkos" - A matrix type to be used for sparse matrices with Kokkos
641 
642    A matrix type type using Kokkos-Kernels CrsMatrix type for portability across different device types
643 
644    Options Database Key:
645 .  -mat_type aijkokkos - sets the matrix type to `MATSEQAIJKOKKOS` during a call to `MatSetFromOptions()`
646 
647   Level: beginner
648 
649 .seealso: [](chapter_matrices), `Mat`, `MatCreateSeqAIJKokkos()`, `MATMPIAIJKOKKOS`
650 M*/
651 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A)
652 {
653   PetscFunctionBegin;
654   PetscCall(PetscKokkosInitializeCheck());
655   PetscCall(MatCreate_SeqAIJ(A));
656   PetscCall(MatConvert_SeqAIJ_SeqAIJKokkos(A, MATSEQAIJKOKKOS, MAT_INPLACE_MATRIX, &A));
657   PetscFunctionReturn(PETSC_SUCCESS);
658 }
659 
660 /* Merge A, B into a matrix C. A is put before B. C's size would be A->rmap->n by (A->cmap->n + B->cmap->n) */
661 PetscErrorCode MatSeqAIJKokkosMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C)
662 {
663   Mat_SeqAIJ         *a, *b;
664   Mat_SeqAIJKokkos   *akok, *bkok, *ckok;
665   MatScalarKokkosView aa, ba, ca;
666   MatRowMapKokkosView ai, bi, ci;
667   MatColIdxKokkosView aj, bj, cj;
668   PetscInt            m, n, nnz, aN;
669 
670   PetscFunctionBegin;
671   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
672   PetscValidHeaderSpecific(B, MAT_CLASSID, 2);
673   PetscValidPointer(C, 4);
674   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
675   PetscCheckTypeName(B, MATSEQAIJKOKKOS);
676   PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n);
677   PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported");
678 
679   PetscCall(MatSeqAIJKokkosSyncDevice(A));
680   PetscCall(MatSeqAIJKokkosSyncDevice(B));
681   a    = static_cast<Mat_SeqAIJ *>(A->data);
682   b    = static_cast<Mat_SeqAIJ *>(B->data);
683   akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
684   bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
685   aa   = akok->a_dual.view_device();
686   ai   = akok->i_dual.view_device();
687   ba   = bkok->a_dual.view_device();
688   bi   = bkok->i_dual.view_device();
689   m    = A->rmap->n; /* M, N and nnz of C */
690   n    = A->cmap->n + B->cmap->n;
691   nnz  = a->nz + b->nz;
692   aN   = A->cmap->n; /* N of A */
693   if (reuse == MAT_INITIAL_MATRIX) {
694     aj           = akok->j_dual.view_device();
695     bj           = bkok->j_dual.view_device();
696     auto ca_dual = MatScalarKokkosDualView("a", aa.extent(0) + ba.extent(0));
697     auto ci_dual = MatRowMapKokkosDualView("i", ai.extent(0));
698     auto cj_dual = MatColIdxKokkosDualView("j", aj.extent(0) + bj.extent(0));
699     ca           = ca_dual.view_device();
700     ci           = ci_dual.view_device();
701     cj           = cj_dual.view_device();
702 
703     /* Concatenate A and B in parallel using Kokkos hierarchical parallelism */
704     Kokkos::parallel_for(
705       Kokkos::TeamPolicy<>(m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
706         PetscInt i       = t.league_rank(); /* row i */
707         PetscInt coffset = ai(i) + bi(i), alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i);
708 
709         Kokkos::single(Kokkos::PerTeam(t), [=]() { /* this side effect only happens once per whole team */
710                                                    ci(i) = coffset;
711                                                    if (i == m - 1) ci(m) = ai(m) + bi(m);
712         });
713 
714         Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) {
715           if (k < alen) {
716             ca(coffset + k) = aa(ai(i) + k);
717             cj(coffset + k) = aj(ai(i) + k);
718           } else {
719             ca(coffset + k) = ba(bi(i) + k - alen);
720             cj(coffset + k) = bj(bi(i) + k - alen) + aN; /* Entries in B get new column indices in C */
721           }
722         });
723       });
724     ca_dual.modify_device();
725     ci_dual.modify_device();
726     cj_dual.modify_device();
727     PetscCallCXX(ckok = new Mat_SeqAIJKokkos(m, n, nnz, ci_dual, cj_dual, ca_dual));
728     PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, ckok, C));
729   } else if (reuse == MAT_REUSE_MATRIX) {
730     PetscValidHeaderSpecific(*C, MAT_CLASSID, 4);
731     PetscCheckTypeName(*C, MATSEQAIJKOKKOS);
732     ckok = static_cast<Mat_SeqAIJKokkos *>((*C)->spptr);
733     ca   = ckok->a_dual.view_device();
734     ci   = ckok->i_dual.view_device();
735 
736     Kokkos::parallel_for(
737       Kokkos::TeamPolicy<>(m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
738         PetscInt i    = t.league_rank(); /* row i */
739         PetscInt alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i);
740         Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) {
741           if (k < alen) ca(ci(i) + k) = aa(ai(i) + k);
742           else ca(ci(i) + k) = ba(bi(i) + k - alen);
743         });
744       });
745     PetscCall(MatSeqAIJKokkosModifyDevice(*C));
746   }
747   PetscFunctionReturn(PETSC_SUCCESS);
748 }
749 
750 static PetscErrorCode MatProductDataDestroy_SeqAIJKokkos(void *pdata)
751 {
752   PetscFunctionBegin;
753   delete static_cast<MatProductData_SeqAIJKokkos *>(pdata);
754   PetscFunctionReturn(PETSC_SUCCESS);
755 }
756 
757 static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C)
758 {
759   Mat_Product                 *product = C->product;
760   Mat                          A, B;
761   bool                         transA, transB; /* use bool, since KK needs this type */
762   Mat_SeqAIJKokkos            *akok, *bkok, *ckok;
763   Mat_SeqAIJ                  *c;
764   MatProductData_SeqAIJKokkos *pdata;
765   KokkosCsrMatrix              csrmatA, csrmatB;
766 
767   PetscFunctionBegin;
768   MatCheckProduct(C, 1);
769   PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Product data empty");
770   pdata = static_cast<MatProductData_SeqAIJKokkos *>(C->product->data);
771 
772   // See if numeric has already been done in symbolic (e.g., user calls MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C)).
773   // If yes, skip the numeric, but reset the flag so that next time when user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C),
774   // we still do numeric.
775   if (pdata->reusesym) { // numeric reuses results from symbolic
776     pdata->reusesym = PETSC_FALSE;
777     PetscFunctionReturn(PETSC_SUCCESS);
778   }
779 
780   switch (product->type) {
781   case MATPRODUCT_AB:
782     transA = false;
783     transB = false;
784     break;
785   case MATPRODUCT_AtB:
786     transA = true;
787     transB = false;
788     break;
789   case MATPRODUCT_ABt:
790     transA = false;
791     transB = true;
792     break;
793   default:
794     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]);
795   }
796 
797   A = product->A;
798   B = product->B;
799   PetscCall(MatSeqAIJKokkosSyncDevice(A));
800   PetscCall(MatSeqAIJKokkosSyncDevice(B));
801   akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
802   bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
803   ckok = static_cast<Mat_SeqAIJKokkos *>(C->spptr);
804 
805   PetscCheck(ckok, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Device data structure spptr is empty");
806 
807   csrmatA = akok->csrmat;
808   csrmatB = bkok->csrmat;
809 
810   /* TODO: Once KK spgemm implements transpose, we can get rid of the explicit transpose here */
811   if (transA) {
812     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA));
813     transA = false;
814   }
815 
816   if (transB) {
817     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB));
818     transB = false;
819   }
820   PetscCall(PetscLogGpuTimeBegin());
821   PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, csrmatA, transA, csrmatB, transB, ckok->csrmat));
822 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(4, 0, 0)
823   auto spgemmHandle = pdata->kh.get_spgemm_handle();
824   if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(ckok->csrmat)); /* without sort, mat_tests-ex62_14_seqaijkokkos fails */
825 #endif
826 
827   PetscCall(PetscLogGpuTimeEnd());
828   PetscCall(MatSeqAIJKokkosModifyDevice(C));
829   /* shorter version of MatAssemblyEnd_SeqAIJ */
830   c = (Mat_SeqAIJ *)C->data;
831   PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz));
832   PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n"));
833   PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax));
834   c->reallocs         = 0;
835   C->info.mallocs     = 0;
836   C->info.nz_unneeded = 0;
837   C->assembled = C->was_assembled = PETSC_TRUE;
838   C->num_ass++;
839   PetscFunctionReturn(PETSC_SUCCESS);
840 }
841 
842 static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C)
843 {
844   Mat_Product                 *product = C->product;
845   MatProductType               ptype;
846   Mat                          A, B;
847   bool                         transA, transB;
848   Mat_SeqAIJKokkos            *akok, *bkok, *ckok;
849   MatProductData_SeqAIJKokkos *pdata;
850   MPI_Comm                     comm;
851   KokkosCsrMatrix              csrmatA, csrmatB, csrmatC;
852 
853   PetscFunctionBegin;
854   MatCheckProduct(C, 1);
855   PetscCall(PetscObjectGetComm((PetscObject)C, &comm));
856   PetscCheck(!product->data, comm, PETSC_ERR_PLIB, "Product data not empty");
857   A = product->A;
858   B = product->B;
859   PetscCall(MatSeqAIJKokkosSyncDevice(A));
860   PetscCall(MatSeqAIJKokkosSyncDevice(B));
861   akok    = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
862   bkok    = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
863   csrmatA = akok->csrmat;
864   csrmatB = bkok->csrmat;
865 
866   ptype = product->type;
867   // Take advantage of the symmetry if true
868   if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
869     ptype                                          = MATPRODUCT_AB;
870     product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE;
871   }
872   if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
873     ptype                                          = MATPRODUCT_AB;
874     product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE;
875   }
876 
877   switch (ptype) {
878   case MATPRODUCT_AB:
879     transA = false;
880     transB = false;
881     break;
882   case MATPRODUCT_AtB:
883     transA = true;
884     transB = false;
885     break;
886   case MATPRODUCT_ABt:
887     transA = false;
888     transB = true;
889     break;
890   default:
891     SETERRQ(comm, PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]);
892   }
893   PetscCallCXX(product->data = pdata = new MatProductData_SeqAIJKokkos());
894   pdata->reusesym = product->api_user;
895 
896   /* TODO: add command line options to select spgemm algorithms */
897   auto spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_DEFAULT; /* default alg is TPL if enabled, otherwise KK */
898 
899   /* CUDA-10.2's spgemm has bugs. We prefer the SpGEMMreuse APIs introduced in cuda-11.4 */
900 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
901   #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
902   spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_KK;
903   #endif
904 #endif
905   PetscCallCXX(pdata->kh.create_spgemm_handle(spgemm_alg));
906 
907   PetscCall(PetscLogGpuTimeBegin());
908   /* TODO: Get rid of the explicit transpose once KK-spgemm implements the transpose option */
909   if (transA) {
910     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA));
911     transA = false;
912   }
913 
914   if (transB) {
915     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB));
916     transB = false;
917   }
918 
919   PetscCallCXX(KokkosSparse::spgemm_symbolic(pdata->kh, csrmatA, transA, csrmatB, transB, csrmatC));
920   /* spgemm_symbolic() only populates C's rowmap, but not C's column indices.
921     So we have to do a fake spgemm_numeric() here to get csrmatC.j_d setup, before
922     calling new Mat_SeqAIJKokkos().
923     TODO: Remove the fake spgemm_numeric() after KK fixed this problem.
924   */
925   PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, csrmatA, transA, csrmatB, transB, csrmatC));
926 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(4, 0, 0)
927   /* Query if KK outputs a sorted matrix. If not, we need to sort it */
928   auto spgemmHandle = pdata->kh.get_spgemm_handle();
929   if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(csrmatC)); /* sort_option defaults to -1 in KK!*/
930 #endif
931   PetscCall(PetscLogGpuTimeEnd());
932 
933   PetscCallCXX(ckok = new Mat_SeqAIJKokkos(csrmatC));
934   PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(C, ckok));
935   C->product->destroy = MatProductDataDestroy_SeqAIJKokkos;
936   PetscFunctionReturn(PETSC_SUCCESS);
937 }
938 
939 /* handles sparse matrix matrix ops */
940 static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat)
941 {
942   Mat_Product *product = mat->product;
943   PetscBool    Biskok = PETSC_FALSE, Ciskok = PETSC_TRUE;
944 
945   PetscFunctionBegin;
946   MatCheckProduct(mat, 1);
947   PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJKOKKOS, &Biskok));
948   if (product->type == MATPRODUCT_ABC) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJKOKKOS, &Ciskok));
949   if (Biskok && Ciskok) {
950     switch (product->type) {
951     case MATPRODUCT_AB:
952     case MATPRODUCT_AtB:
953     case MATPRODUCT_ABt:
954       mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos;
955       break;
956     case MATPRODUCT_PtAP:
957     case MATPRODUCT_RARt:
958     case MATPRODUCT_ABC:
959       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
960       break;
961     default:
962       break;
963     }
964   } else { /* fallback for AIJ */
965     PetscCall(MatProductSetFromOptions_SeqAIJ(mat));
966   }
967   PetscFunctionReturn(PETSC_SUCCESS);
968 }
969 
970 static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a)
971 {
972   Mat_SeqAIJKokkos *aijkok;
973 
974   PetscFunctionBegin;
975   PetscCall(PetscLogGpuTimeBegin());
976   PetscCall(MatSeqAIJKokkosSyncDevice(A));
977   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
978   KokkosBlas::scal(aijkok->a_dual.view_device(), a, aijkok->a_dual.view_device());
979   PetscCall(MatSeqAIJKokkosModifyDevice(A));
980   PetscCall(PetscLogGpuFlops(aijkok->a_dual.extent(0)));
981   PetscCall(PetscLogGpuTimeEnd());
982   PetscFunctionReturn(PETSC_SUCCESS);
983 }
984 
985 static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A)
986 {
987   Mat_SeqAIJKokkos *aijkok;
988 
989   PetscFunctionBegin;
990   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
991   if (aijkok) { /* Only zero the device if data is already there */
992     KokkosBlas::fill(aijkok->a_dual.view_device(), 0.0);
993     PetscCall(MatSeqAIJKokkosModifyDevice(A));
994   } else { /* Might be preallocated but not assembled */
995     PetscCall(MatZeroEntries_SeqAIJ(A));
996   }
997   PetscFunctionReturn(PETSC_SUCCESS);
998 }
999 
1000 static PetscErrorCode MatGetDiagonal_SeqAIJKokkos(Mat A, Vec x)
1001 {
1002   Mat_SeqAIJ           *aijseq;
1003   Mat_SeqAIJKokkos     *aijkok;
1004   PetscInt              n;
1005   PetscScalarKokkosView xv;
1006 
1007   PetscFunctionBegin;
1008   PetscCall(VecGetLocalSize(x, &n));
1009   PetscCheck(n == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector");
1010   PetscCheck(A->factortype == MAT_FACTOR_NONE, PETSC_COMM_SELF, PETSC_ERR_SUP, "MatGetDiagonal_SeqAIJKokkos not supported on factored matrices");
1011 
1012   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1013   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1014 
1015   if (A->rmap->n && aijkok->diag_dual.extent(0) == 0) { /* Set the diagonal pointer if not already */
1016     PetscCall(MatMarkDiagonal_SeqAIJ(A));
1017     aijseq = static_cast<Mat_SeqAIJ *>(A->data);
1018     aijkok->SetDiagonal(aijseq->diag);
1019   }
1020 
1021   const auto &Aa    = aijkok->a_dual.view_device();
1022   const auto &Ai    = aijkok->i_dual.view_device();
1023   const auto &Adiag = aijkok->diag_dual.view_device();
1024 
1025   PetscCall(VecGetKokkosViewWrite(x, &xv));
1026   Kokkos::parallel_for(
1027     n, KOKKOS_LAMBDA(const PetscInt i) {
1028       if (Adiag(i) < Ai(i + 1)) xv(i) = Aa(Adiag(i));
1029       else xv(i) = 0;
1030     });
1031   PetscCall(VecRestoreKokkosViewWrite(x, &xv));
1032   PetscFunctionReturn(PETSC_SUCCESS);
1033 }
1034 
1035 /* Get a Kokkos View from a mat of type MatSeqAIJKokkos */
1036 PetscErrorCode MatSeqAIJGetKokkosView(Mat A, ConstMatScalarKokkosView *kv)
1037 {
1038   Mat_SeqAIJKokkos *aijkok;
1039 
1040   PetscFunctionBegin;
1041   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1042   PetscValidPointer(kv, 2);
1043   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1044   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1045   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1046   *kv    = aijkok->a_dual.view_device();
1047   PetscFunctionReturn(PETSC_SUCCESS);
1048 }
1049 
1050 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, ConstMatScalarKokkosView *kv)
1051 {
1052   PetscFunctionBegin;
1053   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1054   PetscValidPointer(kv, 2);
1055   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1056   PetscFunctionReturn(PETSC_SUCCESS);
1057 }
1058 
1059 PetscErrorCode MatSeqAIJGetKokkosView(Mat A, MatScalarKokkosView *kv)
1060 {
1061   Mat_SeqAIJKokkos *aijkok;
1062 
1063   PetscFunctionBegin;
1064   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1065   PetscValidPointer(kv, 2);
1066   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1067   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1068   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1069   *kv    = aijkok->a_dual.view_device();
1070   PetscFunctionReturn(PETSC_SUCCESS);
1071 }
1072 
1073 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, MatScalarKokkosView *kv)
1074 {
1075   PetscFunctionBegin;
1076   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1077   PetscValidPointer(kv, 2);
1078   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1079   PetscCall(MatSeqAIJKokkosModifyDevice(A));
1080   PetscFunctionReturn(PETSC_SUCCESS);
1081 }
1082 
1083 PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A, MatScalarKokkosView *kv)
1084 {
1085   Mat_SeqAIJKokkos *aijkok;
1086 
1087   PetscFunctionBegin;
1088   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1089   PetscValidPointer(kv, 2);
1090   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1091   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1092   *kv    = aijkok->a_dual.view_device();
1093   PetscFunctionReturn(PETSC_SUCCESS);
1094 }
1095 
1096 PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A, MatScalarKokkosView *kv)
1097 {
1098   PetscFunctionBegin;
1099   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1100   PetscValidPointer(kv, 2);
1101   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1102   PetscCall(MatSeqAIJKokkosModifyDevice(A));
1103   PetscFunctionReturn(PETSC_SUCCESS);
1104 }
1105 
1106 /* Computes Y += alpha X */
1107 static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y, PetscScalar alpha, Mat X, MatStructure pattern)
1108 {
1109   Mat_SeqAIJ              *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data;
1110   Mat_SeqAIJKokkos        *xkok, *ykok, *zkok;
1111   ConstMatScalarKokkosView Xa;
1112   MatScalarKokkosView      Ya;
1113 
1114   PetscFunctionBegin;
1115   PetscCheckTypeName(Y, MATSEQAIJKOKKOS);
1116   PetscCheckTypeName(X, MATSEQAIJKOKKOS);
1117   PetscCall(MatSeqAIJKokkosSyncDevice(Y));
1118   PetscCall(MatSeqAIJKokkosSyncDevice(X));
1119   PetscCall(PetscLogGpuTimeBegin());
1120 
1121   if (pattern != SAME_NONZERO_PATTERN && x->nz == y->nz) {
1122     PetscBool e;
1123     PetscCall(PetscArraycmp(x->i, y->i, Y->rmap->n + 1, &e));
1124     if (e) {
1125       PetscCall(PetscArraycmp(x->j, y->j, y->nz, &e));
1126       if (e) pattern = SAME_NONZERO_PATTERN;
1127     }
1128   }
1129 
1130   /* cusparseDcsrgeam2() computes C = alpha A + beta B. If one knew sparsity pattern of C, one can skip
1131     cusparseScsrgeam2_bufferSizeExt() / cusparseXcsrgeam2Nnz(), and directly call cusparseScsrgeam2().
1132     If X is SUBSET_NONZERO_PATTERN of Y, we could take advantage of this cusparse feature. However,
1133     KokkosSparse::spadd(alpha,A,beta,B,C) has symbolic and numeric phases, MatAXPY does not.
1134   */
1135   ykok = static_cast<Mat_SeqAIJKokkos *>(Y->spptr);
1136   xkok = static_cast<Mat_SeqAIJKokkos *>(X->spptr);
1137   Xa   = xkok->a_dual.view_device();
1138   Ya   = ykok->a_dual.view_device();
1139 
1140   if (pattern == SAME_NONZERO_PATTERN) {
1141     KokkosBlas::axpy(alpha, Xa, Ya);
1142     PetscCall(MatSeqAIJKokkosModifyDevice(Y));
1143   } else if (pattern == SUBSET_NONZERO_PATTERN) {
1144     MatRowMapKokkosView Xi = xkok->i_dual.view_device(), Yi = ykok->i_dual.view_device();
1145     MatColIdxKokkosView Xj = xkok->j_dual.view_device(), Yj = ykok->j_dual.view_device();
1146 
1147     Kokkos::parallel_for(
1148       Kokkos::TeamPolicy<>(Y->rmap->n, 1), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
1149         PetscInt i = t.league_rank(); // row i
1150         Kokkos::single(Kokkos::PerTeam(t), [=]() {
1151           // Only one thread works in a team
1152           PetscInt p, q = Yi(i);
1153           for (p = Xi(i); p < Xi(i + 1); p++) {          // For each nonzero on row i of X,
1154             while (Xj(p) != Yj(q) && q < Yi(i + 1)) q++; // find the matching nonzero on row i of Y.
1155             if (Xj(p) == Yj(q)) {                        // Found it
1156               Ya(q) += alpha * Xa(p);
1157               q++;
1158             } else {
1159             // If not found, it indicates the input is wrong (X is not a SUBSET_NONZERO_PATTERN of Y).
1160             // Just insert a NaN at the beginning of row i if it is not empty, to make the result wrong.
1161 #if PETSC_PKG_KOKKOS_VERSION_GE(3, 7, 0)
1162               if (Yi(i) != Yi(i + 1)) Ya(Yi(i)) = Kokkos::ArithTraits<PetscScalar>::nan();
1163 #else
1164               if (Yi(i) != Yi(i + 1)) Ya(Yi(i)) = Kokkos::Experimental::nan("1");
1165 #endif
1166             }
1167           }
1168         });
1169       });
1170     PetscCall(MatSeqAIJKokkosModifyDevice(Y));
1171   } else { // different nonzero patterns
1172     Mat             Z;
1173     KokkosCsrMatrix zcsr;
1174     KernelHandle    kh;
1175     kh.create_spadd_handle(true); // X, Y are sorted
1176     KokkosSparse::spadd_symbolic(&kh, xkok->csrmat, ykok->csrmat, zcsr);
1177     KokkosSparse::spadd_numeric(&kh, alpha, xkok->csrmat, (PetscScalar)1.0, ykok->csrmat, zcsr);
1178     zkok = new Mat_SeqAIJKokkos(zcsr);
1179     PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, zkok, &Z));
1180     PetscCall(MatHeaderReplace(Y, &Z));
1181     kh.destroy_spadd_handle();
1182   }
1183   PetscCall(PetscLogGpuTimeEnd());
1184   PetscCall(PetscLogGpuFlops(xkok->a_dual.extent(0) * 2)); // Because we scaled X and then added it to Y
1185   PetscFunctionReturn(PETSC_SUCCESS);
1186 }
1187 
1188 static PetscErrorCode MatSetPreallocationCOO_SeqAIJKokkos(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
1189 {
1190   Mat_SeqAIJKokkos *akok;
1191   Mat_SeqAIJ       *aseq;
1192 
1193   PetscFunctionBegin;
1194   PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, coo_i, coo_j));
1195   aseq = static_cast<Mat_SeqAIJ *>(mat->data);
1196   akok = static_cast<Mat_SeqAIJKokkos *>(mat->spptr);
1197   delete akok;
1198   mat->spptr = akok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, aseq->nz, aseq->i, aseq->j, aseq->a, mat->nonzerostate + 1, PETSC_FALSE);
1199   PetscCall(MatZeroEntries_SeqAIJKokkos(mat));
1200   akok->SetUpCOO(aseq);
1201   PetscFunctionReturn(PETSC_SUCCESS);
1202 }
1203 
1204 static PetscErrorCode MatSetValuesCOO_SeqAIJKokkos(Mat A, const PetscScalar v[], InsertMode imode)
1205 {
1206   Mat_SeqAIJ                 *aseq = static_cast<Mat_SeqAIJ *>(A->data);
1207   Mat_SeqAIJKokkos           *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1208   PetscCount                  Annz = aseq->nz;
1209   const PetscCountKokkosView &jmap = akok->jmap_d;
1210   const PetscCountKokkosView &perm = akok->perm_d;
1211   MatScalarKokkosView         Aa;
1212   ConstMatScalarKokkosView    kv;
1213   PetscMemType                memtype;
1214 
1215   PetscFunctionBegin;
1216   PetscCall(PetscGetMemType(v, &memtype));
1217   if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */
1218     kv = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), ConstMatScalarKokkosViewHost(v, aseq->coo_n));
1219   } else {
1220     kv = ConstMatScalarKokkosView(v, aseq->coo_n); /* Directly use v[]'s memory */
1221   }
1222 
1223   if (imode == INSERT_VALUES) PetscCall(MatSeqAIJGetKokkosViewWrite(A, &Aa)); /* write matrix values */
1224   else PetscCall(MatSeqAIJGetKokkosView(A, &Aa));                             /* read & write matrix values */
1225 
1226   Kokkos::parallel_for(
1227     Annz, KOKKOS_LAMBDA(const PetscCount i) {
1228       PetscScalar sum = 0.0;
1229       for (PetscCount k = jmap(i); k < jmap(i + 1); k++) sum += kv(perm(k));
1230       Aa(i) = (imode == INSERT_VALUES ? 0.0 : Aa(i)) + sum;
1231     });
1232 
1233   if (imode == INSERT_VALUES) PetscCall(MatSeqAIJRestoreKokkosViewWrite(A, &Aa));
1234   else PetscCall(MatSeqAIJRestoreKokkosView(A, &Aa));
1235   PetscFunctionReturn(PETSC_SUCCESS);
1236 }
1237 
1238 PETSC_INTERN PetscErrorCode MatSeqAIJMoveDiagonalValuesFront_SeqAIJKokkos(Mat A, const PetscInt *diag)
1239 {
1240   Mat_SeqAIJKokkos          *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1241   MatScalarKokkosView        Aa;
1242   const MatRowMapKokkosView &Ai = akok->i_dual.view_device();
1243   PetscInt                   m  = A->rmap->n;
1244   ConstMatRowMapKokkosView   Adiag(diag, m); /* diag is a device pointer */
1245 
1246   PetscFunctionBegin;
1247   PetscCall(MatSeqAIJGetKokkosViewWrite(A, &Aa));
1248   Kokkos::parallel_for(
1249     m, KOKKOS_LAMBDA(const PetscInt i) {
1250       PetscScalar tmp;
1251       if (Adiag(i) >= Ai(i) && Adiag(i) < Ai(i + 1)) { /* The diagonal element exists */
1252         tmp          = Aa(Ai(i));
1253         Aa(Ai(i))    = Aa(Adiag(i));
1254         Aa(Adiag(i)) = tmp;
1255       }
1256     });
1257   PetscCall(MatSeqAIJRestoreKokkosViewWrite(A, &Aa));
1258   PetscFunctionReturn(PETSC_SUCCESS);
1259 }
1260 
1261 static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info)
1262 {
1263   PetscFunctionBegin;
1264   PetscCall(MatSeqAIJKokkosSyncHost(A));
1265   PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info));
1266   B->offloadmask = PETSC_OFFLOAD_CPU;
1267   PetscFunctionReturn(PETSC_SUCCESS);
1268 }
1269 
1270 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A)
1271 {
1272   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
1273 
1274   PetscFunctionBegin;
1275   A->offloadmask = PETSC_OFFLOAD_KOKKOS; /* We do not really use this flag */
1276   A->boundtocpu  = PETSC_FALSE;
1277 
1278   A->ops->assemblyend               = MatAssemblyEnd_SeqAIJKokkos;
1279   A->ops->destroy                   = MatDestroy_SeqAIJKokkos;
1280   A->ops->duplicate                 = MatDuplicate_SeqAIJKokkos;
1281   A->ops->axpy                      = MatAXPY_SeqAIJKokkos;
1282   A->ops->scale                     = MatScale_SeqAIJKokkos;
1283   A->ops->zeroentries               = MatZeroEntries_SeqAIJKokkos;
1284   A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJKokkos;
1285   A->ops->mult                      = MatMult_SeqAIJKokkos;
1286   A->ops->multadd                   = MatMultAdd_SeqAIJKokkos;
1287   A->ops->multtranspose             = MatMultTranspose_SeqAIJKokkos;
1288   A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJKokkos;
1289   A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJKokkos;
1290   A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos;
1291   A->ops->productnumeric            = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos;
1292   A->ops->transpose                 = MatTranspose_SeqAIJKokkos;
1293   A->ops->setoption                 = MatSetOption_SeqAIJKokkos;
1294   A->ops->getdiagonal               = MatGetDiagonal_SeqAIJKokkos;
1295   a->ops->getarray                  = MatSeqAIJGetArray_SeqAIJKokkos;
1296   a->ops->restorearray              = MatSeqAIJRestoreArray_SeqAIJKokkos;
1297   a->ops->getarrayread              = MatSeqAIJGetArrayRead_SeqAIJKokkos;
1298   a->ops->restorearrayread          = MatSeqAIJRestoreArrayRead_SeqAIJKokkos;
1299   a->ops->getarraywrite             = MatSeqAIJGetArrayWrite_SeqAIJKokkos;
1300   a->ops->restorearraywrite         = MatSeqAIJRestoreArrayWrite_SeqAIJKokkos;
1301   a->ops->getcsrandmemtype          = MatSeqAIJGetCSRAndMemType_SeqAIJKokkos;
1302 
1303   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJKokkos));
1304   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJKokkos));
1305   PetscFunctionReturn(PETSC_SUCCESS);
1306 }
1307 
1308 PETSC_INTERN PetscErrorCode MatSetSeqAIJKokkosWithCSRMatrix(Mat A, Mat_SeqAIJKokkos *akok)
1309 {
1310   Mat_SeqAIJ *aseq;
1311   PetscInt    i, m, n;
1312 
1313   PetscFunctionBegin;
1314   PetscCheck(!A->spptr, PETSC_COMM_SELF, PETSC_ERR_PLIB, "A->spptr is supposed to be empty");
1315 
1316   m = akok->nrows();
1317   n = akok->ncols();
1318   PetscCall(MatSetSizes(A, m, n, m, n));
1319   PetscCall(MatSetType(A, MATSEQAIJKOKKOS));
1320 
1321   /* Set up data structures of A as a MATSEQAIJ */
1322   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(A, MAT_SKIP_ALLOCATION, NULL));
1323   aseq = (Mat_SeqAIJ *)(A)->data;
1324 
1325   akok->i_dual.sync_host(); /* We always need sync'ed i, j on host */
1326   akok->j_dual.sync_host();
1327 
1328   aseq->i            = akok->i_host_data();
1329   aseq->j            = akok->j_host_data();
1330   aseq->a            = akok->a_host_data();
1331   aseq->nonew        = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/
1332   aseq->singlemalloc = PETSC_FALSE;
1333   aseq->free_a       = PETSC_FALSE;
1334   aseq->free_ij      = PETSC_FALSE;
1335   aseq->nz           = akok->nnz();
1336   aseq->maxnz        = aseq->nz;
1337 
1338   PetscCall(PetscMalloc1(m, &aseq->imax));
1339   PetscCall(PetscMalloc1(m, &aseq->ilen));
1340   for (i = 0; i < m; i++) aseq->ilen[i] = aseq->imax[i] = aseq->i[i + 1] - aseq->i[i];
1341 
1342   /* It is critical to set the nonzerostate, as we use it to check if sparsity pattern (hence data) has changed on host in MatAssemblyEnd */
1343   akok->nonzerostate = A->nonzerostate;
1344   A->spptr           = akok; /* Set A->spptr before MatAssembly so that A->spptr won't be allocated again there */
1345   PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY));
1346   PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY));
1347   PetscFunctionReturn(PETSC_SUCCESS);
1348 }
1349 
1350 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGetKokkosCsrMatrix(Mat A, KokkosCsrMatrix *csr)
1351 {
1352   PetscFunctionBegin;
1353   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1354   *csr = static_cast<Mat_SeqAIJKokkos *>(A->spptr)->csrmat;
1355   PetscFunctionReturn(PETSC_SUCCESS);
1356 }
1357 
1358 PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithKokkosCsrMatrix(MPI_Comm comm, KokkosCsrMatrix csr, Mat *A)
1359 {
1360   Mat_SeqAIJKokkos *akok;
1361   PetscFunctionBegin;
1362   PetscCallCXX(akok = new Mat_SeqAIJKokkos(csr));
1363   PetscCall(MatCreate(comm, A));
1364   PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok));
1365   PetscFunctionReturn(PETSC_SUCCESS);
1366 }
1367 
1368 /* Crete a SEQAIJKOKKOS matrix with a Mat_SeqAIJKokkos data structure
1369 
1370    Note we have names like MatSeqAIJSetPreallocationCSR, so I use capitalized CSR
1371  */
1372 PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithCSRMatrix(MPI_Comm comm, Mat_SeqAIJKokkos *akok, Mat *A)
1373 {
1374   PetscFunctionBegin;
1375   PetscCall(MatCreate(comm, A));
1376   PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok));
1377   PetscFunctionReturn(PETSC_SUCCESS);
1378 }
1379 
1380 /*@C
1381    MatCreateSeqAIJKokkos - Creates a sparse matrix in `MATSEQAIJKOKKOS` (compressed row) format
1382    (the default parallel PETSc format). This matrix will ultimately be handled by
1383    Kokkos for calculations.
1384 
1385    Collective
1386 
1387    Input Parameters:
1388 +  comm - MPI communicator, set to `PETSC_COMM_SELF`
1389 .  m - number of rows
1390 .  n - number of columns
1391 .  nz - number of nonzeros per row (same for all rows), ignored if `nnz` is provided
1392 -  nnz - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL`
1393 
1394    Output Parameter:
1395 .  A - the matrix
1396 
1397    Level: intermediate
1398 
1399    Notes:
1400    It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
1401    MatXXXXSetPreallocation() paradgm instead of this routine directly.
1402    [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]
1403 
1404    The AIJ format, also called
1405    compressed row storage, is fully compatible with standard Fortran
1406    storage.  That is, the stored row and column indices can begin at
1407    either one (as in Fortran) or zero.
1408 
1409    Specify the preallocated storage with either `nz` or `nnz` (not both).
1410    Set `nz` = `PETSC_DEFAULT` and `nnz` = `NULL` for PETSc to control dynamic memory
1411    allocation.
1412 
1413 .seealso: [](chapter_matrices), `Mat`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MatCreateAIJ()`
1414 @*/
1415 PetscErrorCode MatCreateSeqAIJKokkos(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A)
1416 {
1417   PetscFunctionBegin;
1418   PetscCall(PetscKokkosInitializeCheck());
1419   PetscCall(MatCreate(comm, A));
1420   PetscCall(MatSetSizes(*A, m, n, m, n));
1421   PetscCall(MatSetType(*A, MATSEQAIJKOKKOS));
1422   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz));
1423   PetscFunctionReturn(PETSC_SUCCESS);
1424 }
1425 
1426 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
1427 {
1428   PetscFunctionBegin;
1429   PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
1430   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos;
1431   PetscFunctionReturn(PETSC_SUCCESS);
1432 }
1433 
1434 static PetscErrorCode MatSeqAIJKokkosSymbolicSolveCheck(Mat A)
1435 {
1436   Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1437 
1438   PetscFunctionBegin;
1439   if (!factors->sptrsv_symbolic_completed) {
1440     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khU, factors->iU_d, factors->jU_d, factors->aU_d);
1441     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khL, factors->iL_d, factors->jL_d, factors->aL_d);
1442     factors->sptrsv_symbolic_completed = PETSC_TRUE;
1443   }
1444   PetscFunctionReturn(PETSC_SUCCESS);
1445 }
1446 
1447 /* Check if we need to update factors etc for transpose solve */
1448 static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A)
1449 {
1450   Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1451   MatColIdxType               n       = A->rmap->n;
1452 
1453   PetscFunctionBegin;
1454   if (!factors->transpose_updated) { /* TODO: KK needs to provide functions to do numeric transpose only */
1455     /* Update L^T and do sptrsv symbolic */
1456     factors->iLt_d = MatRowMapKokkosView("factors->iLt_d", n + 1);
1457     Kokkos::deep_copy(factors->iLt_d, 0); /* KK requires 0 */
1458     factors->jLt_d = MatColIdxKokkosView("factors->jLt_d", factors->jL_d.extent(0));
1459     factors->aLt_d = MatScalarKokkosView("factors->aLt_d", factors->aL_d.extent(0));
1460 
1461     transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iL_d, factors->jL_d, factors->aL_d,
1462                                                                                                                                                                                                               factors->iLt_d, factors->jLt_d, factors->aLt_d);
1463 
1464     /* TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices.
1465       We have to sort the indices, until KK provides finer control options.
1466     */
1467     sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iLt_d, factors->jLt_d, factors->aLt_d);
1468 
1469     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d);
1470 
1471     /* Update U^T and do sptrsv symbolic */
1472     factors->iUt_d = MatRowMapKokkosView("factors->iUt_d", n + 1);
1473     Kokkos::deep_copy(factors->iUt_d, 0); /* KK requires 0 */
1474     factors->jUt_d = MatColIdxKokkosView("factors->jUt_d", factors->jU_d.extent(0));
1475     factors->aUt_d = MatScalarKokkosView("factors->aUt_d", factors->aU_d.extent(0));
1476 
1477     transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iU_d, factors->jU_d, factors->aU_d,
1478                                                                                                                                                                                                               factors->iUt_d, factors->jUt_d, factors->aUt_d);
1479 
1480     /* Sort indices. See comments above */
1481     sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iUt_d, factors->jUt_d, factors->aUt_d);
1482 
1483     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d);
1484     factors->transpose_updated = PETSC_TRUE;
1485   }
1486   PetscFunctionReturn(PETSC_SUCCESS);
1487 }
1488 
1489 /* Solve Ax = b, with A = LU */
1490 static PetscErrorCode MatSolve_SeqAIJKokkos(Mat A, Vec b, Vec x)
1491 {
1492   ConstPetscScalarKokkosView  bv;
1493   PetscScalarKokkosView       xv;
1494   Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1495 
1496   PetscFunctionBegin;
1497   PetscCall(PetscLogGpuTimeBegin());
1498   PetscCall(MatSeqAIJKokkosSymbolicSolveCheck(A));
1499   PetscCall(VecGetKokkosView(b, &bv));
1500   PetscCall(VecGetKokkosViewWrite(x, &xv));
1501   /* Solve L tmpv = b */
1502   PetscCallCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khL, factors->iL_d, factors->jL_d, factors->aL_d, bv, factors->workVector));
1503   /* Solve Ux = tmpv */
1504   PetscCallCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khU, factors->iU_d, factors->jU_d, factors->aU_d, factors->workVector, xv));
1505   PetscCall(VecRestoreKokkosView(b, &bv));
1506   PetscCall(VecRestoreKokkosViewWrite(x, &xv));
1507   PetscCall(PetscLogGpuTimeEnd());
1508   PetscFunctionReturn(PETSC_SUCCESS);
1509 }
1510 
1511 /* Solve A^T x = b, where A^T = U^T L^T */
1512 static PetscErrorCode MatSolveTranspose_SeqAIJKokkos(Mat A, Vec b, Vec x)
1513 {
1514   ConstPetscScalarKokkosView  bv;
1515   PetscScalarKokkosView       xv;
1516   Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1517 
1518   PetscFunctionBegin;
1519   PetscCall(PetscLogGpuTimeBegin());
1520   PetscCall(MatSeqAIJKokkosTransposeSolveCheck(A));
1521   PetscCall(VecGetKokkosView(b, &bv));
1522   PetscCall(VecGetKokkosViewWrite(x, &xv));
1523   /* Solve U^T tmpv = b */
1524   KokkosSparse::Experimental::sptrsv_solve(&factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d, bv, factors->workVector);
1525 
1526   /* Solve L^T x = tmpv */
1527   KokkosSparse::Experimental::sptrsv_solve(&factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d, factors->workVector, xv);
1528   PetscCall(VecRestoreKokkosView(b, &bv));
1529   PetscCall(VecRestoreKokkosViewWrite(x, &xv));
1530   PetscCall(PetscLogGpuTimeEnd());
1531   PetscFunctionReturn(PETSC_SUCCESS);
1532 }
1533 
1534 static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info)
1535 {
1536   Mat_SeqAIJKokkos           *aijkok   = (Mat_SeqAIJKokkos *)A->spptr;
1537   Mat_SeqAIJKokkosTriFactors *factors  = (Mat_SeqAIJKokkosTriFactors *)B->spptr;
1538   PetscInt                    fill_lev = info->levels;
1539 
1540   PetscFunctionBegin;
1541   PetscCall(PetscLogGpuTimeBegin());
1542   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1543 
1544   auto a_d = aijkok->a_dual.view_device();
1545   auto i_d = aijkok->i_dual.view_device();
1546   auto j_d = aijkok->j_dual.view_device();
1547 
1548   KokkosSparse::Experimental::spiluk_numeric(&factors->kh, fill_lev, i_d, j_d, a_d, factors->iL_d, factors->jL_d, factors->aL_d, factors->iU_d, factors->jU_d, factors->aU_d);
1549 
1550   B->assembled              = PETSC_TRUE;
1551   B->preallocated           = PETSC_TRUE;
1552   B->ops->solve             = MatSolve_SeqAIJKokkos;
1553   B->ops->solvetranspose    = MatSolveTranspose_SeqAIJKokkos;
1554   B->ops->matsolve          = NULL;
1555   B->ops->matsolvetranspose = NULL;
1556   B->offloadmask            = PETSC_OFFLOAD_GPU;
1557 
1558   /* Once the factors' value changed, we need to update their transpose and sptrsv handle */
1559   factors->transpose_updated         = PETSC_FALSE;
1560   factors->sptrsv_symbolic_completed = PETSC_FALSE;
1561   /* TODO: log flops, but how to know that? */
1562   PetscCall(PetscLogGpuTimeEnd());
1563   PetscFunctionReturn(PETSC_SUCCESS);
1564 }
1565 
1566 static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
1567 {
1568   Mat_SeqAIJKokkos           *aijkok;
1569   Mat_SeqAIJ                 *b;
1570   Mat_SeqAIJKokkosTriFactors *factors  = (Mat_SeqAIJKokkosTriFactors *)B->spptr;
1571   PetscInt                    fill_lev = info->levels;
1572   PetscInt                    nnzA     = ((Mat_SeqAIJ *)A->data)->nz, nnzL, nnzU;
1573   PetscInt                    n        = A->rmap->n;
1574 
1575   PetscFunctionBegin;
1576   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1577   /* Rebuild factors */
1578   if (factors) {
1579     factors->Destroy();
1580   } /* Destroy the old if it exists */
1581   else {
1582     B->spptr = factors = new Mat_SeqAIJKokkosTriFactors(n);
1583   }
1584 
1585   /* Create a spiluk handle and then do symbolic factorization */
1586   nnzL = nnzU = PetscRealIntMultTruncate(info->fill, nnzA);
1587   factors->kh.create_spiluk_handle(KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1, n, nnzL, nnzU);
1588 
1589   auto spiluk_handle = factors->kh.get_spiluk_handle();
1590 
1591   Kokkos::realloc(factors->iL_d, n + 1); /* Free old arrays and realloc */
1592   Kokkos::realloc(factors->jL_d, spiluk_handle->get_nnzL());
1593   Kokkos::realloc(factors->iU_d, n + 1);
1594   Kokkos::realloc(factors->jU_d, spiluk_handle->get_nnzU());
1595 
1596   aijkok   = (Mat_SeqAIJKokkos *)A->spptr;
1597   auto i_d = aijkok->i_dual.view_device();
1598   auto j_d = aijkok->j_dual.view_device();
1599   KokkosSparse::Experimental::spiluk_symbolic(&factors->kh, fill_lev, i_d, j_d, factors->iL_d, factors->jL_d, factors->iU_d, factors->jU_d);
1600   /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */
1601 
1602   Kokkos::resize(factors->jL_d, spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */
1603   Kokkos::resize(factors->jU_d, spiluk_handle->get_nnzU());
1604   Kokkos::realloc(factors->aL_d, spiluk_handle->get_nnzL()); /* No need to retain old value */
1605   Kokkos::realloc(factors->aU_d, spiluk_handle->get_nnzU());
1606 
1607   /* TODO: add options to select sptrsv algorithms */
1608   /* Create sptrsv handles for L, U and their transpose */
1609 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
1610   auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SPTRSV_CUSPARSE;
1611 #else
1612   auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1;
1613 #endif
1614 
1615   factors->khL.create_sptrsv_handle(sptrsv_alg, n, true /* L is lower tri */);
1616   factors->khU.create_sptrsv_handle(sptrsv_alg, n, false /* U is not lower tri */);
1617   factors->khLt.create_sptrsv_handle(sptrsv_alg, n, false /* L^T is not lower tri */);
1618   factors->khUt.create_sptrsv_handle(sptrsv_alg, n, true /* U^T is lower tri */);
1619 
1620   /* Fill fields of the factor matrix B */
1621   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(B, MAT_SKIP_ALLOCATION, NULL));
1622   b     = (Mat_SeqAIJ *)B->data;
1623   b->nz = b->maxnz          = spiluk_handle->get_nnzL() + spiluk_handle->get_nnzU();
1624   B->info.fill_ratio_given  = info->fill;
1625   B->info.fill_ratio_needed = nnzA > 0 ? ((PetscReal)b->nz) / ((PetscReal)nnzA) : 1.0;
1626 
1627   B->offloadmask          = PETSC_OFFLOAD_GPU;
1628   B->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJKokkos;
1629   PetscFunctionReturn(PETSC_SUCCESS);
1630 }
1631 
1632 static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos(Mat A, MatSolverType *type)
1633 {
1634   PetscFunctionBegin;
1635   *type = MATSOLVERKOKKOS;
1636   PetscFunctionReturn(PETSC_SUCCESS);
1637 }
1638 
1639 /*MC
1640   MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices
1641   on a single GPU of type, `MATSEQAIJKOKKOS`, `MATAIJKOKKOS`.
1642 
1643   Level: beginner
1644 
1645 .seealso: [](chapter_matrices), `Mat`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJKokkos()`, `MATAIJKOKKOS`, `MatKokkosSetFormat()`, `MatKokkosStorageFormat`, `MatKokkosFormatOperation`
1646 M*/
1647 PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A, MatFactorType ftype, Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */
1648 {
1649   PetscInt n = A->rmap->n;
1650 
1651   PetscFunctionBegin;
1652   PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B));
1653   PetscCall(MatSetSizes(*B, n, n, n, n));
1654   (*B)->factortype = ftype;
1655   PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU]));
1656   PetscCall(MatSetType(*B, MATSEQAIJKOKKOS));
1657 
1658   if (ftype == MAT_FACTOR_LU) {
1659     PetscCall(MatSetBlockSizesFromMats(*B, A, A));
1660     (*B)->canuseordering        = PETSC_TRUE;
1661     (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKokkos;
1662   } else if (ftype == MAT_FACTOR_ILU) {
1663     PetscCall(MatSetBlockSizesFromMats(*B, A, A));
1664     (*B)->canuseordering         = PETSC_FALSE;
1665     (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos;
1666   } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]);
1667 
1668   PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL));
1669   PetscCall(PetscObjectComposeFunction((PetscObject)(*B), "MatFactorGetSolverType_C", MatFactorGetSolverType_SeqAIJKokkos));
1670   PetscFunctionReturn(PETSC_SUCCESS);
1671 }
1672 
1673 PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_KOKKOS(void)
1674 {
1675   PetscFunctionBegin;
1676   PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_LU, MatGetFactor_SeqAIJKokkos_Kokkos));
1677   PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_ILU, MatGetFactor_SeqAIJKokkos_Kokkos));
1678   PetscFunctionReturn(PETSC_SUCCESS);
1679 }
1680 
1681 /* Utility to print out a KokkosCsrMatrix for debugging */
1682 PETSC_INTERN PetscErrorCode PrintCsrMatrix(const KokkosCsrMatrix &csrmat)
1683 {
1684   const auto        &iv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), csrmat.graph.row_map);
1685   const auto        &jv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), csrmat.graph.entries);
1686   const auto        &av = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), csrmat.values);
1687   const PetscInt    *i  = iv.data();
1688   const PetscInt    *j  = jv.data();
1689   const PetscScalar *a  = av.data();
1690   PetscInt           m = csrmat.numRows(), n = csrmat.numCols(), nnz = csrmat.nnz();
1691 
1692   PetscFunctionBegin;
1693   PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT " x %" PetscInt_FMT " SeqAIJKokkos, with %" PetscInt_FMT " nonzeros\n", m, n, nnz));
1694   for (PetscInt k = 0; k < m; k++) {
1695     PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT ": ", k));
1696     for (PetscInt p = i[k]; p < i[k + 1]; p++) PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT "(%.1f), ", j[p], (double)PetscRealPart(a[p])));
1697     PetscCall(PetscPrintf(PETSC_COMM_SELF, "\n"));
1698   }
1699   PetscFunctionReturn(PETSC_SUCCESS);
1700 }
1701