xref: /petsc/src/mat/impls/aij/seq/kokkos/aijkok.kokkos.cxx (revision 30bab8dbce35fbb76248bd0fea1b2bc98f8a4f04)
1 #include <petscvec_kokkos.hpp>
2 #include <petscpkg_version.h>
3 #include <petsc/private/petscimpl.h>
4 #include <petsc/private/sfimpl.h>
5 #include <petscsystypes.h>
6 #include <petscerror.h>
7 
8 #include <Kokkos_Core.hpp>
9 #include <KokkosBlas.hpp>
10 #include <KokkosSparse_CrsMatrix.hpp>
11 #include <KokkosSparse_spmv.hpp>
12 #include <KokkosSparse_spiluk.hpp>
13 #include <KokkosSparse_sptrsv.hpp>
14 #include <KokkosSparse_spgemm.hpp>
15 #include <KokkosSparse_spadd.hpp>
16 
17 #include <../src/mat/impls/aij/seq/kokkos/aijkok.hpp>
18 
19 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_GE(3, 7, 0)
20   #include <KokkosSparse_Utils.hpp>
21 using KokkosSparse::sort_crs_matrix;
22 using KokkosSparse::Impl::transpose_matrix;
23 #else
24   #include <KokkosKernels_Sorting.hpp>
25 using KokkosKernels::sort_crs_matrix;
26 using KokkosKernels::Impl::transpose_matrix;
27 #endif
28 
29 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */
30 
31 /* MatAssemblyEnd_SeqAIJKokkos() happens when we finalized nonzeros of the matrix, either after
32    we assembled the matrix on host, or after we directly produced the matrix data on device (ex., through MatMatMult).
33    In the latter case, it is important to set a_dual's sync state correctly.
34  */
35 static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A, MatAssemblyType mode)
36 {
37   Mat_SeqAIJ       *aijseq;
38   Mat_SeqAIJKokkos *aijkok;
39 
40   PetscFunctionBegin;
41   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS);
42   PetscCall(MatAssemblyEnd_SeqAIJ(A, mode));
43 
44   aijseq = static_cast<Mat_SeqAIJ *>(A->data);
45   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
46 
47   /* If aijkok does not exist, we just copy i, j to device.
48      If aijkok already exists, but the device's nonzero pattern does not match with the host's, we assume the latest data is on host.
49      In both cases, we build a new aijkok structure.
50   */
51   if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { /* aijkok might not exist yet or nonzero pattern has changed */
52     delete aijkok;
53     aijkok   = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aijseq->nz, aijseq->i, aijseq->j, aijseq->a, A->nonzerostate, PETSC_FALSE /*don't copy mat values to device*/);
54     A->spptr = aijkok;
55   }
56 
57   if (aijkok->device_mat_d.data()) {
58     A->offloadmask = PETSC_OFFLOAD_GPU; // in GPU mode, no going back. MatSetValues checks this
59   }
60   PetscFunctionReturn(PETSC_SUCCESS);
61 }
62 
63 /* Sync CSR data to device if not yet */
64 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A)
65 {
66   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
67 
68   PetscFunctionBegin;
69   PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Can't sync factorized matrix from host to device");
70   PetscCheck(aijkok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
71   if (aijkok->a_dual.need_sync_device()) {
72     aijkok->a_dual.sync_device();
73     aijkok->transpose_updated = PETSC_FALSE; /* values of the transpose is out-of-date */
74     aijkok->hermitian_updated = PETSC_FALSE;
75   }
76   PetscFunctionReturn(PETSC_SUCCESS);
77 }
78 
79 /* Mark the CSR data on device as modified */
80 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosModifyDevice(Mat A)
81 {
82   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
83 
84   PetscFunctionBegin;
85   PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Not supported for factorized matries");
86   aijkok->a_dual.clear_sync_state();
87   aijkok->a_dual.modify_device();
88   aijkok->transpose_updated = PETSC_FALSE;
89   aijkok->hermitian_updated = PETSC_FALSE;
90   PetscCall(MatSeqAIJInvalidateDiagonal(A));
91   PetscCall(PetscObjectStateIncrease((PetscObject)A));
92   PetscFunctionReturn(PETSC_SUCCESS);
93 }
94 
95 static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A)
96 {
97   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
98 
99   PetscFunctionBegin;
100   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
101   /* We do not expect one needs factors on host  */
102   PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Can't sync factorized matrix from device to host");
103   PetscCheck(aijkok, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing AIJKOK");
104   aijkok->a_dual.sync_host();
105   PetscFunctionReturn(PETSC_SUCCESS);
106 }
107 
108 static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A, PetscScalar *array[])
109 {
110   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
111 
112   PetscFunctionBegin;
113   /* aijkok contains valid pointers only if the host's nonzerostate matches with the device's.
114     Calling MatSeqAIJSetPreallocation() or MatSetValues() on host, where aijseq->{i,j,a} might be
115     reallocated, will lead to stale {i,j,a}_dual in aijkok. In both operations, the hosts's nonzerostate
116     must have been updated. The stale aijkok will be rebuilt during MatAssemblyEnd.
117   */
118   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
119     aijkok->a_dual.sync_host();
120     *array = aijkok->a_dual.view_host().data();
121   } else { /* Happens when calling MatSetValues on a newly created matrix */
122     *array = static_cast<Mat_SeqAIJ *>(A->data)->a;
123   }
124   PetscFunctionReturn(PETSC_SUCCESS);
125 }
126 
127 static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJKokkos(Mat A, PetscScalar *array[])
128 {
129   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
130 
131   PetscFunctionBegin;
132   if (aijkok && A->nonzerostate == aijkok->nonzerostate) aijkok->a_dual.modify_host();
133   PetscFunctionReturn(PETSC_SUCCESS);
134 }
135 
136 static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[])
137 {
138   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
139 
140   PetscFunctionBegin;
141   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
142     aijkok->a_dual.sync_host();
143     *array = aijkok->a_dual.view_host().data();
144   } else {
145     *array = static_cast<Mat_SeqAIJ *>(A->data)->a;
146   }
147   PetscFunctionReturn(PETSC_SUCCESS);
148 }
149 
150 static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[])
151 {
152   PetscFunctionBegin;
153   *array = NULL;
154   PetscFunctionReturn(PETSC_SUCCESS);
155 }
156 
157 static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[])
158 {
159   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
160 
161   PetscFunctionBegin;
162   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
163     *array = aijkok->a_dual.view_host().data();
164   } else { /* Ex. happens with MatZeroEntries on a preallocated but not assembled matrix */
165     *array = static_cast<Mat_SeqAIJ *>(A->data)->a;
166   }
167   PetscFunctionReturn(PETSC_SUCCESS);
168 }
169 
170 static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[])
171 {
172   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
173 
174   PetscFunctionBegin;
175   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
176     aijkok->a_dual.clear_sync_state();
177     aijkok->a_dual.modify_host();
178   }
179   PetscFunctionReturn(PETSC_SUCCESS);
180 }
181 
182 static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJKokkos(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype)
183 {
184   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
185 
186   PetscFunctionBegin;
187   PetscCheck(aijkok != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "aijkok is NULL");
188 
189   if (i) *i = aijkok->i_device_data();
190   if (j) *j = aijkok->j_device_data();
191   if (a) {
192     aijkok->a_dual.sync_device();
193     *a = aijkok->a_device_data();
194   }
195   if (mtype) *mtype = PETSC_MEMTYPE_KOKKOS;
196   PetscFunctionReturn(PETSC_SUCCESS);
197 }
198 
199 // MatSeqAIJKokkosSetDeviceMat takes a PetscSplitCSRDataStructure with device data and copies it to the device. Note, "deep_copy" here is really a shallow copy
200 PetscErrorCode MatSeqAIJKokkosSetDeviceMat(Mat A, PetscSplitCSRDataStructure h_mat)
201 {
202   Mat_SeqAIJKokkos                            *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
203   Kokkos::View<SplitCSRMat, Kokkos::HostSpace> h_mat_k(h_mat);
204 
205   PetscFunctionBegin;
206   PetscCheck(aijkok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
207   aijkok->device_mat_d = create_mirror(DefaultMemorySpace(), h_mat_k);
208   Kokkos::deep_copy(aijkok->device_mat_d, h_mat_k);
209   PetscFunctionReturn(PETSC_SUCCESS);
210 }
211 
212 // MatSeqAIJKokkosGetDeviceMat gets the device if it is here, otherwise it creates a place for it and returns NULL
213 PetscErrorCode MatSeqAIJKokkosGetDeviceMat(Mat A, PetscSplitCSRDataStructure *d_mat)
214 {
215   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
216 
217   PetscFunctionBegin;
218   if (aijkok && aijkok->device_mat_d.data()) {
219     *d_mat = aijkok->device_mat_d.data();
220   } else {
221     PetscCall(MatSeqAIJKokkosSyncDevice(A)); // create aijkok (we are making d_mat now so make a place for it)
222     *d_mat = NULL;
223   }
224   PetscFunctionReturn(PETSC_SUCCESS);
225 }
226 
227 /*
228   Generate the sparsity pattern of a MatSeqAIJKokkos matrix's transpose on device.
229 
230   Input Parameter:
231 .  A       - the MATSEQAIJKOKKOS matrix
232 
233   Output Parameters:
234 +  perm_d - the permutation array on device, which connects Ta(i) = Aa(perm(i))
235 -  T_d    - the transpose on device, whose value array is allocated but not initialized
236 */
237 static PetscErrorCode MatSeqAIJKokkosGenerateTransposeStructure(Mat A, MatRowMapKokkosView &perm_d, KokkosCsrMatrix &T_d)
238 {
239   Mat_SeqAIJ             *aseq = static_cast<Mat_SeqAIJ *>(A->data);
240   PetscInt                nz = aseq->nz, m = A->rmap->N, n = A->cmap->n;
241   const PetscInt         *Ai = aseq->i, *Aj = aseq->j;
242   MatRowMapKokkosViewHost Ti_h("Ti", n + 1);
243   MatRowMapType          *Ti = Ti_h.data();
244   MatColIdxKokkosViewHost Tj_h("Tj", nz);
245   MatRowMapKokkosViewHost perm_h("permutation", nz);
246   PetscInt               *Tj   = Tj_h.data();
247   PetscInt               *perm = perm_h.data();
248   PetscInt               *offset;
249 
250   PetscFunctionBegin;
251   // Populate Ti
252   PetscCallCXX(Kokkos::deep_copy(Ti_h, 0));
253   Ti++;
254   for (PetscInt i = 0; i < nz; i++) Ti[Aj[i]]++;
255   Ti--;
256   for (PetscInt i = 0; i < n; i++) Ti[i + 1] += Ti[i];
257 
258   // Populate Tj and the permutation array
259   PetscCall(PetscCalloc1(n, &offset)); // offset in each T row to fill in its column indices
260   for (PetscInt i = 0; i < m; i++) {
261     for (PetscInt j = Ai[i]; j < Ai[i + 1]; j++) { // A's (i,j) is T's (j,i)
262       PetscInt r    = Aj[j];                       // row r of T
263       PetscInt disp = Ti[r] + offset[r];
264 
265       Tj[disp]   = i; // col i of T
266       perm[disp] = j;
267       offset[r]++;
268     }
269   }
270   PetscCall(PetscFree(offset));
271 
272   // Sort each row of T, along with the permutation array
273   for (PetscInt i = 0; i < n; i++) PetscCall(PetscSortIntWithArray(Ti[i + 1] - Ti[i], Tj + Ti[i], perm + Ti[i]));
274 
275   // Output perm and T on device
276   auto Ti_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), Ti_h);
277   auto Tj_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), Tj_h);
278   PetscCallCXX(T_d = KokkosCsrMatrix("csrmatT", n, m, nz, MatScalarKokkosView("Ta", nz), Ti_d, Tj_d));
279   PetscCallCXX(perm_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), perm_h));
280   PetscFunctionReturn(PETSC_SUCCESS);
281 }
282 
283 // Generate the transpose on device and cache it internally
284 // Note: KK transpose_matrix() does not have support symbolic/numeric transpose, so we do it on our own
285 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGenerateTranspose_Private(Mat A, KokkosCsrMatrix *csrmatT)
286 {
287   Mat_SeqAIJ       *aseq = static_cast<Mat_SeqAIJ *>(A->data);
288   Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
289   PetscInt          nz = aseq->nz, m = A->rmap->N, n = A->cmap->n;
290   KokkosCsrMatrix  &T = akok->csrmatT;
291 
292   PetscFunctionBegin;
293   PetscCheck(akok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
294   PetscCallCXX(akok->a_dual.sync_device()); // Sync A's valeus since we are going to access them on device
295 
296   const auto &Aa = akok->a_dual.view_device();
297 
298   if (A->symmetric == PETSC_BOOL3_TRUE) {
299     *csrmatT = akok->csrmat;
300   } else {
301     // See if we already have a cached transpose and its value is up to date
302     if (T.numRows() == n && T.numCols() == m) {  // this indicates csrmatT had been generated before, otherwise T has 0 rows/cols after construction
303       if (!akok->transpose_updated) {            // if the value is out of date, update the cached version
304         const auto &perm = akok->transpose_perm; // get the permutation array
305         auto       &Ta   = T.values;
306 
307         PetscCallCXX(Kokkos::parallel_for(
308           nz, KOKKOS_LAMBDA(const PetscInt i) { Ta(i) = Aa(perm(i)); }));
309       }
310     } else { // Generate T of size n x m for the first time
311       MatRowMapKokkosView perm;
312 
313       PetscCall(MatSeqAIJKokkosGenerateTransposeStructure(A, perm, T));
314       akok->transpose_perm = perm; // cache the perm in this matrix for reuse
315       PetscCallCXX(Kokkos::parallel_for(
316         nz, KOKKOS_LAMBDA(const PetscInt i) { T.values(i) = Aa(perm(i)); }));
317     }
318     akok->transpose_updated = PETSC_TRUE;
319     *csrmatT                = akok->csrmatT;
320   }
321   PetscFunctionReturn(PETSC_SUCCESS);
322 }
323 
324 // Generate the Hermitian on device and cache it internally
325 static PetscErrorCode MatSeqAIJKokkosGenerateHermitian_Private(Mat A, KokkosCsrMatrix *csrmatH)
326 {
327   Mat_SeqAIJ       *aseq = static_cast<Mat_SeqAIJ *>(A->data);
328   Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
329   PetscInt          nz = aseq->nz, m = A->rmap->N, n = A->cmap->n;
330   KokkosCsrMatrix  &T = akok->csrmatH;
331 
332   PetscFunctionBegin;
333   PetscCheck(akok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
334   PetscCallCXX(akok->a_dual.sync_device()); // Sync A's values since we are going to access them on device
335 
336   const auto &Aa = akok->a_dual.view_device();
337 
338   if (A->hermitian == PETSC_BOOL3_TRUE) {
339     *csrmatH = akok->csrmat;
340   } else {
341     // See if we already have a cached hermitian and its value is up to date
342     if (T.numRows() == n && T.numCols() == m) {  // this indicates csrmatT had been generated before, otherwise T has 0 rows/cols after construction
343       if (!akok->hermitian_updated) {            // if the value is out of date, update the cached version
344         const auto &perm = akok->transpose_perm; // get the permutation array
345         auto       &Ta   = T.values;
346 
347         PetscCallCXX(Kokkos::parallel_for(
348           nz, KOKKOS_LAMBDA(const PetscInt i) { Ta(i) = PetscConj(Aa(perm(i))); }));
349       }
350     } else { // Generate T of size n x m for the first time
351       MatRowMapKokkosView perm;
352 
353       PetscCall(MatSeqAIJKokkosGenerateTransposeStructure(A, perm, T));
354       akok->transpose_perm = perm; // cache the perm in this matrix for reuse
355       PetscCallCXX(Kokkos::parallel_for(
356         nz, KOKKOS_LAMBDA(const PetscInt i) { T.values(i) = PetscConj(Aa(perm(i))); }));
357     }
358     akok->hermitian_updated = PETSC_TRUE;
359     *csrmatH                = akok->csrmatH;
360   }
361   PetscFunctionReturn(PETSC_SUCCESS);
362 }
363 
364 /* y = A x */
365 static PetscErrorCode MatMult_SeqAIJKokkos(Mat A, Vec xx, Vec yy)
366 {
367   Mat_SeqAIJKokkos          *aijkok;
368   ConstPetscScalarKokkosView xv;
369   PetscScalarKokkosView      yv;
370 
371   PetscFunctionBegin;
372   PetscCall(PetscLogGpuTimeBegin());
373   PetscCall(MatSeqAIJKokkosSyncDevice(A));
374   PetscCall(VecGetKokkosView(xx, &xv));
375   PetscCall(VecGetKokkosViewWrite(yy, &yv));
376   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
377   PetscCallCXX(KokkosSparse::spmv("N", 1.0 /*alpha*/, aijkok->csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A x + beta y */
378   PetscCall(VecRestoreKokkosView(xx, &xv));
379   PetscCall(VecRestoreKokkosViewWrite(yy, &yv));
380   /* 2.0*nnz - numRows seems more accurate here but assumes there are no zero-rows. So a little sloppy here. */
381   PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz()));
382   PetscCall(PetscLogGpuTimeEnd());
383   PetscFunctionReturn(PETSC_SUCCESS);
384 }
385 
386 /* y = A^T x */
387 static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy)
388 {
389   Mat_SeqAIJKokkos          *aijkok;
390   const char                *mode;
391   ConstPetscScalarKokkosView xv;
392   PetscScalarKokkosView      yv;
393   KokkosCsrMatrix            csrmat;
394 
395   PetscFunctionBegin;
396   PetscCall(PetscLogGpuTimeBegin());
397   PetscCall(MatSeqAIJKokkosSyncDevice(A));
398   PetscCall(VecGetKokkosView(xx, &xv));
399   PetscCall(VecGetKokkosViewWrite(yy, &yv));
400   if (A->form_explicit_transpose) {
401     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat));
402     mode = "N";
403   } else {
404     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
405     csrmat = aijkok->csrmat;
406     mode   = "T";
407   }
408   PetscCallCXX(KokkosSparse::spmv(mode, 1.0 /*alpha*/, csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^T x + beta y */
409   PetscCall(VecRestoreKokkosView(xx, &xv));
410   PetscCall(VecRestoreKokkosViewWrite(yy, &yv));
411   PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
412   PetscCall(PetscLogGpuTimeEnd());
413   PetscFunctionReturn(PETSC_SUCCESS);
414 }
415 
416 /* y = A^H x */
417 static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy)
418 {
419   Mat_SeqAIJKokkos          *aijkok;
420   const char                *mode;
421   ConstPetscScalarKokkosView xv;
422   PetscScalarKokkosView      yv;
423   KokkosCsrMatrix            csrmat;
424 
425   PetscFunctionBegin;
426   PetscCall(PetscLogGpuTimeBegin());
427   PetscCall(MatSeqAIJKokkosSyncDevice(A));
428   PetscCall(VecGetKokkosView(xx, &xv));
429   PetscCall(VecGetKokkosViewWrite(yy, &yv));
430   if (A->form_explicit_transpose) {
431     PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat));
432     mode = "N";
433   } else {
434     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
435     csrmat = aijkok->csrmat;
436     mode   = "C";
437   }
438   PetscCallCXX(KokkosSparse::spmv(mode, 1.0 /*alpha*/, csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^H x + beta y */
439   PetscCall(VecRestoreKokkosView(xx, &xv));
440   PetscCall(VecRestoreKokkosViewWrite(yy, &yv));
441   PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
442   PetscCall(PetscLogGpuTimeEnd());
443   PetscFunctionReturn(PETSC_SUCCESS);
444 }
445 
446 /* z = A x + y */
447 static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz)
448 {
449   Mat_SeqAIJKokkos          *aijkok;
450   ConstPetscScalarKokkosView xv, yv;
451   PetscScalarKokkosView      zv;
452 
453   PetscFunctionBegin;
454   PetscCall(PetscLogGpuTimeBegin());
455   PetscCall(MatSeqAIJKokkosSyncDevice(A));
456   PetscCall(VecGetKokkosView(xx, &xv));
457   PetscCall(VecGetKokkosView(yy, &yv));
458   PetscCall(VecGetKokkosViewWrite(zz, &zv));
459   if (zz != yy) Kokkos::deep_copy(zv, yv);
460   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
461   PetscCallCXX(KokkosSparse::spmv("N", 1.0 /*alpha*/, aijkok->csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A x + beta z */
462   PetscCall(VecRestoreKokkosView(xx, &xv));
463   PetscCall(VecRestoreKokkosView(yy, &yv));
464   PetscCall(VecRestoreKokkosViewWrite(zz, &zv));
465   PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz()));
466   PetscCall(PetscLogGpuTimeEnd());
467   PetscFunctionReturn(PETSC_SUCCESS);
468 }
469 
470 /* z = A^T x + y */
471 static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz)
472 {
473   Mat_SeqAIJKokkos          *aijkok;
474   const char                *mode;
475   ConstPetscScalarKokkosView xv, yv;
476   PetscScalarKokkosView      zv;
477   KokkosCsrMatrix            csrmat;
478 
479   PetscFunctionBegin;
480   PetscCall(PetscLogGpuTimeBegin());
481   PetscCall(MatSeqAIJKokkosSyncDevice(A));
482   PetscCall(VecGetKokkosView(xx, &xv));
483   PetscCall(VecGetKokkosView(yy, &yv));
484   PetscCall(VecGetKokkosViewWrite(zz, &zv));
485   if (zz != yy) Kokkos::deep_copy(zv, yv);
486   if (A->form_explicit_transpose) {
487     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat));
488     mode = "N";
489   } else {
490     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
491     csrmat = aijkok->csrmat;
492     mode   = "T";
493   }
494   PetscCallCXX(KokkosSparse::spmv(mode, 1.0 /*alpha*/, csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^T x + beta z */
495   PetscCall(VecRestoreKokkosView(xx, &xv));
496   PetscCall(VecRestoreKokkosView(yy, &yv));
497   PetscCall(VecRestoreKokkosViewWrite(zz, &zv));
498   PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
499   PetscCall(PetscLogGpuTimeEnd());
500   PetscFunctionReturn(PETSC_SUCCESS);
501 }
502 
503 /* z = A^H x + y */
504 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz)
505 {
506   Mat_SeqAIJKokkos          *aijkok;
507   const char                *mode;
508   ConstPetscScalarKokkosView xv, yv;
509   PetscScalarKokkosView      zv;
510   KokkosCsrMatrix            csrmat;
511 
512   PetscFunctionBegin;
513   PetscCall(PetscLogGpuTimeBegin());
514   PetscCall(MatSeqAIJKokkosSyncDevice(A));
515   PetscCall(VecGetKokkosView(xx, &xv));
516   PetscCall(VecGetKokkosView(yy, &yv));
517   PetscCall(VecGetKokkosViewWrite(zz, &zv));
518   if (zz != yy) Kokkos::deep_copy(zv, yv);
519   if (A->form_explicit_transpose) {
520     PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat));
521     mode = "N";
522   } else {
523     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
524     csrmat = aijkok->csrmat;
525     mode   = "C";
526   }
527   PetscCallCXX(KokkosSparse::spmv(mode, 1.0 /*alpha*/, csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^H x + beta z */
528   PetscCall(VecRestoreKokkosView(xx, &xv));
529   PetscCall(VecRestoreKokkosView(yy, &yv));
530   PetscCall(VecRestoreKokkosViewWrite(zz, &zv));
531   PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
532   PetscCall(PetscLogGpuTimeEnd());
533   PetscFunctionReturn(PETSC_SUCCESS);
534 }
535 
536 PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A, MatOption op, PetscBool flg)
537 {
538   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
539 
540   PetscFunctionBegin;
541   switch (op) {
542   case MAT_FORM_EXPLICIT_TRANSPOSE:
543     /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
544     if (A->form_explicit_transpose && !flg && aijkok) PetscCall(aijkok->DestroyMatTranspose());
545     A->form_explicit_transpose = flg;
546     break;
547   default:
548     PetscCall(MatSetOption_SeqAIJ(A, op, flg));
549     break;
550   }
551   PetscFunctionReturn(PETSC_SUCCESS);
552 }
553 
554 /* Depending on reuse, either build a new mat, or use the existing mat */
555 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat *newmat)
556 {
557   Mat_SeqAIJ *aseq;
558 
559   PetscFunctionBegin;
560   PetscCall(PetscKokkosInitializeCheck());
561   if (reuse == MAT_INITIAL_MATRIX) {                      /* Build a brand new mat */
562     PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat));  /* the returned newmat is a SeqAIJKokkos */
563   } else if (reuse == MAT_REUSE_MATRIX) {                 /* Reuse the mat created before */
564     PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN)); /* newmat is already a SeqAIJKokkos */
565   } else if (reuse == MAT_INPLACE_MATRIX) {               /* newmat is A */
566     PetscCheck(A == *newmat, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "A != *newmat with MAT_INPLACE_MATRIX");
567     PetscCall(PetscFree(A->defaultvectype));
568     PetscCall(PetscStrallocpy(VECKOKKOS, &A->defaultvectype)); /* Allocate and copy the string */
569     PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATSEQAIJKOKKOS));
570     PetscCall(MatSetOps_SeqAIJKokkos(A));
571     aseq = static_cast<Mat_SeqAIJ *>(A->data);
572     if (A->assembled) { /* Copy i, j (but not values) to device for an assembled matrix if not yet */
573       PetscCheck(!A->spptr, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Expect NULL (Mat_SeqAIJKokkos*)A->spptr");
574       A->spptr = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aseq->nz, aseq->i, aseq->j, aseq->a, A->nonzerostate, PETSC_FALSE);
575     }
576   }
577   PetscFunctionReturn(PETSC_SUCCESS);
578 }
579 
580 /* MatDuplicate always creates a new matrix. MatDuplicate can be called either on an assembled matrix or
581    an unassembled matrix, even though MAT_COPY_VALUES is not allowed for unassembled matrix.
582  */
583 static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A, MatDuplicateOption dupOption, Mat *B)
584 {
585   Mat_SeqAIJ       *bseq;
586   Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr), *bkok;
587   Mat               mat;
588 
589   PetscFunctionBegin;
590   /* Do not copy values on host as A's latest values might be on device. We don't want to do sync blindly */
591   PetscCall(MatDuplicate_SeqAIJ(A, MAT_DO_NOT_COPY_VALUES, B));
592   mat = *B;
593   if (A->assembled) {
594     bseq = static_cast<Mat_SeqAIJ *>(mat->data);
595     bkok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, bseq->nz, bseq->i, bseq->j, bseq->a, mat->nonzerostate, PETSC_FALSE);
596     bkok->a_dual.clear_sync_state(); /* Clear B's sync state as it will be decided below */
597     /* Now copy values to B if needed */
598     if (dupOption == MAT_COPY_VALUES) {
599       if (akok->a_dual.need_sync_device()) {
600         Kokkos::deep_copy(bkok->a_dual.view_host(), akok->a_dual.view_host());
601         bkok->a_dual.modify_host();
602       } else { /* If device has the latest data, we only copy data on device */
603         Kokkos::deep_copy(bkok->a_dual.view_device(), akok->a_dual.view_device());
604         bkok->a_dual.modify_device();
605       }
606     } else { /* MAT_DO_NOT_COPY_VALUES or MAT_SHARE_NONZERO_PATTERN. B's values should be zeroed */
607       /* B's values on host should be already zeroed by MatDuplicate_SeqAIJ() */
608       bkok->a_dual.modify_host();
609     }
610     mat->spptr = bkok;
611   }
612 
613   PetscCall(PetscFree(mat->defaultvectype));
614   PetscCall(PetscStrallocpy(VECKOKKOS, &mat->defaultvectype)); /* Allocate and copy the string */
615   PetscCall(PetscObjectChangeTypeName((PetscObject)mat, MATSEQAIJKOKKOS));
616   PetscCall(MatSetOps_SeqAIJKokkos(mat));
617   PetscFunctionReturn(PETSC_SUCCESS);
618 }
619 
620 static PetscErrorCode MatTranspose_SeqAIJKokkos(Mat A, MatReuse reuse, Mat *B)
621 {
622   Mat               At;
623   KokkosCsrMatrix   internT;
624   Mat_SeqAIJKokkos *atkok, *bkok;
625 
626   PetscFunctionBegin;
627   if (reuse == MAT_REUSE_MATRIX) PetscCall(MatTransposeCheckNonzeroState_Private(A, *B));
628   PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &internT)); /* Generate a transpose internally */
629   if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_INPLACE_MATRIX) {
630     /* Deep copy internT, as we want to isolate the internal transpose */
631     PetscCallCXX(atkok = new Mat_SeqAIJKokkos(KokkosCsrMatrix("csrmat", internT)));
632     PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PetscObjectComm((PetscObject)A), atkok, &At));
633     if (reuse == MAT_INITIAL_MATRIX) *B = At;
634     else PetscCall(MatHeaderReplace(A, &At)); /* Replace A with At inplace */
635   } else {                                    /* MAT_REUSE_MATRIX, just need to copy values to B on device */
636     if ((*B)->assembled) {
637       bkok = static_cast<Mat_SeqAIJKokkos *>((*B)->spptr);
638       PetscCallCXX(Kokkos::deep_copy(bkok->a_dual.view_device(), internT.values));
639       PetscCall(MatSeqAIJKokkosModifyDevice(*B));
640     } else if ((*B)->preallocated) { /* It is ok for B to be only preallocated, as needed in MatTranspose_MPIAIJ */
641       Mat_SeqAIJ             *bseq = static_cast<Mat_SeqAIJ *>((*B)->data);
642       MatScalarKokkosViewHost a_h(bseq->a, internT.nnz()); /* bseq->nz = 0 if unassembled */
643       MatColIdxKokkosViewHost j_h(bseq->j, internT.nnz());
644       PetscCallCXX(Kokkos::deep_copy(a_h, internT.values));
645       PetscCallCXX(Kokkos::deep_copy(j_h, internT.graph.entries));
646     } else SETERRQ(PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "B must be assembled or preallocated");
647   }
648   PetscFunctionReturn(PETSC_SUCCESS);
649 }
650 
651 static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A)
652 {
653   Mat_SeqAIJKokkos *aijkok;
654 
655   PetscFunctionBegin;
656   if (A->factortype == MAT_FACTOR_NONE) {
657     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
658     delete aijkok;
659   } else {
660     delete static_cast<Mat_SeqAIJKokkosTriFactors *>(A->spptr);
661   }
662   A->spptr = NULL;
663   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatFactorGetSolverType_C", NULL));
664   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
665   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
666   PetscCall(MatDestroy_SeqAIJ(A));
667   PetscFunctionReturn(PETSC_SUCCESS);
668 }
669 
670 /*MC
671    MATSEQAIJKOKKOS - MATAIJKOKKOS = "(seq)aijkokkos" - A matrix type to be used for sparse matrices with Kokkos
672 
673    A matrix type type using Kokkos-Kernels CrsMatrix type for portability across different device types
674 
675    Options Database Key:
676 .  -mat_type aijkokkos - sets the matrix type to `MATSEQAIJKOKKOS` during a call to `MatSetFromOptions()`
677 
678   Level: beginner
679 
680 .seealso: [](ch_matrices), `Mat`, `MatCreateSeqAIJKokkos()`, `MATMPIAIJKOKKOS`
681 M*/
682 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A)
683 {
684   PetscFunctionBegin;
685   PetscCall(PetscKokkosInitializeCheck());
686   PetscCall(MatCreate_SeqAIJ(A));
687   PetscCall(MatConvert_SeqAIJ_SeqAIJKokkos(A, MATSEQAIJKOKKOS, MAT_INPLACE_MATRIX, &A));
688   PetscFunctionReturn(PETSC_SUCCESS);
689 }
690 
691 /* Merge A, B into a matrix C. A is put before B. C's size would be A->rmap->n by (A->cmap->n + B->cmap->n) */
692 PetscErrorCode MatSeqAIJKokkosMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C)
693 {
694   Mat_SeqAIJ         *a, *b;
695   Mat_SeqAIJKokkos   *akok, *bkok, *ckok;
696   MatScalarKokkosView aa, ba, ca;
697   MatRowMapKokkosView ai, bi, ci;
698   MatColIdxKokkosView aj, bj, cj;
699   PetscInt            m, n, nnz, aN;
700 
701   PetscFunctionBegin;
702   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
703   PetscValidHeaderSpecific(B, MAT_CLASSID, 2);
704   PetscValidPointer(C, 4);
705   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
706   PetscCheckTypeName(B, MATSEQAIJKOKKOS);
707   PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n);
708   PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported");
709 
710   PetscCall(MatSeqAIJKokkosSyncDevice(A));
711   PetscCall(MatSeqAIJKokkosSyncDevice(B));
712   a    = static_cast<Mat_SeqAIJ *>(A->data);
713   b    = static_cast<Mat_SeqAIJ *>(B->data);
714   akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
715   bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
716   aa   = akok->a_dual.view_device();
717   ai   = akok->i_dual.view_device();
718   ba   = bkok->a_dual.view_device();
719   bi   = bkok->i_dual.view_device();
720   m    = A->rmap->n; /* M, N and nnz of C */
721   n    = A->cmap->n + B->cmap->n;
722   nnz  = a->nz + b->nz;
723   aN   = A->cmap->n; /* N of A */
724   if (reuse == MAT_INITIAL_MATRIX) {
725     aj           = akok->j_dual.view_device();
726     bj           = bkok->j_dual.view_device();
727     auto ca_dual = MatScalarKokkosDualView("a", aa.extent(0) + ba.extent(0));
728     auto ci_dual = MatRowMapKokkosDualView("i", ai.extent(0));
729     auto cj_dual = MatColIdxKokkosDualView("j", aj.extent(0) + bj.extent(0));
730     ca           = ca_dual.view_device();
731     ci           = ci_dual.view_device();
732     cj           = cj_dual.view_device();
733 
734     /* Concatenate A and B in parallel using Kokkos hierarchical parallelism */
735     Kokkos::parallel_for(
736       Kokkos::TeamPolicy<>(m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
737         PetscInt i       = t.league_rank(); /* row i */
738         PetscInt coffset = ai(i) + bi(i), alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i);
739 
740         Kokkos::single(Kokkos::PerTeam(t), [=]() { /* this side effect only happens once per whole team */
741                                                    ci(i) = coffset;
742                                                    if (i == m - 1) ci(m) = ai(m) + bi(m);
743         });
744 
745         Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) {
746           if (k < alen) {
747             ca(coffset + k) = aa(ai(i) + k);
748             cj(coffset + k) = aj(ai(i) + k);
749           } else {
750             ca(coffset + k) = ba(bi(i) + k - alen);
751             cj(coffset + k) = bj(bi(i) + k - alen) + aN; /* Entries in B get new column indices in C */
752           }
753         });
754       });
755     ca_dual.modify_device();
756     ci_dual.modify_device();
757     cj_dual.modify_device();
758     PetscCallCXX(ckok = new Mat_SeqAIJKokkos(m, n, nnz, ci_dual, cj_dual, ca_dual));
759     PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, ckok, C));
760   } else if (reuse == MAT_REUSE_MATRIX) {
761     PetscValidHeaderSpecific(*C, MAT_CLASSID, 4);
762     PetscCheckTypeName(*C, MATSEQAIJKOKKOS);
763     ckok = static_cast<Mat_SeqAIJKokkos *>((*C)->spptr);
764     ca   = ckok->a_dual.view_device();
765     ci   = ckok->i_dual.view_device();
766 
767     Kokkos::parallel_for(
768       Kokkos::TeamPolicy<>(m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
769         PetscInt i    = t.league_rank(); /* row i */
770         PetscInt alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i);
771         Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) {
772           if (k < alen) ca(ci(i) + k) = aa(ai(i) + k);
773           else ca(ci(i) + k) = ba(bi(i) + k - alen);
774         });
775       });
776     PetscCall(MatSeqAIJKokkosModifyDevice(*C));
777   }
778   PetscFunctionReturn(PETSC_SUCCESS);
779 }
780 
781 static PetscErrorCode MatProductDataDestroy_SeqAIJKokkos(void *pdata)
782 {
783   PetscFunctionBegin;
784   delete static_cast<MatProductData_SeqAIJKokkos *>(pdata);
785   PetscFunctionReturn(PETSC_SUCCESS);
786 }
787 
788 static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C)
789 {
790   Mat_Product                 *product = C->product;
791   Mat                          A, B;
792   bool                         transA, transB; /* use bool, since KK needs this type */
793   Mat_SeqAIJKokkos            *akok, *bkok, *ckok;
794   Mat_SeqAIJ                  *c;
795   MatProductData_SeqAIJKokkos *pdata;
796   KokkosCsrMatrix              csrmatA, csrmatB;
797 
798   PetscFunctionBegin;
799   MatCheckProduct(C, 1);
800   PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Product data empty");
801   pdata = static_cast<MatProductData_SeqAIJKokkos *>(C->product->data);
802 
803   // See if numeric has already been done in symbolic (e.g., user calls MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C)).
804   // If yes, skip the numeric, but reset the flag so that next time when user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C),
805   // we still do numeric.
806   if (pdata->reusesym) { // numeric reuses results from symbolic
807     pdata->reusesym = PETSC_FALSE;
808     PetscFunctionReturn(PETSC_SUCCESS);
809   }
810 
811   switch (product->type) {
812   case MATPRODUCT_AB:
813     transA = false;
814     transB = false;
815     break;
816   case MATPRODUCT_AtB:
817     transA = true;
818     transB = false;
819     break;
820   case MATPRODUCT_ABt:
821     transA = false;
822     transB = true;
823     break;
824   default:
825     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]);
826   }
827 
828   A = product->A;
829   B = product->B;
830   PetscCall(MatSeqAIJKokkosSyncDevice(A));
831   PetscCall(MatSeqAIJKokkosSyncDevice(B));
832   akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
833   bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
834   ckok = static_cast<Mat_SeqAIJKokkos *>(C->spptr);
835 
836   PetscCheck(ckok, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Device data structure spptr is empty");
837 
838   csrmatA = akok->csrmat;
839   csrmatB = bkok->csrmat;
840 
841   /* TODO: Once KK spgemm implements transpose, we can get rid of the explicit transpose here */
842   if (transA) {
843     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA));
844     transA = false;
845   }
846 
847   if (transB) {
848     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB));
849     transB = false;
850   }
851   PetscCall(PetscLogGpuTimeBegin());
852   PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, csrmatA, transA, csrmatB, transB, ckok->csrmat));
853 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(4, 0, 0)
854   auto spgemmHandle = pdata->kh.get_spgemm_handle();
855   if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(ckok->csrmat)); /* without sort, mat_tests-ex62_14_seqaijkokkos fails */
856 #endif
857 
858   PetscCall(PetscLogGpuTimeEnd());
859   PetscCall(MatSeqAIJKokkosModifyDevice(C));
860   /* shorter version of MatAssemblyEnd_SeqAIJ */
861   c = (Mat_SeqAIJ *)C->data;
862   PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz));
863   PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n"));
864   PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax));
865   c->reallocs         = 0;
866   C->info.mallocs     = 0;
867   C->info.nz_unneeded = 0;
868   C->assembled = C->was_assembled = PETSC_TRUE;
869   C->num_ass++;
870   PetscFunctionReturn(PETSC_SUCCESS);
871 }
872 
873 static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C)
874 {
875   Mat_Product                 *product = C->product;
876   MatProductType               ptype;
877   Mat                          A, B;
878   bool                         transA, transB;
879   Mat_SeqAIJKokkos            *akok, *bkok, *ckok;
880   MatProductData_SeqAIJKokkos *pdata;
881   MPI_Comm                     comm;
882   KokkosCsrMatrix              csrmatA, csrmatB, csrmatC;
883 
884   PetscFunctionBegin;
885   MatCheckProduct(C, 1);
886   PetscCall(PetscObjectGetComm((PetscObject)C, &comm));
887   PetscCheck(!product->data, comm, PETSC_ERR_PLIB, "Product data not empty");
888   A = product->A;
889   B = product->B;
890   PetscCall(MatSeqAIJKokkosSyncDevice(A));
891   PetscCall(MatSeqAIJKokkosSyncDevice(B));
892   akok    = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
893   bkok    = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
894   csrmatA = akok->csrmat;
895   csrmatB = bkok->csrmat;
896 
897   ptype = product->type;
898   // Take advantage of the symmetry if true
899   if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
900     ptype                                          = MATPRODUCT_AB;
901     product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE;
902   }
903   if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
904     ptype                                          = MATPRODUCT_AB;
905     product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE;
906   }
907 
908   switch (ptype) {
909   case MATPRODUCT_AB:
910     transA = false;
911     transB = false;
912     break;
913   case MATPRODUCT_AtB:
914     transA = true;
915     transB = false;
916     break;
917   case MATPRODUCT_ABt:
918     transA = false;
919     transB = true;
920     break;
921   default:
922     SETERRQ(comm, PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]);
923   }
924   PetscCallCXX(product->data = pdata = new MatProductData_SeqAIJKokkos());
925   pdata->reusesym = product->api_user;
926 
927   /* TODO: add command line options to select spgemm algorithms */
928   auto spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_DEFAULT; /* default alg is TPL if enabled, otherwise KK */
929 
930   /* CUDA-10.2's spgemm has bugs. We prefer the SpGEMMreuse APIs introduced in cuda-11.4 */
931 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
932   #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
933   spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_KK;
934   #endif
935 #endif
936   PetscCallCXX(pdata->kh.create_spgemm_handle(spgemm_alg));
937 
938   PetscCall(PetscLogGpuTimeBegin());
939   /* TODO: Get rid of the explicit transpose once KK-spgemm implements the transpose option */
940   if (transA) {
941     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA));
942     transA = false;
943   }
944 
945   if (transB) {
946     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB));
947     transB = false;
948   }
949 
950   PetscCallCXX(KokkosSparse::spgemm_symbolic(pdata->kh, csrmatA, transA, csrmatB, transB, csrmatC));
951   /* spgemm_symbolic() only populates C's rowmap, but not C's column indices.
952     So we have to do a fake spgemm_numeric() here to get csrmatC.j_d setup, before
953     calling new Mat_SeqAIJKokkos().
954     TODO: Remove the fake spgemm_numeric() after KK fixed this problem.
955   */
956   PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, csrmatA, transA, csrmatB, transB, csrmatC));
957 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(4, 0, 0)
958   /* Query if KK outputs a sorted matrix. If not, we need to sort it */
959   auto spgemmHandle = pdata->kh.get_spgemm_handle();
960   if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(csrmatC)); /* sort_option defaults to -1 in KK!*/
961 #endif
962   PetscCall(PetscLogGpuTimeEnd());
963 
964   PetscCallCXX(ckok = new Mat_SeqAIJKokkos(csrmatC));
965   PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(C, ckok));
966   C->product->destroy = MatProductDataDestroy_SeqAIJKokkos;
967   PetscFunctionReturn(PETSC_SUCCESS);
968 }
969 
970 /* handles sparse matrix matrix ops */
971 static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat)
972 {
973   Mat_Product *product = mat->product;
974   PetscBool    Biskok = PETSC_FALSE, Ciskok = PETSC_TRUE;
975 
976   PetscFunctionBegin;
977   MatCheckProduct(mat, 1);
978   PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJKOKKOS, &Biskok));
979   if (product->type == MATPRODUCT_ABC) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJKOKKOS, &Ciskok));
980   if (Biskok && Ciskok) {
981     switch (product->type) {
982     case MATPRODUCT_AB:
983     case MATPRODUCT_AtB:
984     case MATPRODUCT_ABt:
985       mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos;
986       break;
987     case MATPRODUCT_PtAP:
988     case MATPRODUCT_RARt:
989     case MATPRODUCT_ABC:
990       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
991       break;
992     default:
993       break;
994     }
995   } else { /* fallback for AIJ */
996     PetscCall(MatProductSetFromOptions_SeqAIJ(mat));
997   }
998   PetscFunctionReturn(PETSC_SUCCESS);
999 }
1000 
1001 static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a)
1002 {
1003   Mat_SeqAIJKokkos *aijkok;
1004 
1005   PetscFunctionBegin;
1006   PetscCall(PetscLogGpuTimeBegin());
1007   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1008   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1009   KokkosBlas::scal(aijkok->a_dual.view_device(), a, aijkok->a_dual.view_device());
1010   PetscCall(MatSeqAIJKokkosModifyDevice(A));
1011   PetscCall(PetscLogGpuFlops(aijkok->a_dual.extent(0)));
1012   PetscCall(PetscLogGpuTimeEnd());
1013   PetscFunctionReturn(PETSC_SUCCESS);
1014 }
1015 
1016 static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A)
1017 {
1018   Mat_SeqAIJKokkos *aijkok;
1019 
1020   PetscFunctionBegin;
1021   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1022   if (aijkok) { /* Only zero the device if data is already there */
1023     KokkosBlas::fill(aijkok->a_dual.view_device(), 0.0);
1024     PetscCall(MatSeqAIJKokkosModifyDevice(A));
1025   } else { /* Might be preallocated but not assembled */
1026     PetscCall(MatZeroEntries_SeqAIJ(A));
1027   }
1028   PetscFunctionReturn(PETSC_SUCCESS);
1029 }
1030 
1031 static PetscErrorCode MatGetDiagonal_SeqAIJKokkos(Mat A, Vec x)
1032 {
1033   Mat_SeqAIJ           *aijseq;
1034   Mat_SeqAIJKokkos     *aijkok;
1035   PetscInt              n;
1036   PetscScalarKokkosView xv;
1037 
1038   PetscFunctionBegin;
1039   PetscCall(VecGetLocalSize(x, &n));
1040   PetscCheck(n == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector");
1041   PetscCheck(A->factortype == MAT_FACTOR_NONE, PETSC_COMM_SELF, PETSC_ERR_SUP, "MatGetDiagonal_SeqAIJKokkos not supported on factored matrices");
1042 
1043   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1044   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1045 
1046   if (A->rmap->n && aijkok->diag_dual.extent(0) == 0) { /* Set the diagonal pointer if not already */
1047     PetscCall(MatMarkDiagonal_SeqAIJ(A));
1048     aijseq = static_cast<Mat_SeqAIJ *>(A->data);
1049     aijkok->SetDiagonal(aijseq->diag);
1050   }
1051 
1052   const auto &Aa    = aijkok->a_dual.view_device();
1053   const auto &Ai    = aijkok->i_dual.view_device();
1054   const auto &Adiag = aijkok->diag_dual.view_device();
1055 
1056   PetscCall(VecGetKokkosViewWrite(x, &xv));
1057   Kokkos::parallel_for(
1058     n, KOKKOS_LAMBDA(const PetscInt i) {
1059       if (Adiag(i) < Ai(i + 1)) xv(i) = Aa(Adiag(i));
1060       else xv(i) = 0;
1061     });
1062   PetscCall(VecRestoreKokkosViewWrite(x, &xv));
1063   PetscFunctionReturn(PETSC_SUCCESS);
1064 }
1065 
1066 /* Get a Kokkos View from a mat of type MatSeqAIJKokkos */
1067 PetscErrorCode MatSeqAIJGetKokkosView(Mat A, ConstMatScalarKokkosView *kv)
1068 {
1069   Mat_SeqAIJKokkos *aijkok;
1070 
1071   PetscFunctionBegin;
1072   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1073   PetscValidPointer(kv, 2);
1074   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1075   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1076   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1077   *kv    = aijkok->a_dual.view_device();
1078   PetscFunctionReturn(PETSC_SUCCESS);
1079 }
1080 
1081 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, ConstMatScalarKokkosView *kv)
1082 {
1083   PetscFunctionBegin;
1084   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1085   PetscValidPointer(kv, 2);
1086   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1087   PetscFunctionReturn(PETSC_SUCCESS);
1088 }
1089 
1090 PetscErrorCode MatSeqAIJGetKokkosView(Mat A, MatScalarKokkosView *kv)
1091 {
1092   Mat_SeqAIJKokkos *aijkok;
1093 
1094   PetscFunctionBegin;
1095   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1096   PetscValidPointer(kv, 2);
1097   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1098   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1099   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1100   *kv    = aijkok->a_dual.view_device();
1101   PetscFunctionReturn(PETSC_SUCCESS);
1102 }
1103 
1104 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, MatScalarKokkosView *kv)
1105 {
1106   PetscFunctionBegin;
1107   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1108   PetscValidPointer(kv, 2);
1109   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1110   PetscCall(MatSeqAIJKokkosModifyDevice(A));
1111   PetscFunctionReturn(PETSC_SUCCESS);
1112 }
1113 
1114 PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A, MatScalarKokkosView *kv)
1115 {
1116   Mat_SeqAIJKokkos *aijkok;
1117 
1118   PetscFunctionBegin;
1119   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1120   PetscValidPointer(kv, 2);
1121   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1122   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1123   *kv    = aijkok->a_dual.view_device();
1124   PetscFunctionReturn(PETSC_SUCCESS);
1125 }
1126 
1127 PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A, MatScalarKokkosView *kv)
1128 {
1129   PetscFunctionBegin;
1130   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1131   PetscValidPointer(kv, 2);
1132   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1133   PetscCall(MatSeqAIJKokkosModifyDevice(A));
1134   PetscFunctionReturn(PETSC_SUCCESS);
1135 }
1136 
1137 /* Computes Y += alpha X */
1138 static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y, PetscScalar alpha, Mat X, MatStructure pattern)
1139 {
1140   Mat_SeqAIJ              *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data;
1141   Mat_SeqAIJKokkos        *xkok, *ykok, *zkok;
1142   ConstMatScalarKokkosView Xa;
1143   MatScalarKokkosView      Ya;
1144 
1145   PetscFunctionBegin;
1146   PetscCheckTypeName(Y, MATSEQAIJKOKKOS);
1147   PetscCheckTypeName(X, MATSEQAIJKOKKOS);
1148   PetscCall(MatSeqAIJKokkosSyncDevice(Y));
1149   PetscCall(MatSeqAIJKokkosSyncDevice(X));
1150   PetscCall(PetscLogGpuTimeBegin());
1151 
1152   if (pattern != SAME_NONZERO_PATTERN && x->nz == y->nz) {
1153     PetscBool e;
1154     PetscCall(PetscArraycmp(x->i, y->i, Y->rmap->n + 1, &e));
1155     if (e) {
1156       PetscCall(PetscArraycmp(x->j, y->j, y->nz, &e));
1157       if (e) pattern = SAME_NONZERO_PATTERN;
1158     }
1159   }
1160 
1161   /* cusparseDcsrgeam2() computes C = alpha A + beta B. If one knew sparsity pattern of C, one can skip
1162     cusparseScsrgeam2_bufferSizeExt() / cusparseXcsrgeam2Nnz(), and directly call cusparseScsrgeam2().
1163     If X is SUBSET_NONZERO_PATTERN of Y, we could take advantage of this cusparse feature. However,
1164     KokkosSparse::spadd(alpha,A,beta,B,C) has symbolic and numeric phases, MatAXPY does not.
1165   */
1166   ykok = static_cast<Mat_SeqAIJKokkos *>(Y->spptr);
1167   xkok = static_cast<Mat_SeqAIJKokkos *>(X->spptr);
1168   Xa   = xkok->a_dual.view_device();
1169   Ya   = ykok->a_dual.view_device();
1170 
1171   if (pattern == SAME_NONZERO_PATTERN) {
1172     KokkosBlas::axpy(alpha, Xa, Ya);
1173     PetscCall(MatSeqAIJKokkosModifyDevice(Y));
1174   } else if (pattern == SUBSET_NONZERO_PATTERN) {
1175     MatRowMapKokkosView Xi = xkok->i_dual.view_device(), Yi = ykok->i_dual.view_device();
1176     MatColIdxKokkosView Xj = xkok->j_dual.view_device(), Yj = ykok->j_dual.view_device();
1177 
1178     Kokkos::parallel_for(
1179       Kokkos::TeamPolicy<>(Y->rmap->n, 1), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
1180         PetscInt i = t.league_rank(); // row i
1181         Kokkos::single(Kokkos::PerTeam(t), [=]() {
1182           // Only one thread works in a team
1183           PetscInt p, q = Yi(i);
1184           for (p = Xi(i); p < Xi(i + 1); p++) {          // For each nonzero on row i of X,
1185             while (Xj(p) != Yj(q) && q < Yi(i + 1)) q++; // find the matching nonzero on row i of Y.
1186             if (Xj(p) == Yj(q)) {                        // Found it
1187               Ya(q) += alpha * Xa(p);
1188               q++;
1189             } else {
1190             // If not found, it indicates the input is wrong (X is not a SUBSET_NONZERO_PATTERN of Y).
1191             // Just insert a NaN at the beginning of row i if it is not empty, to make the result wrong.
1192 #if PETSC_PKG_KOKKOS_VERSION_GE(3, 7, 0)
1193               if (Yi(i) != Yi(i + 1)) Ya(Yi(i)) = Kokkos::ArithTraits<PetscScalar>::nan();
1194 #else
1195               if (Yi(i) != Yi(i + 1)) Ya(Yi(i)) = Kokkos::Experimental::nan("1");
1196 #endif
1197             }
1198           }
1199         });
1200       });
1201     PetscCall(MatSeqAIJKokkosModifyDevice(Y));
1202   } else { // different nonzero patterns
1203     Mat             Z;
1204     KokkosCsrMatrix zcsr;
1205     KernelHandle    kh;
1206     kh.create_spadd_handle(true); // X, Y are sorted
1207     KokkosSparse::spadd_symbolic(&kh, xkok->csrmat, ykok->csrmat, zcsr);
1208     KokkosSparse::spadd_numeric(&kh, alpha, xkok->csrmat, (PetscScalar)1.0, ykok->csrmat, zcsr);
1209     zkok = new Mat_SeqAIJKokkos(zcsr);
1210     PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, zkok, &Z));
1211     PetscCall(MatHeaderReplace(Y, &Z));
1212     kh.destroy_spadd_handle();
1213   }
1214   PetscCall(PetscLogGpuTimeEnd());
1215   PetscCall(PetscLogGpuFlops(xkok->a_dual.extent(0) * 2)); // Because we scaled X and then added it to Y
1216   PetscFunctionReturn(PETSC_SUCCESS);
1217 }
1218 
1219 static PetscErrorCode MatSetPreallocationCOO_SeqAIJKokkos(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
1220 {
1221   Mat_SeqAIJKokkos *akok;
1222   Mat_SeqAIJ       *aseq;
1223 
1224   PetscFunctionBegin;
1225   PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, coo_i, coo_j));
1226   aseq = static_cast<Mat_SeqAIJ *>(mat->data);
1227   akok = static_cast<Mat_SeqAIJKokkos *>(mat->spptr);
1228   delete akok;
1229   mat->spptr = akok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, aseq->nz, aseq->i, aseq->j, aseq->a, mat->nonzerostate + 1, PETSC_FALSE);
1230   PetscCall(MatZeroEntries_SeqAIJKokkos(mat));
1231   akok->SetUpCOO(aseq);
1232   PetscFunctionReturn(PETSC_SUCCESS);
1233 }
1234 
1235 static PetscErrorCode MatSetValuesCOO_SeqAIJKokkos(Mat A, const PetscScalar v[], InsertMode imode)
1236 {
1237   Mat_SeqAIJ                 *aseq = static_cast<Mat_SeqAIJ *>(A->data);
1238   Mat_SeqAIJKokkos           *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1239   PetscCount                  Annz = aseq->nz;
1240   const PetscCountKokkosView &jmap = akok->jmap_d;
1241   const PetscCountKokkosView &perm = akok->perm_d;
1242   MatScalarKokkosView         Aa;
1243   ConstMatScalarKokkosView    kv;
1244   PetscMemType                memtype;
1245 
1246   PetscFunctionBegin;
1247   PetscCall(PetscGetMemType(v, &memtype));
1248   if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */
1249     kv = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), ConstMatScalarKokkosViewHost(v, aseq->coo_n));
1250   } else {
1251     kv = ConstMatScalarKokkosView(v, aseq->coo_n); /* Directly use v[]'s memory */
1252   }
1253 
1254   if (imode == INSERT_VALUES) PetscCall(MatSeqAIJGetKokkosViewWrite(A, &Aa)); /* write matrix values */
1255   else PetscCall(MatSeqAIJGetKokkosView(A, &Aa));                             /* read & write matrix values */
1256 
1257   Kokkos::parallel_for(
1258     Annz, KOKKOS_LAMBDA(const PetscCount i) {
1259       PetscScalar sum = 0.0;
1260       for (PetscCount k = jmap(i); k < jmap(i + 1); k++) sum += kv(perm(k));
1261       Aa(i) = (imode == INSERT_VALUES ? 0.0 : Aa(i)) + sum;
1262     });
1263 
1264   if (imode == INSERT_VALUES) PetscCall(MatSeqAIJRestoreKokkosViewWrite(A, &Aa));
1265   else PetscCall(MatSeqAIJRestoreKokkosView(A, &Aa));
1266   PetscFunctionReturn(PETSC_SUCCESS);
1267 }
1268 
1269 PETSC_INTERN PetscErrorCode MatSeqAIJMoveDiagonalValuesFront_SeqAIJKokkos(Mat A, const PetscInt *diag)
1270 {
1271   Mat_SeqAIJKokkos          *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1272   MatScalarKokkosView        Aa;
1273   const MatRowMapKokkosView &Ai = akok->i_dual.view_device();
1274   PetscInt                   m  = A->rmap->n;
1275   ConstMatRowMapKokkosView   Adiag(diag, m); /* diag is a device pointer */
1276 
1277   PetscFunctionBegin;
1278   PetscCall(MatSeqAIJGetKokkosViewWrite(A, &Aa));
1279   Kokkos::parallel_for(
1280     m, KOKKOS_LAMBDA(const PetscInt i) {
1281       PetscScalar tmp;
1282       if (Adiag(i) >= Ai(i) && Adiag(i) < Ai(i + 1)) { /* The diagonal element exists */
1283         tmp          = Aa(Ai(i));
1284         Aa(Ai(i))    = Aa(Adiag(i));
1285         Aa(Adiag(i)) = tmp;
1286       }
1287     });
1288   PetscCall(MatSeqAIJRestoreKokkosViewWrite(A, &Aa));
1289   PetscFunctionReturn(PETSC_SUCCESS);
1290 }
1291 
1292 static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info)
1293 {
1294   PetscFunctionBegin;
1295   PetscCall(MatSeqAIJKokkosSyncHost(A));
1296   PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info));
1297   B->offloadmask = PETSC_OFFLOAD_CPU;
1298   PetscFunctionReturn(PETSC_SUCCESS);
1299 }
1300 
1301 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A)
1302 {
1303   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
1304 
1305   PetscFunctionBegin;
1306   A->offloadmask = PETSC_OFFLOAD_KOKKOS; /* We do not really use this flag */
1307   A->boundtocpu  = PETSC_FALSE;
1308 
1309   A->ops->assemblyend               = MatAssemblyEnd_SeqAIJKokkos;
1310   A->ops->destroy                   = MatDestroy_SeqAIJKokkos;
1311   A->ops->duplicate                 = MatDuplicate_SeqAIJKokkos;
1312   A->ops->axpy                      = MatAXPY_SeqAIJKokkos;
1313   A->ops->scale                     = MatScale_SeqAIJKokkos;
1314   A->ops->zeroentries               = MatZeroEntries_SeqAIJKokkos;
1315   A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJKokkos;
1316   A->ops->mult                      = MatMult_SeqAIJKokkos;
1317   A->ops->multadd                   = MatMultAdd_SeqAIJKokkos;
1318   A->ops->multtranspose             = MatMultTranspose_SeqAIJKokkos;
1319   A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJKokkos;
1320   A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJKokkos;
1321   A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos;
1322   A->ops->productnumeric            = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos;
1323   A->ops->transpose                 = MatTranspose_SeqAIJKokkos;
1324   A->ops->setoption                 = MatSetOption_SeqAIJKokkos;
1325   A->ops->getdiagonal               = MatGetDiagonal_SeqAIJKokkos;
1326   a->ops->getarray                  = MatSeqAIJGetArray_SeqAIJKokkos;
1327   a->ops->restorearray              = MatSeqAIJRestoreArray_SeqAIJKokkos;
1328   a->ops->getarrayread              = MatSeqAIJGetArrayRead_SeqAIJKokkos;
1329   a->ops->restorearrayread          = MatSeqAIJRestoreArrayRead_SeqAIJKokkos;
1330   a->ops->getarraywrite             = MatSeqAIJGetArrayWrite_SeqAIJKokkos;
1331   a->ops->restorearraywrite         = MatSeqAIJRestoreArrayWrite_SeqAIJKokkos;
1332   a->ops->getcsrandmemtype          = MatSeqAIJGetCSRAndMemType_SeqAIJKokkos;
1333 
1334   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJKokkos));
1335   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJKokkos));
1336   PetscFunctionReturn(PETSC_SUCCESS);
1337 }
1338 
1339 PETSC_INTERN PetscErrorCode MatSetSeqAIJKokkosWithCSRMatrix(Mat A, Mat_SeqAIJKokkos *akok)
1340 {
1341   Mat_SeqAIJ *aseq;
1342   PetscInt    i, m, n;
1343 
1344   PetscFunctionBegin;
1345   PetscCheck(!A->spptr, PETSC_COMM_SELF, PETSC_ERR_PLIB, "A->spptr is supposed to be empty");
1346 
1347   m = akok->nrows();
1348   n = akok->ncols();
1349   PetscCall(MatSetSizes(A, m, n, m, n));
1350   PetscCall(MatSetType(A, MATSEQAIJKOKKOS));
1351 
1352   /* Set up data structures of A as a MATSEQAIJ */
1353   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(A, MAT_SKIP_ALLOCATION, NULL));
1354   aseq = (Mat_SeqAIJ *)(A)->data;
1355 
1356   akok->i_dual.sync_host(); /* We always need sync'ed i, j on host */
1357   akok->j_dual.sync_host();
1358 
1359   aseq->i            = akok->i_host_data();
1360   aseq->j            = akok->j_host_data();
1361   aseq->a            = akok->a_host_data();
1362   aseq->nonew        = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/
1363   aseq->singlemalloc = PETSC_FALSE;
1364   aseq->free_a       = PETSC_FALSE;
1365   aseq->free_ij      = PETSC_FALSE;
1366   aseq->nz           = akok->nnz();
1367   aseq->maxnz        = aseq->nz;
1368 
1369   PetscCall(PetscMalloc1(m, &aseq->imax));
1370   PetscCall(PetscMalloc1(m, &aseq->ilen));
1371   for (i = 0; i < m; i++) aseq->ilen[i] = aseq->imax[i] = aseq->i[i + 1] - aseq->i[i];
1372 
1373   /* It is critical to set the nonzerostate, as we use it to check if sparsity pattern (hence data) has changed on host in MatAssemblyEnd */
1374   akok->nonzerostate = A->nonzerostate;
1375   A->spptr           = akok; /* Set A->spptr before MatAssembly so that A->spptr won't be allocated again there */
1376   PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY));
1377   PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY));
1378   PetscFunctionReturn(PETSC_SUCCESS);
1379 }
1380 
1381 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGetKokkosCsrMatrix(Mat A, KokkosCsrMatrix *csr)
1382 {
1383   PetscFunctionBegin;
1384   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1385   *csr = static_cast<Mat_SeqAIJKokkos *>(A->spptr)->csrmat;
1386   PetscFunctionReturn(PETSC_SUCCESS);
1387 }
1388 
1389 PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithKokkosCsrMatrix(MPI_Comm comm, KokkosCsrMatrix csr, Mat *A)
1390 {
1391   Mat_SeqAIJKokkos *akok;
1392   PetscFunctionBegin;
1393   PetscCallCXX(akok = new Mat_SeqAIJKokkos(csr));
1394   PetscCall(MatCreate(comm, A));
1395   PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok));
1396   PetscFunctionReturn(PETSC_SUCCESS);
1397 }
1398 
1399 /* Crete a SEQAIJKOKKOS matrix with a Mat_SeqAIJKokkos data structure
1400 
1401    Note we have names like MatSeqAIJSetPreallocationCSR, so I use capitalized CSR
1402  */
1403 PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithCSRMatrix(MPI_Comm comm, Mat_SeqAIJKokkos *akok, Mat *A)
1404 {
1405   PetscFunctionBegin;
1406   PetscCall(MatCreate(comm, A));
1407   PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok));
1408   PetscFunctionReturn(PETSC_SUCCESS);
1409 }
1410 
1411 /*@C
1412    MatCreateSeqAIJKokkos - Creates a sparse matrix in `MATSEQAIJKOKKOS` (compressed row) format
1413    (the default parallel PETSc format). This matrix will ultimately be handled by
1414    Kokkos for calculations.
1415 
1416    Collective
1417 
1418    Input Parameters:
1419 +  comm - MPI communicator, set to `PETSC_COMM_SELF`
1420 .  m - number of rows
1421 .  n - number of columns
1422 .  nz - number of nonzeros per row (same for all rows), ignored if `nnz` is provided
1423 -  nnz - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL`
1424 
1425    Output Parameter:
1426 .  A - the matrix
1427 
1428    Level: intermediate
1429 
1430    Notes:
1431    It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
1432    MatXXXXSetPreallocation() paradgm instead of this routine directly.
1433    [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]
1434 
1435    The AIJ format, also called
1436    compressed row storage, is fully compatible with standard Fortran
1437    storage.  That is, the stored row and column indices can begin at
1438    either one (as in Fortran) or zero.
1439 
1440    Specify the preallocated storage with either `nz` or `nnz` (not both).
1441    Set `nz` = `PETSC_DEFAULT` and `nnz` = `NULL` for PETSc to control dynamic memory
1442    allocation.
1443 
1444 .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MatCreateAIJ()`
1445 @*/
1446 PetscErrorCode MatCreateSeqAIJKokkos(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A)
1447 {
1448   PetscFunctionBegin;
1449   PetscCall(PetscKokkosInitializeCheck());
1450   PetscCall(MatCreate(comm, A));
1451   PetscCall(MatSetSizes(*A, m, n, m, n));
1452   PetscCall(MatSetType(*A, MATSEQAIJKOKKOS));
1453   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz));
1454   PetscFunctionReturn(PETSC_SUCCESS);
1455 }
1456 
1457 typedef Kokkos::TeamPolicy<>::member_type team_member;
1458 //
1459 // This factorization exploits block diagonal matrices with "Nf" (not used).
1460 // Use -pc_factor_mat_ordering_type rcm to order decouple blocks of size N/Nf for this optimization
1461 //
1462 static PetscErrorCode MatLUFactorNumeric_SeqAIJKOKKOSDEVICE(Mat B, Mat A, const MatFactorInfo *info)
1463 {
1464   Mat_SeqAIJ       *b      = (Mat_SeqAIJ *)B->data;
1465   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr), *baijkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
1466   IS                isrow = b->row, isicol = b->icol;
1467   const PetscInt   *r_h, *ic_h;
1468   const PetscInt n = A->rmap->n, *ai_d = aijkok->i_dual.view_device().data(), *aj_d = aijkok->j_dual.view_device().data(), *bi_d = baijkok->i_dual.view_device().data(), *bj_d = baijkok->j_dual.view_device().data(), *bdiag_d = baijkok->diag_d.data();
1469   const PetscScalar *aa_d = aijkok->a_dual.view_device().data();
1470   PetscScalar       *ba_d = baijkok->a_dual.view_device().data();
1471   PetscBool          row_identity, col_identity;
1472   PetscInt           nc, Nf = 1, nVec = 32; // should be a parameter, Nf is batch size - not used
1473 
1474   PetscFunctionBegin;
1475   PetscCheck(A->rmap->n == n, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "square matrices only supported %" PetscInt_FMT " %" PetscInt_FMT, A->rmap->n, n);
1476   PetscCall(MatIsStructurallySymmetric(A, &row_identity));
1477   PetscCheck(row_identity, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "structurally symmetric matrices only supported");
1478   PetscCall(ISGetIndices(isrow, &r_h));
1479   PetscCall(ISGetIndices(isicol, &ic_h));
1480   PetscCall(ISGetSize(isicol, &nc));
1481   PetscCall(PetscLogGpuTimeBegin());
1482   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1483   {
1484 #define KOKKOS_SHARED_LEVEL 1
1485     using scr_mem_t    = Kokkos::DefaultExecutionSpace::scratch_memory_space;
1486     using sizet_scr_t  = Kokkos::View<size_t, scr_mem_t>;
1487     using scalar_scr_t = Kokkos::View<PetscScalar, scr_mem_t>;
1488     const Kokkos::View<const PetscInt *, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged>> h_r_k(r_h, n);
1489     Kokkos::View<PetscInt *, Kokkos::LayoutLeft>                                                                         d_r_k("r", n);
1490     const Kokkos::View<const PetscInt *, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged>> h_ic_k(ic_h, nc);
1491     Kokkos::View<PetscInt *, Kokkos::LayoutLeft>                                                                         d_ic_k("ic", nc);
1492     size_t                                                                                                               flops_h = 0.0;
1493     Kokkos::View<size_t, Kokkos::HostSpace>                                                                              h_flops_k(&flops_h);
1494     Kokkos::View<size_t>                                                                                                 d_flops_k("flops");
1495     const int                                                                                                            conc = Kokkos::DefaultExecutionSpace().concurrency(), team_size = conc > 1 ? 16 : 1; // 8*32 = 256
1496     const int                                                                                                            nloc = n / Nf, Ni = (conc > 8) ? 1 /* some intelligent number of SMs -- but need league_barrier */ : 1;
1497     Kokkos::deep_copy(d_flops_k, h_flops_k);
1498     Kokkos::deep_copy(d_r_k, h_r_k);
1499     Kokkos::deep_copy(d_ic_k, h_ic_k);
1500     // Fill A --> fact
1501     Kokkos::parallel_for(
1502       Kokkos::TeamPolicy<>(Nf * Ni, team_size, nVec), KOKKOS_LAMBDA(const team_member team) {
1503         const PetscInt  field = team.league_rank() / Ni, field_block = team.league_rank() % Ni; // use grid.x/y in CUDA
1504         const PetscInt  nloc_i = (nloc / Ni + !!(nloc % Ni)), start_i = field * nloc + field_block * nloc_i, end_i = (start_i + nloc_i) > (field + 1) * nloc ? (field + 1) * nloc : (start_i + nloc_i);
1505         const PetscInt *ic = d_ic_k.data(), *r = d_r_k.data();
1506         // zero rows of B
1507         Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=](const int &rowb) {
1508           PetscInt     nzbL = bi_d[rowb + 1] - bi_d[rowb], nzbU = bdiag_d[rowb] - bdiag_d[rowb + 1]; // with diag
1509           PetscScalar *baL = ba_d + bi_d[rowb];
1510           PetscScalar *baU = ba_d + bdiag_d[rowb + 1] + 1;
1511           /* zero (unfactored row) */
1512           for (int j = 0; j < nzbL; j++) baL[j] = 0;
1513           for (int j = 0; j < nzbU; j++) baU[j] = 0;
1514         });
1515         // copy A into B
1516         Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=](const int &rowb) {
1517           PetscInt           rowa = r[rowb], nza = ai_d[rowa + 1] - ai_d[rowa];
1518           const PetscScalar *av    = aa_d + ai_d[rowa];
1519           const PetscInt    *ajtmp = aj_d + ai_d[rowa];
1520           /* load in initial (unfactored row) */
1521           for (int j = 0; j < nza; j++) {
1522             PetscInt    colb = ic[ajtmp[j]];
1523             PetscScalar vala = av[j];
1524             if (colb == rowb) {
1525               *(ba_d + bdiag_d[rowb]) = vala;
1526             } else {
1527               const PetscInt *pbj = bj_d + ((colb > rowb) ? bdiag_d[rowb + 1] + 1 : bi_d[rowb]);
1528               PetscScalar    *pba = ba_d + ((colb > rowb) ? bdiag_d[rowb + 1] + 1 : bi_d[rowb]);
1529               PetscInt        nz = (colb > rowb) ? bdiag_d[rowb] - (bdiag_d[rowb + 1] + 1) : bi_d[rowb + 1] - bi_d[rowb], set = 0;
1530               for (int j = 0; j < nz; j++) {
1531                 if (pbj[j] == colb) {
1532                   pba[j] = vala;
1533                   set++;
1534                   break;
1535                 }
1536               }
1537 #if !defined(PETSC_HAVE_SYCL)
1538               if (set != 1) printf("\t\t\t ERROR DID NOT SET ?????\n");
1539 #else
1540               (void)set;
1541 #endif
1542             }
1543           }
1544         });
1545       });
1546     Kokkos::fence();
1547 
1548     Kokkos::parallel_for(
1549       Kokkos::TeamPolicy<>(Nf * Ni, team_size, nVec).set_scratch_size(KOKKOS_SHARED_LEVEL, Kokkos::PerThread(sizet_scr_t::shmem_size() + scalar_scr_t::shmem_size()), Kokkos::PerTeam(sizet_scr_t::shmem_size())), KOKKOS_LAMBDA(const team_member team) {
1550         sizet_scr_t    colkIdx(team.thread_scratch(KOKKOS_SHARED_LEVEL));
1551         scalar_scr_t   L_ki(team.thread_scratch(KOKKOS_SHARED_LEVEL));
1552         sizet_scr_t    flops(team.team_scratch(KOKKOS_SHARED_LEVEL));
1553         const PetscInt field = team.league_rank() / Ni, field_block_idx = team.league_rank() % Ni; // use grid.x/y in CUDA
1554         const PetscInt start = field * nloc, end = start + nloc;
1555         Kokkos::single(Kokkos::PerTeam(team), [=]() { flops() = 0; });
1556         // A22 panel update for each row A(1,:) and col A(:,1)
1557         for (int ii = start; ii < end - 1; ii++) {
1558           const PetscInt    *bjUi = bj_d + bdiag_d[ii + 1] + 1, nzUi = bdiag_d[ii] - (bdiag_d[ii + 1] + 1); // vector, and vector size, of column indices of U(i,(i+1):end)
1559           const PetscScalar *baUi    = ba_d + bdiag_d[ii + 1] + 1;                                          // vector of data  U(i,i+1:end)
1560           const PetscInt     nUi_its = nzUi / Ni + !!(nzUi % Ni);
1561           const PetscScalar  Bii     = *(ba_d + bdiag_d[ii]); // diagonal in its special place
1562           Kokkos::parallel_for(Kokkos::TeamThreadRange(team, nUi_its), [=](const int j) {
1563             PetscInt kIdx = j * Ni + field_block_idx;
1564             if (kIdx >= nzUi) /* void */
1565               ;
1566             else {
1567               const PetscInt  myk = bjUi[kIdx];                // assume symmetric structure, need a transposed meta-data here in general
1568               const PetscInt *pjL = bj_d + bi_d[myk];          // look for L(myk,ii) in start of row
1569               const PetscInt  nzL = bi_d[myk + 1] - bi_d[myk]; // size of L_k(:)
1570               size_t          st_idx;
1571               // find and do L(k,i) = A(:k,i) / A(i,i)
1572               Kokkos::single(Kokkos::PerThread(team), [&]() { colkIdx() = PETSC_MAX_INT; });
1573               // get column, there has got to be a better way
1574               Kokkos::parallel_reduce(
1575                 Kokkos::ThreadVectorRange(team, nzL),
1576                 [&](const int &j, size_t &idx) {
1577                   if (pjL[j] == ii) {
1578                     PetscScalar *pLki = ba_d + bi_d[myk] + j;
1579                     idx               = j;           // output
1580                     *pLki             = *pLki / Bii; // column scaling:  L(k,i) = A(:k,i) / A(i,i)
1581                   }
1582                 },
1583                 st_idx);
1584               Kokkos::single(Kokkos::PerThread(team), [=]() {
1585                 colkIdx() = st_idx;
1586                 L_ki()    = *(ba_d + bi_d[myk] + st_idx);
1587               });
1588 #if defined(PETSC_USE_DEBUG) && !defined(PETSC_HAVE_SYCL)
1589               if (colkIdx() == PETSC_MAX_INT) printf("\t\t\t\t\t\t\tERROR: failed to find L_ki(%d,%d)\n", (int)myk, ii); // uses a register
1590 #endif
1591               // active row k, do  A_kj -= Lki * U_ij; j \in U(i,:) j != i
1592               // U(i+1,:end)
1593               Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, nzUi), [=](const int &uiIdx) { // index into i (U)
1594                 PetscScalar Uij = baUi[uiIdx];
1595                 PetscInt    col = bjUi[uiIdx];
1596                 if (col == myk) {
1597                   // A_kk = A_kk - L_ki * U_ij(k)
1598                   PetscScalar *Akkv = (ba_d + bdiag_d[myk]); // diagonal in its special place
1599                   *Akkv             = *Akkv - L_ki() * Uij;  // UiK
1600                 } else {
1601                   PetscScalar    *start, *end, *pAkjv = NULL;
1602                   PetscInt        high, low;
1603                   const PetscInt *startj;
1604                   if (col < myk) { // L
1605                     PetscScalar *pLki = ba_d + bi_d[myk] + colkIdx();
1606                     PetscInt     idx  = (pLki + 1) - (ba_d + bi_d[myk]); // index into row
1607                     start             = pLki + 1;                        // start at pLki+1, A22(myk,1)
1608                     startj            = bj_d + bi_d[myk] + idx;
1609                     end               = ba_d + bi_d[myk + 1];
1610                   } else {
1611                     PetscInt idx = bdiag_d[myk + 1] + 1;
1612                     start        = ba_d + idx;
1613                     startj       = bj_d + idx;
1614                     end          = ba_d + bdiag_d[myk];
1615                   }
1616                   // search for 'col', use bisection search - TODO
1617                   low  = 0;
1618                   high = (PetscInt)(end - start);
1619                   while (high - low > 5) {
1620                     int t = (low + high) / 2;
1621                     if (startj[t] > col) high = t;
1622                     else low = t;
1623                   }
1624                   for (pAkjv = start + low; pAkjv < start + high; pAkjv++) {
1625                     if (startj[pAkjv - start] == col) break;
1626                   }
1627 #if defined(PETSC_USE_DEBUG) && !defined(PETSC_HAVE_SYCL)
1628                   if (pAkjv == start + high) printf("\t\t\t\t\t\t\t\t\t\t\tERROR: *** failed to find Akj(%d,%d)\n", (int)myk, (int)col); // uses a register
1629 #endif
1630                   *pAkjv = *pAkjv - L_ki() * Uij; // A_kj = A_kj - L_ki * U_ij
1631                 }
1632               });
1633             }
1634           });
1635           team.team_barrier(); // this needs to be a league barrier to use more that one SM per block
1636           if (field_block_idx == 0) Kokkos::single(Kokkos::PerTeam(team), [&]() { Kokkos::atomic_add(flops.data(), (size_t)(2 * (nzUi * nzUi) + 2)); });
1637         } /* endof for (i=0; i<n; i++) { */
1638         Kokkos::single(Kokkos::PerTeam(team), [=]() {
1639           Kokkos::atomic_add(&d_flops_k(), flops());
1640           flops() = 0;
1641         });
1642       });
1643     Kokkos::fence();
1644     Kokkos::deep_copy(h_flops_k, d_flops_k);
1645     PetscCall(PetscLogGpuFlops((PetscLogDouble)h_flops_k()));
1646     Kokkos::parallel_for(
1647       Kokkos::TeamPolicy<>(Nf * Ni, 1, 256), KOKKOS_LAMBDA(const team_member team) {
1648         const PetscInt lg_rank = team.league_rank(), field = lg_rank / Ni;                            //, field_offset = lg_rank%Ni;
1649         const PetscInt start = field * nloc, end = start + nloc, n_its = (nloc / Ni + !!(nloc % Ni)); // 1/Ni iters
1650         /* Invert diagonal for simpler triangular solves */
1651         Kokkos::parallel_for(Kokkos::TeamVectorRange(team, n_its), [=](int outer_index) {
1652           int i = start + outer_index * Ni + lg_rank % Ni;
1653           if (i < end) {
1654             PetscScalar *pv = ba_d + bdiag_d[i];
1655             *pv             = 1.0 / (*pv);
1656           }
1657         });
1658       });
1659   }
1660   PetscCall(PetscLogGpuTimeEnd());
1661   PetscCall(ISRestoreIndices(isicol, &ic_h));
1662   PetscCall(ISRestoreIndices(isrow, &r_h));
1663 
1664   PetscCall(ISIdentity(isrow, &row_identity));
1665   PetscCall(ISIdentity(isicol, &col_identity));
1666   if (b->inode.size) {
1667     B->ops->solve = MatSolve_SeqAIJ_Inode;
1668   } else if (row_identity && col_identity) {
1669     B->ops->solve = MatSolve_SeqAIJ_NaturalOrdering;
1670   } else {
1671     B->ops->solve = MatSolve_SeqAIJ; // at least this needs to be in Kokkos
1672   }
1673   B->offloadmask = PETSC_OFFLOAD_GPU;
1674   PetscCall(MatSeqAIJKokkosSyncHost(B));          // solve on CPU
1675   B->ops->solveadd          = MatSolveAdd_SeqAIJ; // and this
1676   B->ops->solvetranspose    = MatSolveTranspose_SeqAIJ;
1677   B->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ;
1678   B->ops->matsolve          = MatMatSolve_SeqAIJ;
1679   B->assembled              = PETSC_TRUE;
1680   B->preallocated           = PETSC_TRUE;
1681 
1682   PetscFunctionReturn(PETSC_SUCCESS);
1683 }
1684 
1685 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
1686 {
1687   PetscFunctionBegin;
1688   PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
1689   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos;
1690   PetscFunctionReturn(PETSC_SUCCESS);
1691 }
1692 
1693 static PetscErrorCode MatSeqAIJKokkosSymbolicSolveCheck(Mat A)
1694 {
1695   Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1696 
1697   PetscFunctionBegin;
1698   if (!factors->sptrsv_symbolic_completed) {
1699     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khU, factors->iU_d, factors->jU_d, factors->aU_d);
1700     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khL, factors->iL_d, factors->jL_d, factors->aL_d);
1701     factors->sptrsv_symbolic_completed = PETSC_TRUE;
1702   }
1703   PetscFunctionReturn(PETSC_SUCCESS);
1704 }
1705 
1706 /* Check if we need to update factors etc for transpose solve */
1707 static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A)
1708 {
1709   Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1710   MatColIdxType               n       = A->rmap->n;
1711 
1712   PetscFunctionBegin;
1713   if (!factors->transpose_updated) { /* TODO: KK needs to provide functions to do numeric transpose only */
1714     /* Update L^T and do sptrsv symbolic */
1715     factors->iLt_d = MatRowMapKokkosView("factors->iLt_d", n + 1);
1716     Kokkos::deep_copy(factors->iLt_d, 0); /* KK requires 0 */
1717     factors->jLt_d = MatColIdxKokkosView("factors->jLt_d", factors->jL_d.extent(0));
1718     factors->aLt_d = MatScalarKokkosView("factors->aLt_d", factors->aL_d.extent(0));
1719 
1720     transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iL_d, factors->jL_d, factors->aL_d,
1721                                                                                                                                                                                                               factors->iLt_d, factors->jLt_d, factors->aLt_d);
1722 
1723     /* TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices.
1724       We have to sort the indices, until KK provides finer control options.
1725     */
1726     sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iLt_d, factors->jLt_d, factors->aLt_d);
1727 
1728     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d);
1729 
1730     /* Update U^T and do sptrsv symbolic */
1731     factors->iUt_d = MatRowMapKokkosView("factors->iUt_d", n + 1);
1732     Kokkos::deep_copy(factors->iUt_d, 0); /* KK requires 0 */
1733     factors->jUt_d = MatColIdxKokkosView("factors->jUt_d", factors->jU_d.extent(0));
1734     factors->aUt_d = MatScalarKokkosView("factors->aUt_d", factors->aU_d.extent(0));
1735 
1736     transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iU_d, factors->jU_d, factors->aU_d,
1737                                                                                                                                                                                                               factors->iUt_d, factors->jUt_d, factors->aUt_d);
1738 
1739     /* Sort indices. See comments above */
1740     sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iUt_d, factors->jUt_d, factors->aUt_d);
1741 
1742     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d);
1743     factors->transpose_updated = PETSC_TRUE;
1744   }
1745   PetscFunctionReturn(PETSC_SUCCESS);
1746 }
1747 
1748 /* Solve Ax = b, with A = LU */
1749 static PetscErrorCode MatSolve_SeqAIJKokkos(Mat A, Vec b, Vec x)
1750 {
1751   ConstPetscScalarKokkosView  bv;
1752   PetscScalarKokkosView       xv;
1753   Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1754 
1755   PetscFunctionBegin;
1756   PetscCall(PetscLogGpuTimeBegin());
1757   PetscCall(MatSeqAIJKokkosSymbolicSolveCheck(A));
1758   PetscCall(VecGetKokkosView(b, &bv));
1759   PetscCall(VecGetKokkosViewWrite(x, &xv));
1760   /* Solve L tmpv = b */
1761   PetscCallCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khL, factors->iL_d, factors->jL_d, factors->aL_d, bv, factors->workVector));
1762   /* Solve Ux = tmpv */
1763   PetscCallCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khU, factors->iU_d, factors->jU_d, factors->aU_d, factors->workVector, xv));
1764   PetscCall(VecRestoreKokkosView(b, &bv));
1765   PetscCall(VecRestoreKokkosViewWrite(x, &xv));
1766   PetscCall(PetscLogGpuTimeEnd());
1767   PetscFunctionReturn(PETSC_SUCCESS);
1768 }
1769 
1770 /* Solve A^T x = b, where A^T = U^T L^T */
1771 static PetscErrorCode MatSolveTranspose_SeqAIJKokkos(Mat A, Vec b, Vec x)
1772 {
1773   ConstPetscScalarKokkosView  bv;
1774   PetscScalarKokkosView       xv;
1775   Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1776 
1777   PetscFunctionBegin;
1778   PetscCall(PetscLogGpuTimeBegin());
1779   PetscCall(MatSeqAIJKokkosTransposeSolveCheck(A));
1780   PetscCall(VecGetKokkosView(b, &bv));
1781   PetscCall(VecGetKokkosViewWrite(x, &xv));
1782   /* Solve U^T tmpv = b */
1783   KokkosSparse::Experimental::sptrsv_solve(&factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d, bv, factors->workVector);
1784 
1785   /* Solve L^T x = tmpv */
1786   KokkosSparse::Experimental::sptrsv_solve(&factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d, factors->workVector, xv);
1787   PetscCall(VecRestoreKokkosView(b, &bv));
1788   PetscCall(VecRestoreKokkosViewWrite(x, &xv));
1789   PetscCall(PetscLogGpuTimeEnd());
1790   PetscFunctionReturn(PETSC_SUCCESS);
1791 }
1792 
1793 static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info)
1794 {
1795   Mat_SeqAIJKokkos           *aijkok   = (Mat_SeqAIJKokkos *)A->spptr;
1796   Mat_SeqAIJKokkosTriFactors *factors  = (Mat_SeqAIJKokkosTriFactors *)B->spptr;
1797   PetscInt                    fill_lev = info->levels;
1798 
1799   PetscFunctionBegin;
1800   PetscCall(PetscLogGpuTimeBegin());
1801   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1802 
1803   auto a_d = aijkok->a_dual.view_device();
1804   auto i_d = aijkok->i_dual.view_device();
1805   auto j_d = aijkok->j_dual.view_device();
1806 
1807   KokkosSparse::Experimental::spiluk_numeric(&factors->kh, fill_lev, i_d, j_d, a_d, factors->iL_d, factors->jL_d, factors->aL_d, factors->iU_d, factors->jU_d, factors->aU_d);
1808 
1809   B->assembled              = PETSC_TRUE;
1810   B->preallocated           = PETSC_TRUE;
1811   B->ops->solve             = MatSolve_SeqAIJKokkos;
1812   B->ops->solvetranspose    = MatSolveTranspose_SeqAIJKokkos;
1813   B->ops->matsolve          = NULL;
1814   B->ops->matsolvetranspose = NULL;
1815   B->offloadmask            = PETSC_OFFLOAD_GPU;
1816 
1817   /* Once the factors' value changed, we need to update their transpose and sptrsv handle */
1818   factors->transpose_updated         = PETSC_FALSE;
1819   factors->sptrsv_symbolic_completed = PETSC_FALSE;
1820   /* TODO: log flops, but how to know that? */
1821   PetscCall(PetscLogGpuTimeEnd());
1822   PetscFunctionReturn(PETSC_SUCCESS);
1823 }
1824 
1825 static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
1826 {
1827   Mat_SeqAIJKokkos           *aijkok;
1828   Mat_SeqAIJ                 *b;
1829   Mat_SeqAIJKokkosTriFactors *factors  = (Mat_SeqAIJKokkosTriFactors *)B->spptr;
1830   PetscInt                    fill_lev = info->levels;
1831   PetscInt                    nnzA     = ((Mat_SeqAIJ *)A->data)->nz, nnzL, nnzU;
1832   PetscInt                    n        = A->rmap->n;
1833 
1834   PetscFunctionBegin;
1835   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1836   /* Rebuild factors */
1837   if (factors) {
1838     factors->Destroy();
1839   } /* Destroy the old if it exists */
1840   else {
1841     B->spptr = factors = new Mat_SeqAIJKokkosTriFactors(n);
1842   }
1843 
1844   /* Create a spiluk handle and then do symbolic factorization */
1845   nnzL = nnzU = PetscRealIntMultTruncate(info->fill, nnzA);
1846   factors->kh.create_spiluk_handle(KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1, n, nnzL, nnzU);
1847 
1848   auto spiluk_handle = factors->kh.get_spiluk_handle();
1849 
1850   Kokkos::realloc(factors->iL_d, n + 1); /* Free old arrays and realloc */
1851   Kokkos::realloc(factors->jL_d, spiluk_handle->get_nnzL());
1852   Kokkos::realloc(factors->iU_d, n + 1);
1853   Kokkos::realloc(factors->jU_d, spiluk_handle->get_nnzU());
1854 
1855   aijkok   = (Mat_SeqAIJKokkos *)A->spptr;
1856   auto i_d = aijkok->i_dual.view_device();
1857   auto j_d = aijkok->j_dual.view_device();
1858   KokkosSparse::Experimental::spiluk_symbolic(&factors->kh, fill_lev, i_d, j_d, factors->iL_d, factors->jL_d, factors->iU_d, factors->jU_d);
1859   /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */
1860 
1861   Kokkos::resize(factors->jL_d, spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */
1862   Kokkos::resize(factors->jU_d, spiluk_handle->get_nnzU());
1863   Kokkos::realloc(factors->aL_d, spiluk_handle->get_nnzL()); /* No need to retain old value */
1864   Kokkos::realloc(factors->aU_d, spiluk_handle->get_nnzU());
1865 
1866   /* TODO: add options to select sptrsv algorithms */
1867   /* Create sptrsv handles for L, U and their transpose */
1868 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
1869   auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SPTRSV_CUSPARSE;
1870 #else
1871   auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1;
1872 #endif
1873 
1874   factors->khL.create_sptrsv_handle(sptrsv_alg, n, true /* L is lower tri */);
1875   factors->khU.create_sptrsv_handle(sptrsv_alg, n, false /* U is not lower tri */);
1876   factors->khLt.create_sptrsv_handle(sptrsv_alg, n, false /* L^T is not lower tri */);
1877   factors->khUt.create_sptrsv_handle(sptrsv_alg, n, true /* U^T is lower tri */);
1878 
1879   /* Fill fields of the factor matrix B */
1880   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(B, MAT_SKIP_ALLOCATION, NULL));
1881   b     = (Mat_SeqAIJ *)B->data;
1882   b->nz = b->maxnz          = spiluk_handle->get_nnzL() + spiluk_handle->get_nnzU();
1883   B->info.fill_ratio_given  = info->fill;
1884   B->info.fill_ratio_needed = nnzA > 0 ? ((PetscReal)b->nz) / ((PetscReal)nnzA) : 1.0;
1885 
1886   B->offloadmask          = PETSC_OFFLOAD_GPU;
1887   B->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJKokkos;
1888   PetscFunctionReturn(PETSC_SUCCESS);
1889 }
1890 
1891 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
1892 {
1893   Mat_SeqAIJ    *b     = (Mat_SeqAIJ *)B->data;
1894   const PetscInt nrows = A->rmap->n;
1895 
1896   PetscFunctionBegin;
1897   PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
1898   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKOKKOSDEVICE;
1899   // move B data into Kokkos
1900   PetscCall(MatSeqAIJKokkosSyncDevice(B)); // create aijkok
1901   PetscCall(MatSeqAIJKokkosSyncDevice(A)); // create aijkok
1902   {
1903     Mat_SeqAIJKokkos *baijkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
1904     if (!baijkok->diag_d.extent(0)) {
1905       const Kokkos::View<PetscInt *, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged>> h_diag(b->diag, nrows + 1);
1906       baijkok->diag_d = Kokkos::View<PetscInt *>(Kokkos::create_mirror(DefaultMemorySpace(), h_diag));
1907       Kokkos::deep_copy(baijkok->diag_d, h_diag);
1908     }
1909   }
1910   PetscFunctionReturn(PETSC_SUCCESS);
1911 }
1912 
1913 static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos(Mat A, MatSolverType *type)
1914 {
1915   PetscFunctionBegin;
1916   *type = MATSOLVERKOKKOS;
1917   PetscFunctionReturn(PETSC_SUCCESS);
1918 }
1919 
1920 static PetscErrorCode MatFactorGetSolverType_seqaij_kokkos_device(Mat A, MatSolverType *type)
1921 {
1922   PetscFunctionBegin;
1923   *type = MATSOLVERKOKKOSDEVICE;
1924   PetscFunctionReturn(PETSC_SUCCESS);
1925 }
1926 
1927 /*MC
1928   MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices
1929   on a single GPU of type, `MATSEQAIJKOKKOS`, `MATAIJKOKKOS`.
1930 
1931   Level: beginner
1932 
1933 .seealso: [](ch_matrices), `Mat`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJKokkos()`, `MATAIJKOKKOS`, `MatKokkosSetFormat()`, `MatKokkosStorageFormat`, `MatKokkosFormatOperation`
1934 M*/
1935 PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A, MatFactorType ftype, Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */
1936 {
1937   PetscInt n = A->rmap->n;
1938 
1939   PetscFunctionBegin;
1940   PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B));
1941   PetscCall(MatSetSizes(*B, n, n, n, n));
1942   (*B)->factortype = ftype;
1943   PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU]));
1944   PetscCall(MatSetType(*B, MATSEQAIJKOKKOS));
1945 
1946   if (ftype == MAT_FACTOR_LU) {
1947     PetscCall(MatSetBlockSizesFromMats(*B, A, A));
1948     (*B)->canuseordering        = PETSC_TRUE;
1949     (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKokkos;
1950   } else if (ftype == MAT_FACTOR_ILU) {
1951     PetscCall(MatSetBlockSizesFromMats(*B, A, A));
1952     (*B)->canuseordering         = PETSC_FALSE;
1953     (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos;
1954   } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]);
1955 
1956   PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL));
1957   PetscCall(PetscObjectComposeFunction((PetscObject)(*B), "MatFactorGetSolverType_C", MatFactorGetSolverType_SeqAIJKokkos));
1958   PetscFunctionReturn(PETSC_SUCCESS);
1959 }
1960 
1961 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijkokkos_kokkos_device(Mat A, MatFactorType ftype, Mat *B)
1962 {
1963   PetscInt n = A->rmap->n;
1964 
1965   PetscFunctionBegin;
1966   PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B));
1967   PetscCall(MatSetSizes(*B, n, n, n, n));
1968   (*B)->factortype     = ftype;
1969   (*B)->canuseordering = PETSC_TRUE;
1970   PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU]));
1971   PetscCall(MatSetType(*B, MATSEQAIJKOKKOS));
1972 
1973   if (ftype == MAT_FACTOR_LU) {
1974     PetscCall(MatSetBlockSizesFromMats(*B, A, A));
1975     (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE;
1976   } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Factor type not supported for KOKKOS Matrix Types");
1977 
1978   PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL));
1979   PetscCall(PetscObjectComposeFunction((PetscObject)(*B), "MatFactorGetSolverType_C", MatFactorGetSolverType_seqaij_kokkos_device));
1980   PetscFunctionReturn(PETSC_SUCCESS);
1981 }
1982 
1983 PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_KOKKOS(void)
1984 {
1985   PetscFunctionBegin;
1986   PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_LU, MatGetFactor_SeqAIJKokkos_Kokkos));
1987   PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_ILU, MatGetFactor_SeqAIJKokkos_Kokkos));
1988   PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOSDEVICE, MATSEQAIJKOKKOS, MAT_FACTOR_LU, MatGetFactor_seqaijkokkos_kokkos_device));
1989   PetscFunctionReturn(PETSC_SUCCESS);
1990 }
1991 
1992 /* Utility to print out a KokkosCsrMatrix for debugging */
1993 PETSC_INTERN PetscErrorCode PrintCsrMatrix(const KokkosCsrMatrix &csrmat)
1994 {
1995   const auto        &iv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), csrmat.graph.row_map);
1996   const auto        &jv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), csrmat.graph.entries);
1997   const auto        &av = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), csrmat.values);
1998   const PetscInt    *i  = iv.data();
1999   const PetscInt    *j  = jv.data();
2000   const PetscScalar *a  = av.data();
2001   PetscInt           m = csrmat.numRows(), n = csrmat.numCols(), nnz = csrmat.nnz();
2002 
2003   PetscFunctionBegin;
2004   PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT " x %" PetscInt_FMT " SeqAIJKokkos, with %" PetscInt_FMT " nonzeros\n", m, n, nnz));
2005   for (PetscInt k = 0; k < m; k++) {
2006     PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT ": ", k));
2007     for (PetscInt p = i[k]; p < i[k + 1]; p++) PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT "(%.1f), ", j[p], (double)PetscRealPart(a[p])));
2008     PetscCall(PetscPrintf(PETSC_COMM_SELF, "\n"));
2009   }
2010   PetscFunctionReturn(PETSC_SUCCESS);
2011 }
2012