xref: /petsc/src/mat/impls/aij/seq/kokkos/aijkok.kokkos.cxx (revision 12e0ef65d2cdd9d3d28bcc1c862df70a50779cf5)
1 #include <petscvec_kokkos.hpp>
2 #include <petscpkg_version.h>
3 #include <petsc/private/petscimpl.h>
4 #include <petsc/private/sfimpl.h>
5 #include <petscsystypes.h>
6 #include <petscerror.h>
7 
8 #include <Kokkos_Core.hpp>
9 #include <KokkosBlas.hpp>
10 #include <KokkosSparse_CrsMatrix.hpp>
11 #include <KokkosSparse_spmv.hpp>
12 #include <KokkosSparse_spiluk.hpp>
13 #include <KokkosSparse_sptrsv.hpp>
14 #include <KokkosSparse_spgemm.hpp>
15 #include <KokkosSparse_spadd.hpp>
16 
17 #include <../src/mat/impls/aij/seq/kokkos/aijkok.hpp>
18 
19 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_GE(3, 7, 0)
20   #include <KokkosSparse_Utils.hpp>
21 using KokkosSparse::sort_crs_matrix;
22 using KokkosSparse::Impl::transpose_matrix;
23 #else
24   #include <KokkosKernels_Sorting.hpp>
25 using KokkosKernels::sort_crs_matrix;
26 using KokkosKernels::Impl::transpose_matrix;
27 #endif
28 
29 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */
30 
31 /* MatAssemblyEnd_SeqAIJKokkos() happens when we finalized nonzeros of the matrix, either after
32    we assembled the matrix on host, or after we directly produced the matrix data on device (ex., through MatMatMult).
33    In the latter case, it is important to set a_dual's sync state correctly.
34  */
35 static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A, MatAssemblyType mode)
36 {
37   Mat_SeqAIJ       *aijseq;
38   Mat_SeqAIJKokkos *aijkok;
39 
40   PetscFunctionBegin;
41   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS);
42   PetscCall(MatAssemblyEnd_SeqAIJ(A, mode));
43 
44   aijseq = static_cast<Mat_SeqAIJ *>(A->data);
45   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
46 
47   /* If aijkok does not exist, we just copy i, j to device.
48      If aijkok already exists, but the device's nonzero pattern does not match with the host's, we assume the latest data is on host.
49      In both cases, we build a new aijkok structure.
50   */
51   if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { /* aijkok might not exist yet or nonzero pattern has changed */
52     delete aijkok;
53     aijkok   = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aijseq->nz, aijseq->i, aijseq->j, aijseq->a, A->nonzerostate, PETSC_FALSE /*don't copy mat values to device*/);
54     A->spptr = aijkok;
55   }
56 
57   if (aijkok->device_mat_d.data()) {
58     A->offloadmask = PETSC_OFFLOAD_GPU; // in GPU mode, no going back. MatSetValues checks this
59   }
60   PetscFunctionReturn(PETSC_SUCCESS);
61 }
62 
63 /* Sync CSR data to device if not yet */
64 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A)
65 {
66   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
67 
68   PetscFunctionBegin;
69   PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Cann't sync factorized matrix from host to device");
70   PetscCheck(aijkok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
71   if (aijkok->a_dual.need_sync_device()) {
72     aijkok->a_dual.sync_device();
73     aijkok->transpose_updated = PETSC_FALSE; /* values of the transpose is out-of-date */
74     aijkok->hermitian_updated = PETSC_FALSE;
75   }
76   PetscFunctionReturn(PETSC_SUCCESS);
77 }
78 
79 /* Mark the CSR data on device as modified */
80 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosModifyDevice(Mat A)
81 {
82   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
83 
84   PetscFunctionBegin;
85   PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Not supported for factorized matries");
86   aijkok->a_dual.clear_sync_state();
87   aijkok->a_dual.modify_device();
88   aijkok->transpose_updated = PETSC_FALSE;
89   aijkok->hermitian_updated = PETSC_FALSE;
90   PetscCall(MatSeqAIJInvalidateDiagonal(A));
91   PetscCall(PetscObjectStateIncrease((PetscObject)A));
92   PetscFunctionReturn(PETSC_SUCCESS);
93 }
94 
95 static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A)
96 {
97   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
98 
99   PetscFunctionBegin;
100   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
101   /* We do not expect one needs factors on host  */
102   PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Cann't sync factorized matrix from device to host");
103   PetscCheck(aijkok, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing AIJKOK");
104   aijkok->a_dual.sync_host();
105   PetscFunctionReturn(PETSC_SUCCESS);
106 }
107 
108 static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A, PetscScalar *array[])
109 {
110   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
111 
112   PetscFunctionBegin;
113   /* aijkok contains valid pointers only if the host's nonzerostate matches with the device's.
114     Calling MatSeqAIJSetPreallocation() or MatSetValues() on host, where aijseq->{i,j,a} might be
115     reallocated, will lead to stale {i,j,a}_dual in aijkok. In both operations, the hosts's nonzerostate
116     must have been updated. The stale aijkok will be rebuilt during MatAssemblyEnd.
117   */
118   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
119     aijkok->a_dual.sync_host();
120     *array = aijkok->a_dual.view_host().data();
121   } else { /* Happens when calling MatSetValues on a newly created matrix */
122     *array = static_cast<Mat_SeqAIJ *>(A->data)->a;
123   }
124   PetscFunctionReturn(PETSC_SUCCESS);
125 }
126 
127 static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJKokkos(Mat A, PetscScalar *array[])
128 {
129   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
130 
131   PetscFunctionBegin;
132   if (aijkok && A->nonzerostate == aijkok->nonzerostate) aijkok->a_dual.modify_host();
133   PetscFunctionReturn(PETSC_SUCCESS);
134 }
135 
136 static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[])
137 {
138   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
139 
140   PetscFunctionBegin;
141   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
142     aijkok->a_dual.sync_host();
143     *array = aijkok->a_dual.view_host().data();
144   } else {
145     *array = static_cast<Mat_SeqAIJ *>(A->data)->a;
146   }
147   PetscFunctionReturn(PETSC_SUCCESS);
148 }
149 
150 static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[])
151 {
152   PetscFunctionBegin;
153   *array = NULL;
154   PetscFunctionReturn(PETSC_SUCCESS);
155 }
156 
157 static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[])
158 {
159   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
160 
161   PetscFunctionBegin;
162   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
163     *array = aijkok->a_dual.view_host().data();
164   } else { /* Ex. happens with MatZeroEntries on a preallocated but not assembled matrix */
165     *array = static_cast<Mat_SeqAIJ *>(A->data)->a;
166   }
167   PetscFunctionReturn(PETSC_SUCCESS);
168 }
169 
170 static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[])
171 {
172   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
173 
174   PetscFunctionBegin;
175   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
176     aijkok->a_dual.clear_sync_state();
177     aijkok->a_dual.modify_host();
178   }
179   PetscFunctionReturn(PETSC_SUCCESS);
180 }
181 
182 static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJKokkos(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype)
183 {
184   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
185 
186   PetscFunctionBegin;
187   PetscCheck(aijkok != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "aijkok is NULL");
188 
189   if (i) *i = aijkok->i_device_data();
190   if (j) *j = aijkok->j_device_data();
191   if (a) {
192     aijkok->a_dual.sync_device();
193     *a = aijkok->a_device_data();
194   }
195   if (mtype) *mtype = PETSC_MEMTYPE_KOKKOS;
196   PetscFunctionReturn(PETSC_SUCCESS);
197 }
198 
199 // MatSeqAIJKokkosSetDeviceMat takes a PetscSplitCSRDataStructure with device data and copies it to the device. Note, "deep_copy" here is really a shallow copy
200 PetscErrorCode MatSeqAIJKokkosSetDeviceMat(Mat A, PetscSplitCSRDataStructure h_mat)
201 {
202   Mat_SeqAIJKokkos                            *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
203   Kokkos::View<SplitCSRMat, Kokkos::HostSpace> h_mat_k(h_mat);
204 
205   PetscFunctionBegin;
206   PetscCheck(aijkok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
207   aijkok->device_mat_d = create_mirror(DefaultMemorySpace(), h_mat_k);
208   Kokkos::deep_copy(aijkok->device_mat_d, h_mat_k);
209   PetscFunctionReturn(PETSC_SUCCESS);
210 }
211 
212 // MatSeqAIJKokkosGetDeviceMat gets the device if it is here, otherwise it creates a place for it and returns NULL
213 PetscErrorCode MatSeqAIJKokkosGetDeviceMat(Mat A, PetscSplitCSRDataStructure *d_mat)
214 {
215   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
216 
217   PetscFunctionBegin;
218   if (aijkok && aijkok->device_mat_d.data()) {
219     *d_mat = aijkok->device_mat_d.data();
220   } else {
221     PetscCall(MatSeqAIJKokkosSyncDevice(A)); // create aijkok (we are making d_mat now so make a place for it)
222     *d_mat = NULL;
223   }
224   PetscFunctionReturn(PETSC_SUCCESS);
225 }
226 
227 /*
228   Generate the sparsity pattern of a MatSeqAIJKokkos matrix's transpose on device.
229 
230   Input Parameter:
231 .  A       - the MATSEQAIJKOKKOS matrix
232 
233   Output Parameters:
234 +  perm_d - the permutation array on device, which connects Ta(i) = Aa(perm(i))
235 -  T_d    - the transpose on device, whose value array is allcoated but not initialized
236 */
237 static PetscErrorCode MatSeqAIJKokkosGenerateTransposeStructure(Mat A, MatRowMapKokkosView &perm_d, KokkosCsrMatrix &T_d)
238 {
239   Mat_SeqAIJ             *aseq = static_cast<Mat_SeqAIJ *>(A->data);
240   PetscInt                nz = aseq->nz, m = A->rmap->N, n = A->cmap->n;
241   const PetscInt         *Ai = aseq->i, *Aj = aseq->j;
242   MatRowMapKokkosViewHost Ti_h("Ti", n + 1);
243   MatRowMapType          *Ti = Ti_h.data();
244   MatColIdxKokkosViewHost Tj_h("Tj", nz);
245   MatRowMapKokkosViewHost perm_h("permutation", nz);
246   PetscInt               *Tj   = Tj_h.data();
247   PetscInt               *perm = perm_h.data();
248   PetscInt               *offset;
249 
250   PetscFunctionBegin;
251   // Populate Ti
252   PetscCallCXX(Kokkos::deep_copy(Ti_h, 0));
253   Ti++;
254   for (PetscInt i = 0; i < nz; i++) Ti[Aj[i]]++;
255   Ti--;
256   for (PetscInt i = 0; i < n; i++) Ti[i + 1] += Ti[i];
257 
258   // Populate Tj and the permutation array
259   PetscCall(PetscCalloc1(n, &offset)); // offset in each T row to fill in its column indices
260   for (PetscInt i = 0; i < m; i++) {
261     for (PetscInt j = Ai[i]; j < Ai[i + 1]; j++) { // A's (i,j) is T's (j,i)
262       PetscInt r    = Aj[j];                       // row r of T
263       PetscInt disp = Ti[r] + offset[r];
264 
265       Tj[disp]   = i; // col i of T
266       perm[disp] = j;
267       offset[r]++;
268     }
269   }
270   PetscCall(PetscFree(offset));
271 
272   // Sort each row of T, along with the permutation array
273   for (PetscInt i = 0; i < n; i++) PetscCall(PetscSortIntWithArray(Ti[i + 1] - Ti[i], Tj + Ti[i], perm + Ti[i]));
274 
275   // Output perm and T on device
276   auto Ti_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), Ti_h);
277   auto Tj_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), Tj_h);
278   PetscCallCXX(T_d = KokkosCsrMatrix("csrmatT", n, m, nz, MatScalarKokkosView("Ta", nz), Ti_d, Tj_d));
279   PetscCallCXX(perm_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), perm_h));
280   PetscFunctionReturn(PETSC_SUCCESS);
281 }
282 
283 // Generate the transpose on device and cache it internally
284 // Note: KK transpose_matrix() does not have support symbolic/numeric transpose, so we do it on our own
285 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGenerateTranspose_Private(Mat A, KokkosCsrMatrix *csrmatT)
286 {
287   Mat_SeqAIJ       *aseq = static_cast<Mat_SeqAIJ *>(A->data);
288   Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
289   PetscInt          nz = aseq->nz, m = A->rmap->N, n = A->cmap->n;
290   KokkosCsrMatrix  &T = akok->csrmatT;
291 
292   PetscFunctionBegin;
293   PetscCheck(akok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
294   PetscCallCXX(akok->a_dual.sync_device()); // Sync A's valeus since we are going to access them on device
295 
296   const auto &Aa = akok->a_dual.view_device();
297 
298   if (A->symmetric == PETSC_BOOL3_TRUE) {
299     *csrmatT = akok->csrmat;
300   } else {
301     // See if we already have a cached transpose and its value is up to date
302     if (T.numRows() == n && T.numCols() == m) {  // this indicates csrmatT had been generated before, otherwise T has 0 rows/cols after construction
303       if (!akok->transpose_updated) {            // if the value is out of date, update the cached version
304         const auto &perm = akok->transpose_perm; // get the permutation array
305         auto       &Ta   = T.values;
306 
307         PetscCallCXX(Kokkos::parallel_for(
308           nz, KOKKOS_LAMBDA(const PetscInt i) { Ta(i) = Aa(perm(i)); }));
309       }
310     } else { // Generate T of size n x m for the first time
311       MatRowMapKokkosView perm;
312 
313       PetscCall(MatSeqAIJKokkosGenerateTransposeStructure(A, perm, T));
314       akok->transpose_perm = perm; // cache the perm in this matrix for reuse
315       PetscCallCXX(Kokkos::parallel_for(
316         nz, KOKKOS_LAMBDA(const PetscInt i) { T.values(i) = Aa(perm(i)); }));
317     }
318     akok->transpose_updated = PETSC_TRUE;
319     *csrmatT                = akok->csrmatT;
320   }
321   PetscFunctionReturn(PETSC_SUCCESS);
322 }
323 
324 // Generate the Hermitian on device and cache it internally
325 static PetscErrorCode MatSeqAIJKokkosGenerateHermitian_Private(Mat A, KokkosCsrMatrix *csrmatH)
326 {
327   Mat_SeqAIJ       *aseq = static_cast<Mat_SeqAIJ *>(A->data);
328   Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
329   PetscInt          nz = aseq->nz, m = A->rmap->N, n = A->cmap->n;
330   KokkosCsrMatrix  &T = akok->csrmatH;
331 
332   PetscFunctionBegin;
333   PetscCheck(akok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
334   PetscCallCXX(akok->a_dual.sync_device()); // Sync A's values since we are going to access them on device
335 
336   const auto &Aa = akok->a_dual.view_device();
337 
338   if (A->hermitian == PETSC_BOOL3_TRUE) {
339     *csrmatH = akok->csrmat;
340   } else {
341     // See if we already have a cached hermitian and its value is up to date
342     if (T.numRows() == n && T.numCols() == m) {  // this indicates csrmatT had been generated before, otherwise T has 0 rows/cols after construction
343       if (!akok->hermitian_updated) {            // if the value is out of date, update the cached version
344         const auto &perm = akok->transpose_perm; // get the permutation array
345         auto       &Ta   = T.values;
346 
347         PetscCallCXX(Kokkos::parallel_for(
348           nz, KOKKOS_LAMBDA(const PetscInt i) { Ta(i) = PetscConj(Aa(perm(i))); }));
349       }
350     } else { // Generate T of size n x m for the first time
351       MatRowMapKokkosView perm;
352 
353       PetscCall(MatSeqAIJKokkosGenerateTransposeStructure(A, perm, T));
354       akok->transpose_perm = perm; // cache the perm in this matrix for reuse
355       PetscCallCXX(Kokkos::parallel_for(
356         nz, KOKKOS_LAMBDA(const PetscInt i) { T.values(i) = PetscConj(Aa(perm(i))); }));
357     }
358     akok->hermitian_updated = PETSC_TRUE;
359     *csrmatH                = akok->csrmatH;
360   }
361   PetscFunctionReturn(PETSC_SUCCESS);
362 }
363 
364 /* y = A x */
365 static PetscErrorCode MatMult_SeqAIJKokkos(Mat A, Vec xx, Vec yy)
366 {
367   Mat_SeqAIJKokkos          *aijkok;
368   ConstPetscScalarKokkosView xv;
369   PetscScalarKokkosView      yv;
370 
371   PetscFunctionBegin;
372   PetscCall(PetscLogGpuTimeBegin());
373   PetscCall(MatSeqAIJKokkosSyncDevice(A));
374   PetscCall(VecGetKokkosView(xx, &xv));
375   PetscCall(VecGetKokkosViewWrite(yy, &yv));
376   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
377   PetscCallCXX(KokkosSparse::spmv("N", 1.0 /*alpha*/, aijkok->csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A x + beta y */
378   PetscCall(VecRestoreKokkosView(xx, &xv));
379   PetscCall(VecRestoreKokkosViewWrite(yy, &yv));
380   /* 2.0*nnz - numRows seems more accurate here but assumes there are no zero-rows. So a little sloppy here. */
381   PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz()));
382   PetscCall(PetscLogGpuTimeEnd());
383   PetscFunctionReturn(PETSC_SUCCESS);
384 }
385 
386 /* y = A^T x */
387 static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy)
388 {
389   Mat_SeqAIJKokkos          *aijkok;
390   const char                *mode;
391   ConstPetscScalarKokkosView xv;
392   PetscScalarKokkosView      yv;
393   KokkosCsrMatrix            csrmat;
394 
395   PetscFunctionBegin;
396   PetscCall(PetscLogGpuTimeBegin());
397   PetscCall(MatSeqAIJKokkosSyncDevice(A));
398   PetscCall(VecGetKokkosView(xx, &xv));
399   PetscCall(VecGetKokkosViewWrite(yy, &yv));
400   if (A->form_explicit_transpose) {
401     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat));
402     mode = "N";
403   } else {
404     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
405     csrmat = aijkok->csrmat;
406     mode   = "T";
407   }
408   PetscCallCXX(KokkosSparse::spmv(mode, 1.0 /*alpha*/, csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^T x + beta y */
409   PetscCall(VecRestoreKokkosView(xx, &xv));
410   PetscCall(VecRestoreKokkosViewWrite(yy, &yv));
411   PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
412   PetscCall(PetscLogGpuTimeEnd());
413   PetscFunctionReturn(PETSC_SUCCESS);
414 }
415 
416 /* y = A^H x */
417 static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy)
418 {
419   Mat_SeqAIJKokkos          *aijkok;
420   const char                *mode;
421   ConstPetscScalarKokkosView xv;
422   PetscScalarKokkosView      yv;
423   KokkosCsrMatrix            csrmat;
424 
425   PetscFunctionBegin;
426   PetscCall(PetscLogGpuTimeBegin());
427   PetscCall(MatSeqAIJKokkosSyncDevice(A));
428   PetscCall(VecGetKokkosView(xx, &xv));
429   PetscCall(VecGetKokkosViewWrite(yy, &yv));
430   if (A->form_explicit_transpose) {
431     PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat));
432     mode = "N";
433   } else {
434     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
435     csrmat = aijkok->csrmat;
436     mode   = "C";
437   }
438   PetscCallCXX(KokkosSparse::spmv(mode, 1.0 /*alpha*/, csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^H x + beta y */
439   PetscCall(VecRestoreKokkosView(xx, &xv));
440   PetscCall(VecRestoreKokkosViewWrite(yy, &yv));
441   PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
442   PetscCall(PetscLogGpuTimeEnd());
443   PetscFunctionReturn(PETSC_SUCCESS);
444 }
445 
446 /* z = A x + y */
447 static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz)
448 {
449   Mat_SeqAIJKokkos          *aijkok;
450   ConstPetscScalarKokkosView xv, yv;
451   PetscScalarKokkosView      zv;
452 
453   PetscFunctionBegin;
454   PetscCall(PetscLogGpuTimeBegin());
455   PetscCall(MatSeqAIJKokkosSyncDevice(A));
456   PetscCall(VecGetKokkosView(xx, &xv));
457   PetscCall(VecGetKokkosView(yy, &yv));
458   PetscCall(VecGetKokkosViewWrite(zz, &zv));
459   if (zz != yy) Kokkos::deep_copy(zv, yv);
460   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
461   PetscCallCXX(KokkosSparse::spmv("N", 1.0 /*alpha*/, aijkok->csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A x + beta z */
462   PetscCall(VecRestoreKokkosView(xx, &xv));
463   PetscCall(VecRestoreKokkosView(yy, &yv));
464   PetscCall(VecRestoreKokkosViewWrite(zz, &zv));
465   PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz()));
466   PetscCall(PetscLogGpuTimeEnd());
467   PetscFunctionReturn(PETSC_SUCCESS);
468 }
469 
470 /* z = A^T x + y */
471 static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz)
472 {
473   Mat_SeqAIJKokkos          *aijkok;
474   const char                *mode;
475   ConstPetscScalarKokkosView xv, yv;
476   PetscScalarKokkosView      zv;
477   KokkosCsrMatrix            csrmat;
478 
479   PetscFunctionBegin;
480   PetscCall(PetscLogGpuTimeBegin());
481   PetscCall(MatSeqAIJKokkosSyncDevice(A));
482   PetscCall(VecGetKokkosView(xx, &xv));
483   PetscCall(VecGetKokkosView(yy, &yv));
484   PetscCall(VecGetKokkosViewWrite(zz, &zv));
485   if (zz != yy) Kokkos::deep_copy(zv, yv);
486   if (A->form_explicit_transpose) {
487     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat));
488     mode = "N";
489   } else {
490     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
491     csrmat = aijkok->csrmat;
492     mode   = "T";
493   }
494   PetscCallCXX(KokkosSparse::spmv(mode, 1.0 /*alpha*/, csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^T x + beta z */
495   PetscCall(VecRestoreKokkosView(xx, &xv));
496   PetscCall(VecRestoreKokkosView(yy, &yv));
497   PetscCall(VecRestoreKokkosViewWrite(zz, &zv));
498   PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
499   PetscCall(PetscLogGpuTimeEnd());
500   PetscFunctionReturn(PETSC_SUCCESS);
501 }
502 
503 /* z = A^H x + y */
504 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz)
505 {
506   Mat_SeqAIJKokkos          *aijkok;
507   const char                *mode;
508   ConstPetscScalarKokkosView xv, yv;
509   PetscScalarKokkosView      zv;
510   KokkosCsrMatrix            csrmat;
511 
512   PetscFunctionBegin;
513   PetscCall(PetscLogGpuTimeBegin());
514   PetscCall(MatSeqAIJKokkosSyncDevice(A));
515   PetscCall(VecGetKokkosView(xx, &xv));
516   PetscCall(VecGetKokkosView(yy, &yv));
517   PetscCall(VecGetKokkosViewWrite(zz, &zv));
518   if (zz != yy) Kokkos::deep_copy(zv, yv);
519   if (A->form_explicit_transpose) {
520     PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat));
521     mode = "N";
522   } else {
523     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
524     csrmat = aijkok->csrmat;
525     mode   = "C";
526   }
527   PetscCallCXX(KokkosSparse::spmv(mode, 1.0 /*alpha*/, csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^H x + beta z */
528   PetscCall(VecRestoreKokkosView(xx, &xv));
529   PetscCall(VecRestoreKokkosView(yy, &yv));
530   PetscCall(VecRestoreKokkosViewWrite(zz, &zv));
531   PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
532   PetscCall(PetscLogGpuTimeEnd());
533   PetscFunctionReturn(PETSC_SUCCESS);
534 }
535 
536 PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A, MatOption op, PetscBool flg)
537 {
538   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
539 
540   PetscFunctionBegin;
541   switch (op) {
542   case MAT_FORM_EXPLICIT_TRANSPOSE:
543     /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
544     if (A->form_explicit_transpose && !flg && aijkok) PetscCall(aijkok->DestroyMatTranspose());
545     A->form_explicit_transpose = flg;
546     break;
547   default:
548     PetscCall(MatSetOption_SeqAIJ(A, op, flg));
549     break;
550   }
551   PetscFunctionReturn(PETSC_SUCCESS);
552 }
553 
554 /* Depending on reuse, either build a new mat, or use the existing mat */
555 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat *newmat)
556 {
557   Mat_SeqAIJ *aseq;
558 
559   PetscFunctionBegin;
560   PetscCall(PetscKokkosInitializeCheck());
561   if (reuse == MAT_INITIAL_MATRIX) {                      /* Build a brand new mat */
562     PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat));  /* the returned newmat is a SeqAIJKokkos */
563   } else if (reuse == MAT_REUSE_MATRIX) {                 /* Reuse the mat created before */
564     PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN)); /* newmat is already a SeqAIJKokkos */
565   } else if (reuse == MAT_INPLACE_MATRIX) {               /* newmat is A */
566     PetscCheck(A == *newmat, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "A != *newmat with MAT_INPLACE_MATRIX");
567     PetscCall(PetscFree(A->defaultvectype));
568     PetscCall(PetscStrallocpy(VECKOKKOS, &A->defaultvectype)); /* Allocate and copy the string */
569     PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATSEQAIJKOKKOS));
570     PetscCall(MatSetOps_SeqAIJKokkos(A));
571     aseq = static_cast<Mat_SeqAIJ *>(A->data);
572     if (A->assembled) { /* Copy i, j (but not values) to device for an assembled matrix if not yet */
573       PetscCheck(!A->spptr, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Expect NULL (Mat_SeqAIJKokkos*)A->spptr");
574       A->spptr = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aseq->nz, aseq->i, aseq->j, aseq->a, A->nonzerostate, PETSC_FALSE);
575     }
576   }
577   PetscFunctionReturn(PETSC_SUCCESS);
578 }
579 
580 /* MatDuplicate always creates a new matrix. MatDuplicate can be called either on an assembled matrix or
581    an unassembled matrix, even though MAT_COPY_VALUES is not allowed for unassembled matrix.
582  */
583 static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A, MatDuplicateOption dupOption, Mat *B)
584 {
585   Mat_SeqAIJ       *bseq;
586   Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr), *bkok;
587   Mat               mat;
588 
589   PetscFunctionBegin;
590   /* Do not copy values on host as A's latest values might be on device. We don't want to do sync blindly */
591   PetscCall(MatDuplicate_SeqAIJ(A, MAT_DO_NOT_COPY_VALUES, B));
592   mat = *B;
593   if (A->assembled) {
594     bseq = static_cast<Mat_SeqAIJ *>(mat->data);
595     bkok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, bseq->nz, bseq->i, bseq->j, bseq->a, mat->nonzerostate, PETSC_FALSE);
596     bkok->a_dual.clear_sync_state(); /* Clear B's sync state as it will be decided below */
597     /* Now copy values to B if needed */
598     if (dupOption == MAT_COPY_VALUES) {
599       if (akok->a_dual.need_sync_device()) {
600         Kokkos::deep_copy(bkok->a_dual.view_host(), akok->a_dual.view_host());
601         bkok->a_dual.modify_host();
602       } else { /* If device has the latest data, we only copy data on device */
603         Kokkos::deep_copy(bkok->a_dual.view_device(), akok->a_dual.view_device());
604         bkok->a_dual.modify_device();
605       }
606     } else { /* MAT_DO_NOT_COPY_VALUES or MAT_SHARE_NONZERO_PATTERN. B's values should be zeroed */
607       /* B's values on host should be already zeroed by MatDuplicate_SeqAIJ() */
608       bkok->a_dual.modify_host();
609     }
610     mat->spptr = bkok;
611   }
612 
613   PetscCall(PetscFree(mat->defaultvectype));
614   PetscCall(PetscStrallocpy(VECKOKKOS, &mat->defaultvectype)); /* Allocate and copy the string */
615   PetscCall(PetscObjectChangeTypeName((PetscObject)mat, MATSEQAIJKOKKOS));
616   PetscCall(MatSetOps_SeqAIJKokkos(mat));
617   PetscFunctionReturn(PETSC_SUCCESS);
618 }
619 
620 static PetscErrorCode MatTranspose_SeqAIJKokkos(Mat A, MatReuse reuse, Mat *B)
621 {
622   Mat               At;
623   KokkosCsrMatrix   internT;
624   Mat_SeqAIJKokkos *atkok, *bkok;
625 
626   PetscFunctionBegin;
627   if (reuse == MAT_REUSE_MATRIX) PetscCall(MatTransposeCheckNonzeroState_Private(A, *B));
628   PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &internT)); /* Generate a transpose internally */
629   if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_INPLACE_MATRIX) {
630     /* Deep copy internT, as we want to isolate the internal transpose */
631     PetscCallCXX(atkok = new Mat_SeqAIJKokkos(KokkosCsrMatrix("csrmat", internT)));
632     PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PetscObjectComm((PetscObject)A), atkok, &At));
633     if (reuse == MAT_INITIAL_MATRIX) *B = At;
634     else PetscCall(MatHeaderReplace(A, &At)); /* Replace A with At inplace */
635   } else {                                    /* MAT_REUSE_MATRIX, just need to copy values to B on device */
636     if ((*B)->assembled) {
637       bkok = static_cast<Mat_SeqAIJKokkos *>((*B)->spptr);
638       PetscCallCXX(Kokkos::deep_copy(bkok->a_dual.view_device(), internT.values));
639       PetscCall(MatSeqAIJKokkosModifyDevice(*B));
640     } else if ((*B)->preallocated) { /* It is ok for B to be only preallocated, as needed in MatTranspose_MPIAIJ */
641       Mat_SeqAIJ             *bseq = static_cast<Mat_SeqAIJ *>((*B)->data);
642       MatScalarKokkosViewHost a_h(bseq->a, internT.nnz()); /* bseq->nz = 0 if unassembled */
643       MatColIdxKokkosViewHost j_h(bseq->j, internT.nnz());
644       PetscCallCXX(Kokkos::deep_copy(a_h, internT.values));
645       PetscCallCXX(Kokkos::deep_copy(j_h, internT.graph.entries));
646     } else SETERRQ(PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "B must be assembled or preallocated");
647   }
648   PetscFunctionReturn(PETSC_SUCCESS);
649 }
650 
651 static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A)
652 {
653   Mat_SeqAIJKokkos *aijkok;
654 
655   PetscFunctionBegin;
656   if (A->factortype == MAT_FACTOR_NONE) {
657     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
658     delete aijkok;
659   } else {
660     delete static_cast<Mat_SeqAIJKokkosTriFactors *>(A->spptr);
661   }
662   A->spptr = NULL;
663   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatFactorGetSolverType_C", NULL));
664   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
665   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
666   PetscCall(MatDestroy_SeqAIJ(A));
667   PetscFunctionReturn(PETSC_SUCCESS);
668 }
669 
670 /*MC
671    MATSEQAIJKOKKOS - MATAIJKOKKOS = "(seq)aijkokkos" - A matrix type to be used for sparse matrices with Kokkos
672 
673    A matrix type type using Kokkos-Kernels CrsMatrix type for portability across different device types
674 
675    Options Database Key:
676 .  -mat_type aijkokkos - sets the matrix type to `MATSEQAIJKOKKOS` during a call to `MatSetFromOptions()`
677 
678   Level: beginner
679 
680 .seealso: [](chapter_matrices), `Mat`, `MatCreateSeqAIJKokkos()`, `MATMPIAIJKOKKOS`
681 M*/
682 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A)
683 {
684   PetscFunctionBegin;
685   PetscCall(PetscKokkosInitializeCheck());
686   PetscCall(MatCreate_SeqAIJ(A));
687   PetscCall(MatConvert_SeqAIJ_SeqAIJKokkos(A, MATSEQAIJKOKKOS, MAT_INPLACE_MATRIX, &A));
688   PetscFunctionReturn(PETSC_SUCCESS);
689 }
690 
691 /* Merge A, B into a matrix C. A is put before B. C's size would be A->rmap->n by (A->cmap->n + B->cmap->n) */
692 PetscErrorCode MatSeqAIJKokkosMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C)
693 {
694   Mat_SeqAIJ         *a, *b;
695   Mat_SeqAIJKokkos   *akok, *bkok, *ckok;
696   MatScalarKokkosView aa, ba, ca;
697   MatRowMapKokkosView ai, bi, ci;
698   MatColIdxKokkosView aj, bj, cj;
699   PetscInt            m, n, nnz, aN;
700 
701   PetscFunctionBegin;
702   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
703   PetscValidHeaderSpecific(B, MAT_CLASSID, 2);
704   PetscValidPointer(C, 4);
705   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
706   PetscCheckTypeName(B, MATSEQAIJKOKKOS);
707   PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n);
708   PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported");
709 
710   PetscCall(MatSeqAIJKokkosSyncDevice(A));
711   PetscCall(MatSeqAIJKokkosSyncDevice(B));
712   a    = static_cast<Mat_SeqAIJ *>(A->data);
713   b    = static_cast<Mat_SeqAIJ *>(B->data);
714   akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
715   bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
716   aa   = akok->a_dual.view_device();
717   ai   = akok->i_dual.view_device();
718   ba   = bkok->a_dual.view_device();
719   bi   = bkok->i_dual.view_device();
720   m    = A->rmap->n; /* M, N and nnz of C */
721   n    = A->cmap->n + B->cmap->n;
722   nnz  = a->nz + b->nz;
723   aN   = A->cmap->n; /* N of A */
724   if (reuse == MAT_INITIAL_MATRIX) {
725     aj           = akok->j_dual.view_device();
726     bj           = bkok->j_dual.view_device();
727     auto ca_dual = MatScalarKokkosDualView("a", aa.extent(0) + ba.extent(0));
728     auto ci_dual = MatRowMapKokkosDualView("i", ai.extent(0));
729     auto cj_dual = MatColIdxKokkosDualView("j", aj.extent(0) + bj.extent(0));
730     ca           = ca_dual.view_device();
731     ci           = ci_dual.view_device();
732     cj           = cj_dual.view_device();
733 
734     /* Concatenate A and B in parallel using Kokkos hierarchical parallelism */
735     Kokkos::parallel_for(
736       Kokkos::TeamPolicy<>(m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
737         PetscInt i       = t.league_rank(); /* row i */
738         PetscInt coffset = ai(i) + bi(i), alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i);
739 
740         Kokkos::single(Kokkos::PerTeam(t), [=]() { /* this side effect only happens once per whole team */
741                                                    ci(i) = coffset;
742                                                    if (i == m - 1) ci(m) = ai(m) + bi(m);
743         });
744 
745         Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) {
746           if (k < alen) {
747             ca(coffset + k) = aa(ai(i) + k);
748             cj(coffset + k) = aj(ai(i) + k);
749           } else {
750             ca(coffset + k) = ba(bi(i) + k - alen);
751             cj(coffset + k) = bj(bi(i) + k - alen) + aN; /* Entries in B get new column indices in C */
752           }
753         });
754       });
755     ca_dual.modify_device();
756     ci_dual.modify_device();
757     cj_dual.modify_device();
758     PetscCallCXX(ckok = new Mat_SeqAIJKokkos(m, n, nnz, ci_dual, cj_dual, ca_dual));
759     PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, ckok, C));
760   } else if (reuse == MAT_REUSE_MATRIX) {
761     PetscValidHeaderSpecific(*C, MAT_CLASSID, 4);
762     PetscCheckTypeName(*C, MATSEQAIJKOKKOS);
763     ckok = static_cast<Mat_SeqAIJKokkos *>((*C)->spptr);
764     ca   = ckok->a_dual.view_device();
765     ci   = ckok->i_dual.view_device();
766 
767     Kokkos::parallel_for(
768       Kokkos::TeamPolicy<>(m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
769         PetscInt i    = t.league_rank(); /* row i */
770         PetscInt alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i);
771         Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) {
772           if (k < alen) ca(ci(i) + k) = aa(ai(i) + k);
773           else ca(ci(i) + k) = ba(bi(i) + k - alen);
774         });
775       });
776     PetscCall(MatSeqAIJKokkosModifyDevice(*C));
777   }
778   PetscFunctionReturn(PETSC_SUCCESS);
779 }
780 
781 static PetscErrorCode MatProductDataDestroy_SeqAIJKokkos(void *pdata)
782 {
783   PetscFunctionBegin;
784   delete static_cast<MatProductData_SeqAIJKokkos *>(pdata);
785   PetscFunctionReturn(PETSC_SUCCESS);
786 }
787 
788 static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C)
789 {
790   Mat_Product                 *product = C->product;
791   Mat                          A, B;
792   bool                         transA, transB; /* use bool, since KK needs this type */
793   Mat_SeqAIJKokkos            *akok, *bkok, *ckok;
794   Mat_SeqAIJ                  *c;
795   MatProductData_SeqAIJKokkos *pdata;
796   KokkosCsrMatrix              csrmatA, csrmatB;
797 
798   PetscFunctionBegin;
799   MatCheckProduct(C, 1);
800   PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Product data empty");
801   pdata = static_cast<MatProductData_SeqAIJKokkos *>(C->product->data);
802 
803   // See if numeric has already been done in symbolic (e.g., user calls MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C)).
804   // If yes, skip the numeric, but reset the flag so that next time when user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C),
805   // we still do numeric.
806   if (pdata->reusesym) { // numeric reuses results from symbolic
807     pdata->reusesym = PETSC_FALSE;
808     PetscFunctionReturn(PETSC_SUCCESS);
809   }
810 
811   switch (product->type) {
812   case MATPRODUCT_AB:
813     transA = false;
814     transB = false;
815     break;
816   case MATPRODUCT_AtB:
817     transA = true;
818     transB = false;
819     break;
820   case MATPRODUCT_ABt:
821     transA = false;
822     transB = true;
823     break;
824   default:
825     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]);
826   }
827 
828   A = product->A;
829   B = product->B;
830   PetscCall(MatSeqAIJKokkosSyncDevice(A));
831   PetscCall(MatSeqAIJKokkosSyncDevice(B));
832   akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
833   bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
834   ckok = static_cast<Mat_SeqAIJKokkos *>(C->spptr);
835 
836   PetscCheck(ckok, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Device data structure spptr is empty");
837 
838   csrmatA = akok->csrmat;
839   csrmatB = bkok->csrmat;
840 
841   /* TODO: Once KK spgemm implements transpose, we can get rid of the explicit transpose here */
842   if (transA) {
843     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA));
844     transA = false;
845   }
846 
847   if (transB) {
848     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB));
849     transB = false;
850   }
851   PetscCall(PetscLogGpuTimeBegin());
852   PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, csrmatA, transA, csrmatB, transB, ckok->csrmat));
853 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(4, 0, 0)
854   auto spgemmHandle = pdata->kh.get_spgemm_handle();
855   if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(ckok->csrmat)); /* without sort, mat_tests-ex62_14_seqaijkokkos fails */
856 #endif
857 
858   PetscCall(PetscLogGpuTimeEnd());
859   PetscCall(MatSeqAIJKokkosModifyDevice(C));
860   /* shorter version of MatAssemblyEnd_SeqAIJ */
861   c = (Mat_SeqAIJ *)C->data;
862   PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz));
863   PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n"));
864   PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax));
865   c->reallocs         = 0;
866   C->info.mallocs     = 0;
867   C->info.nz_unneeded = 0;
868   C->assembled = C->was_assembled = PETSC_TRUE;
869   C->num_ass++;
870   PetscFunctionReturn(PETSC_SUCCESS);
871 }
872 
873 static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C)
874 {
875   Mat_Product                 *product = C->product;
876   MatProductType               ptype;
877   Mat                          A, B;
878   bool                         transA, transB;
879   Mat_SeqAIJKokkos            *akok, *bkok, *ckok;
880   MatProductData_SeqAIJKokkos *pdata;
881   MPI_Comm                     comm;
882   KokkosCsrMatrix              csrmatA, csrmatB, csrmatC;
883 
884   PetscFunctionBegin;
885   MatCheckProduct(C, 1);
886   PetscCall(PetscObjectGetComm((PetscObject)C, &comm));
887   PetscCheck(!product->data, comm, PETSC_ERR_PLIB, "Product data not empty");
888   A = product->A;
889   B = product->B;
890   PetscCall(MatSeqAIJKokkosSyncDevice(A));
891   PetscCall(MatSeqAIJKokkosSyncDevice(B));
892   akok    = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
893   bkok    = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
894   csrmatA = akok->csrmat;
895   csrmatB = bkok->csrmat;
896 
897   ptype = product->type;
898   // Take advantage of the symmetry if true
899   if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
900     ptype                                          = MATPRODUCT_AB;
901     product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE;
902   }
903   if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
904     ptype                                          = MATPRODUCT_AB;
905     product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE;
906   }
907 
908   switch (ptype) {
909   case MATPRODUCT_AB:
910     transA = false;
911     transB = false;
912     break;
913   case MATPRODUCT_AtB:
914     transA = true;
915     transB = false;
916     break;
917   case MATPRODUCT_ABt:
918     transA = false;
919     transB = true;
920     break;
921   default:
922     SETERRQ(comm, PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]);
923   }
924   PetscCallCXX(product->data = pdata = new MatProductData_SeqAIJKokkos());
925   pdata->reusesym = product->api_user;
926 
927   /* TODO: add command line options to select spgemm algorithms */
928   auto spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_DEFAULT; /* default alg is TPL if enabled, otherwise KK */
929 
930   /* CUDA-10.2's spgemm has bugs. We prefer the SpGEMMreuse APIs introduced in cuda-11.4 */
931 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
932   #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
933   spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_KK;
934   #endif
935 #endif
936   PetscCallCXX(pdata->kh.create_spgemm_handle(spgemm_alg));
937 
938   PetscCall(PetscLogGpuTimeBegin());
939   /* TODO: Get rid of the explicit transpose once KK-spgemm implements the transpose option */
940   if (transA) {
941     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA));
942     transA = false;
943   }
944 
945   if (transB) {
946     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB));
947     transB = false;
948   }
949 
950   PetscCallCXX(KokkosSparse::spgemm_symbolic(pdata->kh, csrmatA, transA, csrmatB, transB, csrmatC));
951   /* spgemm_symbolic() only populates C's rowmap, but not C's column indices.
952     So we have to do a fake spgemm_numeric() here to get csrmatC.j_d setup, before
953     calling new Mat_SeqAIJKokkos().
954     TODO: Remove the fake spgemm_numeric() after KK fixed this problem.
955   */
956   PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, csrmatA, transA, csrmatB, transB, csrmatC));
957 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(4, 0, 0)
958   /* Query if KK outputs a sorted matrix. If not, we need to sort it */
959   auto spgemmHandle = pdata->kh.get_spgemm_handle();
960   if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(csrmatC)); /* sort_option defaults to -1 in KK!*/
961 #endif
962   PetscCall(PetscLogGpuTimeEnd());
963 
964   PetscCallCXX(ckok = new Mat_SeqAIJKokkos(csrmatC));
965   PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(C, ckok));
966   C->product->destroy = MatProductDataDestroy_SeqAIJKokkos;
967   PetscFunctionReturn(PETSC_SUCCESS);
968 }
969 
970 /* handles sparse matrix matrix ops */
971 static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat)
972 {
973   Mat_Product *product = mat->product;
974   PetscBool    Biskok = PETSC_FALSE, Ciskok = PETSC_TRUE;
975 
976   PetscFunctionBegin;
977   MatCheckProduct(mat, 1);
978   PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJKOKKOS, &Biskok));
979   if (product->type == MATPRODUCT_ABC) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJKOKKOS, &Ciskok));
980   if (Biskok && Ciskok) {
981     switch (product->type) {
982     case MATPRODUCT_AB:
983     case MATPRODUCT_AtB:
984     case MATPRODUCT_ABt:
985       mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos;
986       break;
987     case MATPRODUCT_PtAP:
988     case MATPRODUCT_RARt:
989     case MATPRODUCT_ABC:
990       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
991       break;
992     default:
993       break;
994     }
995   } else { /* fallback for AIJ */
996     PetscCall(MatProductSetFromOptions_SeqAIJ(mat));
997   }
998   PetscFunctionReturn(PETSC_SUCCESS);
999 }
1000 
1001 static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a)
1002 {
1003   Mat_SeqAIJKokkos *aijkok;
1004 
1005   PetscFunctionBegin;
1006   PetscCall(PetscLogGpuTimeBegin());
1007   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1008   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1009   KokkosBlas::scal(aijkok->a_dual.view_device(), a, aijkok->a_dual.view_device());
1010   PetscCall(MatSeqAIJKokkosModifyDevice(A));
1011   PetscCall(PetscLogGpuFlops(aijkok->a_dual.extent(0)));
1012   PetscCall(PetscLogGpuTimeEnd());
1013   PetscFunctionReturn(PETSC_SUCCESS);
1014 }
1015 
1016 static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A)
1017 {
1018   Mat_SeqAIJKokkos *aijkok;
1019 
1020   PetscFunctionBegin;
1021   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1022   if (aijkok) { /* Only zero the device if data is already there */
1023     KokkosBlas::fill(aijkok->a_dual.view_device(), 0.0);
1024     PetscCall(MatSeqAIJKokkosModifyDevice(A));
1025   } else { /* Might be preallocated but not assembled */
1026     PetscCall(MatZeroEntries_SeqAIJ(A));
1027   }
1028   PetscFunctionReturn(PETSC_SUCCESS);
1029 }
1030 
1031 static PetscErrorCode MatGetDiagonal_SeqAIJKokkos(Mat A, Vec x)
1032 {
1033   Mat_SeqAIJ           *aijseq;
1034   Mat_SeqAIJKokkos     *aijkok;
1035   PetscInt              n;
1036   PetscScalarKokkosView xv;
1037 
1038   PetscFunctionBegin;
1039   PetscCall(VecGetLocalSize(x, &n));
1040   PetscCheck(n == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector");
1041   PetscCheck(A->factortype == MAT_FACTOR_NONE, PETSC_COMM_SELF, PETSC_ERR_SUP, "MatGetDiagonal_SeqAIJKokkos not supported on factored matrices");
1042 
1043   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1044   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1045 
1046   if (A->rmap->n && aijkok->diag_dual.extent(0) == 0) { /* Set the diagonal pointer if not already */
1047     PetscCall(MatMarkDiagonal_SeqAIJ(A));
1048     aijseq = static_cast<Mat_SeqAIJ *>(A->data);
1049     aijkok->SetDiagonal(aijseq->diag);
1050   }
1051 
1052   const auto &Aa    = aijkok->a_dual.view_device();
1053   const auto &Ai    = aijkok->i_dual.view_device();
1054   const auto &Adiag = aijkok->diag_dual.view_device();
1055 
1056   PetscCall(VecGetKokkosViewWrite(x, &xv));
1057   Kokkos::parallel_for(
1058     n, KOKKOS_LAMBDA(const PetscInt i) {
1059       if (Adiag(i) < Ai(i + 1)) xv(i) = Aa(Adiag(i));
1060       else xv(i) = 0;
1061     });
1062   PetscCall(VecRestoreKokkosViewWrite(x, &xv));
1063   PetscFunctionReturn(PETSC_SUCCESS);
1064 }
1065 
1066 /* Get a Kokkos View from a mat of type MatSeqAIJKokkos */
1067 PetscErrorCode MatSeqAIJGetKokkosView(Mat A, ConstMatScalarKokkosView *kv)
1068 {
1069   Mat_SeqAIJKokkos *aijkok;
1070 
1071   PetscFunctionBegin;
1072   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1073   PetscValidPointer(kv, 2);
1074   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1075   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1076   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1077   *kv    = aijkok->a_dual.view_device();
1078   PetscFunctionReturn(PETSC_SUCCESS);
1079 }
1080 
1081 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, ConstMatScalarKokkosView *kv)
1082 {
1083   PetscFunctionBegin;
1084   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1085   PetscValidPointer(kv, 2);
1086   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1087   PetscFunctionReturn(PETSC_SUCCESS);
1088 }
1089 
1090 PetscErrorCode MatSeqAIJGetKokkosView(Mat A, MatScalarKokkosView *kv)
1091 {
1092   Mat_SeqAIJKokkos *aijkok;
1093 
1094   PetscFunctionBegin;
1095   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1096   PetscValidPointer(kv, 2);
1097   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1098   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1099   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1100   *kv    = aijkok->a_dual.view_device();
1101   PetscFunctionReturn(PETSC_SUCCESS);
1102 }
1103 
1104 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, MatScalarKokkosView *kv)
1105 {
1106   PetscFunctionBegin;
1107   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1108   PetscValidPointer(kv, 2);
1109   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1110   PetscCall(MatSeqAIJKokkosModifyDevice(A));
1111   PetscFunctionReturn(PETSC_SUCCESS);
1112 }
1113 
1114 PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A, MatScalarKokkosView *kv)
1115 {
1116   Mat_SeqAIJKokkos *aijkok;
1117 
1118   PetscFunctionBegin;
1119   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1120   PetscValidPointer(kv, 2);
1121   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1122   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1123   *kv    = aijkok->a_dual.view_device();
1124   PetscFunctionReturn(PETSC_SUCCESS);
1125 }
1126 
1127 PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A, MatScalarKokkosView *kv)
1128 {
1129   PetscFunctionBegin;
1130   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1131   PetscValidPointer(kv, 2);
1132   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1133   PetscCall(MatSeqAIJKokkosModifyDevice(A));
1134   PetscFunctionReturn(PETSC_SUCCESS);
1135 }
1136 
1137 /* Computes Y += alpha X */
1138 static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y, PetscScalar alpha, Mat X, MatStructure pattern)
1139 {
1140   Mat_SeqAIJ              *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data;
1141   Mat_SeqAIJKokkos        *xkok, *ykok, *zkok;
1142   ConstMatScalarKokkosView Xa;
1143   MatScalarKokkosView      Ya;
1144 
1145   PetscFunctionBegin;
1146   PetscCheckTypeName(Y, MATSEQAIJKOKKOS);
1147   PetscCheckTypeName(X, MATSEQAIJKOKKOS);
1148   PetscCall(MatSeqAIJKokkosSyncDevice(Y));
1149   PetscCall(MatSeqAIJKokkosSyncDevice(X));
1150   PetscCall(PetscLogGpuTimeBegin());
1151 
1152   if (pattern != SAME_NONZERO_PATTERN && x->nz == y->nz) {
1153     PetscBool e;
1154     PetscCall(PetscArraycmp(x->i, y->i, Y->rmap->n + 1, &e));
1155     if (e) {
1156       PetscCall(PetscArraycmp(x->j, y->j, y->nz, &e));
1157       if (e) pattern = SAME_NONZERO_PATTERN;
1158     }
1159   }
1160 
1161   /* cusparseDcsrgeam2() computes C = alpha A + beta B. If one knew sparsity pattern of C, one can skip
1162     cusparseScsrgeam2_bufferSizeExt() / cusparseXcsrgeam2Nnz(), and directly call cusparseScsrgeam2().
1163     If X is SUBSET_NONZERO_PATTERN of Y, we could take advantage of this cusparse feature. However,
1164     KokkosSparse::spadd(alpha,A,beta,B,C) has symbolic and numeric phases, MatAXPY does not.
1165   */
1166   ykok = static_cast<Mat_SeqAIJKokkos *>(Y->spptr);
1167   xkok = static_cast<Mat_SeqAIJKokkos *>(X->spptr);
1168   Xa   = xkok->a_dual.view_device();
1169   Ya   = ykok->a_dual.view_device();
1170 
1171   if (pattern == SAME_NONZERO_PATTERN) {
1172     KokkosBlas::axpy(alpha, Xa, Ya);
1173     PetscCall(MatSeqAIJKokkosModifyDevice(Y));
1174   } else if (pattern == SUBSET_NONZERO_PATTERN) {
1175     MatRowMapKokkosView Xi = xkok->i_dual.view_device(), Yi = ykok->i_dual.view_device();
1176     MatColIdxKokkosView Xj = xkok->j_dual.view_device(), Yj = ykok->j_dual.view_device();
1177 
1178     Kokkos::parallel_for(
1179       Kokkos::TeamPolicy<>(Y->rmap->n, 1), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
1180         PetscInt i = t.league_rank(); // row i
1181         Kokkos::single(Kokkos::PerTeam(t), [=]() {
1182           // Only one thread works in a team
1183           PetscInt p, q = Yi(i);
1184           for (p = Xi(i); p < Xi(i + 1); p++) {          // For each nonzero on row i of X,
1185             while (Xj(p) != Yj(q) && q < Yi(i + 1)) q++; // find the matching nonzero on row i of Y.
1186             if (Xj(p) == Yj(q)) {                        // Found it
1187               Ya(q) += alpha * Xa(p);
1188               q++;
1189             } else {
1190             // If not found, it indicates the input is wrong (X is not a SUBSET_NONZERO_PATTERN of Y).
1191             // Just insert a NaN at the beginning of row i if it is not empty, to make the result wrong.
1192 #if PETSC_PKG_KOKKOS_VERSION_GE(3, 7, 0)
1193               if (Yi(i) != Yi(i + 1)) Ya(Yi(i)) = Kokkos::ArithTraits<PetscScalar>::nan();
1194 #else
1195               if (Yi(i) != Yi(i + 1)) Ya(Yi(i)) = Kokkos::Experimental::nan("1");
1196 #endif
1197             }
1198           }
1199         });
1200       });
1201     PetscCall(MatSeqAIJKokkosModifyDevice(Y));
1202   } else { // different nonzero patterns
1203     Mat             Z;
1204     KokkosCsrMatrix zcsr;
1205     KernelHandle    kh;
1206     kh.create_spadd_handle(true); // X, Y are sorted
1207     KokkosSparse::spadd_symbolic(&kh, xkok->csrmat, ykok->csrmat, zcsr);
1208     KokkosSparse::spadd_numeric(&kh, alpha, xkok->csrmat, (PetscScalar)1.0, ykok->csrmat, zcsr);
1209     zkok = new Mat_SeqAIJKokkos(zcsr);
1210     PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, zkok, &Z));
1211     PetscCall(MatHeaderReplace(Y, &Z));
1212     kh.destroy_spadd_handle();
1213   }
1214   PetscCall(PetscLogGpuTimeEnd());
1215   PetscCall(PetscLogGpuFlops(xkok->a_dual.extent(0) * 2)); // Because we scaled X and then added it to Y
1216   PetscFunctionReturn(PETSC_SUCCESS);
1217 }
1218 
1219 static PetscErrorCode MatSetPreallocationCOO_SeqAIJKokkos(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
1220 {
1221   Mat_SeqAIJKokkos *akok;
1222   Mat_SeqAIJ       *aseq;
1223 
1224   PetscFunctionBegin;
1225   PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, coo_i, coo_j));
1226   aseq = static_cast<Mat_SeqAIJ *>(mat->data);
1227   akok = static_cast<Mat_SeqAIJKokkos *>(mat->spptr);
1228   delete akok;
1229   mat->spptr = akok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, aseq->nz, aseq->i, aseq->j, aseq->a, mat->nonzerostate + 1, PETSC_FALSE);
1230   PetscCall(MatZeroEntries_SeqAIJKokkos(mat));
1231   akok->SetUpCOO(aseq);
1232   PetscFunctionReturn(PETSC_SUCCESS);
1233 }
1234 
1235 static PetscErrorCode MatSetValuesCOO_SeqAIJKokkos(Mat A, const PetscScalar v[], InsertMode imode)
1236 {
1237   Mat_SeqAIJ                 *aseq = static_cast<Mat_SeqAIJ *>(A->data);
1238   Mat_SeqAIJKokkos           *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1239   PetscCount                  Annz = aseq->nz;
1240   const PetscCountKokkosView &jmap = akok->jmap_d;
1241   const PetscCountKokkosView &perm = akok->perm_d;
1242   MatScalarKokkosView         Aa;
1243   ConstMatScalarKokkosView    kv;
1244   PetscMemType                memtype;
1245 
1246   PetscFunctionBegin;
1247   PetscCall(PetscGetMemType(v, &memtype));
1248   if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */
1249     kv = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), ConstMatScalarKokkosViewHost(v, aseq->coo_n));
1250   } else {
1251     kv = ConstMatScalarKokkosView(v, aseq->coo_n); /* Directly use v[]'s memory */
1252   }
1253 
1254   if (imode == INSERT_VALUES) PetscCall(MatSeqAIJGetKokkosViewWrite(A, &Aa)); /* write matrix values */
1255   else PetscCall(MatSeqAIJGetKokkosView(A, &Aa));                             /* read & write matrix values */
1256 
1257   Kokkos::parallel_for(
1258     Annz, KOKKOS_LAMBDA(const PetscCount i) {
1259       PetscScalar sum = 0.0;
1260       for (PetscCount k = jmap(i); k < jmap(i + 1); k++) sum += kv(perm(k));
1261       Aa(i) = (imode == INSERT_VALUES ? 0.0 : Aa(i)) + sum;
1262     });
1263 
1264   if (imode == INSERT_VALUES) PetscCall(MatSeqAIJRestoreKokkosViewWrite(A, &Aa));
1265   else PetscCall(MatSeqAIJRestoreKokkosView(A, &Aa));
1266   PetscFunctionReturn(PETSC_SUCCESS);
1267 }
1268 
1269 PETSC_INTERN PetscErrorCode MatSeqAIJMoveDiagonalValuesFront_SeqAIJKokkos(Mat A, const PetscInt *diag)
1270 {
1271   Mat_SeqAIJKokkos          *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1272   MatScalarKokkosView        Aa;
1273   const MatRowMapKokkosView &Ai = akok->i_dual.view_device();
1274   PetscInt                   m  = A->rmap->n;
1275   ConstMatRowMapKokkosView   Adiag(diag, m); /* diag is a device pointer */
1276 
1277   PetscFunctionBegin;
1278   PetscCall(MatSeqAIJGetKokkosViewWrite(A, &Aa));
1279   Kokkos::parallel_for(
1280     m, KOKKOS_LAMBDA(const PetscInt i) {
1281       PetscScalar tmp;
1282       if (Adiag(i) >= Ai(i) && Adiag(i) < Ai(i + 1)) { /* The diagonal element exists */
1283         tmp          = Aa(Ai(i));
1284         Aa(Ai(i))    = Aa(Adiag(i));
1285         Aa(Adiag(i)) = tmp;
1286       }
1287     });
1288   PetscCall(MatSeqAIJRestoreKokkosViewWrite(A, &Aa));
1289   PetscFunctionReturn(PETSC_SUCCESS);
1290 }
1291 
1292 static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info)
1293 {
1294   PetscFunctionBegin;
1295   PetscCall(MatSeqAIJKokkosSyncHost(A));
1296   PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info));
1297   B->offloadmask = PETSC_OFFLOAD_CPU;
1298   PetscFunctionReturn(PETSC_SUCCESS);
1299 }
1300 
1301 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A)
1302 {
1303   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
1304 
1305   PetscFunctionBegin;
1306   A->offloadmask = PETSC_OFFLOAD_KOKKOS; /* We do not really use this flag */
1307   A->boundtocpu  = PETSC_FALSE;
1308 
1309   A->ops->assemblyend               = MatAssemblyEnd_SeqAIJKokkos;
1310   A->ops->destroy                   = MatDestroy_SeqAIJKokkos;
1311   A->ops->duplicate                 = MatDuplicate_SeqAIJKokkos;
1312   A->ops->axpy                      = MatAXPY_SeqAIJKokkos;
1313   A->ops->scale                     = MatScale_SeqAIJKokkos;
1314   A->ops->zeroentries               = MatZeroEntries_SeqAIJKokkos;
1315   A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJKokkos;
1316   A->ops->mult                      = MatMult_SeqAIJKokkos;
1317   A->ops->multadd                   = MatMultAdd_SeqAIJKokkos;
1318   A->ops->multtranspose             = MatMultTranspose_SeqAIJKokkos;
1319   A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJKokkos;
1320   A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJKokkos;
1321   A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos;
1322   A->ops->productnumeric            = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos;
1323   A->ops->transpose                 = MatTranspose_SeqAIJKokkos;
1324   A->ops->setoption                 = MatSetOption_SeqAIJKokkos;
1325   A->ops->getdiagonal               = MatGetDiagonal_SeqAIJKokkos;
1326   a->ops->getarray                  = MatSeqAIJGetArray_SeqAIJKokkos;
1327   a->ops->restorearray              = MatSeqAIJRestoreArray_SeqAIJKokkos;
1328   a->ops->getarrayread              = MatSeqAIJGetArrayRead_SeqAIJKokkos;
1329   a->ops->restorearrayread          = MatSeqAIJRestoreArrayRead_SeqAIJKokkos;
1330   a->ops->getarraywrite             = MatSeqAIJGetArrayWrite_SeqAIJKokkos;
1331   a->ops->restorearraywrite         = MatSeqAIJRestoreArrayWrite_SeqAIJKokkos;
1332   a->ops->getcsrandmemtype          = MatSeqAIJGetCSRAndMemType_SeqAIJKokkos;
1333 
1334   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJKokkos));
1335   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJKokkos));
1336   PetscFunctionReturn(PETSC_SUCCESS);
1337 }
1338 
1339 PETSC_INTERN PetscErrorCode MatSetSeqAIJKokkosWithCSRMatrix(Mat A, Mat_SeqAIJKokkos *akok)
1340 {
1341   Mat_SeqAIJ *aseq;
1342   PetscInt    i, m, n;
1343 
1344   PetscFunctionBegin;
1345   PetscCheck(!A->spptr, PETSC_COMM_SELF, PETSC_ERR_PLIB, "A->spptr is supposed to be empty");
1346 
1347   m = akok->nrows();
1348   n = akok->ncols();
1349   PetscCall(MatSetSizes(A, m, n, m, n));
1350   PetscCall(MatSetType(A, MATSEQAIJKOKKOS));
1351 
1352   /* Set up data structures of A as a MATSEQAIJ */
1353   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(A, MAT_SKIP_ALLOCATION, NULL));
1354   aseq = (Mat_SeqAIJ *)(A)->data;
1355 
1356   akok->i_dual.sync_host(); /* We always need sync'ed i, j on host */
1357   akok->j_dual.sync_host();
1358 
1359   aseq->i            = akok->i_host_data();
1360   aseq->j            = akok->j_host_data();
1361   aseq->a            = akok->a_host_data();
1362   aseq->nonew        = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/
1363   aseq->singlemalloc = PETSC_FALSE;
1364   aseq->free_a       = PETSC_FALSE;
1365   aseq->free_ij      = PETSC_FALSE;
1366   aseq->nz           = akok->nnz();
1367   aseq->maxnz        = aseq->nz;
1368 
1369   PetscCall(PetscMalloc1(m, &aseq->imax));
1370   PetscCall(PetscMalloc1(m, &aseq->ilen));
1371   for (i = 0; i < m; i++) aseq->ilen[i] = aseq->imax[i] = aseq->i[i + 1] - aseq->i[i];
1372 
1373   /* It is critical to set the nonzerostate, as we use it to check if sparsity pattern (hence data) has changed on host in MatAssemblyEnd */
1374   akok->nonzerostate = A->nonzerostate;
1375   A->spptr           = akok; /* Set A->spptr before MatAssembly so that A->spptr won't be allocated again there */
1376   PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY));
1377   PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY));
1378   PetscFunctionReturn(PETSC_SUCCESS);
1379 }
1380 
1381 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGetKokkosCsrMatrix(Mat A, KokkosCsrMatrix *csr)
1382 {
1383   PetscFunctionBegin;
1384   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1385   *csr = static_cast<Mat_SeqAIJKokkos *>(A->spptr)->csrmat;
1386   PetscFunctionReturn(PETSC_SUCCESS);
1387 }
1388 
1389 PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithKokkosCsrMatrix(MPI_Comm comm, KokkosCsrMatrix csr, Mat *A)
1390 {
1391   Mat_SeqAIJKokkos *akok;
1392   PetscFunctionBegin;
1393   PetscCallCXX(akok = new Mat_SeqAIJKokkos(csr));
1394   PetscCall(MatCreate(comm, A));
1395   PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok));
1396   PetscFunctionReturn(PETSC_SUCCESS);
1397 }
1398 
1399 /* Crete a SEQAIJKOKKOS matrix with a Mat_SeqAIJKokkos data structure
1400 
1401    Note we have names like MatSeqAIJSetPreallocationCSR, so I use capitalized CSR
1402  */
1403 PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithCSRMatrix(MPI_Comm comm, Mat_SeqAIJKokkos *akok, Mat *A)
1404 {
1405   PetscFunctionBegin;
1406   PetscCall(MatCreate(comm, A));
1407   PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok));
1408   PetscFunctionReturn(PETSC_SUCCESS);
1409 }
1410 
1411 /*@C
1412    MatCreateSeqAIJKokkos - Creates a sparse matrix in `MATSEQAIJKOKKOS` (compressed row) format
1413    (the default parallel PETSc format). This matrix will ultimately be handled by
1414    Kokkos for calculations. For good matrix
1415    assembly performance the user should preallocate the matrix storage by setting
1416    the parameter nz (or the array nnz).  By setting these parameters accurately,
1417    performance during matrix assembly can be increased by more than a factor of 50.
1418 
1419    Collective
1420 
1421    Input Parameters:
1422 +  comm - MPI communicator, set to `PETSC_COMM_SELF`
1423 .  m - number of rows
1424 .  n - number of columns
1425 .  nz - number of nonzeros per row (same for all rows)
1426 -  nnz - array containing the number of nonzeros in the various rows
1427          (possibly different for each row) or `NULL`
1428 
1429    Output Parameter:
1430 .  A - the matrix
1431 
1432    Level: intermediate
1433 
1434    Notes:
1435    It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
1436    MatXXXXSetPreallocation() paradgm instead of this routine directly.
1437    [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]
1438 
1439    If `nnz` is given then `nz` is ignored
1440 
1441    The AIJ format, also called
1442    compressed row storage, is fully compatible with standard Fortran
1443    storage.  That is, the stored row and column indices can begin at
1444    either one (as in Fortran) or zero.  See the users' manual for details.
1445 
1446    Specify the preallocated storage with either `nz` or `nnz` (not both).
1447    Set `nz` = `PETSC_DEFAULT` and `nnz` = `NULL` for PETSc to control dynamic memory
1448    allocation.
1449 
1450    By default, this format uses inodes (identical nodes) when possible, to
1451    improve numerical efficiency of matrix-vector products and solves. We
1452    search for consecutive rows with the same nonzero structure, thereby
1453    reusing matrix information to achieve increased efficiency.
1454 
1455 .seealso: [](chapter_matrices), `Mat`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MatCreateAIJ()`
1456 @*/
1457 PetscErrorCode MatCreateSeqAIJKokkos(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A)
1458 {
1459   PetscFunctionBegin;
1460   PetscCall(PetscKokkosInitializeCheck());
1461   PetscCall(MatCreate(comm, A));
1462   PetscCall(MatSetSizes(*A, m, n, m, n));
1463   PetscCall(MatSetType(*A, MATSEQAIJKOKKOS));
1464   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz));
1465   PetscFunctionReturn(PETSC_SUCCESS);
1466 }
1467 
1468 typedef Kokkos::TeamPolicy<>::member_type team_member;
1469 //
1470 // This factorization exploits block diagonal matrices with "Nf" (not used).
1471 // Use -pc_factor_mat_ordering_type rcm to order decouple blocks of size N/Nf for this optimization
1472 //
1473 static PetscErrorCode MatLUFactorNumeric_SeqAIJKOKKOSDEVICE(Mat B, Mat A, const MatFactorInfo *info)
1474 {
1475   Mat_SeqAIJ       *b      = (Mat_SeqAIJ *)B->data;
1476   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr), *baijkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
1477   IS                isrow = b->row, isicol = b->icol;
1478   const PetscInt   *r_h, *ic_h;
1479   const PetscInt n = A->rmap->n, *ai_d = aijkok->i_dual.view_device().data(), *aj_d = aijkok->j_dual.view_device().data(), *bi_d = baijkok->i_dual.view_device().data(), *bj_d = baijkok->j_dual.view_device().data(), *bdiag_d = baijkok->diag_d.data();
1480   const PetscScalar *aa_d = aijkok->a_dual.view_device().data();
1481   PetscScalar       *ba_d = baijkok->a_dual.view_device().data();
1482   PetscBool          row_identity, col_identity;
1483   PetscInt           nc, Nf = 1, nVec = 32; // should be a parameter, Nf is batch size - not used
1484 
1485   PetscFunctionBegin;
1486   PetscCheck(A->rmap->n == n, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "square matrices only supported %" PetscInt_FMT " %" PetscInt_FMT, A->rmap->n, n);
1487   PetscCall(MatIsStructurallySymmetric(A, &row_identity));
1488   PetscCheck(row_identity, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "structurally symmetric matrices only supported");
1489   PetscCall(ISGetIndices(isrow, &r_h));
1490   PetscCall(ISGetIndices(isicol, &ic_h));
1491   PetscCall(ISGetSize(isicol, &nc));
1492   PetscCall(PetscLogGpuTimeBegin());
1493   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1494   {
1495 #define KOKKOS_SHARED_LEVEL 1
1496     using scr_mem_t    = Kokkos::DefaultExecutionSpace::scratch_memory_space;
1497     using sizet_scr_t  = Kokkos::View<size_t, scr_mem_t>;
1498     using scalar_scr_t = Kokkos::View<PetscScalar, scr_mem_t>;
1499     const Kokkos::View<const PetscInt *, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged>> h_r_k(r_h, n);
1500     Kokkos::View<PetscInt *, Kokkos::LayoutLeft>                                                                         d_r_k("r", n);
1501     const Kokkos::View<const PetscInt *, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged>> h_ic_k(ic_h, nc);
1502     Kokkos::View<PetscInt *, Kokkos::LayoutLeft>                                                                         d_ic_k("ic", nc);
1503     size_t                                                                                                               flops_h = 0.0;
1504     Kokkos::View<size_t, Kokkos::HostSpace>                                                                              h_flops_k(&flops_h);
1505     Kokkos::View<size_t>                                                                                                 d_flops_k("flops");
1506     const int                                                                                                            conc = Kokkos::DefaultExecutionSpace().concurrency(), team_size = conc > 1 ? 16 : 1; // 8*32 = 256
1507     const int                                                                                                            nloc = n / Nf, Ni = (conc > 8) ? 1 /* some intelligent number of SMs -- but need league_barrier */ : 1;
1508     Kokkos::deep_copy(d_flops_k, h_flops_k);
1509     Kokkos::deep_copy(d_r_k, h_r_k);
1510     Kokkos::deep_copy(d_ic_k, h_ic_k);
1511     // Fill A --> fact
1512     Kokkos::parallel_for(
1513       Kokkos::TeamPolicy<>(Nf * Ni, team_size, nVec), KOKKOS_LAMBDA(const team_member team) {
1514         const PetscInt  field = team.league_rank() / Ni, field_block = team.league_rank() % Ni; // use grid.x/y in CUDA
1515         const PetscInt  nloc_i = (nloc / Ni + !!(nloc % Ni)), start_i = field * nloc + field_block * nloc_i, end_i = (start_i + nloc_i) > (field + 1) * nloc ? (field + 1) * nloc : (start_i + nloc_i);
1516         const PetscInt *ic = d_ic_k.data(), *r = d_r_k.data();
1517         // zero rows of B
1518         Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=](const int &rowb) {
1519           PetscInt     nzbL = bi_d[rowb + 1] - bi_d[rowb], nzbU = bdiag_d[rowb] - bdiag_d[rowb + 1]; // with diag
1520           PetscScalar *baL = ba_d + bi_d[rowb];
1521           PetscScalar *baU = ba_d + bdiag_d[rowb + 1] + 1;
1522           /* zero (unfactored row) */
1523           for (int j = 0; j < nzbL; j++) baL[j] = 0;
1524           for (int j = 0; j < nzbU; j++) baU[j] = 0;
1525         });
1526         // copy A into B
1527         Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=](const int &rowb) {
1528           PetscInt           rowa = r[rowb], nza = ai_d[rowa + 1] - ai_d[rowa];
1529           const PetscScalar *av    = aa_d + ai_d[rowa];
1530           const PetscInt    *ajtmp = aj_d + ai_d[rowa];
1531           /* load in initial (unfactored row) */
1532           for (int j = 0; j < nza; j++) {
1533             PetscInt    colb = ic[ajtmp[j]];
1534             PetscScalar vala = av[j];
1535             if (colb == rowb) {
1536               *(ba_d + bdiag_d[rowb]) = vala;
1537             } else {
1538               const PetscInt *pbj = bj_d + ((colb > rowb) ? bdiag_d[rowb + 1] + 1 : bi_d[rowb]);
1539               PetscScalar    *pba = ba_d + ((colb > rowb) ? bdiag_d[rowb + 1] + 1 : bi_d[rowb]);
1540               PetscInt        nz = (colb > rowb) ? bdiag_d[rowb] - (bdiag_d[rowb + 1] + 1) : bi_d[rowb + 1] - bi_d[rowb], set = 0;
1541               for (int j = 0; j < nz; j++) {
1542                 if (pbj[j] == colb) {
1543                   pba[j] = vala;
1544                   set++;
1545                   break;
1546                 }
1547               }
1548 #if !defined(PETSC_HAVE_SYCL)
1549               if (set != 1) printf("\t\t\t ERROR DID NOT SET ?????\n");
1550 #endif
1551             }
1552           }
1553         });
1554       });
1555     Kokkos::fence();
1556 
1557     Kokkos::parallel_for(
1558       Kokkos::TeamPolicy<>(Nf * Ni, team_size, nVec).set_scratch_size(KOKKOS_SHARED_LEVEL, Kokkos::PerThread(sizet_scr_t::shmem_size() + scalar_scr_t::shmem_size()), Kokkos::PerTeam(sizet_scr_t::shmem_size())), KOKKOS_LAMBDA(const team_member team) {
1559         sizet_scr_t    colkIdx(team.thread_scratch(KOKKOS_SHARED_LEVEL));
1560         scalar_scr_t   L_ki(team.thread_scratch(KOKKOS_SHARED_LEVEL));
1561         sizet_scr_t    flops(team.team_scratch(KOKKOS_SHARED_LEVEL));
1562         const PetscInt field = team.league_rank() / Ni, field_block_idx = team.league_rank() % Ni; // use grid.x/y in CUDA
1563         const PetscInt start = field * nloc, end = start + nloc;
1564         Kokkos::single(Kokkos::PerTeam(team), [=]() { flops() = 0; });
1565         // A22 panel update for each row A(1,:) and col A(:,1)
1566         for (int ii = start; ii < end - 1; ii++) {
1567           const PetscInt    *bjUi = bj_d + bdiag_d[ii + 1] + 1, nzUi = bdiag_d[ii] - (bdiag_d[ii + 1] + 1); // vector, and vector size, of column indices of U(i,(i+1):end)
1568           const PetscScalar *baUi    = ba_d + bdiag_d[ii + 1] + 1;                                          // vector of data  U(i,i+1:end)
1569           const PetscInt     nUi_its = nzUi / Ni + !!(nzUi % Ni);
1570           const PetscScalar  Bii     = *(ba_d + bdiag_d[ii]); // diagonal in its special place
1571           Kokkos::parallel_for(Kokkos::TeamThreadRange(team, nUi_its), [=](const int j) {
1572             PetscInt kIdx = j * Ni + field_block_idx;
1573             if (kIdx >= nzUi) /* void */
1574               ;
1575             else {
1576               const PetscInt  myk = bjUi[kIdx];                // assume symmetric structure, need a transposed meta-data here in general
1577               const PetscInt *pjL = bj_d + bi_d[myk];          // look for L(myk,ii) in start of row
1578               const PetscInt  nzL = bi_d[myk + 1] - bi_d[myk]; // size of L_k(:)
1579               size_t          st_idx;
1580               // find and do L(k,i) = A(:k,i) / A(i,i)
1581               Kokkos::single(Kokkos::PerThread(team), [&]() { colkIdx() = PETSC_MAX_INT; });
1582               // get column, there has got to be a better way
1583               Kokkos::parallel_reduce(
1584                 Kokkos::ThreadVectorRange(team, nzL),
1585                 [&](const int &j, size_t &idx) {
1586                   if (pjL[j] == ii) {
1587                     PetscScalar *pLki = ba_d + bi_d[myk] + j;
1588                     idx               = j;           // output
1589                     *pLki             = *pLki / Bii; // column scaling:  L(k,i) = A(:k,i) / A(i,i)
1590                   }
1591                 },
1592                 st_idx);
1593               Kokkos::single(Kokkos::PerThread(team), [=]() {
1594                 colkIdx() = st_idx;
1595                 L_ki()    = *(ba_d + bi_d[myk] + st_idx);
1596               });
1597 #if defined(PETSC_USE_DEBUG) && !defined(PETSC_HAVE_SYCL)
1598               if (colkIdx() == PETSC_MAX_INT) printf("\t\t\t\t\t\t\tERROR: failed to find L_ki(%d,%d)\n", (int)myk, ii); // uses a register
1599 #endif
1600               // active row k, do  A_kj -= Lki * U_ij; j \in U(i,:) j != i
1601               // U(i+1,:end)
1602               Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, nzUi), [=](const int &uiIdx) { // index into i (U)
1603                 PetscScalar Uij = baUi[uiIdx];
1604                 PetscInt    col = bjUi[uiIdx];
1605                 if (col == myk) {
1606                   // A_kk = A_kk - L_ki * U_ij(k)
1607                   PetscScalar *Akkv = (ba_d + bdiag_d[myk]); // diagonal in its special place
1608                   *Akkv             = *Akkv - L_ki() * Uij;  // UiK
1609                 } else {
1610                   PetscScalar    *start, *end, *pAkjv = NULL;
1611                   PetscInt        high, low;
1612                   const PetscInt *startj;
1613                   if (col < myk) { // L
1614                     PetscScalar *pLki = ba_d + bi_d[myk] + colkIdx();
1615                     PetscInt     idx  = (pLki + 1) - (ba_d + bi_d[myk]); // index into row
1616                     start             = pLki + 1;                        // start at pLki+1, A22(myk,1)
1617                     startj            = bj_d + bi_d[myk] + idx;
1618                     end               = ba_d + bi_d[myk + 1];
1619                   } else {
1620                     PetscInt idx = bdiag_d[myk + 1] + 1;
1621                     start        = ba_d + idx;
1622                     startj       = bj_d + idx;
1623                     end          = ba_d + bdiag_d[myk];
1624                   }
1625                   // search for 'col', use bisection search - TODO
1626                   low  = 0;
1627                   high = (PetscInt)(end - start);
1628                   while (high - low > 5) {
1629                     int t = (low + high) / 2;
1630                     if (startj[t] > col) high = t;
1631                     else low = t;
1632                   }
1633                   for (pAkjv = start + low; pAkjv < start + high; pAkjv++) {
1634                     if (startj[pAkjv - start] == col) break;
1635                   }
1636 #if defined(PETSC_USE_DEBUG) && !defined(PETSC_HAVE_SYCL)
1637                   if (pAkjv == start + high) printf("\t\t\t\t\t\t\t\t\t\t\tERROR: *** failed to find Akj(%d,%d)\n", (int)myk, (int)col); // uses a register
1638 #endif
1639                   *pAkjv = *pAkjv - L_ki() * Uij; // A_kj = A_kj - L_ki * U_ij
1640                 }
1641               });
1642             }
1643           });
1644           team.team_barrier(); // this needs to be a league barrier to use more that one SM per block
1645           if (field_block_idx == 0) Kokkos::single(Kokkos::PerTeam(team), [&]() { Kokkos::atomic_add(flops.data(), (size_t)(2 * (nzUi * nzUi) + 2)); });
1646         } /* endof for (i=0; i<n; i++) { */
1647         Kokkos::single(Kokkos::PerTeam(team), [=]() {
1648           Kokkos::atomic_add(&d_flops_k(), flops());
1649           flops() = 0;
1650         });
1651       });
1652     Kokkos::fence();
1653     Kokkos::deep_copy(h_flops_k, d_flops_k);
1654     PetscCall(PetscLogGpuFlops((PetscLogDouble)h_flops_k()));
1655     Kokkos::parallel_for(
1656       Kokkos::TeamPolicy<>(Nf * Ni, 1, 256), KOKKOS_LAMBDA(const team_member team) {
1657         const PetscInt lg_rank = team.league_rank(), field = lg_rank / Ni;                            //, field_offset = lg_rank%Ni;
1658         const PetscInt start = field * nloc, end = start + nloc, n_its = (nloc / Ni + !!(nloc % Ni)); // 1/Ni iters
1659         /* Invert diagonal for simpler triangular solves */
1660         Kokkos::parallel_for(Kokkos::TeamVectorRange(team, n_its), [=](int outer_index) {
1661           int i = start + outer_index * Ni + lg_rank % Ni;
1662           if (i < end) {
1663             PetscScalar *pv = ba_d + bdiag_d[i];
1664             *pv             = 1.0 / (*pv);
1665           }
1666         });
1667       });
1668   }
1669   PetscCall(PetscLogGpuTimeEnd());
1670   PetscCall(ISRestoreIndices(isicol, &ic_h));
1671   PetscCall(ISRestoreIndices(isrow, &r_h));
1672 
1673   PetscCall(ISIdentity(isrow, &row_identity));
1674   PetscCall(ISIdentity(isicol, &col_identity));
1675   if (b->inode.size) {
1676     B->ops->solve = MatSolve_SeqAIJ_Inode;
1677   } else if (row_identity && col_identity) {
1678     B->ops->solve = MatSolve_SeqAIJ_NaturalOrdering;
1679   } else {
1680     B->ops->solve = MatSolve_SeqAIJ; // at least this needs to be in Kokkos
1681   }
1682   B->offloadmask = PETSC_OFFLOAD_GPU;
1683   PetscCall(MatSeqAIJKokkosSyncHost(B));          // solve on CPU
1684   B->ops->solveadd          = MatSolveAdd_SeqAIJ; // and this
1685   B->ops->solvetranspose    = MatSolveTranspose_SeqAIJ;
1686   B->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ;
1687   B->ops->matsolve          = MatMatSolve_SeqAIJ;
1688   B->assembled              = PETSC_TRUE;
1689   B->preallocated           = PETSC_TRUE;
1690 
1691   PetscFunctionReturn(PETSC_SUCCESS);
1692 }
1693 
1694 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
1695 {
1696   PetscFunctionBegin;
1697   PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
1698   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos;
1699   PetscFunctionReturn(PETSC_SUCCESS);
1700 }
1701 
1702 static PetscErrorCode MatSeqAIJKokkosSymbolicSolveCheck(Mat A)
1703 {
1704   Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1705 
1706   PetscFunctionBegin;
1707   if (!factors->sptrsv_symbolic_completed) {
1708     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khU, factors->iU_d, factors->jU_d, factors->aU_d);
1709     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khL, factors->iL_d, factors->jL_d, factors->aL_d);
1710     factors->sptrsv_symbolic_completed = PETSC_TRUE;
1711   }
1712   PetscFunctionReturn(PETSC_SUCCESS);
1713 }
1714 
1715 /* Check if we need to update factors etc for transpose solve */
1716 static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A)
1717 {
1718   Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1719   MatColIdxType               n       = A->rmap->n;
1720 
1721   PetscFunctionBegin;
1722   if (!factors->transpose_updated) { /* TODO: KK needs to provide functions to do numeric transpose only */
1723     /* Update L^T and do sptrsv symbolic */
1724     factors->iLt_d = MatRowMapKokkosView("factors->iLt_d", n + 1);
1725     Kokkos::deep_copy(factors->iLt_d, 0); /* KK requires 0 */
1726     factors->jLt_d = MatColIdxKokkosView("factors->jLt_d", factors->jL_d.extent(0));
1727     factors->aLt_d = MatScalarKokkosView("factors->aLt_d", factors->aL_d.extent(0));
1728 
1729     transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iL_d, factors->jL_d, factors->aL_d,
1730                                                                                                                                                                                                               factors->iLt_d, factors->jLt_d, factors->aLt_d);
1731 
1732     /* TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices.
1733       We have to sort the indices, until KK provides finer control options.
1734     */
1735     sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iLt_d, factors->jLt_d, factors->aLt_d);
1736 
1737     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d);
1738 
1739     /* Update U^T and do sptrsv symbolic */
1740     factors->iUt_d = MatRowMapKokkosView("factors->iUt_d", n + 1);
1741     Kokkos::deep_copy(factors->iUt_d, 0); /* KK requires 0 */
1742     factors->jUt_d = MatColIdxKokkosView("factors->jUt_d", factors->jU_d.extent(0));
1743     factors->aUt_d = MatScalarKokkosView("factors->aUt_d", factors->aU_d.extent(0));
1744 
1745     transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iU_d, factors->jU_d, factors->aU_d,
1746                                                                                                                                                                                                               factors->iUt_d, factors->jUt_d, factors->aUt_d);
1747 
1748     /* Sort indices. See comments above */
1749     sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iUt_d, factors->jUt_d, factors->aUt_d);
1750 
1751     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d);
1752     factors->transpose_updated = PETSC_TRUE;
1753   }
1754   PetscFunctionReturn(PETSC_SUCCESS);
1755 }
1756 
1757 /* Solve Ax = b, with A = LU */
1758 static PetscErrorCode MatSolve_SeqAIJKokkos(Mat A, Vec b, Vec x)
1759 {
1760   ConstPetscScalarKokkosView  bv;
1761   PetscScalarKokkosView       xv;
1762   Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1763 
1764   PetscFunctionBegin;
1765   PetscCall(PetscLogGpuTimeBegin());
1766   PetscCall(MatSeqAIJKokkosSymbolicSolveCheck(A));
1767   PetscCall(VecGetKokkosView(b, &bv));
1768   PetscCall(VecGetKokkosViewWrite(x, &xv));
1769   /* Solve L tmpv = b */
1770   PetscCallCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khL, factors->iL_d, factors->jL_d, factors->aL_d, bv, factors->workVector));
1771   /* Solve Ux = tmpv */
1772   PetscCallCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khU, factors->iU_d, factors->jU_d, factors->aU_d, factors->workVector, xv));
1773   PetscCall(VecRestoreKokkosView(b, &bv));
1774   PetscCall(VecRestoreKokkosViewWrite(x, &xv));
1775   PetscCall(PetscLogGpuTimeEnd());
1776   PetscFunctionReturn(PETSC_SUCCESS);
1777 }
1778 
1779 /* Solve A^T x = b, where A^T = U^T L^T */
1780 static PetscErrorCode MatSolveTranspose_SeqAIJKokkos(Mat A, Vec b, Vec x)
1781 {
1782   ConstPetscScalarKokkosView  bv;
1783   PetscScalarKokkosView       xv;
1784   Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1785 
1786   PetscFunctionBegin;
1787   PetscCall(PetscLogGpuTimeBegin());
1788   PetscCall(MatSeqAIJKokkosTransposeSolveCheck(A));
1789   PetscCall(VecGetKokkosView(b, &bv));
1790   PetscCall(VecGetKokkosViewWrite(x, &xv));
1791   /* Solve U^T tmpv = b */
1792   KokkosSparse::Experimental::sptrsv_solve(&factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d, bv, factors->workVector);
1793 
1794   /* Solve L^T x = tmpv */
1795   KokkosSparse::Experimental::sptrsv_solve(&factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d, factors->workVector, xv);
1796   PetscCall(VecRestoreKokkosView(b, &bv));
1797   PetscCall(VecRestoreKokkosViewWrite(x, &xv));
1798   PetscCall(PetscLogGpuTimeEnd());
1799   PetscFunctionReturn(PETSC_SUCCESS);
1800 }
1801 
1802 static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info)
1803 {
1804   Mat_SeqAIJKokkos           *aijkok   = (Mat_SeqAIJKokkos *)A->spptr;
1805   Mat_SeqAIJKokkosTriFactors *factors  = (Mat_SeqAIJKokkosTriFactors *)B->spptr;
1806   PetscInt                    fill_lev = info->levels;
1807 
1808   PetscFunctionBegin;
1809   PetscCall(PetscLogGpuTimeBegin());
1810   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1811 
1812   auto a_d = aijkok->a_dual.view_device();
1813   auto i_d = aijkok->i_dual.view_device();
1814   auto j_d = aijkok->j_dual.view_device();
1815 
1816   KokkosSparse::Experimental::spiluk_numeric(&factors->kh, fill_lev, i_d, j_d, a_d, factors->iL_d, factors->jL_d, factors->aL_d, factors->iU_d, factors->jU_d, factors->aU_d);
1817 
1818   B->assembled              = PETSC_TRUE;
1819   B->preallocated           = PETSC_TRUE;
1820   B->ops->solve             = MatSolve_SeqAIJKokkos;
1821   B->ops->solvetranspose    = MatSolveTranspose_SeqAIJKokkos;
1822   B->ops->matsolve          = NULL;
1823   B->ops->matsolvetranspose = NULL;
1824   B->offloadmask            = PETSC_OFFLOAD_GPU;
1825 
1826   /* Once the factors' value changed, we need to update their transpose and sptrsv handle */
1827   factors->transpose_updated         = PETSC_FALSE;
1828   factors->sptrsv_symbolic_completed = PETSC_FALSE;
1829   /* TODO: log flops, but how to know that? */
1830   PetscCall(PetscLogGpuTimeEnd());
1831   PetscFunctionReturn(PETSC_SUCCESS);
1832 }
1833 
1834 static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
1835 {
1836   Mat_SeqAIJKokkos           *aijkok;
1837   Mat_SeqAIJ                 *b;
1838   Mat_SeqAIJKokkosTriFactors *factors  = (Mat_SeqAIJKokkosTriFactors *)B->spptr;
1839   PetscInt                    fill_lev = info->levels;
1840   PetscInt                    nnzA     = ((Mat_SeqAIJ *)A->data)->nz, nnzL, nnzU;
1841   PetscInt                    n        = A->rmap->n;
1842 
1843   PetscFunctionBegin;
1844   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1845   /* Rebuild factors */
1846   if (factors) {
1847     factors->Destroy();
1848   } /* Destroy the old if it exists */
1849   else {
1850     B->spptr = factors = new Mat_SeqAIJKokkosTriFactors(n);
1851   }
1852 
1853   /* Create a spiluk handle and then do symbolic factorization */
1854   nnzL = nnzU = PetscRealIntMultTruncate(info->fill, nnzA);
1855   factors->kh.create_spiluk_handle(KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1, n, nnzL, nnzU);
1856 
1857   auto spiluk_handle = factors->kh.get_spiluk_handle();
1858 
1859   Kokkos::realloc(factors->iL_d, n + 1); /* Free old arrays and realloc */
1860   Kokkos::realloc(factors->jL_d, spiluk_handle->get_nnzL());
1861   Kokkos::realloc(factors->iU_d, n + 1);
1862   Kokkos::realloc(factors->jU_d, spiluk_handle->get_nnzU());
1863 
1864   aijkok   = (Mat_SeqAIJKokkos *)A->spptr;
1865   auto i_d = aijkok->i_dual.view_device();
1866   auto j_d = aijkok->j_dual.view_device();
1867   KokkosSparse::Experimental::spiluk_symbolic(&factors->kh, fill_lev, i_d, j_d, factors->iL_d, factors->jL_d, factors->iU_d, factors->jU_d);
1868   /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */
1869 
1870   Kokkos::resize(factors->jL_d, spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */
1871   Kokkos::resize(factors->jU_d, spiluk_handle->get_nnzU());
1872   Kokkos::realloc(factors->aL_d, spiluk_handle->get_nnzL()); /* No need to retain old value */
1873   Kokkos::realloc(factors->aU_d, spiluk_handle->get_nnzU());
1874 
1875   /* TODO: add options to select sptrsv algorithms */
1876   /* Create sptrsv handles for L, U and their transpose */
1877 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
1878   auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SPTRSV_CUSPARSE;
1879 #else
1880   auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1;
1881 #endif
1882 
1883   factors->khL.create_sptrsv_handle(sptrsv_alg, n, true /* L is lower tri */);
1884   factors->khU.create_sptrsv_handle(sptrsv_alg, n, false /* U is not lower tri */);
1885   factors->khLt.create_sptrsv_handle(sptrsv_alg, n, false /* L^T is not lower tri */);
1886   factors->khUt.create_sptrsv_handle(sptrsv_alg, n, true /* U^T is lower tri */);
1887 
1888   /* Fill fields of the factor matrix B */
1889   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(B, MAT_SKIP_ALLOCATION, NULL));
1890   b     = (Mat_SeqAIJ *)B->data;
1891   b->nz = b->maxnz          = spiluk_handle->get_nnzL() + spiluk_handle->get_nnzU();
1892   B->info.fill_ratio_given  = info->fill;
1893   B->info.fill_ratio_needed = ((PetscReal)b->nz) / ((PetscReal)nnzA);
1894 
1895   B->offloadmask          = PETSC_OFFLOAD_GPU;
1896   B->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJKokkos;
1897   PetscFunctionReturn(PETSC_SUCCESS);
1898 }
1899 
1900 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
1901 {
1902   Mat_SeqAIJ    *b     = (Mat_SeqAIJ *)B->data;
1903   const PetscInt nrows = A->rmap->n;
1904 
1905   PetscFunctionBegin;
1906   PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
1907   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKOKKOSDEVICE;
1908   // move B data into Kokkos
1909   PetscCall(MatSeqAIJKokkosSyncDevice(B)); // create aijkok
1910   PetscCall(MatSeqAIJKokkosSyncDevice(A)); // create aijkok
1911   {
1912     Mat_SeqAIJKokkos *baijkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
1913     if (!baijkok->diag_d.extent(0)) {
1914       const Kokkos::View<PetscInt *, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged>> h_diag(b->diag, nrows + 1);
1915       baijkok->diag_d = Kokkos::View<PetscInt *>(Kokkos::create_mirror(DefaultMemorySpace(), h_diag));
1916       Kokkos::deep_copy(baijkok->diag_d, h_diag);
1917     }
1918   }
1919   PetscFunctionReturn(PETSC_SUCCESS);
1920 }
1921 
1922 static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos(Mat A, MatSolverType *type)
1923 {
1924   PetscFunctionBegin;
1925   *type = MATSOLVERKOKKOS;
1926   PetscFunctionReturn(PETSC_SUCCESS);
1927 }
1928 
1929 static PetscErrorCode MatFactorGetSolverType_seqaij_kokkos_device(Mat A, MatSolverType *type)
1930 {
1931   PetscFunctionBegin;
1932   *type = MATSOLVERKOKKOSDEVICE;
1933   PetscFunctionReturn(PETSC_SUCCESS);
1934 }
1935 
1936 /*MC
1937   MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices
1938   on a single GPU of type, `MATSEQAIJKOKKOS`, `MATAIJKOKKOS`.
1939 
1940   Level: beginner
1941 
1942 .seealso: [](chapter_matrices), `Mat`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJKokkos()`, `MATAIJKOKKOS`, `MatKokkosSetFormat()`, `MatKokkosStorageFormat`, `MatKokkosFormatOperation`
1943 M*/
1944 PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A, MatFactorType ftype, Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */
1945 {
1946   PetscInt n = A->rmap->n;
1947 
1948   PetscFunctionBegin;
1949   PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B));
1950   PetscCall(MatSetSizes(*B, n, n, n, n));
1951   (*B)->factortype = ftype;
1952   PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU]));
1953   PetscCall(MatSetType(*B, MATSEQAIJKOKKOS));
1954 
1955   if (ftype == MAT_FACTOR_LU) {
1956     PetscCall(MatSetBlockSizesFromMats(*B, A, A));
1957     (*B)->canuseordering        = PETSC_TRUE;
1958     (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKokkos;
1959   } else if (ftype == MAT_FACTOR_ILU) {
1960     PetscCall(MatSetBlockSizesFromMats(*B, A, A));
1961     (*B)->canuseordering         = PETSC_FALSE;
1962     (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos;
1963   } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]);
1964 
1965   PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL));
1966   PetscCall(PetscObjectComposeFunction((PetscObject)(*B), "MatFactorGetSolverType_C", MatFactorGetSolverType_SeqAIJKokkos));
1967   PetscFunctionReturn(PETSC_SUCCESS);
1968 }
1969 
1970 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijkokkos_kokkos_device(Mat A, MatFactorType ftype, Mat *B)
1971 {
1972   PetscInt n = A->rmap->n;
1973 
1974   PetscFunctionBegin;
1975   PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B));
1976   PetscCall(MatSetSizes(*B, n, n, n, n));
1977   (*B)->factortype     = ftype;
1978   (*B)->canuseordering = PETSC_TRUE;
1979   PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU]));
1980   PetscCall(MatSetType(*B, MATSEQAIJKOKKOS));
1981 
1982   if (ftype == MAT_FACTOR_LU) {
1983     PetscCall(MatSetBlockSizesFromMats(*B, A, A));
1984     (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE;
1985   } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Factor type not supported for KOKKOS Matrix Types");
1986 
1987   PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL));
1988   PetscCall(PetscObjectComposeFunction((PetscObject)(*B), "MatFactorGetSolverType_C", MatFactorGetSolverType_seqaij_kokkos_device));
1989   PetscFunctionReturn(PETSC_SUCCESS);
1990 }
1991 
1992 PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_KOKKOS(void)
1993 {
1994   PetscFunctionBegin;
1995   PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_LU, MatGetFactor_SeqAIJKokkos_Kokkos));
1996   PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_ILU, MatGetFactor_SeqAIJKokkos_Kokkos));
1997   PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOSDEVICE, MATSEQAIJKOKKOS, MAT_FACTOR_LU, MatGetFactor_seqaijkokkos_kokkos_device));
1998   PetscFunctionReturn(PETSC_SUCCESS);
1999 }
2000 
2001 /* Utility to print out a KokkosCsrMatrix for debugging */
2002 PETSC_INTERN PetscErrorCode PrintCsrMatrix(const KokkosCsrMatrix &csrmat)
2003 {
2004   const auto        &iv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), csrmat.graph.row_map);
2005   const auto        &jv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), csrmat.graph.entries);
2006   const auto        &av = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), csrmat.values);
2007   const PetscInt    *i  = iv.data();
2008   const PetscInt    *j  = jv.data();
2009   const PetscScalar *a  = av.data();
2010   PetscInt           m = csrmat.numRows(), n = csrmat.numCols(), nnz = csrmat.nnz();
2011 
2012   PetscFunctionBegin;
2013   PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT " x %" PetscInt_FMT " SeqAIJKokkos, with %" PetscInt_FMT " nonzeros\n", m, n, nnz));
2014   for (PetscInt k = 0; k < m; k++) {
2015     PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT ": ", k));
2016     for (PetscInt p = i[k]; p < i[k + 1]; p++) PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT "(%.1f), ", j[p], (double)PetscRealPart(a[p])));
2017     PetscCall(PetscPrintf(PETSC_COMM_SELF, "\n"));
2018   }
2019   PetscFunctionReturn(PETSC_SUCCESS);
2020 }
2021