xref: /petsc/src/mat/impls/aij/seq/kokkos/aijkok.kokkos.cxx (revision c0c276a7a9f347b22187dda26ae7d35d5b9ed8a2)
1 #include <petsc_kokkos.hpp>
2 #include <petscvec_kokkos.hpp>
3 #include <petscmat_kokkos.hpp>
4 #include <petscpkg_version.h>
5 #include <petsc/private/petscimpl.h>
6 #include <petsc/private/sfimpl.h>
7 #include <petscsystypes.h>
8 #include <petscerror.h>
9 
10 #include <Kokkos_Core.hpp>
11 #include <KokkosBlas.hpp>
12 #include <KokkosSparse_CrsMatrix.hpp>
13 
14 // To suppress compiler warnings:
15 // /path/include/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp:434:63:
16 // warning: 'cusparseStatus_t cusparseDbsrmm(cusparseHandle_t, cusparseDirection_t, cusparseOperation_t,
17 // cusparseOperation_t, int, int, int, int, const double*, cusparseMatDescr_t, const double*, const int*, const int*,
18 // int, const double*, int, const double*, double*, int)' is deprecated: please use cusparseSpMM instead [-Wdeprecated-declarations]
19 PETSC_PRAGMA_DIAGNOSTIC_IGNORED_BEGIN("-Wdeprecated-declarations")
20 #include <KokkosSparse_spmv.hpp>
21 PETSC_PRAGMA_DIAGNOSTIC_IGNORED_END()
22 
23 #include <KokkosSparse_spiluk.hpp>
24 #include <KokkosSparse_sptrsv.hpp>
25 #include <KokkosSparse_spgemm.hpp>
26 #include <KokkosSparse_spadd.hpp>
27 #include <KokkosBatched_LU_Decl.hpp>
28 #include <KokkosBatched_InverseLU_Decl.hpp>
29 
30 #include <../src/mat/impls/aij/seq/kokkos/aijkok.hpp>
31 
32 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_GE(3, 7, 0)
33   #include <KokkosSparse_Utils.hpp>
34 using KokkosSparse::sort_crs_matrix;
35 using KokkosSparse::Impl::transpose_matrix;
36 #else
37   #include <KokkosKernels_Sorting.hpp>
38 using KokkosKernels::sort_crs_matrix;
39 using KokkosKernels::Impl::transpose_matrix;
40 #endif
41 
42 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */
43 
44 /* MatAssemblyEnd_SeqAIJKokkos() happens when we finalized nonzeros of the matrix, either after
45    we assembled the matrix on host, or after we directly produced the matrix data on device (ex., through MatMatMult).
46    In the latter case, it is important to set a_dual's sync state correctly.
47  */
48 static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A, MatAssemblyType mode)
49 {
50   Mat_SeqAIJ       *aijseq;
51   Mat_SeqAIJKokkos *aijkok;
52 
53   PetscFunctionBegin;
54   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS);
55   PetscCall(MatAssemblyEnd_SeqAIJ(A, mode));
56 
57   aijseq = static_cast<Mat_SeqAIJ *>(A->data);
58   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
59 
60   /* If aijkok does not exist, we just copy i, j to device.
61      If aijkok already exists, but the device's nonzero pattern does not match with the host's, we assume the latest data is on host.
62      In both cases, we build a new aijkok structure.
63   */
64   if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { /* aijkok might not exist yet or nonzero pattern has changed */
65     delete aijkok;
66     aijkok   = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aijseq, A->nonzerostate, PETSC_FALSE /*don't copy mat values to device*/);
67     A->spptr = aijkok;
68   } else if (A->rmap->n && aijkok->diag_dual.extent(0) == 0) { // MatProduct might directly produce AIJ on device, but not the diag.
69     MatRowMapKokkosViewHost diag_h(aijseq->diag, A->rmap->n);
70     auto                    diag_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), diag_h);
71     aijkok->diag_dual              = MatRowMapKokkosDualView(diag_d, diag_h);
72   }
73   PetscFunctionReturn(PETSC_SUCCESS);
74 }
75 
76 /* Sync CSR data to device if not yet */
77 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A)
78 {
79   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
80 
81   PetscFunctionBegin;
82   PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Can't sync factorized matrix from host to device");
83   PetscCheck(aijkok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
84   if (aijkok->a_dual.need_sync_device()) {
85     aijkok->a_dual.sync_device();
86     aijkok->transpose_updated = PETSC_FALSE; /* values of the transpose is out-of-date */
87     aijkok->hermitian_updated = PETSC_FALSE;
88   }
89   PetscFunctionReturn(PETSC_SUCCESS);
90 }
91 
92 /* Mark the CSR data on device as modified */
93 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosModifyDevice(Mat A)
94 {
95   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
96 
97   PetscFunctionBegin;
98   PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Not supported for factorized matries");
99   aijkok->a_dual.clear_sync_state();
100   aijkok->a_dual.modify_device();
101   aijkok->transpose_updated = PETSC_FALSE;
102   aijkok->hermitian_updated = PETSC_FALSE;
103   PetscCall(MatSeqAIJInvalidateDiagonal(A));
104   PetscCall(PetscObjectStateIncrease((PetscObject)A));
105   PetscFunctionReturn(PETSC_SUCCESS);
106 }
107 
108 static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A)
109 {
110   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
111   auto              exec   = PetscGetKokkosExecutionSpace();
112 
113   PetscFunctionBegin;
114   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
115   /* We do not expect one needs factors on host  */
116   PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Can't sync factorized matrix from device to host");
117   PetscCheck(aijkok, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing AIJKOK");
118   PetscCallCXX(aijkok->a_dual.sync_host(exec));
119   PetscCallCXX(exec.fence());
120   PetscFunctionReturn(PETSC_SUCCESS);
121 }
122 
123 static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A, PetscScalar *array[])
124 {
125   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
126 
127   PetscFunctionBegin;
128   /* aijkok contains valid pointers only if the host's nonzerostate matches with the device's.
129     Calling MatSeqAIJSetPreallocation() or MatSetValues() on host, where aijseq->{i,j,a} might be
130     reallocated, will lead to stale {i,j,a}_dual in aijkok. In both operations, the hosts's nonzerostate
131     must have been updated. The stale aijkok will be rebuilt during MatAssemblyEnd.
132   */
133   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
134     auto exec = PetscGetKokkosExecutionSpace();
135     PetscCallCXX(aijkok->a_dual.sync_host(exec));
136     PetscCallCXX(exec.fence());
137     *array = aijkok->a_dual.view_host().data();
138   } else { /* Happens when calling MatSetValues on a newly created matrix */
139     *array = static_cast<Mat_SeqAIJ *>(A->data)->a;
140   }
141   PetscFunctionReturn(PETSC_SUCCESS);
142 }
143 
144 static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJKokkos(Mat A, PetscScalar *array[])
145 {
146   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
147 
148   PetscFunctionBegin;
149   if (aijkok && A->nonzerostate == aijkok->nonzerostate) aijkok->a_dual.modify_host();
150   PetscFunctionReturn(PETSC_SUCCESS);
151 }
152 
153 static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[])
154 {
155   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
156 
157   PetscFunctionBegin;
158   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
159     auto exec = PetscGetKokkosExecutionSpace();
160     PetscCallCXX(aijkok->a_dual.sync_host(exec));
161     PetscCallCXX(exec.fence());
162     *array = aijkok->a_dual.view_host().data();
163   } else {
164     *array = static_cast<Mat_SeqAIJ *>(A->data)->a;
165   }
166   PetscFunctionReturn(PETSC_SUCCESS);
167 }
168 
169 static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[])
170 {
171   PetscFunctionBegin;
172   *array = NULL;
173   PetscFunctionReturn(PETSC_SUCCESS);
174 }
175 
176 static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[])
177 {
178   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
179 
180   PetscFunctionBegin;
181   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
182     *array = aijkok->a_dual.view_host().data();
183   } else { /* Ex. happens with MatZeroEntries on a preallocated but not assembled matrix */
184     *array = static_cast<Mat_SeqAIJ *>(A->data)->a;
185   }
186   PetscFunctionReturn(PETSC_SUCCESS);
187 }
188 
189 static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[])
190 {
191   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
192 
193   PetscFunctionBegin;
194   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
195     aijkok->a_dual.clear_sync_state();
196     aijkok->a_dual.modify_host();
197   }
198   PetscFunctionReturn(PETSC_SUCCESS);
199 }
200 
201 static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJKokkos(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype)
202 {
203   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
204 
205   PetscFunctionBegin;
206   PetscCheck(aijkok != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "aijkok is NULL");
207 
208   if (i) *i = aijkok->i_device_data();
209   if (j) *j = aijkok->j_device_data();
210   if (a) {
211     aijkok->a_dual.sync_device();
212     *a = aijkok->a_device_data();
213   }
214   if (mtype) *mtype = PETSC_MEMTYPE_KOKKOS;
215   PetscFunctionReturn(PETSC_SUCCESS);
216 }
217 
218 /*
219   Generate the sparsity pattern of a MatSeqAIJKokkos matrix's transpose on device.
220 
221   Input Parameter:
222 .  A       - the MATSEQAIJKOKKOS matrix
223 
224   Output Parameters:
225 +  perm_d - the permutation array on device, which connects Ta(i) = Aa(perm(i))
226 -  T_d    - the transpose on device, whose value array is allocated but not initialized
227 */
228 static PetscErrorCode MatSeqAIJKokkosGenerateTransposeStructure(Mat A, MatRowMapKokkosView &perm_d, KokkosCsrMatrix &T_d)
229 {
230   Mat_SeqAIJ             *aseq = static_cast<Mat_SeqAIJ *>(A->data);
231   PetscInt                nz = aseq->nz, m = A->rmap->N, n = A->cmap->n;
232   const PetscInt         *Ai = aseq->i, *Aj = aseq->j;
233   MatRowMapKokkosViewHost Ti_h(NoInit("Ti"), n + 1);
234   MatRowMapType          *Ti = Ti_h.data();
235   MatColIdxKokkosViewHost Tj_h(NoInit("Tj"), nz);
236   MatRowMapKokkosViewHost perm_h(NoInit("permutation"), nz);
237   PetscInt               *Tj   = Tj_h.data();
238   PetscInt               *perm = perm_h.data();
239   PetscInt               *offset;
240 
241   PetscFunctionBegin;
242   // Populate Ti
243   PetscCallCXX(Kokkos::deep_copy(Ti_h, 0));
244   Ti++;
245   for (PetscInt i = 0; i < nz; i++) Ti[Aj[i]]++;
246   Ti--;
247   for (PetscInt i = 0; i < n; i++) Ti[i + 1] += Ti[i];
248 
249   // Populate Tj and the permutation array
250   PetscCall(PetscCalloc1(n, &offset)); // offset in each T row to fill in its column indices
251   for (PetscInt i = 0; i < m; i++) {
252     for (PetscInt j = Ai[i]; j < Ai[i + 1]; j++) { // A's (i,j) is T's (j,i)
253       PetscInt r    = Aj[j];                       // row r of T
254       PetscInt disp = Ti[r] + offset[r];
255 
256       Tj[disp]   = i; // col i of T
257       perm[disp] = j;
258       offset[r]++;
259     }
260   }
261   PetscCall(PetscFree(offset));
262 
263   // Sort each row of T, along with the permutation array
264   for (PetscInt i = 0; i < n; i++) PetscCall(PetscSortIntWithArray(Ti[i + 1] - Ti[i], Tj + Ti[i], perm + Ti[i]));
265 
266   // Output perm and T on device
267   auto Ti_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), Ti_h);
268   auto Tj_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), Tj_h);
269   PetscCallCXX(T_d = KokkosCsrMatrix("csrmatT", n, m, nz, MatScalarKokkosView("Ta", nz), Ti_d, Tj_d));
270   PetscCallCXX(perm_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), perm_h));
271   PetscFunctionReturn(PETSC_SUCCESS);
272 }
273 
274 // Generate the transpose on device and cache it internally
275 // Note: KK transpose_matrix() does not have support symbolic/numeric transpose, so we do it on our own
276 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGenerateTranspose_Private(Mat A, KokkosCsrMatrix *csrmatT)
277 {
278   Mat_SeqAIJ       *aseq = static_cast<Mat_SeqAIJ *>(A->data);
279   Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
280   PetscInt          nz = aseq->nz, m = A->rmap->N, n = A->cmap->n;
281   KokkosCsrMatrix  &T = akok->csrmatT;
282 
283   PetscFunctionBegin;
284   PetscCheck(akok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
285   PetscCallCXX(akok->a_dual.sync_device()); // Sync A's values since we are going to access them on device
286 
287   const auto &Aa = akok->a_dual.view_device();
288 
289   if (A->symmetric == PETSC_BOOL3_TRUE) {
290     *csrmatT = akok->csrmat;
291   } else {
292     // See if we already have a cached transpose and its value is up to date
293     if (T.numRows() == n && T.numCols() == m) {  // this indicates csrmatT had been generated before, otherwise T has 0 rows/cols after construction
294       if (!akok->transpose_updated) {            // if the value is out of date, update the cached version
295         const auto &perm = akok->transpose_perm; // get the permutation array
296         auto       &Ta   = T.values;
297 
298         PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { Ta(i) = Aa(perm(i)); }));
299       }
300     } else { // Generate T of size n x m for the first time
301       MatRowMapKokkosView perm;
302 
303       PetscCall(MatSeqAIJKokkosGenerateTransposeStructure(A, perm, T));
304       akok->transpose_perm = perm; // cache the perm in this matrix for reuse
305       PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { T.values(i) = Aa(perm(i)); }));
306     }
307     akok->transpose_updated = PETSC_TRUE;
308     *csrmatT                = akok->csrmatT;
309   }
310   PetscFunctionReturn(PETSC_SUCCESS);
311 }
312 
313 // Generate the Hermitian on device and cache it internally
314 static PetscErrorCode MatSeqAIJKokkosGenerateHermitian_Private(Mat A, KokkosCsrMatrix *csrmatH)
315 {
316   Mat_SeqAIJ       *aseq = static_cast<Mat_SeqAIJ *>(A->data);
317   Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
318   PetscInt          nz = aseq->nz, m = A->rmap->N, n = A->cmap->n;
319   KokkosCsrMatrix  &T = akok->csrmatH;
320 
321   PetscFunctionBegin;
322   PetscCheck(akok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
323   PetscCallCXX(akok->a_dual.sync_device()); // Sync A's values since we are going to access them on device
324 
325   const auto &Aa = akok->a_dual.view_device();
326 
327   if (A->hermitian == PETSC_BOOL3_TRUE) {
328     *csrmatH = akok->csrmat;
329   } else {
330     // See if we already have a cached hermitian and its value is up to date
331     if (T.numRows() == n && T.numCols() == m) {  // this indicates csrmatT had been generated before, otherwise T has 0 rows/cols after construction
332       if (!akok->hermitian_updated) {            // if the value is out of date, update the cached version
333         const auto &perm = akok->transpose_perm; // get the permutation array
334         auto       &Ta   = T.values;
335 
336         PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { Ta(i) = PetscConj(Aa(perm(i))); }));
337       }
338     } else { // Generate T of size n x m for the first time
339       MatRowMapKokkosView perm;
340 
341       PetscCall(MatSeqAIJKokkosGenerateTransposeStructure(A, perm, T));
342       akok->transpose_perm = perm; // cache the perm in this matrix for reuse
343       PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { T.values(i) = PetscConj(Aa(perm(i))); }));
344     }
345     akok->hermitian_updated = PETSC_TRUE;
346     *csrmatH                = akok->csrmatH;
347   }
348   PetscFunctionReturn(PETSC_SUCCESS);
349 }
350 
351 /* y = A x */
352 static PetscErrorCode MatMult_SeqAIJKokkos(Mat A, Vec xx, Vec yy)
353 {
354   Mat_SeqAIJKokkos          *aijkok;
355   ConstPetscScalarKokkosView xv;
356   PetscScalarKokkosView      yv;
357 
358   PetscFunctionBegin;
359   PetscCall(PetscLogGpuTimeBegin());
360   PetscCall(MatSeqAIJKokkosSyncDevice(A));
361   PetscCall(VecGetKokkosView(xx, &xv));
362   PetscCall(VecGetKokkosViewWrite(yy, &yv));
363   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
364   PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), "N", 1.0 /*alpha*/, aijkok->csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A x + beta y */
365   PetscCall(VecRestoreKokkosView(xx, &xv));
366   PetscCall(VecRestoreKokkosViewWrite(yy, &yv));
367   /* 2.0*nnz - numRows seems more accurate here but assumes there are no zero-rows. So a little sloppy here. */
368   PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz()));
369   PetscCall(PetscLogGpuTimeEnd());
370   PetscFunctionReturn(PETSC_SUCCESS);
371 }
372 
373 /* y = A^T x */
374 static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy)
375 {
376   Mat_SeqAIJKokkos          *aijkok;
377   const char                *mode;
378   ConstPetscScalarKokkosView xv;
379   PetscScalarKokkosView      yv;
380   KokkosCsrMatrix            csrmat;
381 
382   PetscFunctionBegin;
383   PetscCall(PetscLogGpuTimeBegin());
384   PetscCall(MatSeqAIJKokkosSyncDevice(A));
385   PetscCall(VecGetKokkosView(xx, &xv));
386   PetscCall(VecGetKokkosViewWrite(yy, &yv));
387   if (A->form_explicit_transpose) {
388     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat));
389     mode = "N";
390   } else {
391     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
392     csrmat = aijkok->csrmat;
393     mode   = "T";
394   }
395   PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^T x + beta y */
396   PetscCall(VecRestoreKokkosView(xx, &xv));
397   PetscCall(VecRestoreKokkosViewWrite(yy, &yv));
398   PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
399   PetscCall(PetscLogGpuTimeEnd());
400   PetscFunctionReturn(PETSC_SUCCESS);
401 }
402 
403 /* y = A^H x */
404 static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy)
405 {
406   Mat_SeqAIJKokkos          *aijkok;
407   const char                *mode;
408   ConstPetscScalarKokkosView xv;
409   PetscScalarKokkosView      yv;
410   KokkosCsrMatrix            csrmat;
411 
412   PetscFunctionBegin;
413   PetscCall(PetscLogGpuTimeBegin());
414   PetscCall(MatSeqAIJKokkosSyncDevice(A));
415   PetscCall(VecGetKokkosView(xx, &xv));
416   PetscCall(VecGetKokkosViewWrite(yy, &yv));
417   if (A->form_explicit_transpose) {
418     PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat));
419     mode = "N";
420   } else {
421     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
422     csrmat = aijkok->csrmat;
423     mode   = "C";
424   }
425   PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^H x + beta y */
426   PetscCall(VecRestoreKokkosView(xx, &xv));
427   PetscCall(VecRestoreKokkosViewWrite(yy, &yv));
428   PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
429   PetscCall(PetscLogGpuTimeEnd());
430   PetscFunctionReturn(PETSC_SUCCESS);
431 }
432 
433 /* z = A x + y */
434 static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz)
435 {
436   Mat_SeqAIJKokkos          *aijkok;
437   ConstPetscScalarKokkosView xv;
438   PetscScalarKokkosView      zv;
439 
440   PetscFunctionBegin;
441   PetscCall(PetscLogGpuTimeBegin());
442   PetscCall(MatSeqAIJKokkosSyncDevice(A));
443   if (zz != yy) PetscCall(VecCopy(yy, zz)); // depending on yy's sync flags, zz might get its latest data on host
444   PetscCall(VecGetKokkosView(xx, &xv));
445   PetscCall(VecGetKokkosView(zz, &zv)); // do after VecCopy(yy, zz) to get the latest data on device
446   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
447   PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), "N", 1.0 /*alpha*/, aijkok->csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A x + beta z */
448   PetscCall(VecRestoreKokkosView(xx, &xv));
449   PetscCall(VecRestoreKokkosView(zz, &zv));
450   PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz()));
451   PetscCall(PetscLogGpuTimeEnd());
452   PetscFunctionReturn(PETSC_SUCCESS);
453 }
454 
455 /* z = A^T x + y */
456 static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz)
457 {
458   Mat_SeqAIJKokkos          *aijkok;
459   const char                *mode;
460   ConstPetscScalarKokkosView xv;
461   PetscScalarKokkosView      zv;
462   KokkosCsrMatrix            csrmat;
463 
464   PetscFunctionBegin;
465   PetscCall(PetscLogGpuTimeBegin());
466   PetscCall(MatSeqAIJKokkosSyncDevice(A));
467   if (zz != yy) PetscCall(VecCopy(yy, zz));
468   PetscCall(VecGetKokkosView(xx, &xv));
469   PetscCall(VecGetKokkosView(zz, &zv));
470   if (A->form_explicit_transpose) {
471     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat));
472     mode = "N";
473   } else {
474     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
475     csrmat = aijkok->csrmat;
476     mode   = "T";
477   }
478   PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^T x + beta z */
479   PetscCall(VecRestoreKokkosView(xx, &xv));
480   PetscCall(VecRestoreKokkosView(zz, &zv));
481   PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
482   PetscCall(PetscLogGpuTimeEnd());
483   PetscFunctionReturn(PETSC_SUCCESS);
484 }
485 
486 /* z = A^H x + y */
487 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz)
488 {
489   Mat_SeqAIJKokkos          *aijkok;
490   const char                *mode;
491   ConstPetscScalarKokkosView xv;
492   PetscScalarKokkosView      zv;
493   KokkosCsrMatrix            csrmat;
494 
495   PetscFunctionBegin;
496   PetscCall(PetscLogGpuTimeBegin());
497   PetscCall(MatSeqAIJKokkosSyncDevice(A));
498   if (zz != yy) PetscCall(VecCopy(yy, zz));
499   PetscCall(VecGetKokkosView(xx, &xv));
500   PetscCall(VecGetKokkosView(zz, &zv));
501   if (A->form_explicit_transpose) {
502     PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat));
503     mode = "N";
504   } else {
505     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
506     csrmat = aijkok->csrmat;
507     mode   = "C";
508   }
509   PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^H x + beta z */
510   PetscCall(VecRestoreKokkosView(xx, &xv));
511   PetscCall(VecRestoreKokkosView(zz, &zv));
512   PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
513   PetscCall(PetscLogGpuTimeEnd());
514   PetscFunctionReturn(PETSC_SUCCESS);
515 }
516 
517 static PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A, MatOption op, PetscBool flg)
518 {
519   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
520 
521   PetscFunctionBegin;
522   switch (op) {
523   case MAT_FORM_EXPLICIT_TRANSPOSE:
524     /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
525     if (A->form_explicit_transpose && !flg && aijkok) PetscCall(aijkok->DestroyMatTranspose());
526     A->form_explicit_transpose = flg;
527     break;
528   default:
529     PetscCall(MatSetOption_SeqAIJ(A, op, flg));
530     break;
531   }
532   PetscFunctionReturn(PETSC_SUCCESS);
533 }
534 
535 /* Depending on reuse, either build a new mat, or use the existing mat */
536 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat *newmat)
537 {
538   Mat_SeqAIJ *aseq;
539 
540   PetscFunctionBegin;
541   PetscCall(PetscKokkosInitializeCheck());
542   if (reuse == MAT_INITIAL_MATRIX) {                      /* Build a brand new mat */
543     PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat));  /* the returned newmat is a SeqAIJKokkos */
544   } else if (reuse == MAT_REUSE_MATRIX) {                 /* Reuse the mat created before */
545     PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN)); /* newmat is already a SeqAIJKokkos */
546   } else if (reuse == MAT_INPLACE_MATRIX) {               /* newmat is A */
547     PetscCheck(A == *newmat, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "A != *newmat with MAT_INPLACE_MATRIX");
548     PetscCall(PetscFree(A->defaultvectype));
549     PetscCall(PetscStrallocpy(VECKOKKOS, &A->defaultvectype)); /* Allocate and copy the string */
550     PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATSEQAIJKOKKOS));
551     PetscCall(MatSetOps_SeqAIJKokkos(A));
552     aseq = static_cast<Mat_SeqAIJ *>(A->data);
553     if (A->assembled) { /* Copy i, j (but not values) to device for an assembled matrix if not yet */
554       PetscCheck(!A->spptr, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Expect NULL (Mat_SeqAIJKokkos*)A->spptr");
555       A->spptr = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aseq, A->nonzerostate, PETSC_FALSE);
556     }
557   }
558   PetscFunctionReturn(PETSC_SUCCESS);
559 }
560 
561 /* MatDuplicate always creates a new matrix. MatDuplicate can be called either on an assembled matrix or
562    an unassembled matrix, even though MAT_COPY_VALUES is not allowed for unassembled matrix.
563  */
564 static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A, MatDuplicateOption dupOption, Mat *B)
565 {
566   Mat_SeqAIJ       *bseq;
567   Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr), *bkok;
568   Mat               mat;
569 
570   PetscFunctionBegin;
571   /* Do not copy values on host as A's latest values might be on device. We don't want to do sync blindly */
572   PetscCall(MatDuplicate_SeqAIJ(A, MAT_DO_NOT_COPY_VALUES, B));
573   mat = *B;
574   if (A->assembled) {
575     bseq = static_cast<Mat_SeqAIJ *>(mat->data);
576     bkok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, bseq, mat->nonzerostate, PETSC_FALSE);
577     bkok->a_dual.clear_sync_state(); /* Clear B's sync state as it will be decided below */
578     /* Now copy values to B if needed */
579     if (dupOption == MAT_COPY_VALUES) {
580       if (akok->a_dual.need_sync_device()) {
581         Kokkos::deep_copy(bkok->a_dual.view_host(), akok->a_dual.view_host());
582         bkok->a_dual.modify_host();
583       } else { /* If device has the latest data, we only copy data on device */
584         Kokkos::deep_copy(bkok->a_dual.view_device(), akok->a_dual.view_device());
585         bkok->a_dual.modify_device();
586       }
587     } else { /* MAT_DO_NOT_COPY_VALUES or MAT_SHARE_NONZERO_PATTERN. B's values should be zeroed */
588       /* B's values on host should be already zeroed by MatDuplicate_SeqAIJ() */
589       bkok->a_dual.modify_host();
590     }
591     mat->spptr = bkok;
592   }
593 
594   PetscCall(PetscFree(mat->defaultvectype));
595   PetscCall(PetscStrallocpy(VECKOKKOS, &mat->defaultvectype)); /* Allocate and copy the string */
596   PetscCall(PetscObjectChangeTypeName((PetscObject)mat, MATSEQAIJKOKKOS));
597   PetscCall(MatSetOps_SeqAIJKokkos(mat));
598   PetscFunctionReturn(PETSC_SUCCESS);
599 }
600 
601 static PetscErrorCode MatTranspose_SeqAIJKokkos(Mat A, MatReuse reuse, Mat *B)
602 {
603   Mat               At;
604   KokkosCsrMatrix   internT;
605   Mat_SeqAIJKokkos *atkok, *bkok;
606 
607   PetscFunctionBegin;
608   if (reuse == MAT_REUSE_MATRIX) PetscCall(MatTransposeCheckNonzeroState_Private(A, *B));
609   PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &internT)); /* Generate a transpose internally */
610   if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_INPLACE_MATRIX) {
611     /* Deep copy internT, as we want to isolate the internal transpose */
612     PetscCallCXX(atkok = new Mat_SeqAIJKokkos(KokkosCsrMatrix("csrmat", internT)));
613     PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PetscObjectComm((PetscObject)A), atkok, &At));
614     if (reuse == MAT_INITIAL_MATRIX) *B = At;
615     else PetscCall(MatHeaderReplace(A, &At)); /* Replace A with At inplace */
616   } else {                                    /* MAT_REUSE_MATRIX, just need to copy values to B on device */
617     if ((*B)->assembled) {
618       bkok = static_cast<Mat_SeqAIJKokkos *>((*B)->spptr);
619       PetscCallCXX(Kokkos::deep_copy(bkok->a_dual.view_device(), internT.values));
620       PetscCall(MatSeqAIJKokkosModifyDevice(*B));
621     } else if ((*B)->preallocated) { /* It is ok for B to be only preallocated, as needed in MatTranspose_MPIAIJ */
622       Mat_SeqAIJ             *bseq = static_cast<Mat_SeqAIJ *>((*B)->data);
623       MatScalarKokkosViewHost a_h(bseq->a, internT.nnz()); /* bseq->nz = 0 if unassembled */
624       MatColIdxKokkosViewHost j_h(bseq->j, internT.nnz());
625       PetscCallCXX(Kokkos::deep_copy(a_h, internT.values));
626       PetscCallCXX(Kokkos::deep_copy(j_h, internT.graph.entries));
627     } else SETERRQ(PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "B must be assembled or preallocated");
628   }
629   PetscFunctionReturn(PETSC_SUCCESS);
630 }
631 
632 static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A)
633 {
634   Mat_SeqAIJKokkos *aijkok;
635 
636   PetscFunctionBegin;
637   if (A->factortype == MAT_FACTOR_NONE) {
638     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
639     delete aijkok;
640   } else {
641     delete static_cast<Mat_SeqAIJKokkosTriFactors *>(A->spptr);
642   }
643   A->spptr = NULL;
644   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatFactorGetSolverType_C", NULL));
645   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
646   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
647 #if defined(PETSC_HAVE_HYPRE)
648   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijkokkos_hypre_C", NULL));
649 #endif
650   PetscCall(MatDestroy_SeqAIJ(A));
651   PetscFunctionReturn(PETSC_SUCCESS);
652 }
653 
654 /*MC
655    MATSEQAIJKOKKOS - MATAIJKOKKOS = "(seq)aijkokkos" - A matrix type to be used for sparse matrices with Kokkos
656 
657    A matrix type using Kokkos-Kernels CrsMatrix type for portability across different device types
658 
659    Options Database Key:
660 .  -mat_type aijkokkos - sets the matrix type to `MATSEQAIJKOKKOS` during a call to `MatSetFromOptions()`
661 
662   Level: beginner
663 
664 .seealso: [](ch_matrices), `Mat`, `MatCreateSeqAIJKokkos()`, `MATMPIAIJKOKKOS`
665 M*/
666 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A)
667 {
668   PetscFunctionBegin;
669   PetscCall(PetscKokkosInitializeCheck());
670   PetscCall(MatCreate_SeqAIJ(A));
671   PetscCall(MatConvert_SeqAIJ_SeqAIJKokkos(A, MATSEQAIJKOKKOS, MAT_INPLACE_MATRIX, &A));
672   PetscFunctionReturn(PETSC_SUCCESS);
673 }
674 
675 /* Merge A, B into a matrix C. A is put before B. C's size would be A->rmap->n by (A->cmap->n + B->cmap->n) */
676 PetscErrorCode MatSeqAIJKokkosMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C)
677 {
678   Mat_SeqAIJ         *a, *b;
679   Mat_SeqAIJKokkos   *akok, *bkok, *ckok;
680   MatScalarKokkosView aa, ba, ca;
681   MatRowMapKokkosView ai, bi, ci;
682   MatColIdxKokkosView aj, bj, cj;
683   PetscInt            m, n, nnz, aN;
684 
685   PetscFunctionBegin;
686   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
687   PetscValidHeaderSpecific(B, MAT_CLASSID, 2);
688   PetscAssertPointer(C, 4);
689   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
690   PetscCheckTypeName(B, MATSEQAIJKOKKOS);
691   PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n);
692   PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported");
693 
694   PetscCall(MatSeqAIJKokkosSyncDevice(A));
695   PetscCall(MatSeqAIJKokkosSyncDevice(B));
696   a    = static_cast<Mat_SeqAIJ *>(A->data);
697   b    = static_cast<Mat_SeqAIJ *>(B->data);
698   akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
699   bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
700   aa   = akok->a_dual.view_device();
701   ai   = akok->i_dual.view_device();
702   ba   = bkok->a_dual.view_device();
703   bi   = bkok->i_dual.view_device();
704   m    = A->rmap->n; /* M, N and nnz of C */
705   n    = A->cmap->n + B->cmap->n;
706   nnz  = a->nz + b->nz;
707   aN   = A->cmap->n; /* N of A */
708   if (reuse == MAT_INITIAL_MATRIX) {
709     aj           = akok->j_dual.view_device();
710     bj           = bkok->j_dual.view_device();
711     auto ca_dual = MatScalarKokkosDualView("a", aa.extent(0) + ba.extent(0));
712     auto ci_dual = MatRowMapKokkosDualView("i", ai.extent(0));
713     auto cj_dual = MatColIdxKokkosDualView("j", aj.extent(0) + bj.extent(0));
714     ca           = ca_dual.view_device();
715     ci           = ci_dual.view_device();
716     cj           = cj_dual.view_device();
717 
718     /* Concatenate A and B in parallel using Kokkos hierarchical parallelism */
719     Kokkos::parallel_for(
720       Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
721         PetscInt i       = t.league_rank(); /* row i */
722         PetscInt coffset = ai(i) + bi(i), alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i);
723 
724         Kokkos::single(Kokkos::PerTeam(t), [=]() { /* this side effect only happens once per whole team */
725                                                    ci(i) = coffset;
726                                                    if (i == m - 1) ci(m) = ai(m) + bi(m);
727         });
728 
729         Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) {
730           if (k < alen) {
731             ca(coffset + k) = aa(ai(i) + k);
732             cj(coffset + k) = aj(ai(i) + k);
733           } else {
734             ca(coffset + k) = ba(bi(i) + k - alen);
735             cj(coffset + k) = bj(bi(i) + k - alen) + aN; /* Entries in B get new column indices in C */
736           }
737         });
738       });
739     ca_dual.modify_device();
740     ci_dual.modify_device();
741     cj_dual.modify_device();
742     PetscCallCXX(ckok = new Mat_SeqAIJKokkos(m, n, nnz, ci_dual, cj_dual, ca_dual));
743     PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, ckok, C));
744   } else if (reuse == MAT_REUSE_MATRIX) {
745     PetscValidHeaderSpecific(*C, MAT_CLASSID, 4);
746     PetscCheckTypeName(*C, MATSEQAIJKOKKOS);
747     ckok = static_cast<Mat_SeqAIJKokkos *>((*C)->spptr);
748     ca   = ckok->a_dual.view_device();
749     ci   = ckok->i_dual.view_device();
750 
751     Kokkos::parallel_for(
752       Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
753         PetscInt i    = t.league_rank(); /* row i */
754         PetscInt alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i);
755         Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) {
756           if (k < alen) ca(ci(i) + k) = aa(ai(i) + k);
757           else ca(ci(i) + k) = ba(bi(i) + k - alen);
758         });
759       });
760     PetscCall(MatSeqAIJKokkosModifyDevice(*C));
761   }
762   PetscFunctionReturn(PETSC_SUCCESS);
763 }
764 
765 static PetscErrorCode MatProductDataDestroy_SeqAIJKokkos(void *pdata)
766 {
767   PetscFunctionBegin;
768   delete static_cast<MatProductData_SeqAIJKokkos *>(pdata);
769   PetscFunctionReturn(PETSC_SUCCESS);
770 }
771 
772 static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C)
773 {
774   Mat_Product                 *product = C->product;
775   Mat                          A, B;
776   bool                         transA, transB; /* use bool, since KK needs this type */
777   Mat_SeqAIJKokkos            *akok, *bkok, *ckok;
778   Mat_SeqAIJ                  *c;
779   MatProductData_SeqAIJKokkos *pdata;
780   KokkosCsrMatrix              csrmatA, csrmatB;
781 
782   PetscFunctionBegin;
783   MatCheckProduct(C, 1);
784   PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Product data empty");
785   pdata = static_cast<MatProductData_SeqAIJKokkos *>(C->product->data);
786 
787   // See if numeric has already been done in symbolic (e.g., user calls MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C)).
788   // If yes, skip the numeric, but reset the flag so that next time when user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C),
789   // we still do numeric.
790   if (pdata->reusesym) { // numeric reuses results from symbolic
791     pdata->reusesym = PETSC_FALSE;
792     PetscFunctionReturn(PETSC_SUCCESS);
793   }
794 
795   switch (product->type) {
796   case MATPRODUCT_AB:
797     transA = false;
798     transB = false;
799     break;
800   case MATPRODUCT_AtB:
801     transA = true;
802     transB = false;
803     break;
804   case MATPRODUCT_ABt:
805     transA = false;
806     transB = true;
807     break;
808   default:
809     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]);
810   }
811 
812   A = product->A;
813   B = product->B;
814   PetscCall(MatSeqAIJKokkosSyncDevice(A));
815   PetscCall(MatSeqAIJKokkosSyncDevice(B));
816   akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
817   bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
818   ckok = static_cast<Mat_SeqAIJKokkos *>(C->spptr);
819 
820   PetscCheck(ckok, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Device data structure spptr is empty");
821 
822   csrmatA = akok->csrmat;
823   csrmatB = bkok->csrmat;
824 
825   /* TODO: Once KK spgemm implements transpose, we can get rid of the explicit transpose here */
826   if (transA) {
827     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA));
828     transA = false;
829   }
830 
831   if (transB) {
832     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB));
833     transB = false;
834   }
835   PetscCall(PetscLogGpuTimeBegin());
836   PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, csrmatA, transA, csrmatB, transB, ckok->csrmat));
837 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(4, 0, 0)
838   auto spgemmHandle = pdata->kh.get_spgemm_handle();
839   if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(ckok->csrmat)); /* without sort, mat_tests-ex62_14_seqaijkokkos fails */
840 #endif
841 
842   PetscCall(PetscLogGpuTimeEnd());
843   PetscCall(MatSeqAIJKokkosModifyDevice(C));
844   /* shorter version of MatAssemblyEnd_SeqAIJ */
845   c = (Mat_SeqAIJ *)C->data;
846   PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz));
847   PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n"));
848   PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax));
849   c->reallocs         = 0;
850   C->info.mallocs     = 0;
851   C->info.nz_unneeded = 0;
852   C->assembled = C->was_assembled = PETSC_TRUE;
853   C->num_ass++;
854   PetscFunctionReturn(PETSC_SUCCESS);
855 }
856 
857 static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C)
858 {
859   Mat_Product                 *product = C->product;
860   MatProductType               ptype;
861   Mat                          A, B;
862   bool                         transA, transB;
863   Mat_SeqAIJKokkos            *akok, *bkok, *ckok;
864   MatProductData_SeqAIJKokkos *pdata;
865   MPI_Comm                     comm;
866   KokkosCsrMatrix              csrmatA, csrmatB, csrmatC;
867 
868   PetscFunctionBegin;
869   MatCheckProduct(C, 1);
870   PetscCall(PetscObjectGetComm((PetscObject)C, &comm));
871   PetscCheck(!product->data, comm, PETSC_ERR_PLIB, "Product data not empty");
872   A = product->A;
873   B = product->B;
874   PetscCall(MatSeqAIJKokkosSyncDevice(A));
875   PetscCall(MatSeqAIJKokkosSyncDevice(B));
876   akok    = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
877   bkok    = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
878   csrmatA = akok->csrmat;
879   csrmatB = bkok->csrmat;
880 
881   ptype = product->type;
882   // Take advantage of the symmetry if true
883   if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
884     ptype                                          = MATPRODUCT_AB;
885     product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE;
886   }
887   if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
888     ptype                                          = MATPRODUCT_AB;
889     product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE;
890   }
891 
892   switch (ptype) {
893   case MATPRODUCT_AB:
894     transA = false;
895     transB = false;
896     PetscCall(MatSetBlockSizesFromMats(C, A, B));
897     break;
898   case MATPRODUCT_AtB:
899     transA = true;
900     transB = false;
901     if (A->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, A->cmap->bs));
902     if (B->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->cmap->bs));
903     break;
904   case MATPRODUCT_ABt:
905     transA = false;
906     transB = true;
907     if (A->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, A->rmap->bs));
908     if (B->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->rmap->bs));
909     break;
910   default:
911     SETERRQ(comm, PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]);
912   }
913   PetscCallCXX(product->data = pdata = new MatProductData_SeqAIJKokkos());
914   pdata->reusesym = product->api_user;
915 
916   /* TODO: add command line options to select spgemm algorithms */
917   auto spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_DEFAULT; /* default alg is TPL if enabled, otherwise KK */
918 
919   /* CUDA-10.2's spgemm has bugs. We prefer the SpGEMMreuse APIs introduced in cuda-11.4 */
920 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
921   #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
922   spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_KK;
923   #endif
924 #endif
925   PetscCallCXX(pdata->kh.create_spgemm_handle(spgemm_alg));
926 
927   PetscCall(PetscLogGpuTimeBegin());
928   /* TODO: Get rid of the explicit transpose once KK-spgemm implements the transpose option */
929   if (transA) {
930     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA));
931     transA = false;
932   }
933 
934   if (transB) {
935     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB));
936     transB = false;
937   }
938 
939   PetscCallCXX(KokkosSparse::spgemm_symbolic(pdata->kh, csrmatA, transA, csrmatB, transB, csrmatC));
940   /* spgemm_symbolic() only populates C's rowmap, but not C's column indices.
941     So we have to do a fake spgemm_numeric() here to get csrmatC.j_d setup, before
942     calling new Mat_SeqAIJKokkos().
943     TODO: Remove the fake spgemm_numeric() after KK fixed this problem.
944   */
945   PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, csrmatA, transA, csrmatB, transB, csrmatC));
946 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(4, 0, 0)
947   /* Query if KK outputs a sorted matrix. If not, we need to sort it */
948   auto spgemmHandle = pdata->kh.get_spgemm_handle();
949   if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(csrmatC)); /* sort_option defaults to -1 in KK!*/
950 #endif
951   PetscCall(PetscLogGpuTimeEnd());
952 
953   PetscCallCXX(ckok = new Mat_SeqAIJKokkos(csrmatC));
954   PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(C, ckok));
955   C->product->destroy = MatProductDataDestroy_SeqAIJKokkos;
956   PetscFunctionReturn(PETSC_SUCCESS);
957 }
958 
959 /* handles sparse matrix matrix ops */
960 static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat)
961 {
962   Mat_Product *product = mat->product;
963   PetscBool    Biskok = PETSC_FALSE, Ciskok = PETSC_TRUE;
964 
965   PetscFunctionBegin;
966   MatCheckProduct(mat, 1);
967   PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJKOKKOS, &Biskok));
968   if (product->type == MATPRODUCT_ABC) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJKOKKOS, &Ciskok));
969   if (Biskok && Ciskok) {
970     switch (product->type) {
971     case MATPRODUCT_AB:
972     case MATPRODUCT_AtB:
973     case MATPRODUCT_ABt:
974       mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos;
975       break;
976     case MATPRODUCT_PtAP:
977     case MATPRODUCT_RARt:
978     case MATPRODUCT_ABC:
979       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
980       break;
981     default:
982       break;
983     }
984   } else { /* fallback for AIJ */
985     PetscCall(MatProductSetFromOptions_SeqAIJ(mat));
986   }
987   PetscFunctionReturn(PETSC_SUCCESS);
988 }
989 
990 static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a)
991 {
992   Mat_SeqAIJKokkos *aijkok;
993 
994   PetscFunctionBegin;
995   PetscCall(PetscLogGpuTimeBegin());
996   PetscCall(MatSeqAIJKokkosSyncDevice(A));
997   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
998   KokkosBlas::scal(PetscGetKokkosExecutionSpace(), aijkok->a_dual.view_device(), a, aijkok->a_dual.view_device());
999   PetscCall(MatSeqAIJKokkosModifyDevice(A));
1000   PetscCall(PetscLogGpuFlops(aijkok->a_dual.extent(0)));
1001   PetscCall(PetscLogGpuTimeEnd());
1002   PetscFunctionReturn(PETSC_SUCCESS);
1003 }
1004 
1005 // add a to A's diagonal (if A is square) or main diagonal (if A is rectangular)
1006 static PetscErrorCode MatShift_SeqAIJKokkos(Mat A, PetscScalar a)
1007 {
1008   Mat_SeqAIJ *aijseq = static_cast<Mat_SeqAIJ *>(A->data);
1009 
1010   PetscFunctionBegin;
1011   if (A->assembled && aijseq->diagonaldense) { // no missing diagonals
1012     PetscInt n = PetscMin(A->rmap->n, A->cmap->n);
1013 
1014     PetscCall(PetscLogGpuTimeBegin());
1015     PetscCall(MatSeqAIJKokkosSyncDevice(A));
1016     const auto  aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1017     const auto &Aa     = aijkok->a_dual.view_device();
1018     const auto &Adiag  = aijkok->diag_dual.view_device();
1019     PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) { Aa(Adiag(i)) += a; }));
1020     PetscCall(MatSeqAIJKokkosModifyDevice(A));
1021     PetscCall(PetscLogGpuFlops(n));
1022     PetscCall(PetscLogGpuTimeEnd());
1023   } else { // need reassembly, very slow!
1024     PetscCall(MatShift_Basic(A, a));
1025   }
1026   PetscFunctionReturn(PETSC_SUCCESS);
1027 }
1028 
1029 static PetscErrorCode MatDiagonalSet_SeqAIJKokkos(Mat Y, Vec D, InsertMode is)
1030 {
1031   Mat_SeqAIJ *aijseq = static_cast<Mat_SeqAIJ *>(Y->data);
1032 
1033   PetscFunctionBegin;
1034   if (Y->assembled && aijseq->diagonaldense) { // no missing diagonals
1035     ConstPetscScalarKokkosView dv;
1036     PetscInt                   n, nv;
1037 
1038     PetscCall(PetscLogGpuTimeBegin());
1039     PetscCall(MatSeqAIJKokkosSyncDevice(Y));
1040     PetscCall(VecGetKokkosView(D, &dv));
1041     PetscCall(VecGetLocalSize(D, &nv));
1042     n = PetscMin(Y->rmap->n, Y->cmap->n);
1043     PetscCheck(n == nv, PetscObjectComm((PetscObject)Y), PETSC_ERR_ARG_SIZ, "Matrix size and vector size do not match");
1044 
1045     const auto  aijkok = static_cast<Mat_SeqAIJKokkos *>(Y->spptr);
1046     const auto &Aa     = aijkok->a_dual.view_device();
1047     const auto &Adiag  = aijkok->diag_dual.view_device();
1048     PetscCallCXX(Kokkos::parallel_for(
1049       Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) {
1050         if (is == INSERT_VALUES) Aa(Adiag(i)) = dv(i);
1051         else Aa(Adiag(i)) += dv(i);
1052       }));
1053     PetscCall(VecRestoreKokkosView(D, &dv));
1054     PetscCall(MatSeqAIJKokkosModifyDevice(Y));
1055     PetscCall(PetscLogGpuFlops(n));
1056     PetscCall(PetscLogGpuTimeEnd());
1057   } else { // need reassembly, very slow!
1058     PetscCall(MatDiagonalSet_Default(Y, D, is));
1059   }
1060   PetscFunctionReturn(PETSC_SUCCESS);
1061 }
1062 
1063 static PetscErrorCode MatDiagonalScale_SeqAIJKokkos(Mat A, Vec ll, Vec rr)
1064 {
1065   Mat_SeqAIJ                *aijseq = static_cast<Mat_SeqAIJ *>(A->data);
1066   PetscInt                   m = A->rmap->n, n = A->cmap->n, nz = aijseq->nz;
1067   ConstPetscScalarKokkosView lv, rv;
1068 
1069   PetscFunctionBegin;
1070   PetscCall(PetscLogGpuTimeBegin());
1071   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1072   const auto  aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1073   const auto &Aa     = aijkok->a_dual.view_device();
1074   const auto &Ai     = aijkok->i_dual.view_device();
1075   const auto &Aj     = aijkok->j_dual.view_device();
1076   if (ll) {
1077     PetscCall(VecGetLocalSize(ll, &m));
1078     PetscCheck(m == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling vector wrong length");
1079     PetscCall(VecGetKokkosView(ll, &lv));
1080     PetscCallCXX(Kokkos::parallel_for( // for each row
1081       Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
1082         PetscInt i   = t.league_rank(); // row i
1083         PetscInt len = Ai(i + 1) - Ai(i);
1084         // scale entries on the row
1085         Kokkos::parallel_for(Kokkos::TeamThreadRange(t, len), [&](PetscInt j) { Aa(Ai(i) + j) *= lv(i); });
1086       }));
1087     PetscCall(VecRestoreKokkosView(ll, &lv));
1088     PetscCall(PetscLogGpuFlops(nz));
1089   }
1090   if (rr) {
1091     PetscCall(VecGetLocalSize(rr, &n));
1092     PetscCheck(n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Right scaling vector wrong length");
1093     PetscCall(VecGetKokkosView(rr, &rv));
1094     PetscCallCXX(Kokkos::parallel_for( // for each nonzero
1095       Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt k) { Aa(k) *= rv(Aj(k)); }));
1096     PetscCall(VecRestoreKokkosView(rr, &lv));
1097     PetscCall(PetscLogGpuFlops(nz));
1098   }
1099   PetscCall(MatSeqAIJKokkosModifyDevice(A));
1100   PetscCall(PetscLogGpuTimeEnd());
1101   PetscFunctionReturn(PETSC_SUCCESS);
1102 }
1103 
1104 static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A)
1105 {
1106   Mat_SeqAIJKokkos *aijkok;
1107 
1108   PetscFunctionBegin;
1109   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1110   if (aijkok) { /* Only zero the device if data is already there */
1111     KokkosBlas::fill(PetscGetKokkosExecutionSpace(), aijkok->a_dual.view_device(), 0.0);
1112     PetscCall(MatSeqAIJKokkosModifyDevice(A));
1113   } else { /* Might be preallocated but not assembled */
1114     PetscCall(MatZeroEntries_SeqAIJ(A));
1115   }
1116   PetscFunctionReturn(PETSC_SUCCESS);
1117 }
1118 
1119 static PetscErrorCode MatGetDiagonal_SeqAIJKokkos(Mat A, Vec x)
1120 {
1121   Mat_SeqAIJKokkos     *aijkok;
1122   PetscInt              n;
1123   PetscScalarKokkosView xv;
1124 
1125   PetscFunctionBegin;
1126   PetscCall(VecGetLocalSize(x, &n));
1127   PetscCheck(n == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector");
1128   PetscCheck(A->factortype == MAT_FACTOR_NONE, PETSC_COMM_SELF, PETSC_ERR_SUP, "MatGetDiagonal_SeqAIJKokkos not supported on factored matrices");
1129 
1130   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1131   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1132 
1133   const auto &Aa    = aijkok->a_dual.view_device();
1134   const auto &Ai    = aijkok->i_dual.view_device();
1135   const auto &Adiag = aijkok->diag_dual.view_device();
1136 
1137   PetscCall(VecGetKokkosViewWrite(x, &xv));
1138   Kokkos::parallel_for(
1139     Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) {
1140       if (Adiag(i) < Ai(i + 1)) xv(i) = Aa(Adiag(i));
1141       else xv(i) = 0;
1142     });
1143   PetscCall(VecRestoreKokkosViewWrite(x, &xv));
1144   PetscFunctionReturn(PETSC_SUCCESS);
1145 }
1146 
1147 /* Get a Kokkos View from a mat of type MatSeqAIJKokkos */
1148 PetscErrorCode MatSeqAIJGetKokkosView(Mat A, ConstMatScalarKokkosView *kv)
1149 {
1150   Mat_SeqAIJKokkos *aijkok;
1151 
1152   PetscFunctionBegin;
1153   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1154   PetscAssertPointer(kv, 2);
1155   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1156   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1157   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1158   *kv    = aijkok->a_dual.view_device();
1159   PetscFunctionReturn(PETSC_SUCCESS);
1160 }
1161 
1162 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, ConstMatScalarKokkosView *kv)
1163 {
1164   PetscFunctionBegin;
1165   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1166   PetscAssertPointer(kv, 2);
1167   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1168   PetscFunctionReturn(PETSC_SUCCESS);
1169 }
1170 
1171 PetscErrorCode MatSeqAIJGetKokkosView(Mat A, MatScalarKokkosView *kv)
1172 {
1173   Mat_SeqAIJKokkos *aijkok;
1174 
1175   PetscFunctionBegin;
1176   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1177   PetscAssertPointer(kv, 2);
1178   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1179   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1180   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1181   *kv    = aijkok->a_dual.view_device();
1182   PetscFunctionReturn(PETSC_SUCCESS);
1183 }
1184 
1185 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, MatScalarKokkosView *kv)
1186 {
1187   PetscFunctionBegin;
1188   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1189   PetscAssertPointer(kv, 2);
1190   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1191   PetscCall(MatSeqAIJKokkosModifyDevice(A));
1192   PetscFunctionReturn(PETSC_SUCCESS);
1193 }
1194 
1195 PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A, MatScalarKokkosView *kv)
1196 {
1197   Mat_SeqAIJKokkos *aijkok;
1198 
1199   PetscFunctionBegin;
1200   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1201   PetscAssertPointer(kv, 2);
1202   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1203   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1204   *kv    = aijkok->a_dual.view_device();
1205   PetscFunctionReturn(PETSC_SUCCESS);
1206 }
1207 
1208 PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A, MatScalarKokkosView *kv)
1209 {
1210   PetscFunctionBegin;
1211   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1212   PetscAssertPointer(kv, 2);
1213   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1214   PetscCall(MatSeqAIJKokkosModifyDevice(A));
1215   PetscFunctionReturn(PETSC_SUCCESS);
1216 }
1217 
1218 PetscErrorCode MatCreateSeqAIJKokkosWithKokkosViews(MPI_Comm comm, PetscInt m, PetscInt n, Kokkos::View<PetscInt *> &i_d, Kokkos::View<PetscInt *> &j_d, Kokkos::View<PetscScalar *> &a_d, Mat *A)
1219 {
1220   Mat_SeqAIJKokkos *akok;
1221 
1222   PetscFunctionBegin;
1223   auto exec = PetscGetKokkosExecutionSpace();
1224   // Create host copies of the input aij
1225   auto i_h = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), i_d);
1226   auto j_h = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), j_d);
1227   // Don't copy the vals to the host now
1228   auto a_h = Kokkos::create_mirror_view(HostMirrorMemorySpace(), a_d);
1229 
1230   MatScalarKokkosDualView a_dual = MatScalarKokkosDualView(a_d, a_h);
1231   // Note we have modified device data so it will copy lazily
1232   a_dual.modify_device();
1233   MatRowMapKokkosDualView i_dual = MatRowMapKokkosDualView(i_d, i_h);
1234   MatColIdxKokkosDualView j_dual = MatColIdxKokkosDualView(j_d, j_h);
1235 
1236   PetscCallCXX(akok = new Mat_SeqAIJKokkos(m, n, j_dual.extent(0), i_dual, j_dual, a_dual));
1237   PetscCall(MatCreate(comm, A));
1238   PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok));
1239   PetscFunctionReturn(PETSC_SUCCESS);
1240 }
1241 
1242 /* Computes Y += alpha X */
1243 static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y, PetscScalar alpha, Mat X, MatStructure pattern)
1244 {
1245   Mat_SeqAIJ              *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data;
1246   Mat_SeqAIJKokkos        *xkok, *ykok, *zkok;
1247   ConstMatScalarKokkosView Xa;
1248   MatScalarKokkosView      Ya;
1249   auto                     exec = PetscGetKokkosExecutionSpace();
1250 
1251   PetscFunctionBegin;
1252   PetscCheckTypeName(Y, MATSEQAIJKOKKOS);
1253   PetscCheckTypeName(X, MATSEQAIJKOKKOS);
1254   PetscCall(MatSeqAIJKokkosSyncDevice(Y));
1255   PetscCall(MatSeqAIJKokkosSyncDevice(X));
1256   PetscCall(PetscLogGpuTimeBegin());
1257 
1258   if (pattern != SAME_NONZERO_PATTERN && x->nz == y->nz) {
1259     PetscBool e;
1260     PetscCall(PetscArraycmp(x->i, y->i, Y->rmap->n + 1, &e));
1261     if (e) {
1262       PetscCall(PetscArraycmp(x->j, y->j, y->nz, &e));
1263       if (e) pattern = SAME_NONZERO_PATTERN;
1264     }
1265   }
1266 
1267   /* cusparseDcsrgeam2() computes C = alpha A + beta B. If one knew sparsity pattern of C, one can skip
1268     cusparseScsrgeam2_bufferSizeExt() / cusparseXcsrgeam2Nnz(), and directly call cusparseScsrgeam2().
1269     If X is SUBSET_NONZERO_PATTERN of Y, we could take advantage of this cusparse feature. However,
1270     KokkosSparse::spadd(alpha,A,beta,B,C) has symbolic and numeric phases, MatAXPY does not.
1271   */
1272   ykok = static_cast<Mat_SeqAIJKokkos *>(Y->spptr);
1273   xkok = static_cast<Mat_SeqAIJKokkos *>(X->spptr);
1274   Xa   = xkok->a_dual.view_device();
1275   Ya   = ykok->a_dual.view_device();
1276 
1277   if (pattern == SAME_NONZERO_PATTERN) {
1278     KokkosBlas::axpy(exec, alpha, Xa, Ya);
1279     PetscCall(MatSeqAIJKokkosModifyDevice(Y));
1280   } else if (pattern == SUBSET_NONZERO_PATTERN) {
1281     MatRowMapKokkosView Xi = xkok->i_dual.view_device(), Yi = ykok->i_dual.view_device();
1282     MatColIdxKokkosView Xj = xkok->j_dual.view_device(), Yj = ykok->j_dual.view_device();
1283 
1284     Kokkos::parallel_for(
1285       Kokkos::TeamPolicy<>(exec, Y->rmap->n, 1), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
1286         PetscInt i = t.league_rank(); // row i
1287         Kokkos::single(Kokkos::PerTeam(t), [=]() {
1288           // Only one thread works in a team
1289           PetscInt p, q = Yi(i);
1290           for (p = Xi(i); p < Xi(i + 1); p++) {          // For each nonzero on row i of X,
1291             while (Xj(p) != Yj(q) && q < Yi(i + 1)) q++; // find the matching nonzero on row i of Y.
1292             if (Xj(p) == Yj(q)) {                        // Found it
1293               Ya(q) += alpha * Xa(p);
1294               q++;
1295             } else {
1296             // If not found, it indicates the input is wrong (X is not a SUBSET_NONZERO_PATTERN of Y).
1297             // Just insert a NaN at the beginning of row i if it is not empty, to make the result wrong.
1298 #if PETSC_PKG_KOKKOS_VERSION_GE(3, 7, 0)
1299               if (Yi(i) != Yi(i + 1)) Ya(Yi(i)) = Kokkos::ArithTraits<PetscScalar>::nan();
1300 #else
1301               if (Yi(i) != Yi(i + 1)) Ya(Yi(i)) = Kokkos::Experimental::nan("1");
1302 #endif
1303             }
1304           }
1305         });
1306       });
1307     PetscCall(MatSeqAIJKokkosModifyDevice(Y));
1308   } else { // different nonzero patterns
1309     Mat             Z;
1310     KokkosCsrMatrix zcsr;
1311     KernelHandle    kh;
1312     kh.create_spadd_handle(true); // X, Y are sorted
1313     KokkosSparse::spadd_symbolic(&kh, xkok->csrmat, ykok->csrmat, zcsr);
1314     KokkosSparse::spadd_numeric(&kh, alpha, xkok->csrmat, (PetscScalar)1.0, ykok->csrmat, zcsr);
1315     zkok = new Mat_SeqAIJKokkos(zcsr);
1316     PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, zkok, &Z));
1317     PetscCall(MatHeaderReplace(Y, &Z));
1318     kh.destroy_spadd_handle();
1319   }
1320   PetscCall(PetscLogGpuTimeEnd());
1321   PetscCall(PetscLogGpuFlops(xkok->a_dual.extent(0) * 2)); // Because we scaled X and then added it to Y
1322   PetscFunctionReturn(PETSC_SUCCESS);
1323 }
1324 
1325 struct MatCOOStruct_SeqAIJKokkos {
1326   PetscCount           n;
1327   PetscCount           Atot;
1328   PetscInt             nz;
1329   PetscCountKokkosView jmap;
1330   PetscCountKokkosView perm;
1331 
1332   MatCOOStruct_SeqAIJKokkos(const MatCOOStruct_SeqAIJ *coo_h)
1333   {
1334     nz   = coo_h->nz;
1335     n    = coo_h->n;
1336     Atot = coo_h->Atot;
1337     jmap = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), PetscCountKokkosViewHost(coo_h->jmap, nz + 1));
1338     perm = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), PetscCountKokkosViewHost(coo_h->perm, Atot));
1339   }
1340 };
1341 
1342 static PetscErrorCode MatCOOStructDestroy_SeqAIJKokkos(void **data)
1343 {
1344   PetscFunctionBegin;
1345   PetscCallCXX(delete static_cast<MatCOOStruct_SeqAIJKokkos *>(*data));
1346   PetscFunctionReturn(PETSC_SUCCESS);
1347 }
1348 
1349 static PetscErrorCode MatSetPreallocationCOO_SeqAIJKokkos(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
1350 {
1351   Mat_SeqAIJKokkos          *akok;
1352   Mat_SeqAIJ                *aseq;
1353   PetscContainer             container_h;
1354   MatCOOStruct_SeqAIJ       *coo_h;
1355   MatCOOStruct_SeqAIJKokkos *coo_d;
1356 
1357   PetscFunctionBegin;
1358   PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, coo_i, coo_j));
1359   aseq = static_cast<Mat_SeqAIJ *>(mat->data);
1360   akok = static_cast<Mat_SeqAIJKokkos *>(mat->spptr);
1361   delete akok;
1362   mat->spptr = akok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, aseq, mat->nonzerostate + 1, PETSC_FALSE);
1363   PetscCall(MatZeroEntries_SeqAIJKokkos(mat));
1364 
1365   // Copy the COO struct to device
1366   PetscCall(PetscObjectQuery((PetscObject)mat, "__PETSc_MatCOOStruct_Host", (PetscObject *)&container_h));
1367   PetscCall(PetscContainerGetPointer(container_h, (void **)&coo_h));
1368   PetscCallCXX(coo_d = new MatCOOStruct_SeqAIJKokkos(coo_h));
1369 
1370   // Put the COO struct in a container and then attach that to the matrix
1371   PetscCall(PetscObjectContainerCompose((PetscObject)mat, "__PETSc_MatCOOStruct_Device", coo_d, MatCOOStructDestroy_SeqAIJKokkos));
1372   PetscFunctionReturn(PETSC_SUCCESS);
1373 }
1374 
1375 static PetscErrorCode MatSetValuesCOO_SeqAIJKokkos(Mat A, const PetscScalar v[], InsertMode imode)
1376 {
1377   MatScalarKokkosView        Aa;
1378   ConstMatScalarKokkosView   kv;
1379   PetscMemType               memtype;
1380   PetscContainer             container;
1381   MatCOOStruct_SeqAIJKokkos *coo;
1382 
1383   PetscFunctionBegin;
1384   PetscCall(PetscObjectQuery((PetscObject)A, "__PETSc_MatCOOStruct_Device", (PetscObject *)&container));
1385   PetscCall(PetscContainerGetPointer(container, (void **)&coo));
1386 
1387   const auto &n    = coo->n;
1388   const auto &Annz = coo->nz;
1389   const auto &jmap = coo->jmap;
1390   const auto &perm = coo->perm;
1391 
1392   PetscCall(PetscGetMemType(v, &memtype));
1393   if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */
1394     kv = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), ConstMatScalarKokkosViewHost(v, n));
1395   } else {
1396     kv = ConstMatScalarKokkosView(v, n); /* Directly use v[]'s memory */
1397   }
1398 
1399   if (imode == INSERT_VALUES) PetscCall(MatSeqAIJGetKokkosViewWrite(A, &Aa)); /* write matrix values */
1400   else PetscCall(MatSeqAIJGetKokkosView(A, &Aa));                             /* read & write matrix values */
1401 
1402   PetscCall(PetscLogGpuTimeBegin());
1403   Kokkos::parallel_for(
1404     Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, Annz), KOKKOS_LAMBDA(const PetscCount i) {
1405       PetscScalar sum = 0.0;
1406       for (PetscCount k = jmap(i); k < jmap(i + 1); k++) sum += kv(perm(k));
1407       Aa(i) = (imode == INSERT_VALUES ? 0.0 : Aa(i)) + sum;
1408     });
1409   PetscCall(PetscLogGpuTimeEnd());
1410 
1411   if (imode == INSERT_VALUES) PetscCall(MatSeqAIJRestoreKokkosViewWrite(A, &Aa));
1412   else PetscCall(MatSeqAIJRestoreKokkosView(A, &Aa));
1413   PetscFunctionReturn(PETSC_SUCCESS);
1414 }
1415 
1416 static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info)
1417 {
1418   PetscFunctionBegin;
1419   PetscCall(MatSeqAIJKokkosSyncHost(A));
1420   PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info));
1421   B->offloadmask = PETSC_OFFLOAD_CPU;
1422   PetscFunctionReturn(PETSC_SUCCESS);
1423 }
1424 
1425 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A)
1426 {
1427   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
1428 
1429   PetscFunctionBegin;
1430   A->offloadmask = PETSC_OFFLOAD_KOKKOS; /* We do not really use this flag */
1431   A->boundtocpu  = PETSC_FALSE;
1432 
1433   A->ops->assemblyend               = MatAssemblyEnd_SeqAIJKokkos;
1434   A->ops->destroy                   = MatDestroy_SeqAIJKokkos;
1435   A->ops->duplicate                 = MatDuplicate_SeqAIJKokkos;
1436   A->ops->axpy                      = MatAXPY_SeqAIJKokkos;
1437   A->ops->scale                     = MatScale_SeqAIJKokkos;
1438   A->ops->zeroentries               = MatZeroEntries_SeqAIJKokkos;
1439   A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJKokkos;
1440   A->ops->mult                      = MatMult_SeqAIJKokkos;
1441   A->ops->multadd                   = MatMultAdd_SeqAIJKokkos;
1442   A->ops->multtranspose             = MatMultTranspose_SeqAIJKokkos;
1443   A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJKokkos;
1444   A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJKokkos;
1445   A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos;
1446   A->ops->productnumeric            = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos;
1447   A->ops->transpose                 = MatTranspose_SeqAIJKokkos;
1448   A->ops->setoption                 = MatSetOption_SeqAIJKokkos;
1449   A->ops->getdiagonal               = MatGetDiagonal_SeqAIJKokkos;
1450   A->ops->shift                     = MatShift_SeqAIJKokkos;
1451   A->ops->diagonalset               = MatDiagonalSet_SeqAIJKokkos;
1452   A->ops->diagonalscale             = MatDiagonalScale_SeqAIJKokkos;
1453   a->ops->getarray                  = MatSeqAIJGetArray_SeqAIJKokkos;
1454   a->ops->restorearray              = MatSeqAIJRestoreArray_SeqAIJKokkos;
1455   a->ops->getarrayread              = MatSeqAIJGetArrayRead_SeqAIJKokkos;
1456   a->ops->restorearrayread          = MatSeqAIJRestoreArrayRead_SeqAIJKokkos;
1457   a->ops->getarraywrite             = MatSeqAIJGetArrayWrite_SeqAIJKokkos;
1458   a->ops->restorearraywrite         = MatSeqAIJRestoreArrayWrite_SeqAIJKokkos;
1459   a->ops->getcsrandmemtype          = MatSeqAIJGetCSRAndMemType_SeqAIJKokkos;
1460 
1461   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJKokkos));
1462   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJKokkos));
1463 #if defined(PETSC_HAVE_HYPRE)
1464   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijkokkos_hypre_C", MatConvert_AIJ_HYPRE));
1465 #endif
1466   PetscFunctionReturn(PETSC_SUCCESS);
1467 }
1468 
1469 /*
1470    Extract the (prescribled) diagonal blocks of the matrix and then invert them
1471 
1472   Input Parameters:
1473 +  A       - the MATSEQAIJKOKKOS matrix
1474 .  bs      - block sizes in 'csr' format, i.e., the i-th block has size bs(i+1) - bs(i)
1475 .  bs2     - square of block sizes in 'csr' format, i.e., the i-th block should be stored at offset bs2(i) in diagVal[]
1476 .  blkMap  - map row ids to block ids, i.e., row i belongs to the block blkMap(i)
1477 -  work    - a pre-allocated work buffer (as big as diagVal) for use by this routine
1478 
1479   Output Parameter:
1480 .  diagVal - the (pre-allocated) buffer to store the inverted blocks (each block is stored in column-major order)
1481 */
1482 PETSC_INTERN PetscErrorCode MatInvertVariableBlockDiagonal_SeqAIJKokkos(Mat A, const PetscIntKokkosView &bs, const PetscIntKokkosView &bs2, const PetscIntKokkosView &blkMap, PetscScalarKokkosView &work, PetscScalarKokkosView &diagVal)
1483 {
1484   Mat_SeqAIJKokkos *akok    = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1485   PetscInt          nblocks = bs.extent(0) - 1;
1486 
1487   PetscFunctionBegin;
1488   PetscCall(MatSeqAIJKokkosSyncDevice(A)); // Since we'll access A's value on device
1489 
1490   // Pull out the diagonal blocks of the matrix and then invert the blocks
1491   auto Aa    = akok->a_dual.view_device();
1492   auto Ai    = akok->i_dual.view_device();
1493   auto Aj    = akok->j_dual.view_device();
1494   auto Adiag = akok->diag_dual.view_device();
1495   // TODO: how to tune the team size?
1496 #if defined(KOKKOS_ENABLE_UNIFIED_MEMORY)
1497   auto ts = Kokkos::AUTO();
1498 #else
1499   auto ts = 16; // improved performance 30% over Kokkos::AUTO() with CUDA, but failed with "Kokkos::abort: Requested Team Size is too large!" on CPUs
1500 #endif
1501   PetscCallCXX(Kokkos::parallel_for(
1502     Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), nblocks, ts), KOKKOS_LAMBDA(const KokkosTeamMemberType &teamMember) {
1503       const PetscInt bid    = teamMember.league_rank();                                                   // block id
1504       const PetscInt rstart = bs(bid);                                                                    // this block starts from this row
1505       const PetscInt m      = bs(bid + 1) - bs(bid);                                                      // size of this block
1506       const auto    &B      = Kokkos::View<PetscScalar **, Kokkos::LayoutLeft>(&diagVal(bs2(bid)), m, m); // column-major order
1507       const auto    &W      = PetscScalarKokkosView(&work(bs2(bid)), m * m);
1508 
1509       Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, m), [=](const PetscInt &r) { // r-th row in B
1510         PetscInt i = rstart + r;                                                            // i-th row in A
1511 
1512         if (Ai(i) <= Adiag(i) && Adiag(i) < Ai(i + 1)) { // if the diagonal exists (common case)
1513           PetscInt first = Adiag(i) - r;                 // we start to check nonzeros from here along this row
1514 
1515           for (PetscInt c = 0; c < m; c++) {                   // walk n steps to see what column indices we will meet
1516             if (first + c < Ai(i) || first + c >= Ai(i + 1)) { // this entry (first+c) is out of range of this row, in other words, its value is zero
1517               B(r, c) = 0.0;
1518             } else if (Aj(first + c) == rstart + c) { // this entry is right on the (rstart+c) column
1519               B(r, c) = Aa(first + c);
1520             } else { // this entry does not show up in the CSR
1521               B(r, c) = 0.0;
1522             }
1523           }
1524         } else { // rare case that the diagonal does not exist
1525           const PetscInt begin = Ai(i);
1526           const PetscInt end   = Ai(i + 1);
1527           for (PetscInt c = 0; c < m; c++) B(r, c) = 0.0;
1528           for (PetscInt j = begin; j < end; j++) { // scan the whole row; could use binary search but this is a rare case so we did not.
1529             if (rstart <= Aj(j) && Aj(j) < rstart + m) B(r, Aj(j) - rstart) = Aa(j);
1530             else if (Aj(j) >= rstart + m) break;
1531           }
1532         }
1533       });
1534 
1535       // LU-decompose B (w/o pivoting) and then invert B
1536       KokkosBatched::TeamLU<KokkosTeamMemberType, KokkosBatched::Algo::LU::Unblocked>::invoke(teamMember, B, 0.0);
1537       KokkosBatched::TeamInverseLU<KokkosTeamMemberType, KokkosBatched::Algo::InverseLU::Unblocked>::invoke(teamMember, B, W);
1538     }));
1539   // PetscLogGpuFlops() is done in the caller PCSetUp_VPBJacobi_Kokkos as we don't want to compute the flops in kernels
1540   PetscFunctionReturn(PETSC_SUCCESS);
1541 }
1542 
1543 PETSC_INTERN PetscErrorCode MatSetSeqAIJKokkosWithCSRMatrix(Mat A, Mat_SeqAIJKokkos *akok)
1544 {
1545   Mat_SeqAIJ *aseq;
1546   PetscInt    i, m, n;
1547   auto        exec = PetscGetKokkosExecutionSpace();
1548 
1549   PetscFunctionBegin;
1550   PetscCheck(!A->spptr, PETSC_COMM_SELF, PETSC_ERR_PLIB, "A->spptr is supposed to be empty");
1551 
1552   m = akok->nrows();
1553   n = akok->ncols();
1554   PetscCall(MatSetSizes(A, m, n, m, n));
1555   PetscCall(MatSetType(A, MATSEQAIJKOKKOS));
1556 
1557   /* Set up data structures of A as a MATSEQAIJ */
1558   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(A, MAT_SKIP_ALLOCATION, NULL));
1559   aseq = (Mat_SeqAIJ *)A->data;
1560 
1561   PetscCallCXX(akok->i_dual.sync_host(exec)); /* We always need sync'ed i, j on host */
1562   PetscCallCXX(akok->j_dual.sync_host(exec));
1563   PetscCallCXX(exec.fence());
1564 
1565   aseq->i       = akok->i_host_data();
1566   aseq->j       = akok->j_host_data();
1567   aseq->a       = akok->a_host_data();
1568   aseq->nonew   = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/
1569   aseq->free_a  = PETSC_FALSE;
1570   aseq->free_ij = PETSC_FALSE;
1571   aseq->nz      = akok->nnz();
1572   aseq->maxnz   = aseq->nz;
1573 
1574   PetscCall(PetscMalloc1(m, &aseq->imax));
1575   PetscCall(PetscMalloc1(m, &aseq->ilen));
1576   for (i = 0; i < m; i++) aseq->ilen[i] = aseq->imax[i] = aseq->i[i + 1] - aseq->i[i];
1577 
1578   /* It is critical to set the nonzerostate, as we use it to check if sparsity pattern (hence data) has changed on host in MatAssemblyEnd */
1579   akok->nonzerostate = A->nonzerostate;
1580   A->spptr           = akok; /* Set A->spptr before MatAssembly so that A->spptr won't be allocated again there */
1581   PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY));
1582   PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY));
1583   PetscFunctionReturn(PETSC_SUCCESS);
1584 }
1585 
1586 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGetKokkosCsrMatrix(Mat A, KokkosCsrMatrix *csr)
1587 {
1588   PetscFunctionBegin;
1589   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1590   *csr = static_cast<Mat_SeqAIJKokkos *>(A->spptr)->csrmat;
1591   PetscFunctionReturn(PETSC_SUCCESS);
1592 }
1593 
1594 PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithKokkosCsrMatrix(MPI_Comm comm, KokkosCsrMatrix csr, Mat *A)
1595 {
1596   Mat_SeqAIJKokkos *akok;
1597 
1598   PetscFunctionBegin;
1599   PetscCallCXX(akok = new Mat_SeqAIJKokkos(csr));
1600   PetscCall(MatCreate(comm, A));
1601   PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok));
1602   PetscFunctionReturn(PETSC_SUCCESS);
1603 }
1604 
1605 /* Crete a SEQAIJKOKKOS matrix with a Mat_SeqAIJKokkos data structure
1606 
1607    Note we have names like MatSeqAIJSetPreallocationCSR, so I use capitalized CSR
1608  */
1609 PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithCSRMatrix(MPI_Comm comm, Mat_SeqAIJKokkos *akok, Mat *A)
1610 {
1611   PetscFunctionBegin;
1612   PetscCall(MatCreate(comm, A));
1613   PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok));
1614   PetscFunctionReturn(PETSC_SUCCESS);
1615 }
1616 
1617 /*@C
1618   MatCreateSeqAIJKokkos - Creates a sparse matrix in `MATSEQAIJKOKKOS` (compressed row) format
1619   (the default parallel PETSc format). This matrix will ultimately be handled by
1620   Kokkos for calculations.
1621 
1622   Collective
1623 
1624   Input Parameters:
1625 + comm - MPI communicator, set to `PETSC_COMM_SELF`
1626 . m    - number of rows
1627 . n    - number of columns
1628 . nz   - number of nonzeros per row (same for all rows), ignored if `nnz` is provided
1629 - nnz  - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL`
1630 
1631   Output Parameter:
1632 . A - the matrix
1633 
1634   Level: intermediate
1635 
1636   Notes:
1637   It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
1638   MatXXXXSetPreallocation() paradgm instead of this routine directly.
1639   [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]
1640 
1641   The AIJ format, also called
1642   compressed row storage, is fully compatible with standard Fortran
1643   storage.  That is, the stored row and column indices can begin at
1644   either one (as in Fortran) or zero.
1645 
1646   Specify the preallocated storage with either `nz` or `nnz` (not both).
1647   Set `nz` = `PETSC_DEFAULT` and `nnz` = `NULL` for PETSc to control dynamic memory
1648   allocation.
1649 
1650 .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`
1651 @*/
1652 PetscErrorCode MatCreateSeqAIJKokkos(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A)
1653 {
1654   PetscFunctionBegin;
1655   PetscCall(PetscKokkosInitializeCheck());
1656   PetscCall(MatCreate(comm, A));
1657   PetscCall(MatSetSizes(*A, m, n, m, n));
1658   PetscCall(MatSetType(*A, MATSEQAIJKOKKOS));
1659   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz));
1660   PetscFunctionReturn(PETSC_SUCCESS);
1661 }
1662 
1663 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
1664 {
1665   PetscFunctionBegin;
1666   PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
1667   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos;
1668   PetscFunctionReturn(PETSC_SUCCESS);
1669 }
1670 
1671 static PetscErrorCode MatSeqAIJKokkosSymbolicSolveCheck(Mat A)
1672 {
1673   Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1674 
1675   PetscFunctionBegin;
1676   if (!factors->sptrsv_symbolic_completed) {
1677     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khU, factors->iU_d, factors->jU_d, factors->aU_d);
1678     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khL, factors->iL_d, factors->jL_d, factors->aL_d);
1679     factors->sptrsv_symbolic_completed = PETSC_TRUE;
1680   }
1681   PetscFunctionReturn(PETSC_SUCCESS);
1682 }
1683 
1684 /* Check if we need to update factors etc for transpose solve */
1685 static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A)
1686 {
1687   Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1688   MatColIdxType               n       = A->rmap->n;
1689 
1690   PetscFunctionBegin;
1691   if (!factors->transpose_updated) { /* TODO: KK needs to provide functions to do numeric transpose only */
1692     /* Update L^T and do sptrsv symbolic */
1693     factors->iLt_d = MatRowMapKokkosView("factors->iLt_d", n + 1); // KK requires 0
1694     factors->jLt_d = MatColIdxKokkosView(NoInit("factors->jLt_d"), factors->jL_d.extent(0));
1695     factors->aLt_d = MatScalarKokkosView(NoInit("factors->aLt_d"), factors->aL_d.extent(0));
1696 
1697     transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iL_d, factors->jL_d, factors->aL_d,
1698                                                                                                                                                                                                               factors->iLt_d, factors->jLt_d, factors->aLt_d);
1699 
1700     /* TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices.
1701       We have to sort the indices, until KK provides finer control options.
1702     */
1703     sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iLt_d, factors->jLt_d, factors->aLt_d);
1704 
1705     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d);
1706 
1707     /* Update U^T and do sptrsv symbolic */
1708     factors->iUt_d = MatRowMapKokkosView("factors->iUt_d", n + 1); // KK requires 0
1709     factors->jUt_d = MatColIdxKokkosView(NoInit("factors->jUt_d"), factors->jU_d.extent(0));
1710     factors->aUt_d = MatScalarKokkosView(NoInit("factors->aUt_d"), factors->aU_d.extent(0));
1711 
1712     transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iU_d, factors->jU_d, factors->aU_d,
1713                                                                                                                                                                                                               factors->iUt_d, factors->jUt_d, factors->aUt_d);
1714 
1715     /* Sort indices. See comments above */
1716     sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iUt_d, factors->jUt_d, factors->aUt_d);
1717 
1718     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d);
1719     factors->transpose_updated = PETSC_TRUE;
1720   }
1721   PetscFunctionReturn(PETSC_SUCCESS);
1722 }
1723 
1724 /* Solve Ax = b, with A = LU */
1725 static PetscErrorCode MatSolve_SeqAIJKokkos(Mat A, Vec b, Vec x)
1726 {
1727   ConstPetscScalarKokkosView  bv;
1728   PetscScalarKokkosView       xv;
1729   Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1730 
1731   PetscFunctionBegin;
1732   PetscCall(PetscLogGpuTimeBegin());
1733   PetscCall(MatSeqAIJKokkosSymbolicSolveCheck(A));
1734   PetscCall(VecGetKokkosView(b, &bv));
1735   PetscCall(VecGetKokkosViewWrite(x, &xv));
1736   /* Solve L tmpv = b */
1737   PetscCallCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khL, factors->iL_d, factors->jL_d, factors->aL_d, bv, factors->workVector));
1738   /* Solve Ux = tmpv */
1739   PetscCallCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khU, factors->iU_d, factors->jU_d, factors->aU_d, factors->workVector, xv));
1740   PetscCall(VecRestoreKokkosView(b, &bv));
1741   PetscCall(VecRestoreKokkosViewWrite(x, &xv));
1742   PetscCall(PetscLogGpuTimeEnd());
1743   PetscFunctionReturn(PETSC_SUCCESS);
1744 }
1745 
1746 /* Solve A^T x = b, where A^T = U^T L^T */
1747 static PetscErrorCode MatSolveTranspose_SeqAIJKokkos(Mat A, Vec b, Vec x)
1748 {
1749   ConstPetscScalarKokkosView  bv;
1750   PetscScalarKokkosView       xv;
1751   Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1752 
1753   PetscFunctionBegin;
1754   PetscCall(PetscLogGpuTimeBegin());
1755   PetscCall(MatSeqAIJKokkosTransposeSolveCheck(A));
1756   PetscCall(VecGetKokkosView(b, &bv));
1757   PetscCall(VecGetKokkosViewWrite(x, &xv));
1758   /* Solve U^T tmpv = b */
1759   KokkosSparse::Experimental::sptrsv_solve(&factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d, bv, factors->workVector);
1760 
1761   /* Solve L^T x = tmpv */
1762   KokkosSparse::Experimental::sptrsv_solve(&factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d, factors->workVector, xv);
1763   PetscCall(VecRestoreKokkosView(b, &bv));
1764   PetscCall(VecRestoreKokkosViewWrite(x, &xv));
1765   PetscCall(PetscLogGpuTimeEnd());
1766   PetscFunctionReturn(PETSC_SUCCESS);
1767 }
1768 
1769 static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info)
1770 {
1771   Mat_SeqAIJKokkos           *aijkok   = (Mat_SeqAIJKokkos *)A->spptr;
1772   Mat_SeqAIJKokkosTriFactors *factors  = (Mat_SeqAIJKokkosTriFactors *)B->spptr;
1773   PetscInt                    fill_lev = info->levels;
1774 
1775   PetscFunctionBegin;
1776   PetscCall(PetscLogGpuTimeBegin());
1777   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1778 
1779   auto a_d = aijkok->a_dual.view_device();
1780   auto i_d = aijkok->i_dual.view_device();
1781   auto j_d = aijkok->j_dual.view_device();
1782 
1783   KokkosSparse::Experimental::spiluk_numeric(&factors->kh, fill_lev, i_d, j_d, a_d, factors->iL_d, factors->jL_d, factors->aL_d, factors->iU_d, factors->jU_d, factors->aU_d);
1784 
1785   B->assembled              = PETSC_TRUE;
1786   B->preallocated           = PETSC_TRUE;
1787   B->ops->solve             = MatSolve_SeqAIJKokkos;
1788   B->ops->solvetranspose    = MatSolveTranspose_SeqAIJKokkos;
1789   B->ops->matsolve          = NULL;
1790   B->ops->matsolvetranspose = NULL;
1791   B->offloadmask            = PETSC_OFFLOAD_GPU;
1792 
1793   /* Once the factors' value changed, we need to update their transpose and sptrsv handle */
1794   factors->transpose_updated         = PETSC_FALSE;
1795   factors->sptrsv_symbolic_completed = PETSC_FALSE;
1796   /* TODO: log flops, but how to know that? */
1797   PetscCall(PetscLogGpuTimeEnd());
1798   PetscFunctionReturn(PETSC_SUCCESS);
1799 }
1800 
1801 static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
1802 {
1803   Mat_SeqAIJKokkos           *aijkok;
1804   Mat_SeqAIJ                 *b;
1805   Mat_SeqAIJKokkosTriFactors *factors  = (Mat_SeqAIJKokkosTriFactors *)B->spptr;
1806   PetscInt                    fill_lev = info->levels;
1807   PetscInt                    nnzA     = ((Mat_SeqAIJ *)A->data)->nz, nnzL, nnzU;
1808   PetscInt                    n        = A->rmap->n;
1809 
1810   PetscFunctionBegin;
1811   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1812   /* Rebuild factors */
1813   if (factors) {
1814     factors->Destroy();
1815   } /* Destroy the old if it exists */
1816   else {
1817     B->spptr = factors = new Mat_SeqAIJKokkosTriFactors(n);
1818   }
1819 
1820   /* Create a spiluk handle and then do symbolic factorization */
1821   nnzL = nnzU = PetscRealIntMultTruncate(info->fill, nnzA);
1822   factors->kh.create_spiluk_handle(KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1, n, nnzL, nnzU);
1823 
1824   auto spiluk_handle = factors->kh.get_spiluk_handle();
1825 
1826   Kokkos::realloc(factors->iL_d, n + 1); /* Free old arrays and realloc */
1827   Kokkos::realloc(factors->jL_d, spiluk_handle->get_nnzL());
1828   Kokkos::realloc(factors->iU_d, n + 1);
1829   Kokkos::realloc(factors->jU_d, spiluk_handle->get_nnzU());
1830 
1831   aijkok   = (Mat_SeqAIJKokkos *)A->spptr;
1832   auto i_d = aijkok->i_dual.view_device();
1833   auto j_d = aijkok->j_dual.view_device();
1834   KokkosSparse::Experimental::spiluk_symbolic(&factors->kh, fill_lev, i_d, j_d, factors->iL_d, factors->jL_d, factors->iU_d, factors->jU_d);
1835   /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */
1836 
1837   Kokkos::resize(factors->jL_d, spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */
1838   Kokkos::resize(factors->jU_d, spiluk_handle->get_nnzU());
1839   Kokkos::realloc(factors->aL_d, spiluk_handle->get_nnzL()); /* No need to retain old value */
1840   Kokkos::realloc(factors->aU_d, spiluk_handle->get_nnzU());
1841 
1842   /* TODO: add options to select sptrsv algorithms */
1843   /* Create sptrsv handles for L, U and their transpose */
1844 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
1845   auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SPTRSV_CUSPARSE;
1846 #else
1847   auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1;
1848 #endif
1849 
1850   factors->khL.create_sptrsv_handle(sptrsv_alg, n, true /* L is lower tri */);
1851   factors->khU.create_sptrsv_handle(sptrsv_alg, n, false /* U is not lower tri */);
1852   factors->khLt.create_sptrsv_handle(sptrsv_alg, n, false /* L^T is not lower tri */);
1853   factors->khUt.create_sptrsv_handle(sptrsv_alg, n, true /* U^T is lower tri */);
1854 
1855   /* Fill fields of the factor matrix B */
1856   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(B, MAT_SKIP_ALLOCATION, NULL));
1857   b     = (Mat_SeqAIJ *)B->data;
1858   b->nz = b->maxnz          = spiluk_handle->get_nnzL() + spiluk_handle->get_nnzU();
1859   B->info.fill_ratio_given  = info->fill;
1860   B->info.fill_ratio_needed = nnzA > 0 ? ((PetscReal)b->nz) / ((PetscReal)nnzA) : 1.0;
1861 
1862   B->offloadmask          = PETSC_OFFLOAD_GPU;
1863   B->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJKokkos;
1864   PetscFunctionReturn(PETSC_SUCCESS);
1865 }
1866 
1867 static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos(Mat A, MatSolverType *type)
1868 {
1869   PetscFunctionBegin;
1870   *type = MATSOLVERKOKKOS;
1871   PetscFunctionReturn(PETSC_SUCCESS);
1872 }
1873 
1874 /*MC
1875   MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices
1876   on a single GPU of type, `MATSEQAIJKOKKOS`, `MATAIJKOKKOS`.
1877 
1878   Level: beginner
1879 
1880 .seealso: [](ch_matrices), `Mat`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJKokkos()`, `MATAIJKOKKOS`, `MatKokkosSetFormat()`, `MatKokkosStorageFormat`, `MatKokkosFormatOperation`
1881 M*/
1882 PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A, MatFactorType ftype, Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */
1883 {
1884   PetscInt n = A->rmap->n;
1885 
1886   PetscFunctionBegin;
1887   PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B));
1888   PetscCall(MatSetSizes(*B, n, n, n, n));
1889   (*B)->factortype = ftype;
1890   PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU]));
1891   PetscCall(MatSetType(*B, MATSEQAIJKOKKOS));
1892 
1893   if (ftype == MAT_FACTOR_LU) {
1894     PetscCall(MatSetBlockSizesFromMats(*B, A, A));
1895     (*B)->canuseordering        = PETSC_TRUE;
1896     (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKokkos;
1897   } else if (ftype == MAT_FACTOR_ILU) {
1898     PetscCall(MatSetBlockSizesFromMats(*B, A, A));
1899     (*B)->canuseordering         = PETSC_FALSE;
1900     (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos;
1901   } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]);
1902 
1903   PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL));
1904   PetscCall(PetscObjectComposeFunction((PetscObject)*B, "MatFactorGetSolverType_C", MatFactorGetSolverType_SeqAIJKokkos));
1905   PetscFunctionReturn(PETSC_SUCCESS);
1906 }
1907 
1908 PETSC_INTERN PetscErrorCode MatSolverTypeRegister_KOKKOS(void)
1909 {
1910   PetscFunctionBegin;
1911   PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_LU, MatGetFactor_SeqAIJKokkos_Kokkos));
1912   PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_ILU, MatGetFactor_SeqAIJKokkos_Kokkos));
1913   PetscFunctionReturn(PETSC_SUCCESS);
1914 }
1915 
1916 /* Utility to print out a KokkosCsrMatrix for debugging */
1917 PETSC_INTERN PetscErrorCode PrintCsrMatrix(const KokkosCsrMatrix &csrmat)
1918 {
1919   const auto        &iv = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), csrmat.graph.row_map);
1920   const auto        &jv = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), csrmat.graph.entries);
1921   const auto        &av = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), csrmat.values);
1922   const PetscInt    *i  = iv.data();
1923   const PetscInt    *j  = jv.data();
1924   const PetscScalar *a  = av.data();
1925   PetscInt           m = csrmat.numRows(), n = csrmat.numCols(), nnz = csrmat.nnz();
1926 
1927   PetscFunctionBegin;
1928   PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT " x %" PetscInt_FMT " SeqAIJKokkos, with %" PetscInt_FMT " nonzeros\n", m, n, nnz));
1929   for (PetscInt k = 0; k < m; k++) {
1930     PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT ": ", k));
1931     for (PetscInt p = i[k]; p < i[k + 1]; p++) PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT "(%.1f), ", j[p], (double)PetscRealPart(a[p])));
1932     PetscCall(PetscPrintf(PETSC_COMM_SELF, "\n"));
1933   }
1934   PetscFunctionReturn(PETSC_SUCCESS);
1935 }
1936