xref: /petsc/src/mat/impls/aij/seq/kokkos/aijkok.kokkos.cxx (revision 26077dcc81241ba0e0025c942776d1e28062c19a)
1 #include <petsc_kokkos.hpp>
2 #include <petscvec_kokkos.hpp>
3 #include <petscpkg_version.h>
4 #include <petsc/private/petscimpl.h>
5 #include <petsc/private/sfimpl.h>
6 #include <petscsystypes.h>
7 #include <petscerror.h>
8 
9 #include <Kokkos_Core.hpp>
10 #include <KokkosBlas.hpp>
11 #include <KokkosSparse_CrsMatrix.hpp>
12 
13 // To suppress compiler warnings:
14 // /path/include/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp:434:63:
15 // warning: 'cusparseStatus_t cusparseDbsrmm(cusparseHandle_t, cusparseDirection_t, cusparseOperation_t,
16 // cusparseOperation_t, int, int, int, int, const double*, cusparseMatDescr_t, const double*, const int*, const int*,
17 // int, const double*, int, const double*, double*, int)' is deprecated: please use cusparseSpMM instead [-Wdeprecated-declarations]
18 PETSC_PRAGMA_DIAGNOSTIC_IGNORED_BEGIN("-Wdeprecated-declarations")
19 #include <KokkosSparse_spmv.hpp>
20 PETSC_PRAGMA_DIAGNOSTIC_IGNORED_END()
21 
22 #include <KokkosSparse_spiluk.hpp>
23 #include <KokkosSparse_sptrsv.hpp>
24 #include <KokkosSparse_spgemm.hpp>
25 #include <KokkosSparse_spadd.hpp>
26 #include <KokkosBatched_LU_Decl.hpp>
27 #include <KokkosBatched_InverseLU_Decl.hpp>
28 
29 #include <../src/mat/impls/aij/seq/kokkos/aijkok.hpp>
30 
31 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_GE(3, 7, 0)
32   #include <KokkosSparse_Utils.hpp>
33 using KokkosSparse::sort_crs_matrix;
34 using KokkosSparse::Impl::transpose_matrix;
35 #else
36   #include <KokkosKernels_Sorting.hpp>
37 using KokkosKernels::sort_crs_matrix;
38 using KokkosKernels::Impl::transpose_matrix;
39 #endif
40 
41 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */
42 
43 /* MatAssemblyEnd_SeqAIJKokkos() happens when we finalized nonzeros of the matrix, either after
44    we assembled the matrix on host, or after we directly produced the matrix data on device (ex., through MatMatMult).
45    In the latter case, it is important to set a_dual's sync state correctly.
46  */
47 static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A, MatAssemblyType mode)
48 {
49   Mat_SeqAIJ       *aijseq;
50   Mat_SeqAIJKokkos *aijkok;
51 
52   PetscFunctionBegin;
53   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS);
54   PetscCall(MatAssemblyEnd_SeqAIJ(A, mode));
55 
56   aijseq = static_cast<Mat_SeqAIJ *>(A->data);
57   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
58 
59   /* If aijkok does not exist, we just copy i, j to device.
60      If aijkok already exists, but the device's nonzero pattern does not match with the host's, we assume the latest data is on host.
61      In both cases, we build a new aijkok structure.
62   */
63   if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { /* aijkok might not exist yet or nonzero pattern has changed */
64     delete aijkok;
65     aijkok   = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aijseq, A->nonzerostate, PETSC_FALSE /*don't copy mat values to device*/);
66     A->spptr = aijkok;
67   } else if (A->rmap->n && aijkok->diag_dual.extent(0) == 0) { // MatProduct might directly produce AIJ on device, but not the diag.
68     MatRowMapKokkosViewHost diag_h(aijseq->diag, A->rmap->n);
69     auto                    diag_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), diag_h);
70     aijkok->diag_dual              = MatRowMapKokkosDualView(diag_d, diag_h);
71   }
72   PetscFunctionReturn(PETSC_SUCCESS);
73 }
74 
75 /* Sync CSR data to device if not yet */
76 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A)
77 {
78   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
79 
80   PetscFunctionBegin;
81   PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Can't sync factorized matrix from host to device");
82   PetscCheck(aijkok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
83   if (aijkok->a_dual.need_sync_device()) {
84     aijkok->a_dual.sync_device();
85     aijkok->transpose_updated = PETSC_FALSE; /* values of the transpose is out-of-date */
86     aijkok->hermitian_updated = PETSC_FALSE;
87   }
88   PetscFunctionReturn(PETSC_SUCCESS);
89 }
90 
91 /* Mark the CSR data on device as modified */
92 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosModifyDevice(Mat A)
93 {
94   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
95 
96   PetscFunctionBegin;
97   PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Not supported for factorized matries");
98   aijkok->a_dual.clear_sync_state();
99   aijkok->a_dual.modify_device();
100   aijkok->transpose_updated = PETSC_FALSE;
101   aijkok->hermitian_updated = PETSC_FALSE;
102   PetscCall(MatSeqAIJInvalidateDiagonal(A));
103   PetscCall(PetscObjectStateIncrease((PetscObject)A));
104   PetscFunctionReturn(PETSC_SUCCESS);
105 }
106 
107 static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A)
108 {
109   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
110   auto              exec   = PetscGetKokkosExecutionSpace();
111 
112   PetscFunctionBegin;
113   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
114   /* We do not expect one needs factors on host  */
115   PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Can't sync factorized matrix from device to host");
116   PetscCheck(aijkok, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing AIJKOK");
117   PetscCallCXX(aijkok->a_dual.sync_host(exec));
118   PetscCallCXX(exec.fence());
119   PetscFunctionReturn(PETSC_SUCCESS);
120 }
121 
122 static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A, PetscScalar *array[])
123 {
124   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
125 
126   PetscFunctionBegin;
127   /* aijkok contains valid pointers only if the host's nonzerostate matches with the device's.
128     Calling MatSeqAIJSetPreallocation() or MatSetValues() on host, where aijseq->{i,j,a} might be
129     reallocated, will lead to stale {i,j,a}_dual in aijkok. In both operations, the hosts's nonzerostate
130     must have been updated. The stale aijkok will be rebuilt during MatAssemblyEnd.
131   */
132   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
133     auto exec = PetscGetKokkosExecutionSpace();
134     PetscCallCXX(aijkok->a_dual.sync_host(exec));
135     PetscCallCXX(exec.fence());
136     *array = aijkok->a_dual.view_host().data();
137   } else { /* Happens when calling MatSetValues on a newly created matrix */
138     *array = static_cast<Mat_SeqAIJ *>(A->data)->a;
139   }
140   PetscFunctionReturn(PETSC_SUCCESS);
141 }
142 
143 static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJKokkos(Mat A, PetscScalar *array[])
144 {
145   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
146 
147   PetscFunctionBegin;
148   if (aijkok && A->nonzerostate == aijkok->nonzerostate) aijkok->a_dual.modify_host();
149   PetscFunctionReturn(PETSC_SUCCESS);
150 }
151 
152 static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[])
153 {
154   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
155 
156   PetscFunctionBegin;
157   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
158     auto exec = PetscGetKokkosExecutionSpace();
159     PetscCallCXX(aijkok->a_dual.sync_host(exec));
160     PetscCallCXX(exec.fence());
161     *array = aijkok->a_dual.view_host().data();
162   } else {
163     *array = static_cast<Mat_SeqAIJ *>(A->data)->a;
164   }
165   PetscFunctionReturn(PETSC_SUCCESS);
166 }
167 
168 static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[])
169 {
170   PetscFunctionBegin;
171   *array = NULL;
172   PetscFunctionReturn(PETSC_SUCCESS);
173 }
174 
175 static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[])
176 {
177   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
178 
179   PetscFunctionBegin;
180   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
181     *array = aijkok->a_dual.view_host().data();
182   } else { /* Ex. happens with MatZeroEntries on a preallocated but not assembled matrix */
183     *array = static_cast<Mat_SeqAIJ *>(A->data)->a;
184   }
185   PetscFunctionReturn(PETSC_SUCCESS);
186 }
187 
188 static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[])
189 {
190   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
191 
192   PetscFunctionBegin;
193   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
194     aijkok->a_dual.clear_sync_state();
195     aijkok->a_dual.modify_host();
196   }
197   PetscFunctionReturn(PETSC_SUCCESS);
198 }
199 
200 static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJKokkos(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype)
201 {
202   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
203 
204   PetscFunctionBegin;
205   PetscCheck(aijkok != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "aijkok is NULL");
206 
207   if (i) *i = aijkok->i_device_data();
208   if (j) *j = aijkok->j_device_data();
209   if (a) {
210     aijkok->a_dual.sync_device();
211     *a = aijkok->a_device_data();
212   }
213   if (mtype) *mtype = PETSC_MEMTYPE_KOKKOS;
214   PetscFunctionReturn(PETSC_SUCCESS);
215 }
216 
217 /*
218   Generate the sparsity pattern of a MatSeqAIJKokkos matrix's transpose on device.
219 
220   Input Parameter:
221 .  A       - the MATSEQAIJKOKKOS matrix
222 
223   Output Parameters:
224 +  perm_d - the permutation array on device, which connects Ta(i) = Aa(perm(i))
225 -  T_d    - the transpose on device, whose value array is allocated but not initialized
226 */
227 static PetscErrorCode MatSeqAIJKokkosGenerateTransposeStructure(Mat A, MatRowMapKokkosView &perm_d, KokkosCsrMatrix &T_d)
228 {
229   Mat_SeqAIJ             *aseq = static_cast<Mat_SeqAIJ *>(A->data);
230   PetscInt                nz = aseq->nz, m = A->rmap->N, n = A->cmap->n;
231   const PetscInt         *Ai = aseq->i, *Aj = aseq->j;
232   MatRowMapKokkosViewHost Ti_h(NoInit("Ti"), n + 1);
233   MatRowMapType          *Ti = Ti_h.data();
234   MatColIdxKokkosViewHost Tj_h(NoInit("Tj"), nz);
235   MatRowMapKokkosViewHost perm_h(NoInit("permutation"), nz);
236   PetscInt               *Tj   = Tj_h.data();
237   PetscInt               *perm = perm_h.data();
238   PetscInt               *offset;
239 
240   PetscFunctionBegin;
241   // Populate Ti
242   PetscCallCXX(Kokkos::deep_copy(Ti_h, 0));
243   Ti++;
244   for (PetscInt i = 0; i < nz; i++) Ti[Aj[i]]++;
245   Ti--;
246   for (PetscInt i = 0; i < n; i++) Ti[i + 1] += Ti[i];
247 
248   // Populate Tj and the permutation array
249   PetscCall(PetscCalloc1(n, &offset)); // offset in each T row to fill in its column indices
250   for (PetscInt i = 0; i < m; i++) {
251     for (PetscInt j = Ai[i]; j < Ai[i + 1]; j++) { // A's (i,j) is T's (j,i)
252       PetscInt r    = Aj[j];                       // row r of T
253       PetscInt disp = Ti[r] + offset[r];
254 
255       Tj[disp]   = i; // col i of T
256       perm[disp] = j;
257       offset[r]++;
258     }
259   }
260   PetscCall(PetscFree(offset));
261 
262   // Sort each row of T, along with the permutation array
263   for (PetscInt i = 0; i < n; i++) PetscCall(PetscSortIntWithArray(Ti[i + 1] - Ti[i], Tj + Ti[i], perm + Ti[i]));
264 
265   // Output perm and T on device
266   auto Ti_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), Ti_h);
267   auto Tj_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), Tj_h);
268   PetscCallCXX(T_d = KokkosCsrMatrix("csrmatT", n, m, nz, MatScalarKokkosView("Ta", nz), Ti_d, Tj_d));
269   PetscCallCXX(perm_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), perm_h));
270   PetscFunctionReturn(PETSC_SUCCESS);
271 }
272 
273 // Generate the transpose on device and cache it internally
274 // Note: KK transpose_matrix() does not have support symbolic/numeric transpose, so we do it on our own
275 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGenerateTranspose_Private(Mat A, KokkosCsrMatrix *csrmatT)
276 {
277   Mat_SeqAIJ       *aseq = static_cast<Mat_SeqAIJ *>(A->data);
278   Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
279   PetscInt          nz = aseq->nz, m = A->rmap->N, n = A->cmap->n;
280   KokkosCsrMatrix  &T = akok->csrmatT;
281 
282   PetscFunctionBegin;
283   PetscCheck(akok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
284   PetscCallCXX(akok->a_dual.sync_device()); // Sync A's values since we are going to access them on device
285 
286   const auto &Aa = akok->a_dual.view_device();
287 
288   if (A->symmetric == PETSC_BOOL3_TRUE) {
289     *csrmatT = akok->csrmat;
290   } else {
291     // See if we already have a cached transpose and its value is up to date
292     if (T.numRows() == n && T.numCols() == m) {  // this indicates csrmatT had been generated before, otherwise T has 0 rows/cols after construction
293       if (!akok->transpose_updated) {            // if the value is out of date, update the cached version
294         const auto &perm = akok->transpose_perm; // get the permutation array
295         auto       &Ta   = T.values;
296 
297         PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { Ta(i) = Aa(perm(i)); }));
298       }
299     } else { // Generate T of size n x m for the first time
300       MatRowMapKokkosView perm;
301 
302       PetscCall(MatSeqAIJKokkosGenerateTransposeStructure(A, perm, T));
303       akok->transpose_perm = perm; // cache the perm in this matrix for reuse
304       PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { T.values(i) = Aa(perm(i)); }));
305     }
306     akok->transpose_updated = PETSC_TRUE;
307     *csrmatT                = akok->csrmatT;
308   }
309   PetscFunctionReturn(PETSC_SUCCESS);
310 }
311 
312 // Generate the Hermitian on device and cache it internally
313 static PetscErrorCode MatSeqAIJKokkosGenerateHermitian_Private(Mat A, KokkosCsrMatrix *csrmatH)
314 {
315   Mat_SeqAIJ       *aseq = static_cast<Mat_SeqAIJ *>(A->data);
316   Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
317   PetscInt          nz = aseq->nz, m = A->rmap->N, n = A->cmap->n;
318   KokkosCsrMatrix  &T = akok->csrmatH;
319 
320   PetscFunctionBegin;
321   PetscCheck(akok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
322   PetscCallCXX(akok->a_dual.sync_device()); // Sync A's values since we are going to access them on device
323 
324   const auto &Aa = akok->a_dual.view_device();
325 
326   if (A->hermitian == PETSC_BOOL3_TRUE) {
327     *csrmatH = akok->csrmat;
328   } else {
329     // See if we already have a cached hermitian and its value is up to date
330     if (T.numRows() == n && T.numCols() == m) {  // this indicates csrmatT had been generated before, otherwise T has 0 rows/cols after construction
331       if (!akok->hermitian_updated) {            // if the value is out of date, update the cached version
332         const auto &perm = akok->transpose_perm; // get the permutation array
333         auto       &Ta   = T.values;
334 
335         PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { Ta(i) = PetscConj(Aa(perm(i))); }));
336       }
337     } else { // Generate T of size n x m for the first time
338       MatRowMapKokkosView perm;
339 
340       PetscCall(MatSeqAIJKokkosGenerateTransposeStructure(A, perm, T));
341       akok->transpose_perm = perm; // cache the perm in this matrix for reuse
342       PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { T.values(i) = PetscConj(Aa(perm(i))); }));
343     }
344     akok->hermitian_updated = PETSC_TRUE;
345     *csrmatH                = akok->csrmatH;
346   }
347   PetscFunctionReturn(PETSC_SUCCESS);
348 }
349 
350 /* y = A x */
351 static PetscErrorCode MatMult_SeqAIJKokkos(Mat A, Vec xx, Vec yy)
352 {
353   Mat_SeqAIJKokkos          *aijkok;
354   ConstPetscScalarKokkosView xv;
355   PetscScalarKokkosView      yv;
356 
357   PetscFunctionBegin;
358   PetscCall(PetscLogGpuTimeBegin());
359   PetscCall(MatSeqAIJKokkosSyncDevice(A));
360   PetscCall(VecGetKokkosView(xx, &xv));
361   PetscCall(VecGetKokkosViewWrite(yy, &yv));
362   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
363   PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), "N", 1.0 /*alpha*/, aijkok->csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A x + beta y */
364   PetscCall(VecRestoreKokkosView(xx, &xv));
365   PetscCall(VecRestoreKokkosViewWrite(yy, &yv));
366   /* 2.0*nnz - numRows seems more accurate here but assumes there are no zero-rows. So a little sloppy here. */
367   PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz()));
368   PetscCall(PetscLogGpuTimeEnd());
369   PetscFunctionReturn(PETSC_SUCCESS);
370 }
371 
372 /* y = A^T x */
373 static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy)
374 {
375   Mat_SeqAIJKokkos          *aijkok;
376   const char                *mode;
377   ConstPetscScalarKokkosView xv;
378   PetscScalarKokkosView      yv;
379   KokkosCsrMatrix            csrmat;
380 
381   PetscFunctionBegin;
382   PetscCall(PetscLogGpuTimeBegin());
383   PetscCall(MatSeqAIJKokkosSyncDevice(A));
384   PetscCall(VecGetKokkosView(xx, &xv));
385   PetscCall(VecGetKokkosViewWrite(yy, &yv));
386   if (A->form_explicit_transpose) {
387     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat));
388     mode = "N";
389   } else {
390     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
391     csrmat = aijkok->csrmat;
392     mode   = "T";
393   }
394   PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^T x + beta y */
395   PetscCall(VecRestoreKokkosView(xx, &xv));
396   PetscCall(VecRestoreKokkosViewWrite(yy, &yv));
397   PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
398   PetscCall(PetscLogGpuTimeEnd());
399   PetscFunctionReturn(PETSC_SUCCESS);
400 }
401 
402 /* y = A^H x */
403 static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy)
404 {
405   Mat_SeqAIJKokkos          *aijkok;
406   const char                *mode;
407   ConstPetscScalarKokkosView xv;
408   PetscScalarKokkosView      yv;
409   KokkosCsrMatrix            csrmat;
410 
411   PetscFunctionBegin;
412   PetscCall(PetscLogGpuTimeBegin());
413   PetscCall(MatSeqAIJKokkosSyncDevice(A));
414   PetscCall(VecGetKokkosView(xx, &xv));
415   PetscCall(VecGetKokkosViewWrite(yy, &yv));
416   if (A->form_explicit_transpose) {
417     PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat));
418     mode = "N";
419   } else {
420     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
421     csrmat = aijkok->csrmat;
422     mode   = "C";
423   }
424   PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^H x + beta y */
425   PetscCall(VecRestoreKokkosView(xx, &xv));
426   PetscCall(VecRestoreKokkosViewWrite(yy, &yv));
427   PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
428   PetscCall(PetscLogGpuTimeEnd());
429   PetscFunctionReturn(PETSC_SUCCESS);
430 }
431 
432 /* z = A x + y */
433 static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz)
434 {
435   Mat_SeqAIJKokkos          *aijkok;
436   ConstPetscScalarKokkosView xv;
437   PetscScalarKokkosView      zv;
438 
439   PetscFunctionBegin;
440   PetscCall(PetscLogGpuTimeBegin());
441   PetscCall(MatSeqAIJKokkosSyncDevice(A));
442   if (zz != yy) PetscCall(VecCopy(yy, zz)); // depending on yy's sync flags, zz might get its latest data on host
443   PetscCall(VecGetKokkosView(xx, &xv));
444   PetscCall(VecGetKokkosView(zz, &zv)); // do after VecCopy(yy, zz) to get the latest data on device
445   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
446   PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), "N", 1.0 /*alpha*/, aijkok->csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A x + beta z */
447   PetscCall(VecRestoreKokkosView(xx, &xv));
448   PetscCall(VecRestoreKokkosView(zz, &zv));
449   PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz()));
450   PetscCall(PetscLogGpuTimeEnd());
451   PetscFunctionReturn(PETSC_SUCCESS);
452 }
453 
454 /* z = A^T x + y */
455 static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz)
456 {
457   Mat_SeqAIJKokkos          *aijkok;
458   const char                *mode;
459   ConstPetscScalarKokkosView xv;
460   PetscScalarKokkosView      zv;
461   KokkosCsrMatrix            csrmat;
462 
463   PetscFunctionBegin;
464   PetscCall(PetscLogGpuTimeBegin());
465   PetscCall(MatSeqAIJKokkosSyncDevice(A));
466   if (zz != yy) PetscCall(VecCopy(yy, zz));
467   PetscCall(VecGetKokkosView(xx, &xv));
468   PetscCall(VecGetKokkosView(zz, &zv));
469   if (A->form_explicit_transpose) {
470     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat));
471     mode = "N";
472   } else {
473     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
474     csrmat = aijkok->csrmat;
475     mode   = "T";
476   }
477   PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^T x + beta z */
478   PetscCall(VecRestoreKokkosView(xx, &xv));
479   PetscCall(VecRestoreKokkosView(zz, &zv));
480   PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
481   PetscCall(PetscLogGpuTimeEnd());
482   PetscFunctionReturn(PETSC_SUCCESS);
483 }
484 
485 /* z = A^H x + y */
486 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz)
487 {
488   Mat_SeqAIJKokkos          *aijkok;
489   const char                *mode;
490   ConstPetscScalarKokkosView xv;
491   PetscScalarKokkosView      zv;
492   KokkosCsrMatrix            csrmat;
493 
494   PetscFunctionBegin;
495   PetscCall(PetscLogGpuTimeBegin());
496   PetscCall(MatSeqAIJKokkosSyncDevice(A));
497   if (zz != yy) PetscCall(VecCopy(yy, zz));
498   PetscCall(VecGetKokkosView(xx, &xv));
499   PetscCall(VecGetKokkosView(zz, &zv));
500   if (A->form_explicit_transpose) {
501     PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat));
502     mode = "N";
503   } else {
504     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
505     csrmat = aijkok->csrmat;
506     mode   = "C";
507   }
508   PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^H x + beta z */
509   PetscCall(VecRestoreKokkosView(xx, &xv));
510   PetscCall(VecRestoreKokkosView(zz, &zv));
511   PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
512   PetscCall(PetscLogGpuTimeEnd());
513   PetscFunctionReturn(PETSC_SUCCESS);
514 }
515 
516 static PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A, MatOption op, PetscBool flg)
517 {
518   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
519 
520   PetscFunctionBegin;
521   switch (op) {
522   case MAT_FORM_EXPLICIT_TRANSPOSE:
523     /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
524     if (A->form_explicit_transpose && !flg && aijkok) PetscCall(aijkok->DestroyMatTranspose());
525     A->form_explicit_transpose = flg;
526     break;
527   default:
528     PetscCall(MatSetOption_SeqAIJ(A, op, flg));
529     break;
530   }
531   PetscFunctionReturn(PETSC_SUCCESS);
532 }
533 
534 /* Depending on reuse, either build a new mat, or use the existing mat */
535 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat *newmat)
536 {
537   Mat_SeqAIJ *aseq;
538 
539   PetscFunctionBegin;
540   PetscCall(PetscKokkosInitializeCheck());
541   if (reuse == MAT_INITIAL_MATRIX) {                      /* Build a brand new mat */
542     PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat));  /* the returned newmat is a SeqAIJKokkos */
543   } else if (reuse == MAT_REUSE_MATRIX) {                 /* Reuse the mat created before */
544     PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN)); /* newmat is already a SeqAIJKokkos */
545   } else if (reuse == MAT_INPLACE_MATRIX) {               /* newmat is A */
546     PetscCheck(A == *newmat, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "A != *newmat with MAT_INPLACE_MATRIX");
547     PetscCall(PetscFree(A->defaultvectype));
548     PetscCall(PetscStrallocpy(VECKOKKOS, &A->defaultvectype)); /* Allocate and copy the string */
549     PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATSEQAIJKOKKOS));
550     PetscCall(MatSetOps_SeqAIJKokkos(A));
551     aseq = static_cast<Mat_SeqAIJ *>(A->data);
552     if (A->assembled) { /* Copy i, j (but not values) to device for an assembled matrix if not yet */
553       PetscCheck(!A->spptr, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Expect NULL (Mat_SeqAIJKokkos*)A->spptr");
554       A->spptr = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aseq, A->nonzerostate, PETSC_FALSE);
555     }
556   }
557   PetscFunctionReturn(PETSC_SUCCESS);
558 }
559 
560 /* MatDuplicate always creates a new matrix. MatDuplicate can be called either on an assembled matrix or
561    an unassembled matrix, even though MAT_COPY_VALUES is not allowed for unassembled matrix.
562  */
563 static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A, MatDuplicateOption dupOption, Mat *B)
564 {
565   Mat_SeqAIJ       *bseq;
566   Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr), *bkok;
567   Mat               mat;
568 
569   PetscFunctionBegin;
570   /* Do not copy values on host as A's latest values might be on device. We don't want to do sync blindly */
571   PetscCall(MatDuplicate_SeqAIJ(A, MAT_DO_NOT_COPY_VALUES, B));
572   mat = *B;
573   if (A->assembled) {
574     bseq = static_cast<Mat_SeqAIJ *>(mat->data);
575     bkok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, bseq, mat->nonzerostate, PETSC_FALSE);
576     bkok->a_dual.clear_sync_state(); /* Clear B's sync state as it will be decided below */
577     /* Now copy values to B if needed */
578     if (dupOption == MAT_COPY_VALUES) {
579       if (akok->a_dual.need_sync_device()) {
580         Kokkos::deep_copy(bkok->a_dual.view_host(), akok->a_dual.view_host());
581         bkok->a_dual.modify_host();
582       } else { /* If device has the latest data, we only copy data on device */
583         Kokkos::deep_copy(bkok->a_dual.view_device(), akok->a_dual.view_device());
584         bkok->a_dual.modify_device();
585       }
586     } else { /* MAT_DO_NOT_COPY_VALUES or MAT_SHARE_NONZERO_PATTERN. B's values should be zeroed */
587       /* B's values on host should be already zeroed by MatDuplicate_SeqAIJ() */
588       bkok->a_dual.modify_host();
589     }
590     mat->spptr = bkok;
591   }
592 
593   PetscCall(PetscFree(mat->defaultvectype));
594   PetscCall(PetscStrallocpy(VECKOKKOS, &mat->defaultvectype)); /* Allocate and copy the string */
595   PetscCall(PetscObjectChangeTypeName((PetscObject)mat, MATSEQAIJKOKKOS));
596   PetscCall(MatSetOps_SeqAIJKokkos(mat));
597   PetscFunctionReturn(PETSC_SUCCESS);
598 }
599 
600 static PetscErrorCode MatTranspose_SeqAIJKokkos(Mat A, MatReuse reuse, Mat *B)
601 {
602   Mat               At;
603   KokkosCsrMatrix   internT;
604   Mat_SeqAIJKokkos *atkok, *bkok;
605 
606   PetscFunctionBegin;
607   if (reuse == MAT_REUSE_MATRIX) PetscCall(MatTransposeCheckNonzeroState_Private(A, *B));
608   PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &internT)); /* Generate a transpose internally */
609   if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_INPLACE_MATRIX) {
610     /* Deep copy internT, as we want to isolate the internal transpose */
611     PetscCallCXX(atkok = new Mat_SeqAIJKokkos(KokkosCsrMatrix("csrmat", internT)));
612     PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PetscObjectComm((PetscObject)A), atkok, &At));
613     if (reuse == MAT_INITIAL_MATRIX) *B = At;
614     else PetscCall(MatHeaderReplace(A, &At)); /* Replace A with At inplace */
615   } else {                                    /* MAT_REUSE_MATRIX, just need to copy values to B on device */
616     if ((*B)->assembled) {
617       bkok = static_cast<Mat_SeqAIJKokkos *>((*B)->spptr);
618       PetscCallCXX(Kokkos::deep_copy(bkok->a_dual.view_device(), internT.values));
619       PetscCall(MatSeqAIJKokkosModifyDevice(*B));
620     } else if ((*B)->preallocated) { /* It is ok for B to be only preallocated, as needed in MatTranspose_MPIAIJ */
621       Mat_SeqAIJ             *bseq = static_cast<Mat_SeqAIJ *>((*B)->data);
622       MatScalarKokkosViewHost a_h(bseq->a, internT.nnz()); /* bseq->nz = 0 if unassembled */
623       MatColIdxKokkosViewHost j_h(bseq->j, internT.nnz());
624       PetscCallCXX(Kokkos::deep_copy(a_h, internT.values));
625       PetscCallCXX(Kokkos::deep_copy(j_h, internT.graph.entries));
626     } else SETERRQ(PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "B must be assembled or preallocated");
627   }
628   PetscFunctionReturn(PETSC_SUCCESS);
629 }
630 
631 static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A)
632 {
633   Mat_SeqAIJKokkos *aijkok;
634 
635   PetscFunctionBegin;
636   if (A->factortype == MAT_FACTOR_NONE) {
637     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
638     delete aijkok;
639   } else {
640     delete static_cast<Mat_SeqAIJKokkosTriFactors *>(A->spptr);
641   }
642   A->spptr = NULL;
643   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatFactorGetSolverType_C", NULL));
644   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
645   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
646 #if defined(PETSC_HAVE_HYPRE)
647   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijkokkos_hypre_C", NULL));
648 #endif
649   PetscCall(MatDestroy_SeqAIJ(A));
650   PetscFunctionReturn(PETSC_SUCCESS);
651 }
652 
653 /*MC
654    MATSEQAIJKOKKOS - MATAIJKOKKOS = "(seq)aijkokkos" - A matrix type to be used for sparse matrices with Kokkos
655 
656    A matrix type using Kokkos-Kernels CrsMatrix type for portability across different device types
657 
658    Options Database Key:
659 .  -mat_type aijkokkos - sets the matrix type to `MATSEQAIJKOKKOS` during a call to `MatSetFromOptions()`
660 
661   Level: beginner
662 
663 .seealso: [](ch_matrices), `Mat`, `MatCreateSeqAIJKokkos()`, `MATMPIAIJKOKKOS`
664 M*/
665 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A)
666 {
667   PetscFunctionBegin;
668   PetscCall(PetscKokkosInitializeCheck());
669   PetscCall(MatCreate_SeqAIJ(A));
670   PetscCall(MatConvert_SeqAIJ_SeqAIJKokkos(A, MATSEQAIJKOKKOS, MAT_INPLACE_MATRIX, &A));
671   PetscFunctionReturn(PETSC_SUCCESS);
672 }
673 
674 /* Merge A, B into a matrix C. A is put before B. C's size would be A->rmap->n by (A->cmap->n + B->cmap->n) */
675 PetscErrorCode MatSeqAIJKokkosMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C)
676 {
677   Mat_SeqAIJ         *a, *b;
678   Mat_SeqAIJKokkos   *akok, *bkok, *ckok;
679   MatScalarKokkosView aa, ba, ca;
680   MatRowMapKokkosView ai, bi, ci;
681   MatColIdxKokkosView aj, bj, cj;
682   PetscInt            m, n, nnz, aN;
683 
684   PetscFunctionBegin;
685   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
686   PetscValidHeaderSpecific(B, MAT_CLASSID, 2);
687   PetscAssertPointer(C, 4);
688   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
689   PetscCheckTypeName(B, MATSEQAIJKOKKOS);
690   PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n);
691   PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported");
692 
693   PetscCall(MatSeqAIJKokkosSyncDevice(A));
694   PetscCall(MatSeqAIJKokkosSyncDevice(B));
695   a    = static_cast<Mat_SeqAIJ *>(A->data);
696   b    = static_cast<Mat_SeqAIJ *>(B->data);
697   akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
698   bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
699   aa   = akok->a_dual.view_device();
700   ai   = akok->i_dual.view_device();
701   ba   = bkok->a_dual.view_device();
702   bi   = bkok->i_dual.view_device();
703   m    = A->rmap->n; /* M, N and nnz of C */
704   n    = A->cmap->n + B->cmap->n;
705   nnz  = a->nz + b->nz;
706   aN   = A->cmap->n; /* N of A */
707   if (reuse == MAT_INITIAL_MATRIX) {
708     aj           = akok->j_dual.view_device();
709     bj           = bkok->j_dual.view_device();
710     auto ca_dual = MatScalarKokkosDualView("a", aa.extent(0) + ba.extent(0));
711     auto ci_dual = MatRowMapKokkosDualView("i", ai.extent(0));
712     auto cj_dual = MatColIdxKokkosDualView("j", aj.extent(0) + bj.extent(0));
713     ca           = ca_dual.view_device();
714     ci           = ci_dual.view_device();
715     cj           = cj_dual.view_device();
716 
717     /* Concatenate A and B in parallel using Kokkos hierarchical parallelism */
718     Kokkos::parallel_for(
719       Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
720         PetscInt i       = t.league_rank(); /* row i */
721         PetscInt coffset = ai(i) + bi(i), alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i);
722 
723         Kokkos::single(Kokkos::PerTeam(t), [=]() { /* this side effect only happens once per whole team */
724                                                    ci(i) = coffset;
725                                                    if (i == m - 1) ci(m) = ai(m) + bi(m);
726         });
727 
728         Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) {
729           if (k < alen) {
730             ca(coffset + k) = aa(ai(i) + k);
731             cj(coffset + k) = aj(ai(i) + k);
732           } else {
733             ca(coffset + k) = ba(bi(i) + k - alen);
734             cj(coffset + k) = bj(bi(i) + k - alen) + aN; /* Entries in B get new column indices in C */
735           }
736         });
737       });
738     ca_dual.modify_device();
739     ci_dual.modify_device();
740     cj_dual.modify_device();
741     PetscCallCXX(ckok = new Mat_SeqAIJKokkos(m, n, nnz, ci_dual, cj_dual, ca_dual));
742     PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, ckok, C));
743   } else if (reuse == MAT_REUSE_MATRIX) {
744     PetscValidHeaderSpecific(*C, MAT_CLASSID, 4);
745     PetscCheckTypeName(*C, MATSEQAIJKOKKOS);
746     ckok = static_cast<Mat_SeqAIJKokkos *>((*C)->spptr);
747     ca   = ckok->a_dual.view_device();
748     ci   = ckok->i_dual.view_device();
749 
750     Kokkos::parallel_for(
751       Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
752         PetscInt i    = t.league_rank(); /* row i */
753         PetscInt alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i);
754         Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) {
755           if (k < alen) ca(ci(i) + k) = aa(ai(i) + k);
756           else ca(ci(i) + k) = ba(bi(i) + k - alen);
757         });
758       });
759     PetscCall(MatSeqAIJKokkosModifyDevice(*C));
760   }
761   PetscFunctionReturn(PETSC_SUCCESS);
762 }
763 
764 static PetscErrorCode MatProductDataDestroy_SeqAIJKokkos(void *pdata)
765 {
766   PetscFunctionBegin;
767   delete static_cast<MatProductData_SeqAIJKokkos *>(pdata);
768   PetscFunctionReturn(PETSC_SUCCESS);
769 }
770 
771 static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C)
772 {
773   Mat_Product                 *product = C->product;
774   Mat                          A, B;
775   bool                         transA, transB; /* use bool, since KK needs this type */
776   Mat_SeqAIJKokkos            *akok, *bkok, *ckok;
777   Mat_SeqAIJ                  *c;
778   MatProductData_SeqAIJKokkos *pdata;
779   KokkosCsrMatrix              csrmatA, csrmatB;
780 
781   PetscFunctionBegin;
782   MatCheckProduct(C, 1);
783   PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Product data empty");
784   pdata = static_cast<MatProductData_SeqAIJKokkos *>(C->product->data);
785 
786   // See if numeric has already been done in symbolic (e.g., user calls MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C)).
787   // If yes, skip the numeric, but reset the flag so that next time when user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C),
788   // we still do numeric.
789   if (pdata->reusesym) { // numeric reuses results from symbolic
790     pdata->reusesym = PETSC_FALSE;
791     PetscFunctionReturn(PETSC_SUCCESS);
792   }
793 
794   switch (product->type) {
795   case MATPRODUCT_AB:
796     transA = false;
797     transB = false;
798     break;
799   case MATPRODUCT_AtB:
800     transA = true;
801     transB = false;
802     break;
803   case MATPRODUCT_ABt:
804     transA = false;
805     transB = true;
806     break;
807   default:
808     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]);
809   }
810 
811   A = product->A;
812   B = product->B;
813   PetscCall(MatSeqAIJKokkosSyncDevice(A));
814   PetscCall(MatSeqAIJKokkosSyncDevice(B));
815   akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
816   bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
817   ckok = static_cast<Mat_SeqAIJKokkos *>(C->spptr);
818 
819   PetscCheck(ckok, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Device data structure spptr is empty");
820 
821   csrmatA = akok->csrmat;
822   csrmatB = bkok->csrmat;
823 
824   /* TODO: Once KK spgemm implements transpose, we can get rid of the explicit transpose here */
825   if (transA) {
826     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA));
827     transA = false;
828   }
829 
830   if (transB) {
831     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB));
832     transB = false;
833   }
834   PetscCall(PetscLogGpuTimeBegin());
835   PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, csrmatA, transA, csrmatB, transB, ckok->csrmat));
836 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(4, 0, 0)
837   auto spgemmHandle = pdata->kh.get_spgemm_handle();
838   if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(ckok->csrmat)); /* without sort, mat_tests-ex62_14_seqaijkokkos fails */
839 #endif
840 
841   PetscCall(PetscLogGpuTimeEnd());
842   PetscCall(MatSeqAIJKokkosModifyDevice(C));
843   /* shorter version of MatAssemblyEnd_SeqAIJ */
844   c = (Mat_SeqAIJ *)C->data;
845   PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz));
846   PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n"));
847   PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax));
848   c->reallocs         = 0;
849   C->info.mallocs     = 0;
850   C->info.nz_unneeded = 0;
851   C->assembled = C->was_assembled = PETSC_TRUE;
852   C->num_ass++;
853   PetscFunctionReturn(PETSC_SUCCESS);
854 }
855 
856 static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C)
857 {
858   Mat_Product                 *product = C->product;
859   MatProductType               ptype;
860   Mat                          A, B;
861   bool                         transA, transB;
862   Mat_SeqAIJKokkos            *akok, *bkok, *ckok;
863   MatProductData_SeqAIJKokkos *pdata;
864   MPI_Comm                     comm;
865   KokkosCsrMatrix              csrmatA, csrmatB, csrmatC;
866 
867   PetscFunctionBegin;
868   MatCheckProduct(C, 1);
869   PetscCall(PetscObjectGetComm((PetscObject)C, &comm));
870   PetscCheck(!product->data, comm, PETSC_ERR_PLIB, "Product data not empty");
871   A = product->A;
872   B = product->B;
873   PetscCall(MatSeqAIJKokkosSyncDevice(A));
874   PetscCall(MatSeqAIJKokkosSyncDevice(B));
875   akok    = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
876   bkok    = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
877   csrmatA = akok->csrmat;
878   csrmatB = bkok->csrmat;
879 
880   ptype = product->type;
881   // Take advantage of the symmetry if true
882   if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
883     ptype                                          = MATPRODUCT_AB;
884     product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE;
885   }
886   if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
887     ptype                                          = MATPRODUCT_AB;
888     product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE;
889   }
890 
891   switch (ptype) {
892   case MATPRODUCT_AB:
893     transA = false;
894     transB = false;
895     PetscCall(MatSetBlockSizesFromMats(C, A, B));
896     break;
897   case MATPRODUCT_AtB:
898     transA = true;
899     transB = false;
900     if (A->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, A->cmap->bs));
901     if (B->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->cmap->bs));
902     break;
903   case MATPRODUCT_ABt:
904     transA = false;
905     transB = true;
906     if (A->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, A->rmap->bs));
907     if (B->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->rmap->bs));
908     break;
909   default:
910     SETERRQ(comm, PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]);
911   }
912   PetscCallCXX(product->data = pdata = new MatProductData_SeqAIJKokkos());
913   pdata->reusesym = product->api_user;
914 
915   /* TODO: add command line options to select spgemm algorithms */
916   auto spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_DEFAULT; /* default alg is TPL if enabled, otherwise KK */
917 
918   /* CUDA-10.2's spgemm has bugs. We prefer the SpGEMMreuse APIs introduced in cuda-11.4 */
919 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
920   #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
921   spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_KK;
922   #endif
923 #endif
924   PetscCallCXX(pdata->kh.create_spgemm_handle(spgemm_alg));
925 
926   PetscCall(PetscLogGpuTimeBegin());
927   /* TODO: Get rid of the explicit transpose once KK-spgemm implements the transpose option */
928   if (transA) {
929     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA));
930     transA = false;
931   }
932 
933   if (transB) {
934     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB));
935     transB = false;
936   }
937 
938   PetscCallCXX(KokkosSparse::spgemm_symbolic(pdata->kh, csrmatA, transA, csrmatB, transB, csrmatC));
939   /* spgemm_symbolic() only populates C's rowmap, but not C's column indices.
940     So we have to do a fake spgemm_numeric() here to get csrmatC.j_d setup, before
941     calling new Mat_SeqAIJKokkos().
942     TODO: Remove the fake spgemm_numeric() after KK fixed this problem.
943   */
944   PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, csrmatA, transA, csrmatB, transB, csrmatC));
945 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(4, 0, 0)
946   /* Query if KK outputs a sorted matrix. If not, we need to sort it */
947   auto spgemmHandle = pdata->kh.get_spgemm_handle();
948   if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(csrmatC)); /* sort_option defaults to -1 in KK!*/
949 #endif
950   PetscCall(PetscLogGpuTimeEnd());
951 
952   PetscCallCXX(ckok = new Mat_SeqAIJKokkos(csrmatC));
953   PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(C, ckok));
954   C->product->destroy = MatProductDataDestroy_SeqAIJKokkos;
955   PetscFunctionReturn(PETSC_SUCCESS);
956 }
957 
958 /* handles sparse matrix matrix ops */
959 static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat)
960 {
961   Mat_Product *product = mat->product;
962   PetscBool    Biskok = PETSC_FALSE, Ciskok = PETSC_TRUE;
963 
964   PetscFunctionBegin;
965   MatCheckProduct(mat, 1);
966   PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJKOKKOS, &Biskok));
967   if (product->type == MATPRODUCT_ABC) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJKOKKOS, &Ciskok));
968   if (Biskok && Ciskok) {
969     switch (product->type) {
970     case MATPRODUCT_AB:
971     case MATPRODUCT_AtB:
972     case MATPRODUCT_ABt:
973       mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos;
974       break;
975     case MATPRODUCT_PtAP:
976     case MATPRODUCT_RARt:
977     case MATPRODUCT_ABC:
978       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
979       break;
980     default:
981       break;
982     }
983   } else { /* fallback for AIJ */
984     PetscCall(MatProductSetFromOptions_SeqAIJ(mat));
985   }
986   PetscFunctionReturn(PETSC_SUCCESS);
987 }
988 
989 static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a)
990 {
991   Mat_SeqAIJKokkos *aijkok;
992 
993   PetscFunctionBegin;
994   PetscCall(PetscLogGpuTimeBegin());
995   PetscCall(MatSeqAIJKokkosSyncDevice(A));
996   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
997   KokkosBlas::scal(PetscGetKokkosExecutionSpace(), aijkok->a_dual.view_device(), a, aijkok->a_dual.view_device());
998   PetscCall(MatSeqAIJKokkosModifyDevice(A));
999   PetscCall(PetscLogGpuFlops(aijkok->a_dual.extent(0)));
1000   PetscCall(PetscLogGpuTimeEnd());
1001   PetscFunctionReturn(PETSC_SUCCESS);
1002 }
1003 
1004 // add a to A's diagonal (if A is square) or main diagonal (if A is rectangular)
1005 static PetscErrorCode MatShift_SeqAIJKokkos(Mat A, PetscScalar a)
1006 {
1007   Mat_SeqAIJ *aijseq = static_cast<Mat_SeqAIJ *>(A->data);
1008 
1009   PetscFunctionBegin;
1010   if (A->assembled && aijseq->diagonaldense) { // no missing diagonals
1011     PetscInt n = PetscMin(A->rmap->n, A->cmap->n);
1012 
1013     PetscCall(PetscLogGpuTimeBegin());
1014     PetscCall(MatSeqAIJKokkosSyncDevice(A));
1015     const auto  aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1016     const auto &Aa     = aijkok->a_dual.view_device();
1017     const auto &Adiag  = aijkok->diag_dual.view_device();
1018     PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) { Aa(Adiag(i)) += a; }));
1019     PetscCall(MatSeqAIJKokkosModifyDevice(A));
1020     PetscCall(PetscLogGpuFlops(n));
1021     PetscCall(PetscLogGpuTimeEnd());
1022   } else { // need reassembly, very slow!
1023     PetscCall(MatShift_Basic(A, a));
1024   }
1025   PetscFunctionReturn(PETSC_SUCCESS);
1026 }
1027 
1028 static PetscErrorCode MatDiagonalSet_SeqAIJKokkos(Mat Y, Vec D, InsertMode is)
1029 {
1030   Mat_SeqAIJ *aijseq = static_cast<Mat_SeqAIJ *>(Y->data);
1031 
1032   PetscFunctionBegin;
1033   if (Y->assembled && aijseq->diagonaldense) { // no missing diagonals
1034     ConstPetscScalarKokkosView dv;
1035     PetscInt                   n, nv;
1036 
1037     PetscCall(PetscLogGpuTimeBegin());
1038     PetscCall(MatSeqAIJKokkosSyncDevice(Y));
1039     PetscCall(VecGetKokkosView(D, &dv));
1040     PetscCall(VecGetLocalSize(D, &nv));
1041     n = PetscMin(Y->rmap->n, Y->cmap->n);
1042     PetscCheck(n == nv, PetscObjectComm((PetscObject)Y), PETSC_ERR_ARG_SIZ, "Matrix size and vector size do not match");
1043 
1044     const auto  aijkok = static_cast<Mat_SeqAIJKokkos *>(Y->spptr);
1045     const auto &Aa     = aijkok->a_dual.view_device();
1046     const auto &Adiag  = aijkok->diag_dual.view_device();
1047     PetscCallCXX(Kokkos::parallel_for(
1048       Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) {
1049         if (is == INSERT_VALUES) Aa(Adiag(i)) = dv(i);
1050         else Aa(Adiag(i)) += dv(i);
1051       }));
1052     PetscCall(VecRestoreKokkosView(D, &dv));
1053     PetscCall(MatSeqAIJKokkosModifyDevice(Y));
1054     PetscCall(PetscLogGpuFlops(n));
1055     PetscCall(PetscLogGpuTimeEnd());
1056   } else { // need reassembly, very slow!
1057     PetscCall(MatDiagonalSet_Default(Y, D, is));
1058   }
1059   PetscFunctionReturn(PETSC_SUCCESS);
1060 }
1061 
1062 static PetscErrorCode MatDiagonalScale_SeqAIJKokkos(Mat A, Vec ll, Vec rr)
1063 {
1064   Mat_SeqAIJ                *aijseq = static_cast<Mat_SeqAIJ *>(A->data);
1065   PetscInt                   m = A->rmap->n, n = A->cmap->n, nz = aijseq->nz;
1066   ConstPetscScalarKokkosView lv, rv;
1067 
1068   PetscFunctionBegin;
1069   PetscCall(PetscLogGpuTimeBegin());
1070   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1071   const auto  aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1072   const auto &Aa     = aijkok->a_dual.view_device();
1073   const auto &Ai     = aijkok->i_dual.view_device();
1074   const auto &Aj     = aijkok->j_dual.view_device();
1075   if (ll) {
1076     PetscCall(VecGetLocalSize(ll, &m));
1077     PetscCheck(m == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling vector wrong length");
1078     PetscCall(VecGetKokkosView(ll, &lv));
1079     PetscCallCXX(Kokkos::parallel_for( // for each row
1080       Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
1081         PetscInt i   = t.league_rank(); // row i
1082         PetscInt len = Ai(i + 1) - Ai(i);
1083         // scale entries on the row
1084         Kokkos::parallel_for(Kokkos::TeamThreadRange(t, len), [&](PetscInt j) { Aa(Ai(i) + j) *= lv(i); });
1085       }));
1086     PetscCall(VecRestoreKokkosView(ll, &lv));
1087     PetscCall(PetscLogGpuFlops(nz));
1088   }
1089   if (rr) {
1090     PetscCall(VecGetLocalSize(rr, &n));
1091     PetscCheck(n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Right scaling vector wrong length");
1092     PetscCall(VecGetKokkosView(rr, &rv));
1093     PetscCallCXX(Kokkos::parallel_for( // for each nonzero
1094       Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt k) { Aa(k) *= rv(Aj(k)); }));
1095     PetscCall(VecRestoreKokkosView(rr, &lv));
1096     PetscCall(PetscLogGpuFlops(nz));
1097   }
1098   PetscCall(MatSeqAIJKokkosModifyDevice(A));
1099   PetscCall(PetscLogGpuTimeEnd());
1100   PetscFunctionReturn(PETSC_SUCCESS);
1101 }
1102 
1103 static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A)
1104 {
1105   Mat_SeqAIJKokkos *aijkok;
1106 
1107   PetscFunctionBegin;
1108   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1109   if (aijkok) { /* Only zero the device if data is already there */
1110     KokkosBlas::fill(PetscGetKokkosExecutionSpace(), aijkok->a_dual.view_device(), 0.0);
1111     PetscCall(MatSeqAIJKokkosModifyDevice(A));
1112   } else { /* Might be preallocated but not assembled */
1113     PetscCall(MatZeroEntries_SeqAIJ(A));
1114   }
1115   PetscFunctionReturn(PETSC_SUCCESS);
1116 }
1117 
1118 static PetscErrorCode MatGetDiagonal_SeqAIJKokkos(Mat A, Vec x)
1119 {
1120   Mat_SeqAIJKokkos     *aijkok;
1121   PetscInt              n;
1122   PetscScalarKokkosView xv;
1123 
1124   PetscFunctionBegin;
1125   PetscCall(VecGetLocalSize(x, &n));
1126   PetscCheck(n == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector");
1127   PetscCheck(A->factortype == MAT_FACTOR_NONE, PETSC_COMM_SELF, PETSC_ERR_SUP, "MatGetDiagonal_SeqAIJKokkos not supported on factored matrices");
1128 
1129   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1130   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1131 
1132   const auto &Aa    = aijkok->a_dual.view_device();
1133   const auto &Ai    = aijkok->i_dual.view_device();
1134   const auto &Adiag = aijkok->diag_dual.view_device();
1135 
1136   PetscCall(VecGetKokkosViewWrite(x, &xv));
1137   Kokkos::parallel_for(
1138     Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) {
1139       if (Adiag(i) < Ai(i + 1)) xv(i) = Aa(Adiag(i));
1140       else xv(i) = 0;
1141     });
1142   PetscCall(VecRestoreKokkosViewWrite(x, &xv));
1143   PetscFunctionReturn(PETSC_SUCCESS);
1144 }
1145 
1146 /* Get a Kokkos View from a mat of type MatSeqAIJKokkos */
1147 PetscErrorCode MatSeqAIJGetKokkosView(Mat A, ConstMatScalarKokkosView *kv)
1148 {
1149   Mat_SeqAIJKokkos *aijkok;
1150 
1151   PetscFunctionBegin;
1152   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1153   PetscAssertPointer(kv, 2);
1154   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1155   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1156   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1157   *kv    = aijkok->a_dual.view_device();
1158   PetscFunctionReturn(PETSC_SUCCESS);
1159 }
1160 
1161 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, ConstMatScalarKokkosView *kv)
1162 {
1163   PetscFunctionBegin;
1164   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1165   PetscAssertPointer(kv, 2);
1166   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1167   PetscFunctionReturn(PETSC_SUCCESS);
1168 }
1169 
1170 PetscErrorCode MatSeqAIJGetKokkosView(Mat A, MatScalarKokkosView *kv)
1171 {
1172   Mat_SeqAIJKokkos *aijkok;
1173 
1174   PetscFunctionBegin;
1175   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1176   PetscAssertPointer(kv, 2);
1177   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1178   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1179   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1180   *kv    = aijkok->a_dual.view_device();
1181   PetscFunctionReturn(PETSC_SUCCESS);
1182 }
1183 
1184 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, MatScalarKokkosView *kv)
1185 {
1186   PetscFunctionBegin;
1187   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1188   PetscAssertPointer(kv, 2);
1189   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1190   PetscCall(MatSeqAIJKokkosModifyDevice(A));
1191   PetscFunctionReturn(PETSC_SUCCESS);
1192 }
1193 
1194 PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A, MatScalarKokkosView *kv)
1195 {
1196   Mat_SeqAIJKokkos *aijkok;
1197 
1198   PetscFunctionBegin;
1199   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1200   PetscAssertPointer(kv, 2);
1201   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1202   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1203   *kv    = aijkok->a_dual.view_device();
1204   PetscFunctionReturn(PETSC_SUCCESS);
1205 }
1206 
1207 PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A, MatScalarKokkosView *kv)
1208 {
1209   PetscFunctionBegin;
1210   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1211   PetscAssertPointer(kv, 2);
1212   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1213   PetscCall(MatSeqAIJKokkosModifyDevice(A));
1214   PetscFunctionReturn(PETSC_SUCCESS);
1215 }
1216 
1217 /* Computes Y += alpha X */
1218 static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y, PetscScalar alpha, Mat X, MatStructure pattern)
1219 {
1220   Mat_SeqAIJ              *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data;
1221   Mat_SeqAIJKokkos        *xkok, *ykok, *zkok;
1222   ConstMatScalarKokkosView Xa;
1223   MatScalarKokkosView      Ya;
1224   auto                     exec = PetscGetKokkosExecutionSpace();
1225 
1226   PetscFunctionBegin;
1227   PetscCheckTypeName(Y, MATSEQAIJKOKKOS);
1228   PetscCheckTypeName(X, MATSEQAIJKOKKOS);
1229   PetscCall(MatSeqAIJKokkosSyncDevice(Y));
1230   PetscCall(MatSeqAIJKokkosSyncDevice(X));
1231   PetscCall(PetscLogGpuTimeBegin());
1232 
1233   if (pattern != SAME_NONZERO_PATTERN && x->nz == y->nz) {
1234     PetscBool e;
1235     PetscCall(PetscArraycmp(x->i, y->i, Y->rmap->n + 1, &e));
1236     if (e) {
1237       PetscCall(PetscArraycmp(x->j, y->j, y->nz, &e));
1238       if (e) pattern = SAME_NONZERO_PATTERN;
1239     }
1240   }
1241 
1242   /* cusparseDcsrgeam2() computes C = alpha A + beta B. If one knew sparsity pattern of C, one can skip
1243     cusparseScsrgeam2_bufferSizeExt() / cusparseXcsrgeam2Nnz(), and directly call cusparseScsrgeam2().
1244     If X is SUBSET_NONZERO_PATTERN of Y, we could take advantage of this cusparse feature. However,
1245     KokkosSparse::spadd(alpha,A,beta,B,C) has symbolic and numeric phases, MatAXPY does not.
1246   */
1247   ykok = static_cast<Mat_SeqAIJKokkos *>(Y->spptr);
1248   xkok = static_cast<Mat_SeqAIJKokkos *>(X->spptr);
1249   Xa   = xkok->a_dual.view_device();
1250   Ya   = ykok->a_dual.view_device();
1251 
1252   if (pattern == SAME_NONZERO_PATTERN) {
1253     KokkosBlas::axpy(exec, alpha, Xa, Ya);
1254     PetscCall(MatSeqAIJKokkosModifyDevice(Y));
1255   } else if (pattern == SUBSET_NONZERO_PATTERN) {
1256     MatRowMapKokkosView Xi = xkok->i_dual.view_device(), Yi = ykok->i_dual.view_device();
1257     MatColIdxKokkosView Xj = xkok->j_dual.view_device(), Yj = ykok->j_dual.view_device();
1258 
1259     Kokkos::parallel_for(
1260       Kokkos::TeamPolicy<>(exec, Y->rmap->n, 1), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
1261         PetscInt i = t.league_rank(); // row i
1262         Kokkos::single(Kokkos::PerTeam(t), [=]() {
1263           // Only one thread works in a team
1264           PetscInt p, q = Yi(i);
1265           for (p = Xi(i); p < Xi(i + 1); p++) {          // For each nonzero on row i of X,
1266             while (Xj(p) != Yj(q) && q < Yi(i + 1)) q++; // find the matching nonzero on row i of Y.
1267             if (Xj(p) == Yj(q)) {                        // Found it
1268               Ya(q) += alpha * Xa(p);
1269               q++;
1270             } else {
1271             // If not found, it indicates the input is wrong (X is not a SUBSET_NONZERO_PATTERN of Y).
1272             // Just insert a NaN at the beginning of row i if it is not empty, to make the result wrong.
1273 #if PETSC_PKG_KOKKOS_VERSION_GE(3, 7, 0)
1274               if (Yi(i) != Yi(i + 1)) Ya(Yi(i)) = Kokkos::ArithTraits<PetscScalar>::nan();
1275 #else
1276               if (Yi(i) != Yi(i + 1)) Ya(Yi(i)) = Kokkos::Experimental::nan("1");
1277 #endif
1278             }
1279           }
1280         });
1281       });
1282     PetscCall(MatSeqAIJKokkosModifyDevice(Y));
1283   } else { // different nonzero patterns
1284     Mat             Z;
1285     KokkosCsrMatrix zcsr;
1286     KernelHandle    kh;
1287     kh.create_spadd_handle(true); // X, Y are sorted
1288     KokkosSparse::spadd_symbolic(&kh, xkok->csrmat, ykok->csrmat, zcsr);
1289     KokkosSparse::spadd_numeric(&kh, alpha, xkok->csrmat, (PetscScalar)1.0, ykok->csrmat, zcsr);
1290     zkok = new Mat_SeqAIJKokkos(zcsr);
1291     PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, zkok, &Z));
1292     PetscCall(MatHeaderReplace(Y, &Z));
1293     kh.destroy_spadd_handle();
1294   }
1295   PetscCall(PetscLogGpuTimeEnd());
1296   PetscCall(PetscLogGpuFlops(xkok->a_dual.extent(0) * 2)); // Because we scaled X and then added it to Y
1297   PetscFunctionReturn(PETSC_SUCCESS);
1298 }
1299 
1300 struct MatCOOStruct_SeqAIJKokkos {
1301   PetscCount           n;
1302   PetscCount           Atot;
1303   PetscInt             nz;
1304   PetscCountKokkosView jmap;
1305   PetscCountKokkosView perm;
1306 
1307   MatCOOStruct_SeqAIJKokkos(const MatCOOStruct_SeqAIJ *coo_h)
1308   {
1309     nz   = coo_h->nz;
1310     n    = coo_h->n;
1311     Atot = coo_h->Atot;
1312     jmap = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), PetscCountKokkosViewHost(coo_h->jmap, nz + 1));
1313     perm = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), PetscCountKokkosViewHost(coo_h->perm, Atot));
1314   }
1315 };
1316 
1317 static PetscErrorCode MatCOOStructDestroy_SeqAIJKokkos(void **data)
1318 {
1319   PetscFunctionBegin;
1320   PetscCallCXX(delete static_cast<MatCOOStruct_SeqAIJKokkos *>(*data));
1321   PetscFunctionReturn(PETSC_SUCCESS);
1322 }
1323 
1324 static PetscErrorCode MatSetPreallocationCOO_SeqAIJKokkos(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
1325 {
1326   Mat_SeqAIJKokkos          *akok;
1327   Mat_SeqAIJ                *aseq;
1328   PetscContainer             container_h;
1329   MatCOOStruct_SeqAIJ       *coo_h;
1330   MatCOOStruct_SeqAIJKokkos *coo_d;
1331 
1332   PetscFunctionBegin;
1333   PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, coo_i, coo_j));
1334   aseq = static_cast<Mat_SeqAIJ *>(mat->data);
1335   akok = static_cast<Mat_SeqAIJKokkos *>(mat->spptr);
1336   delete akok;
1337   mat->spptr = akok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, aseq, mat->nonzerostate + 1, PETSC_FALSE);
1338   PetscCall(MatZeroEntries_SeqAIJKokkos(mat));
1339 
1340   // Copy the COO struct to device
1341   PetscCall(PetscObjectQuery((PetscObject)mat, "__PETSc_MatCOOStruct_Host", (PetscObject *)&container_h));
1342   PetscCall(PetscContainerGetPointer(container_h, (void **)&coo_h));
1343   PetscCallCXX(coo_d = new MatCOOStruct_SeqAIJKokkos(coo_h));
1344 
1345   // Put the COO struct in a container and then attach that to the matrix
1346   PetscCall(PetscObjectContainerCompose((PetscObject)mat, "__PETSc_MatCOOStruct_Device", coo_d, MatCOOStructDestroy_SeqAIJKokkos));
1347   PetscFunctionReturn(PETSC_SUCCESS);
1348 }
1349 
1350 static PetscErrorCode MatSetValuesCOO_SeqAIJKokkos(Mat A, const PetscScalar v[], InsertMode imode)
1351 {
1352   MatScalarKokkosView        Aa;
1353   ConstMatScalarKokkosView   kv;
1354   PetscMemType               memtype;
1355   PetscContainer             container;
1356   MatCOOStruct_SeqAIJKokkos *coo;
1357 
1358   PetscFunctionBegin;
1359   PetscCall(PetscObjectQuery((PetscObject)A, "__PETSc_MatCOOStruct_Device", (PetscObject *)&container));
1360   PetscCall(PetscContainerGetPointer(container, (void **)&coo));
1361 
1362   const auto &n    = coo->n;
1363   const auto &Annz = coo->nz;
1364   const auto &jmap = coo->jmap;
1365   const auto &perm = coo->perm;
1366 
1367   PetscCall(PetscGetMemType(v, &memtype));
1368   if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */
1369     kv = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), ConstMatScalarKokkosViewHost(v, n));
1370   } else {
1371     kv = ConstMatScalarKokkosView(v, n); /* Directly use v[]'s memory */
1372   }
1373 
1374   if (imode == INSERT_VALUES) PetscCall(MatSeqAIJGetKokkosViewWrite(A, &Aa)); /* write matrix values */
1375   else PetscCall(MatSeqAIJGetKokkosView(A, &Aa));                             /* read & write matrix values */
1376 
1377   PetscCall(PetscLogGpuTimeBegin());
1378   Kokkos::parallel_for(
1379     Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, Annz), KOKKOS_LAMBDA(const PetscCount i) {
1380       PetscScalar sum = 0.0;
1381       for (PetscCount k = jmap(i); k < jmap(i + 1); k++) sum += kv(perm(k));
1382       Aa(i) = (imode == INSERT_VALUES ? 0.0 : Aa(i)) + sum;
1383     });
1384   PetscCall(PetscLogGpuTimeEnd());
1385 
1386   if (imode == INSERT_VALUES) PetscCall(MatSeqAIJRestoreKokkosViewWrite(A, &Aa));
1387   else PetscCall(MatSeqAIJRestoreKokkosView(A, &Aa));
1388   PetscFunctionReturn(PETSC_SUCCESS);
1389 }
1390 
1391 static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info)
1392 {
1393   PetscFunctionBegin;
1394   PetscCall(MatSeqAIJKokkosSyncHost(A));
1395   PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info));
1396   B->offloadmask = PETSC_OFFLOAD_CPU;
1397   PetscFunctionReturn(PETSC_SUCCESS);
1398 }
1399 
1400 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A)
1401 {
1402   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
1403 
1404   PetscFunctionBegin;
1405   A->offloadmask = PETSC_OFFLOAD_KOKKOS; /* We do not really use this flag */
1406   A->boundtocpu  = PETSC_FALSE;
1407 
1408   A->ops->assemblyend               = MatAssemblyEnd_SeqAIJKokkos;
1409   A->ops->destroy                   = MatDestroy_SeqAIJKokkos;
1410   A->ops->duplicate                 = MatDuplicate_SeqAIJKokkos;
1411   A->ops->axpy                      = MatAXPY_SeqAIJKokkos;
1412   A->ops->scale                     = MatScale_SeqAIJKokkos;
1413   A->ops->zeroentries               = MatZeroEntries_SeqAIJKokkos;
1414   A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJKokkos;
1415   A->ops->mult                      = MatMult_SeqAIJKokkos;
1416   A->ops->multadd                   = MatMultAdd_SeqAIJKokkos;
1417   A->ops->multtranspose             = MatMultTranspose_SeqAIJKokkos;
1418   A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJKokkos;
1419   A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJKokkos;
1420   A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos;
1421   A->ops->productnumeric            = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos;
1422   A->ops->transpose                 = MatTranspose_SeqAIJKokkos;
1423   A->ops->setoption                 = MatSetOption_SeqAIJKokkos;
1424   A->ops->getdiagonal               = MatGetDiagonal_SeqAIJKokkos;
1425   A->ops->shift                     = MatShift_SeqAIJKokkos;
1426   A->ops->diagonalset               = MatDiagonalSet_SeqAIJKokkos;
1427   A->ops->diagonalscale             = MatDiagonalScale_SeqAIJKokkos;
1428   a->ops->getarray                  = MatSeqAIJGetArray_SeqAIJKokkos;
1429   a->ops->restorearray              = MatSeqAIJRestoreArray_SeqAIJKokkos;
1430   a->ops->getarrayread              = MatSeqAIJGetArrayRead_SeqAIJKokkos;
1431   a->ops->restorearrayread          = MatSeqAIJRestoreArrayRead_SeqAIJKokkos;
1432   a->ops->getarraywrite             = MatSeqAIJGetArrayWrite_SeqAIJKokkos;
1433   a->ops->restorearraywrite         = MatSeqAIJRestoreArrayWrite_SeqAIJKokkos;
1434   a->ops->getcsrandmemtype          = MatSeqAIJGetCSRAndMemType_SeqAIJKokkos;
1435 
1436   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJKokkos));
1437   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJKokkos));
1438 #if defined(PETSC_HAVE_HYPRE)
1439   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijkokkos_hypre_C", MatConvert_AIJ_HYPRE));
1440 #endif
1441   PetscFunctionReturn(PETSC_SUCCESS);
1442 }
1443 
1444 /*
1445    Extract the (prescribled) diagonal blocks of the matrix and then invert them
1446 
1447   Input Parameters:
1448 +  A       - the MATSEQAIJKOKKOS matrix
1449 .  bs      - block sizes in 'csr' format, i.e., the i-th block has size bs(i+1) - bs(i)
1450 .  bs2     - square of block sizes in 'csr' format, i.e., the i-th block should be stored at offset bs2(i) in diagVal[]
1451 .  blkMap  - map row ids to block ids, i.e., row i belongs to the block blkMap(i)
1452 -  work    - a pre-allocated work buffer (as big as diagVal) for use by this routine
1453 
1454   Output Parameter:
1455 .  diagVal - the (pre-allocated) buffer to store the inverted blocks (each block is stored in column-major order)
1456 */
1457 PETSC_INTERN PetscErrorCode MatInvertVariableBlockDiagonal_SeqAIJKokkos(Mat A, const PetscIntKokkosView &bs, const PetscIntKokkosView &bs2, const PetscIntKokkosView &blkMap, PetscScalarKokkosView &work, PetscScalarKokkosView &diagVal)
1458 {
1459   Mat_SeqAIJKokkos *akok    = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1460   PetscInt          nblocks = bs.extent(0) - 1;
1461 
1462   PetscFunctionBegin;
1463   PetscCall(MatSeqAIJKokkosSyncDevice(A)); // Since we'll access A's value on device
1464 
1465   // Pull out the diagonal blocks of the matrix and then invert the blocks
1466   auto Aa    = akok->a_dual.view_device();
1467   auto Ai    = akok->i_dual.view_device();
1468   auto Aj    = akok->j_dual.view_device();
1469   auto Adiag = akok->diag_dual.view_device();
1470   // TODO: how to tune the team size?
1471 #if defined(KOKKOS_ENABLE_UNIFIED_MEMORY)
1472   auto ts = Kokkos::AUTO();
1473 #else
1474   auto ts = 16; // improved performance 30% over Kokkos::AUTO() with CUDA, but failed with "Kokkos::abort: Requested Team Size is too large!" on CPUs
1475 #endif
1476   PetscCallCXX(Kokkos::parallel_for(
1477     Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), nblocks, ts), KOKKOS_LAMBDA(const KokkosTeamMemberType &teamMember) {
1478       const PetscInt bid    = teamMember.league_rank();                                                   // block id
1479       const PetscInt rstart = bs(bid);                                                                    // this block starts from this row
1480       const PetscInt m      = bs(bid + 1) - bs(bid);                                                      // size of this block
1481       const auto    &B      = Kokkos::View<PetscScalar **, Kokkos::LayoutLeft>(&diagVal(bs2(bid)), m, m); // column-major order
1482       const auto    &W      = PetscScalarKokkosView(&work(bs2(bid)), m * m);
1483 
1484       Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, m), [=](const PetscInt &r) { // r-th row in B
1485         PetscInt i = rstart + r;                                                            // i-th row in A
1486 
1487         if (Ai(i) <= Adiag(i) && Adiag(i) < Ai(i + 1)) { // if the diagonal exists (common case)
1488           PetscInt first = Adiag(i) - r;                 // we start to check nonzeros from here along this row
1489 
1490           for (PetscInt c = 0; c < m; c++) {                   // walk n steps to see what column indices we will meet
1491             if (first + c < Ai(i) || first + c >= Ai(i + 1)) { // this entry (first+c) is out of range of this row, in other words, its value is zero
1492               B(r, c) = 0.0;
1493             } else if (Aj(first + c) == rstart + c) { // this entry is right on the (rstart+c) column
1494               B(r, c) = Aa(first + c);
1495             } else { // this entry does not show up in the CSR
1496               B(r, c) = 0.0;
1497             }
1498           }
1499         } else { // rare case that the diagonal does not exist
1500           const PetscInt begin = Ai(i);
1501           const PetscInt end   = Ai(i + 1);
1502           for (PetscInt c = 0; c < m; c++) B(r, c) = 0.0;
1503           for (PetscInt j = begin; j < end; j++) { // scan the whole row; could use binary search but this is a rare case so we did not.
1504             if (rstart <= Aj(j) && Aj(j) < rstart + m) B(r, Aj(j) - rstart) = Aa(j);
1505             else if (Aj(j) >= rstart + m) break;
1506           }
1507         }
1508       });
1509 
1510       // LU-decompose B (w/o pivoting) and then invert B
1511       KokkosBatched::TeamLU<KokkosTeamMemberType, KokkosBatched::Algo::LU::Unblocked>::invoke(teamMember, B, 0.0);
1512       KokkosBatched::TeamInverseLU<KokkosTeamMemberType, KokkosBatched::Algo::InverseLU::Unblocked>::invoke(teamMember, B, W);
1513     }));
1514   // PetscLogGpuFlops() is done in the caller PCSetUp_VPBJacobi_Kokkos as we don't want to compute the flops in kernels
1515   PetscFunctionReturn(PETSC_SUCCESS);
1516 }
1517 
1518 PETSC_INTERN PetscErrorCode MatSetSeqAIJKokkosWithCSRMatrix(Mat A, Mat_SeqAIJKokkos *akok)
1519 {
1520   Mat_SeqAIJ *aseq;
1521   PetscInt    i, m, n;
1522   auto        exec = PetscGetKokkosExecutionSpace();
1523 
1524   PetscFunctionBegin;
1525   PetscCheck(!A->spptr, PETSC_COMM_SELF, PETSC_ERR_PLIB, "A->spptr is supposed to be empty");
1526 
1527   m = akok->nrows();
1528   n = akok->ncols();
1529   PetscCall(MatSetSizes(A, m, n, m, n));
1530   PetscCall(MatSetType(A, MATSEQAIJKOKKOS));
1531 
1532   /* Set up data structures of A as a MATSEQAIJ */
1533   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(A, MAT_SKIP_ALLOCATION, NULL));
1534   aseq = (Mat_SeqAIJ *)A->data;
1535 
1536   PetscCallCXX(akok->i_dual.sync_host(exec)); /* We always need sync'ed i, j on host */
1537   PetscCallCXX(akok->j_dual.sync_host(exec));
1538   PetscCallCXX(exec.fence());
1539 
1540   aseq->i       = akok->i_host_data();
1541   aseq->j       = akok->j_host_data();
1542   aseq->a       = akok->a_host_data();
1543   aseq->nonew   = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/
1544   aseq->free_a  = PETSC_FALSE;
1545   aseq->free_ij = PETSC_FALSE;
1546   aseq->nz      = akok->nnz();
1547   aseq->maxnz   = aseq->nz;
1548 
1549   PetscCall(PetscMalloc1(m, &aseq->imax));
1550   PetscCall(PetscMalloc1(m, &aseq->ilen));
1551   for (i = 0; i < m; i++) aseq->ilen[i] = aseq->imax[i] = aseq->i[i + 1] - aseq->i[i];
1552 
1553   /* It is critical to set the nonzerostate, as we use it to check if sparsity pattern (hence data) has changed on host in MatAssemblyEnd */
1554   akok->nonzerostate = A->nonzerostate;
1555   A->spptr           = akok; /* Set A->spptr before MatAssembly so that A->spptr won't be allocated again there */
1556   PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY));
1557   PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY));
1558   PetscFunctionReturn(PETSC_SUCCESS);
1559 }
1560 
1561 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGetKokkosCsrMatrix(Mat A, KokkosCsrMatrix *csr)
1562 {
1563   PetscFunctionBegin;
1564   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1565   *csr = static_cast<Mat_SeqAIJKokkos *>(A->spptr)->csrmat;
1566   PetscFunctionReturn(PETSC_SUCCESS);
1567 }
1568 
1569 PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithKokkosCsrMatrix(MPI_Comm comm, KokkosCsrMatrix csr, Mat *A)
1570 {
1571   Mat_SeqAIJKokkos *akok;
1572 
1573   PetscFunctionBegin;
1574   PetscCallCXX(akok = new Mat_SeqAIJKokkos(csr));
1575   PetscCall(MatCreate(comm, A));
1576   PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok));
1577   PetscFunctionReturn(PETSC_SUCCESS);
1578 }
1579 
1580 /* Crete a SEQAIJKOKKOS matrix with a Mat_SeqAIJKokkos data structure
1581 
1582    Note we have names like MatSeqAIJSetPreallocationCSR, so I use capitalized CSR
1583  */
1584 PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithCSRMatrix(MPI_Comm comm, Mat_SeqAIJKokkos *akok, Mat *A)
1585 {
1586   PetscFunctionBegin;
1587   PetscCall(MatCreate(comm, A));
1588   PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok));
1589   PetscFunctionReturn(PETSC_SUCCESS);
1590 }
1591 
1592 /*@C
1593   MatCreateSeqAIJKokkos - Creates a sparse matrix in `MATSEQAIJKOKKOS` (compressed row) format
1594   (the default parallel PETSc format). This matrix will ultimately be handled by
1595   Kokkos for calculations.
1596 
1597   Collective
1598 
1599   Input Parameters:
1600 + comm - MPI communicator, set to `PETSC_COMM_SELF`
1601 . m    - number of rows
1602 . n    - number of columns
1603 . nz   - number of nonzeros per row (same for all rows), ignored if `nnz` is provided
1604 - nnz  - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL`
1605 
1606   Output Parameter:
1607 . A - the matrix
1608 
1609   Level: intermediate
1610 
1611   Notes:
1612   It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
1613   MatXXXXSetPreallocation() paradgm instead of this routine directly.
1614   [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]
1615 
1616   The AIJ format, also called
1617   compressed row storage, is fully compatible with standard Fortran
1618   storage.  That is, the stored row and column indices can begin at
1619   either one (as in Fortran) or zero.
1620 
1621   Specify the preallocated storage with either `nz` or `nnz` (not both).
1622   Set `nz` = `PETSC_DEFAULT` and `nnz` = `NULL` for PETSc to control dynamic memory
1623   allocation.
1624 
1625 .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`
1626 @*/
1627 PetscErrorCode MatCreateSeqAIJKokkos(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A)
1628 {
1629   PetscFunctionBegin;
1630   PetscCall(PetscKokkosInitializeCheck());
1631   PetscCall(MatCreate(comm, A));
1632   PetscCall(MatSetSizes(*A, m, n, m, n));
1633   PetscCall(MatSetType(*A, MATSEQAIJKOKKOS));
1634   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz));
1635   PetscFunctionReturn(PETSC_SUCCESS);
1636 }
1637 
1638 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
1639 {
1640   PetscFunctionBegin;
1641   PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
1642   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos;
1643   PetscFunctionReturn(PETSC_SUCCESS);
1644 }
1645 
1646 static PetscErrorCode MatSeqAIJKokkosSymbolicSolveCheck(Mat A)
1647 {
1648   Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1649 
1650   PetscFunctionBegin;
1651   if (!factors->sptrsv_symbolic_completed) {
1652     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khU, factors->iU_d, factors->jU_d, factors->aU_d);
1653     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khL, factors->iL_d, factors->jL_d, factors->aL_d);
1654     factors->sptrsv_symbolic_completed = PETSC_TRUE;
1655   }
1656   PetscFunctionReturn(PETSC_SUCCESS);
1657 }
1658 
1659 /* Check if we need to update factors etc for transpose solve */
1660 static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A)
1661 {
1662   Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1663   MatColIdxType               n       = A->rmap->n;
1664 
1665   PetscFunctionBegin;
1666   if (!factors->transpose_updated) { /* TODO: KK needs to provide functions to do numeric transpose only */
1667     /* Update L^T and do sptrsv symbolic */
1668     factors->iLt_d = MatRowMapKokkosView("factors->iLt_d", n + 1); // KK requires 0
1669     factors->jLt_d = MatColIdxKokkosView(NoInit("factors->jLt_d"), factors->jL_d.extent(0));
1670     factors->aLt_d = MatScalarKokkosView(NoInit("factors->aLt_d"), factors->aL_d.extent(0));
1671 
1672     transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iL_d, factors->jL_d, factors->aL_d,
1673                                                                                                                                                                                                               factors->iLt_d, factors->jLt_d, factors->aLt_d);
1674 
1675     /* TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices.
1676       We have to sort the indices, until KK provides finer control options.
1677     */
1678     sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iLt_d, factors->jLt_d, factors->aLt_d);
1679 
1680     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d);
1681 
1682     /* Update U^T and do sptrsv symbolic */
1683     factors->iUt_d = MatRowMapKokkosView("factors->iUt_d", n + 1); // KK requires 0
1684     factors->jUt_d = MatColIdxKokkosView(NoInit("factors->jUt_d"), factors->jU_d.extent(0));
1685     factors->aUt_d = MatScalarKokkosView(NoInit("factors->aUt_d"), factors->aU_d.extent(0));
1686 
1687     transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iU_d, factors->jU_d, factors->aU_d,
1688                                                                                                                                                                                                               factors->iUt_d, factors->jUt_d, factors->aUt_d);
1689 
1690     /* Sort indices. See comments above */
1691     sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iUt_d, factors->jUt_d, factors->aUt_d);
1692 
1693     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d);
1694     factors->transpose_updated = PETSC_TRUE;
1695   }
1696   PetscFunctionReturn(PETSC_SUCCESS);
1697 }
1698 
1699 /* Solve Ax = b, with A = LU */
1700 static PetscErrorCode MatSolve_SeqAIJKokkos(Mat A, Vec b, Vec x)
1701 {
1702   ConstPetscScalarKokkosView  bv;
1703   PetscScalarKokkosView       xv;
1704   Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1705 
1706   PetscFunctionBegin;
1707   PetscCall(PetscLogGpuTimeBegin());
1708   PetscCall(MatSeqAIJKokkosSymbolicSolveCheck(A));
1709   PetscCall(VecGetKokkosView(b, &bv));
1710   PetscCall(VecGetKokkosViewWrite(x, &xv));
1711   /* Solve L tmpv = b */
1712   PetscCallCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khL, factors->iL_d, factors->jL_d, factors->aL_d, bv, factors->workVector));
1713   /* Solve Ux = tmpv */
1714   PetscCallCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khU, factors->iU_d, factors->jU_d, factors->aU_d, factors->workVector, xv));
1715   PetscCall(VecRestoreKokkosView(b, &bv));
1716   PetscCall(VecRestoreKokkosViewWrite(x, &xv));
1717   PetscCall(PetscLogGpuTimeEnd());
1718   PetscFunctionReturn(PETSC_SUCCESS);
1719 }
1720 
1721 /* Solve A^T x = b, where A^T = U^T L^T */
1722 static PetscErrorCode MatSolveTranspose_SeqAIJKokkos(Mat A, Vec b, Vec x)
1723 {
1724   ConstPetscScalarKokkosView  bv;
1725   PetscScalarKokkosView       xv;
1726   Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1727 
1728   PetscFunctionBegin;
1729   PetscCall(PetscLogGpuTimeBegin());
1730   PetscCall(MatSeqAIJKokkosTransposeSolveCheck(A));
1731   PetscCall(VecGetKokkosView(b, &bv));
1732   PetscCall(VecGetKokkosViewWrite(x, &xv));
1733   /* Solve U^T tmpv = b */
1734   KokkosSparse::Experimental::sptrsv_solve(&factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d, bv, factors->workVector);
1735 
1736   /* Solve L^T x = tmpv */
1737   KokkosSparse::Experimental::sptrsv_solve(&factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d, factors->workVector, xv);
1738   PetscCall(VecRestoreKokkosView(b, &bv));
1739   PetscCall(VecRestoreKokkosViewWrite(x, &xv));
1740   PetscCall(PetscLogGpuTimeEnd());
1741   PetscFunctionReturn(PETSC_SUCCESS);
1742 }
1743 
1744 static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info)
1745 {
1746   Mat_SeqAIJKokkos           *aijkok   = (Mat_SeqAIJKokkos *)A->spptr;
1747   Mat_SeqAIJKokkosTriFactors *factors  = (Mat_SeqAIJKokkosTriFactors *)B->spptr;
1748   PetscInt                    fill_lev = info->levels;
1749 
1750   PetscFunctionBegin;
1751   PetscCall(PetscLogGpuTimeBegin());
1752   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1753 
1754   auto a_d = aijkok->a_dual.view_device();
1755   auto i_d = aijkok->i_dual.view_device();
1756   auto j_d = aijkok->j_dual.view_device();
1757 
1758   KokkosSparse::Experimental::spiluk_numeric(&factors->kh, fill_lev, i_d, j_d, a_d, factors->iL_d, factors->jL_d, factors->aL_d, factors->iU_d, factors->jU_d, factors->aU_d);
1759 
1760   B->assembled              = PETSC_TRUE;
1761   B->preallocated           = PETSC_TRUE;
1762   B->ops->solve             = MatSolve_SeqAIJKokkos;
1763   B->ops->solvetranspose    = MatSolveTranspose_SeqAIJKokkos;
1764   B->ops->matsolve          = NULL;
1765   B->ops->matsolvetranspose = NULL;
1766   B->offloadmask            = PETSC_OFFLOAD_GPU;
1767 
1768   /* Once the factors' value changed, we need to update their transpose and sptrsv handle */
1769   factors->transpose_updated         = PETSC_FALSE;
1770   factors->sptrsv_symbolic_completed = PETSC_FALSE;
1771   /* TODO: log flops, but how to know that? */
1772   PetscCall(PetscLogGpuTimeEnd());
1773   PetscFunctionReturn(PETSC_SUCCESS);
1774 }
1775 
1776 static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
1777 {
1778   Mat_SeqAIJKokkos           *aijkok;
1779   Mat_SeqAIJ                 *b;
1780   Mat_SeqAIJKokkosTriFactors *factors  = (Mat_SeqAIJKokkosTriFactors *)B->spptr;
1781   PetscInt                    fill_lev = info->levels;
1782   PetscInt                    nnzA     = ((Mat_SeqAIJ *)A->data)->nz, nnzL, nnzU;
1783   PetscInt                    n        = A->rmap->n;
1784 
1785   PetscFunctionBegin;
1786   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1787   /* Rebuild factors */
1788   if (factors) {
1789     factors->Destroy();
1790   } /* Destroy the old if it exists */
1791   else {
1792     B->spptr = factors = new Mat_SeqAIJKokkosTriFactors(n);
1793   }
1794 
1795   /* Create a spiluk handle and then do symbolic factorization */
1796   nnzL = nnzU = PetscRealIntMultTruncate(info->fill, nnzA);
1797   factors->kh.create_spiluk_handle(KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1, n, nnzL, nnzU);
1798 
1799   auto spiluk_handle = factors->kh.get_spiluk_handle();
1800 
1801   Kokkos::realloc(factors->iL_d, n + 1); /* Free old arrays and realloc */
1802   Kokkos::realloc(factors->jL_d, spiluk_handle->get_nnzL());
1803   Kokkos::realloc(factors->iU_d, n + 1);
1804   Kokkos::realloc(factors->jU_d, spiluk_handle->get_nnzU());
1805 
1806   aijkok   = (Mat_SeqAIJKokkos *)A->spptr;
1807   auto i_d = aijkok->i_dual.view_device();
1808   auto j_d = aijkok->j_dual.view_device();
1809   KokkosSparse::Experimental::spiluk_symbolic(&factors->kh, fill_lev, i_d, j_d, factors->iL_d, factors->jL_d, factors->iU_d, factors->jU_d);
1810   /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */
1811 
1812   Kokkos::resize(factors->jL_d, spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */
1813   Kokkos::resize(factors->jU_d, spiluk_handle->get_nnzU());
1814   Kokkos::realloc(factors->aL_d, spiluk_handle->get_nnzL()); /* No need to retain old value */
1815   Kokkos::realloc(factors->aU_d, spiluk_handle->get_nnzU());
1816 
1817   /* TODO: add options to select sptrsv algorithms */
1818   /* Create sptrsv handles for L, U and their transpose */
1819 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
1820   auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SPTRSV_CUSPARSE;
1821 #else
1822   auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1;
1823 #endif
1824 
1825   factors->khL.create_sptrsv_handle(sptrsv_alg, n, true /* L is lower tri */);
1826   factors->khU.create_sptrsv_handle(sptrsv_alg, n, false /* U is not lower tri */);
1827   factors->khLt.create_sptrsv_handle(sptrsv_alg, n, false /* L^T is not lower tri */);
1828   factors->khUt.create_sptrsv_handle(sptrsv_alg, n, true /* U^T is lower tri */);
1829 
1830   /* Fill fields of the factor matrix B */
1831   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(B, MAT_SKIP_ALLOCATION, NULL));
1832   b     = (Mat_SeqAIJ *)B->data;
1833   b->nz = b->maxnz          = spiluk_handle->get_nnzL() + spiluk_handle->get_nnzU();
1834   B->info.fill_ratio_given  = info->fill;
1835   B->info.fill_ratio_needed = nnzA > 0 ? ((PetscReal)b->nz) / ((PetscReal)nnzA) : 1.0;
1836 
1837   B->offloadmask          = PETSC_OFFLOAD_GPU;
1838   B->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJKokkos;
1839   PetscFunctionReturn(PETSC_SUCCESS);
1840 }
1841 
1842 static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos(Mat A, MatSolverType *type)
1843 {
1844   PetscFunctionBegin;
1845   *type = MATSOLVERKOKKOS;
1846   PetscFunctionReturn(PETSC_SUCCESS);
1847 }
1848 
1849 /*MC
1850   MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices
1851   on a single GPU of type, `MATSEQAIJKOKKOS`, `MATAIJKOKKOS`.
1852 
1853   Level: beginner
1854 
1855 .seealso: [](ch_matrices), `Mat`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJKokkos()`, `MATAIJKOKKOS`, `MatKokkosSetFormat()`, `MatKokkosStorageFormat`, `MatKokkosFormatOperation`
1856 M*/
1857 PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A, MatFactorType ftype, Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */
1858 {
1859   PetscInt n = A->rmap->n;
1860 
1861   PetscFunctionBegin;
1862   PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B));
1863   PetscCall(MatSetSizes(*B, n, n, n, n));
1864   (*B)->factortype = ftype;
1865   PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU]));
1866   PetscCall(MatSetType(*B, MATSEQAIJKOKKOS));
1867 
1868   if (ftype == MAT_FACTOR_LU) {
1869     PetscCall(MatSetBlockSizesFromMats(*B, A, A));
1870     (*B)->canuseordering        = PETSC_TRUE;
1871     (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKokkos;
1872   } else if (ftype == MAT_FACTOR_ILU) {
1873     PetscCall(MatSetBlockSizesFromMats(*B, A, A));
1874     (*B)->canuseordering         = PETSC_FALSE;
1875     (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos;
1876   } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]);
1877 
1878   PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL));
1879   PetscCall(PetscObjectComposeFunction((PetscObject)*B, "MatFactorGetSolverType_C", MatFactorGetSolverType_SeqAIJKokkos));
1880   PetscFunctionReturn(PETSC_SUCCESS);
1881 }
1882 
1883 PETSC_INTERN PetscErrorCode MatSolverTypeRegister_KOKKOS(void)
1884 {
1885   PetscFunctionBegin;
1886   PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_LU, MatGetFactor_SeqAIJKokkos_Kokkos));
1887   PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_ILU, MatGetFactor_SeqAIJKokkos_Kokkos));
1888   PetscFunctionReturn(PETSC_SUCCESS);
1889 }
1890 
1891 /* Utility to print out a KokkosCsrMatrix for debugging */
1892 PETSC_INTERN PetscErrorCode PrintCsrMatrix(const KokkosCsrMatrix &csrmat)
1893 {
1894   const auto        &iv = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), csrmat.graph.row_map);
1895   const auto        &jv = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), csrmat.graph.entries);
1896   const auto        &av = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), csrmat.values);
1897   const PetscInt    *i  = iv.data();
1898   const PetscInt    *j  = jv.data();
1899   const PetscScalar *a  = av.data();
1900   PetscInt           m = csrmat.numRows(), n = csrmat.numCols(), nnz = csrmat.nnz();
1901 
1902   PetscFunctionBegin;
1903   PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT " x %" PetscInt_FMT " SeqAIJKokkos, with %" PetscInt_FMT " nonzeros\n", m, n, nnz));
1904   for (PetscInt k = 0; k < m; k++) {
1905     PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT ": ", k));
1906     for (PetscInt p = i[k]; p < i[k + 1]; p++) PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT "(%.1f), ", j[p], (double)PetscRealPart(a[p])));
1907     PetscCall(PetscPrintf(PETSC_COMM_SELF, "\n"));
1908   }
1909   PetscFunctionReturn(PETSC_SUCCESS);
1910 }
1911