xref: /petsc/src/mat/impls/aij/seq/kokkos/aijkok.kokkos.cxx (revision d9acb416d05abeed0a33bde3a81aeb2ea0364f6a)
1 #include <petsc_kokkos.hpp>
2 #include <petscvec_kokkos.hpp>
3 #include <petscpkg_version.h>
4 #include <petsc/private/petscimpl.h>
5 #include <petsc/private/sfimpl.h>
6 #include <petscsystypes.h>
7 #include <petscerror.h>
8 
9 #include <Kokkos_Core.hpp>
10 #include <KokkosBlas.hpp>
11 #include <KokkosSparse_CrsMatrix.hpp>
12 #include <KokkosSparse_spmv.hpp>
13 #include <KokkosSparse_spiluk.hpp>
14 #include <KokkosSparse_sptrsv.hpp>
15 #include <KokkosSparse_spgemm.hpp>
16 #include <KokkosSparse_spadd.hpp>
17 #include <KokkosBatched_LU_Decl.hpp>
18 #include <KokkosBatched_InverseLU_Decl.hpp>
19 
20 #include <../src/mat/impls/aij/seq/kokkos/aijkok.hpp>
21 
22 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_GE(3, 7, 0)
23   #include <KokkosSparse_Utils.hpp>
24 using KokkosSparse::sort_crs_matrix;
25 using KokkosSparse::Impl::transpose_matrix;
26 #else
27   #include <KokkosKernels_Sorting.hpp>
28 using KokkosKernels::sort_crs_matrix;
29 using KokkosKernels::Impl::transpose_matrix;
30 #endif
31 
32 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */
33 
34 /* MatAssemblyEnd_SeqAIJKokkos() happens when we finalized nonzeros of the matrix, either after
35    we assembled the matrix on host, or after we directly produced the matrix data on device (ex., through MatMatMult).
36    In the latter case, it is important to set a_dual's sync state correctly.
37  */
38 static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A, MatAssemblyType mode)
39 {
40   Mat_SeqAIJ       *aijseq;
41   Mat_SeqAIJKokkos *aijkok;
42 
43   PetscFunctionBegin;
44   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS);
45   PetscCall(MatAssemblyEnd_SeqAIJ(A, mode));
46 
47   aijseq = static_cast<Mat_SeqAIJ *>(A->data);
48   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
49 
50   /* If aijkok does not exist, we just copy i, j to device.
51      If aijkok already exists, but the device's nonzero pattern does not match with the host's, we assume the latest data is on host.
52      In both cases, we build a new aijkok structure.
53   */
54   if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { /* aijkok might not exist yet or nonzero pattern has changed */
55     delete aijkok;
56     aijkok   = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aijseq, A->nonzerostate, PETSC_FALSE /*don't copy mat values to device*/);
57     A->spptr = aijkok;
58   } else if (A->rmap->n && aijkok->diag_dual.extent(0) == 0) { // MatProduct might directly produce AIJ on device, but not the diag.
59     MatRowMapKokkosViewHost diag_h(aijseq->diag, A->rmap->n);
60     auto                    diag_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), diag_h);
61     aijkok->diag_dual              = MatRowMapKokkosDualView(diag_d, diag_h);
62   }
63   PetscFunctionReturn(PETSC_SUCCESS);
64 }
65 
66 /* Sync CSR data to device if not yet */
67 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A)
68 {
69   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
70 
71   PetscFunctionBegin;
72   PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Can't sync factorized matrix from host to device");
73   PetscCheck(aijkok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
74   if (aijkok->a_dual.need_sync_device()) {
75     aijkok->a_dual.sync_device();
76     aijkok->transpose_updated = PETSC_FALSE; /* values of the transpose is out-of-date */
77     aijkok->hermitian_updated = PETSC_FALSE;
78   }
79   PetscFunctionReturn(PETSC_SUCCESS);
80 }
81 
82 /* Mark the CSR data on device as modified */
83 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosModifyDevice(Mat A)
84 {
85   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
86 
87   PetscFunctionBegin;
88   PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Not supported for factorized matries");
89   aijkok->a_dual.clear_sync_state();
90   aijkok->a_dual.modify_device();
91   aijkok->transpose_updated = PETSC_FALSE;
92   aijkok->hermitian_updated = PETSC_FALSE;
93   PetscCall(MatSeqAIJInvalidateDiagonal(A));
94   PetscCall(PetscObjectStateIncrease((PetscObject)A));
95   PetscFunctionReturn(PETSC_SUCCESS);
96 }
97 
98 static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A)
99 {
100   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
101   auto             &exec   = PetscGetKokkosExecutionSpace();
102 
103   PetscFunctionBegin;
104   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
105   /* We do not expect one needs factors on host  */
106   PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Can't sync factorized matrix from device to host");
107   PetscCheck(aijkok, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing AIJKOK");
108   PetscCallCXX(aijkok->a_dual.sync_host(exec));
109   PetscCallCXX(exec.fence());
110   PetscFunctionReturn(PETSC_SUCCESS);
111 }
112 
113 static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A, PetscScalar *array[])
114 {
115   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
116 
117   PetscFunctionBegin;
118   /* aijkok contains valid pointers only if the host's nonzerostate matches with the device's.
119     Calling MatSeqAIJSetPreallocation() or MatSetValues() on host, where aijseq->{i,j,a} might be
120     reallocated, will lead to stale {i,j,a}_dual in aijkok. In both operations, the hosts's nonzerostate
121     must have been updated. The stale aijkok will be rebuilt during MatAssemblyEnd.
122   */
123   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
124     auto &exec = PetscGetKokkosExecutionSpace();
125     PetscCallCXX(aijkok->a_dual.sync_host(exec));
126     PetscCallCXX(exec.fence());
127     *array = aijkok->a_dual.view_host().data();
128   } else { /* Happens when calling MatSetValues on a newly created matrix */
129     *array = static_cast<Mat_SeqAIJ *>(A->data)->a;
130   }
131   PetscFunctionReturn(PETSC_SUCCESS);
132 }
133 
134 static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJKokkos(Mat A, PetscScalar *array[])
135 {
136   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
137 
138   PetscFunctionBegin;
139   if (aijkok && A->nonzerostate == aijkok->nonzerostate) aijkok->a_dual.modify_host();
140   PetscFunctionReturn(PETSC_SUCCESS);
141 }
142 
143 static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[])
144 {
145   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
146 
147   PetscFunctionBegin;
148   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
149     auto &exec = PetscGetKokkosExecutionSpace();
150     PetscCallCXX(aijkok->a_dual.sync_host(exec));
151     PetscCallCXX(exec.fence());
152     *array = aijkok->a_dual.view_host().data();
153   } else {
154     *array = static_cast<Mat_SeqAIJ *>(A->data)->a;
155   }
156   PetscFunctionReturn(PETSC_SUCCESS);
157 }
158 
159 static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[])
160 {
161   PetscFunctionBegin;
162   *array = NULL;
163   PetscFunctionReturn(PETSC_SUCCESS);
164 }
165 
166 static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[])
167 {
168   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
169 
170   PetscFunctionBegin;
171   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
172     *array = aijkok->a_dual.view_host().data();
173   } else { /* Ex. happens with MatZeroEntries on a preallocated but not assembled matrix */
174     *array = static_cast<Mat_SeqAIJ *>(A->data)->a;
175   }
176   PetscFunctionReturn(PETSC_SUCCESS);
177 }
178 
179 static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[])
180 {
181   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
182 
183   PetscFunctionBegin;
184   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
185     aijkok->a_dual.clear_sync_state();
186     aijkok->a_dual.modify_host();
187   }
188   PetscFunctionReturn(PETSC_SUCCESS);
189 }
190 
191 static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJKokkos(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype)
192 {
193   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
194 
195   PetscFunctionBegin;
196   PetscCheck(aijkok != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "aijkok is NULL");
197 
198   if (i) *i = aijkok->i_device_data();
199   if (j) *j = aijkok->j_device_data();
200   if (a) {
201     aijkok->a_dual.sync_device();
202     *a = aijkok->a_device_data();
203   }
204   if (mtype) *mtype = PETSC_MEMTYPE_KOKKOS;
205   PetscFunctionReturn(PETSC_SUCCESS);
206 }
207 
208 /*
209   Generate the sparsity pattern of a MatSeqAIJKokkos matrix's transpose on device.
210 
211   Input Parameter:
212 .  A       - the MATSEQAIJKOKKOS matrix
213 
214   Output Parameters:
215 +  perm_d - the permutation array on device, which connects Ta(i) = Aa(perm(i))
216 -  T_d    - the transpose on device, whose value array is allocated but not initialized
217 */
218 static PetscErrorCode MatSeqAIJKokkosGenerateTransposeStructure(Mat A, MatRowMapKokkosView &perm_d, KokkosCsrMatrix &T_d)
219 {
220   Mat_SeqAIJ             *aseq = static_cast<Mat_SeqAIJ *>(A->data);
221   PetscInt                nz = aseq->nz, m = A->rmap->N, n = A->cmap->n;
222   const PetscInt         *Ai = aseq->i, *Aj = aseq->j;
223   MatRowMapKokkosViewHost Ti_h(NoInit("Ti"), n + 1);
224   MatRowMapType          *Ti = Ti_h.data();
225   MatColIdxKokkosViewHost Tj_h(NoInit("Tj"), nz);
226   MatRowMapKokkosViewHost perm_h(NoInit("permutation"), nz);
227   PetscInt               *Tj   = Tj_h.data();
228   PetscInt               *perm = perm_h.data();
229   PetscInt               *offset;
230 
231   PetscFunctionBegin;
232   // Populate Ti
233   PetscCallCXX(Kokkos::deep_copy(Ti_h, 0));
234   Ti++;
235   for (PetscInt i = 0; i < nz; i++) Ti[Aj[i]]++;
236   Ti--;
237   for (PetscInt i = 0; i < n; i++) Ti[i + 1] += Ti[i];
238 
239   // Populate Tj and the permutation array
240   PetscCall(PetscCalloc1(n, &offset)); // offset in each T row to fill in its column indices
241   for (PetscInt i = 0; i < m; i++) {
242     for (PetscInt j = Ai[i]; j < Ai[i + 1]; j++) { // A's (i,j) is T's (j,i)
243       PetscInt r    = Aj[j];                       // row r of T
244       PetscInt disp = Ti[r] + offset[r];
245 
246       Tj[disp]   = i; // col i of T
247       perm[disp] = j;
248       offset[r]++;
249     }
250   }
251   PetscCall(PetscFree(offset));
252 
253   // Sort each row of T, along with the permutation array
254   for (PetscInt i = 0; i < n; i++) PetscCall(PetscSortIntWithArray(Ti[i + 1] - Ti[i], Tj + Ti[i], perm + Ti[i]));
255 
256   // Output perm and T on device
257   auto Ti_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), Ti_h);
258   auto Tj_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), Tj_h);
259   PetscCallCXX(T_d = KokkosCsrMatrix("csrmatT", n, m, nz, MatScalarKokkosView("Ta", nz), Ti_d, Tj_d));
260   PetscCallCXX(perm_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), perm_h));
261   PetscFunctionReturn(PETSC_SUCCESS);
262 }
263 
264 // Generate the transpose on device and cache it internally
265 // Note: KK transpose_matrix() does not have support symbolic/numeric transpose, so we do it on our own
266 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGenerateTranspose_Private(Mat A, KokkosCsrMatrix *csrmatT)
267 {
268   Mat_SeqAIJ       *aseq = static_cast<Mat_SeqAIJ *>(A->data);
269   Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
270   PetscInt          nz = aseq->nz, m = A->rmap->N, n = A->cmap->n;
271   KokkosCsrMatrix  &T = akok->csrmatT;
272 
273   PetscFunctionBegin;
274   PetscCheck(akok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
275   PetscCallCXX(akok->a_dual.sync_device()); // Sync A's values since we are going to access them on device
276 
277   const auto &Aa = akok->a_dual.view_device();
278 
279   if (A->symmetric == PETSC_BOOL3_TRUE) {
280     *csrmatT = akok->csrmat;
281   } else {
282     // See if we already have a cached transpose and its value is up to date
283     if (T.numRows() == n && T.numCols() == m) {  // this indicates csrmatT had been generated before, otherwise T has 0 rows/cols after construction
284       if (!akok->transpose_updated) {            // if the value is out of date, update the cached version
285         const auto &perm = akok->transpose_perm; // get the permutation array
286         auto       &Ta   = T.values;
287 
288         PetscCallCXX(Kokkos::parallel_for(
289           nz, KOKKOS_LAMBDA(const PetscInt i) { Ta(i) = Aa(perm(i)); }));
290       }
291     } else { // Generate T of size n x m for the first time
292       MatRowMapKokkosView perm;
293 
294       PetscCall(MatSeqAIJKokkosGenerateTransposeStructure(A, perm, T));
295       akok->transpose_perm = perm; // cache the perm in this matrix for reuse
296       PetscCallCXX(Kokkos::parallel_for(
297         nz, KOKKOS_LAMBDA(const PetscInt i) { T.values(i) = Aa(perm(i)); }));
298     }
299     akok->transpose_updated = PETSC_TRUE;
300     *csrmatT                = akok->csrmatT;
301   }
302   PetscFunctionReturn(PETSC_SUCCESS);
303 }
304 
305 // Generate the Hermitian on device and cache it internally
306 static PetscErrorCode MatSeqAIJKokkosGenerateHermitian_Private(Mat A, KokkosCsrMatrix *csrmatH)
307 {
308   Mat_SeqAIJ       *aseq = static_cast<Mat_SeqAIJ *>(A->data);
309   Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
310   PetscInt          nz = aseq->nz, m = A->rmap->N, n = A->cmap->n;
311   KokkosCsrMatrix  &T = akok->csrmatH;
312 
313   PetscFunctionBegin;
314   PetscCheck(akok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
315   PetscCallCXX(akok->a_dual.sync_device()); // Sync A's values since we are going to access them on device
316 
317   const auto &Aa = akok->a_dual.view_device();
318 
319   if (A->hermitian == PETSC_BOOL3_TRUE) {
320     *csrmatH = akok->csrmat;
321   } else {
322     // See if we already have a cached hermitian and its value is up to date
323     if (T.numRows() == n && T.numCols() == m) {  // this indicates csrmatT had been generated before, otherwise T has 0 rows/cols after construction
324       if (!akok->hermitian_updated) {            // if the value is out of date, update the cached version
325         const auto &perm = akok->transpose_perm; // get the permutation array
326         auto       &Ta   = T.values;
327 
328         PetscCallCXX(Kokkos::parallel_for(
329           nz, KOKKOS_LAMBDA(const PetscInt i) { Ta(i) = PetscConj(Aa(perm(i))); }));
330       }
331     } else { // Generate T of size n x m for the first time
332       MatRowMapKokkosView perm;
333 
334       PetscCall(MatSeqAIJKokkosGenerateTransposeStructure(A, perm, T));
335       akok->transpose_perm = perm; // cache the perm in this matrix for reuse
336       PetscCallCXX(Kokkos::parallel_for(
337         nz, KOKKOS_LAMBDA(const PetscInt i) { T.values(i) = PetscConj(Aa(perm(i))); }));
338     }
339     akok->hermitian_updated = PETSC_TRUE;
340     *csrmatH                = akok->csrmatH;
341   }
342   PetscFunctionReturn(PETSC_SUCCESS);
343 }
344 
345 /* y = A x */
346 static PetscErrorCode MatMult_SeqAIJKokkos(Mat A, Vec xx, Vec yy)
347 {
348   Mat_SeqAIJKokkos          *aijkok;
349   ConstPetscScalarKokkosView xv;
350   PetscScalarKokkosView      yv;
351 
352   PetscFunctionBegin;
353   PetscCall(PetscLogGpuTimeBegin());
354   PetscCall(MatSeqAIJKokkosSyncDevice(A));
355   PetscCall(VecGetKokkosView(xx, &xv));
356   PetscCall(VecGetKokkosViewWrite(yy, &yv));
357   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
358   PetscCallCXX(KokkosSparse::spmv("N", 1.0 /*alpha*/, aijkok->csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A x + beta y */
359   PetscCall(VecRestoreKokkosView(xx, &xv));
360   PetscCall(VecRestoreKokkosViewWrite(yy, &yv));
361   /* 2.0*nnz - numRows seems more accurate here but assumes there are no zero-rows. So a little sloppy here. */
362   PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz()));
363   PetscCall(PetscLogGpuTimeEnd());
364   PetscFunctionReturn(PETSC_SUCCESS);
365 }
366 
367 /* y = A^T x */
368 static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy)
369 {
370   Mat_SeqAIJKokkos          *aijkok;
371   const char                *mode;
372   ConstPetscScalarKokkosView xv;
373   PetscScalarKokkosView      yv;
374   KokkosCsrMatrix            csrmat;
375 
376   PetscFunctionBegin;
377   PetscCall(PetscLogGpuTimeBegin());
378   PetscCall(MatSeqAIJKokkosSyncDevice(A));
379   PetscCall(VecGetKokkosView(xx, &xv));
380   PetscCall(VecGetKokkosViewWrite(yy, &yv));
381   if (A->form_explicit_transpose) {
382     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat));
383     mode = "N";
384   } else {
385     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
386     csrmat = aijkok->csrmat;
387     mode   = "T";
388   }
389   PetscCallCXX(KokkosSparse::spmv(mode, 1.0 /*alpha*/, csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^T x + beta y */
390   PetscCall(VecRestoreKokkosView(xx, &xv));
391   PetscCall(VecRestoreKokkosViewWrite(yy, &yv));
392   PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
393   PetscCall(PetscLogGpuTimeEnd());
394   PetscFunctionReturn(PETSC_SUCCESS);
395 }
396 
397 /* y = A^H x */
398 static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy)
399 {
400   Mat_SeqAIJKokkos          *aijkok;
401   const char                *mode;
402   ConstPetscScalarKokkosView xv;
403   PetscScalarKokkosView      yv;
404   KokkosCsrMatrix            csrmat;
405 
406   PetscFunctionBegin;
407   PetscCall(PetscLogGpuTimeBegin());
408   PetscCall(MatSeqAIJKokkosSyncDevice(A));
409   PetscCall(VecGetKokkosView(xx, &xv));
410   PetscCall(VecGetKokkosViewWrite(yy, &yv));
411   if (A->form_explicit_transpose) {
412     PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat));
413     mode = "N";
414   } else {
415     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
416     csrmat = aijkok->csrmat;
417     mode   = "C";
418   }
419   PetscCallCXX(KokkosSparse::spmv(mode, 1.0 /*alpha*/, csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^H x + beta y */
420   PetscCall(VecRestoreKokkosView(xx, &xv));
421   PetscCall(VecRestoreKokkosViewWrite(yy, &yv));
422   PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
423   PetscCall(PetscLogGpuTimeEnd());
424   PetscFunctionReturn(PETSC_SUCCESS);
425 }
426 
427 /* z = A x + y */
428 static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz)
429 {
430   Mat_SeqAIJKokkos          *aijkok;
431   ConstPetscScalarKokkosView xv, yv;
432   PetscScalarKokkosView      zv;
433 
434   PetscFunctionBegin;
435   PetscCall(PetscLogGpuTimeBegin());
436   PetscCall(MatSeqAIJKokkosSyncDevice(A));
437   PetscCall(VecGetKokkosView(xx, &xv));
438   PetscCall(VecGetKokkosView(yy, &yv));
439   PetscCall(VecGetKokkosViewWrite(zz, &zv));
440   if (zz != yy) Kokkos::deep_copy(zv, yv);
441   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
442   PetscCallCXX(KokkosSparse::spmv("N", 1.0 /*alpha*/, aijkok->csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A x + beta z */
443   PetscCall(VecRestoreKokkosView(xx, &xv));
444   PetscCall(VecRestoreKokkosView(yy, &yv));
445   PetscCall(VecRestoreKokkosViewWrite(zz, &zv));
446   PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz()));
447   PetscCall(PetscLogGpuTimeEnd());
448   PetscFunctionReturn(PETSC_SUCCESS);
449 }
450 
451 /* z = A^T x + y */
452 static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz)
453 {
454   Mat_SeqAIJKokkos          *aijkok;
455   const char                *mode;
456   ConstPetscScalarKokkosView xv, yv;
457   PetscScalarKokkosView      zv;
458   KokkosCsrMatrix            csrmat;
459 
460   PetscFunctionBegin;
461   PetscCall(PetscLogGpuTimeBegin());
462   PetscCall(MatSeqAIJKokkosSyncDevice(A));
463   PetscCall(VecGetKokkosView(xx, &xv));
464   PetscCall(VecGetKokkosView(yy, &yv));
465   PetscCall(VecGetKokkosViewWrite(zz, &zv));
466   if (zz != yy) Kokkos::deep_copy(zv, yv);
467   if (A->form_explicit_transpose) {
468     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat));
469     mode = "N";
470   } else {
471     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
472     csrmat = aijkok->csrmat;
473     mode   = "T";
474   }
475   PetscCallCXX(KokkosSparse::spmv(mode, 1.0 /*alpha*/, csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^T x + beta z */
476   PetscCall(VecRestoreKokkosView(xx, &xv));
477   PetscCall(VecRestoreKokkosView(yy, &yv));
478   PetscCall(VecRestoreKokkosViewWrite(zz, &zv));
479   PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
480   PetscCall(PetscLogGpuTimeEnd());
481   PetscFunctionReturn(PETSC_SUCCESS);
482 }
483 
484 /* z = A^H x + y */
485 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz)
486 {
487   Mat_SeqAIJKokkos          *aijkok;
488   const char                *mode;
489   ConstPetscScalarKokkosView xv, yv;
490   PetscScalarKokkosView      zv;
491   KokkosCsrMatrix            csrmat;
492 
493   PetscFunctionBegin;
494   PetscCall(PetscLogGpuTimeBegin());
495   PetscCall(MatSeqAIJKokkosSyncDevice(A));
496   PetscCall(VecGetKokkosView(xx, &xv));
497   PetscCall(VecGetKokkosView(yy, &yv));
498   PetscCall(VecGetKokkosViewWrite(zz, &zv));
499   if (zz != yy) Kokkos::deep_copy(zv, yv);
500   if (A->form_explicit_transpose) {
501     PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat));
502     mode = "N";
503   } else {
504     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
505     csrmat = aijkok->csrmat;
506     mode   = "C";
507   }
508   PetscCallCXX(KokkosSparse::spmv(mode, 1.0 /*alpha*/, csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^H x + beta z */
509   PetscCall(VecRestoreKokkosView(xx, &xv));
510   PetscCall(VecRestoreKokkosView(yy, &yv));
511   PetscCall(VecRestoreKokkosViewWrite(zz, &zv));
512   PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
513   PetscCall(PetscLogGpuTimeEnd());
514   PetscFunctionReturn(PETSC_SUCCESS);
515 }
516 
517 static PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A, MatOption op, PetscBool flg)
518 {
519   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
520 
521   PetscFunctionBegin;
522   switch (op) {
523   case MAT_FORM_EXPLICIT_TRANSPOSE:
524     /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
525     if (A->form_explicit_transpose && !flg && aijkok) PetscCall(aijkok->DestroyMatTranspose());
526     A->form_explicit_transpose = flg;
527     break;
528   default:
529     PetscCall(MatSetOption_SeqAIJ(A, op, flg));
530     break;
531   }
532   PetscFunctionReturn(PETSC_SUCCESS);
533 }
534 
535 /* Depending on reuse, either build a new mat, or use the existing mat */
536 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat *newmat)
537 {
538   Mat_SeqAIJ *aseq;
539 
540   PetscFunctionBegin;
541   PetscCall(PetscKokkosInitializeCheck());
542   if (reuse == MAT_INITIAL_MATRIX) {                      /* Build a brand new mat */
543     PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat));  /* the returned newmat is a SeqAIJKokkos */
544   } else if (reuse == MAT_REUSE_MATRIX) {                 /* Reuse the mat created before */
545     PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN)); /* newmat is already a SeqAIJKokkos */
546   } else if (reuse == MAT_INPLACE_MATRIX) {               /* newmat is A */
547     PetscCheck(A == *newmat, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "A != *newmat with MAT_INPLACE_MATRIX");
548     PetscCall(PetscFree(A->defaultvectype));
549     PetscCall(PetscStrallocpy(VECKOKKOS, &A->defaultvectype)); /* Allocate and copy the string */
550     PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATSEQAIJKOKKOS));
551     PetscCall(MatSetOps_SeqAIJKokkos(A));
552     aseq = static_cast<Mat_SeqAIJ *>(A->data);
553     if (A->assembled) { /* Copy i, j (but not values) to device for an assembled matrix if not yet */
554       PetscCheck(!A->spptr, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Expect NULL (Mat_SeqAIJKokkos*)A->spptr");
555       A->spptr = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aseq, A->nonzerostate, PETSC_FALSE);
556     }
557   }
558   PetscFunctionReturn(PETSC_SUCCESS);
559 }
560 
561 /* MatDuplicate always creates a new matrix. MatDuplicate can be called either on an assembled matrix or
562    an unassembled matrix, even though MAT_COPY_VALUES is not allowed for unassembled matrix.
563  */
564 static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A, MatDuplicateOption dupOption, Mat *B)
565 {
566   Mat_SeqAIJ       *bseq;
567   Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr), *bkok;
568   Mat               mat;
569 
570   PetscFunctionBegin;
571   /* Do not copy values on host as A's latest values might be on device. We don't want to do sync blindly */
572   PetscCall(MatDuplicate_SeqAIJ(A, MAT_DO_NOT_COPY_VALUES, B));
573   mat = *B;
574   if (A->assembled) {
575     bseq = static_cast<Mat_SeqAIJ *>(mat->data);
576     bkok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, bseq, mat->nonzerostate, PETSC_FALSE);
577     bkok->a_dual.clear_sync_state(); /* Clear B's sync state as it will be decided below */
578     /* Now copy values to B if needed */
579     if (dupOption == MAT_COPY_VALUES) {
580       if (akok->a_dual.need_sync_device()) {
581         Kokkos::deep_copy(bkok->a_dual.view_host(), akok->a_dual.view_host());
582         bkok->a_dual.modify_host();
583       } else { /* If device has the latest data, we only copy data on device */
584         Kokkos::deep_copy(bkok->a_dual.view_device(), akok->a_dual.view_device());
585         bkok->a_dual.modify_device();
586       }
587     } else { /* MAT_DO_NOT_COPY_VALUES or MAT_SHARE_NONZERO_PATTERN. B's values should be zeroed */
588       /* B's values on host should be already zeroed by MatDuplicate_SeqAIJ() */
589       bkok->a_dual.modify_host();
590     }
591     mat->spptr = bkok;
592   }
593 
594   PetscCall(PetscFree(mat->defaultvectype));
595   PetscCall(PetscStrallocpy(VECKOKKOS, &mat->defaultvectype)); /* Allocate and copy the string */
596   PetscCall(PetscObjectChangeTypeName((PetscObject)mat, MATSEQAIJKOKKOS));
597   PetscCall(MatSetOps_SeqAIJKokkos(mat));
598   PetscFunctionReturn(PETSC_SUCCESS);
599 }
600 
601 static PetscErrorCode MatTranspose_SeqAIJKokkos(Mat A, MatReuse reuse, Mat *B)
602 {
603   Mat               At;
604   KokkosCsrMatrix   internT;
605   Mat_SeqAIJKokkos *atkok, *bkok;
606 
607   PetscFunctionBegin;
608   if (reuse == MAT_REUSE_MATRIX) PetscCall(MatTransposeCheckNonzeroState_Private(A, *B));
609   PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &internT)); /* Generate a transpose internally */
610   if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_INPLACE_MATRIX) {
611     /* Deep copy internT, as we want to isolate the internal transpose */
612     PetscCallCXX(atkok = new Mat_SeqAIJKokkos(KokkosCsrMatrix("csrmat", internT)));
613     PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PetscObjectComm((PetscObject)A), atkok, &At));
614     if (reuse == MAT_INITIAL_MATRIX) *B = At;
615     else PetscCall(MatHeaderReplace(A, &At)); /* Replace A with At inplace */
616   } else {                                    /* MAT_REUSE_MATRIX, just need to copy values to B on device */
617     if ((*B)->assembled) {
618       bkok = static_cast<Mat_SeqAIJKokkos *>((*B)->spptr);
619       PetscCallCXX(Kokkos::deep_copy(bkok->a_dual.view_device(), internT.values));
620       PetscCall(MatSeqAIJKokkosModifyDevice(*B));
621     } else if ((*B)->preallocated) { /* It is ok for B to be only preallocated, as needed in MatTranspose_MPIAIJ */
622       Mat_SeqAIJ             *bseq = static_cast<Mat_SeqAIJ *>((*B)->data);
623       MatScalarKokkosViewHost a_h(bseq->a, internT.nnz()); /* bseq->nz = 0 if unassembled */
624       MatColIdxKokkosViewHost j_h(bseq->j, internT.nnz());
625       PetscCallCXX(Kokkos::deep_copy(a_h, internT.values));
626       PetscCallCXX(Kokkos::deep_copy(j_h, internT.graph.entries));
627     } else SETERRQ(PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "B must be assembled or preallocated");
628   }
629   PetscFunctionReturn(PETSC_SUCCESS);
630 }
631 
632 static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A)
633 {
634   Mat_SeqAIJKokkos *aijkok;
635 
636   PetscFunctionBegin;
637   if (A->factortype == MAT_FACTOR_NONE) {
638     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
639     delete aijkok;
640   } else {
641     delete static_cast<Mat_SeqAIJKokkosTriFactors *>(A->spptr);
642   }
643   A->spptr = NULL;
644   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatFactorGetSolverType_C", NULL));
645   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
646   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
647   PetscCall(MatDestroy_SeqAIJ(A));
648   PetscFunctionReturn(PETSC_SUCCESS);
649 }
650 
651 /*MC
652    MATSEQAIJKOKKOS - MATAIJKOKKOS = "(seq)aijkokkos" - A matrix type to be used for sparse matrices with Kokkos
653 
654    A matrix type using Kokkos-Kernels CrsMatrix type for portability across different device types
655 
656    Options Database Key:
657 .  -mat_type aijkokkos - sets the matrix type to `MATSEQAIJKOKKOS` during a call to `MatSetFromOptions()`
658 
659   Level: beginner
660 
661 .seealso: [](ch_matrices), `Mat`, `MatCreateSeqAIJKokkos()`, `MATMPIAIJKOKKOS`
662 M*/
663 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A)
664 {
665   PetscFunctionBegin;
666   PetscCall(PetscKokkosInitializeCheck());
667   PetscCall(MatCreate_SeqAIJ(A));
668   PetscCall(MatConvert_SeqAIJ_SeqAIJKokkos(A, MATSEQAIJKOKKOS, MAT_INPLACE_MATRIX, &A));
669   PetscFunctionReturn(PETSC_SUCCESS);
670 }
671 
672 /* Merge A, B into a matrix C. A is put before B. C's size would be A->rmap->n by (A->cmap->n + B->cmap->n) */
673 PetscErrorCode MatSeqAIJKokkosMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C)
674 {
675   Mat_SeqAIJ         *a, *b;
676   Mat_SeqAIJKokkos   *akok, *bkok, *ckok;
677   MatScalarKokkosView aa, ba, ca;
678   MatRowMapKokkosView ai, bi, ci;
679   MatColIdxKokkosView aj, bj, cj;
680   PetscInt            m, n, nnz, aN;
681 
682   PetscFunctionBegin;
683   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
684   PetscValidHeaderSpecific(B, MAT_CLASSID, 2);
685   PetscAssertPointer(C, 4);
686   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
687   PetscCheckTypeName(B, MATSEQAIJKOKKOS);
688   PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n);
689   PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported");
690 
691   PetscCall(MatSeqAIJKokkosSyncDevice(A));
692   PetscCall(MatSeqAIJKokkosSyncDevice(B));
693   a    = static_cast<Mat_SeqAIJ *>(A->data);
694   b    = static_cast<Mat_SeqAIJ *>(B->data);
695   akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
696   bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
697   aa   = akok->a_dual.view_device();
698   ai   = akok->i_dual.view_device();
699   ba   = bkok->a_dual.view_device();
700   bi   = bkok->i_dual.view_device();
701   m    = A->rmap->n; /* M, N and nnz of C */
702   n    = A->cmap->n + B->cmap->n;
703   nnz  = a->nz + b->nz;
704   aN   = A->cmap->n; /* N of A */
705   if (reuse == MAT_INITIAL_MATRIX) {
706     aj           = akok->j_dual.view_device();
707     bj           = bkok->j_dual.view_device();
708     auto ca_dual = MatScalarKokkosDualView("a", aa.extent(0) + ba.extent(0));
709     auto ci_dual = MatRowMapKokkosDualView("i", ai.extent(0));
710     auto cj_dual = MatColIdxKokkosDualView("j", aj.extent(0) + bj.extent(0));
711     ca           = ca_dual.view_device();
712     ci           = ci_dual.view_device();
713     cj           = cj_dual.view_device();
714 
715     /* Concatenate A and B in parallel using Kokkos hierarchical parallelism */
716     Kokkos::parallel_for(
717       Kokkos::TeamPolicy<>(m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
718         PetscInt i       = t.league_rank(); /* row i */
719         PetscInt coffset = ai(i) + bi(i), alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i);
720 
721         Kokkos::single(Kokkos::PerTeam(t), [=]() { /* this side effect only happens once per whole team */
722                                                    ci(i) = coffset;
723                                                    if (i == m - 1) ci(m) = ai(m) + bi(m);
724         });
725 
726         Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) {
727           if (k < alen) {
728             ca(coffset + k) = aa(ai(i) + k);
729             cj(coffset + k) = aj(ai(i) + k);
730           } else {
731             ca(coffset + k) = ba(bi(i) + k - alen);
732             cj(coffset + k) = bj(bi(i) + k - alen) + aN; /* Entries in B get new column indices in C */
733           }
734         });
735       });
736     ca_dual.modify_device();
737     ci_dual.modify_device();
738     cj_dual.modify_device();
739     PetscCallCXX(ckok = new Mat_SeqAIJKokkos(m, n, nnz, ci_dual, cj_dual, ca_dual));
740     PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, ckok, C));
741   } else if (reuse == MAT_REUSE_MATRIX) {
742     PetscValidHeaderSpecific(*C, MAT_CLASSID, 4);
743     PetscCheckTypeName(*C, MATSEQAIJKOKKOS);
744     ckok = static_cast<Mat_SeqAIJKokkos *>((*C)->spptr);
745     ca   = ckok->a_dual.view_device();
746     ci   = ckok->i_dual.view_device();
747 
748     Kokkos::parallel_for(
749       Kokkos::TeamPolicy<>(m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
750         PetscInt i    = t.league_rank(); /* row i */
751         PetscInt alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i);
752         Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) {
753           if (k < alen) ca(ci(i) + k) = aa(ai(i) + k);
754           else ca(ci(i) + k) = ba(bi(i) + k - alen);
755         });
756       });
757     PetscCall(MatSeqAIJKokkosModifyDevice(*C));
758   }
759   PetscFunctionReturn(PETSC_SUCCESS);
760 }
761 
762 static PetscErrorCode MatProductDataDestroy_SeqAIJKokkos(void *pdata)
763 {
764   PetscFunctionBegin;
765   delete static_cast<MatProductData_SeqAIJKokkos *>(pdata);
766   PetscFunctionReturn(PETSC_SUCCESS);
767 }
768 
769 static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C)
770 {
771   Mat_Product                 *product = C->product;
772   Mat                          A, B;
773   bool                         transA, transB; /* use bool, since KK needs this type */
774   Mat_SeqAIJKokkos            *akok, *bkok, *ckok;
775   Mat_SeqAIJ                  *c;
776   MatProductData_SeqAIJKokkos *pdata;
777   KokkosCsrMatrix              csrmatA, csrmatB;
778 
779   PetscFunctionBegin;
780   MatCheckProduct(C, 1);
781   PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Product data empty");
782   pdata = static_cast<MatProductData_SeqAIJKokkos *>(C->product->data);
783 
784   // See if numeric has already been done in symbolic (e.g., user calls MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C)).
785   // If yes, skip the numeric, but reset the flag so that next time when user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C),
786   // we still do numeric.
787   if (pdata->reusesym) { // numeric reuses results from symbolic
788     pdata->reusesym = PETSC_FALSE;
789     PetscFunctionReturn(PETSC_SUCCESS);
790   }
791 
792   switch (product->type) {
793   case MATPRODUCT_AB:
794     transA = false;
795     transB = false;
796     break;
797   case MATPRODUCT_AtB:
798     transA = true;
799     transB = false;
800     break;
801   case MATPRODUCT_ABt:
802     transA = false;
803     transB = true;
804     break;
805   default:
806     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]);
807   }
808 
809   A = product->A;
810   B = product->B;
811   PetscCall(MatSeqAIJKokkosSyncDevice(A));
812   PetscCall(MatSeqAIJKokkosSyncDevice(B));
813   akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
814   bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
815   ckok = static_cast<Mat_SeqAIJKokkos *>(C->spptr);
816 
817   PetscCheck(ckok, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Device data structure spptr is empty");
818 
819   csrmatA = akok->csrmat;
820   csrmatB = bkok->csrmat;
821 
822   /* TODO: Once KK spgemm implements transpose, we can get rid of the explicit transpose here */
823   if (transA) {
824     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA));
825     transA = false;
826   }
827 
828   if (transB) {
829     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB));
830     transB = false;
831   }
832   PetscCall(PetscLogGpuTimeBegin());
833   PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, csrmatA, transA, csrmatB, transB, ckok->csrmat));
834 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(4, 0, 0)
835   auto spgemmHandle = pdata->kh.get_spgemm_handle();
836   if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(ckok->csrmat)); /* without sort, mat_tests-ex62_14_seqaijkokkos fails */
837 #endif
838 
839   PetscCall(PetscLogGpuTimeEnd());
840   PetscCall(MatSeqAIJKokkosModifyDevice(C));
841   /* shorter version of MatAssemblyEnd_SeqAIJ */
842   c = (Mat_SeqAIJ *)C->data;
843   PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz));
844   PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n"));
845   PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax));
846   c->reallocs         = 0;
847   C->info.mallocs     = 0;
848   C->info.nz_unneeded = 0;
849   C->assembled = C->was_assembled = PETSC_TRUE;
850   C->num_ass++;
851   PetscFunctionReturn(PETSC_SUCCESS);
852 }
853 
854 static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C)
855 {
856   Mat_Product                 *product = C->product;
857   MatProductType               ptype;
858   Mat                          A, B;
859   bool                         transA, transB;
860   Mat_SeqAIJKokkos            *akok, *bkok, *ckok;
861   MatProductData_SeqAIJKokkos *pdata;
862   MPI_Comm                     comm;
863   KokkosCsrMatrix              csrmatA, csrmatB, csrmatC;
864 
865   PetscFunctionBegin;
866   MatCheckProduct(C, 1);
867   PetscCall(PetscObjectGetComm((PetscObject)C, &comm));
868   PetscCheck(!product->data, comm, PETSC_ERR_PLIB, "Product data not empty");
869   A = product->A;
870   B = product->B;
871   PetscCall(MatSeqAIJKokkosSyncDevice(A));
872   PetscCall(MatSeqAIJKokkosSyncDevice(B));
873   akok    = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
874   bkok    = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
875   csrmatA = akok->csrmat;
876   csrmatB = bkok->csrmat;
877 
878   ptype = product->type;
879   // Take advantage of the symmetry if true
880   if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
881     ptype                                          = MATPRODUCT_AB;
882     product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE;
883   }
884   if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
885     ptype                                          = MATPRODUCT_AB;
886     product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE;
887   }
888 
889   switch (ptype) {
890   case MATPRODUCT_AB:
891     transA = false;
892     transB = false;
893     break;
894   case MATPRODUCT_AtB:
895     transA = true;
896     transB = false;
897     break;
898   case MATPRODUCT_ABt:
899     transA = false;
900     transB = true;
901     break;
902   default:
903     SETERRQ(comm, PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]);
904   }
905   PetscCallCXX(product->data = pdata = new MatProductData_SeqAIJKokkos());
906   pdata->reusesym = product->api_user;
907 
908   /* TODO: add command line options to select spgemm algorithms */
909   auto spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_DEFAULT; /* default alg is TPL if enabled, otherwise KK */
910 
911   /* CUDA-10.2's spgemm has bugs. We prefer the SpGEMMreuse APIs introduced in cuda-11.4 */
912 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
913   #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
914   spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_KK;
915   #endif
916 #endif
917   PetscCallCXX(pdata->kh.create_spgemm_handle(spgemm_alg));
918 
919   PetscCall(PetscLogGpuTimeBegin());
920   /* TODO: Get rid of the explicit transpose once KK-spgemm implements the transpose option */
921   if (transA) {
922     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA));
923     transA = false;
924   }
925 
926   if (transB) {
927     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB));
928     transB = false;
929   }
930 
931   PetscCallCXX(KokkosSparse::spgemm_symbolic(pdata->kh, csrmatA, transA, csrmatB, transB, csrmatC));
932   /* spgemm_symbolic() only populates C's rowmap, but not C's column indices.
933     So we have to do a fake spgemm_numeric() here to get csrmatC.j_d setup, before
934     calling new Mat_SeqAIJKokkos().
935     TODO: Remove the fake spgemm_numeric() after KK fixed this problem.
936   */
937   PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, csrmatA, transA, csrmatB, transB, csrmatC));
938 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(4, 0, 0)
939   /* Query if KK outputs a sorted matrix. If not, we need to sort it */
940   auto spgemmHandle = pdata->kh.get_spgemm_handle();
941   if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(csrmatC)); /* sort_option defaults to -1 in KK!*/
942 #endif
943   PetscCall(PetscLogGpuTimeEnd());
944 
945   PetscCallCXX(ckok = new Mat_SeqAIJKokkos(csrmatC));
946   PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(C, ckok));
947   C->product->destroy = MatProductDataDestroy_SeqAIJKokkos;
948   PetscFunctionReturn(PETSC_SUCCESS);
949 }
950 
951 /* handles sparse matrix matrix ops */
952 static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat)
953 {
954   Mat_Product *product = mat->product;
955   PetscBool    Biskok = PETSC_FALSE, Ciskok = PETSC_TRUE;
956 
957   PetscFunctionBegin;
958   MatCheckProduct(mat, 1);
959   PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJKOKKOS, &Biskok));
960   if (product->type == MATPRODUCT_ABC) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJKOKKOS, &Ciskok));
961   if (Biskok && Ciskok) {
962     switch (product->type) {
963     case MATPRODUCT_AB:
964     case MATPRODUCT_AtB:
965     case MATPRODUCT_ABt:
966       mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos;
967       break;
968     case MATPRODUCT_PtAP:
969     case MATPRODUCT_RARt:
970     case MATPRODUCT_ABC:
971       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
972       break;
973     default:
974       break;
975     }
976   } else { /* fallback for AIJ */
977     PetscCall(MatProductSetFromOptions_SeqAIJ(mat));
978   }
979   PetscFunctionReturn(PETSC_SUCCESS);
980 }
981 
982 static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a)
983 {
984   Mat_SeqAIJKokkos *aijkok;
985 
986   PetscFunctionBegin;
987   PetscCall(PetscLogGpuTimeBegin());
988   PetscCall(MatSeqAIJKokkosSyncDevice(A));
989   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
990   KokkosBlas::scal(aijkok->a_dual.view_device(), a, aijkok->a_dual.view_device());
991   PetscCall(MatSeqAIJKokkosModifyDevice(A));
992   PetscCall(PetscLogGpuFlops(aijkok->a_dual.extent(0)));
993   PetscCall(PetscLogGpuTimeEnd());
994   PetscFunctionReturn(PETSC_SUCCESS);
995 }
996 
997 // add a to A's diagonal (if A is square) or main diagonal (if A is rectangular)
998 static PetscErrorCode MatShift_SeqAIJKokkos(Mat A, PetscScalar a)
999 {
1000   Mat_SeqAIJ *aijseq = static_cast<Mat_SeqAIJ *>(A->data);
1001 
1002   PetscFunctionBegin;
1003   if (A->assembled && aijseq->diagonaldense) { // no missing diagonals
1004     PetscInt n = PetscMin(A->rmap->n, A->cmap->n);
1005 
1006     PetscCall(PetscLogGpuTimeBegin());
1007     PetscCall(MatSeqAIJKokkosSyncDevice(A));
1008     const auto  aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1009     const auto &Aa     = aijkok->a_dual.view_device();
1010     const auto &Adiag  = aijkok->diag_dual.view_device();
1011     PetscCallCXX(Kokkos::parallel_for(
1012       n, KOKKOS_LAMBDA(const PetscInt i) { Aa(Adiag(i)) += a; }));
1013     PetscCall(MatSeqAIJKokkosModifyDevice(A));
1014     PetscCall(PetscLogGpuFlops(n));
1015     PetscCall(PetscLogGpuTimeEnd());
1016   } else { // need reassembly, very slow!
1017     PetscCall(MatShift_Basic(A, a));
1018   }
1019   PetscFunctionReturn(PETSC_SUCCESS);
1020 }
1021 
1022 static PetscErrorCode MatDiagonalSet_SeqAIJKokkos(Mat Y, Vec D, InsertMode is)
1023 {
1024   Mat_SeqAIJ *aijseq = static_cast<Mat_SeqAIJ *>(Y->data);
1025 
1026   PetscFunctionBegin;
1027   if (Y->assembled && aijseq->diagonaldense) { // no missing diagonals
1028     ConstPetscScalarKokkosView dv;
1029     PetscInt                   n, nv;
1030 
1031     PetscCall(PetscLogGpuTimeBegin());
1032     PetscCall(MatSeqAIJKokkosSyncDevice(Y));
1033     PetscCall(VecGetKokkosView(D, &dv));
1034     PetscCall(VecGetLocalSize(D, &nv));
1035     n = PetscMin(Y->rmap->n, Y->cmap->n);
1036     PetscCheck(n == nv, PetscObjectComm((PetscObject)Y), PETSC_ERR_ARG_SIZ, "Matrix size and vector size do not match");
1037 
1038     const auto  aijkok = static_cast<Mat_SeqAIJKokkos *>(Y->spptr);
1039     const auto &Aa     = aijkok->a_dual.view_device();
1040     const auto &Adiag  = aijkok->diag_dual.view_device();
1041     PetscCallCXX(Kokkos::parallel_for(
1042       n, KOKKOS_LAMBDA(const PetscInt i) {
1043         if (is == INSERT_VALUES) Aa(Adiag(i)) = dv(i);
1044         else Aa(Adiag(i)) += dv(i);
1045       }));
1046     PetscCall(VecRestoreKokkosView(D, &dv));
1047     PetscCall(MatSeqAIJKokkosModifyDevice(Y));
1048     PetscCall(PetscLogGpuFlops(n));
1049     PetscCall(PetscLogGpuTimeEnd());
1050   } else { // need reassembly, very slow!
1051     PetscCall(MatDiagonalSet_Default(Y, D, is));
1052   }
1053   PetscFunctionReturn(PETSC_SUCCESS);
1054 }
1055 
1056 static PetscErrorCode MatDiagonalScale_SeqAIJKokkos(Mat A, Vec ll, Vec rr)
1057 {
1058   Mat_SeqAIJ                *aijseq = static_cast<Mat_SeqAIJ *>(A->data);
1059   PetscInt                   m = A->rmap->n, n = A->cmap->n, nz = aijseq->nz;
1060   ConstPetscScalarKokkosView lv, rv;
1061 
1062   PetscFunctionBegin;
1063   PetscCall(PetscLogGpuTimeBegin());
1064   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1065   const auto  aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1066   const auto &Aa     = aijkok->a_dual.view_device();
1067   const auto &Ai     = aijkok->i_dual.view_device();
1068   const auto &Aj     = aijkok->j_dual.view_device();
1069   if (ll) {
1070     PetscCall(VecGetLocalSize(ll, &m));
1071     PetscCheck(m == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling vector wrong length");
1072     PetscCall(VecGetKokkosView(ll, &lv));
1073     PetscCallCXX(Kokkos::parallel_for( // for each row
1074       Kokkos::TeamPolicy<>(m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
1075         PetscInt i   = t.league_rank(); // row i
1076         PetscInt len = Ai(i + 1) - Ai(i);
1077         // scale entries on the row
1078         Kokkos::parallel_for(Kokkos::TeamThreadRange(t, len), [&](PetscInt j) { Aa(Ai(i) + j) *= lv(i); });
1079       }));
1080     PetscCall(VecRestoreKokkosView(ll, &lv));
1081     PetscCall(PetscLogGpuFlops(nz));
1082   }
1083   if (rr) {
1084     PetscCall(VecGetLocalSize(rr, &n));
1085     PetscCheck(n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Right scaling vector wrong length");
1086     PetscCall(VecGetKokkosView(rr, &rv));
1087     PetscCallCXX(Kokkos::parallel_for( // for each nonzero
1088       nz, KOKKOS_LAMBDA(const PetscInt k) { Aa(k) *= rv(Aj(k)); }));
1089     PetscCall(VecRestoreKokkosView(rr, &lv));
1090     PetscCall(PetscLogGpuFlops(nz));
1091   }
1092   PetscCall(MatSeqAIJKokkosModifyDevice(A));
1093   PetscCall(PetscLogGpuTimeEnd());
1094   PetscFunctionReturn(PETSC_SUCCESS);
1095 }
1096 
1097 static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A)
1098 {
1099   Mat_SeqAIJKokkos *aijkok;
1100 
1101   PetscFunctionBegin;
1102   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1103   if (aijkok) { /* Only zero the device if data is already there */
1104     KokkosBlas::fill(aijkok->a_dual.view_device(), 0.0);
1105     PetscCall(MatSeqAIJKokkosModifyDevice(A));
1106   } else { /* Might be preallocated but not assembled */
1107     PetscCall(MatZeroEntries_SeqAIJ(A));
1108   }
1109   PetscFunctionReturn(PETSC_SUCCESS);
1110 }
1111 
1112 static PetscErrorCode MatGetDiagonal_SeqAIJKokkos(Mat A, Vec x)
1113 {
1114   Mat_SeqAIJKokkos     *aijkok;
1115   PetscInt              n;
1116   PetscScalarKokkosView xv;
1117 
1118   PetscFunctionBegin;
1119   PetscCall(VecGetLocalSize(x, &n));
1120   PetscCheck(n == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector");
1121   PetscCheck(A->factortype == MAT_FACTOR_NONE, PETSC_COMM_SELF, PETSC_ERR_SUP, "MatGetDiagonal_SeqAIJKokkos not supported on factored matrices");
1122 
1123   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1124   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1125 
1126   const auto &Aa    = aijkok->a_dual.view_device();
1127   const auto &Ai    = aijkok->i_dual.view_device();
1128   const auto &Adiag = aijkok->diag_dual.view_device();
1129 
1130   PetscCall(VecGetKokkosViewWrite(x, &xv));
1131   Kokkos::parallel_for(
1132     n, KOKKOS_LAMBDA(const PetscInt i) {
1133       if (Adiag(i) < Ai(i + 1)) xv(i) = Aa(Adiag(i));
1134       else xv(i) = 0;
1135     });
1136   PetscCall(VecRestoreKokkosViewWrite(x, &xv));
1137   PetscFunctionReturn(PETSC_SUCCESS);
1138 }
1139 
1140 /* Get a Kokkos View from a mat of type MatSeqAIJKokkos */
1141 PetscErrorCode MatSeqAIJGetKokkosView(Mat A, ConstMatScalarKokkosView *kv)
1142 {
1143   Mat_SeqAIJKokkos *aijkok;
1144 
1145   PetscFunctionBegin;
1146   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1147   PetscAssertPointer(kv, 2);
1148   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1149   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1150   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1151   *kv    = aijkok->a_dual.view_device();
1152   PetscFunctionReturn(PETSC_SUCCESS);
1153 }
1154 
1155 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, ConstMatScalarKokkosView *kv)
1156 {
1157   PetscFunctionBegin;
1158   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1159   PetscAssertPointer(kv, 2);
1160   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1161   PetscFunctionReturn(PETSC_SUCCESS);
1162 }
1163 
1164 PetscErrorCode MatSeqAIJGetKokkosView(Mat A, MatScalarKokkosView *kv)
1165 {
1166   Mat_SeqAIJKokkos *aijkok;
1167 
1168   PetscFunctionBegin;
1169   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1170   PetscAssertPointer(kv, 2);
1171   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1172   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1173   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1174   *kv    = aijkok->a_dual.view_device();
1175   PetscFunctionReturn(PETSC_SUCCESS);
1176 }
1177 
1178 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, MatScalarKokkosView *kv)
1179 {
1180   PetscFunctionBegin;
1181   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1182   PetscAssertPointer(kv, 2);
1183   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1184   PetscCall(MatSeqAIJKokkosModifyDevice(A));
1185   PetscFunctionReturn(PETSC_SUCCESS);
1186 }
1187 
1188 PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A, MatScalarKokkosView *kv)
1189 {
1190   Mat_SeqAIJKokkos *aijkok;
1191 
1192   PetscFunctionBegin;
1193   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1194   PetscAssertPointer(kv, 2);
1195   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1196   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1197   *kv    = aijkok->a_dual.view_device();
1198   PetscFunctionReturn(PETSC_SUCCESS);
1199 }
1200 
1201 PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A, MatScalarKokkosView *kv)
1202 {
1203   PetscFunctionBegin;
1204   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1205   PetscAssertPointer(kv, 2);
1206   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1207   PetscCall(MatSeqAIJKokkosModifyDevice(A));
1208   PetscFunctionReturn(PETSC_SUCCESS);
1209 }
1210 
1211 /* Computes Y += alpha X */
1212 static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y, PetscScalar alpha, Mat X, MatStructure pattern)
1213 {
1214   Mat_SeqAIJ              *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data;
1215   Mat_SeqAIJKokkos        *xkok, *ykok, *zkok;
1216   ConstMatScalarKokkosView Xa;
1217   MatScalarKokkosView      Ya;
1218 
1219   PetscFunctionBegin;
1220   PetscCheckTypeName(Y, MATSEQAIJKOKKOS);
1221   PetscCheckTypeName(X, MATSEQAIJKOKKOS);
1222   PetscCall(MatSeqAIJKokkosSyncDevice(Y));
1223   PetscCall(MatSeqAIJKokkosSyncDevice(X));
1224   PetscCall(PetscLogGpuTimeBegin());
1225 
1226   if (pattern != SAME_NONZERO_PATTERN && x->nz == y->nz) {
1227     PetscBool e;
1228     PetscCall(PetscArraycmp(x->i, y->i, Y->rmap->n + 1, &e));
1229     if (e) {
1230       PetscCall(PetscArraycmp(x->j, y->j, y->nz, &e));
1231       if (e) pattern = SAME_NONZERO_PATTERN;
1232     }
1233   }
1234 
1235   /* cusparseDcsrgeam2() computes C = alpha A + beta B. If one knew sparsity pattern of C, one can skip
1236     cusparseScsrgeam2_bufferSizeExt() / cusparseXcsrgeam2Nnz(), and directly call cusparseScsrgeam2().
1237     If X is SUBSET_NONZERO_PATTERN of Y, we could take advantage of this cusparse feature. However,
1238     KokkosSparse::spadd(alpha,A,beta,B,C) has symbolic and numeric phases, MatAXPY does not.
1239   */
1240   ykok = static_cast<Mat_SeqAIJKokkos *>(Y->spptr);
1241   xkok = static_cast<Mat_SeqAIJKokkos *>(X->spptr);
1242   Xa   = xkok->a_dual.view_device();
1243   Ya   = ykok->a_dual.view_device();
1244 
1245   if (pattern == SAME_NONZERO_PATTERN) {
1246     KokkosBlas::axpy(alpha, Xa, Ya);
1247     PetscCall(MatSeqAIJKokkosModifyDevice(Y));
1248   } else if (pattern == SUBSET_NONZERO_PATTERN) {
1249     MatRowMapKokkosView Xi = xkok->i_dual.view_device(), Yi = ykok->i_dual.view_device();
1250     MatColIdxKokkosView Xj = xkok->j_dual.view_device(), Yj = ykok->j_dual.view_device();
1251 
1252     Kokkos::parallel_for(
1253       Kokkos::TeamPolicy<>(Y->rmap->n, 1), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
1254         PetscInt i = t.league_rank(); // row i
1255         Kokkos::single(Kokkos::PerTeam(t), [=]() {
1256           // Only one thread works in a team
1257           PetscInt p, q = Yi(i);
1258           for (p = Xi(i); p < Xi(i + 1); p++) {          // For each nonzero on row i of X,
1259             while (Xj(p) != Yj(q) && q < Yi(i + 1)) q++; // find the matching nonzero on row i of Y.
1260             if (Xj(p) == Yj(q)) {                        // Found it
1261               Ya(q) += alpha * Xa(p);
1262               q++;
1263             } else {
1264             // If not found, it indicates the input is wrong (X is not a SUBSET_NONZERO_PATTERN of Y).
1265             // Just insert a NaN at the beginning of row i if it is not empty, to make the result wrong.
1266 #if PETSC_PKG_KOKKOS_VERSION_GE(3, 7, 0)
1267               if (Yi(i) != Yi(i + 1)) Ya(Yi(i)) = Kokkos::ArithTraits<PetscScalar>::nan();
1268 #else
1269               if (Yi(i) != Yi(i + 1)) Ya(Yi(i)) = Kokkos::Experimental::nan("1");
1270 #endif
1271             }
1272           }
1273         });
1274       });
1275     PetscCall(MatSeqAIJKokkosModifyDevice(Y));
1276   } else { // different nonzero patterns
1277     Mat             Z;
1278     KokkosCsrMatrix zcsr;
1279     KernelHandle    kh;
1280     kh.create_spadd_handle(true); // X, Y are sorted
1281     KokkosSparse::spadd_symbolic(&kh, xkok->csrmat, ykok->csrmat, zcsr);
1282     KokkosSparse::spadd_numeric(&kh, alpha, xkok->csrmat, (PetscScalar)1.0, ykok->csrmat, zcsr);
1283     zkok = new Mat_SeqAIJKokkos(zcsr);
1284     PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, zkok, &Z));
1285     PetscCall(MatHeaderReplace(Y, &Z));
1286     kh.destroy_spadd_handle();
1287   }
1288   PetscCall(PetscLogGpuTimeEnd());
1289   PetscCall(PetscLogGpuFlops(xkok->a_dual.extent(0) * 2)); // Because we scaled X and then added it to Y
1290   PetscFunctionReturn(PETSC_SUCCESS);
1291 }
1292 
1293 struct MatCOOStruct_SeqAIJKokkos {
1294   PetscCount           n;
1295   PetscCount           Atot;
1296   PetscInt             nz;
1297   PetscCountKokkosView jmap;
1298   PetscCountKokkosView perm;
1299 
1300   MatCOOStruct_SeqAIJKokkos(const MatCOOStruct_SeqAIJ *coo_h)
1301   {
1302     nz   = coo_h->nz;
1303     n    = coo_h->n;
1304     Atot = coo_h->Atot;
1305     jmap = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), PetscCountKokkosViewHost(coo_h->jmap, nz + 1));
1306     perm = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), PetscCountKokkosViewHost(coo_h->perm, Atot));
1307   }
1308 };
1309 
1310 static PetscErrorCode MatCOOStructDestroy_SeqAIJKokkos(void *data)
1311 {
1312   PetscFunctionBegin;
1313   PetscCallCXX(delete static_cast<MatCOOStruct_SeqAIJKokkos *>(data));
1314   PetscFunctionReturn(PETSC_SUCCESS);
1315 }
1316 
1317 static PetscErrorCode MatSetPreallocationCOO_SeqAIJKokkos(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
1318 {
1319   Mat_SeqAIJKokkos          *akok;
1320   Mat_SeqAIJ                *aseq;
1321   PetscContainer             container_h, container_d;
1322   MatCOOStruct_SeqAIJ       *coo_h;
1323   MatCOOStruct_SeqAIJKokkos *coo_d;
1324 
1325   PetscFunctionBegin;
1326   PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, coo_i, coo_j));
1327   aseq = static_cast<Mat_SeqAIJ *>(mat->data);
1328   akok = static_cast<Mat_SeqAIJKokkos *>(mat->spptr);
1329   delete akok;
1330   mat->spptr = akok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, aseq, mat->nonzerostate + 1, PETSC_FALSE);
1331   PetscCall(MatZeroEntries_SeqAIJKokkos(mat));
1332 
1333   // Copy the COO struct to device
1334   PetscCall(PetscObjectQuery((PetscObject)mat, "__PETSc_MatCOOStruct_Host", (PetscObject *)&container_h));
1335   PetscCall(PetscContainerGetPointer(container_h, (void **)&coo_h));
1336   PetscCallCXX(coo_d = new MatCOOStruct_SeqAIJKokkos(coo_h));
1337 
1338   // Put the COO struct in a container and then attach that to the matrix
1339   PetscCall(PetscContainerCreate(PETSC_COMM_SELF, &container_d));
1340   PetscCall(PetscContainerSetPointer(container_d, coo_d));
1341   PetscCall(PetscContainerSetUserDestroy(container_d, MatCOOStructDestroy_SeqAIJKokkos));
1342   PetscCall(PetscObjectCompose((PetscObject)mat, "__PETSc_MatCOOStruct_Device", (PetscObject)container_d));
1343   PetscCall(PetscContainerDestroy(&container_d));
1344   PetscFunctionReturn(PETSC_SUCCESS);
1345 }
1346 
1347 static PetscErrorCode MatSetValuesCOO_SeqAIJKokkos(Mat A, const PetscScalar v[], InsertMode imode)
1348 {
1349   MatScalarKokkosView        Aa;
1350   ConstMatScalarKokkosView   kv;
1351   PetscMemType               memtype;
1352   PetscContainer             container;
1353   MatCOOStruct_SeqAIJKokkos *coo;
1354 
1355   PetscFunctionBegin;
1356   PetscCall(PetscObjectQuery((PetscObject)A, "__PETSc_MatCOOStruct_Device", (PetscObject *)&container));
1357   PetscCall(PetscContainerGetPointer(container, (void **)&coo));
1358 
1359   const auto &n    = coo->n;
1360   const auto &Annz = coo->nz;
1361   const auto &jmap = coo->jmap;
1362   const auto &perm = coo->perm;
1363 
1364   PetscCall(PetscGetMemType(v, &memtype));
1365   if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */
1366     kv = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), ConstMatScalarKokkosViewHost(v, n));
1367   } else {
1368     kv = ConstMatScalarKokkosView(v, n); /* Directly use v[]'s memory */
1369   }
1370 
1371   if (imode == INSERT_VALUES) PetscCall(MatSeqAIJGetKokkosViewWrite(A, &Aa)); /* write matrix values */
1372   else PetscCall(MatSeqAIJGetKokkosView(A, &Aa));                             /* read & write matrix values */
1373 
1374   PetscCall(PetscLogGpuTimeBegin());
1375   Kokkos::parallel_for(
1376     Annz, KOKKOS_LAMBDA(const PetscCount i) {
1377       PetscScalar sum = 0.0;
1378       for (PetscCount k = jmap(i); k < jmap(i + 1); k++) sum += kv(perm(k));
1379       Aa(i) = (imode == INSERT_VALUES ? 0.0 : Aa(i)) + sum;
1380     });
1381   PetscCall(PetscLogGpuTimeEnd());
1382 
1383   if (imode == INSERT_VALUES) PetscCall(MatSeqAIJRestoreKokkosViewWrite(A, &Aa));
1384   else PetscCall(MatSeqAIJRestoreKokkosView(A, &Aa));
1385   PetscFunctionReturn(PETSC_SUCCESS);
1386 }
1387 
1388 static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info)
1389 {
1390   PetscFunctionBegin;
1391   PetscCall(MatSeqAIJKokkosSyncHost(A));
1392   PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info));
1393   B->offloadmask = PETSC_OFFLOAD_CPU;
1394   PetscFunctionReturn(PETSC_SUCCESS);
1395 }
1396 
1397 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A)
1398 {
1399   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
1400 
1401   PetscFunctionBegin;
1402   A->offloadmask = PETSC_OFFLOAD_KOKKOS; /* We do not really use this flag */
1403   A->boundtocpu  = PETSC_FALSE;
1404 
1405   A->ops->assemblyend               = MatAssemblyEnd_SeqAIJKokkos;
1406   A->ops->destroy                   = MatDestroy_SeqAIJKokkos;
1407   A->ops->duplicate                 = MatDuplicate_SeqAIJKokkos;
1408   A->ops->axpy                      = MatAXPY_SeqAIJKokkos;
1409   A->ops->scale                     = MatScale_SeqAIJKokkos;
1410   A->ops->zeroentries               = MatZeroEntries_SeqAIJKokkos;
1411   A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJKokkos;
1412   A->ops->mult                      = MatMult_SeqAIJKokkos;
1413   A->ops->multadd                   = MatMultAdd_SeqAIJKokkos;
1414   A->ops->multtranspose             = MatMultTranspose_SeqAIJKokkos;
1415   A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJKokkos;
1416   A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJKokkos;
1417   A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos;
1418   A->ops->productnumeric            = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos;
1419   A->ops->transpose                 = MatTranspose_SeqAIJKokkos;
1420   A->ops->setoption                 = MatSetOption_SeqAIJKokkos;
1421   A->ops->getdiagonal               = MatGetDiagonal_SeqAIJKokkos;
1422   A->ops->shift                     = MatShift_SeqAIJKokkos;
1423   A->ops->diagonalset               = MatDiagonalSet_SeqAIJKokkos;
1424   A->ops->diagonalscale             = MatDiagonalScale_SeqAIJKokkos;
1425   a->ops->getarray                  = MatSeqAIJGetArray_SeqAIJKokkos;
1426   a->ops->restorearray              = MatSeqAIJRestoreArray_SeqAIJKokkos;
1427   a->ops->getarrayread              = MatSeqAIJGetArrayRead_SeqAIJKokkos;
1428   a->ops->restorearrayread          = MatSeqAIJRestoreArrayRead_SeqAIJKokkos;
1429   a->ops->getarraywrite             = MatSeqAIJGetArrayWrite_SeqAIJKokkos;
1430   a->ops->restorearraywrite         = MatSeqAIJRestoreArrayWrite_SeqAIJKokkos;
1431   a->ops->getcsrandmemtype          = MatSeqAIJGetCSRAndMemType_SeqAIJKokkos;
1432 
1433   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJKokkos));
1434   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJKokkos));
1435   PetscFunctionReturn(PETSC_SUCCESS);
1436 }
1437 
1438 /*
1439    Extract the (prescribled) diagonal blocks of the matrix and then invert them
1440 
1441   Input Parameters:
1442 +  A       - the MATSEQAIJKOKKOS matrix
1443 .  bs      - block sizes in 'csr' format, i.e., the i-th block has size bs(i+1) - bs(i)
1444 .  bs2     - square of block sizes in 'csr' format, i.e., the i-th block should be stored at offset bs2(i) in diagVal[]
1445 .  blkMap  - map row ids to block ids, i.e., row i belongs to the block blkMap(i)
1446 -  work    - a pre-allocated work buffer (as big as diagVal) for use by this routine
1447 
1448   Output Parameter:
1449 .  diagVal - the (pre-allocated) buffer to store the inverted blocks (each block is stored in column-major order)
1450 */
1451 PETSC_INTERN PetscErrorCode MatInvertVariableBlockDiagonal_SeqAIJKokkos(Mat A, const PetscIntKokkosView &bs, const PetscIntKokkosView &bs2, const PetscIntKokkosView &blkMap, PetscScalarKokkosView &work, PetscScalarKokkosView &diagVal)
1452 {
1453   Mat_SeqAIJKokkos *akok    = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1454   PetscInt          nblocks = bs.extent(0) - 1;
1455 
1456   PetscFunctionBegin;
1457   PetscCall(MatSeqAIJKokkosSyncDevice(A)); // Since we'll access A's value on device
1458 
1459   // Pull out the diagonal blocks of the matrix and then invert the blocks
1460   auto Aa    = akok->a_dual.view_device();
1461   auto Ai    = akok->i_dual.view_device();
1462   auto Aj    = akok->j_dual.view_device();
1463   auto Adiag = akok->diag_dual.view_device();
1464   // TODO: how to tune the team size?
1465 #if defined(KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_HOST)
1466   auto ts = Kokkos::AUTO();
1467 #else
1468   auto ts         = 16; // improved performance 30% over Kokkos::AUTO() with CUDA, but failed with "Kokkos::abort: Requested Team Size is too large!" on CPUs
1469 #endif
1470   PetscCallCXX(Kokkos::parallel_for(
1471     Kokkos::TeamPolicy<>(nblocks, ts), KOKKOS_LAMBDA(const KokkosTeamMemberType &teamMember) {
1472       const PetscInt bid    = teamMember.league_rank();                                                   // block id
1473       const PetscInt rstart = bs(bid);                                                                    // this block starts from this row
1474       const PetscInt m      = bs(bid + 1) - bs(bid);                                                      // size of this block
1475       const auto    &B      = Kokkos::View<PetscScalar **, Kokkos::LayoutLeft>(&diagVal(bs2(bid)), m, m); // column-major order
1476       const auto    &W      = PetscScalarKokkosView(&work(bs2(bid)), m * m);
1477 
1478       Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, m), [=](const PetscInt &r) { // r-th row in B
1479         PetscInt i = rstart + r;                                                            // i-th row in A
1480 
1481         if (Ai(i) <= Adiag(i) && Adiag(i) < Ai(i + 1)) { // if the diagonal exists (common case)
1482           PetscInt first = Adiag(i) - r;                 // we start to check nonzeros from here along this row
1483 
1484           for (PetscInt c = 0; c < m; c++) {                   // walk n steps to see what column indices we will meet
1485             if (first + c < Ai(i) || first + c >= Ai(i + 1)) { // this entry (first+c) is out of range of this row, in other words, its value is zero
1486               B(r, c) = 0.0;
1487             } else if (Aj(first + c) == rstart + c) { // this entry is right on the (rstart+c) column
1488               B(r, c) = Aa(first + c);
1489             } else { // this entry does not show up in the CSR
1490               B(r, c) = 0.0;
1491             }
1492           }
1493         } else { // rare case that the diagonal does not exist
1494           const PetscInt begin = Ai(i);
1495           const PetscInt end   = Ai(i + 1);
1496           for (PetscInt c = 0; c < m; c++) B(r, c) = 0.0;
1497           for (PetscInt j = begin; j < end; j++) { // scan the whole row; could use binary search but this is a rare case so we did not.
1498             if (rstart <= Aj(j) && Aj(j) < rstart + m) B(r, Aj(j) - rstart) = Aa(j);
1499             else if (Aj(j) >= rstart + m) break;
1500           }
1501         }
1502       });
1503 
1504       // LU-decompose B (w/o pivoting) and then invert B
1505       KokkosBatched::TeamLU<KokkosTeamMemberType, KokkosBatched::Algo::LU::Unblocked>::invoke(teamMember, B, 0.0);
1506       KokkosBatched::TeamInverseLU<KokkosTeamMemberType, KokkosBatched::Algo::InverseLU::Unblocked>::invoke(teamMember, B, W);
1507     }));
1508   // PetscLogGpuFlops() is done in the caller PCSetUp_VPBJacobi_Kokkos as we don't want to compute the flops in kernels
1509   PetscFunctionReturn(PETSC_SUCCESS);
1510 }
1511 
1512 PETSC_INTERN PetscErrorCode MatSetSeqAIJKokkosWithCSRMatrix(Mat A, Mat_SeqAIJKokkos *akok)
1513 {
1514   Mat_SeqAIJ *aseq;
1515   PetscInt    i, m, n;
1516   auto       &exec = PetscGetKokkosExecutionSpace();
1517 
1518   PetscFunctionBegin;
1519   PetscCheck(!A->spptr, PETSC_COMM_SELF, PETSC_ERR_PLIB, "A->spptr is supposed to be empty");
1520 
1521   m = akok->nrows();
1522   n = akok->ncols();
1523   PetscCall(MatSetSizes(A, m, n, m, n));
1524   PetscCall(MatSetType(A, MATSEQAIJKOKKOS));
1525 
1526   /* Set up data structures of A as a MATSEQAIJ */
1527   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(A, MAT_SKIP_ALLOCATION, NULL));
1528   aseq = (Mat_SeqAIJ *)(A)->data;
1529 
1530   PetscCallCXX(akok->i_dual.sync_host(exec)); /* We always need sync'ed i, j on host */
1531   PetscCallCXX(akok->j_dual.sync_host(exec));
1532   PetscCallCXX(exec.fence());
1533 
1534   aseq->i            = akok->i_host_data();
1535   aseq->j            = akok->j_host_data();
1536   aseq->a            = akok->a_host_data();
1537   aseq->nonew        = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/
1538   aseq->singlemalloc = PETSC_FALSE;
1539   aseq->free_a       = PETSC_FALSE;
1540   aseq->free_ij      = PETSC_FALSE;
1541   aseq->nz           = akok->nnz();
1542   aseq->maxnz        = aseq->nz;
1543 
1544   PetscCall(PetscMalloc1(m, &aseq->imax));
1545   PetscCall(PetscMalloc1(m, &aseq->ilen));
1546   for (i = 0; i < m; i++) aseq->ilen[i] = aseq->imax[i] = aseq->i[i + 1] - aseq->i[i];
1547 
1548   /* It is critical to set the nonzerostate, as we use it to check if sparsity pattern (hence data) has changed on host in MatAssemblyEnd */
1549   akok->nonzerostate = A->nonzerostate;
1550   A->spptr           = akok; /* Set A->spptr before MatAssembly so that A->spptr won't be allocated again there */
1551   PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY));
1552   PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY));
1553   PetscFunctionReturn(PETSC_SUCCESS);
1554 }
1555 
1556 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGetKokkosCsrMatrix(Mat A, KokkosCsrMatrix *csr)
1557 {
1558   PetscFunctionBegin;
1559   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1560   *csr = static_cast<Mat_SeqAIJKokkos *>(A->spptr)->csrmat;
1561   PetscFunctionReturn(PETSC_SUCCESS);
1562 }
1563 
1564 PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithKokkosCsrMatrix(MPI_Comm comm, KokkosCsrMatrix csr, Mat *A)
1565 {
1566   Mat_SeqAIJKokkos *akok;
1567   PetscFunctionBegin;
1568   PetscCallCXX(akok = new Mat_SeqAIJKokkos(csr));
1569   PetscCall(MatCreate(comm, A));
1570   PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok));
1571   PetscFunctionReturn(PETSC_SUCCESS);
1572 }
1573 
1574 /* Crete a SEQAIJKOKKOS matrix with a Mat_SeqAIJKokkos data structure
1575 
1576    Note we have names like MatSeqAIJSetPreallocationCSR, so I use capitalized CSR
1577  */
1578 PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithCSRMatrix(MPI_Comm comm, Mat_SeqAIJKokkos *akok, Mat *A)
1579 {
1580   PetscFunctionBegin;
1581   PetscCall(MatCreate(comm, A));
1582   PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok));
1583   PetscFunctionReturn(PETSC_SUCCESS);
1584 }
1585 
1586 /*@C
1587   MatCreateSeqAIJKokkos - Creates a sparse matrix in `MATSEQAIJKOKKOS` (compressed row) format
1588   (the default parallel PETSc format). This matrix will ultimately be handled by
1589   Kokkos for calculations.
1590 
1591   Collective
1592 
1593   Input Parameters:
1594 + comm - MPI communicator, set to `PETSC_COMM_SELF`
1595 . m    - number of rows
1596 . n    - number of columns
1597 . nz   - number of nonzeros per row (same for all rows), ignored if `nnz` is provided
1598 - nnz  - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL`
1599 
1600   Output Parameter:
1601 . A - the matrix
1602 
1603   Level: intermediate
1604 
1605   Notes:
1606   It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
1607   MatXXXXSetPreallocation() paradgm instead of this routine directly.
1608   [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]
1609 
1610   The AIJ format, also called
1611   compressed row storage, is fully compatible with standard Fortran
1612   storage.  That is, the stored row and column indices can begin at
1613   either one (as in Fortran) or zero.
1614 
1615   Specify the preallocated storage with either `nz` or `nnz` (not both).
1616   Set `nz` = `PETSC_DEFAULT` and `nnz` = `NULL` for PETSc to control dynamic memory
1617   allocation.
1618 
1619 .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`
1620 @*/
1621 PetscErrorCode MatCreateSeqAIJKokkos(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A)
1622 {
1623   PetscFunctionBegin;
1624   PetscCall(PetscKokkosInitializeCheck());
1625   PetscCall(MatCreate(comm, A));
1626   PetscCall(MatSetSizes(*A, m, n, m, n));
1627   PetscCall(MatSetType(*A, MATSEQAIJKOKKOS));
1628   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz));
1629   PetscFunctionReturn(PETSC_SUCCESS);
1630 }
1631 
1632 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
1633 {
1634   PetscFunctionBegin;
1635   PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
1636   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos;
1637   PetscFunctionReturn(PETSC_SUCCESS);
1638 }
1639 
1640 static PetscErrorCode MatSeqAIJKokkosSymbolicSolveCheck(Mat A)
1641 {
1642   Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1643 
1644   PetscFunctionBegin;
1645   if (!factors->sptrsv_symbolic_completed) {
1646     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khU, factors->iU_d, factors->jU_d, factors->aU_d);
1647     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khL, factors->iL_d, factors->jL_d, factors->aL_d);
1648     factors->sptrsv_symbolic_completed = PETSC_TRUE;
1649   }
1650   PetscFunctionReturn(PETSC_SUCCESS);
1651 }
1652 
1653 /* Check if we need to update factors etc for transpose solve */
1654 static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A)
1655 {
1656   Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1657   MatColIdxType               n       = A->rmap->n;
1658 
1659   PetscFunctionBegin;
1660   if (!factors->transpose_updated) { /* TODO: KK needs to provide functions to do numeric transpose only */
1661     /* Update L^T and do sptrsv symbolic */
1662     factors->iLt_d = MatRowMapKokkosView("factors->iLt_d", n + 1); // KK requires 0
1663     factors->jLt_d = MatColIdxKokkosView(NoInit("factors->jLt_d"), factors->jL_d.extent(0));
1664     factors->aLt_d = MatScalarKokkosView(NoInit("factors->aLt_d"), factors->aL_d.extent(0));
1665 
1666     transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iL_d, factors->jL_d, factors->aL_d,
1667                                                                                                                                                                                                               factors->iLt_d, factors->jLt_d, factors->aLt_d);
1668 
1669     /* TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices.
1670       We have to sort the indices, until KK provides finer control options.
1671     */
1672     sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iLt_d, factors->jLt_d, factors->aLt_d);
1673 
1674     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d);
1675 
1676     /* Update U^T and do sptrsv symbolic */
1677     factors->iUt_d = MatRowMapKokkosView("factors->iUt_d", n + 1); // KK requires 0
1678     factors->jUt_d = MatColIdxKokkosView(NoInit("factors->jUt_d"), factors->jU_d.extent(0));
1679     factors->aUt_d = MatScalarKokkosView(NoInit("factors->aUt_d"), factors->aU_d.extent(0));
1680 
1681     transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iU_d, factors->jU_d, factors->aU_d,
1682                                                                                                                                                                                                               factors->iUt_d, factors->jUt_d, factors->aUt_d);
1683 
1684     /* Sort indices. See comments above */
1685     sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iUt_d, factors->jUt_d, factors->aUt_d);
1686 
1687     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d);
1688     factors->transpose_updated = PETSC_TRUE;
1689   }
1690   PetscFunctionReturn(PETSC_SUCCESS);
1691 }
1692 
1693 /* Solve Ax = b, with A = LU */
1694 static PetscErrorCode MatSolve_SeqAIJKokkos(Mat A, Vec b, Vec x)
1695 {
1696   ConstPetscScalarKokkosView  bv;
1697   PetscScalarKokkosView       xv;
1698   Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1699 
1700   PetscFunctionBegin;
1701   PetscCall(PetscLogGpuTimeBegin());
1702   PetscCall(MatSeqAIJKokkosSymbolicSolveCheck(A));
1703   PetscCall(VecGetKokkosView(b, &bv));
1704   PetscCall(VecGetKokkosViewWrite(x, &xv));
1705   /* Solve L tmpv = b */
1706   PetscCallCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khL, factors->iL_d, factors->jL_d, factors->aL_d, bv, factors->workVector));
1707   /* Solve Ux = tmpv */
1708   PetscCallCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khU, factors->iU_d, factors->jU_d, factors->aU_d, factors->workVector, xv));
1709   PetscCall(VecRestoreKokkosView(b, &bv));
1710   PetscCall(VecRestoreKokkosViewWrite(x, &xv));
1711   PetscCall(PetscLogGpuTimeEnd());
1712   PetscFunctionReturn(PETSC_SUCCESS);
1713 }
1714 
1715 /* Solve A^T x = b, where A^T = U^T L^T */
1716 static PetscErrorCode MatSolveTranspose_SeqAIJKokkos(Mat A, Vec b, Vec x)
1717 {
1718   ConstPetscScalarKokkosView  bv;
1719   PetscScalarKokkosView       xv;
1720   Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1721 
1722   PetscFunctionBegin;
1723   PetscCall(PetscLogGpuTimeBegin());
1724   PetscCall(MatSeqAIJKokkosTransposeSolveCheck(A));
1725   PetscCall(VecGetKokkosView(b, &bv));
1726   PetscCall(VecGetKokkosViewWrite(x, &xv));
1727   /* Solve U^T tmpv = b */
1728   KokkosSparse::Experimental::sptrsv_solve(&factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d, bv, factors->workVector);
1729 
1730   /* Solve L^T x = tmpv */
1731   KokkosSparse::Experimental::sptrsv_solve(&factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d, factors->workVector, xv);
1732   PetscCall(VecRestoreKokkosView(b, &bv));
1733   PetscCall(VecRestoreKokkosViewWrite(x, &xv));
1734   PetscCall(PetscLogGpuTimeEnd());
1735   PetscFunctionReturn(PETSC_SUCCESS);
1736 }
1737 
1738 static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info)
1739 {
1740   Mat_SeqAIJKokkos           *aijkok   = (Mat_SeqAIJKokkos *)A->spptr;
1741   Mat_SeqAIJKokkosTriFactors *factors  = (Mat_SeqAIJKokkosTriFactors *)B->spptr;
1742   PetscInt                    fill_lev = info->levels;
1743 
1744   PetscFunctionBegin;
1745   PetscCall(PetscLogGpuTimeBegin());
1746   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1747 
1748   auto a_d = aijkok->a_dual.view_device();
1749   auto i_d = aijkok->i_dual.view_device();
1750   auto j_d = aijkok->j_dual.view_device();
1751 
1752   KokkosSparse::Experimental::spiluk_numeric(&factors->kh, fill_lev, i_d, j_d, a_d, factors->iL_d, factors->jL_d, factors->aL_d, factors->iU_d, factors->jU_d, factors->aU_d);
1753 
1754   B->assembled              = PETSC_TRUE;
1755   B->preallocated           = PETSC_TRUE;
1756   B->ops->solve             = MatSolve_SeqAIJKokkos;
1757   B->ops->solvetranspose    = MatSolveTranspose_SeqAIJKokkos;
1758   B->ops->matsolve          = NULL;
1759   B->ops->matsolvetranspose = NULL;
1760   B->offloadmask            = PETSC_OFFLOAD_GPU;
1761 
1762   /* Once the factors' value changed, we need to update their transpose and sptrsv handle */
1763   factors->transpose_updated         = PETSC_FALSE;
1764   factors->sptrsv_symbolic_completed = PETSC_FALSE;
1765   /* TODO: log flops, but how to know that? */
1766   PetscCall(PetscLogGpuTimeEnd());
1767   PetscFunctionReturn(PETSC_SUCCESS);
1768 }
1769 
1770 static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
1771 {
1772   Mat_SeqAIJKokkos           *aijkok;
1773   Mat_SeqAIJ                 *b;
1774   Mat_SeqAIJKokkosTriFactors *factors  = (Mat_SeqAIJKokkosTriFactors *)B->spptr;
1775   PetscInt                    fill_lev = info->levels;
1776   PetscInt                    nnzA     = ((Mat_SeqAIJ *)A->data)->nz, nnzL, nnzU;
1777   PetscInt                    n        = A->rmap->n;
1778 
1779   PetscFunctionBegin;
1780   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1781   /* Rebuild factors */
1782   if (factors) {
1783     factors->Destroy();
1784   } /* Destroy the old if it exists */
1785   else {
1786     B->spptr = factors = new Mat_SeqAIJKokkosTriFactors(n);
1787   }
1788 
1789   /* Create a spiluk handle and then do symbolic factorization */
1790   nnzL = nnzU = PetscRealIntMultTruncate(info->fill, nnzA);
1791   factors->kh.create_spiluk_handle(KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1, n, nnzL, nnzU);
1792 
1793   auto spiluk_handle = factors->kh.get_spiluk_handle();
1794 
1795   Kokkos::realloc(factors->iL_d, n + 1); /* Free old arrays and realloc */
1796   Kokkos::realloc(factors->jL_d, spiluk_handle->get_nnzL());
1797   Kokkos::realloc(factors->iU_d, n + 1);
1798   Kokkos::realloc(factors->jU_d, spiluk_handle->get_nnzU());
1799 
1800   aijkok   = (Mat_SeqAIJKokkos *)A->spptr;
1801   auto i_d = aijkok->i_dual.view_device();
1802   auto j_d = aijkok->j_dual.view_device();
1803   KokkosSparse::Experimental::spiluk_symbolic(&factors->kh, fill_lev, i_d, j_d, factors->iL_d, factors->jL_d, factors->iU_d, factors->jU_d);
1804   /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */
1805 
1806   Kokkos::resize(factors->jL_d, spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */
1807   Kokkos::resize(factors->jU_d, spiluk_handle->get_nnzU());
1808   Kokkos::realloc(factors->aL_d, spiluk_handle->get_nnzL()); /* No need to retain old value */
1809   Kokkos::realloc(factors->aU_d, spiluk_handle->get_nnzU());
1810 
1811   /* TODO: add options to select sptrsv algorithms */
1812   /* Create sptrsv handles for L, U and their transpose */
1813 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
1814   auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SPTRSV_CUSPARSE;
1815 #else
1816   auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1;
1817 #endif
1818 
1819   factors->khL.create_sptrsv_handle(sptrsv_alg, n, true /* L is lower tri */);
1820   factors->khU.create_sptrsv_handle(sptrsv_alg, n, false /* U is not lower tri */);
1821   factors->khLt.create_sptrsv_handle(sptrsv_alg, n, false /* L^T is not lower tri */);
1822   factors->khUt.create_sptrsv_handle(sptrsv_alg, n, true /* U^T is lower tri */);
1823 
1824   /* Fill fields of the factor matrix B */
1825   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(B, MAT_SKIP_ALLOCATION, NULL));
1826   b     = (Mat_SeqAIJ *)B->data;
1827   b->nz = b->maxnz          = spiluk_handle->get_nnzL() + spiluk_handle->get_nnzU();
1828   B->info.fill_ratio_given  = info->fill;
1829   B->info.fill_ratio_needed = nnzA > 0 ? ((PetscReal)b->nz) / ((PetscReal)nnzA) : 1.0;
1830 
1831   B->offloadmask          = PETSC_OFFLOAD_GPU;
1832   B->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJKokkos;
1833   PetscFunctionReturn(PETSC_SUCCESS);
1834 }
1835 
1836 static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos(Mat A, MatSolverType *type)
1837 {
1838   PetscFunctionBegin;
1839   *type = MATSOLVERKOKKOS;
1840   PetscFunctionReturn(PETSC_SUCCESS);
1841 }
1842 
1843 /*MC
1844   MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices
1845   on a single GPU of type, `MATSEQAIJKOKKOS`, `MATAIJKOKKOS`.
1846 
1847   Level: beginner
1848 
1849 .seealso: [](ch_matrices), `Mat`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJKokkos()`, `MATAIJKOKKOS`, `MatKokkosSetFormat()`, `MatKokkosStorageFormat`, `MatKokkosFormatOperation`
1850 M*/
1851 PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A, MatFactorType ftype, Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */
1852 {
1853   PetscInt n = A->rmap->n;
1854 
1855   PetscFunctionBegin;
1856   PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B));
1857   PetscCall(MatSetSizes(*B, n, n, n, n));
1858   (*B)->factortype = ftype;
1859   PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU]));
1860   PetscCall(MatSetType(*B, MATSEQAIJKOKKOS));
1861 
1862   if (ftype == MAT_FACTOR_LU) {
1863     PetscCall(MatSetBlockSizesFromMats(*B, A, A));
1864     (*B)->canuseordering        = PETSC_TRUE;
1865     (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKokkos;
1866   } else if (ftype == MAT_FACTOR_ILU) {
1867     PetscCall(MatSetBlockSizesFromMats(*B, A, A));
1868     (*B)->canuseordering         = PETSC_FALSE;
1869     (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos;
1870   } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]);
1871 
1872   PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL));
1873   PetscCall(PetscObjectComposeFunction((PetscObject)(*B), "MatFactorGetSolverType_C", MatFactorGetSolverType_SeqAIJKokkos));
1874   PetscFunctionReturn(PETSC_SUCCESS);
1875 }
1876 
1877 PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_KOKKOS(void)
1878 {
1879   PetscFunctionBegin;
1880   PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_LU, MatGetFactor_SeqAIJKokkos_Kokkos));
1881   PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_ILU, MatGetFactor_SeqAIJKokkos_Kokkos));
1882   PetscFunctionReturn(PETSC_SUCCESS);
1883 }
1884 
1885 /* Utility to print out a KokkosCsrMatrix for debugging */
1886 PETSC_INTERN PetscErrorCode PrintCsrMatrix(const KokkosCsrMatrix &csrmat)
1887 {
1888   const auto        &iv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), csrmat.graph.row_map);
1889   const auto        &jv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), csrmat.graph.entries);
1890   const auto        &av = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), csrmat.values);
1891   const PetscInt    *i  = iv.data();
1892   const PetscInt    *j  = jv.data();
1893   const PetscScalar *a  = av.data();
1894   PetscInt           m = csrmat.numRows(), n = csrmat.numCols(), nnz = csrmat.nnz();
1895 
1896   PetscFunctionBegin;
1897   PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT " x %" PetscInt_FMT " SeqAIJKokkos, with %" PetscInt_FMT " nonzeros\n", m, n, nnz));
1898   for (PetscInt k = 0; k < m; k++) {
1899     PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT ": ", k));
1900     for (PetscInt p = i[k]; p < i[k + 1]; p++) PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT "(%.1f), ", j[p], (double)PetscRealPart(a[p])));
1901     PetscCall(PetscPrintf(PETSC_COMM_SELF, "\n"));
1902   }
1903   PetscFunctionReturn(PETSC_SUCCESS);
1904 }
1905