xref: /petsc/src/mat/impls/aij/seq/kokkos/aijkok.kokkos.cxx (revision a4e35b1925eceef64945ea472b84f2bf06a67b5e)
1 #include <petscvec_kokkos.hpp>
2 #include <petscpkg_version.h>
3 #include <petsc/private/petscimpl.h>
4 #include <petsc/private/sfimpl.h>
5 #include <petscsystypes.h>
6 #include <petscerror.h>
7 
8 #include <Kokkos_Core.hpp>
9 #include <KokkosBlas.hpp>
10 #include <KokkosSparse_CrsMatrix.hpp>
11 #include <KokkosSparse_spmv.hpp>
12 #include <KokkosSparse_spiluk.hpp>
13 #include <KokkosSparse_sptrsv.hpp>
14 #include <KokkosSparse_spgemm.hpp>
15 #include <KokkosSparse_spadd.hpp>
16 #include <KokkosBatched_LU_Decl.hpp>
17 #include <KokkosBatched_InverseLU_Decl.hpp>
18 
19 #include <../src/mat/impls/aij/seq/kokkos/aijkok.hpp>
20 
21 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_GE(3, 7, 0)
22   #include <KokkosSparse_Utils.hpp>
23 using KokkosSparse::sort_crs_matrix;
24 using KokkosSparse::Impl::transpose_matrix;
25 #else
26   #include <KokkosKernels_Sorting.hpp>
27 using KokkosKernels::sort_crs_matrix;
28 using KokkosKernels::Impl::transpose_matrix;
29 #endif
30 
31 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */
32 
33 /* MatAssemblyEnd_SeqAIJKokkos() happens when we finalized nonzeros of the matrix, either after
34    we assembled the matrix on host, or after we directly produced the matrix data on device (ex., through MatMatMult).
35    In the latter case, it is important to set a_dual's sync state correctly.
36  */
37 static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A, MatAssemblyType mode)
38 {
39   Mat_SeqAIJ       *aijseq;
40   Mat_SeqAIJKokkos *aijkok;
41 
42   PetscFunctionBegin;
43   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS);
44   PetscCall(MatAssemblyEnd_SeqAIJ(A, mode));
45 
46   aijseq = static_cast<Mat_SeqAIJ *>(A->data);
47   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
48 
49   /* If aijkok does not exist, we just copy i, j to device.
50      If aijkok already exists, but the device's nonzero pattern does not match with the host's, we assume the latest data is on host.
51      In both cases, we build a new aijkok structure.
52   */
53   if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { /* aijkok might not exist yet or nonzero pattern has changed */
54     delete aijkok;
55     aijkok   = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aijseq->nz, aijseq->i, aijseq->j, aijseq->a, A->nonzerostate, PETSC_FALSE /*don't copy mat values to device*/);
56     A->spptr = aijkok;
57   }
58 
59   PetscFunctionReturn(PETSC_SUCCESS);
60 }
61 
62 /* Sync CSR data to device if not yet */
63 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A)
64 {
65   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
66 
67   PetscFunctionBegin;
68   PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Can't sync factorized matrix from host to device");
69   PetscCheck(aijkok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
70   if (aijkok->a_dual.need_sync_device()) {
71     aijkok->a_dual.sync_device();
72     aijkok->transpose_updated = PETSC_FALSE; /* values of the transpose is out-of-date */
73     aijkok->hermitian_updated = PETSC_FALSE;
74   }
75   PetscFunctionReturn(PETSC_SUCCESS);
76 }
77 
78 /* Mark the CSR data on device as modified */
79 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosModifyDevice(Mat A)
80 {
81   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
82 
83   PetscFunctionBegin;
84   PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Not supported for factorized matries");
85   aijkok->a_dual.clear_sync_state();
86   aijkok->a_dual.modify_device();
87   aijkok->transpose_updated = PETSC_FALSE;
88   aijkok->hermitian_updated = PETSC_FALSE;
89   PetscCall(MatSeqAIJInvalidateDiagonal(A));
90   PetscCall(PetscObjectStateIncrease((PetscObject)A));
91   PetscFunctionReturn(PETSC_SUCCESS);
92 }
93 
94 static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A)
95 {
96   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
97 
98   PetscFunctionBegin;
99   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
100   /* We do not expect one needs factors on host  */
101   PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Can't sync factorized matrix from device to host");
102   PetscCheck(aijkok, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing AIJKOK");
103   aijkok->a_dual.sync_host();
104   PetscFunctionReturn(PETSC_SUCCESS);
105 }
106 
107 static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A, PetscScalar *array[])
108 {
109   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
110 
111   PetscFunctionBegin;
112   /* aijkok contains valid pointers only if the host's nonzerostate matches with the device's.
113     Calling MatSeqAIJSetPreallocation() or MatSetValues() on host, where aijseq->{i,j,a} might be
114     reallocated, will lead to stale {i,j,a}_dual in aijkok. In both operations, the hosts's nonzerostate
115     must have been updated. The stale aijkok will be rebuilt during MatAssemblyEnd.
116   */
117   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
118     aijkok->a_dual.sync_host();
119     *array = aijkok->a_dual.view_host().data();
120   } else { /* Happens when calling MatSetValues on a newly created matrix */
121     *array = static_cast<Mat_SeqAIJ *>(A->data)->a;
122   }
123   PetscFunctionReturn(PETSC_SUCCESS);
124 }
125 
126 static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJKokkos(Mat A, PetscScalar *array[])
127 {
128   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
129 
130   PetscFunctionBegin;
131   if (aijkok && A->nonzerostate == aijkok->nonzerostate) aijkok->a_dual.modify_host();
132   PetscFunctionReturn(PETSC_SUCCESS);
133 }
134 
135 static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[])
136 {
137   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
138 
139   PetscFunctionBegin;
140   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
141     aijkok->a_dual.sync_host();
142     *array = aijkok->a_dual.view_host().data();
143   } else {
144     *array = static_cast<Mat_SeqAIJ *>(A->data)->a;
145   }
146   PetscFunctionReturn(PETSC_SUCCESS);
147 }
148 
149 static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[])
150 {
151   PetscFunctionBegin;
152   *array = NULL;
153   PetscFunctionReturn(PETSC_SUCCESS);
154 }
155 
156 static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[])
157 {
158   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
159 
160   PetscFunctionBegin;
161   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
162     *array = aijkok->a_dual.view_host().data();
163   } else { /* Ex. happens with MatZeroEntries on a preallocated but not assembled matrix */
164     *array = static_cast<Mat_SeqAIJ *>(A->data)->a;
165   }
166   PetscFunctionReturn(PETSC_SUCCESS);
167 }
168 
169 static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[])
170 {
171   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
172 
173   PetscFunctionBegin;
174   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
175     aijkok->a_dual.clear_sync_state();
176     aijkok->a_dual.modify_host();
177   }
178   PetscFunctionReturn(PETSC_SUCCESS);
179 }
180 
181 static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJKokkos(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype)
182 {
183   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
184 
185   PetscFunctionBegin;
186   PetscCheck(aijkok != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "aijkok is NULL");
187 
188   if (i) *i = aijkok->i_device_data();
189   if (j) *j = aijkok->j_device_data();
190   if (a) {
191     aijkok->a_dual.sync_device();
192     *a = aijkok->a_device_data();
193   }
194   if (mtype) *mtype = PETSC_MEMTYPE_KOKKOS;
195   PetscFunctionReturn(PETSC_SUCCESS);
196 }
197 
198 /*
199   Generate the sparsity pattern of a MatSeqAIJKokkos matrix's transpose on device.
200 
201   Input Parameter:
202 .  A       - the MATSEQAIJKOKKOS matrix
203 
204   Output Parameters:
205 +  perm_d - the permutation array on device, which connects Ta(i) = Aa(perm(i))
206 -  T_d    - the transpose on device, whose value array is allocated but not initialized
207 */
208 static PetscErrorCode MatSeqAIJKokkosGenerateTransposeStructure(Mat A, MatRowMapKokkosView &perm_d, KokkosCsrMatrix &T_d)
209 {
210   Mat_SeqAIJ             *aseq = static_cast<Mat_SeqAIJ *>(A->data);
211   PetscInt                nz = aseq->nz, m = A->rmap->N, n = A->cmap->n;
212   const PetscInt         *Ai = aseq->i, *Aj = aseq->j;
213   MatRowMapKokkosViewHost Ti_h(NoInit("Ti"), n + 1);
214   MatRowMapType          *Ti = Ti_h.data();
215   MatColIdxKokkosViewHost Tj_h(NoInit("Tj"), nz);
216   MatRowMapKokkosViewHost perm_h(NoInit("permutation"), nz);
217   PetscInt               *Tj   = Tj_h.data();
218   PetscInt               *perm = perm_h.data();
219   PetscInt               *offset;
220 
221   PetscFunctionBegin;
222   // Populate Ti
223   PetscCallCXX(Kokkos::deep_copy(Ti_h, 0));
224   Ti++;
225   for (PetscInt i = 0; i < nz; i++) Ti[Aj[i]]++;
226   Ti--;
227   for (PetscInt i = 0; i < n; i++) Ti[i + 1] += Ti[i];
228 
229   // Populate Tj and the permutation array
230   PetscCall(PetscCalloc1(n, &offset)); // offset in each T row to fill in its column indices
231   for (PetscInt i = 0; i < m; i++) {
232     for (PetscInt j = Ai[i]; j < Ai[i + 1]; j++) { // A's (i,j) is T's (j,i)
233       PetscInt r    = Aj[j];                       // row r of T
234       PetscInt disp = Ti[r] + offset[r];
235 
236       Tj[disp]   = i; // col i of T
237       perm[disp] = j;
238       offset[r]++;
239     }
240   }
241   PetscCall(PetscFree(offset));
242 
243   // Sort each row of T, along with the permutation array
244   for (PetscInt i = 0; i < n; i++) PetscCall(PetscSortIntWithArray(Ti[i + 1] - Ti[i], Tj + Ti[i], perm + Ti[i]));
245 
246   // Output perm and T on device
247   auto Ti_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), Ti_h);
248   auto Tj_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), Tj_h);
249   PetscCallCXX(T_d = KokkosCsrMatrix("csrmatT", n, m, nz, MatScalarKokkosView("Ta", nz), Ti_d, Tj_d));
250   PetscCallCXX(perm_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), perm_h));
251   PetscFunctionReturn(PETSC_SUCCESS);
252 }
253 
254 // Generate the transpose on device and cache it internally
255 // Note: KK transpose_matrix() does not have support symbolic/numeric transpose, so we do it on our own
256 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGenerateTranspose_Private(Mat A, KokkosCsrMatrix *csrmatT)
257 {
258   Mat_SeqAIJ       *aseq = static_cast<Mat_SeqAIJ *>(A->data);
259   Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
260   PetscInt          nz = aseq->nz, m = A->rmap->N, n = A->cmap->n;
261   KokkosCsrMatrix  &T = akok->csrmatT;
262 
263   PetscFunctionBegin;
264   PetscCheck(akok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
265   PetscCallCXX(akok->a_dual.sync_device()); // Sync A's values since we are going to access them on device
266 
267   const auto &Aa = akok->a_dual.view_device();
268 
269   if (A->symmetric == PETSC_BOOL3_TRUE) {
270     *csrmatT = akok->csrmat;
271   } else {
272     // See if we already have a cached transpose and its value is up to date
273     if (T.numRows() == n && T.numCols() == m) {  // this indicates csrmatT had been generated before, otherwise T has 0 rows/cols after construction
274       if (!akok->transpose_updated) {            // if the value is out of date, update the cached version
275         const auto &perm = akok->transpose_perm; // get the permutation array
276         auto       &Ta   = T.values;
277 
278         PetscCallCXX(Kokkos::parallel_for(
279           nz, KOKKOS_LAMBDA(const PetscInt i) { Ta(i) = Aa(perm(i)); }));
280       }
281     } else { // Generate T of size n x m for the first time
282       MatRowMapKokkosView perm;
283 
284       PetscCall(MatSeqAIJKokkosGenerateTransposeStructure(A, perm, T));
285       akok->transpose_perm = perm; // cache the perm in this matrix for reuse
286       PetscCallCXX(Kokkos::parallel_for(
287         nz, KOKKOS_LAMBDA(const PetscInt i) { T.values(i) = Aa(perm(i)); }));
288     }
289     akok->transpose_updated = PETSC_TRUE;
290     *csrmatT                = akok->csrmatT;
291   }
292   PetscFunctionReturn(PETSC_SUCCESS);
293 }
294 
295 // Generate the Hermitian on device and cache it internally
296 static PetscErrorCode MatSeqAIJKokkosGenerateHermitian_Private(Mat A, KokkosCsrMatrix *csrmatH)
297 {
298   Mat_SeqAIJ       *aseq = static_cast<Mat_SeqAIJ *>(A->data);
299   Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
300   PetscInt          nz = aseq->nz, m = A->rmap->N, n = A->cmap->n;
301   KokkosCsrMatrix  &T = akok->csrmatH;
302 
303   PetscFunctionBegin;
304   PetscCheck(akok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
305   PetscCallCXX(akok->a_dual.sync_device()); // Sync A's values since we are going to access them on device
306 
307   const auto &Aa = akok->a_dual.view_device();
308 
309   if (A->hermitian == PETSC_BOOL3_TRUE) {
310     *csrmatH = akok->csrmat;
311   } else {
312     // See if we already have a cached hermitian and its value is up to date
313     if (T.numRows() == n && T.numCols() == m) {  // this indicates csrmatT had been generated before, otherwise T has 0 rows/cols after construction
314       if (!akok->hermitian_updated) {            // if the value is out of date, update the cached version
315         const auto &perm = akok->transpose_perm; // get the permutation array
316         auto       &Ta   = T.values;
317 
318         PetscCallCXX(Kokkos::parallel_for(
319           nz, KOKKOS_LAMBDA(const PetscInt i) { Ta(i) = PetscConj(Aa(perm(i))); }));
320       }
321     } else { // Generate T of size n x m for the first time
322       MatRowMapKokkosView perm;
323 
324       PetscCall(MatSeqAIJKokkosGenerateTransposeStructure(A, perm, T));
325       akok->transpose_perm = perm; // cache the perm in this matrix for reuse
326       PetscCallCXX(Kokkos::parallel_for(
327         nz, KOKKOS_LAMBDA(const PetscInt i) { T.values(i) = PetscConj(Aa(perm(i))); }));
328     }
329     akok->hermitian_updated = PETSC_TRUE;
330     *csrmatH                = akok->csrmatH;
331   }
332   PetscFunctionReturn(PETSC_SUCCESS);
333 }
334 
335 /* y = A x */
336 static PetscErrorCode MatMult_SeqAIJKokkos(Mat A, Vec xx, Vec yy)
337 {
338   Mat_SeqAIJKokkos          *aijkok;
339   ConstPetscScalarKokkosView xv;
340   PetscScalarKokkosView      yv;
341 
342   PetscFunctionBegin;
343   PetscCall(PetscLogGpuTimeBegin());
344   PetscCall(MatSeqAIJKokkosSyncDevice(A));
345   PetscCall(VecGetKokkosView(xx, &xv));
346   PetscCall(VecGetKokkosViewWrite(yy, &yv));
347   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
348   PetscCallCXX(KokkosSparse::spmv("N", 1.0 /*alpha*/, aijkok->csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A x + beta y */
349   PetscCall(VecRestoreKokkosView(xx, &xv));
350   PetscCall(VecRestoreKokkosViewWrite(yy, &yv));
351   /* 2.0*nnz - numRows seems more accurate here but assumes there are no zero-rows. So a little sloppy here. */
352   PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz()));
353   PetscCall(PetscLogGpuTimeEnd());
354   PetscFunctionReturn(PETSC_SUCCESS);
355 }
356 
357 /* y = A^T x */
358 static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy)
359 {
360   Mat_SeqAIJKokkos          *aijkok;
361   const char                *mode;
362   ConstPetscScalarKokkosView xv;
363   PetscScalarKokkosView      yv;
364   KokkosCsrMatrix            csrmat;
365 
366   PetscFunctionBegin;
367   PetscCall(PetscLogGpuTimeBegin());
368   PetscCall(MatSeqAIJKokkosSyncDevice(A));
369   PetscCall(VecGetKokkosView(xx, &xv));
370   PetscCall(VecGetKokkosViewWrite(yy, &yv));
371   if (A->form_explicit_transpose) {
372     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat));
373     mode = "N";
374   } else {
375     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
376     csrmat = aijkok->csrmat;
377     mode   = "T";
378   }
379   PetscCallCXX(KokkosSparse::spmv(mode, 1.0 /*alpha*/, csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^T x + beta y */
380   PetscCall(VecRestoreKokkosView(xx, &xv));
381   PetscCall(VecRestoreKokkosViewWrite(yy, &yv));
382   PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
383   PetscCall(PetscLogGpuTimeEnd());
384   PetscFunctionReturn(PETSC_SUCCESS);
385 }
386 
387 /* y = A^H x */
388 static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy)
389 {
390   Mat_SeqAIJKokkos          *aijkok;
391   const char                *mode;
392   ConstPetscScalarKokkosView xv;
393   PetscScalarKokkosView      yv;
394   KokkosCsrMatrix            csrmat;
395 
396   PetscFunctionBegin;
397   PetscCall(PetscLogGpuTimeBegin());
398   PetscCall(MatSeqAIJKokkosSyncDevice(A));
399   PetscCall(VecGetKokkosView(xx, &xv));
400   PetscCall(VecGetKokkosViewWrite(yy, &yv));
401   if (A->form_explicit_transpose) {
402     PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat));
403     mode = "N";
404   } else {
405     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
406     csrmat = aijkok->csrmat;
407     mode   = "C";
408   }
409   PetscCallCXX(KokkosSparse::spmv(mode, 1.0 /*alpha*/, csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^H x + beta y */
410   PetscCall(VecRestoreKokkosView(xx, &xv));
411   PetscCall(VecRestoreKokkosViewWrite(yy, &yv));
412   PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
413   PetscCall(PetscLogGpuTimeEnd());
414   PetscFunctionReturn(PETSC_SUCCESS);
415 }
416 
417 /* z = A x + y */
418 static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz)
419 {
420   Mat_SeqAIJKokkos          *aijkok;
421   ConstPetscScalarKokkosView xv, yv;
422   PetscScalarKokkosView      zv;
423 
424   PetscFunctionBegin;
425   PetscCall(PetscLogGpuTimeBegin());
426   PetscCall(MatSeqAIJKokkosSyncDevice(A));
427   PetscCall(VecGetKokkosView(xx, &xv));
428   PetscCall(VecGetKokkosView(yy, &yv));
429   PetscCall(VecGetKokkosViewWrite(zz, &zv));
430   if (zz != yy) Kokkos::deep_copy(zv, yv);
431   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
432   PetscCallCXX(KokkosSparse::spmv("N", 1.0 /*alpha*/, aijkok->csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A x + beta z */
433   PetscCall(VecRestoreKokkosView(xx, &xv));
434   PetscCall(VecRestoreKokkosView(yy, &yv));
435   PetscCall(VecRestoreKokkosViewWrite(zz, &zv));
436   PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz()));
437   PetscCall(PetscLogGpuTimeEnd());
438   PetscFunctionReturn(PETSC_SUCCESS);
439 }
440 
441 /* z = A^T x + y */
442 static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz)
443 {
444   Mat_SeqAIJKokkos          *aijkok;
445   const char                *mode;
446   ConstPetscScalarKokkosView xv, yv;
447   PetscScalarKokkosView      zv;
448   KokkosCsrMatrix            csrmat;
449 
450   PetscFunctionBegin;
451   PetscCall(PetscLogGpuTimeBegin());
452   PetscCall(MatSeqAIJKokkosSyncDevice(A));
453   PetscCall(VecGetKokkosView(xx, &xv));
454   PetscCall(VecGetKokkosView(yy, &yv));
455   PetscCall(VecGetKokkosViewWrite(zz, &zv));
456   if (zz != yy) Kokkos::deep_copy(zv, yv);
457   if (A->form_explicit_transpose) {
458     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat));
459     mode = "N";
460   } else {
461     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
462     csrmat = aijkok->csrmat;
463     mode   = "T";
464   }
465   PetscCallCXX(KokkosSparse::spmv(mode, 1.0 /*alpha*/, csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^T x + beta z */
466   PetscCall(VecRestoreKokkosView(xx, &xv));
467   PetscCall(VecRestoreKokkosView(yy, &yv));
468   PetscCall(VecRestoreKokkosViewWrite(zz, &zv));
469   PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
470   PetscCall(PetscLogGpuTimeEnd());
471   PetscFunctionReturn(PETSC_SUCCESS);
472 }
473 
474 /* z = A^H x + y */
475 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz)
476 {
477   Mat_SeqAIJKokkos          *aijkok;
478   const char                *mode;
479   ConstPetscScalarKokkosView xv, yv;
480   PetscScalarKokkosView      zv;
481   KokkosCsrMatrix            csrmat;
482 
483   PetscFunctionBegin;
484   PetscCall(PetscLogGpuTimeBegin());
485   PetscCall(MatSeqAIJKokkosSyncDevice(A));
486   PetscCall(VecGetKokkosView(xx, &xv));
487   PetscCall(VecGetKokkosView(yy, &yv));
488   PetscCall(VecGetKokkosViewWrite(zz, &zv));
489   if (zz != yy) Kokkos::deep_copy(zv, yv);
490   if (A->form_explicit_transpose) {
491     PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat));
492     mode = "N";
493   } else {
494     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
495     csrmat = aijkok->csrmat;
496     mode   = "C";
497   }
498   PetscCallCXX(KokkosSparse::spmv(mode, 1.0 /*alpha*/, csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^H x + beta z */
499   PetscCall(VecRestoreKokkosView(xx, &xv));
500   PetscCall(VecRestoreKokkosView(yy, &yv));
501   PetscCall(VecRestoreKokkosViewWrite(zz, &zv));
502   PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
503   PetscCall(PetscLogGpuTimeEnd());
504   PetscFunctionReturn(PETSC_SUCCESS);
505 }
506 
507 PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A, MatOption op, PetscBool flg)
508 {
509   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
510 
511   PetscFunctionBegin;
512   switch (op) {
513   case MAT_FORM_EXPLICIT_TRANSPOSE:
514     /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
515     if (A->form_explicit_transpose && !flg && aijkok) PetscCall(aijkok->DestroyMatTranspose());
516     A->form_explicit_transpose = flg;
517     break;
518   default:
519     PetscCall(MatSetOption_SeqAIJ(A, op, flg));
520     break;
521   }
522   PetscFunctionReturn(PETSC_SUCCESS);
523 }
524 
525 /* Depending on reuse, either build a new mat, or use the existing mat */
526 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat *newmat)
527 {
528   Mat_SeqAIJ *aseq;
529 
530   PetscFunctionBegin;
531   PetscCall(PetscKokkosInitializeCheck());
532   if (reuse == MAT_INITIAL_MATRIX) {                      /* Build a brand new mat */
533     PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat));  /* the returned newmat is a SeqAIJKokkos */
534   } else if (reuse == MAT_REUSE_MATRIX) {                 /* Reuse the mat created before */
535     PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN)); /* newmat is already a SeqAIJKokkos */
536   } else if (reuse == MAT_INPLACE_MATRIX) {               /* newmat is A */
537     PetscCheck(A == *newmat, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "A != *newmat with MAT_INPLACE_MATRIX");
538     PetscCall(PetscFree(A->defaultvectype));
539     PetscCall(PetscStrallocpy(VECKOKKOS, &A->defaultvectype)); /* Allocate and copy the string */
540     PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATSEQAIJKOKKOS));
541     PetscCall(MatSetOps_SeqAIJKokkos(A));
542     aseq = static_cast<Mat_SeqAIJ *>(A->data);
543     if (A->assembled) { /* Copy i, j (but not values) to device for an assembled matrix if not yet */
544       PetscCheck(!A->spptr, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Expect NULL (Mat_SeqAIJKokkos*)A->spptr");
545       A->spptr = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aseq->nz, aseq->i, aseq->j, aseq->a, A->nonzerostate, PETSC_FALSE);
546     }
547   }
548   PetscFunctionReturn(PETSC_SUCCESS);
549 }
550 
551 /* MatDuplicate always creates a new matrix. MatDuplicate can be called either on an assembled matrix or
552    an unassembled matrix, even though MAT_COPY_VALUES is not allowed for unassembled matrix.
553  */
554 static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A, MatDuplicateOption dupOption, Mat *B)
555 {
556   Mat_SeqAIJ       *bseq;
557   Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr), *bkok;
558   Mat               mat;
559 
560   PetscFunctionBegin;
561   /* Do not copy values on host as A's latest values might be on device. We don't want to do sync blindly */
562   PetscCall(MatDuplicate_SeqAIJ(A, MAT_DO_NOT_COPY_VALUES, B));
563   mat = *B;
564   if (A->preallocated) {
565     bseq = static_cast<Mat_SeqAIJ *>(mat->data);
566     bkok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, bseq->nz, bseq->i, bseq->j, bseq->a, mat->nonzerostate, PETSC_FALSE);
567     bkok->a_dual.clear_sync_state(); /* Clear B's sync state as it will be decided below */
568     /* Now copy values to B if needed */
569     if (dupOption == MAT_COPY_VALUES) {
570       if (akok->a_dual.need_sync_device()) {
571         Kokkos::deep_copy(bkok->a_dual.view_host(), akok->a_dual.view_host());
572         bkok->a_dual.modify_host();
573       } else { /* If device has the latest data, we only copy data on device */
574         Kokkos::deep_copy(bkok->a_dual.view_device(), akok->a_dual.view_device());
575         bkok->a_dual.modify_device();
576       }
577     } else { /* MAT_DO_NOT_COPY_VALUES or MAT_SHARE_NONZERO_PATTERN. B's values should be zeroed */
578       /* B's values on host should be already zeroed by MatDuplicate_SeqAIJ() */
579       bkok->a_dual.modify_host();
580     }
581     mat->spptr = bkok;
582   }
583 
584   PetscCall(PetscFree(mat->defaultvectype));
585   PetscCall(PetscStrallocpy(VECKOKKOS, &mat->defaultvectype)); /* Allocate and copy the string */
586   PetscCall(PetscObjectChangeTypeName((PetscObject)mat, MATSEQAIJKOKKOS));
587   PetscCall(MatSetOps_SeqAIJKokkos(mat));
588   PetscFunctionReturn(PETSC_SUCCESS);
589 }
590 
591 static PetscErrorCode MatTranspose_SeqAIJKokkos(Mat A, MatReuse reuse, Mat *B)
592 {
593   Mat               At;
594   KokkosCsrMatrix   internT;
595   Mat_SeqAIJKokkos *atkok, *bkok;
596 
597   PetscFunctionBegin;
598   if (reuse == MAT_REUSE_MATRIX) PetscCall(MatTransposeCheckNonzeroState_Private(A, *B));
599   PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &internT)); /* Generate a transpose internally */
600   if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_INPLACE_MATRIX) {
601     /* Deep copy internT, as we want to isolate the internal transpose */
602     PetscCallCXX(atkok = new Mat_SeqAIJKokkos(KokkosCsrMatrix("csrmat", internT)));
603     PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PetscObjectComm((PetscObject)A), atkok, &At));
604     if (reuse == MAT_INITIAL_MATRIX) *B = At;
605     else PetscCall(MatHeaderReplace(A, &At)); /* Replace A with At inplace */
606   } else {                                    /* MAT_REUSE_MATRIX, just need to copy values to B on device */
607     if ((*B)->assembled) {
608       bkok = static_cast<Mat_SeqAIJKokkos *>((*B)->spptr);
609       PetscCallCXX(Kokkos::deep_copy(bkok->a_dual.view_device(), internT.values));
610       PetscCall(MatSeqAIJKokkosModifyDevice(*B));
611     } else if ((*B)->preallocated) { /* It is ok for B to be only preallocated, as needed in MatTranspose_MPIAIJ */
612       Mat_SeqAIJ             *bseq = static_cast<Mat_SeqAIJ *>((*B)->data);
613       MatScalarKokkosViewHost a_h(bseq->a, internT.nnz()); /* bseq->nz = 0 if unassembled */
614       MatColIdxKokkosViewHost j_h(bseq->j, internT.nnz());
615       PetscCallCXX(Kokkos::deep_copy(a_h, internT.values));
616       PetscCallCXX(Kokkos::deep_copy(j_h, internT.graph.entries));
617     } else SETERRQ(PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "B must be assembled or preallocated");
618   }
619   PetscFunctionReturn(PETSC_SUCCESS);
620 }
621 
622 static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A)
623 {
624   Mat_SeqAIJKokkos *aijkok;
625 
626   PetscFunctionBegin;
627   if (A->factortype == MAT_FACTOR_NONE) {
628     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
629     delete aijkok;
630   } else {
631     delete static_cast<Mat_SeqAIJKokkosTriFactors *>(A->spptr);
632   }
633   A->spptr = NULL;
634   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatFactorGetSolverType_C", NULL));
635   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
636   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
637   PetscCall(MatDestroy_SeqAIJ(A));
638   PetscFunctionReturn(PETSC_SUCCESS);
639 }
640 
641 /*MC
642    MATSEQAIJKOKKOS - MATAIJKOKKOS = "(seq)aijkokkos" - A matrix type to be used for sparse matrices with Kokkos
643 
644    A matrix type type using Kokkos-Kernels CrsMatrix type for portability across different device types
645 
646    Options Database Key:
647 .  -mat_type aijkokkos - sets the matrix type to `MATSEQAIJKOKKOS` during a call to `MatSetFromOptions()`
648 
649   Level: beginner
650 
651 .seealso: [](ch_matrices), `Mat`, `MatCreateSeqAIJKokkos()`, `MATMPIAIJKOKKOS`
652 M*/
653 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A)
654 {
655   PetscFunctionBegin;
656   PetscCall(PetscKokkosInitializeCheck());
657   PetscCall(MatCreate_SeqAIJ(A));
658   PetscCall(MatConvert_SeqAIJ_SeqAIJKokkos(A, MATSEQAIJKOKKOS, MAT_INPLACE_MATRIX, &A));
659   PetscFunctionReturn(PETSC_SUCCESS);
660 }
661 
662 /* Merge A, B into a matrix C. A is put before B. C's size would be A->rmap->n by (A->cmap->n + B->cmap->n) */
663 PetscErrorCode MatSeqAIJKokkosMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C)
664 {
665   Mat_SeqAIJ         *a, *b;
666   Mat_SeqAIJKokkos   *akok, *bkok, *ckok;
667   MatScalarKokkosView aa, ba, ca;
668   MatRowMapKokkosView ai, bi, ci;
669   MatColIdxKokkosView aj, bj, cj;
670   PetscInt            m, n, nnz, aN;
671 
672   PetscFunctionBegin;
673   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
674   PetscValidHeaderSpecific(B, MAT_CLASSID, 2);
675   PetscAssertPointer(C, 4);
676   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
677   PetscCheckTypeName(B, MATSEQAIJKOKKOS);
678   PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n);
679   PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported");
680 
681   PetscCall(MatSeqAIJKokkosSyncDevice(A));
682   PetscCall(MatSeqAIJKokkosSyncDevice(B));
683   a    = static_cast<Mat_SeqAIJ *>(A->data);
684   b    = static_cast<Mat_SeqAIJ *>(B->data);
685   akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
686   bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
687   aa   = akok->a_dual.view_device();
688   ai   = akok->i_dual.view_device();
689   ba   = bkok->a_dual.view_device();
690   bi   = bkok->i_dual.view_device();
691   m    = A->rmap->n; /* M, N and nnz of C */
692   n    = A->cmap->n + B->cmap->n;
693   nnz  = a->nz + b->nz;
694   aN   = A->cmap->n; /* N of A */
695   if (reuse == MAT_INITIAL_MATRIX) {
696     aj           = akok->j_dual.view_device();
697     bj           = bkok->j_dual.view_device();
698     auto ca_dual = MatScalarKokkosDualView("a", aa.extent(0) + ba.extent(0));
699     auto ci_dual = MatRowMapKokkosDualView("i", ai.extent(0));
700     auto cj_dual = MatColIdxKokkosDualView("j", aj.extent(0) + bj.extent(0));
701     ca           = ca_dual.view_device();
702     ci           = ci_dual.view_device();
703     cj           = cj_dual.view_device();
704 
705     /* Concatenate A and B in parallel using Kokkos hierarchical parallelism */
706     Kokkos::parallel_for(
707       Kokkos::TeamPolicy<>(m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
708         PetscInt i       = t.league_rank(); /* row i */
709         PetscInt coffset = ai(i) + bi(i), alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i);
710 
711         Kokkos::single(Kokkos::PerTeam(t), [=]() { /* this side effect only happens once per whole team */
712                                                    ci(i) = coffset;
713                                                    if (i == m - 1) ci(m) = ai(m) + bi(m);
714         });
715 
716         Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) {
717           if (k < alen) {
718             ca(coffset + k) = aa(ai(i) + k);
719             cj(coffset + k) = aj(ai(i) + k);
720           } else {
721             ca(coffset + k) = ba(bi(i) + k - alen);
722             cj(coffset + k) = bj(bi(i) + k - alen) + aN; /* Entries in B get new column indices in C */
723           }
724         });
725       });
726     ca_dual.modify_device();
727     ci_dual.modify_device();
728     cj_dual.modify_device();
729     PetscCallCXX(ckok = new Mat_SeqAIJKokkos(m, n, nnz, ci_dual, cj_dual, ca_dual));
730     PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, ckok, C));
731   } else if (reuse == MAT_REUSE_MATRIX) {
732     PetscValidHeaderSpecific(*C, MAT_CLASSID, 4);
733     PetscCheckTypeName(*C, MATSEQAIJKOKKOS);
734     ckok = static_cast<Mat_SeqAIJKokkos *>((*C)->spptr);
735     ca   = ckok->a_dual.view_device();
736     ci   = ckok->i_dual.view_device();
737 
738     Kokkos::parallel_for(
739       Kokkos::TeamPolicy<>(m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
740         PetscInt i    = t.league_rank(); /* row i */
741         PetscInt alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i);
742         Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) {
743           if (k < alen) ca(ci(i) + k) = aa(ai(i) + k);
744           else ca(ci(i) + k) = ba(bi(i) + k - alen);
745         });
746       });
747     PetscCall(MatSeqAIJKokkosModifyDevice(*C));
748   }
749   PetscFunctionReturn(PETSC_SUCCESS);
750 }
751 
752 static PetscErrorCode MatProductDataDestroy_SeqAIJKokkos(void *pdata)
753 {
754   PetscFunctionBegin;
755   delete static_cast<MatProductData_SeqAIJKokkos *>(pdata);
756   PetscFunctionReturn(PETSC_SUCCESS);
757 }
758 
759 static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C)
760 {
761   Mat_Product                 *product = C->product;
762   Mat                          A, B;
763   bool                         transA, transB; /* use bool, since KK needs this type */
764   Mat_SeqAIJKokkos            *akok, *bkok, *ckok;
765   Mat_SeqAIJ                  *c;
766   MatProductData_SeqAIJKokkos *pdata;
767   KokkosCsrMatrix              csrmatA, csrmatB;
768 
769   PetscFunctionBegin;
770   MatCheckProduct(C, 1);
771   PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Product data empty");
772   pdata = static_cast<MatProductData_SeqAIJKokkos *>(C->product->data);
773 
774   // See if numeric has already been done in symbolic (e.g., user calls MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C)).
775   // If yes, skip the numeric, but reset the flag so that next time when user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C),
776   // we still do numeric.
777   if (pdata->reusesym) { // numeric reuses results from symbolic
778     pdata->reusesym = PETSC_FALSE;
779     PetscFunctionReturn(PETSC_SUCCESS);
780   }
781 
782   switch (product->type) {
783   case MATPRODUCT_AB:
784     transA = false;
785     transB = false;
786     break;
787   case MATPRODUCT_AtB:
788     transA = true;
789     transB = false;
790     break;
791   case MATPRODUCT_ABt:
792     transA = false;
793     transB = true;
794     break;
795   default:
796     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]);
797   }
798 
799   A = product->A;
800   B = product->B;
801   PetscCall(MatSeqAIJKokkosSyncDevice(A));
802   PetscCall(MatSeqAIJKokkosSyncDevice(B));
803   akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
804   bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
805   ckok = static_cast<Mat_SeqAIJKokkos *>(C->spptr);
806 
807   PetscCheck(ckok, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Device data structure spptr is empty");
808 
809   csrmatA = akok->csrmat;
810   csrmatB = bkok->csrmat;
811 
812   /* TODO: Once KK spgemm implements transpose, we can get rid of the explicit transpose here */
813   if (transA) {
814     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA));
815     transA = false;
816   }
817 
818   if (transB) {
819     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB));
820     transB = false;
821   }
822   PetscCall(PetscLogGpuTimeBegin());
823   PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, csrmatA, transA, csrmatB, transB, ckok->csrmat));
824 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(4, 0, 0)
825   auto spgemmHandle = pdata->kh.get_spgemm_handle();
826   if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(ckok->csrmat)); /* without sort, mat_tests-ex62_14_seqaijkokkos fails */
827 #endif
828 
829   PetscCall(PetscLogGpuTimeEnd());
830   PetscCall(MatSeqAIJKokkosModifyDevice(C));
831   /* shorter version of MatAssemblyEnd_SeqAIJ */
832   c = (Mat_SeqAIJ *)C->data;
833   PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz));
834   PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n"));
835   PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax));
836   c->reallocs         = 0;
837   C->info.mallocs     = 0;
838   C->info.nz_unneeded = 0;
839   C->assembled = C->was_assembled = PETSC_TRUE;
840   C->num_ass++;
841   PetscFunctionReturn(PETSC_SUCCESS);
842 }
843 
844 static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C)
845 {
846   Mat_Product                 *product = C->product;
847   MatProductType               ptype;
848   Mat                          A, B;
849   bool                         transA, transB;
850   Mat_SeqAIJKokkos            *akok, *bkok, *ckok;
851   MatProductData_SeqAIJKokkos *pdata;
852   MPI_Comm                     comm;
853   KokkosCsrMatrix              csrmatA, csrmatB, csrmatC;
854 
855   PetscFunctionBegin;
856   MatCheckProduct(C, 1);
857   PetscCall(PetscObjectGetComm((PetscObject)C, &comm));
858   PetscCheck(!product->data, comm, PETSC_ERR_PLIB, "Product data not empty");
859   A = product->A;
860   B = product->B;
861   PetscCall(MatSeqAIJKokkosSyncDevice(A));
862   PetscCall(MatSeqAIJKokkosSyncDevice(B));
863   akok    = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
864   bkok    = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
865   csrmatA = akok->csrmat;
866   csrmatB = bkok->csrmat;
867 
868   ptype = product->type;
869   // Take advantage of the symmetry if true
870   if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
871     ptype                                          = MATPRODUCT_AB;
872     product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE;
873   }
874   if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
875     ptype                                          = MATPRODUCT_AB;
876     product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE;
877   }
878 
879   switch (ptype) {
880   case MATPRODUCT_AB:
881     transA = false;
882     transB = false;
883     break;
884   case MATPRODUCT_AtB:
885     transA = true;
886     transB = false;
887     break;
888   case MATPRODUCT_ABt:
889     transA = false;
890     transB = true;
891     break;
892   default:
893     SETERRQ(comm, PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]);
894   }
895   PetscCallCXX(product->data = pdata = new MatProductData_SeqAIJKokkos());
896   pdata->reusesym = product->api_user;
897 
898   /* TODO: add command line options to select spgemm algorithms */
899   auto spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_DEFAULT; /* default alg is TPL if enabled, otherwise KK */
900 
901   /* CUDA-10.2's spgemm has bugs. We prefer the SpGEMMreuse APIs introduced in cuda-11.4 */
902 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
903   #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
904   spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_KK;
905   #endif
906 #endif
907   PetscCallCXX(pdata->kh.create_spgemm_handle(spgemm_alg));
908 
909   PetscCall(PetscLogGpuTimeBegin());
910   /* TODO: Get rid of the explicit transpose once KK-spgemm implements the transpose option */
911   if (transA) {
912     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA));
913     transA = false;
914   }
915 
916   if (transB) {
917     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB));
918     transB = false;
919   }
920 
921   PetscCallCXX(KokkosSparse::spgemm_symbolic(pdata->kh, csrmatA, transA, csrmatB, transB, csrmatC));
922   /* spgemm_symbolic() only populates C's rowmap, but not C's column indices.
923     So we have to do a fake spgemm_numeric() here to get csrmatC.j_d setup, before
924     calling new Mat_SeqAIJKokkos().
925     TODO: Remove the fake spgemm_numeric() after KK fixed this problem.
926   */
927   PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, csrmatA, transA, csrmatB, transB, csrmatC));
928 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(4, 0, 0)
929   /* Query if KK outputs a sorted matrix. If not, we need to sort it */
930   auto spgemmHandle = pdata->kh.get_spgemm_handle();
931   if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(csrmatC)); /* sort_option defaults to -1 in KK!*/
932 #endif
933   PetscCall(PetscLogGpuTimeEnd());
934 
935   PetscCallCXX(ckok = new Mat_SeqAIJKokkos(csrmatC));
936   PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(C, ckok));
937   C->product->destroy = MatProductDataDestroy_SeqAIJKokkos;
938   PetscFunctionReturn(PETSC_SUCCESS);
939 }
940 
941 /* handles sparse matrix matrix ops */
942 static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat)
943 {
944   Mat_Product *product = mat->product;
945   PetscBool    Biskok = PETSC_FALSE, Ciskok = PETSC_TRUE;
946 
947   PetscFunctionBegin;
948   MatCheckProduct(mat, 1);
949   PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJKOKKOS, &Biskok));
950   if (product->type == MATPRODUCT_ABC) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJKOKKOS, &Ciskok));
951   if (Biskok && Ciskok) {
952     switch (product->type) {
953     case MATPRODUCT_AB:
954     case MATPRODUCT_AtB:
955     case MATPRODUCT_ABt:
956       mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos;
957       break;
958     case MATPRODUCT_PtAP:
959     case MATPRODUCT_RARt:
960     case MATPRODUCT_ABC:
961       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
962       break;
963     default:
964       break;
965     }
966   } else { /* fallback for AIJ */
967     PetscCall(MatProductSetFromOptions_SeqAIJ(mat));
968   }
969   PetscFunctionReturn(PETSC_SUCCESS);
970 }
971 
972 static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a)
973 {
974   Mat_SeqAIJKokkos *aijkok;
975 
976   PetscFunctionBegin;
977   PetscCall(PetscLogGpuTimeBegin());
978   PetscCall(MatSeqAIJKokkosSyncDevice(A));
979   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
980   KokkosBlas::scal(aijkok->a_dual.view_device(), a, aijkok->a_dual.view_device());
981   PetscCall(MatSeqAIJKokkosModifyDevice(A));
982   PetscCall(PetscLogGpuFlops(aijkok->a_dual.extent(0)));
983   PetscCall(PetscLogGpuTimeEnd());
984   PetscFunctionReturn(PETSC_SUCCESS);
985 }
986 
987 static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A)
988 {
989   Mat_SeqAIJKokkos *aijkok;
990 
991   PetscFunctionBegin;
992   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
993   if (aijkok) { /* Only zero the device if data is already there */
994     KokkosBlas::fill(aijkok->a_dual.view_device(), 0.0);
995     PetscCall(MatSeqAIJKokkosModifyDevice(A));
996   } else { /* Might be preallocated but not assembled */
997     PetscCall(MatZeroEntries_SeqAIJ(A));
998   }
999   PetscFunctionReturn(PETSC_SUCCESS);
1000 }
1001 
1002 static PetscErrorCode MatGetDiagonal_SeqAIJKokkos(Mat A, Vec x)
1003 {
1004   Mat_SeqAIJ           *aijseq;
1005   Mat_SeqAIJKokkos     *aijkok;
1006   PetscInt              n;
1007   PetscScalarKokkosView xv;
1008 
1009   PetscFunctionBegin;
1010   PetscCall(VecGetLocalSize(x, &n));
1011   PetscCheck(n == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector");
1012   PetscCheck(A->factortype == MAT_FACTOR_NONE, PETSC_COMM_SELF, PETSC_ERR_SUP, "MatGetDiagonal_SeqAIJKokkos not supported on factored matrices");
1013 
1014   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1015   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1016 
1017   if (A->rmap->n && aijkok->diag_dual.extent(0) == 0) { /* Set the diagonal pointer if not already */
1018     PetscCall(MatMarkDiagonal_SeqAIJ(A));
1019     aijseq = static_cast<Mat_SeqAIJ *>(A->data);
1020     aijkok->SetDiagonal(aijseq->diag);
1021   }
1022 
1023   const auto &Aa    = aijkok->a_dual.view_device();
1024   const auto &Ai    = aijkok->i_dual.view_device();
1025   const auto &Adiag = aijkok->diag_dual.view_device();
1026 
1027   PetscCall(VecGetKokkosViewWrite(x, &xv));
1028   Kokkos::parallel_for(
1029     n, KOKKOS_LAMBDA(const PetscInt i) {
1030       if (Adiag(i) < Ai(i + 1)) xv(i) = Aa(Adiag(i));
1031       else xv(i) = 0;
1032     });
1033   PetscCall(VecRestoreKokkosViewWrite(x, &xv));
1034   PetscFunctionReturn(PETSC_SUCCESS);
1035 }
1036 
1037 /* Get a Kokkos View from a mat of type MatSeqAIJKokkos */
1038 PetscErrorCode MatSeqAIJGetKokkosView(Mat A, ConstMatScalarKokkosView *kv)
1039 {
1040   Mat_SeqAIJKokkos *aijkok;
1041 
1042   PetscFunctionBegin;
1043   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1044   PetscAssertPointer(kv, 2);
1045   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1046   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1047   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1048   *kv    = aijkok->a_dual.view_device();
1049   PetscFunctionReturn(PETSC_SUCCESS);
1050 }
1051 
1052 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, ConstMatScalarKokkosView *kv)
1053 {
1054   PetscFunctionBegin;
1055   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1056   PetscAssertPointer(kv, 2);
1057   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1058   PetscFunctionReturn(PETSC_SUCCESS);
1059 }
1060 
1061 PetscErrorCode MatSeqAIJGetKokkosView(Mat A, MatScalarKokkosView *kv)
1062 {
1063   Mat_SeqAIJKokkos *aijkok;
1064 
1065   PetscFunctionBegin;
1066   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1067   PetscAssertPointer(kv, 2);
1068   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1069   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1070   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1071   *kv    = aijkok->a_dual.view_device();
1072   PetscFunctionReturn(PETSC_SUCCESS);
1073 }
1074 
1075 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, MatScalarKokkosView *kv)
1076 {
1077   PetscFunctionBegin;
1078   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1079   PetscAssertPointer(kv, 2);
1080   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1081   PetscCall(MatSeqAIJKokkosModifyDevice(A));
1082   PetscFunctionReturn(PETSC_SUCCESS);
1083 }
1084 
1085 PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A, MatScalarKokkosView *kv)
1086 {
1087   Mat_SeqAIJKokkos *aijkok;
1088 
1089   PetscFunctionBegin;
1090   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1091   PetscAssertPointer(kv, 2);
1092   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1093   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1094   *kv    = aijkok->a_dual.view_device();
1095   PetscFunctionReturn(PETSC_SUCCESS);
1096 }
1097 
1098 PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A, MatScalarKokkosView *kv)
1099 {
1100   PetscFunctionBegin;
1101   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1102   PetscAssertPointer(kv, 2);
1103   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1104   PetscCall(MatSeqAIJKokkosModifyDevice(A));
1105   PetscFunctionReturn(PETSC_SUCCESS);
1106 }
1107 
1108 /* Computes Y += alpha X */
1109 static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y, PetscScalar alpha, Mat X, MatStructure pattern)
1110 {
1111   Mat_SeqAIJ              *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data;
1112   Mat_SeqAIJKokkos        *xkok, *ykok, *zkok;
1113   ConstMatScalarKokkosView Xa;
1114   MatScalarKokkosView      Ya;
1115 
1116   PetscFunctionBegin;
1117   PetscCheckTypeName(Y, MATSEQAIJKOKKOS);
1118   PetscCheckTypeName(X, MATSEQAIJKOKKOS);
1119   PetscCall(MatSeqAIJKokkosSyncDevice(Y));
1120   PetscCall(MatSeqAIJKokkosSyncDevice(X));
1121   PetscCall(PetscLogGpuTimeBegin());
1122 
1123   if (pattern != SAME_NONZERO_PATTERN && x->nz == y->nz) {
1124     PetscBool e;
1125     PetscCall(PetscArraycmp(x->i, y->i, Y->rmap->n + 1, &e));
1126     if (e) {
1127       PetscCall(PetscArraycmp(x->j, y->j, y->nz, &e));
1128       if (e) pattern = SAME_NONZERO_PATTERN;
1129     }
1130   }
1131 
1132   /* cusparseDcsrgeam2() computes C = alpha A + beta B. If one knew sparsity pattern of C, one can skip
1133     cusparseScsrgeam2_bufferSizeExt() / cusparseXcsrgeam2Nnz(), and directly call cusparseScsrgeam2().
1134     If X is SUBSET_NONZERO_PATTERN of Y, we could take advantage of this cusparse feature. However,
1135     KokkosSparse::spadd(alpha,A,beta,B,C) has symbolic and numeric phases, MatAXPY does not.
1136   */
1137   ykok = static_cast<Mat_SeqAIJKokkos *>(Y->spptr);
1138   xkok = static_cast<Mat_SeqAIJKokkos *>(X->spptr);
1139   Xa   = xkok->a_dual.view_device();
1140   Ya   = ykok->a_dual.view_device();
1141 
1142   if (pattern == SAME_NONZERO_PATTERN) {
1143     KokkosBlas::axpy(alpha, Xa, Ya);
1144     PetscCall(MatSeqAIJKokkosModifyDevice(Y));
1145   } else if (pattern == SUBSET_NONZERO_PATTERN) {
1146     MatRowMapKokkosView Xi = xkok->i_dual.view_device(), Yi = ykok->i_dual.view_device();
1147     MatColIdxKokkosView Xj = xkok->j_dual.view_device(), Yj = ykok->j_dual.view_device();
1148 
1149     Kokkos::parallel_for(
1150       Kokkos::TeamPolicy<>(Y->rmap->n, 1), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
1151         PetscInt i = t.league_rank(); // row i
1152         Kokkos::single(Kokkos::PerTeam(t), [=]() {
1153           // Only one thread works in a team
1154           PetscInt p, q = Yi(i);
1155           for (p = Xi(i); p < Xi(i + 1); p++) {          // For each nonzero on row i of X,
1156             while (Xj(p) != Yj(q) && q < Yi(i + 1)) q++; // find the matching nonzero on row i of Y.
1157             if (Xj(p) == Yj(q)) {                        // Found it
1158               Ya(q) += alpha * Xa(p);
1159               q++;
1160             } else {
1161             // If not found, it indicates the input is wrong (X is not a SUBSET_NONZERO_PATTERN of Y).
1162             // Just insert a NaN at the beginning of row i if it is not empty, to make the result wrong.
1163 #if PETSC_PKG_KOKKOS_VERSION_GE(3, 7, 0)
1164               if (Yi(i) != Yi(i + 1)) Ya(Yi(i)) = Kokkos::ArithTraits<PetscScalar>::nan();
1165 #else
1166               if (Yi(i) != Yi(i + 1)) Ya(Yi(i)) = Kokkos::Experimental::nan("1");
1167 #endif
1168             }
1169           }
1170         });
1171       });
1172     PetscCall(MatSeqAIJKokkosModifyDevice(Y));
1173   } else { // different nonzero patterns
1174     Mat             Z;
1175     KokkosCsrMatrix zcsr;
1176     KernelHandle    kh;
1177     kh.create_spadd_handle(true); // X, Y are sorted
1178     KokkosSparse::spadd_symbolic(&kh, xkok->csrmat, ykok->csrmat, zcsr);
1179     KokkosSparse::spadd_numeric(&kh, alpha, xkok->csrmat, (PetscScalar)1.0, ykok->csrmat, zcsr);
1180     zkok = new Mat_SeqAIJKokkos(zcsr);
1181     PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, zkok, &Z));
1182     PetscCall(MatHeaderReplace(Y, &Z));
1183     kh.destroy_spadd_handle();
1184   }
1185   PetscCall(PetscLogGpuTimeEnd());
1186   PetscCall(PetscLogGpuFlops(xkok->a_dual.extent(0) * 2)); // Because we scaled X and then added it to Y
1187   PetscFunctionReturn(PETSC_SUCCESS);
1188 }
1189 
1190 struct MatCOOStruct_SeqAIJKokkos {
1191   PetscCount           n;
1192   PetscCount           Atot;
1193   PetscInt             nz;
1194   PetscCountKokkosView jmap;
1195   PetscCountKokkosView perm;
1196 
1197   MatCOOStruct_SeqAIJKokkos(const MatCOOStruct_SeqAIJ *coo_h)
1198   {
1199     nz   = coo_h->nz;
1200     n    = coo_h->n;
1201     Atot = coo_h->Atot;
1202     jmap = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), PetscCountKokkosViewHost(coo_h->jmap, nz + 1));
1203     perm = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), PetscCountKokkosViewHost(coo_h->perm, Atot));
1204   }
1205 };
1206 
1207 static PetscErrorCode MatCOOStructDestroy_SeqAIJKokkos(void *data)
1208 {
1209   PetscFunctionBegin;
1210   PetscCallCXX(delete static_cast<MatCOOStruct_SeqAIJKokkos *>(data));
1211   PetscFunctionReturn(PETSC_SUCCESS);
1212 }
1213 
1214 static PetscErrorCode MatSetPreallocationCOO_SeqAIJKokkos(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
1215 {
1216   Mat_SeqAIJKokkos          *akok;
1217   Mat_SeqAIJ                *aseq;
1218   PetscContainer             container_h, container_d;
1219   MatCOOStruct_SeqAIJ       *coo_h;
1220   MatCOOStruct_SeqAIJKokkos *coo_d;
1221 
1222   PetscFunctionBegin;
1223   PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, coo_i, coo_j));
1224   aseq = static_cast<Mat_SeqAIJ *>(mat->data);
1225   akok = static_cast<Mat_SeqAIJKokkos *>(mat->spptr);
1226   delete akok;
1227   mat->spptr = akok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, aseq->nz, aseq->i, aseq->j, aseq->a, mat->nonzerostate + 1, PETSC_FALSE);
1228   PetscCall(MatZeroEntries_SeqAIJKokkos(mat));
1229 
1230   // Copy the COO struct to device
1231   PetscCall(PetscObjectQuery((PetscObject)mat, "__PETSc_MatCOOStruct_Host", (PetscObject *)&container_h));
1232   PetscCall(PetscContainerGetPointer(container_h, (void **)&coo_h));
1233   PetscCallCXX(coo_d = new MatCOOStruct_SeqAIJKokkos(coo_h));
1234 
1235   // Put the COO struct in a container and then attach that to the matrix
1236   PetscCall(PetscContainerCreate(PETSC_COMM_SELF, &container_d));
1237   PetscCall(PetscContainerSetPointer(container_d, coo_d));
1238   PetscCall(PetscContainerSetUserDestroy(container_d, MatCOOStructDestroy_SeqAIJKokkos));
1239   PetscCall(PetscObjectCompose((PetscObject)mat, "__PETSc_MatCOOStruct_Device", (PetscObject)container_d));
1240   PetscCall(PetscContainerDestroy(&container_d));
1241   PetscFunctionReturn(PETSC_SUCCESS);
1242 }
1243 
1244 static PetscErrorCode MatSetValuesCOO_SeqAIJKokkos(Mat A, const PetscScalar v[], InsertMode imode)
1245 {
1246   MatScalarKokkosView        Aa;
1247   ConstMatScalarKokkosView   kv;
1248   PetscMemType               memtype;
1249   PetscContainer             container;
1250   MatCOOStruct_SeqAIJKokkos *coo;
1251 
1252   PetscFunctionBegin;
1253   PetscCall(PetscObjectQuery((PetscObject)A, "__PETSc_MatCOOStruct_Device", (PetscObject *)&container));
1254   PetscCall(PetscContainerGetPointer(container, (void **)&coo));
1255 
1256   const auto &n    = coo->n;
1257   const auto &Annz = coo->nz;
1258   const auto &jmap = coo->jmap;
1259   const auto &perm = coo->perm;
1260 
1261   PetscCall(PetscGetMemType(v, &memtype));
1262   if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */
1263     kv = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), ConstMatScalarKokkosViewHost(v, n));
1264   } else {
1265     kv = ConstMatScalarKokkosView(v, n); /* Directly use v[]'s memory */
1266   }
1267 
1268   if (imode == INSERT_VALUES) PetscCall(MatSeqAIJGetKokkosViewWrite(A, &Aa)); /* write matrix values */
1269   else PetscCall(MatSeqAIJGetKokkosView(A, &Aa));                             /* read & write matrix values */
1270 
1271   PetscCall(PetscLogGpuTimeBegin());
1272   Kokkos::parallel_for(
1273     Annz, KOKKOS_LAMBDA(const PetscCount i) {
1274       PetscScalar sum = 0.0;
1275       for (PetscCount k = jmap(i); k < jmap(i + 1); k++) sum += kv(perm(k));
1276       Aa(i) = (imode == INSERT_VALUES ? 0.0 : Aa(i)) + sum;
1277     });
1278   PetscCall(PetscLogGpuTimeEnd());
1279 
1280   if (imode == INSERT_VALUES) PetscCall(MatSeqAIJRestoreKokkosViewWrite(A, &Aa));
1281   else PetscCall(MatSeqAIJRestoreKokkosView(A, &Aa));
1282   PetscFunctionReturn(PETSC_SUCCESS);
1283 }
1284 
1285 static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info)
1286 {
1287   PetscFunctionBegin;
1288   PetscCall(MatSeqAIJKokkosSyncHost(A));
1289   PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info));
1290   B->offloadmask = PETSC_OFFLOAD_CPU;
1291   PetscFunctionReturn(PETSC_SUCCESS);
1292 }
1293 
1294 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A)
1295 {
1296   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
1297 
1298   PetscFunctionBegin;
1299   A->offloadmask = PETSC_OFFLOAD_KOKKOS; /* We do not really use this flag */
1300   A->boundtocpu  = PETSC_FALSE;
1301 
1302   A->ops->assemblyend               = MatAssemblyEnd_SeqAIJKokkos;
1303   A->ops->destroy                   = MatDestroy_SeqAIJKokkos;
1304   A->ops->duplicate                 = MatDuplicate_SeqAIJKokkos;
1305   A->ops->axpy                      = MatAXPY_SeqAIJKokkos;
1306   A->ops->scale                     = MatScale_SeqAIJKokkos;
1307   A->ops->zeroentries               = MatZeroEntries_SeqAIJKokkos;
1308   A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJKokkos;
1309   A->ops->mult                      = MatMult_SeqAIJKokkos;
1310   A->ops->multadd                   = MatMultAdd_SeqAIJKokkos;
1311   A->ops->multtranspose             = MatMultTranspose_SeqAIJKokkos;
1312   A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJKokkos;
1313   A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJKokkos;
1314   A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos;
1315   A->ops->productnumeric            = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos;
1316   A->ops->transpose                 = MatTranspose_SeqAIJKokkos;
1317   A->ops->setoption                 = MatSetOption_SeqAIJKokkos;
1318   A->ops->getdiagonal               = MatGetDiagonal_SeqAIJKokkos;
1319   a->ops->getarray                  = MatSeqAIJGetArray_SeqAIJKokkos;
1320   a->ops->restorearray              = MatSeqAIJRestoreArray_SeqAIJKokkos;
1321   a->ops->getarrayread              = MatSeqAIJGetArrayRead_SeqAIJKokkos;
1322   a->ops->restorearrayread          = MatSeqAIJRestoreArrayRead_SeqAIJKokkos;
1323   a->ops->getarraywrite             = MatSeqAIJGetArrayWrite_SeqAIJKokkos;
1324   a->ops->restorearraywrite         = MatSeqAIJRestoreArrayWrite_SeqAIJKokkos;
1325   a->ops->getcsrandmemtype          = MatSeqAIJGetCSRAndMemType_SeqAIJKokkos;
1326 
1327   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJKokkos));
1328   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJKokkos));
1329   PetscFunctionReturn(PETSC_SUCCESS);
1330 }
1331 
1332 /*
1333    Extract the (prescribled) diagonal blocks of the matrix and then invert them
1334 
1335   Input Parameters:
1336 +  A       - the MATSEQAIJKOKKOS matrix
1337 .  bs      - block sizes in 'csr' format, i.e., the i-th block has size bs(i+1) - bs(i)
1338 .  bs2     - square of block sizes in 'csr' format, i.e., the i-th block should be stored at offset bs2(i) in diagVal[]
1339 .  blkMap  - map row ids to block ids, i.e., row i belongs to the block blkMap(i)
1340 -  work    - a pre-allocated work buffer (as big as diagVal) for use by this routine
1341 
1342   Output Parameter:
1343 .  diagVal - the (pre-allocated) buffer to store the inverted blocks (each block is stored in column-major order)
1344 */
1345 PETSC_INTERN PetscErrorCode MatInvertVariableBlockDiagonal_SeqAIJKokkos(Mat A, const PetscIntKokkosView &bs, const PetscIntKokkosView &bs2, const PetscIntKokkosView &blkMap, PetscScalarKokkosView &work, PetscScalarKokkosView &diagVal)
1346 {
1347   Mat_SeqAIJKokkos *akok    = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1348   PetscInt          N       = A->rmap->n;
1349   PetscInt          nblocks = bs.extent(0) - 1;
1350 
1351   PetscFunctionBegin;
1352   // Set the diagonal pointer on device if not already
1353   if (N && akok->diag_dual.extent(0) == 0) {
1354     PetscCall(MatMarkDiagonal_SeqAIJ(A));
1355     akok->SetDiagonal(static_cast<Mat_SeqAIJ *>(A->data)->diag);
1356   }
1357 
1358   PetscCall(MatSeqAIJKokkosSyncDevice(A)); // Since we'll access A's value on device
1359 
1360   // Pull out the diagonal blocks of the matrix and then invert the blocks
1361   auto Aa    = akok->a_dual.view_device();
1362   auto Ai    = akok->i_dual.view_device();
1363   auto Aj    = akok->j_dual.view_device();
1364   auto Adiag = akok->diag_dual.view_device();
1365   // TODO: how to tune the team size?
1366 #if defined(KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_HOST)
1367   auto ts = Kokkos::AUTO();
1368 #else
1369   auto ts         = 16; // improved performance 30% over Kokkos::AUTO() with CUDA, but failed with "Kokkos::abort: Requested Team Size is too large!" on CPUs
1370 #endif
1371   PetscCallCXX(Kokkos::parallel_for(
1372     Kokkos::TeamPolicy<>(nblocks, ts), KOKKOS_LAMBDA(const KokkosTeamMemberType &teamMember) {
1373       const PetscInt bid    = teamMember.league_rank();                                                   // block id
1374       const PetscInt rstart = bs(bid);                                                                    // this block starts from this row
1375       const PetscInt m      = bs(bid + 1) - bs(bid);                                                      // size of this block
1376       const auto    &B      = Kokkos::View<PetscScalar **, Kokkos::LayoutLeft>(&diagVal(bs2(bid)), m, m); // column-major order
1377       const auto    &W      = PetscScalarKokkosView(&work(bs2(bid)), m * m);
1378 
1379       Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, m), [=](const PetscInt &r) { // r-th row in B
1380         PetscInt i = rstart + r;                                                            // i-th row in A
1381 
1382         if (Ai(i) <= Adiag(i) && Adiag(i) < Ai(i + 1)) { // if the diagonal exists (common case)
1383           PetscInt first = Adiag(i) - r;                 // we start to check nonzeros from here along this row
1384 
1385           for (PetscInt c = 0; c < m; c++) {                   // walk n steps to see what column indices we will meet
1386             if (first + c < Ai(i) || first + c >= Ai(i + 1)) { // this entry (first+c) is out of range of this row, in other words, its value is zero
1387               B(r, c) = 0.0;
1388             } else if (Aj(first + c) == rstart + c) { // this entry is right on the (rstart+c) column
1389               B(r, c) = Aa(first + c);
1390             } else { // this entry does not show up in the CSR
1391               B(r, c) = 0.0;
1392             }
1393           }
1394         } else { // rare case that the diagonal does not exist
1395           const PetscInt begin = Ai(i);
1396           const PetscInt end   = Ai(i + 1);
1397           for (PetscInt c = 0; c < m; c++) B(r, c) = 0.0;
1398           for (PetscInt j = begin; j < end; j++) { // scan the whole row; could use binary search but this is a rare case so we did not.
1399             if (rstart <= Aj(j) && Aj(j) < rstart + m) B(r, Aj(j) - rstart) = Aa(j);
1400             else if (Aj(j) >= rstart + m) break;
1401           }
1402         }
1403       });
1404 
1405       // LU-decompose B (w/o pivoting) and then invert B
1406       KokkosBatched::TeamLU<KokkosTeamMemberType, KokkosBatched::Algo::LU::Unblocked>::invoke(teamMember, B, 0.0);
1407       KokkosBatched::TeamInverseLU<KokkosTeamMemberType, KokkosBatched::Algo::InverseLU::Unblocked>::invoke(teamMember, B, W);
1408     }));
1409   // PetscLogGpuFlops() is done in the caller PCSetUp_VPBJacobi_Kokkos as we don't want to compute the flops in kernels
1410   PetscFunctionReturn(PETSC_SUCCESS);
1411 }
1412 
1413 PETSC_INTERN PetscErrorCode MatSetSeqAIJKokkosWithCSRMatrix(Mat A, Mat_SeqAIJKokkos *akok)
1414 {
1415   Mat_SeqAIJ *aseq;
1416   PetscInt    i, m, n;
1417 
1418   PetscFunctionBegin;
1419   PetscCheck(!A->spptr, PETSC_COMM_SELF, PETSC_ERR_PLIB, "A->spptr is supposed to be empty");
1420 
1421   m = akok->nrows();
1422   n = akok->ncols();
1423   PetscCall(MatSetSizes(A, m, n, m, n));
1424   PetscCall(MatSetType(A, MATSEQAIJKOKKOS));
1425 
1426   /* Set up data structures of A as a MATSEQAIJ */
1427   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(A, MAT_SKIP_ALLOCATION, NULL));
1428   aseq = (Mat_SeqAIJ *)(A)->data;
1429 
1430   akok->i_dual.sync_host(); /* We always need sync'ed i, j on host */
1431   akok->j_dual.sync_host();
1432 
1433   aseq->i            = akok->i_host_data();
1434   aseq->j            = akok->j_host_data();
1435   aseq->a            = akok->a_host_data();
1436   aseq->nonew        = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/
1437   aseq->singlemalloc = PETSC_FALSE;
1438   aseq->free_a       = PETSC_FALSE;
1439   aseq->free_ij      = PETSC_FALSE;
1440   aseq->nz           = akok->nnz();
1441   aseq->maxnz        = aseq->nz;
1442 
1443   PetscCall(PetscMalloc1(m, &aseq->imax));
1444   PetscCall(PetscMalloc1(m, &aseq->ilen));
1445   for (i = 0; i < m; i++) aseq->ilen[i] = aseq->imax[i] = aseq->i[i + 1] - aseq->i[i];
1446 
1447   /* It is critical to set the nonzerostate, as we use it to check if sparsity pattern (hence data) has changed on host in MatAssemblyEnd */
1448   akok->nonzerostate = A->nonzerostate;
1449   A->spptr           = akok; /* Set A->spptr before MatAssembly so that A->spptr won't be allocated again there */
1450   PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY));
1451   PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY));
1452   PetscFunctionReturn(PETSC_SUCCESS);
1453 }
1454 
1455 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGetKokkosCsrMatrix(Mat A, KokkosCsrMatrix *csr)
1456 {
1457   PetscFunctionBegin;
1458   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1459   *csr = static_cast<Mat_SeqAIJKokkos *>(A->spptr)->csrmat;
1460   PetscFunctionReturn(PETSC_SUCCESS);
1461 }
1462 
1463 PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithKokkosCsrMatrix(MPI_Comm comm, KokkosCsrMatrix csr, Mat *A)
1464 {
1465   Mat_SeqAIJKokkos *akok;
1466   PetscFunctionBegin;
1467   PetscCallCXX(akok = new Mat_SeqAIJKokkos(csr));
1468   PetscCall(MatCreate(comm, A));
1469   PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok));
1470   PetscFunctionReturn(PETSC_SUCCESS);
1471 }
1472 
1473 /* Crete a SEQAIJKOKKOS matrix with a Mat_SeqAIJKokkos data structure
1474 
1475    Note we have names like MatSeqAIJSetPreallocationCSR, so I use capitalized CSR
1476  */
1477 PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithCSRMatrix(MPI_Comm comm, Mat_SeqAIJKokkos *akok, Mat *A)
1478 {
1479   PetscFunctionBegin;
1480   PetscCall(MatCreate(comm, A));
1481   PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok));
1482   PetscFunctionReturn(PETSC_SUCCESS);
1483 }
1484 
1485 /*@C
1486   MatCreateSeqAIJKokkos - Creates a sparse matrix in `MATSEQAIJKOKKOS` (compressed row) format
1487   (the default parallel PETSc format). This matrix will ultimately be handled by
1488   Kokkos for calculations.
1489 
1490   Collective
1491 
1492   Input Parameters:
1493 + comm - MPI communicator, set to `PETSC_COMM_SELF`
1494 . m    - number of rows
1495 . n    - number of columns
1496 . nz   - number of nonzeros per row (same for all rows), ignored if `nnz` is provided
1497 - nnz  - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL`
1498 
1499   Output Parameter:
1500 . A - the matrix
1501 
1502   Level: intermediate
1503 
1504   Notes:
1505   It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
1506   MatXXXXSetPreallocation() paradgm instead of this routine directly.
1507   [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]
1508 
1509   The AIJ format, also called
1510   compressed row storage, is fully compatible with standard Fortran
1511   storage.  That is, the stored row and column indices can begin at
1512   either one (as in Fortran) or zero.
1513 
1514   Specify the preallocated storage with either `nz` or `nnz` (not both).
1515   Set `nz` = `PETSC_DEFAULT` and `nnz` = `NULL` for PETSc to control dynamic memory
1516   allocation.
1517 
1518 .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`
1519 @*/
1520 PetscErrorCode MatCreateSeqAIJKokkos(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A)
1521 {
1522   PetscFunctionBegin;
1523   PetscCall(PetscKokkosInitializeCheck());
1524   PetscCall(MatCreate(comm, A));
1525   PetscCall(MatSetSizes(*A, m, n, m, n));
1526   PetscCall(MatSetType(*A, MATSEQAIJKOKKOS));
1527   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz));
1528   PetscFunctionReturn(PETSC_SUCCESS);
1529 }
1530 
1531 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
1532 {
1533   PetscFunctionBegin;
1534   PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
1535   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos;
1536   PetscFunctionReturn(PETSC_SUCCESS);
1537 }
1538 
1539 static PetscErrorCode MatSeqAIJKokkosSymbolicSolveCheck(Mat A)
1540 {
1541   Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1542 
1543   PetscFunctionBegin;
1544   if (!factors->sptrsv_symbolic_completed) {
1545     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khU, factors->iU_d, factors->jU_d, factors->aU_d);
1546     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khL, factors->iL_d, factors->jL_d, factors->aL_d);
1547     factors->sptrsv_symbolic_completed = PETSC_TRUE;
1548   }
1549   PetscFunctionReturn(PETSC_SUCCESS);
1550 }
1551 
1552 /* Check if we need to update factors etc for transpose solve */
1553 static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A)
1554 {
1555   Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1556   MatColIdxType               n       = A->rmap->n;
1557 
1558   PetscFunctionBegin;
1559   if (!factors->transpose_updated) { /* TODO: KK needs to provide functions to do numeric transpose only */
1560     /* Update L^T and do sptrsv symbolic */
1561     factors->iLt_d = MatRowMapKokkosView("factors->iLt_d", n + 1); // KK requires 0
1562     factors->jLt_d = MatColIdxKokkosView(NoInit("factors->jLt_d"), factors->jL_d.extent(0));
1563     factors->aLt_d = MatScalarKokkosView(NoInit("factors->aLt_d"), factors->aL_d.extent(0));
1564 
1565     transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iL_d, factors->jL_d, factors->aL_d,
1566                                                                                                                                                                                                               factors->iLt_d, factors->jLt_d, factors->aLt_d);
1567 
1568     /* TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices.
1569       We have to sort the indices, until KK provides finer control options.
1570     */
1571     sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iLt_d, factors->jLt_d, factors->aLt_d);
1572 
1573     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d);
1574 
1575     /* Update U^T and do sptrsv symbolic */
1576     factors->iUt_d = MatRowMapKokkosView("factors->iUt_d", n + 1); // KK requires 0
1577     factors->jUt_d = MatColIdxKokkosView(NoInit("factors->jUt_d"), factors->jU_d.extent(0));
1578     factors->aUt_d = MatScalarKokkosView(NoInit("factors->aUt_d"), factors->aU_d.extent(0));
1579 
1580     transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iU_d, factors->jU_d, factors->aU_d,
1581                                                                                                                                                                                                               factors->iUt_d, factors->jUt_d, factors->aUt_d);
1582 
1583     /* Sort indices. See comments above */
1584     sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iUt_d, factors->jUt_d, factors->aUt_d);
1585 
1586     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d);
1587     factors->transpose_updated = PETSC_TRUE;
1588   }
1589   PetscFunctionReturn(PETSC_SUCCESS);
1590 }
1591 
1592 /* Solve Ax = b, with A = LU */
1593 static PetscErrorCode MatSolve_SeqAIJKokkos(Mat A, Vec b, Vec x)
1594 {
1595   ConstPetscScalarKokkosView  bv;
1596   PetscScalarKokkosView       xv;
1597   Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1598 
1599   PetscFunctionBegin;
1600   PetscCall(PetscLogGpuTimeBegin());
1601   PetscCall(MatSeqAIJKokkosSymbolicSolveCheck(A));
1602   PetscCall(VecGetKokkosView(b, &bv));
1603   PetscCall(VecGetKokkosViewWrite(x, &xv));
1604   /* Solve L tmpv = b */
1605   PetscCallCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khL, factors->iL_d, factors->jL_d, factors->aL_d, bv, factors->workVector));
1606   /* Solve Ux = tmpv */
1607   PetscCallCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khU, factors->iU_d, factors->jU_d, factors->aU_d, factors->workVector, xv));
1608   PetscCall(VecRestoreKokkosView(b, &bv));
1609   PetscCall(VecRestoreKokkosViewWrite(x, &xv));
1610   PetscCall(PetscLogGpuTimeEnd());
1611   PetscFunctionReturn(PETSC_SUCCESS);
1612 }
1613 
1614 /* Solve A^T x = b, where A^T = U^T L^T */
1615 static PetscErrorCode MatSolveTranspose_SeqAIJKokkos(Mat A, Vec b, Vec x)
1616 {
1617   ConstPetscScalarKokkosView  bv;
1618   PetscScalarKokkosView       xv;
1619   Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1620 
1621   PetscFunctionBegin;
1622   PetscCall(PetscLogGpuTimeBegin());
1623   PetscCall(MatSeqAIJKokkosTransposeSolveCheck(A));
1624   PetscCall(VecGetKokkosView(b, &bv));
1625   PetscCall(VecGetKokkosViewWrite(x, &xv));
1626   /* Solve U^T tmpv = b */
1627   KokkosSparse::Experimental::sptrsv_solve(&factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d, bv, factors->workVector);
1628 
1629   /* Solve L^T x = tmpv */
1630   KokkosSparse::Experimental::sptrsv_solve(&factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d, factors->workVector, xv);
1631   PetscCall(VecRestoreKokkosView(b, &bv));
1632   PetscCall(VecRestoreKokkosViewWrite(x, &xv));
1633   PetscCall(PetscLogGpuTimeEnd());
1634   PetscFunctionReturn(PETSC_SUCCESS);
1635 }
1636 
1637 static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info)
1638 {
1639   Mat_SeqAIJKokkos           *aijkok   = (Mat_SeqAIJKokkos *)A->spptr;
1640   Mat_SeqAIJKokkosTriFactors *factors  = (Mat_SeqAIJKokkosTriFactors *)B->spptr;
1641   PetscInt                    fill_lev = info->levels;
1642 
1643   PetscFunctionBegin;
1644   PetscCall(PetscLogGpuTimeBegin());
1645   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1646 
1647   auto a_d = aijkok->a_dual.view_device();
1648   auto i_d = aijkok->i_dual.view_device();
1649   auto j_d = aijkok->j_dual.view_device();
1650 
1651   KokkosSparse::Experimental::spiluk_numeric(&factors->kh, fill_lev, i_d, j_d, a_d, factors->iL_d, factors->jL_d, factors->aL_d, factors->iU_d, factors->jU_d, factors->aU_d);
1652 
1653   B->assembled              = PETSC_TRUE;
1654   B->preallocated           = PETSC_TRUE;
1655   B->ops->solve             = MatSolve_SeqAIJKokkos;
1656   B->ops->solvetranspose    = MatSolveTranspose_SeqAIJKokkos;
1657   B->ops->matsolve          = NULL;
1658   B->ops->matsolvetranspose = NULL;
1659   B->offloadmask            = PETSC_OFFLOAD_GPU;
1660 
1661   /* Once the factors' value changed, we need to update their transpose and sptrsv handle */
1662   factors->transpose_updated         = PETSC_FALSE;
1663   factors->sptrsv_symbolic_completed = PETSC_FALSE;
1664   /* TODO: log flops, but how to know that? */
1665   PetscCall(PetscLogGpuTimeEnd());
1666   PetscFunctionReturn(PETSC_SUCCESS);
1667 }
1668 
1669 static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
1670 {
1671   Mat_SeqAIJKokkos           *aijkok;
1672   Mat_SeqAIJ                 *b;
1673   Mat_SeqAIJKokkosTriFactors *factors  = (Mat_SeqAIJKokkosTriFactors *)B->spptr;
1674   PetscInt                    fill_lev = info->levels;
1675   PetscInt                    nnzA     = ((Mat_SeqAIJ *)A->data)->nz, nnzL, nnzU;
1676   PetscInt                    n        = A->rmap->n;
1677 
1678   PetscFunctionBegin;
1679   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1680   /* Rebuild factors */
1681   if (factors) {
1682     factors->Destroy();
1683   } /* Destroy the old if it exists */
1684   else {
1685     B->spptr = factors = new Mat_SeqAIJKokkosTriFactors(n);
1686   }
1687 
1688   /* Create a spiluk handle and then do symbolic factorization */
1689   nnzL = nnzU = PetscRealIntMultTruncate(info->fill, nnzA);
1690   factors->kh.create_spiluk_handle(KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1, n, nnzL, nnzU);
1691 
1692   auto spiluk_handle = factors->kh.get_spiluk_handle();
1693 
1694   Kokkos::realloc(factors->iL_d, n + 1); /* Free old arrays and realloc */
1695   Kokkos::realloc(factors->jL_d, spiluk_handle->get_nnzL());
1696   Kokkos::realloc(factors->iU_d, n + 1);
1697   Kokkos::realloc(factors->jU_d, spiluk_handle->get_nnzU());
1698 
1699   aijkok   = (Mat_SeqAIJKokkos *)A->spptr;
1700   auto i_d = aijkok->i_dual.view_device();
1701   auto j_d = aijkok->j_dual.view_device();
1702   KokkosSparse::Experimental::spiluk_symbolic(&factors->kh, fill_lev, i_d, j_d, factors->iL_d, factors->jL_d, factors->iU_d, factors->jU_d);
1703   /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */
1704 
1705   Kokkos::resize(factors->jL_d, spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */
1706   Kokkos::resize(factors->jU_d, spiluk_handle->get_nnzU());
1707   Kokkos::realloc(factors->aL_d, spiluk_handle->get_nnzL()); /* No need to retain old value */
1708   Kokkos::realloc(factors->aU_d, spiluk_handle->get_nnzU());
1709 
1710   /* TODO: add options to select sptrsv algorithms */
1711   /* Create sptrsv handles for L, U and their transpose */
1712 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
1713   auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SPTRSV_CUSPARSE;
1714 #else
1715   auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1;
1716 #endif
1717 
1718   factors->khL.create_sptrsv_handle(sptrsv_alg, n, true /* L is lower tri */);
1719   factors->khU.create_sptrsv_handle(sptrsv_alg, n, false /* U is not lower tri */);
1720   factors->khLt.create_sptrsv_handle(sptrsv_alg, n, false /* L^T is not lower tri */);
1721   factors->khUt.create_sptrsv_handle(sptrsv_alg, n, true /* U^T is lower tri */);
1722 
1723   /* Fill fields of the factor matrix B */
1724   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(B, MAT_SKIP_ALLOCATION, NULL));
1725   b     = (Mat_SeqAIJ *)B->data;
1726   b->nz = b->maxnz          = spiluk_handle->get_nnzL() + spiluk_handle->get_nnzU();
1727   B->info.fill_ratio_given  = info->fill;
1728   B->info.fill_ratio_needed = nnzA > 0 ? ((PetscReal)b->nz) / ((PetscReal)nnzA) : 1.0;
1729 
1730   B->offloadmask          = PETSC_OFFLOAD_GPU;
1731   B->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJKokkos;
1732   PetscFunctionReturn(PETSC_SUCCESS);
1733 }
1734 
1735 static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos(Mat A, MatSolverType *type)
1736 {
1737   PetscFunctionBegin;
1738   *type = MATSOLVERKOKKOS;
1739   PetscFunctionReturn(PETSC_SUCCESS);
1740 }
1741 
1742 /*MC
1743   MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices
1744   on a single GPU of type, `MATSEQAIJKOKKOS`, `MATAIJKOKKOS`.
1745 
1746   Level: beginner
1747 
1748 .seealso: [](ch_matrices), `Mat`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJKokkos()`, `MATAIJKOKKOS`, `MatKokkosSetFormat()`, `MatKokkosStorageFormat`, `MatKokkosFormatOperation`
1749 M*/
1750 PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A, MatFactorType ftype, Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */
1751 {
1752   PetscInt n = A->rmap->n;
1753 
1754   PetscFunctionBegin;
1755   PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B));
1756   PetscCall(MatSetSizes(*B, n, n, n, n));
1757   (*B)->factortype = ftype;
1758   PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU]));
1759   PetscCall(MatSetType(*B, MATSEQAIJKOKKOS));
1760 
1761   if (ftype == MAT_FACTOR_LU) {
1762     PetscCall(MatSetBlockSizesFromMats(*B, A, A));
1763     (*B)->canuseordering        = PETSC_TRUE;
1764     (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKokkos;
1765   } else if (ftype == MAT_FACTOR_ILU) {
1766     PetscCall(MatSetBlockSizesFromMats(*B, A, A));
1767     (*B)->canuseordering         = PETSC_FALSE;
1768     (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos;
1769   } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]);
1770 
1771   PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL));
1772   PetscCall(PetscObjectComposeFunction((PetscObject)(*B), "MatFactorGetSolverType_C", MatFactorGetSolverType_SeqAIJKokkos));
1773   PetscFunctionReturn(PETSC_SUCCESS);
1774 }
1775 
1776 PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_KOKKOS(void)
1777 {
1778   PetscFunctionBegin;
1779   PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_LU, MatGetFactor_SeqAIJKokkos_Kokkos));
1780   PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_ILU, MatGetFactor_SeqAIJKokkos_Kokkos));
1781   PetscFunctionReturn(PETSC_SUCCESS);
1782 }
1783 
1784 /* Utility to print out a KokkosCsrMatrix for debugging */
1785 PETSC_INTERN PetscErrorCode PrintCsrMatrix(const KokkosCsrMatrix &csrmat)
1786 {
1787   const auto        &iv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), csrmat.graph.row_map);
1788   const auto        &jv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), csrmat.graph.entries);
1789   const auto        &av = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), csrmat.values);
1790   const PetscInt    *i  = iv.data();
1791   const PetscInt    *j  = jv.data();
1792   const PetscScalar *a  = av.data();
1793   PetscInt           m = csrmat.numRows(), n = csrmat.numCols(), nnz = csrmat.nnz();
1794 
1795   PetscFunctionBegin;
1796   PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT " x %" PetscInt_FMT " SeqAIJKokkos, with %" PetscInt_FMT " nonzeros\n", m, n, nnz));
1797   for (PetscInt k = 0; k < m; k++) {
1798     PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT ": ", k));
1799     for (PetscInt p = i[k]; p < i[k + 1]; p++) PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT "(%.1f), ", j[p], (double)PetscRealPart(a[p])));
1800     PetscCall(PetscPrintf(PETSC_COMM_SELF, "\n"));
1801   }
1802   PetscFunctionReturn(PETSC_SUCCESS);
1803 }
1804