xref: /petsc/src/mat/impls/aij/seq/kokkos/aijkok.kokkos.cxx (revision db9fc945a3c8a0a4b4e9c7cf33c611e450ed8efd)
1 #include <petscvec_kokkos.hpp>
2 #include <petscpkg_version.h>
3 #include <petsc/private/petscimpl.h>
4 #include <petsc/private/sfimpl.h>
5 #include <petscsystypes.h>
6 #include <petscerror.h>
7 
8 #include <Kokkos_Core.hpp>
9 #include <KokkosBlas.hpp>
10 #include <KokkosSparse_CrsMatrix.hpp>
11 #include <KokkosSparse_spmv.hpp>
12 #include <KokkosSparse_spiluk.hpp>
13 #include <KokkosSparse_sptrsv.hpp>
14 #include <KokkosSparse_spgemm.hpp>
15 #include <KokkosSparse_spadd.hpp>
16 #include <KokkosBatched_LU_Decl.hpp>
17 #include <KokkosBatched_InverseLU_Decl.hpp>
18 
19 #include <../src/mat/impls/aij/seq/kokkos/aijkok.hpp>
20 
21 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_GE(3, 7, 0)
22   #include <KokkosSparse_Utils.hpp>
23 using KokkosSparse::sort_crs_matrix;
24 using KokkosSparse::Impl::transpose_matrix;
25 #else
26   #include <KokkosKernels_Sorting.hpp>
27 using KokkosKernels::sort_crs_matrix;
28 using KokkosKernels::Impl::transpose_matrix;
29 #endif
30 
31 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */
32 
33 /* MatAssemblyEnd_SeqAIJKokkos() happens when we finalized nonzeros of the matrix, either after
34    we assembled the matrix on host, or after we directly produced the matrix data on device (ex., through MatMatMult).
35    In the latter case, it is important to set a_dual's sync state correctly.
36  */
37 static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A, MatAssemblyType mode)
38 {
39   Mat_SeqAIJ       *aijseq;
40   Mat_SeqAIJKokkos *aijkok;
41 
42   PetscFunctionBegin;
43   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS);
44   PetscCall(MatAssemblyEnd_SeqAIJ(A, mode));
45 
46   aijseq = static_cast<Mat_SeqAIJ *>(A->data);
47   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
48 
49   /* If aijkok does not exist, we just copy i, j to device.
50      If aijkok already exists, but the device's nonzero pattern does not match with the host's, we assume the latest data is on host.
51      In both cases, we build a new aijkok structure.
52   */
53   if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { /* aijkok might not exist yet or nonzero pattern has changed */
54     delete aijkok;
55     aijkok   = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aijseq->nz, aijseq->i, aijseq->j, aijseq->a, A->nonzerostate, PETSC_FALSE /*don't copy mat values to device*/);
56     A->spptr = aijkok;
57   }
58 
59   PetscFunctionReturn(PETSC_SUCCESS);
60 }
61 
62 /* Sync CSR data to device if not yet */
63 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A)
64 {
65   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
66 
67   PetscFunctionBegin;
68   PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Can't sync factorized matrix from host to device");
69   PetscCheck(aijkok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
70   if (aijkok->a_dual.need_sync_device()) {
71     aijkok->a_dual.sync_device();
72     aijkok->transpose_updated = PETSC_FALSE; /* values of the transpose is out-of-date */
73     aijkok->hermitian_updated = PETSC_FALSE;
74   }
75   PetscFunctionReturn(PETSC_SUCCESS);
76 }
77 
78 /* Mark the CSR data on device as modified */
79 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosModifyDevice(Mat A)
80 {
81   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
82 
83   PetscFunctionBegin;
84   PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Not supported for factorized matries");
85   aijkok->a_dual.clear_sync_state();
86   aijkok->a_dual.modify_device();
87   aijkok->transpose_updated = PETSC_FALSE;
88   aijkok->hermitian_updated = PETSC_FALSE;
89   PetscCall(MatSeqAIJInvalidateDiagonal(A));
90   PetscCall(PetscObjectStateIncrease((PetscObject)A));
91   PetscFunctionReturn(PETSC_SUCCESS);
92 }
93 
94 static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A)
95 {
96   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
97 
98   PetscFunctionBegin;
99   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
100   /* We do not expect one needs factors on host  */
101   PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Can't sync factorized matrix from device to host");
102   PetscCheck(aijkok, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing AIJKOK");
103   aijkok->a_dual.sync_host();
104   PetscFunctionReturn(PETSC_SUCCESS);
105 }
106 
107 static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A, PetscScalar *array[])
108 {
109   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
110 
111   PetscFunctionBegin;
112   /* aijkok contains valid pointers only if the host's nonzerostate matches with the device's.
113     Calling MatSeqAIJSetPreallocation() or MatSetValues() on host, where aijseq->{i,j,a} might be
114     reallocated, will lead to stale {i,j,a}_dual in aijkok. In both operations, the hosts's nonzerostate
115     must have been updated. The stale aijkok will be rebuilt during MatAssemblyEnd.
116   */
117   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
118     aijkok->a_dual.sync_host();
119     *array = aijkok->a_dual.view_host().data();
120   } else { /* Happens when calling MatSetValues on a newly created matrix */
121     *array = static_cast<Mat_SeqAIJ *>(A->data)->a;
122   }
123   PetscFunctionReturn(PETSC_SUCCESS);
124 }
125 
126 static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJKokkos(Mat A, PetscScalar *array[])
127 {
128   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
129 
130   PetscFunctionBegin;
131   if (aijkok && A->nonzerostate == aijkok->nonzerostate) aijkok->a_dual.modify_host();
132   PetscFunctionReturn(PETSC_SUCCESS);
133 }
134 
135 static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[])
136 {
137   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
138 
139   PetscFunctionBegin;
140   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
141     aijkok->a_dual.sync_host();
142     *array = aijkok->a_dual.view_host().data();
143   } else {
144     *array = static_cast<Mat_SeqAIJ *>(A->data)->a;
145   }
146   PetscFunctionReturn(PETSC_SUCCESS);
147 }
148 
149 static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[])
150 {
151   PetscFunctionBegin;
152   *array = NULL;
153   PetscFunctionReturn(PETSC_SUCCESS);
154 }
155 
156 static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[])
157 {
158   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
159 
160   PetscFunctionBegin;
161   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
162     *array = aijkok->a_dual.view_host().data();
163   } else { /* Ex. happens with MatZeroEntries on a preallocated but not assembled matrix */
164     *array = static_cast<Mat_SeqAIJ *>(A->data)->a;
165   }
166   PetscFunctionReturn(PETSC_SUCCESS);
167 }
168 
169 static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[])
170 {
171   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
172 
173   PetscFunctionBegin;
174   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
175     aijkok->a_dual.clear_sync_state();
176     aijkok->a_dual.modify_host();
177   }
178   PetscFunctionReturn(PETSC_SUCCESS);
179 }
180 
181 static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJKokkos(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype)
182 {
183   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
184 
185   PetscFunctionBegin;
186   PetscCheck(aijkok != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "aijkok is NULL");
187 
188   if (i) *i = aijkok->i_device_data();
189   if (j) *j = aijkok->j_device_data();
190   if (a) {
191     aijkok->a_dual.sync_device();
192     *a = aijkok->a_device_data();
193   }
194   if (mtype) *mtype = PETSC_MEMTYPE_KOKKOS;
195   PetscFunctionReturn(PETSC_SUCCESS);
196 }
197 
198 /*
199   Generate the sparsity pattern of a MatSeqAIJKokkos matrix's transpose on device.
200 
201   Input Parameter:
202 .  A       - the MATSEQAIJKOKKOS matrix
203 
204   Output Parameters:
205 +  perm_d - the permutation array on device, which connects Ta(i) = Aa(perm(i))
206 -  T_d    - the transpose on device, whose value array is allocated but not initialized
207 */
208 static PetscErrorCode MatSeqAIJKokkosGenerateTransposeStructure(Mat A, MatRowMapKokkosView &perm_d, KokkosCsrMatrix &T_d)
209 {
210   Mat_SeqAIJ             *aseq = static_cast<Mat_SeqAIJ *>(A->data);
211   PetscInt                nz = aseq->nz, m = A->rmap->N, n = A->cmap->n;
212   const PetscInt         *Ai = aseq->i, *Aj = aseq->j;
213   MatRowMapKokkosViewHost Ti_h(NoInit("Ti"), n + 1);
214   MatRowMapType          *Ti = Ti_h.data();
215   MatColIdxKokkosViewHost Tj_h(NoInit("Tj"), nz);
216   MatRowMapKokkosViewHost perm_h(NoInit("permutation"), nz);
217   PetscInt               *Tj   = Tj_h.data();
218   PetscInt               *perm = perm_h.data();
219   PetscInt               *offset;
220 
221   PetscFunctionBegin;
222   // Populate Ti
223   PetscCallCXX(Kokkos::deep_copy(Ti_h, 0));
224   Ti++;
225   for (PetscInt i = 0; i < nz; i++) Ti[Aj[i]]++;
226   Ti--;
227   for (PetscInt i = 0; i < n; i++) Ti[i + 1] += Ti[i];
228 
229   // Populate Tj and the permutation array
230   PetscCall(PetscCalloc1(n, &offset)); // offset in each T row to fill in its column indices
231   for (PetscInt i = 0; i < m; i++) {
232     for (PetscInt j = Ai[i]; j < Ai[i + 1]; j++) { // A's (i,j) is T's (j,i)
233       PetscInt r    = Aj[j];                       // row r of T
234       PetscInt disp = Ti[r] + offset[r];
235 
236       Tj[disp]   = i; // col i of T
237       perm[disp] = j;
238       offset[r]++;
239     }
240   }
241   PetscCall(PetscFree(offset));
242 
243   // Sort each row of T, along with the permutation array
244   for (PetscInt i = 0; i < n; i++) PetscCall(PetscSortIntWithArray(Ti[i + 1] - Ti[i], Tj + Ti[i], perm + Ti[i]));
245 
246   // Output perm and T on device
247   auto Ti_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), Ti_h);
248   auto Tj_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), Tj_h);
249   PetscCallCXX(T_d = KokkosCsrMatrix("csrmatT", n, m, nz, MatScalarKokkosView("Ta", nz), Ti_d, Tj_d));
250   PetscCallCXX(perm_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), perm_h));
251   PetscFunctionReturn(PETSC_SUCCESS);
252 }
253 
254 // Generate the transpose on device and cache it internally
255 // Note: KK transpose_matrix() does not have support symbolic/numeric transpose, so we do it on our own
256 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGenerateTranspose_Private(Mat A, KokkosCsrMatrix *csrmatT)
257 {
258   Mat_SeqAIJ       *aseq = static_cast<Mat_SeqAIJ *>(A->data);
259   Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
260   PetscInt          nz = aseq->nz, m = A->rmap->N, n = A->cmap->n;
261   KokkosCsrMatrix  &T = akok->csrmatT;
262 
263   PetscFunctionBegin;
264   PetscCheck(akok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
265   PetscCallCXX(akok->a_dual.sync_device()); // Sync A's valeus since we are going to access them on device
266 
267   const auto &Aa = akok->a_dual.view_device();
268 
269   if (A->symmetric == PETSC_BOOL3_TRUE) {
270     *csrmatT = akok->csrmat;
271   } else {
272     // See if we already have a cached transpose and its value is up to date
273     if (T.numRows() == n && T.numCols() == m) {  // this indicates csrmatT had been generated before, otherwise T has 0 rows/cols after construction
274       if (!akok->transpose_updated) {            // if the value is out of date, update the cached version
275         const auto &perm = akok->transpose_perm; // get the permutation array
276         auto       &Ta   = T.values;
277 
278         PetscCallCXX(Kokkos::parallel_for(
279           nz, KOKKOS_LAMBDA(const PetscInt i) { Ta(i) = Aa(perm(i)); }));
280       }
281     } else { // Generate T of size n x m for the first time
282       MatRowMapKokkosView perm;
283 
284       PetscCall(MatSeqAIJKokkosGenerateTransposeStructure(A, perm, T));
285       akok->transpose_perm = perm; // cache the perm in this matrix for reuse
286       PetscCallCXX(Kokkos::parallel_for(
287         nz, KOKKOS_LAMBDA(const PetscInt i) { T.values(i) = Aa(perm(i)); }));
288     }
289     akok->transpose_updated = PETSC_TRUE;
290     *csrmatT                = akok->csrmatT;
291   }
292   PetscFunctionReturn(PETSC_SUCCESS);
293 }
294 
295 // Generate the Hermitian on device and cache it internally
296 static PetscErrorCode MatSeqAIJKokkosGenerateHermitian_Private(Mat A, KokkosCsrMatrix *csrmatH)
297 {
298   Mat_SeqAIJ       *aseq = static_cast<Mat_SeqAIJ *>(A->data);
299   Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
300   PetscInt          nz = aseq->nz, m = A->rmap->N, n = A->cmap->n;
301   KokkosCsrMatrix  &T = akok->csrmatH;
302 
303   PetscFunctionBegin;
304   PetscCheck(akok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
305   PetscCallCXX(akok->a_dual.sync_device()); // Sync A's values since we are going to access them on device
306 
307   const auto &Aa = akok->a_dual.view_device();
308 
309   if (A->hermitian == PETSC_BOOL3_TRUE) {
310     *csrmatH = akok->csrmat;
311   } else {
312     // See if we already have a cached hermitian and its value is up to date
313     if (T.numRows() == n && T.numCols() == m) {  // this indicates csrmatT had been generated before, otherwise T has 0 rows/cols after construction
314       if (!akok->hermitian_updated) {            // if the value is out of date, update the cached version
315         const auto &perm = akok->transpose_perm; // get the permutation array
316         auto       &Ta   = T.values;
317 
318         PetscCallCXX(Kokkos::parallel_for(
319           nz, KOKKOS_LAMBDA(const PetscInt i) { Ta(i) = PetscConj(Aa(perm(i))); }));
320       }
321     } else { // Generate T of size n x m for the first time
322       MatRowMapKokkosView perm;
323 
324       PetscCall(MatSeqAIJKokkosGenerateTransposeStructure(A, perm, T));
325       akok->transpose_perm = perm; // cache the perm in this matrix for reuse
326       PetscCallCXX(Kokkos::parallel_for(
327         nz, KOKKOS_LAMBDA(const PetscInt i) { T.values(i) = PetscConj(Aa(perm(i))); }));
328     }
329     akok->hermitian_updated = PETSC_TRUE;
330     *csrmatH                = akok->csrmatH;
331   }
332   PetscFunctionReturn(PETSC_SUCCESS);
333 }
334 
335 /* y = A x */
336 static PetscErrorCode MatMult_SeqAIJKokkos(Mat A, Vec xx, Vec yy)
337 {
338   Mat_SeqAIJKokkos          *aijkok;
339   ConstPetscScalarKokkosView xv;
340   PetscScalarKokkosView      yv;
341 
342   PetscFunctionBegin;
343   PetscCall(PetscLogGpuTimeBegin());
344   PetscCall(MatSeqAIJKokkosSyncDevice(A));
345   PetscCall(VecGetKokkosView(xx, &xv));
346   PetscCall(VecGetKokkosViewWrite(yy, &yv));
347   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
348   PetscCallCXX(KokkosSparse::spmv("N", 1.0 /*alpha*/, aijkok->csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A x + beta y */
349   PetscCall(VecRestoreKokkosView(xx, &xv));
350   PetscCall(VecRestoreKokkosViewWrite(yy, &yv));
351   /* 2.0*nnz - numRows seems more accurate here but assumes there are no zero-rows. So a little sloppy here. */
352   PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz()));
353   PetscCall(PetscLogGpuTimeEnd());
354   PetscFunctionReturn(PETSC_SUCCESS);
355 }
356 
357 /* y = A^T x */
358 static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy)
359 {
360   Mat_SeqAIJKokkos          *aijkok;
361   const char                *mode;
362   ConstPetscScalarKokkosView xv;
363   PetscScalarKokkosView      yv;
364   KokkosCsrMatrix            csrmat;
365 
366   PetscFunctionBegin;
367   PetscCall(PetscLogGpuTimeBegin());
368   PetscCall(MatSeqAIJKokkosSyncDevice(A));
369   PetscCall(VecGetKokkosView(xx, &xv));
370   PetscCall(VecGetKokkosViewWrite(yy, &yv));
371   if (A->form_explicit_transpose) {
372     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat));
373     mode = "N";
374   } else {
375     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
376     csrmat = aijkok->csrmat;
377     mode   = "T";
378   }
379   PetscCallCXX(KokkosSparse::spmv(mode, 1.0 /*alpha*/, csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^T x + beta y */
380   PetscCall(VecRestoreKokkosView(xx, &xv));
381   PetscCall(VecRestoreKokkosViewWrite(yy, &yv));
382   PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
383   PetscCall(PetscLogGpuTimeEnd());
384   PetscFunctionReturn(PETSC_SUCCESS);
385 }
386 
387 /* y = A^H x */
388 static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy)
389 {
390   Mat_SeqAIJKokkos          *aijkok;
391   const char                *mode;
392   ConstPetscScalarKokkosView xv;
393   PetscScalarKokkosView      yv;
394   KokkosCsrMatrix            csrmat;
395 
396   PetscFunctionBegin;
397   PetscCall(PetscLogGpuTimeBegin());
398   PetscCall(MatSeqAIJKokkosSyncDevice(A));
399   PetscCall(VecGetKokkosView(xx, &xv));
400   PetscCall(VecGetKokkosViewWrite(yy, &yv));
401   if (A->form_explicit_transpose) {
402     PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat));
403     mode = "N";
404   } else {
405     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
406     csrmat = aijkok->csrmat;
407     mode   = "C";
408   }
409   PetscCallCXX(KokkosSparse::spmv(mode, 1.0 /*alpha*/, csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^H x + beta y */
410   PetscCall(VecRestoreKokkosView(xx, &xv));
411   PetscCall(VecRestoreKokkosViewWrite(yy, &yv));
412   PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
413   PetscCall(PetscLogGpuTimeEnd());
414   PetscFunctionReturn(PETSC_SUCCESS);
415 }
416 
417 /* z = A x + y */
418 static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz)
419 {
420   Mat_SeqAIJKokkos          *aijkok;
421   ConstPetscScalarKokkosView xv, yv;
422   PetscScalarKokkosView      zv;
423 
424   PetscFunctionBegin;
425   PetscCall(PetscLogGpuTimeBegin());
426   PetscCall(MatSeqAIJKokkosSyncDevice(A));
427   PetscCall(VecGetKokkosView(xx, &xv));
428   PetscCall(VecGetKokkosView(yy, &yv));
429   PetscCall(VecGetKokkosViewWrite(zz, &zv));
430   if (zz != yy) Kokkos::deep_copy(zv, yv);
431   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
432   PetscCallCXX(KokkosSparse::spmv("N", 1.0 /*alpha*/, aijkok->csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A x + beta z */
433   PetscCall(VecRestoreKokkosView(xx, &xv));
434   PetscCall(VecRestoreKokkosView(yy, &yv));
435   PetscCall(VecRestoreKokkosViewWrite(zz, &zv));
436   PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz()));
437   PetscCall(PetscLogGpuTimeEnd());
438   PetscFunctionReturn(PETSC_SUCCESS);
439 }
440 
441 /* z = A^T x + y */
442 static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz)
443 {
444   Mat_SeqAIJKokkos          *aijkok;
445   const char                *mode;
446   ConstPetscScalarKokkosView xv, yv;
447   PetscScalarKokkosView      zv;
448   KokkosCsrMatrix            csrmat;
449 
450   PetscFunctionBegin;
451   PetscCall(PetscLogGpuTimeBegin());
452   PetscCall(MatSeqAIJKokkosSyncDevice(A));
453   PetscCall(VecGetKokkosView(xx, &xv));
454   PetscCall(VecGetKokkosView(yy, &yv));
455   PetscCall(VecGetKokkosViewWrite(zz, &zv));
456   if (zz != yy) Kokkos::deep_copy(zv, yv);
457   if (A->form_explicit_transpose) {
458     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat));
459     mode = "N";
460   } else {
461     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
462     csrmat = aijkok->csrmat;
463     mode   = "T";
464   }
465   PetscCallCXX(KokkosSparse::spmv(mode, 1.0 /*alpha*/, csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^T x + beta z */
466   PetscCall(VecRestoreKokkosView(xx, &xv));
467   PetscCall(VecRestoreKokkosView(yy, &yv));
468   PetscCall(VecRestoreKokkosViewWrite(zz, &zv));
469   PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
470   PetscCall(PetscLogGpuTimeEnd());
471   PetscFunctionReturn(PETSC_SUCCESS);
472 }
473 
474 /* z = A^H x + y */
475 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz)
476 {
477   Mat_SeqAIJKokkos          *aijkok;
478   const char                *mode;
479   ConstPetscScalarKokkosView xv, yv;
480   PetscScalarKokkosView      zv;
481   KokkosCsrMatrix            csrmat;
482 
483   PetscFunctionBegin;
484   PetscCall(PetscLogGpuTimeBegin());
485   PetscCall(MatSeqAIJKokkosSyncDevice(A));
486   PetscCall(VecGetKokkosView(xx, &xv));
487   PetscCall(VecGetKokkosView(yy, &yv));
488   PetscCall(VecGetKokkosViewWrite(zz, &zv));
489   if (zz != yy) Kokkos::deep_copy(zv, yv);
490   if (A->form_explicit_transpose) {
491     PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat));
492     mode = "N";
493   } else {
494     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
495     csrmat = aijkok->csrmat;
496     mode   = "C";
497   }
498   PetscCallCXX(KokkosSparse::spmv(mode, 1.0 /*alpha*/, csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^H x + beta z */
499   PetscCall(VecRestoreKokkosView(xx, &xv));
500   PetscCall(VecRestoreKokkosView(yy, &yv));
501   PetscCall(VecRestoreKokkosViewWrite(zz, &zv));
502   PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
503   PetscCall(PetscLogGpuTimeEnd());
504   PetscFunctionReturn(PETSC_SUCCESS);
505 }
506 
507 PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A, MatOption op, PetscBool flg)
508 {
509   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
510 
511   PetscFunctionBegin;
512   switch (op) {
513   case MAT_FORM_EXPLICIT_TRANSPOSE:
514     /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
515     if (A->form_explicit_transpose && !flg && aijkok) PetscCall(aijkok->DestroyMatTranspose());
516     A->form_explicit_transpose = flg;
517     break;
518   default:
519     PetscCall(MatSetOption_SeqAIJ(A, op, flg));
520     break;
521   }
522   PetscFunctionReturn(PETSC_SUCCESS);
523 }
524 
525 /* Depending on reuse, either build a new mat, or use the existing mat */
526 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat *newmat)
527 {
528   Mat_SeqAIJ *aseq;
529 
530   PetscFunctionBegin;
531   PetscCall(PetscKokkosInitializeCheck());
532   if (reuse == MAT_INITIAL_MATRIX) {                      /* Build a brand new mat */
533     PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat));  /* the returned newmat is a SeqAIJKokkos */
534   } else if (reuse == MAT_REUSE_MATRIX) {                 /* Reuse the mat created before */
535     PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN)); /* newmat is already a SeqAIJKokkos */
536   } else if (reuse == MAT_INPLACE_MATRIX) {               /* newmat is A */
537     PetscCheck(A == *newmat, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "A != *newmat with MAT_INPLACE_MATRIX");
538     PetscCall(PetscFree(A->defaultvectype));
539     PetscCall(PetscStrallocpy(VECKOKKOS, &A->defaultvectype)); /* Allocate and copy the string */
540     PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATSEQAIJKOKKOS));
541     PetscCall(MatSetOps_SeqAIJKokkos(A));
542     aseq = static_cast<Mat_SeqAIJ *>(A->data);
543     if (A->assembled) { /* Copy i, j (but not values) to device for an assembled matrix if not yet */
544       PetscCheck(!A->spptr, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Expect NULL (Mat_SeqAIJKokkos*)A->spptr");
545       A->spptr = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aseq->nz, aseq->i, aseq->j, aseq->a, A->nonzerostate, PETSC_FALSE);
546     }
547   }
548   PetscFunctionReturn(PETSC_SUCCESS);
549 }
550 
551 /* MatDuplicate always creates a new matrix. MatDuplicate can be called either on an assembled matrix or
552    an unassembled matrix, even though MAT_COPY_VALUES is not allowed for unassembled matrix.
553  */
554 static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A, MatDuplicateOption dupOption, Mat *B)
555 {
556   Mat_SeqAIJ       *bseq;
557   Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr), *bkok;
558   Mat               mat;
559 
560   PetscFunctionBegin;
561   /* Do not copy values on host as A's latest values might be on device. We don't want to do sync blindly */
562   PetscCall(MatDuplicate_SeqAIJ(A, MAT_DO_NOT_COPY_VALUES, B));
563   mat = *B;
564   if (A->preallocated) {
565     bseq = static_cast<Mat_SeqAIJ *>(mat->data);
566     bkok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, bseq->nz, bseq->i, bseq->j, bseq->a, mat->nonzerostate, PETSC_FALSE);
567     bkok->a_dual.clear_sync_state(); /* Clear B's sync state as it will be decided below */
568     /* Now copy values to B if needed */
569     if (dupOption == MAT_COPY_VALUES) {
570       if (akok->a_dual.need_sync_device()) {
571         Kokkos::deep_copy(bkok->a_dual.view_host(), akok->a_dual.view_host());
572         bkok->a_dual.modify_host();
573       } else { /* If device has the latest data, we only copy data on device */
574         Kokkos::deep_copy(bkok->a_dual.view_device(), akok->a_dual.view_device());
575         bkok->a_dual.modify_device();
576       }
577     } else { /* MAT_DO_NOT_COPY_VALUES or MAT_SHARE_NONZERO_PATTERN. B's values should be zeroed */
578       /* B's values on host should be already zeroed by MatDuplicate_SeqAIJ() */
579       bkok->a_dual.modify_host();
580     }
581     mat->spptr = bkok;
582   }
583 
584   PetscCall(PetscFree(mat->defaultvectype));
585   PetscCall(PetscStrallocpy(VECKOKKOS, &mat->defaultvectype)); /* Allocate and copy the string */
586   PetscCall(PetscObjectChangeTypeName((PetscObject)mat, MATSEQAIJKOKKOS));
587   PetscCall(MatSetOps_SeqAIJKokkos(mat));
588   PetscFunctionReturn(PETSC_SUCCESS);
589 }
590 
591 static PetscErrorCode MatTranspose_SeqAIJKokkos(Mat A, MatReuse reuse, Mat *B)
592 {
593   Mat               At;
594   KokkosCsrMatrix   internT;
595   Mat_SeqAIJKokkos *atkok, *bkok;
596 
597   PetscFunctionBegin;
598   if (reuse == MAT_REUSE_MATRIX) PetscCall(MatTransposeCheckNonzeroState_Private(A, *B));
599   PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &internT)); /* Generate a transpose internally */
600   if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_INPLACE_MATRIX) {
601     /* Deep copy internT, as we want to isolate the internal transpose */
602     PetscCallCXX(atkok = new Mat_SeqAIJKokkos(KokkosCsrMatrix("csrmat", internT)));
603     PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PetscObjectComm((PetscObject)A), atkok, &At));
604     if (reuse == MAT_INITIAL_MATRIX) *B = At;
605     else PetscCall(MatHeaderReplace(A, &At)); /* Replace A with At inplace */
606   } else {                                    /* MAT_REUSE_MATRIX, just need to copy values to B on device */
607     if ((*B)->assembled) {
608       bkok = static_cast<Mat_SeqAIJKokkos *>((*B)->spptr);
609       PetscCallCXX(Kokkos::deep_copy(bkok->a_dual.view_device(), internT.values));
610       PetscCall(MatSeqAIJKokkosModifyDevice(*B));
611     } else if ((*B)->preallocated) { /* It is ok for B to be only preallocated, as needed in MatTranspose_MPIAIJ */
612       Mat_SeqAIJ             *bseq = static_cast<Mat_SeqAIJ *>((*B)->data);
613       MatScalarKokkosViewHost a_h(bseq->a, internT.nnz()); /* bseq->nz = 0 if unassembled */
614       MatColIdxKokkosViewHost j_h(bseq->j, internT.nnz());
615       PetscCallCXX(Kokkos::deep_copy(a_h, internT.values));
616       PetscCallCXX(Kokkos::deep_copy(j_h, internT.graph.entries));
617     } else SETERRQ(PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "B must be assembled or preallocated");
618   }
619   PetscFunctionReturn(PETSC_SUCCESS);
620 }
621 
622 static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A)
623 {
624   Mat_SeqAIJKokkos *aijkok;
625 
626   PetscFunctionBegin;
627   if (A->factortype == MAT_FACTOR_NONE) {
628     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
629     delete aijkok;
630   } else {
631     delete static_cast<Mat_SeqAIJKokkosTriFactors *>(A->spptr);
632   }
633   A->spptr = NULL;
634   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatFactorGetSolverType_C", NULL));
635   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
636   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
637   PetscCall(MatDestroy_SeqAIJ(A));
638   PetscFunctionReturn(PETSC_SUCCESS);
639 }
640 
641 /*MC
642    MATSEQAIJKOKKOS - MATAIJKOKKOS = "(seq)aijkokkos" - A matrix type to be used for sparse matrices with Kokkos
643 
644    A matrix type type using Kokkos-Kernels CrsMatrix type for portability across different device types
645 
646    Options Database Key:
647 .  -mat_type aijkokkos - sets the matrix type to `MATSEQAIJKOKKOS` during a call to `MatSetFromOptions()`
648 
649   Level: beginner
650 
651 .seealso: [](ch_matrices), `Mat`, `MatCreateSeqAIJKokkos()`, `MATMPIAIJKOKKOS`
652 M*/
653 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A)
654 {
655   PetscFunctionBegin;
656   PetscCall(PetscKokkosInitializeCheck());
657   PetscCall(MatCreate_SeqAIJ(A));
658   PetscCall(MatConvert_SeqAIJ_SeqAIJKokkos(A, MATSEQAIJKOKKOS, MAT_INPLACE_MATRIX, &A));
659   PetscFunctionReturn(PETSC_SUCCESS);
660 }
661 
662 /* Merge A, B into a matrix C. A is put before B. C's size would be A->rmap->n by (A->cmap->n + B->cmap->n) */
663 PetscErrorCode MatSeqAIJKokkosMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C)
664 {
665   Mat_SeqAIJ         *a, *b;
666   Mat_SeqAIJKokkos   *akok, *bkok, *ckok;
667   MatScalarKokkosView aa, ba, ca;
668   MatRowMapKokkosView ai, bi, ci;
669   MatColIdxKokkosView aj, bj, cj;
670   PetscInt            m, n, nnz, aN;
671 
672   PetscFunctionBegin;
673   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
674   PetscValidHeaderSpecific(B, MAT_CLASSID, 2);
675   PetscValidPointer(C, 4);
676   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
677   PetscCheckTypeName(B, MATSEQAIJKOKKOS);
678   PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n);
679   PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported");
680 
681   PetscCall(MatSeqAIJKokkosSyncDevice(A));
682   PetscCall(MatSeqAIJKokkosSyncDevice(B));
683   a    = static_cast<Mat_SeqAIJ *>(A->data);
684   b    = static_cast<Mat_SeqAIJ *>(B->data);
685   akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
686   bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
687   aa   = akok->a_dual.view_device();
688   ai   = akok->i_dual.view_device();
689   ba   = bkok->a_dual.view_device();
690   bi   = bkok->i_dual.view_device();
691   m    = A->rmap->n; /* M, N and nnz of C */
692   n    = A->cmap->n + B->cmap->n;
693   nnz  = a->nz + b->nz;
694   aN   = A->cmap->n; /* N of A */
695   if (reuse == MAT_INITIAL_MATRIX) {
696     aj           = akok->j_dual.view_device();
697     bj           = bkok->j_dual.view_device();
698     auto ca_dual = MatScalarKokkosDualView("a", aa.extent(0) + ba.extent(0));
699     auto ci_dual = MatRowMapKokkosDualView("i", ai.extent(0));
700     auto cj_dual = MatColIdxKokkosDualView("j", aj.extent(0) + bj.extent(0));
701     ca           = ca_dual.view_device();
702     ci           = ci_dual.view_device();
703     cj           = cj_dual.view_device();
704 
705     /* Concatenate A and B in parallel using Kokkos hierarchical parallelism */
706     Kokkos::parallel_for(
707       Kokkos::TeamPolicy<>(m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
708         PetscInt i       = t.league_rank(); /* row i */
709         PetscInt coffset = ai(i) + bi(i), alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i);
710 
711         Kokkos::single(Kokkos::PerTeam(t), [=]() { /* this side effect only happens once per whole team */
712                                                    ci(i) = coffset;
713                                                    if (i == m - 1) ci(m) = ai(m) + bi(m);
714         });
715 
716         Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) {
717           if (k < alen) {
718             ca(coffset + k) = aa(ai(i) + k);
719             cj(coffset + k) = aj(ai(i) + k);
720           } else {
721             ca(coffset + k) = ba(bi(i) + k - alen);
722             cj(coffset + k) = bj(bi(i) + k - alen) + aN; /* Entries in B get new column indices in C */
723           }
724         });
725       });
726     ca_dual.modify_device();
727     ci_dual.modify_device();
728     cj_dual.modify_device();
729     PetscCallCXX(ckok = new Mat_SeqAIJKokkos(m, n, nnz, ci_dual, cj_dual, ca_dual));
730     PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, ckok, C));
731   } else if (reuse == MAT_REUSE_MATRIX) {
732     PetscValidHeaderSpecific(*C, MAT_CLASSID, 4);
733     PetscCheckTypeName(*C, MATSEQAIJKOKKOS);
734     ckok = static_cast<Mat_SeqAIJKokkos *>((*C)->spptr);
735     ca   = ckok->a_dual.view_device();
736     ci   = ckok->i_dual.view_device();
737 
738     Kokkos::parallel_for(
739       Kokkos::TeamPolicy<>(m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
740         PetscInt i    = t.league_rank(); /* row i */
741         PetscInt alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i);
742         Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) {
743           if (k < alen) ca(ci(i) + k) = aa(ai(i) + k);
744           else ca(ci(i) + k) = ba(bi(i) + k - alen);
745         });
746       });
747     PetscCall(MatSeqAIJKokkosModifyDevice(*C));
748   }
749   PetscFunctionReturn(PETSC_SUCCESS);
750 }
751 
752 static PetscErrorCode MatProductDataDestroy_SeqAIJKokkos(void *pdata)
753 {
754   PetscFunctionBegin;
755   delete static_cast<MatProductData_SeqAIJKokkos *>(pdata);
756   PetscFunctionReturn(PETSC_SUCCESS);
757 }
758 
759 static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C)
760 {
761   Mat_Product                 *product = C->product;
762   Mat                          A, B;
763   bool                         transA, transB; /* use bool, since KK needs this type */
764   Mat_SeqAIJKokkos            *akok, *bkok, *ckok;
765   Mat_SeqAIJ                  *c;
766   MatProductData_SeqAIJKokkos *pdata;
767   KokkosCsrMatrix              csrmatA, csrmatB;
768 
769   PetscFunctionBegin;
770   MatCheckProduct(C, 1);
771   PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Product data empty");
772   pdata = static_cast<MatProductData_SeqAIJKokkos *>(C->product->data);
773 
774   // See if numeric has already been done in symbolic (e.g., user calls MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C)).
775   // If yes, skip the numeric, but reset the flag so that next time when user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C),
776   // we still do numeric.
777   if (pdata->reusesym) { // numeric reuses results from symbolic
778     pdata->reusesym = PETSC_FALSE;
779     PetscFunctionReturn(PETSC_SUCCESS);
780   }
781 
782   switch (product->type) {
783   case MATPRODUCT_AB:
784     transA = false;
785     transB = false;
786     break;
787   case MATPRODUCT_AtB:
788     transA = true;
789     transB = false;
790     break;
791   case MATPRODUCT_ABt:
792     transA = false;
793     transB = true;
794     break;
795   default:
796     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]);
797   }
798 
799   A = product->A;
800   B = product->B;
801   PetscCall(MatSeqAIJKokkosSyncDevice(A));
802   PetscCall(MatSeqAIJKokkosSyncDevice(B));
803   akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
804   bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
805   ckok = static_cast<Mat_SeqAIJKokkos *>(C->spptr);
806 
807   PetscCheck(ckok, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Device data structure spptr is empty");
808 
809   csrmatA = akok->csrmat;
810   csrmatB = bkok->csrmat;
811 
812   /* TODO: Once KK spgemm implements transpose, we can get rid of the explicit transpose here */
813   if (transA) {
814     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA));
815     transA = false;
816   }
817 
818   if (transB) {
819     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB));
820     transB = false;
821   }
822   PetscCall(PetscLogGpuTimeBegin());
823   PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, csrmatA, transA, csrmatB, transB, ckok->csrmat));
824 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(4, 0, 0)
825   auto spgemmHandle = pdata->kh.get_spgemm_handle();
826   if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(ckok->csrmat)); /* without sort, mat_tests-ex62_14_seqaijkokkos fails */
827 #endif
828 
829   PetscCall(PetscLogGpuTimeEnd());
830   PetscCall(MatSeqAIJKokkosModifyDevice(C));
831   /* shorter version of MatAssemblyEnd_SeqAIJ */
832   c = (Mat_SeqAIJ *)C->data;
833   PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz));
834   PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n"));
835   PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax));
836   c->reallocs         = 0;
837   C->info.mallocs     = 0;
838   C->info.nz_unneeded = 0;
839   C->assembled = C->was_assembled = PETSC_TRUE;
840   C->num_ass++;
841   PetscFunctionReturn(PETSC_SUCCESS);
842 }
843 
844 static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C)
845 {
846   Mat_Product                 *product = C->product;
847   MatProductType               ptype;
848   Mat                          A, B;
849   bool                         transA, transB;
850   Mat_SeqAIJKokkos            *akok, *bkok, *ckok;
851   MatProductData_SeqAIJKokkos *pdata;
852   MPI_Comm                     comm;
853   KokkosCsrMatrix              csrmatA, csrmatB, csrmatC;
854 
855   PetscFunctionBegin;
856   MatCheckProduct(C, 1);
857   PetscCall(PetscObjectGetComm((PetscObject)C, &comm));
858   PetscCheck(!product->data, comm, PETSC_ERR_PLIB, "Product data not empty");
859   A = product->A;
860   B = product->B;
861   PetscCall(MatSeqAIJKokkosSyncDevice(A));
862   PetscCall(MatSeqAIJKokkosSyncDevice(B));
863   akok    = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
864   bkok    = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
865   csrmatA = akok->csrmat;
866   csrmatB = bkok->csrmat;
867 
868   ptype = product->type;
869   // Take advantage of the symmetry if true
870   if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
871     ptype                                          = MATPRODUCT_AB;
872     product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE;
873   }
874   if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
875     ptype                                          = MATPRODUCT_AB;
876     product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE;
877   }
878 
879   switch (ptype) {
880   case MATPRODUCT_AB:
881     transA = false;
882     transB = false;
883     break;
884   case MATPRODUCT_AtB:
885     transA = true;
886     transB = false;
887     break;
888   case MATPRODUCT_ABt:
889     transA = false;
890     transB = true;
891     break;
892   default:
893     SETERRQ(comm, PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]);
894   }
895   PetscCallCXX(product->data = pdata = new MatProductData_SeqAIJKokkos());
896   pdata->reusesym = product->api_user;
897 
898   /* TODO: add command line options to select spgemm algorithms */
899   auto spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_DEFAULT; /* default alg is TPL if enabled, otherwise KK */
900 
901   /* CUDA-10.2's spgemm has bugs. We prefer the SpGEMMreuse APIs introduced in cuda-11.4 */
902 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
903   #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
904   spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_KK;
905   #endif
906 #endif
907   PetscCallCXX(pdata->kh.create_spgemm_handle(spgemm_alg));
908 
909   PetscCall(PetscLogGpuTimeBegin());
910   /* TODO: Get rid of the explicit transpose once KK-spgemm implements the transpose option */
911   if (transA) {
912     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA));
913     transA = false;
914   }
915 
916   if (transB) {
917     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB));
918     transB = false;
919   }
920 
921   PetscCallCXX(KokkosSparse::spgemm_symbolic(pdata->kh, csrmatA, transA, csrmatB, transB, csrmatC));
922   /* spgemm_symbolic() only populates C's rowmap, but not C's column indices.
923     So we have to do a fake spgemm_numeric() here to get csrmatC.j_d setup, before
924     calling new Mat_SeqAIJKokkos().
925     TODO: Remove the fake spgemm_numeric() after KK fixed this problem.
926   */
927   PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, csrmatA, transA, csrmatB, transB, csrmatC));
928 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(4, 0, 0)
929   /* Query if KK outputs a sorted matrix. If not, we need to sort it */
930   auto spgemmHandle = pdata->kh.get_spgemm_handle();
931   if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(csrmatC)); /* sort_option defaults to -1 in KK!*/
932 #endif
933   PetscCall(PetscLogGpuTimeEnd());
934 
935   PetscCallCXX(ckok = new Mat_SeqAIJKokkos(csrmatC));
936   PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(C, ckok));
937   C->product->destroy = MatProductDataDestroy_SeqAIJKokkos;
938   PetscFunctionReturn(PETSC_SUCCESS);
939 }
940 
941 /* handles sparse matrix matrix ops */
942 static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat)
943 {
944   Mat_Product *product = mat->product;
945   PetscBool    Biskok = PETSC_FALSE, Ciskok = PETSC_TRUE;
946 
947   PetscFunctionBegin;
948   MatCheckProduct(mat, 1);
949   PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJKOKKOS, &Biskok));
950   if (product->type == MATPRODUCT_ABC) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJKOKKOS, &Ciskok));
951   if (Biskok && Ciskok) {
952     switch (product->type) {
953     case MATPRODUCT_AB:
954     case MATPRODUCT_AtB:
955     case MATPRODUCT_ABt:
956       mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos;
957       break;
958     case MATPRODUCT_PtAP:
959     case MATPRODUCT_RARt:
960     case MATPRODUCT_ABC:
961       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
962       break;
963     default:
964       break;
965     }
966   } else { /* fallback for AIJ */
967     PetscCall(MatProductSetFromOptions_SeqAIJ(mat));
968   }
969   PetscFunctionReturn(PETSC_SUCCESS);
970 }
971 
972 static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a)
973 {
974   Mat_SeqAIJKokkos *aijkok;
975 
976   PetscFunctionBegin;
977   PetscCall(PetscLogGpuTimeBegin());
978   PetscCall(MatSeqAIJKokkosSyncDevice(A));
979   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
980   KokkosBlas::scal(aijkok->a_dual.view_device(), a, aijkok->a_dual.view_device());
981   PetscCall(MatSeqAIJKokkosModifyDevice(A));
982   PetscCall(PetscLogGpuFlops(aijkok->a_dual.extent(0)));
983   PetscCall(PetscLogGpuTimeEnd());
984   PetscFunctionReturn(PETSC_SUCCESS);
985 }
986 
987 static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A)
988 {
989   Mat_SeqAIJKokkos *aijkok;
990 
991   PetscFunctionBegin;
992   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
993   if (aijkok) { /* Only zero the device if data is already there */
994     KokkosBlas::fill(aijkok->a_dual.view_device(), 0.0);
995     PetscCall(MatSeqAIJKokkosModifyDevice(A));
996   } else { /* Might be preallocated but not assembled */
997     PetscCall(MatZeroEntries_SeqAIJ(A));
998   }
999   PetscFunctionReturn(PETSC_SUCCESS);
1000 }
1001 
1002 static PetscErrorCode MatGetDiagonal_SeqAIJKokkos(Mat A, Vec x)
1003 {
1004   Mat_SeqAIJ           *aijseq;
1005   Mat_SeqAIJKokkos     *aijkok;
1006   PetscInt              n;
1007   PetscScalarKokkosView xv;
1008 
1009   PetscFunctionBegin;
1010   PetscCall(VecGetLocalSize(x, &n));
1011   PetscCheck(n == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector");
1012   PetscCheck(A->factortype == MAT_FACTOR_NONE, PETSC_COMM_SELF, PETSC_ERR_SUP, "MatGetDiagonal_SeqAIJKokkos not supported on factored matrices");
1013 
1014   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1015   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1016 
1017   if (A->rmap->n && aijkok->diag_dual.extent(0) == 0) { /* Set the diagonal pointer if not already */
1018     PetscCall(MatMarkDiagonal_SeqAIJ(A));
1019     aijseq = static_cast<Mat_SeqAIJ *>(A->data);
1020     aijkok->SetDiagonal(aijseq->diag);
1021   }
1022 
1023   const auto &Aa    = aijkok->a_dual.view_device();
1024   const auto &Ai    = aijkok->i_dual.view_device();
1025   const auto &Adiag = aijkok->diag_dual.view_device();
1026 
1027   PetscCall(VecGetKokkosViewWrite(x, &xv));
1028   Kokkos::parallel_for(
1029     n, KOKKOS_LAMBDA(const PetscInt i) {
1030       if (Adiag(i) < Ai(i + 1)) xv(i) = Aa(Adiag(i));
1031       else xv(i) = 0;
1032     });
1033   PetscCall(VecRestoreKokkosViewWrite(x, &xv));
1034   PetscFunctionReturn(PETSC_SUCCESS);
1035 }
1036 
1037 /* Get a Kokkos View from a mat of type MatSeqAIJKokkos */
1038 PetscErrorCode MatSeqAIJGetKokkosView(Mat A, ConstMatScalarKokkosView *kv)
1039 {
1040   Mat_SeqAIJKokkos *aijkok;
1041 
1042   PetscFunctionBegin;
1043   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1044   PetscValidPointer(kv, 2);
1045   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1046   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1047   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1048   *kv    = aijkok->a_dual.view_device();
1049   PetscFunctionReturn(PETSC_SUCCESS);
1050 }
1051 
1052 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, ConstMatScalarKokkosView *kv)
1053 {
1054   PetscFunctionBegin;
1055   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1056   PetscValidPointer(kv, 2);
1057   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1058   PetscFunctionReturn(PETSC_SUCCESS);
1059 }
1060 
1061 PetscErrorCode MatSeqAIJGetKokkosView(Mat A, MatScalarKokkosView *kv)
1062 {
1063   Mat_SeqAIJKokkos *aijkok;
1064 
1065   PetscFunctionBegin;
1066   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1067   PetscValidPointer(kv, 2);
1068   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1069   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1070   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1071   *kv    = aijkok->a_dual.view_device();
1072   PetscFunctionReturn(PETSC_SUCCESS);
1073 }
1074 
1075 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, MatScalarKokkosView *kv)
1076 {
1077   PetscFunctionBegin;
1078   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1079   PetscValidPointer(kv, 2);
1080   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1081   PetscCall(MatSeqAIJKokkosModifyDevice(A));
1082   PetscFunctionReturn(PETSC_SUCCESS);
1083 }
1084 
1085 PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A, MatScalarKokkosView *kv)
1086 {
1087   Mat_SeqAIJKokkos *aijkok;
1088 
1089   PetscFunctionBegin;
1090   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1091   PetscValidPointer(kv, 2);
1092   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1093   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1094   *kv    = aijkok->a_dual.view_device();
1095   PetscFunctionReturn(PETSC_SUCCESS);
1096 }
1097 
1098 PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A, MatScalarKokkosView *kv)
1099 {
1100   PetscFunctionBegin;
1101   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1102   PetscValidPointer(kv, 2);
1103   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1104   PetscCall(MatSeqAIJKokkosModifyDevice(A));
1105   PetscFunctionReturn(PETSC_SUCCESS);
1106 }
1107 
1108 /* Computes Y += alpha X */
1109 static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y, PetscScalar alpha, Mat X, MatStructure pattern)
1110 {
1111   Mat_SeqAIJ              *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data;
1112   Mat_SeqAIJKokkos        *xkok, *ykok, *zkok;
1113   ConstMatScalarKokkosView Xa;
1114   MatScalarKokkosView      Ya;
1115 
1116   PetscFunctionBegin;
1117   PetscCheckTypeName(Y, MATSEQAIJKOKKOS);
1118   PetscCheckTypeName(X, MATSEQAIJKOKKOS);
1119   PetscCall(MatSeqAIJKokkosSyncDevice(Y));
1120   PetscCall(MatSeqAIJKokkosSyncDevice(X));
1121   PetscCall(PetscLogGpuTimeBegin());
1122 
1123   if (pattern != SAME_NONZERO_PATTERN && x->nz == y->nz) {
1124     PetscBool e;
1125     PetscCall(PetscArraycmp(x->i, y->i, Y->rmap->n + 1, &e));
1126     if (e) {
1127       PetscCall(PetscArraycmp(x->j, y->j, y->nz, &e));
1128       if (e) pattern = SAME_NONZERO_PATTERN;
1129     }
1130   }
1131 
1132   /* cusparseDcsrgeam2() computes C = alpha A + beta B. If one knew sparsity pattern of C, one can skip
1133     cusparseScsrgeam2_bufferSizeExt() / cusparseXcsrgeam2Nnz(), and directly call cusparseScsrgeam2().
1134     If X is SUBSET_NONZERO_PATTERN of Y, we could take advantage of this cusparse feature. However,
1135     KokkosSparse::spadd(alpha,A,beta,B,C) has symbolic and numeric phases, MatAXPY does not.
1136   */
1137   ykok = static_cast<Mat_SeqAIJKokkos *>(Y->spptr);
1138   xkok = static_cast<Mat_SeqAIJKokkos *>(X->spptr);
1139   Xa   = xkok->a_dual.view_device();
1140   Ya   = ykok->a_dual.view_device();
1141 
1142   if (pattern == SAME_NONZERO_PATTERN) {
1143     KokkosBlas::axpy(alpha, Xa, Ya);
1144     PetscCall(MatSeqAIJKokkosModifyDevice(Y));
1145   } else if (pattern == SUBSET_NONZERO_PATTERN) {
1146     MatRowMapKokkosView Xi = xkok->i_dual.view_device(), Yi = ykok->i_dual.view_device();
1147     MatColIdxKokkosView Xj = xkok->j_dual.view_device(), Yj = ykok->j_dual.view_device();
1148 
1149     Kokkos::parallel_for(
1150       Kokkos::TeamPolicy<>(Y->rmap->n, 1), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
1151         PetscInt i = t.league_rank(); // row i
1152         Kokkos::single(Kokkos::PerTeam(t), [=]() {
1153           // Only one thread works in a team
1154           PetscInt p, q = Yi(i);
1155           for (p = Xi(i); p < Xi(i + 1); p++) {          // For each nonzero on row i of X,
1156             while (Xj(p) != Yj(q) && q < Yi(i + 1)) q++; // find the matching nonzero on row i of Y.
1157             if (Xj(p) == Yj(q)) {                        // Found it
1158               Ya(q) += alpha * Xa(p);
1159               q++;
1160             } else {
1161             // If not found, it indicates the input is wrong (X is not a SUBSET_NONZERO_PATTERN of Y).
1162             // Just insert a NaN at the beginning of row i if it is not empty, to make the result wrong.
1163 #if PETSC_PKG_KOKKOS_VERSION_GE(3, 7, 0)
1164               if (Yi(i) != Yi(i + 1)) Ya(Yi(i)) = Kokkos::ArithTraits<PetscScalar>::nan();
1165 #else
1166               if (Yi(i) != Yi(i + 1)) Ya(Yi(i)) = Kokkos::Experimental::nan("1");
1167 #endif
1168             }
1169           }
1170         });
1171       });
1172     PetscCall(MatSeqAIJKokkosModifyDevice(Y));
1173   } else { // different nonzero patterns
1174     Mat             Z;
1175     KokkosCsrMatrix zcsr;
1176     KernelHandle    kh;
1177     kh.create_spadd_handle(true); // X, Y are sorted
1178     KokkosSparse::spadd_symbolic(&kh, xkok->csrmat, ykok->csrmat, zcsr);
1179     KokkosSparse::spadd_numeric(&kh, alpha, xkok->csrmat, (PetscScalar)1.0, ykok->csrmat, zcsr);
1180     zkok = new Mat_SeqAIJKokkos(zcsr);
1181     PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, zkok, &Z));
1182     PetscCall(MatHeaderReplace(Y, &Z));
1183     kh.destroy_spadd_handle();
1184   }
1185   PetscCall(PetscLogGpuTimeEnd());
1186   PetscCall(PetscLogGpuFlops(xkok->a_dual.extent(0) * 2)); // Because we scaled X and then added it to Y
1187   PetscFunctionReturn(PETSC_SUCCESS);
1188 }
1189 
1190 struct MatCOOStruct_SeqAIJKokkos {
1191   PetscCount           n;
1192   PetscCount           Atot;
1193   PetscInt             nz;
1194   PetscCountKokkosView jmap;
1195   PetscCountKokkosView perm;
1196 
1197   MatCOOStruct_SeqAIJKokkos(const MatCOOStruct_SeqAIJ *coo_h)
1198   {
1199     nz   = coo_h->nz;
1200     n    = coo_h->n;
1201     Atot = coo_h->Atot;
1202     jmap = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), PetscCountKokkosViewHost(coo_h->jmap, nz + 1));
1203     perm = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), PetscCountKokkosViewHost(coo_h->perm, Atot));
1204   }
1205 };
1206 
1207 static PetscErrorCode MatCOOStructDestroy_SeqAIJKokkos(void *data)
1208 {
1209   PetscFunctionBegin;
1210   PetscCallCXX(delete static_cast<MatCOOStruct_SeqAIJKokkos *>(data));
1211   PetscFunctionReturn(PETSC_SUCCESS);
1212 }
1213 
1214 static PetscErrorCode MatSetPreallocationCOO_SeqAIJKokkos(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
1215 {
1216   Mat_SeqAIJKokkos          *akok;
1217   Mat_SeqAIJ                *aseq;
1218   PetscContainer             container_h, container_d;
1219   MatCOOStruct_SeqAIJ       *coo_h;
1220   MatCOOStruct_SeqAIJKokkos *coo_d;
1221 
1222   PetscFunctionBegin;
1223   PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, coo_i, coo_j));
1224   aseq = static_cast<Mat_SeqAIJ *>(mat->data);
1225   akok = static_cast<Mat_SeqAIJKokkos *>(mat->spptr);
1226   delete akok;
1227   mat->spptr = akok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, aseq->nz, aseq->i, aseq->j, aseq->a, mat->nonzerostate + 1, PETSC_FALSE);
1228   PetscCall(MatZeroEntries_SeqAIJKokkos(mat));
1229 
1230   // Copy the COO struct to device
1231   PetscCall(PetscObjectQuery((PetscObject)mat, "__PETSc_MatCOOStruct_Host", (PetscObject *)&container_h));
1232   PetscCall(PetscContainerGetPointer(container_h, (void **)&coo_h));
1233   PetscCallCXX(coo_d = new MatCOOStruct_SeqAIJKokkos(coo_h));
1234 
1235   // Put the COO struct in a container and then attach that to the matrix
1236   PetscCall(PetscContainerCreate(PETSC_COMM_SELF, &container_d));
1237   PetscCall(PetscContainerSetPointer(container_d, coo_d));
1238   PetscCall(PetscContainerSetUserDestroy(container_d, MatCOOStructDestroy_SeqAIJKokkos));
1239   PetscCall(PetscObjectCompose((PetscObject)mat, "__PETSc_MatCOOStruct_Device", (PetscObject)container_d));
1240   PetscCall(PetscContainerDestroy(&container_d));
1241   PetscFunctionReturn(PETSC_SUCCESS);
1242 }
1243 
1244 static PetscErrorCode MatSetValuesCOO_SeqAIJKokkos(Mat A, const PetscScalar v[], InsertMode imode)
1245 {
1246   MatScalarKokkosView        Aa;
1247   ConstMatScalarKokkosView   kv;
1248   PetscMemType               memtype;
1249   PetscContainer             container;
1250   MatCOOStruct_SeqAIJKokkos *coo;
1251 
1252   PetscFunctionBegin;
1253   PetscCall(PetscObjectQuery((PetscObject)A, "__PETSc_MatCOOStruct_Device", (PetscObject *)&container));
1254   PetscCall(PetscContainerGetPointer(container, (void **)&coo));
1255 
1256   const auto &n    = coo->n;
1257   const auto &Annz = coo->nz;
1258   const auto &jmap = coo->jmap;
1259   const auto &perm = coo->perm;
1260 
1261   PetscCall(PetscGetMemType(v, &memtype));
1262   if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */
1263     kv = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), ConstMatScalarKokkosViewHost(v, n));
1264   } else {
1265     kv = ConstMatScalarKokkosView(v, n); /* Directly use v[]'s memory */
1266   }
1267 
1268   if (imode == INSERT_VALUES) PetscCall(MatSeqAIJGetKokkosViewWrite(A, &Aa)); /* write matrix values */
1269   else PetscCall(MatSeqAIJGetKokkosView(A, &Aa));                             /* read & write matrix values */
1270 
1271   Kokkos::parallel_for(
1272     Annz, KOKKOS_LAMBDA(const PetscCount i) {
1273       PetscScalar sum = 0.0;
1274       for (PetscCount k = jmap(i); k < jmap(i + 1); k++) sum += kv(perm(k));
1275       Aa(i) = (imode == INSERT_VALUES ? 0.0 : Aa(i)) + sum;
1276     });
1277 
1278   if (imode == INSERT_VALUES) PetscCall(MatSeqAIJRestoreKokkosViewWrite(A, &Aa));
1279   else PetscCall(MatSeqAIJRestoreKokkosView(A, &Aa));
1280   PetscFunctionReturn(PETSC_SUCCESS);
1281 }
1282 
1283 static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info)
1284 {
1285   PetscFunctionBegin;
1286   PetscCall(MatSeqAIJKokkosSyncHost(A));
1287   PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info));
1288   B->offloadmask = PETSC_OFFLOAD_CPU;
1289   PetscFunctionReturn(PETSC_SUCCESS);
1290 }
1291 
1292 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A)
1293 {
1294   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
1295 
1296   PetscFunctionBegin;
1297   A->offloadmask = PETSC_OFFLOAD_KOKKOS; /* We do not really use this flag */
1298   A->boundtocpu  = PETSC_FALSE;
1299 
1300   A->ops->assemblyend               = MatAssemblyEnd_SeqAIJKokkos;
1301   A->ops->destroy                   = MatDestroy_SeqAIJKokkos;
1302   A->ops->duplicate                 = MatDuplicate_SeqAIJKokkos;
1303   A->ops->axpy                      = MatAXPY_SeqAIJKokkos;
1304   A->ops->scale                     = MatScale_SeqAIJKokkos;
1305   A->ops->zeroentries               = MatZeroEntries_SeqAIJKokkos;
1306   A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJKokkos;
1307   A->ops->mult                      = MatMult_SeqAIJKokkos;
1308   A->ops->multadd                   = MatMultAdd_SeqAIJKokkos;
1309   A->ops->multtranspose             = MatMultTranspose_SeqAIJKokkos;
1310   A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJKokkos;
1311   A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJKokkos;
1312   A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos;
1313   A->ops->productnumeric            = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos;
1314   A->ops->transpose                 = MatTranspose_SeqAIJKokkos;
1315   A->ops->setoption                 = MatSetOption_SeqAIJKokkos;
1316   A->ops->getdiagonal               = MatGetDiagonal_SeqAIJKokkos;
1317   a->ops->getarray                  = MatSeqAIJGetArray_SeqAIJKokkos;
1318   a->ops->restorearray              = MatSeqAIJRestoreArray_SeqAIJKokkos;
1319   a->ops->getarrayread              = MatSeqAIJGetArrayRead_SeqAIJKokkos;
1320   a->ops->restorearrayread          = MatSeqAIJRestoreArrayRead_SeqAIJKokkos;
1321   a->ops->getarraywrite             = MatSeqAIJGetArrayWrite_SeqAIJKokkos;
1322   a->ops->restorearraywrite         = MatSeqAIJRestoreArrayWrite_SeqAIJKokkos;
1323   a->ops->getcsrandmemtype          = MatSeqAIJGetCSRAndMemType_SeqAIJKokkos;
1324 
1325   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJKokkos));
1326   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJKokkos));
1327   PetscFunctionReturn(PETSC_SUCCESS);
1328 }
1329 
1330 /*
1331    Extract the (prescribled) diagonal blocks of the matrix and then invert them
1332 
1333   Input Parameters:
1334 +  A       - the MATSEQAIJKOKKOS matrix
1335 .  bs      - block sizes in 'csr' format, i.e., the i-th block has size bs(i+1) - bs(i)
1336 .  bs2     - square of block sizes in 'csr' format, i.e., the i-th block should be stored at offset bs2(i) in diagVal[]
1337 .  blkMap  - map row ids to block ids, i.e., row i belongs to the block blkMap(i)
1338 -  work    - a pre-allocated work buffer (as big as diagVal) for use by this routine
1339 
1340   Output Parameter:
1341 .  diagVal - the (pre-allocated) buffer to store the inverted blocks (each block is stored in column-major order)
1342 */
1343 PETSC_INTERN PetscErrorCode MatInvertVariableBlockDiagonal_SeqAIJKokkos(Mat A, const PetscIntKokkosView &bs, const PetscIntKokkosView &bs2, const PetscIntKokkosView &blkMap, PetscScalarKokkosView &work, PetscScalarKokkosView &diagVal)
1344 {
1345   Mat_SeqAIJKokkos *akok    = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1346   PetscInt          N       = A->rmap->n;
1347   PetscInt          nblocks = bs.extent(0) - 1;
1348 
1349   PetscFunctionBegin;
1350   // Set the diagonal pointer on device if not already
1351   if (N && akok->diag_dual.extent(0) == 0) {
1352     PetscCall(MatMarkDiagonal_SeqAIJ(A));
1353     akok->SetDiagonal(static_cast<Mat_SeqAIJ *>(A->data)->diag);
1354   }
1355 
1356   PetscCall(MatSeqAIJKokkosSyncDevice(A)); // Since we'll access A's value on device
1357 
1358   // Pull out the diagonal blocks of the matrix and then invert the blocks
1359   auto Aa    = akok->a_dual.view_device();
1360   auto Ai    = akok->i_dual.view_device();
1361   auto Aj    = akok->j_dual.view_device();
1362   auto Adiag = akok->diag_dual.view_device();
1363   // TODO: how to tune the team size?
1364 #if defined(KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_HOST)
1365   auto ts = Kokkos::AUTO();
1366 #else
1367   auto ts         = 16; // improved performance 30% over Kokkos::AUTO() with CUDA, but failed with "Kokkos::abort: Requested Team Size is too large!" on CPUs
1368 #endif
1369   PetscCallCXX(Kokkos::parallel_for(
1370     Kokkos::TeamPolicy<>(nblocks, ts), KOKKOS_LAMBDA(const KokkosTeamMemberType &teamMember) {
1371       const PetscInt bid    = teamMember.league_rank();                                                   // block id
1372       const PetscInt rstart = bs(bid);                                                                    // this block starts from this row
1373       const PetscInt m      = bs(bid + 1) - bs(bid);                                                      // size of this block
1374       const auto    &B      = Kokkos::View<PetscScalar **, Kokkos::LayoutLeft>(&diagVal(bs2(bid)), m, m); // column-major order
1375       const auto    &W      = PetscScalarKokkosView(&work(bs2(bid)), m * m);
1376 
1377       Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, m), [=](const PetscInt &r) { // r-th row in B
1378         PetscInt i = rstart + r;                                                            // i-th row in A
1379 
1380         if (Ai(i) <= Adiag(i) && Adiag(i) < Ai(i + 1)) { // if the diagonal exists (common case)
1381           PetscInt first = Adiag(i) - r;                 // we start to check nonzeros from here along this row
1382 
1383           for (PetscInt c = 0; c < m; c++) {                   // walk n steps to see what column indices we will meet
1384             if (first + c < Ai(i) || first + c >= Ai(i + 1)) { // this entry (first+c) is out of range of this row, in other words, its value is zero
1385               B(r, c) = 0.0;
1386             } else if (Aj(first + c) == rstart + c) { // this entry is right on the (rstart+c) column
1387               B(r, c) = Aa(first + c);
1388             } else { // this entry does not show up in the CSR
1389               B(r, c) = 0.0;
1390             }
1391           }
1392         } else { // rare case that the diagonal does not exist
1393           const PetscInt begin = Ai(i);
1394           const PetscInt end   = Ai(i + 1);
1395           for (PetscInt c = 0; c < m; c++) B(r, c) = 0.0;
1396           for (PetscInt j = begin; j < end; j++) { // scan the whole row; could use binary search but this is a rare case so we did not.
1397             if (rstart <= Aj(j) && Aj(j) < rstart + m) B(r, Aj(j) - rstart) = Aa(j);
1398             else if (Aj(j) >= rstart + m) break;
1399           }
1400         }
1401       });
1402 
1403       // LU-decompose B (w/o pivoting) and then invert B
1404       KokkosBatched::TeamLU<KokkosTeamMemberType, KokkosBatched::Algo::LU::Unblocked>::invoke(teamMember, B, 0.0);
1405       KokkosBatched::TeamInverseLU<KokkosTeamMemberType, KokkosBatched::Algo::InverseLU::Unblocked>::invoke(teamMember, B, W);
1406     }));
1407   // PetscLogGpuFlops() is done in the caller PCSetUp_VPBJacobi_Kokkos as we don't want to compute the flops in kernels
1408   PetscFunctionReturn(PETSC_SUCCESS);
1409 }
1410 
1411 PETSC_INTERN PetscErrorCode MatSetSeqAIJKokkosWithCSRMatrix(Mat A, Mat_SeqAIJKokkos *akok)
1412 {
1413   Mat_SeqAIJ *aseq;
1414   PetscInt    i, m, n;
1415 
1416   PetscFunctionBegin;
1417   PetscCheck(!A->spptr, PETSC_COMM_SELF, PETSC_ERR_PLIB, "A->spptr is supposed to be empty");
1418 
1419   m = akok->nrows();
1420   n = akok->ncols();
1421   PetscCall(MatSetSizes(A, m, n, m, n));
1422   PetscCall(MatSetType(A, MATSEQAIJKOKKOS));
1423 
1424   /* Set up data structures of A as a MATSEQAIJ */
1425   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(A, MAT_SKIP_ALLOCATION, NULL));
1426   aseq = (Mat_SeqAIJ *)(A)->data;
1427 
1428   akok->i_dual.sync_host(); /* We always need sync'ed i, j on host */
1429   akok->j_dual.sync_host();
1430 
1431   aseq->i            = akok->i_host_data();
1432   aseq->j            = akok->j_host_data();
1433   aseq->a            = akok->a_host_data();
1434   aseq->nonew        = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/
1435   aseq->singlemalloc = PETSC_FALSE;
1436   aseq->free_a       = PETSC_FALSE;
1437   aseq->free_ij      = PETSC_FALSE;
1438   aseq->nz           = akok->nnz();
1439   aseq->maxnz        = aseq->nz;
1440 
1441   PetscCall(PetscMalloc1(m, &aseq->imax));
1442   PetscCall(PetscMalloc1(m, &aseq->ilen));
1443   for (i = 0; i < m; i++) aseq->ilen[i] = aseq->imax[i] = aseq->i[i + 1] - aseq->i[i];
1444 
1445   /* It is critical to set the nonzerostate, as we use it to check if sparsity pattern (hence data) has changed on host in MatAssemblyEnd */
1446   akok->nonzerostate = A->nonzerostate;
1447   A->spptr           = akok; /* Set A->spptr before MatAssembly so that A->spptr won't be allocated again there */
1448   PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY));
1449   PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY));
1450   PetscFunctionReturn(PETSC_SUCCESS);
1451 }
1452 
1453 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGetKokkosCsrMatrix(Mat A, KokkosCsrMatrix *csr)
1454 {
1455   PetscFunctionBegin;
1456   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1457   *csr = static_cast<Mat_SeqAIJKokkos *>(A->spptr)->csrmat;
1458   PetscFunctionReturn(PETSC_SUCCESS);
1459 }
1460 
1461 PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithKokkosCsrMatrix(MPI_Comm comm, KokkosCsrMatrix csr, Mat *A)
1462 {
1463   Mat_SeqAIJKokkos *akok;
1464   PetscFunctionBegin;
1465   PetscCallCXX(akok = new Mat_SeqAIJKokkos(csr));
1466   PetscCall(MatCreate(comm, A));
1467   PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok));
1468   PetscFunctionReturn(PETSC_SUCCESS);
1469 }
1470 
1471 /* Crete a SEQAIJKOKKOS matrix with a Mat_SeqAIJKokkos data structure
1472 
1473    Note we have names like MatSeqAIJSetPreallocationCSR, so I use capitalized CSR
1474  */
1475 PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithCSRMatrix(MPI_Comm comm, Mat_SeqAIJKokkos *akok, Mat *A)
1476 {
1477   PetscFunctionBegin;
1478   PetscCall(MatCreate(comm, A));
1479   PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok));
1480   PetscFunctionReturn(PETSC_SUCCESS);
1481 }
1482 
1483 /*@C
1484    MatCreateSeqAIJKokkos - Creates a sparse matrix in `MATSEQAIJKOKKOS` (compressed row) format
1485    (the default parallel PETSc format). This matrix will ultimately be handled by
1486    Kokkos for calculations.
1487 
1488    Collective
1489 
1490    Input Parameters:
1491 +  comm - MPI communicator, set to `PETSC_COMM_SELF`
1492 .  m - number of rows
1493 .  n - number of columns
1494 .  nz - number of nonzeros per row (same for all rows), ignored if `nnz` is provided
1495 -  nnz - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL`
1496 
1497    Output Parameter:
1498 .  A - the matrix
1499 
1500    Level: intermediate
1501 
1502    Notes:
1503    It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
1504    MatXXXXSetPreallocation() paradgm instead of this routine directly.
1505    [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]
1506 
1507    The AIJ format, also called
1508    compressed row storage, is fully compatible with standard Fortran
1509    storage.  That is, the stored row and column indices can begin at
1510    either one (as in Fortran) or zero.
1511 
1512    Specify the preallocated storage with either `nz` or `nnz` (not both).
1513    Set `nz` = `PETSC_DEFAULT` and `nnz` = `NULL` for PETSc to control dynamic memory
1514    allocation.
1515 
1516 .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MatCreateAIJ()`
1517 @*/
1518 PetscErrorCode MatCreateSeqAIJKokkos(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A)
1519 {
1520   PetscFunctionBegin;
1521   PetscCall(PetscKokkosInitializeCheck());
1522   PetscCall(MatCreate(comm, A));
1523   PetscCall(MatSetSizes(*A, m, n, m, n));
1524   PetscCall(MatSetType(*A, MATSEQAIJKOKKOS));
1525   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz));
1526   PetscFunctionReturn(PETSC_SUCCESS);
1527 }
1528 
1529 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
1530 {
1531   PetscFunctionBegin;
1532   PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
1533   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos;
1534   PetscFunctionReturn(PETSC_SUCCESS);
1535 }
1536 
1537 static PetscErrorCode MatSeqAIJKokkosSymbolicSolveCheck(Mat A)
1538 {
1539   Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1540 
1541   PetscFunctionBegin;
1542   if (!factors->sptrsv_symbolic_completed) {
1543     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khU, factors->iU_d, factors->jU_d, factors->aU_d);
1544     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khL, factors->iL_d, factors->jL_d, factors->aL_d);
1545     factors->sptrsv_symbolic_completed = PETSC_TRUE;
1546   }
1547   PetscFunctionReturn(PETSC_SUCCESS);
1548 }
1549 
1550 /* Check if we need to update factors etc for transpose solve */
1551 static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A)
1552 {
1553   Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1554   MatColIdxType               n       = A->rmap->n;
1555 
1556   PetscFunctionBegin;
1557   if (!factors->transpose_updated) { /* TODO: KK needs to provide functions to do numeric transpose only */
1558     /* Update L^T and do sptrsv symbolic */
1559     factors->iLt_d = MatRowMapKokkosView("factors->iLt_d", n + 1); // KK requires 0
1560     factors->jLt_d = MatColIdxKokkosView(NoInit("factors->jLt_d"), factors->jL_d.extent(0));
1561     factors->aLt_d = MatScalarKokkosView(NoInit("factors->aLt_d"), factors->aL_d.extent(0));
1562 
1563     transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iL_d, factors->jL_d, factors->aL_d,
1564                                                                                                                                                                                                               factors->iLt_d, factors->jLt_d, factors->aLt_d);
1565 
1566     /* TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices.
1567       We have to sort the indices, until KK provides finer control options.
1568     */
1569     sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iLt_d, factors->jLt_d, factors->aLt_d);
1570 
1571     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d);
1572 
1573     /* Update U^T and do sptrsv symbolic */
1574     factors->iUt_d = MatRowMapKokkosView("factors->iUt_d", n + 1); // KK requires 0
1575     factors->jUt_d = MatColIdxKokkosView(NoInit("factors->jUt_d"), factors->jU_d.extent(0));
1576     factors->aUt_d = MatScalarKokkosView(NoInit("factors->aUt_d"), factors->aU_d.extent(0));
1577 
1578     transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iU_d, factors->jU_d, factors->aU_d,
1579                                                                                                                                                                                                               factors->iUt_d, factors->jUt_d, factors->aUt_d);
1580 
1581     /* Sort indices. See comments above */
1582     sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iUt_d, factors->jUt_d, factors->aUt_d);
1583 
1584     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d);
1585     factors->transpose_updated = PETSC_TRUE;
1586   }
1587   PetscFunctionReturn(PETSC_SUCCESS);
1588 }
1589 
1590 /* Solve Ax = b, with A = LU */
1591 static PetscErrorCode MatSolve_SeqAIJKokkos(Mat A, Vec b, Vec x)
1592 {
1593   ConstPetscScalarKokkosView  bv;
1594   PetscScalarKokkosView       xv;
1595   Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1596 
1597   PetscFunctionBegin;
1598   PetscCall(PetscLogGpuTimeBegin());
1599   PetscCall(MatSeqAIJKokkosSymbolicSolveCheck(A));
1600   PetscCall(VecGetKokkosView(b, &bv));
1601   PetscCall(VecGetKokkosViewWrite(x, &xv));
1602   /* Solve L tmpv = b */
1603   PetscCallCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khL, factors->iL_d, factors->jL_d, factors->aL_d, bv, factors->workVector));
1604   /* Solve Ux = tmpv */
1605   PetscCallCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khU, factors->iU_d, factors->jU_d, factors->aU_d, factors->workVector, xv));
1606   PetscCall(VecRestoreKokkosView(b, &bv));
1607   PetscCall(VecRestoreKokkosViewWrite(x, &xv));
1608   PetscCall(PetscLogGpuTimeEnd());
1609   PetscFunctionReturn(PETSC_SUCCESS);
1610 }
1611 
1612 /* Solve A^T x = b, where A^T = U^T L^T */
1613 static PetscErrorCode MatSolveTranspose_SeqAIJKokkos(Mat A, Vec b, Vec x)
1614 {
1615   ConstPetscScalarKokkosView  bv;
1616   PetscScalarKokkosView       xv;
1617   Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1618 
1619   PetscFunctionBegin;
1620   PetscCall(PetscLogGpuTimeBegin());
1621   PetscCall(MatSeqAIJKokkosTransposeSolveCheck(A));
1622   PetscCall(VecGetKokkosView(b, &bv));
1623   PetscCall(VecGetKokkosViewWrite(x, &xv));
1624   /* Solve U^T tmpv = b */
1625   KokkosSparse::Experimental::sptrsv_solve(&factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d, bv, factors->workVector);
1626 
1627   /* Solve L^T x = tmpv */
1628   KokkosSparse::Experimental::sptrsv_solve(&factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d, factors->workVector, xv);
1629   PetscCall(VecRestoreKokkosView(b, &bv));
1630   PetscCall(VecRestoreKokkosViewWrite(x, &xv));
1631   PetscCall(PetscLogGpuTimeEnd());
1632   PetscFunctionReturn(PETSC_SUCCESS);
1633 }
1634 
1635 static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info)
1636 {
1637   Mat_SeqAIJKokkos           *aijkok   = (Mat_SeqAIJKokkos *)A->spptr;
1638   Mat_SeqAIJKokkosTriFactors *factors  = (Mat_SeqAIJKokkosTriFactors *)B->spptr;
1639   PetscInt                    fill_lev = info->levels;
1640 
1641   PetscFunctionBegin;
1642   PetscCall(PetscLogGpuTimeBegin());
1643   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1644 
1645   auto a_d = aijkok->a_dual.view_device();
1646   auto i_d = aijkok->i_dual.view_device();
1647   auto j_d = aijkok->j_dual.view_device();
1648 
1649   KokkosSparse::Experimental::spiluk_numeric(&factors->kh, fill_lev, i_d, j_d, a_d, factors->iL_d, factors->jL_d, factors->aL_d, factors->iU_d, factors->jU_d, factors->aU_d);
1650 
1651   B->assembled              = PETSC_TRUE;
1652   B->preallocated           = PETSC_TRUE;
1653   B->ops->solve             = MatSolve_SeqAIJKokkos;
1654   B->ops->solvetranspose    = MatSolveTranspose_SeqAIJKokkos;
1655   B->ops->matsolve          = NULL;
1656   B->ops->matsolvetranspose = NULL;
1657   B->offloadmask            = PETSC_OFFLOAD_GPU;
1658 
1659   /* Once the factors' value changed, we need to update their transpose and sptrsv handle */
1660   factors->transpose_updated         = PETSC_FALSE;
1661   factors->sptrsv_symbolic_completed = PETSC_FALSE;
1662   /* TODO: log flops, but how to know that? */
1663   PetscCall(PetscLogGpuTimeEnd());
1664   PetscFunctionReturn(PETSC_SUCCESS);
1665 }
1666 
1667 static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
1668 {
1669   Mat_SeqAIJKokkos           *aijkok;
1670   Mat_SeqAIJ                 *b;
1671   Mat_SeqAIJKokkosTriFactors *factors  = (Mat_SeqAIJKokkosTriFactors *)B->spptr;
1672   PetscInt                    fill_lev = info->levels;
1673   PetscInt                    nnzA     = ((Mat_SeqAIJ *)A->data)->nz, nnzL, nnzU;
1674   PetscInt                    n        = A->rmap->n;
1675 
1676   PetscFunctionBegin;
1677   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1678   /* Rebuild factors */
1679   if (factors) {
1680     factors->Destroy();
1681   } /* Destroy the old if it exists */
1682   else {
1683     B->spptr = factors = new Mat_SeqAIJKokkosTriFactors(n);
1684   }
1685 
1686   /* Create a spiluk handle and then do symbolic factorization */
1687   nnzL = nnzU = PetscRealIntMultTruncate(info->fill, nnzA);
1688   factors->kh.create_spiluk_handle(KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1, n, nnzL, nnzU);
1689 
1690   auto spiluk_handle = factors->kh.get_spiluk_handle();
1691 
1692   Kokkos::realloc(factors->iL_d, n + 1); /* Free old arrays and realloc */
1693   Kokkos::realloc(factors->jL_d, spiluk_handle->get_nnzL());
1694   Kokkos::realloc(factors->iU_d, n + 1);
1695   Kokkos::realloc(factors->jU_d, spiluk_handle->get_nnzU());
1696 
1697   aijkok   = (Mat_SeqAIJKokkos *)A->spptr;
1698   auto i_d = aijkok->i_dual.view_device();
1699   auto j_d = aijkok->j_dual.view_device();
1700   KokkosSparse::Experimental::spiluk_symbolic(&factors->kh, fill_lev, i_d, j_d, factors->iL_d, factors->jL_d, factors->iU_d, factors->jU_d);
1701   /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */
1702 
1703   Kokkos::resize(factors->jL_d, spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */
1704   Kokkos::resize(factors->jU_d, spiluk_handle->get_nnzU());
1705   Kokkos::realloc(factors->aL_d, spiluk_handle->get_nnzL()); /* No need to retain old value */
1706   Kokkos::realloc(factors->aU_d, spiluk_handle->get_nnzU());
1707 
1708   /* TODO: add options to select sptrsv algorithms */
1709   /* Create sptrsv handles for L, U and their transpose */
1710 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
1711   auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SPTRSV_CUSPARSE;
1712 #else
1713   auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1;
1714 #endif
1715 
1716   factors->khL.create_sptrsv_handle(sptrsv_alg, n, true /* L is lower tri */);
1717   factors->khU.create_sptrsv_handle(sptrsv_alg, n, false /* U is not lower tri */);
1718   factors->khLt.create_sptrsv_handle(sptrsv_alg, n, false /* L^T is not lower tri */);
1719   factors->khUt.create_sptrsv_handle(sptrsv_alg, n, true /* U^T is lower tri */);
1720 
1721   /* Fill fields of the factor matrix B */
1722   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(B, MAT_SKIP_ALLOCATION, NULL));
1723   b     = (Mat_SeqAIJ *)B->data;
1724   b->nz = b->maxnz          = spiluk_handle->get_nnzL() + spiluk_handle->get_nnzU();
1725   B->info.fill_ratio_given  = info->fill;
1726   B->info.fill_ratio_needed = nnzA > 0 ? ((PetscReal)b->nz) / ((PetscReal)nnzA) : 1.0;
1727 
1728   B->offloadmask          = PETSC_OFFLOAD_GPU;
1729   B->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJKokkos;
1730   PetscFunctionReturn(PETSC_SUCCESS);
1731 }
1732 
1733 static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos(Mat A, MatSolverType *type)
1734 {
1735   PetscFunctionBegin;
1736   *type = MATSOLVERKOKKOS;
1737   PetscFunctionReturn(PETSC_SUCCESS);
1738 }
1739 
1740 /*MC
1741   MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices
1742   on a single GPU of type, `MATSEQAIJKOKKOS`, `MATAIJKOKKOS`.
1743 
1744   Level: beginner
1745 
1746 .seealso: [](ch_matrices), `Mat`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJKokkos()`, `MATAIJKOKKOS`, `MatKokkosSetFormat()`, `MatKokkosStorageFormat`, `MatKokkosFormatOperation`
1747 M*/
1748 PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A, MatFactorType ftype, Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */
1749 {
1750   PetscInt n = A->rmap->n;
1751 
1752   PetscFunctionBegin;
1753   PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B));
1754   PetscCall(MatSetSizes(*B, n, n, n, n));
1755   (*B)->factortype = ftype;
1756   PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU]));
1757   PetscCall(MatSetType(*B, MATSEQAIJKOKKOS));
1758 
1759   if (ftype == MAT_FACTOR_LU) {
1760     PetscCall(MatSetBlockSizesFromMats(*B, A, A));
1761     (*B)->canuseordering        = PETSC_TRUE;
1762     (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKokkos;
1763   } else if (ftype == MAT_FACTOR_ILU) {
1764     PetscCall(MatSetBlockSizesFromMats(*B, A, A));
1765     (*B)->canuseordering         = PETSC_FALSE;
1766     (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos;
1767   } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]);
1768 
1769   PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL));
1770   PetscCall(PetscObjectComposeFunction((PetscObject)(*B), "MatFactorGetSolverType_C", MatFactorGetSolverType_SeqAIJKokkos));
1771   PetscFunctionReturn(PETSC_SUCCESS);
1772 }
1773 
1774 PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_KOKKOS(void)
1775 {
1776   PetscFunctionBegin;
1777   PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_LU, MatGetFactor_SeqAIJKokkos_Kokkos));
1778   PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_ILU, MatGetFactor_SeqAIJKokkos_Kokkos));
1779   PetscFunctionReturn(PETSC_SUCCESS);
1780 }
1781 
1782 /* Utility to print out a KokkosCsrMatrix for debugging */
1783 PETSC_INTERN PetscErrorCode PrintCsrMatrix(const KokkosCsrMatrix &csrmat)
1784 {
1785   const auto        &iv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), csrmat.graph.row_map);
1786   const auto        &jv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), csrmat.graph.entries);
1787   const auto        &av = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), csrmat.values);
1788   const PetscInt    *i  = iv.data();
1789   const PetscInt    *j  = jv.data();
1790   const PetscScalar *a  = av.data();
1791   PetscInt           m = csrmat.numRows(), n = csrmat.numCols(), nnz = csrmat.nnz();
1792 
1793   PetscFunctionBegin;
1794   PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT " x %" PetscInt_FMT " SeqAIJKokkos, with %" PetscInt_FMT " nonzeros\n", m, n, nnz));
1795   for (PetscInt k = 0; k < m; k++) {
1796     PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT ": ", k));
1797     for (PetscInt p = i[k]; p < i[k + 1]; p++) PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT "(%.1f), ", j[p], (double)PetscRealPart(a[p])));
1798     PetscCall(PetscPrintf(PETSC_COMM_SELF, "\n"));
1799   }
1800   PetscFunctionReturn(PETSC_SUCCESS);
1801 }
1802