xref: /petsc/src/mat/impls/aij/seq/kokkos/aijkok.kokkos.cxx (revision 343dd72a324e809d9716ced9816d264bfa60d2bb)
1 #include <petsc_kokkos.hpp>
2 #include <petscvec_kokkos.hpp>
3 #include <petscmat_kokkos.hpp>
4 #include <petscpkg_version.h>
5 #include <petsc/private/petscimpl.h>
6 #include <petsc/private/sfimpl.h>
7 #include <petsc/private/kokkosimpl.hpp>
8 #include <petscsys.h>
9 
10 #include <Kokkos_Core.hpp>
11 #include <KokkosBlas.hpp>
12 #include <KokkosSparse_CrsMatrix.hpp>
13 
14 // To suppress compiler warnings:
15 // /path/include/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp:434:63:
16 // warning: 'cusparseStatus_t cusparseDbsrmm(cusparseHandle_t, cusparseDirection_t, cusparseOperation_t,
17 // cusparseOperation_t, int, int, int, int, const double*, cusparseMatDescr_t, const double*, const int*, const int*,
18 // int, const double*, int, const double*, double*, int)' is deprecated: please use cusparseSpMM instead [-Wdeprecated-declarations]
19 PETSC_PRAGMA_DIAGNOSTIC_IGNORED_BEGIN("-Wdeprecated-declarations")
20 #include <KokkosSparse_spmv.hpp>
21 PETSC_PRAGMA_DIAGNOSTIC_IGNORED_END()
22 
23 #include <KokkosSparse_spiluk.hpp>
24 #include <KokkosSparse_sptrsv.hpp>
25 #include <KokkosSparse_spgemm.hpp>
26 #include <KokkosSparse_spadd.hpp>
27 #include <KokkosBatched_LU_Decl.hpp>
28 #include <KokkosBatched_InverseLU_Decl.hpp>
29 
30 #include <../src/mat/impls/aij/seq/kokkos/aijkok.hpp>
31 
32 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_GE(3, 7, 0)
33   #include <KokkosSparse_Utils.hpp>
34 using KokkosSparse::sort_crs_matrix;
35 using KokkosSparse::Impl::transpose_matrix;
36 #else
37   #include <KokkosKernels_Sorting.hpp>
38 using KokkosKernels::sort_crs_matrix;
39 using KokkosKernels::Impl::transpose_matrix;
40 #endif
41 
42 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_GE(4, 6, 0)
43 using KokkosSparse::spiluk_symbolic;
44 using KokkosSparse::spiluk_numeric;
45 using KokkosSparse::sptrsv_symbolic;
46 using KokkosSparse::sptrsv_solve;
47 using KokkosSparse::Experimental::SPTRSVAlgorithm;
48 using KokkosSparse::Experimental::SPILUKAlgorithm;
49 #else
50 using KokkosSparse::Experimental::spiluk_symbolic;
51 using KokkosSparse::Experimental::spiluk_numeric;
52 using KokkosSparse::Experimental::sptrsv_symbolic;
53 using KokkosSparse::Experimental::sptrsv_solve;
54 using KokkosSparse::Experimental::SPTRSVAlgorithm;
55 using KokkosSparse::Experimental::SPILUKAlgorithm;
56 #endif
57 
58 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */
59 
60 /* MatAssemblyEnd_SeqAIJKokkos() happens when we finalized nonzeros of the matrix, either after
61    we assembled the matrix on host, or after we directly produced the matrix data on device (ex., through MatMatMult).
62    In the latter case, it is important to set a_dual's sync state correctly.
63  */
64 static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A, MatAssemblyType mode)
65 {
66   Mat_SeqAIJ       *aijseq;
67   Mat_SeqAIJKokkos *aijkok;
68 
69   PetscFunctionBegin;
70   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS);
71   PetscCall(MatAssemblyEnd_SeqAIJ(A, mode));
72 
73   aijseq = static_cast<Mat_SeqAIJ *>(A->data);
74   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
75 
76   /* If aijkok does not exist, we just copy i, j to device.
77      If aijkok already exists, but the device's nonzero pattern does not match with the host's, we assume the latest data is on host.
78      In both cases, we build a new aijkok structure.
79   */
80   if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { /* aijkok might not exist yet or nonzero pattern has changed */
81     if (aijkok && aijkok->host_aij_allocated_by_kokkos) {   /* Avoid accidently freeing much needed a,i,j on host when deleting aijkok */
82       PetscCall(PetscShmgetAllocateArray(aijkok->nrows() + 1, sizeof(PetscInt), (void **)&aijseq->i));
83       PetscCall(PetscShmgetAllocateArray(aijkok->nnz(), sizeof(PetscInt), (void **)&aijseq->j));
84       PetscCall(PetscShmgetAllocateArray(aijkok->nnz(), sizeof(PetscInt), (void **)&aijseq->a));
85       PetscCall(PetscArraycpy(aijseq->i, aijkok->i_host_data(), aijkok->nrows() + 1));
86       PetscCall(PetscArraycpy(aijseq->j, aijkok->j_host_data(), aijkok->nnz()));
87       PetscCall(PetscArraycpy(aijseq->a, aijkok->a_host_data(), aijkok->nnz()));
88       aijseq->free_a  = PETSC_TRUE;
89       aijseq->free_ij = PETSC_TRUE;
90       /* This arises from MatCreateSeqAIJKokkosWithKokkosCsrMatrix() used in MatMatMult, where
91          we have the CsrMatrix on device first and then copy to host, followed by
92          MatSetMPIAIJWithSplitSeqAIJ() with garray = NULL.
93          One could improve it by not using NULL garray.
94       */
95     }
96     delete aijkok;
97     aijkok   = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aijseq, A->nonzerostate, PETSC_FALSE /*don't copy mat values to device*/);
98     A->spptr = aijkok;
99   } else if (A->rmap->n && aijkok->diag_dual.extent(0) == 0) { // MatProduct might directly produce AIJ on device, but not the diag.
100     MatRowMapKokkosViewHost diag_h(aijseq->diag, A->rmap->n);
101     auto                    diag_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), diag_h);
102     aijkok->diag_dual              = MatRowMapKokkosDualView(diag_d, diag_h);
103   }
104   PetscFunctionReturn(PETSC_SUCCESS);
105 }
106 
107 /* Sync CSR data to device if not yet */
108 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A)
109 {
110   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
111 
112   PetscFunctionBegin;
113   PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Can't sync factorized matrix from host to device");
114   PetscCheck(aijkok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
115   if (aijkok->a_dual.need_sync_device()) {
116     PetscCall(KokkosDualViewSyncDevice(aijkok->a_dual, PetscGetKokkosExecutionSpace()));
117     aijkok->transpose_updated = PETSC_FALSE; /* values of the transpose is out-of-date */
118     aijkok->hermitian_updated = PETSC_FALSE;
119   }
120   PetscFunctionReturn(PETSC_SUCCESS);
121 }
122 
123 /* Mark the CSR data on device as modified */
124 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosModifyDevice(Mat A)
125 {
126   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
127 
128   PetscFunctionBegin;
129   PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Not supported for factorized matries");
130   aijkok->a_dual.clear_sync_state();
131   aijkok->a_dual.modify_device();
132   aijkok->transpose_updated = PETSC_FALSE;
133   aijkok->hermitian_updated = PETSC_FALSE;
134   PetscCall(MatSeqAIJInvalidateDiagonal(A));
135   PetscCall(PetscObjectStateIncrease((PetscObject)A));
136   PetscFunctionReturn(PETSC_SUCCESS);
137 }
138 
139 static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A)
140 {
141   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
142   auto              exec   = PetscGetKokkosExecutionSpace();
143 
144   PetscFunctionBegin;
145   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
146   /* We do not expect one needs factors on host  */
147   PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Can't sync factorized matrix from device to host");
148   PetscCheck(aijkok, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing AIJKOK");
149   PetscCall(KokkosDualViewSyncHost(aijkok->a_dual, exec));
150   PetscFunctionReturn(PETSC_SUCCESS);
151 }
152 
153 static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A, PetscScalar *array[])
154 {
155   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
156 
157   PetscFunctionBegin;
158   /* aijkok contains valid pointers only if the host's nonzerostate matches with the device's.
159     Calling MatSeqAIJSetPreallocation() or MatSetValues() on host, where aijseq->{i,j,a} might be
160     reallocated, will lead to stale {i,j,a}_dual in aijkok. In both operations, the hosts's nonzerostate
161     must have been updated. The stale aijkok will be rebuilt during MatAssemblyEnd.
162   */
163   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
164     PetscCall(KokkosDualViewSyncHost(aijkok->a_dual, PetscGetKokkosExecutionSpace()));
165     *array = aijkok->a_dual.view_host().data();
166   } else { /* Happens when calling MatSetValues on a newly created matrix */
167     *array = static_cast<Mat_SeqAIJ *>(A->data)->a;
168   }
169   PetscFunctionReturn(PETSC_SUCCESS);
170 }
171 
172 static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJKokkos(Mat A, PetscScalar *array[])
173 {
174   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
175 
176   PetscFunctionBegin;
177   if (aijkok && A->nonzerostate == aijkok->nonzerostate) aijkok->a_dual.modify_host();
178   PetscFunctionReturn(PETSC_SUCCESS);
179 }
180 
181 static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[])
182 {
183   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
184 
185   PetscFunctionBegin;
186   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
187     PetscCall(KokkosDualViewSyncHost(aijkok->a_dual, PetscGetKokkosExecutionSpace()));
188     *array = aijkok->a_dual.view_host().data();
189   } else {
190     *array = static_cast<Mat_SeqAIJ *>(A->data)->a;
191   }
192   PetscFunctionReturn(PETSC_SUCCESS);
193 }
194 
195 static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[])
196 {
197   PetscFunctionBegin;
198   *array = NULL;
199   PetscFunctionReturn(PETSC_SUCCESS);
200 }
201 
202 static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[])
203 {
204   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
205 
206   PetscFunctionBegin;
207   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
208     *array = aijkok->a_dual.view_host().data();
209   } else { /* Ex. happens with MatZeroEntries on a preallocated but not assembled matrix */
210     *array = static_cast<Mat_SeqAIJ *>(A->data)->a;
211   }
212   PetscFunctionReturn(PETSC_SUCCESS);
213 }
214 
215 static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[])
216 {
217   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
218 
219   PetscFunctionBegin;
220   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
221     aijkok->a_dual.clear_sync_state();
222     aijkok->a_dual.modify_host();
223   }
224   PetscFunctionReturn(PETSC_SUCCESS);
225 }
226 
227 static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJKokkos(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype)
228 {
229   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
230 
231   PetscFunctionBegin;
232   PetscCheck(aijkok != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "aijkok is NULL");
233 
234   if (i) *i = aijkok->i_device_data();
235   if (j) *j = aijkok->j_device_data();
236   if (a) {
237     PetscCall(KokkosDualViewSyncDevice(aijkok->a_dual, PetscGetKokkosExecutionSpace()));
238     *a = aijkok->a_device_data();
239   }
240   if (mtype) *mtype = PETSC_MEMTYPE_KOKKOS;
241   PetscFunctionReturn(PETSC_SUCCESS);
242 }
243 
244 /*
245   Generate the sparsity pattern of a MatSeqAIJKokkos matrix's transpose on device.
246 
247   Input Parameter:
248 .  A       - the MATSEQAIJKOKKOS matrix
249 
250   Output Parameters:
251 +  perm_d - the permutation array on device, which connects Ta(i) = Aa(perm(i))
252 -  T_d    - the transpose on device, whose value array is allocated but not initialized
253 */
254 static PetscErrorCode MatSeqAIJKokkosGenerateTransposeStructure(Mat A, MatRowMapKokkosView &perm_d, KokkosCsrMatrix &T_d)
255 {
256   Mat_SeqAIJ             *aseq = static_cast<Mat_SeqAIJ *>(A->data);
257   PetscInt                nz = aseq->nz, m = A->rmap->N, n = A->cmap->n;
258   const PetscInt         *Ai = aseq->i, *Aj = aseq->j;
259   MatRowMapKokkosViewHost Ti_h(NoInit("Ti"), n + 1);
260   MatRowMapType          *Ti = Ti_h.data();
261   MatColIdxKokkosViewHost Tj_h(NoInit("Tj"), nz);
262   MatRowMapKokkosViewHost perm_h(NoInit("permutation"), nz);
263   PetscInt               *Tj   = Tj_h.data();
264   PetscInt               *perm = perm_h.data();
265   PetscInt               *offset;
266 
267   PetscFunctionBegin;
268   // Populate Ti
269   PetscCallCXX(Kokkos::deep_copy(Ti_h, 0));
270   Ti++;
271   for (PetscInt i = 0; i < nz; i++) Ti[Aj[i]]++;
272   Ti--;
273   for (PetscInt i = 0; i < n; i++) Ti[i + 1] += Ti[i];
274 
275   // Populate Tj and the permutation array
276   PetscCall(PetscCalloc1(n, &offset)); // offset in each T row to fill in its column indices
277   for (PetscInt i = 0; i < m; i++) {
278     for (PetscInt j = Ai[i]; j < Ai[i + 1]; j++) { // A's (i,j) is T's (j,i)
279       PetscInt r    = Aj[j];                       // row r of T
280       PetscInt disp = Ti[r] + offset[r];
281 
282       Tj[disp]   = i; // col i of T
283       perm[disp] = j;
284       offset[r]++;
285     }
286   }
287   PetscCall(PetscFree(offset));
288 
289   // Sort each row of T, along with the permutation array
290   for (PetscInt i = 0; i < n; i++) PetscCall(PetscSortIntWithArray(Ti[i + 1] - Ti[i], Tj + Ti[i], perm + Ti[i]));
291 
292   // Output perm and T on device
293   auto Ti_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), Ti_h);
294   auto Tj_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), Tj_h);
295   PetscCallCXX(T_d = KokkosCsrMatrix("csrmatT", n, m, nz, MatScalarKokkosView("Ta", nz), Ti_d, Tj_d));
296   PetscCallCXX(perm_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), perm_h));
297   PetscFunctionReturn(PETSC_SUCCESS);
298 }
299 
300 // Generate the transpose on device and cache it internally
301 // Note: KK transpose_matrix() does not have support symbolic/numeric transpose, so we do it on our own
302 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGenerateTranspose_Private(Mat A, KokkosCsrMatrix *csrmatT)
303 {
304   Mat_SeqAIJ       *aseq = static_cast<Mat_SeqAIJ *>(A->data);
305   Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
306   PetscInt          nz = aseq->nz, m = A->rmap->N, n = A->cmap->n;
307   KokkosCsrMatrix  &T = akok->csrmatT;
308 
309   PetscFunctionBegin;
310   PetscCheck(akok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
311   PetscCall(KokkosDualViewSyncDevice(akok->a_dual, PetscGetKokkosExecutionSpace())); // Sync A's values since we are going to access them on device
312 
313   const auto &Aa = akok->a_dual.view_device();
314 
315   if (A->symmetric == PETSC_BOOL3_TRUE) {
316     *csrmatT = akok->csrmat;
317   } else {
318     // See if we already have a cached transpose and its value is up to date
319     if (T.numRows() == n && T.numCols() == m) {  // this indicates csrmatT had been generated before, otherwise T has 0 rows/cols after construction
320       if (!akok->transpose_updated) {            // if the value is out of date, update the cached version
321         const auto &perm = akok->transpose_perm; // get the permutation array
322         auto       &Ta   = T.values;
323 
324         PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { Ta(i) = Aa(perm(i)); }));
325       }
326     } else { // Generate T of size n x m for the first time
327       MatRowMapKokkosView perm;
328 
329       PetscCall(MatSeqAIJKokkosGenerateTransposeStructure(A, perm, T));
330       akok->transpose_perm = perm; // cache the perm in this matrix for reuse
331       PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { T.values(i) = Aa(perm(i)); }));
332     }
333     akok->transpose_updated = PETSC_TRUE;
334     *csrmatT                = akok->csrmatT;
335   }
336   PetscFunctionReturn(PETSC_SUCCESS);
337 }
338 
339 // Generate the Hermitian on device and cache it internally
340 static PetscErrorCode MatSeqAIJKokkosGenerateHermitian_Private(Mat A, KokkosCsrMatrix *csrmatH)
341 {
342   Mat_SeqAIJ       *aseq = static_cast<Mat_SeqAIJ *>(A->data);
343   Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
344   PetscInt          nz = aseq->nz, m = A->rmap->N, n = A->cmap->n;
345   KokkosCsrMatrix  &T = akok->csrmatH;
346 
347   PetscFunctionBegin;
348   PetscCheck(akok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
349   PetscCall(KokkosDualViewSyncDevice(akok->a_dual, PetscGetKokkosExecutionSpace())); // Sync A's values since we are going to access them on device
350 
351   const auto &Aa = akok->a_dual.view_device();
352 
353   if (A->hermitian == PETSC_BOOL3_TRUE) {
354     *csrmatH = akok->csrmat;
355   } else {
356     // See if we already have a cached hermitian and its value is up to date
357     if (T.numRows() == n && T.numCols() == m) {  // this indicates csrmatT had been generated before, otherwise T has 0 rows/cols after construction
358       if (!akok->hermitian_updated) {            // if the value is out of date, update the cached version
359         const auto &perm = akok->transpose_perm; // get the permutation array
360         auto       &Ta   = T.values;
361 
362         PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { Ta(i) = PetscConj(Aa(perm(i))); }));
363       }
364     } else { // Generate T of size n x m for the first time
365       MatRowMapKokkosView perm;
366 
367       PetscCall(MatSeqAIJKokkosGenerateTransposeStructure(A, perm, T));
368       akok->transpose_perm = perm; // cache the perm in this matrix for reuse
369       PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { T.values(i) = PetscConj(Aa(perm(i))); }));
370     }
371     akok->hermitian_updated = PETSC_TRUE;
372     *csrmatH                = akok->csrmatH;
373   }
374   PetscFunctionReturn(PETSC_SUCCESS);
375 }
376 
377 /* y = A x */
378 static PetscErrorCode MatMult_SeqAIJKokkos(Mat A, Vec xx, Vec yy)
379 {
380   Mat_SeqAIJKokkos          *aijkok;
381   ConstPetscScalarKokkosView xv;
382   PetscScalarKokkosView      yv;
383 
384   PetscFunctionBegin;
385   PetscCall(PetscLogGpuTimeBegin());
386   PetscCall(MatSeqAIJKokkosSyncDevice(A));
387   PetscCall(VecGetKokkosView(xx, &xv));
388   PetscCall(VecGetKokkosViewWrite(yy, &yv));
389   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
390   PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), "N", 1.0 /*alpha*/, aijkok->csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A x + beta y */
391   PetscCall(VecRestoreKokkosView(xx, &xv));
392   PetscCall(VecRestoreKokkosViewWrite(yy, &yv));
393   /* 2.0*nnz - numRows seems more accurate here but assumes there are no zero-rows. So a little sloppy here. */
394   PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz()));
395   PetscCall(PetscLogGpuTimeEnd());
396   PetscFunctionReturn(PETSC_SUCCESS);
397 }
398 
399 /* y = A^T x */
400 static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy)
401 {
402   Mat_SeqAIJKokkos          *aijkok;
403   const char                *mode;
404   ConstPetscScalarKokkosView xv;
405   PetscScalarKokkosView      yv;
406   KokkosCsrMatrix            csrmat;
407 
408   PetscFunctionBegin;
409   PetscCall(PetscLogGpuTimeBegin());
410   PetscCall(MatSeqAIJKokkosSyncDevice(A));
411   PetscCall(VecGetKokkosView(xx, &xv));
412   PetscCall(VecGetKokkosViewWrite(yy, &yv));
413   if (A->form_explicit_transpose) {
414     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat));
415     mode = "N";
416   } else {
417     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
418     csrmat = aijkok->csrmat;
419     mode   = "T";
420   }
421   PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^T x + beta y */
422   PetscCall(VecRestoreKokkosView(xx, &xv));
423   PetscCall(VecRestoreKokkosViewWrite(yy, &yv));
424   PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
425   PetscCall(PetscLogGpuTimeEnd());
426   PetscFunctionReturn(PETSC_SUCCESS);
427 }
428 
429 /* y = A^H x */
430 static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy)
431 {
432   Mat_SeqAIJKokkos          *aijkok;
433   const char                *mode;
434   ConstPetscScalarKokkosView xv;
435   PetscScalarKokkosView      yv;
436   KokkosCsrMatrix            csrmat;
437 
438   PetscFunctionBegin;
439   PetscCall(PetscLogGpuTimeBegin());
440   PetscCall(MatSeqAIJKokkosSyncDevice(A));
441   PetscCall(VecGetKokkosView(xx, &xv));
442   PetscCall(VecGetKokkosViewWrite(yy, &yv));
443   if (A->form_explicit_transpose) {
444     PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat));
445     mode = "N";
446   } else {
447     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
448     csrmat = aijkok->csrmat;
449     mode   = "C";
450   }
451   PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^H x + beta y */
452   PetscCall(VecRestoreKokkosView(xx, &xv));
453   PetscCall(VecRestoreKokkosViewWrite(yy, &yv));
454   PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
455   PetscCall(PetscLogGpuTimeEnd());
456   PetscFunctionReturn(PETSC_SUCCESS);
457 }
458 
459 /* z = A x + y */
460 static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz)
461 {
462   Mat_SeqAIJKokkos          *aijkok;
463   ConstPetscScalarKokkosView xv;
464   PetscScalarKokkosView      zv;
465 
466   PetscFunctionBegin;
467   PetscCall(PetscLogGpuTimeBegin());
468   PetscCall(MatSeqAIJKokkosSyncDevice(A));
469   if (zz != yy) PetscCall(VecCopy(yy, zz)); // depending on yy's sync flags, zz might get its latest data on host
470   PetscCall(VecGetKokkosView(xx, &xv));
471   PetscCall(VecGetKokkosView(zz, &zv)); // do after VecCopy(yy, zz) to get the latest data on device
472   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
473   PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), "N", 1.0 /*alpha*/, aijkok->csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A x + beta z */
474   PetscCall(VecRestoreKokkosView(xx, &xv));
475   PetscCall(VecRestoreKokkosView(zz, &zv));
476   PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz()));
477   PetscCall(PetscLogGpuTimeEnd());
478   PetscFunctionReturn(PETSC_SUCCESS);
479 }
480 
481 /* z = A^T x + y */
482 static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz)
483 {
484   Mat_SeqAIJKokkos          *aijkok;
485   const char                *mode;
486   ConstPetscScalarKokkosView xv;
487   PetscScalarKokkosView      zv;
488   KokkosCsrMatrix            csrmat;
489 
490   PetscFunctionBegin;
491   PetscCall(PetscLogGpuTimeBegin());
492   PetscCall(MatSeqAIJKokkosSyncDevice(A));
493   if (zz != yy) PetscCall(VecCopy(yy, zz));
494   PetscCall(VecGetKokkosView(xx, &xv));
495   PetscCall(VecGetKokkosView(zz, &zv));
496   if (A->form_explicit_transpose) {
497     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat));
498     mode = "N";
499   } else {
500     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
501     csrmat = aijkok->csrmat;
502     mode   = "T";
503   }
504   PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^T x + beta z */
505   PetscCall(VecRestoreKokkosView(xx, &xv));
506   PetscCall(VecRestoreKokkosView(zz, &zv));
507   PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
508   PetscCall(PetscLogGpuTimeEnd());
509   PetscFunctionReturn(PETSC_SUCCESS);
510 }
511 
512 /* z = A^H x + y */
513 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz)
514 {
515   Mat_SeqAIJKokkos          *aijkok;
516   const char                *mode;
517   ConstPetscScalarKokkosView xv;
518   PetscScalarKokkosView      zv;
519   KokkosCsrMatrix            csrmat;
520 
521   PetscFunctionBegin;
522   PetscCall(PetscLogGpuTimeBegin());
523   PetscCall(MatSeqAIJKokkosSyncDevice(A));
524   if (zz != yy) PetscCall(VecCopy(yy, zz));
525   PetscCall(VecGetKokkosView(xx, &xv));
526   PetscCall(VecGetKokkosView(zz, &zv));
527   if (A->form_explicit_transpose) {
528     PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat));
529     mode = "N";
530   } else {
531     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
532     csrmat = aijkok->csrmat;
533     mode   = "C";
534   }
535   PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^H x + beta z */
536   PetscCall(VecRestoreKokkosView(xx, &xv));
537   PetscCall(VecRestoreKokkosView(zz, &zv));
538   PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
539   PetscCall(PetscLogGpuTimeEnd());
540   PetscFunctionReturn(PETSC_SUCCESS);
541 }
542 
543 static PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A, MatOption op, PetscBool flg)
544 {
545   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
546 
547   PetscFunctionBegin;
548   switch (op) {
549   case MAT_FORM_EXPLICIT_TRANSPOSE:
550     /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
551     if (A->form_explicit_transpose && !flg && aijkok) PetscCall(aijkok->DestroyMatTranspose());
552     A->form_explicit_transpose = flg;
553     break;
554   default:
555     PetscCall(MatSetOption_SeqAIJ(A, op, flg));
556     break;
557   }
558   PetscFunctionReturn(PETSC_SUCCESS);
559 }
560 
561 /* Depending on reuse, either build a new mat, or use the existing mat */
562 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat *newmat)
563 {
564   Mat_SeqAIJ *aseq;
565 
566   PetscFunctionBegin;
567   PetscCall(PetscKokkosInitializeCheck());
568   if (reuse == MAT_INITIAL_MATRIX) { /* Build a brand new mat */
569     PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat));
570     PetscCall(MatSetType(*newmat, mtype));
571   } else if (reuse == MAT_REUSE_MATRIX) {                 /* Reuse the mat created before */
572     PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN)); /* newmat is already a SeqAIJKokkos */
573   } else if (reuse == MAT_INPLACE_MATRIX) {               /* newmat is A */
574     PetscCheck(A == *newmat, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "A != *newmat with MAT_INPLACE_MATRIX");
575     PetscCall(PetscFree(A->defaultvectype));
576     PetscCall(PetscStrallocpy(VECKOKKOS, &A->defaultvectype)); /* Allocate and copy the string */
577     PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATSEQAIJKOKKOS));
578     PetscCall(MatSetOps_SeqAIJKokkos(A));
579     aseq = static_cast<Mat_SeqAIJ *>(A->data);
580     if (A->assembled) { /* Copy i, j (but not values) to device for an assembled matrix if not yet */
581       PetscCheck(!A->spptr, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Expect NULL (Mat_SeqAIJKokkos*)A->spptr");
582       A->spptr = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aseq, A->nonzerostate, PETSC_FALSE);
583     }
584   }
585   PetscFunctionReturn(PETSC_SUCCESS);
586 }
587 
588 /* MatDuplicate always creates a new matrix. MatDuplicate can be called either on an assembled matrix or
589    an unassembled matrix, even though MAT_COPY_VALUES is not allowed for unassembled matrix.
590  */
591 static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A, MatDuplicateOption dupOption, Mat *B)
592 {
593   Mat_SeqAIJ       *bseq;
594   Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr), *bkok;
595   Mat               mat;
596 
597   PetscFunctionBegin;
598   /* Do not copy values on host as A's latest values might be on device. We don't want to do sync blindly */
599   PetscCall(MatDuplicate_SeqAIJ(A, MAT_DO_NOT_COPY_VALUES, B));
600   mat = *B;
601   if (A->assembled) {
602     bseq = static_cast<Mat_SeqAIJ *>(mat->data);
603     bkok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, bseq, mat->nonzerostate, PETSC_FALSE);
604     bkok->a_dual.clear_sync_state(); /* Clear B's sync state as it will be decided below */
605     /* Now copy values to B if needed */
606     if (dupOption == MAT_COPY_VALUES) {
607       if (akok->a_dual.need_sync_device()) {
608         Kokkos::deep_copy(bkok->a_dual.view_host(), akok->a_dual.view_host());
609         bkok->a_dual.modify_host();
610       } else { /* If device has the latest data, we only copy data on device */
611         Kokkos::deep_copy(bkok->a_dual.view_device(), akok->a_dual.view_device());
612         bkok->a_dual.modify_device();
613       }
614     } else { /* MAT_DO_NOT_COPY_VALUES or MAT_SHARE_NONZERO_PATTERN. B's values should be zeroed */
615       /* B's values on host should be already zeroed by MatDuplicate_SeqAIJ() */
616       bkok->a_dual.modify_host();
617     }
618     mat->spptr = bkok;
619   }
620 
621   PetscCall(PetscFree(mat->defaultvectype));
622   PetscCall(PetscStrallocpy(VECKOKKOS, &mat->defaultvectype)); /* Allocate and copy the string */
623   PetscCall(PetscObjectChangeTypeName((PetscObject)mat, MATSEQAIJKOKKOS));
624   PetscCall(MatSetOps_SeqAIJKokkos(mat));
625   PetscFunctionReturn(PETSC_SUCCESS);
626 }
627 
628 static PetscErrorCode MatTranspose_SeqAIJKokkos(Mat A, MatReuse reuse, Mat *B)
629 {
630   Mat               At;
631   KokkosCsrMatrix   internT;
632   Mat_SeqAIJKokkos *atkok, *bkok;
633 
634   PetscFunctionBegin;
635   if (reuse == MAT_REUSE_MATRIX) PetscCall(MatTransposeCheckNonzeroState_Private(A, *B));
636   PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &internT)); /* Generate a transpose internally */
637   if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_INPLACE_MATRIX) {
638     /* Deep copy internT, as we want to isolate the internal transpose */
639     PetscCallCXX(atkok = new Mat_SeqAIJKokkos(KokkosCsrMatrix("csrmat", internT)));
640     PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PetscObjectComm((PetscObject)A), atkok, &At));
641     if (reuse == MAT_INITIAL_MATRIX) *B = At;
642     else PetscCall(MatHeaderReplace(A, &At)); /* Replace A with At inplace */
643   } else {                                    /* MAT_REUSE_MATRIX, just need to copy values to B on device */
644     if ((*B)->assembled) {
645       bkok = static_cast<Mat_SeqAIJKokkos *>((*B)->spptr);
646       PetscCallCXX(Kokkos::deep_copy(bkok->a_dual.view_device(), internT.values));
647       PetscCall(MatSeqAIJKokkosModifyDevice(*B));
648     } else if ((*B)->preallocated) { /* It is ok for B to be only preallocated, as needed in MatTranspose_MPIAIJ */
649       Mat_SeqAIJ             *bseq = static_cast<Mat_SeqAIJ *>((*B)->data);
650       MatScalarKokkosViewHost a_h(bseq->a, internT.nnz()); /* bseq->nz = 0 if unassembled */
651       MatColIdxKokkosViewHost j_h(bseq->j, internT.nnz());
652       PetscCallCXX(Kokkos::deep_copy(a_h, internT.values));
653       PetscCallCXX(Kokkos::deep_copy(j_h, internT.graph.entries));
654     } else SETERRQ(PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "B must be assembled or preallocated");
655   }
656   PetscFunctionReturn(PETSC_SUCCESS);
657 }
658 
659 static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A)
660 {
661   Mat_SeqAIJKokkos *aijkok;
662 
663   PetscFunctionBegin;
664   if (A->factortype == MAT_FACTOR_NONE) {
665     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
666     delete aijkok;
667   } else {
668     delete static_cast<Mat_SeqAIJKokkosTriFactors *>(A->spptr);
669   }
670   A->spptr = NULL;
671   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatFactorGetSolverType_C", NULL));
672   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
673   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
674 #if defined(PETSC_HAVE_HYPRE)
675   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijkokkos_hypre_C", NULL));
676 #endif
677   PetscCall(MatDestroy_SeqAIJ(A));
678   PetscFunctionReturn(PETSC_SUCCESS);
679 }
680 
681 /*MC
682    MATSEQAIJKOKKOS - MATAIJKOKKOS = "(seq)aijkokkos" - A matrix type to be used for sparse matrices with Kokkos
683 
684    A matrix type using Kokkos-Kernels CrsMatrix type for portability across different device types
685 
686    Options Database Key:
687 .  -mat_type aijkokkos - sets the matrix type to `MATSEQAIJKOKKOS` during a call to `MatSetFromOptions()`
688 
689   Level: beginner
690 
691 .seealso: [](ch_matrices), `Mat`, `MatCreateSeqAIJKokkos()`, `MATMPIAIJKOKKOS`
692 M*/
693 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A)
694 {
695   PetscFunctionBegin;
696   PetscCall(PetscKokkosInitializeCheck());
697   PetscCall(MatCreate_SeqAIJ(A));
698   PetscCall(MatConvert_SeqAIJ_SeqAIJKokkos(A, MATSEQAIJKOKKOS, MAT_INPLACE_MATRIX, &A));
699   PetscFunctionReturn(PETSC_SUCCESS);
700 }
701 
702 /* Merge A, B into a matrix C. A is put before B. C's size would be A->rmap->n by (A->cmap->n + B->cmap->n) */
703 PetscErrorCode MatSeqAIJKokkosMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C)
704 {
705   Mat_SeqAIJ         *a, *b;
706   Mat_SeqAIJKokkos   *akok, *bkok, *ckok;
707   MatScalarKokkosView aa, ba, ca;
708   MatRowMapKokkosView ai, bi, ci;
709   MatColIdxKokkosView aj, bj, cj;
710   PetscInt            m, n, nnz, aN;
711 
712   PetscFunctionBegin;
713   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
714   PetscValidHeaderSpecific(B, MAT_CLASSID, 2);
715   PetscAssertPointer(C, 4);
716   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
717   PetscCheckTypeName(B, MATSEQAIJKOKKOS);
718   PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n);
719   PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported");
720 
721   PetscCall(MatSeqAIJKokkosSyncDevice(A));
722   PetscCall(MatSeqAIJKokkosSyncDevice(B));
723   a    = static_cast<Mat_SeqAIJ *>(A->data);
724   b    = static_cast<Mat_SeqAIJ *>(B->data);
725   akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
726   bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
727   aa   = akok->a_dual.view_device();
728   ai   = akok->i_dual.view_device();
729   ba   = bkok->a_dual.view_device();
730   bi   = bkok->i_dual.view_device();
731   m    = A->rmap->n; /* M, N and nnz of C */
732   n    = A->cmap->n + B->cmap->n;
733   nnz  = a->nz + b->nz;
734   aN   = A->cmap->n; /* N of A */
735   if (reuse == MAT_INITIAL_MATRIX) {
736     aj           = akok->j_dual.view_device();
737     bj           = bkok->j_dual.view_device();
738     auto ca_dual = MatScalarKokkosDualView("a", aa.extent(0) + ba.extent(0));
739     auto ci_dual = MatRowMapKokkosDualView("i", ai.extent(0));
740     auto cj_dual = MatColIdxKokkosDualView("j", aj.extent(0) + bj.extent(0));
741     ca           = ca_dual.view_device();
742     ci           = ci_dual.view_device();
743     cj           = cj_dual.view_device();
744 
745     /* Concatenate A and B in parallel using Kokkos hierarchical parallelism */
746     Kokkos::parallel_for(
747       Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
748         PetscInt i       = t.league_rank(); /* row i */
749         PetscInt coffset = ai(i) + bi(i), alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i);
750 
751         Kokkos::single(Kokkos::PerTeam(t), [=]() { /* this side effect only happens once per whole team */
752                                                    ci(i) = coffset;
753                                                    if (i == m - 1) ci(m) = ai(m) + bi(m);
754         });
755 
756         Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) {
757           if (k < alen) {
758             ca(coffset + k) = aa(ai(i) + k);
759             cj(coffset + k) = aj(ai(i) + k);
760           } else {
761             ca(coffset + k) = ba(bi(i) + k - alen);
762             cj(coffset + k) = bj(bi(i) + k - alen) + aN; /* Entries in B get new column indices in C */
763           }
764         });
765       });
766     ca_dual.modify_device();
767     ci_dual.modify_device();
768     cj_dual.modify_device();
769     PetscCallCXX(ckok = new Mat_SeqAIJKokkos(m, n, nnz, ci_dual, cj_dual, ca_dual));
770     PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, ckok, C));
771   } else if (reuse == MAT_REUSE_MATRIX) {
772     PetscValidHeaderSpecific(*C, MAT_CLASSID, 4);
773     PetscCheckTypeName(*C, MATSEQAIJKOKKOS);
774     ckok = static_cast<Mat_SeqAIJKokkos *>((*C)->spptr);
775     ca   = ckok->a_dual.view_device();
776     ci   = ckok->i_dual.view_device();
777 
778     Kokkos::parallel_for(
779       Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
780         PetscInt i    = t.league_rank(); /* row i */
781         PetscInt alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i);
782         Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) {
783           if (k < alen) ca(ci(i) + k) = aa(ai(i) + k);
784           else ca(ci(i) + k) = ba(bi(i) + k - alen);
785         });
786       });
787     PetscCall(MatSeqAIJKokkosModifyDevice(*C));
788   }
789   PetscFunctionReturn(PETSC_SUCCESS);
790 }
791 
792 static PetscErrorCode MatProductDataDestroy_SeqAIJKokkos(void *pdata)
793 {
794   PetscFunctionBegin;
795   delete static_cast<MatProductData_SeqAIJKokkos *>(pdata);
796   PetscFunctionReturn(PETSC_SUCCESS);
797 }
798 
799 static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C)
800 {
801   Mat_Product                 *product = C->product;
802   Mat                          A, B;
803   bool                         transA, transB; /* use bool, since KK needs this type */
804   Mat_SeqAIJKokkos            *akok, *bkok, *ckok;
805   Mat_SeqAIJ                  *c;
806   MatProductData_SeqAIJKokkos *pdata;
807   KokkosCsrMatrix              csrmatA, csrmatB;
808 
809   PetscFunctionBegin;
810   MatCheckProduct(C, 1);
811   PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Product data empty");
812   pdata = static_cast<MatProductData_SeqAIJKokkos *>(C->product->data);
813 
814   // See if numeric has already been done in symbolic (e.g., user calls MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C)).
815   // If yes, skip the numeric, but reset the flag so that next time when user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C),
816   // we still do numeric.
817   if (pdata->reusesym) { // numeric reuses results from symbolic
818     pdata->reusesym = PETSC_FALSE;
819     PetscFunctionReturn(PETSC_SUCCESS);
820   }
821 
822   switch (product->type) {
823   case MATPRODUCT_AB:
824     transA = false;
825     transB = false;
826     break;
827   case MATPRODUCT_AtB:
828     transA = true;
829     transB = false;
830     break;
831   case MATPRODUCT_ABt:
832     transA = false;
833     transB = true;
834     break;
835   default:
836     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]);
837   }
838 
839   A = product->A;
840   B = product->B;
841   PetscCall(MatSeqAIJKokkosSyncDevice(A));
842   PetscCall(MatSeqAIJKokkosSyncDevice(B));
843   akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
844   bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
845   ckok = static_cast<Mat_SeqAIJKokkos *>(C->spptr);
846 
847   PetscCheck(ckok, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Device data structure spptr is empty");
848 
849   csrmatA = akok->csrmat;
850   csrmatB = bkok->csrmat;
851 
852   /* TODO: Once KK spgemm implements transpose, we can get rid of the explicit transpose here */
853   if (transA) {
854     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA));
855     transA = false;
856   }
857 
858   if (transB) {
859     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB));
860     transB = false;
861   }
862   PetscCall(PetscLogGpuTimeBegin());
863   PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, csrmatA, transA, csrmatB, transB, ckok->csrmat));
864 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(4, 0, 0)
865   auto spgemmHandle = pdata->kh.get_spgemm_handle();
866   if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(ckok->csrmat)); /* without sort, mat_tests-ex62_14_seqaijkokkos fails */
867 #endif
868 
869   PetscCall(PetscLogGpuTimeEnd());
870   PetscCall(MatSeqAIJKokkosModifyDevice(C));
871   /* shorter version of MatAssemblyEnd_SeqAIJ */
872   c = (Mat_SeqAIJ *)C->data;
873   PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz));
874   PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n"));
875   PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax));
876   c->reallocs         = 0;
877   C->info.mallocs     = 0;
878   C->info.nz_unneeded = 0;
879   C->assembled = C->was_assembled = PETSC_TRUE;
880   C->num_ass++;
881   PetscFunctionReturn(PETSC_SUCCESS);
882 }
883 
884 static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C)
885 {
886   Mat_Product                 *product = C->product;
887   MatProductType               ptype;
888   Mat                          A, B;
889   bool                         transA, transB;
890   Mat_SeqAIJKokkos            *akok, *bkok, *ckok;
891   MatProductData_SeqAIJKokkos *pdata;
892   MPI_Comm                     comm;
893   KokkosCsrMatrix              csrmatA, csrmatB, csrmatC;
894 
895   PetscFunctionBegin;
896   MatCheckProduct(C, 1);
897   PetscCall(PetscObjectGetComm((PetscObject)C, &comm));
898   PetscCheck(!product->data, comm, PETSC_ERR_PLIB, "Product data not empty");
899   A = product->A;
900   B = product->B;
901   PetscCall(MatSeqAIJKokkosSyncDevice(A));
902   PetscCall(MatSeqAIJKokkosSyncDevice(B));
903   akok    = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
904   bkok    = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
905   csrmatA = akok->csrmat;
906   csrmatB = bkok->csrmat;
907 
908   ptype = product->type;
909   // Take advantage of the symmetry if true
910   if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
911     ptype                                          = MATPRODUCT_AB;
912     product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE;
913   }
914   if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
915     ptype                                          = MATPRODUCT_AB;
916     product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE;
917   }
918 
919   switch (ptype) {
920   case MATPRODUCT_AB:
921     transA = false;
922     transB = false;
923     PetscCall(MatSetBlockSizesFromMats(C, A, B));
924     break;
925   case MATPRODUCT_AtB:
926     transA = true;
927     transB = false;
928     if (A->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, A->cmap->bs));
929     if (B->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->cmap->bs));
930     break;
931   case MATPRODUCT_ABt:
932     transA = false;
933     transB = true;
934     if (A->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, A->rmap->bs));
935     if (B->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->rmap->bs));
936     break;
937   default:
938     SETERRQ(comm, PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]);
939   }
940   PetscCallCXX(product->data = pdata = new MatProductData_SeqAIJKokkos());
941   pdata->reusesym = product->api_user;
942 
943   /* TODO: add command line options to select spgemm algorithms */
944   auto spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_DEFAULT; /* default alg is TPL if enabled, otherwise KK */
945 
946   /* CUDA-10.2's spgemm has bugs. We prefer the SpGEMMreuse APIs introduced in cuda-11.4 */
947 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
948   #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
949   spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_KK;
950   #endif
951 #endif
952   PetscCallCXX(pdata->kh.create_spgemm_handle(spgemm_alg));
953 
954   PetscCall(PetscLogGpuTimeBegin());
955   /* TODO: Get rid of the explicit transpose once KK-spgemm implements the transpose option */
956   if (transA) {
957     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA));
958     transA = false;
959   }
960 
961   if (transB) {
962     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB));
963     transB = false;
964   }
965 
966   PetscCallCXX(KokkosSparse::spgemm_symbolic(pdata->kh, csrmatA, transA, csrmatB, transB, csrmatC));
967   /* spgemm_symbolic() only populates C's rowmap, but not C's column indices.
968     So we have to do a fake spgemm_numeric() here to get csrmatC.j_d setup, before
969     calling new Mat_SeqAIJKokkos().
970     TODO: Remove the fake spgemm_numeric() after KK fixed this problem.
971   */
972   PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, csrmatA, transA, csrmatB, transB, csrmatC));
973 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(4, 0, 0)
974   /* Query if KK outputs a sorted matrix. If not, we need to sort it */
975   auto spgemmHandle = pdata->kh.get_spgemm_handle();
976   if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(csrmatC)); /* sort_option defaults to -1 in KK!*/
977 #endif
978   PetscCall(PetscLogGpuTimeEnd());
979 
980   PetscCallCXX(ckok = new Mat_SeqAIJKokkos(csrmatC));
981   PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(C, ckok));
982   C->product->destroy = MatProductDataDestroy_SeqAIJKokkos;
983   PetscFunctionReturn(PETSC_SUCCESS);
984 }
985 
986 /* handles sparse matrix matrix ops */
987 static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat)
988 {
989   Mat_Product *product = mat->product;
990   PetscBool    Biskok = PETSC_FALSE, Ciskok = PETSC_TRUE;
991 
992   PetscFunctionBegin;
993   MatCheckProduct(mat, 1);
994   PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJKOKKOS, &Biskok));
995   if (product->type == MATPRODUCT_ABC) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJKOKKOS, &Ciskok));
996   if (Biskok && Ciskok) {
997     switch (product->type) {
998     case MATPRODUCT_AB:
999     case MATPRODUCT_AtB:
1000     case MATPRODUCT_ABt:
1001       mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos;
1002       break;
1003     case MATPRODUCT_PtAP:
1004     case MATPRODUCT_RARt:
1005     case MATPRODUCT_ABC:
1006       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
1007       break;
1008     default:
1009       break;
1010     }
1011   } else { /* fallback for AIJ */
1012     PetscCall(MatProductSetFromOptions_SeqAIJ(mat));
1013   }
1014   PetscFunctionReturn(PETSC_SUCCESS);
1015 }
1016 
1017 static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a)
1018 {
1019   Mat_SeqAIJKokkos *aijkok;
1020 
1021   PetscFunctionBegin;
1022   PetscCall(PetscLogGpuTimeBegin());
1023   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1024   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1025   KokkosBlas::scal(PetscGetKokkosExecutionSpace(), aijkok->a_dual.view_device(), a, aijkok->a_dual.view_device());
1026   PetscCall(MatSeqAIJKokkosModifyDevice(A));
1027   PetscCall(PetscLogGpuFlops(aijkok->a_dual.extent(0)));
1028   PetscCall(PetscLogGpuTimeEnd());
1029   PetscFunctionReturn(PETSC_SUCCESS);
1030 }
1031 
1032 // add a to A's diagonal (if A is square) or main diagonal (if A is rectangular)
1033 static PetscErrorCode MatShift_SeqAIJKokkos(Mat A, PetscScalar a)
1034 {
1035   Mat_SeqAIJ *aijseq = static_cast<Mat_SeqAIJ *>(A->data);
1036 
1037   PetscFunctionBegin;
1038   if (A->assembled && aijseq->diagonaldense) { // no missing diagonals
1039     PetscInt n = PetscMin(A->rmap->n, A->cmap->n);
1040 
1041     PetscCall(PetscLogGpuTimeBegin());
1042     PetscCall(MatSeqAIJKokkosSyncDevice(A));
1043     const auto  aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1044     const auto &Aa     = aijkok->a_dual.view_device();
1045     const auto &Adiag  = aijkok->diag_dual.view_device();
1046     PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) { Aa(Adiag(i)) += a; }));
1047     PetscCall(MatSeqAIJKokkosModifyDevice(A));
1048     PetscCall(PetscLogGpuFlops(n));
1049     PetscCall(PetscLogGpuTimeEnd());
1050   } else { // need reassembly, very slow!
1051     PetscCall(MatShift_Basic(A, a));
1052   }
1053   PetscFunctionReturn(PETSC_SUCCESS);
1054 }
1055 
1056 static PetscErrorCode MatDiagonalSet_SeqAIJKokkos(Mat Y, Vec D, InsertMode is)
1057 {
1058   Mat_SeqAIJ *aijseq = static_cast<Mat_SeqAIJ *>(Y->data);
1059 
1060   PetscFunctionBegin;
1061   if (Y->assembled && aijseq->diagonaldense) { // no missing diagonals
1062     ConstPetscScalarKokkosView dv;
1063     PetscInt                   n, nv;
1064 
1065     PetscCall(PetscLogGpuTimeBegin());
1066     PetscCall(MatSeqAIJKokkosSyncDevice(Y));
1067     PetscCall(VecGetKokkosView(D, &dv));
1068     PetscCall(VecGetLocalSize(D, &nv));
1069     n = PetscMin(Y->rmap->n, Y->cmap->n);
1070     PetscCheck(n == nv, PetscObjectComm((PetscObject)Y), PETSC_ERR_ARG_SIZ, "Matrix size and vector size do not match");
1071 
1072     const auto  aijkok = static_cast<Mat_SeqAIJKokkos *>(Y->spptr);
1073     const auto &Aa     = aijkok->a_dual.view_device();
1074     const auto &Adiag  = aijkok->diag_dual.view_device();
1075     PetscCallCXX(Kokkos::parallel_for(
1076       Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) {
1077         if (is == INSERT_VALUES) Aa(Adiag(i)) = dv(i);
1078         else Aa(Adiag(i)) += dv(i);
1079       }));
1080     PetscCall(VecRestoreKokkosView(D, &dv));
1081     PetscCall(MatSeqAIJKokkosModifyDevice(Y));
1082     PetscCall(PetscLogGpuFlops(n));
1083     PetscCall(PetscLogGpuTimeEnd());
1084   } else { // need reassembly, very slow!
1085     PetscCall(MatDiagonalSet_Default(Y, D, is));
1086   }
1087   PetscFunctionReturn(PETSC_SUCCESS);
1088 }
1089 
1090 static PetscErrorCode MatDiagonalScale_SeqAIJKokkos(Mat A, Vec ll, Vec rr)
1091 {
1092   Mat_SeqAIJ                *aijseq = static_cast<Mat_SeqAIJ *>(A->data);
1093   PetscInt                   m = A->rmap->n, n = A->cmap->n, nz = aijseq->nz;
1094   ConstPetscScalarKokkosView lv, rv;
1095 
1096   PetscFunctionBegin;
1097   PetscCall(PetscLogGpuTimeBegin());
1098   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1099   const auto  aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1100   const auto &Aa     = aijkok->a_dual.view_device();
1101   const auto &Ai     = aijkok->i_dual.view_device();
1102   const auto &Aj     = aijkok->j_dual.view_device();
1103   if (ll) {
1104     PetscCall(VecGetLocalSize(ll, &m));
1105     PetscCheck(m == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling vector wrong length");
1106     PetscCall(VecGetKokkosView(ll, &lv));
1107     PetscCallCXX(Kokkos::parallel_for( // for each row
1108       Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
1109         PetscInt i   = t.league_rank(); // row i
1110         PetscInt len = Ai(i + 1) - Ai(i);
1111         // scale entries on the row
1112         Kokkos::parallel_for(Kokkos::TeamThreadRange(t, len), [&](PetscInt j) { Aa(Ai(i) + j) *= lv(i); });
1113       }));
1114     PetscCall(VecRestoreKokkosView(ll, &lv));
1115     PetscCall(PetscLogGpuFlops(nz));
1116   }
1117   if (rr) {
1118     PetscCall(VecGetLocalSize(rr, &n));
1119     PetscCheck(n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Right scaling vector wrong length");
1120     PetscCall(VecGetKokkosView(rr, &rv));
1121     PetscCallCXX(Kokkos::parallel_for( // for each nonzero
1122       Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt k) { Aa(k) *= rv(Aj(k)); }));
1123     PetscCall(VecRestoreKokkosView(rr, &lv));
1124     PetscCall(PetscLogGpuFlops(nz));
1125   }
1126   PetscCall(MatSeqAIJKokkosModifyDevice(A));
1127   PetscCall(PetscLogGpuTimeEnd());
1128   PetscFunctionReturn(PETSC_SUCCESS);
1129 }
1130 
1131 static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A)
1132 {
1133   Mat_SeqAIJKokkos *aijkok;
1134 
1135   PetscFunctionBegin;
1136   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1137   if (aijkok) { /* Only zero the device if data is already there */
1138     KokkosBlas::fill(PetscGetKokkosExecutionSpace(), aijkok->a_dual.view_device(), 0.0);
1139     PetscCall(MatSeqAIJKokkosModifyDevice(A));
1140   } else { /* Might be preallocated but not assembled */
1141     PetscCall(MatZeroEntries_SeqAIJ(A));
1142   }
1143   PetscFunctionReturn(PETSC_SUCCESS);
1144 }
1145 
1146 static PetscErrorCode MatGetDiagonal_SeqAIJKokkos(Mat A, Vec x)
1147 {
1148   Mat_SeqAIJKokkos     *aijkok;
1149   PetscInt              n;
1150   PetscScalarKokkosView xv;
1151 
1152   PetscFunctionBegin;
1153   PetscCall(VecGetLocalSize(x, &n));
1154   PetscCheck(n == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector");
1155   PetscCheck(A->factortype == MAT_FACTOR_NONE, PETSC_COMM_SELF, PETSC_ERR_SUP, "MatGetDiagonal_SeqAIJKokkos not supported on factored matrices");
1156 
1157   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1158   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1159 
1160   const auto &Aa    = aijkok->a_dual.view_device();
1161   const auto &Ai    = aijkok->i_dual.view_device();
1162   const auto &Adiag = aijkok->diag_dual.view_device();
1163 
1164   PetscCall(VecGetKokkosViewWrite(x, &xv));
1165   Kokkos::parallel_for(
1166     Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) {
1167       if (Adiag(i) < Ai(i + 1)) xv(i) = Aa(Adiag(i));
1168       else xv(i) = 0;
1169     });
1170   PetscCall(VecRestoreKokkosViewWrite(x, &xv));
1171   PetscFunctionReturn(PETSC_SUCCESS);
1172 }
1173 
1174 /* Get a Kokkos View from a mat of type MatSeqAIJKokkos */
1175 PetscErrorCode MatSeqAIJGetKokkosView(Mat A, ConstMatScalarKokkosView *kv)
1176 {
1177   Mat_SeqAIJKokkos *aijkok;
1178 
1179   PetscFunctionBegin;
1180   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1181   PetscAssertPointer(kv, 2);
1182   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1183   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1184   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1185   *kv    = aijkok->a_dual.view_device();
1186   PetscFunctionReturn(PETSC_SUCCESS);
1187 }
1188 
1189 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, ConstMatScalarKokkosView *kv)
1190 {
1191   PetscFunctionBegin;
1192   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1193   PetscAssertPointer(kv, 2);
1194   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1195   PetscFunctionReturn(PETSC_SUCCESS);
1196 }
1197 
1198 PetscErrorCode MatSeqAIJGetKokkosView(Mat A, MatScalarKokkosView *kv)
1199 {
1200   Mat_SeqAIJKokkos *aijkok;
1201 
1202   PetscFunctionBegin;
1203   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1204   PetscAssertPointer(kv, 2);
1205   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1206   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1207   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1208   *kv    = aijkok->a_dual.view_device();
1209   PetscFunctionReturn(PETSC_SUCCESS);
1210 }
1211 
1212 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, MatScalarKokkosView *kv)
1213 {
1214   PetscFunctionBegin;
1215   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1216   PetscAssertPointer(kv, 2);
1217   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1218   PetscCall(MatSeqAIJKokkosModifyDevice(A));
1219   PetscFunctionReturn(PETSC_SUCCESS);
1220 }
1221 
1222 PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A, MatScalarKokkosView *kv)
1223 {
1224   Mat_SeqAIJKokkos *aijkok;
1225 
1226   PetscFunctionBegin;
1227   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1228   PetscAssertPointer(kv, 2);
1229   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1230   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1231   *kv    = aijkok->a_dual.view_device();
1232   PetscFunctionReturn(PETSC_SUCCESS);
1233 }
1234 
1235 PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A, MatScalarKokkosView *kv)
1236 {
1237   PetscFunctionBegin;
1238   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1239   PetscAssertPointer(kv, 2);
1240   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1241   PetscCall(MatSeqAIJKokkosModifyDevice(A));
1242   PetscFunctionReturn(PETSC_SUCCESS);
1243 }
1244 
1245 PetscErrorCode MatCreateSeqAIJKokkosWithKokkosViews(MPI_Comm comm, PetscInt m, PetscInt n, Kokkos::View<PetscInt *> &i_d, Kokkos::View<PetscInt *> &j_d, Kokkos::View<PetscScalar *> &a_d, Mat *A)
1246 {
1247   Mat_SeqAIJKokkos *akok;
1248 
1249   PetscFunctionBegin;
1250   // Create host copies of the input aij
1251   auto i_h = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), i_d);
1252   auto j_h = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), j_d);
1253   // Don't copy the vals to the host now
1254   auto a_h = Kokkos::create_mirror_view(HostMirrorMemorySpace(), a_d);
1255 
1256   MatScalarKokkosDualView a_dual = MatScalarKokkosDualView(a_d, a_h);
1257   // Note we have modified device data so it will copy lazily
1258   a_dual.modify_device();
1259   MatRowMapKokkosDualView i_dual = MatRowMapKokkosDualView(i_d, i_h);
1260   MatColIdxKokkosDualView j_dual = MatColIdxKokkosDualView(j_d, j_h);
1261 
1262   PetscCallCXX(akok = new Mat_SeqAIJKokkos(m, n, j_dual.extent(0), i_dual, j_dual, a_dual));
1263   PetscCall(MatCreate(comm, A));
1264   PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok));
1265   PetscFunctionReturn(PETSC_SUCCESS);
1266 }
1267 
1268 /* Computes Y += alpha X */
1269 static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y, PetscScalar alpha, Mat X, MatStructure pattern)
1270 {
1271   Mat_SeqAIJ              *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data;
1272   Mat_SeqAIJKokkos        *xkok, *ykok, *zkok;
1273   ConstMatScalarKokkosView Xa;
1274   MatScalarKokkosView      Ya;
1275   auto                     exec = PetscGetKokkosExecutionSpace();
1276 
1277   PetscFunctionBegin;
1278   PetscCheckTypeName(Y, MATSEQAIJKOKKOS);
1279   PetscCheckTypeName(X, MATSEQAIJKOKKOS);
1280   PetscCall(MatSeqAIJKokkosSyncDevice(Y));
1281   PetscCall(MatSeqAIJKokkosSyncDevice(X));
1282   PetscCall(PetscLogGpuTimeBegin());
1283 
1284   if (pattern != SAME_NONZERO_PATTERN && x->nz == y->nz) {
1285     PetscBool e;
1286     PetscCall(PetscArraycmp(x->i, y->i, Y->rmap->n + 1, &e));
1287     if (e) {
1288       PetscCall(PetscArraycmp(x->j, y->j, y->nz, &e));
1289       if (e) pattern = SAME_NONZERO_PATTERN;
1290     }
1291   }
1292 
1293   /* cusparseDcsrgeam2() computes C = alpha A + beta B. If one knew sparsity pattern of C, one can skip
1294     cusparseScsrgeam2_bufferSizeExt() / cusparseXcsrgeam2Nnz(), and directly call cusparseScsrgeam2().
1295     If X is SUBSET_NONZERO_PATTERN of Y, we could take advantage of this cusparse feature. However,
1296     KokkosSparse::spadd(alpha,A,beta,B,C) has symbolic and numeric phases, MatAXPY does not.
1297   */
1298   ykok = static_cast<Mat_SeqAIJKokkos *>(Y->spptr);
1299   xkok = static_cast<Mat_SeqAIJKokkos *>(X->spptr);
1300   Xa   = xkok->a_dual.view_device();
1301   Ya   = ykok->a_dual.view_device();
1302 
1303   if (pattern == SAME_NONZERO_PATTERN) {
1304     KokkosBlas::axpy(exec, alpha, Xa, Ya);
1305     PetscCall(MatSeqAIJKokkosModifyDevice(Y));
1306   } else if (pattern == SUBSET_NONZERO_PATTERN) {
1307     MatRowMapKokkosView Xi = xkok->i_dual.view_device(), Yi = ykok->i_dual.view_device();
1308     MatColIdxKokkosView Xj = xkok->j_dual.view_device(), Yj = ykok->j_dual.view_device();
1309 
1310     Kokkos::parallel_for(
1311       Kokkos::TeamPolicy<>(exec, Y->rmap->n, 1), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
1312         PetscInt i = t.league_rank(); // row i
1313         Kokkos::single(Kokkos::PerTeam(t), [=]() {
1314           // Only one thread works in a team
1315           PetscInt p, q = Yi(i);
1316           for (p = Xi(i); p < Xi(i + 1); p++) {          // For each nonzero on row i of X,
1317             while (Xj(p) != Yj(q) && q < Yi(i + 1)) q++; // find the matching nonzero on row i of Y.
1318             if (Xj(p) == Yj(q)) {                        // Found it
1319               Ya(q) += alpha * Xa(p);
1320               q++;
1321             } else {
1322             // If not found, it indicates the input is wrong (X is not a SUBSET_NONZERO_PATTERN of Y).
1323             // Just insert a NaN at the beginning of row i if it is not empty, to make the result wrong.
1324 #if PETSC_PKG_KOKKOS_VERSION_GE(3, 7, 0)
1325               if (Yi(i) != Yi(i + 1)) Ya(Yi(i)) = Kokkos::ArithTraits<PetscScalar>::nan();
1326 #else
1327               if (Yi(i) != Yi(i + 1)) Ya(Yi(i)) = Kokkos::Experimental::nan("1");
1328 #endif
1329             }
1330           }
1331         });
1332       });
1333     PetscCall(MatSeqAIJKokkosModifyDevice(Y));
1334   } else { // different nonzero patterns
1335     Mat             Z;
1336     KokkosCsrMatrix zcsr;
1337     KernelHandle    kh;
1338     kh.create_spadd_handle(true); // X, Y are sorted
1339     KokkosSparse::spadd_symbolic(&kh, xkok->csrmat, ykok->csrmat, zcsr);
1340     KokkosSparse::spadd_numeric(&kh, alpha, xkok->csrmat, (PetscScalar)1.0, ykok->csrmat, zcsr);
1341     zkok = new Mat_SeqAIJKokkos(zcsr);
1342     PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, zkok, &Z));
1343     PetscCall(MatHeaderReplace(Y, &Z));
1344     kh.destroy_spadd_handle();
1345   }
1346   PetscCall(PetscLogGpuTimeEnd());
1347   PetscCall(PetscLogGpuFlops(xkok->a_dual.extent(0) * 2)); // Because we scaled X and then added it to Y
1348   PetscFunctionReturn(PETSC_SUCCESS);
1349 }
1350 
1351 struct MatCOOStruct_SeqAIJKokkos {
1352   PetscCount           n;
1353   PetscCount           Atot;
1354   PetscInt             nz;
1355   PetscCountKokkosView jmap;
1356   PetscCountKokkosView perm;
1357 
1358   MatCOOStruct_SeqAIJKokkos(const MatCOOStruct_SeqAIJ *coo_h)
1359   {
1360     nz   = coo_h->nz;
1361     n    = coo_h->n;
1362     Atot = coo_h->Atot;
1363     jmap = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), PetscCountKokkosViewHost(coo_h->jmap, nz + 1));
1364     perm = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), PetscCountKokkosViewHost(coo_h->perm, Atot));
1365   }
1366 };
1367 
1368 static PetscErrorCode MatCOOStructDestroy_SeqAIJKokkos(void **data)
1369 {
1370   PetscFunctionBegin;
1371   PetscCallCXX(delete static_cast<MatCOOStruct_SeqAIJKokkos *>(*data));
1372   PetscFunctionReturn(PETSC_SUCCESS);
1373 }
1374 
1375 static PetscErrorCode MatSetPreallocationCOO_SeqAIJKokkos(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
1376 {
1377   Mat_SeqAIJKokkos          *akok;
1378   Mat_SeqAIJ                *aseq;
1379   PetscContainer             container_h;
1380   MatCOOStruct_SeqAIJ       *coo_h;
1381   MatCOOStruct_SeqAIJKokkos *coo_d;
1382 
1383   PetscFunctionBegin;
1384   PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, coo_i, coo_j));
1385   aseq = static_cast<Mat_SeqAIJ *>(mat->data);
1386   akok = static_cast<Mat_SeqAIJKokkos *>(mat->spptr);
1387   delete akok;
1388   mat->spptr = akok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, aseq, mat->nonzerostate + 1, PETSC_FALSE);
1389   PetscCall(MatZeroEntries_SeqAIJKokkos(mat));
1390 
1391   // Copy the COO struct to device
1392   PetscCall(PetscObjectQuery((PetscObject)mat, "__PETSc_MatCOOStruct_Host", (PetscObject *)&container_h));
1393   PetscCall(PetscContainerGetPointer(container_h, (void **)&coo_h));
1394   PetscCallCXX(coo_d = new MatCOOStruct_SeqAIJKokkos(coo_h));
1395 
1396   // Put the COO struct in a container and then attach that to the matrix
1397   PetscCall(PetscObjectContainerCompose((PetscObject)mat, "__PETSc_MatCOOStruct_Device", coo_d, MatCOOStructDestroy_SeqAIJKokkos));
1398   PetscFunctionReturn(PETSC_SUCCESS);
1399 }
1400 
1401 static PetscErrorCode MatSetValuesCOO_SeqAIJKokkos(Mat A, const PetscScalar v[], InsertMode imode)
1402 {
1403   MatScalarKokkosView        Aa;
1404   ConstMatScalarKokkosView   kv;
1405   PetscMemType               memtype;
1406   PetscContainer             container;
1407   MatCOOStruct_SeqAIJKokkos *coo;
1408 
1409   PetscFunctionBegin;
1410   PetscCall(PetscObjectQuery((PetscObject)A, "__PETSc_MatCOOStruct_Device", (PetscObject *)&container));
1411   PetscCall(PetscContainerGetPointer(container, (void **)&coo));
1412 
1413   const auto &n    = coo->n;
1414   const auto &Annz = coo->nz;
1415   const auto &jmap = coo->jmap;
1416   const auto &perm = coo->perm;
1417 
1418   PetscCall(PetscGetMemType(v, &memtype));
1419   if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */
1420     kv = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), ConstMatScalarKokkosViewHost(v, n));
1421   } else {
1422     kv = ConstMatScalarKokkosView(v, n); /* Directly use v[]'s memory */
1423   }
1424 
1425   if (imode == INSERT_VALUES) PetscCall(MatSeqAIJGetKokkosViewWrite(A, &Aa)); /* write matrix values */
1426   else PetscCall(MatSeqAIJGetKokkosView(A, &Aa));                             /* read & write matrix values */
1427 
1428   PetscCall(PetscLogGpuTimeBegin());
1429   Kokkos::parallel_for(
1430     Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, Annz), KOKKOS_LAMBDA(const PetscCount i) {
1431       PetscScalar sum = 0.0;
1432       for (PetscCount k = jmap(i); k < jmap(i + 1); k++) sum += kv(perm(k));
1433       Aa(i) = (imode == INSERT_VALUES ? 0.0 : Aa(i)) + sum;
1434     });
1435   PetscCall(PetscLogGpuTimeEnd());
1436 
1437   if (imode == INSERT_VALUES) PetscCall(MatSeqAIJRestoreKokkosViewWrite(A, &Aa));
1438   else PetscCall(MatSeqAIJRestoreKokkosView(A, &Aa));
1439   PetscFunctionReturn(PETSC_SUCCESS);
1440 }
1441 
1442 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A)
1443 {
1444   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
1445 
1446   PetscFunctionBegin;
1447   A->offloadmask = PETSC_OFFLOAD_KOKKOS; /* We do not really use this flag */
1448   A->boundtocpu  = PETSC_FALSE;
1449 
1450   A->ops->assemblyend               = MatAssemblyEnd_SeqAIJKokkos;
1451   A->ops->destroy                   = MatDestroy_SeqAIJKokkos;
1452   A->ops->duplicate                 = MatDuplicate_SeqAIJKokkos;
1453   A->ops->axpy                      = MatAXPY_SeqAIJKokkos;
1454   A->ops->scale                     = MatScale_SeqAIJKokkos;
1455   A->ops->zeroentries               = MatZeroEntries_SeqAIJKokkos;
1456   A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJKokkos;
1457   A->ops->mult                      = MatMult_SeqAIJKokkos;
1458   A->ops->multadd                   = MatMultAdd_SeqAIJKokkos;
1459   A->ops->multtranspose             = MatMultTranspose_SeqAIJKokkos;
1460   A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJKokkos;
1461   A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJKokkos;
1462   A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos;
1463   A->ops->productnumeric            = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos;
1464   A->ops->transpose                 = MatTranspose_SeqAIJKokkos;
1465   A->ops->setoption                 = MatSetOption_SeqAIJKokkos;
1466   A->ops->getdiagonal               = MatGetDiagonal_SeqAIJKokkos;
1467   A->ops->shift                     = MatShift_SeqAIJKokkos;
1468   A->ops->diagonalset               = MatDiagonalSet_SeqAIJKokkos;
1469   A->ops->diagonalscale             = MatDiagonalScale_SeqAIJKokkos;
1470   a->ops->getarray                  = MatSeqAIJGetArray_SeqAIJKokkos;
1471   a->ops->restorearray              = MatSeqAIJRestoreArray_SeqAIJKokkos;
1472   a->ops->getarrayread              = MatSeqAIJGetArrayRead_SeqAIJKokkos;
1473   a->ops->restorearrayread          = MatSeqAIJRestoreArrayRead_SeqAIJKokkos;
1474   a->ops->getarraywrite             = MatSeqAIJGetArrayWrite_SeqAIJKokkos;
1475   a->ops->restorearraywrite         = MatSeqAIJRestoreArrayWrite_SeqAIJKokkos;
1476   a->ops->getcsrandmemtype          = MatSeqAIJGetCSRAndMemType_SeqAIJKokkos;
1477 
1478   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJKokkos));
1479   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJKokkos));
1480 #if defined(PETSC_HAVE_HYPRE)
1481   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijkokkos_hypre_C", MatConvert_AIJ_HYPRE));
1482 #endif
1483   PetscFunctionReturn(PETSC_SUCCESS);
1484 }
1485 
1486 /*
1487    Extract the (prescribled) diagonal blocks of the matrix and then invert them
1488 
1489   Input Parameters:
1490 +  A       - the MATSEQAIJKOKKOS matrix
1491 .  bs      - block sizes in 'csr' format, i.e., the i-th block has size bs(i+1) - bs(i)
1492 .  bs2     - square of block sizes in 'csr' format, i.e., the i-th block should be stored at offset bs2(i) in diagVal[]
1493 .  blkMap  - map row ids to block ids, i.e., row i belongs to the block blkMap(i)
1494 -  work    - a pre-allocated work buffer (as big as diagVal) for use by this routine
1495 
1496   Output Parameter:
1497 .  diagVal - the (pre-allocated) buffer to store the inverted blocks (each block is stored in column-major order)
1498 */
1499 PETSC_INTERN PetscErrorCode MatInvertVariableBlockDiagonal_SeqAIJKokkos(Mat A, const PetscIntKokkosView &bs, const PetscIntKokkosView &bs2, const PetscIntKokkosView &blkMap, PetscScalarKokkosView &work, PetscScalarKokkosView &diagVal)
1500 {
1501   Mat_SeqAIJKokkos *akok    = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1502   PetscInt          nblocks = bs.extent(0) - 1;
1503 
1504   PetscFunctionBegin;
1505   PetscCall(MatSeqAIJKokkosSyncDevice(A)); // Since we'll access A's value on device
1506 
1507   // Pull out the diagonal blocks of the matrix and then invert the blocks
1508   auto Aa    = akok->a_dual.view_device();
1509   auto Ai    = akok->i_dual.view_device();
1510   auto Aj    = akok->j_dual.view_device();
1511   auto Adiag = akok->diag_dual.view_device();
1512   // TODO: how to tune the team size?
1513 #if defined(KOKKOS_ENABLE_UNIFIED_MEMORY)
1514   auto ts = Kokkos::AUTO();
1515 #else
1516   auto ts = 16; // improved performance 30% over Kokkos::AUTO() with CUDA, but failed with "Kokkos::abort: Requested Team Size is too large!" on CPUs
1517 #endif
1518   PetscCallCXX(Kokkos::parallel_for(
1519     Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), nblocks, ts), KOKKOS_LAMBDA(const KokkosTeamMemberType &teamMember) {
1520       const PetscInt bid    = teamMember.league_rank();                                                   // block id
1521       const PetscInt rstart = bs(bid);                                                                    // this block starts from this row
1522       const PetscInt m      = bs(bid + 1) - bs(bid);                                                      // size of this block
1523       const auto    &B      = Kokkos::View<PetscScalar **, Kokkos::LayoutLeft>(&diagVal(bs2(bid)), m, m); // column-major order
1524       const auto    &W      = PetscScalarKokkosView(&work(bs2(bid)), m * m);
1525 
1526       Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, m), [=](const PetscInt &r) { // r-th row in B
1527         PetscInt i = rstart + r;                                                            // i-th row in A
1528 
1529         if (Ai(i) <= Adiag(i) && Adiag(i) < Ai(i + 1)) { // if the diagonal exists (common case)
1530           PetscInt first = Adiag(i) - r;                 // we start to check nonzeros from here along this row
1531 
1532           for (PetscInt c = 0; c < m; c++) {                   // walk n steps to see what column indices we will meet
1533             if (first + c < Ai(i) || first + c >= Ai(i + 1)) { // this entry (first+c) is out of range of this row, in other words, its value is zero
1534               B(r, c) = 0.0;
1535             } else if (Aj(first + c) == rstart + c) { // this entry is right on the (rstart+c) column
1536               B(r, c) = Aa(first + c);
1537             } else { // this entry does not show up in the CSR
1538               B(r, c) = 0.0;
1539             }
1540           }
1541         } else { // rare case that the diagonal does not exist
1542           const PetscInt begin = Ai(i);
1543           const PetscInt end   = Ai(i + 1);
1544           for (PetscInt c = 0; c < m; c++) B(r, c) = 0.0;
1545           for (PetscInt j = begin; j < end; j++) { // scan the whole row; could use binary search but this is a rare case so we did not.
1546             if (rstart <= Aj(j) && Aj(j) < rstart + m) B(r, Aj(j) - rstart) = Aa(j);
1547             else if (Aj(j) >= rstart + m) break;
1548           }
1549         }
1550       });
1551 
1552       // LU-decompose B (w/o pivoting) and then invert B
1553       KokkosBatched::TeamLU<KokkosTeamMemberType, KokkosBatched::Algo::LU::Unblocked>::invoke(teamMember, B, 0.0);
1554       KokkosBatched::TeamInverseLU<KokkosTeamMemberType, KokkosBatched::Algo::InverseLU::Unblocked>::invoke(teamMember, B, W);
1555     }));
1556   // PetscLogGpuFlops() is done in the caller PCSetUp_VPBJacobi_Kokkos as we don't want to compute the flops in kernels
1557   PetscFunctionReturn(PETSC_SUCCESS);
1558 }
1559 
1560 PETSC_INTERN PetscErrorCode MatSetSeqAIJKokkosWithCSRMatrix(Mat A, Mat_SeqAIJKokkos *akok)
1561 {
1562   Mat_SeqAIJ *aseq;
1563   PetscInt    i, m, n;
1564   auto        exec = PetscGetKokkosExecutionSpace();
1565 
1566   PetscFunctionBegin;
1567   PetscCheck(!A->spptr, PETSC_COMM_SELF, PETSC_ERR_PLIB, "A->spptr is supposed to be empty");
1568 
1569   m = akok->nrows();
1570   n = akok->ncols();
1571   PetscCall(MatSetSizes(A, m, n, m, n));
1572   PetscCall(MatSetType(A, MATSEQAIJKOKKOS));
1573 
1574   /* Set up data structures of A as a MATSEQAIJ */
1575   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(A, MAT_SKIP_ALLOCATION, NULL));
1576   aseq = (Mat_SeqAIJ *)A->data;
1577 
1578   PetscCall(KokkosDualViewSyncHost(akok->i_dual, exec)); /* We always need sync'ed i, j on host */
1579   PetscCall(KokkosDualViewSyncHost(akok->j_dual, exec));
1580 
1581   aseq->i       = akok->i_host_data();
1582   aseq->j       = akok->j_host_data();
1583   aseq->a       = akok->a_host_data();
1584   aseq->nonew   = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/
1585   aseq->free_a  = PETSC_FALSE;
1586   aseq->free_ij = PETSC_FALSE;
1587   aseq->nz      = akok->nnz();
1588   aseq->maxnz   = aseq->nz;
1589 
1590   PetscCall(PetscMalloc1(m, &aseq->imax));
1591   PetscCall(PetscMalloc1(m, &aseq->ilen));
1592   for (i = 0; i < m; i++) aseq->ilen[i] = aseq->imax[i] = aseq->i[i + 1] - aseq->i[i];
1593 
1594   /* It is critical to set the nonzerostate, as we use it to check if sparsity pattern (hence data) has changed on host in MatAssemblyEnd */
1595   akok->nonzerostate = A->nonzerostate;
1596   A->spptr           = akok; /* Set A->spptr before MatAssembly so that A->spptr won't be allocated again there */
1597   PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY));
1598   PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY));
1599   PetscFunctionReturn(PETSC_SUCCESS);
1600 }
1601 
1602 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGetKokkosCsrMatrix(Mat A, KokkosCsrMatrix *csr)
1603 {
1604   PetscFunctionBegin;
1605   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1606   *csr = static_cast<Mat_SeqAIJKokkos *>(A->spptr)->csrmat;
1607   PetscFunctionReturn(PETSC_SUCCESS);
1608 }
1609 
1610 PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithKokkosCsrMatrix(MPI_Comm comm, KokkosCsrMatrix csr, Mat *A)
1611 {
1612   Mat_SeqAIJKokkos *akok;
1613 
1614   PetscFunctionBegin;
1615   PetscCallCXX(akok = new Mat_SeqAIJKokkos(csr));
1616   PetscCall(MatCreate(comm, A));
1617   PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok));
1618   PetscFunctionReturn(PETSC_SUCCESS);
1619 }
1620 
1621 /* Crete a SEQAIJKOKKOS matrix with a Mat_SeqAIJKokkos data structure
1622 
1623    Note we have names like MatSeqAIJSetPreallocationCSR, so I use capitalized CSR
1624  */
1625 PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithCSRMatrix(MPI_Comm comm, Mat_SeqAIJKokkos *akok, Mat *A)
1626 {
1627   PetscFunctionBegin;
1628   PetscCall(MatCreate(comm, A));
1629   PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok));
1630   PetscFunctionReturn(PETSC_SUCCESS);
1631 }
1632 
1633 /*@C
1634   MatCreateSeqAIJKokkos - Creates a sparse matrix in `MATSEQAIJKOKKOS` (compressed row) format
1635   (the default parallel PETSc format). This matrix will ultimately be handled by
1636   Kokkos for calculations.
1637 
1638   Collective
1639 
1640   Input Parameters:
1641 + comm - MPI communicator, set to `PETSC_COMM_SELF`
1642 . m    - number of rows
1643 . n    - number of columns
1644 . nz   - number of nonzeros per row (same for all rows), ignored if `nnz` is provided
1645 - nnz  - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL`
1646 
1647   Output Parameter:
1648 . A - the matrix
1649 
1650   Level: intermediate
1651 
1652   Notes:
1653   It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
1654   MatXXXXSetPreallocation() paradgm instead of this routine directly.
1655   [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]
1656 
1657   The AIJ format, also called
1658   compressed row storage, is fully compatible with standard Fortran
1659   storage.  That is, the stored row and column indices can begin at
1660   either one (as in Fortran) or zero.
1661 
1662   Specify the preallocated storage with either `nz` or `nnz` (not both).
1663   Set `nz` = `PETSC_DEFAULT` and `nnz` = `NULL` for PETSc to control dynamic memory
1664   allocation.
1665 
1666 .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`
1667 @*/
1668 PetscErrorCode MatCreateSeqAIJKokkos(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A)
1669 {
1670   PetscFunctionBegin;
1671   PetscCall(PetscKokkosInitializeCheck());
1672   PetscCall(MatCreate(comm, A));
1673   PetscCall(MatSetSizes(*A, m, n, m, n));
1674   PetscCall(MatSetType(*A, MATSEQAIJKOKKOS));
1675   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz));
1676   PetscFunctionReturn(PETSC_SUCCESS);
1677 }
1678 
1679 // After matrix numeric factorization, there are still steps to do before triangular solve can be called.
1680 // For example, for transpose solve, we might need to compute the transpose matrices if the solver does not support it (such as KK, while cusparse does).
1681 // In cusparse, one has to call cusparseSpSV_analysis() with updated triangular matrix values before calling cusparseSpSV_solve().
1682 // Simiarily, in KK sptrsv_symbolic() has to be called before sptrsv_solve(). We put these steps in MatSeqAIJKokkos{Transpose}SolveCheck.
1683 static PetscErrorCode MatSeqAIJKokkosSolveCheck(Mat A)
1684 {
1685   Mat_SeqAIJKokkosTriFactors *factors   = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1686   const PetscBool             has_lower = factors->iL_d.extent(0) ? PETSC_TRUE : PETSC_FALSE; // false with Choleksy
1687   const PetscBool             has_upper = factors->iU_d.extent(0) ? PETSC_TRUE : PETSC_FALSE; // true with LU and Choleksy
1688 
1689   PetscFunctionBegin;
1690   if (!factors->sptrsv_symbolic_completed) { // If sptrsv_symbolic was not called yet
1691     if (has_upper) PetscCallCXX(sptrsv_symbolic(&factors->khU, factors->iU_d, factors->jU_d, factors->aU_d));
1692     if (has_lower) PetscCallCXX(sptrsv_symbolic(&factors->khL, factors->iL_d, factors->jL_d, factors->aL_d));
1693     factors->sptrsv_symbolic_completed = PETSC_TRUE;
1694   }
1695   PetscFunctionReturn(PETSC_SUCCESS);
1696 }
1697 
1698 static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A)
1699 {
1700   const PetscInt              n         = A->rmap->n;
1701   Mat_SeqAIJKokkosTriFactors *factors   = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1702   const PetscBool             has_lower = factors->iL_d.extent(0) ? PETSC_TRUE : PETSC_FALSE; // false with Choleksy
1703   const PetscBool             has_upper = factors->iU_d.extent(0) ? PETSC_TRUE : PETSC_FALSE; // true with LU or Choleksy
1704 
1705   PetscFunctionBegin;
1706   if (!factors->transpose_updated) {
1707     if (has_upper) {
1708       if (!factors->iUt_d.extent(0)) {                                 // Allocate Ut on device if not yet
1709         factors->iUt_d = MatRowMapKokkosView("factors->iUt_d", n + 1); // KK requires this view to be initialized to 0 to call transpose_matrix
1710         factors->jUt_d = MatColIdxKokkosView(NoInit("factors->jUt_d"), factors->jU_d.extent(0));
1711         factors->aUt_d = MatScalarKokkosView(NoInit("factors->aUt_d"), factors->aU_d.extent(0));
1712       }
1713 
1714       if (factors->iU_h.extent(0)) { // If U is on host (factorization was done on host), we also compute the transpose on host
1715         if (!factors->U) {
1716           Mat_SeqAIJ *seq;
1717 
1718           PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF, n, n, factors->iU_h.data(), factors->jU_h.data(), factors->aU_h.data(), &factors->U));
1719           PetscCall(MatTranspose(factors->U, MAT_INITIAL_MATRIX, &factors->Ut));
1720 
1721           seq            = static_cast<Mat_SeqAIJ *>(factors->Ut->data);
1722           factors->iUt_h = MatRowMapKokkosViewHost(seq->i, n + 1);
1723           factors->jUt_h = MatColIdxKokkosViewHost(seq->j, seq->nz);
1724           factors->aUt_h = MatScalarKokkosViewHost(seq->a, seq->nz);
1725         } else {
1726           PetscCall(MatTranspose(factors->U, MAT_REUSE_MATRIX, &factors->Ut)); // Matrix Ut' data is aliased with {i, j, a}Ut_h
1727         }
1728         // Copy Ut from host to device
1729         PetscCallCXX(Kokkos::deep_copy(factors->iUt_d, factors->iUt_h));
1730         PetscCallCXX(Kokkos::deep_copy(factors->jUt_d, factors->jUt_h));
1731         PetscCallCXX(Kokkos::deep_copy(factors->aUt_d, factors->aUt_h));
1732       } else { // If U was computed on device, we also compute the transpose there
1733         // TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices. We have to sort the indices, until KK provides finer control options.
1734         PetscCallCXX(transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iU_d,
1735                                                                                                                                                                                                                                factors->jU_d, factors->aU_d,
1736                                                                                                                                                                                                                                factors->iUt_d, factors->jUt_d,
1737                                                                                                                                                                                                                                factors->aUt_d));
1738         PetscCallCXX(sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iUt_d, factors->jUt_d, factors->aUt_d));
1739       }
1740       PetscCallCXX(sptrsv_symbolic(&factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d));
1741     }
1742 
1743     // do the same for L with LU
1744     if (has_lower) {
1745       if (!factors->iLt_d.extent(0)) {                                 // Allocate Lt on device if not yet
1746         factors->iLt_d = MatRowMapKokkosView("factors->iLt_d", n + 1); // KK requires this view to be initialized to 0 to call transpose_matrix
1747         factors->jLt_d = MatColIdxKokkosView(NoInit("factors->jLt_d"), factors->jL_d.extent(0));
1748         factors->aLt_d = MatScalarKokkosView(NoInit("factors->aLt_d"), factors->aL_d.extent(0));
1749       }
1750 
1751       if (factors->iL_h.extent(0)) { // If L is on host, we also compute the transpose on host
1752         if (!factors->L) {
1753           Mat_SeqAIJ *seq;
1754 
1755           PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF, n, n, factors->iL_h.data(), factors->jL_h.data(), factors->aL_h.data(), &factors->L));
1756           PetscCall(MatTranspose(factors->L, MAT_INITIAL_MATRIX, &factors->Lt));
1757 
1758           seq            = static_cast<Mat_SeqAIJ *>(factors->Lt->data);
1759           factors->iLt_h = MatRowMapKokkosViewHost(seq->i, n + 1);
1760           factors->jLt_h = MatColIdxKokkosViewHost(seq->j, seq->nz);
1761           factors->aLt_h = MatScalarKokkosViewHost(seq->a, seq->nz);
1762         } else {
1763           PetscCall(MatTranspose(factors->L, MAT_REUSE_MATRIX, &factors->Lt)); // Matrix Lt' data is aliased with {i, j, a}Lt_h
1764         }
1765         // Copy Lt from host to device
1766         PetscCallCXX(Kokkos::deep_copy(factors->iLt_d, factors->iLt_h));
1767         PetscCallCXX(Kokkos::deep_copy(factors->jLt_d, factors->jLt_h));
1768         PetscCallCXX(Kokkos::deep_copy(factors->aLt_d, factors->aLt_h));
1769       } else { // If L was computed on device, we also compute the transpose there
1770         // TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices. We have to sort the indices, until KK provides finer control options.
1771         PetscCallCXX(transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iL_d,
1772                                                                                                                                                                                                                                factors->jL_d, factors->aL_d,
1773                                                                                                                                                                                                                                factors->iLt_d, factors->jLt_d,
1774                                                                                                                                                                                                                                factors->aLt_d));
1775         PetscCallCXX(sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iLt_d, factors->jLt_d, factors->aLt_d));
1776       }
1777       PetscCallCXX(sptrsv_symbolic(&factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d));
1778     }
1779 
1780     factors->transpose_updated = PETSC_TRUE;
1781   }
1782   PetscFunctionReturn(PETSC_SUCCESS);
1783 }
1784 
1785 // Solve Ax = b, with RAR = U^T D U, where R is the row (and col) permutation matrix on A.
1786 // R is represented by rowperm in factors. If R is identity (i.e, no reordering), then rowperm is empty.
1787 static PetscErrorCode MatSolve_SeqAIJKokkos_Cholesky(Mat A, Vec bb, Vec xx)
1788 {
1789   auto                        exec    = PetscGetKokkosExecutionSpace();
1790   Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1791   PetscInt                    m       = A->rmap->n;
1792   PetscScalarKokkosView       D       = factors->D_d;
1793   PetscScalarKokkosView       X, Y, B; // alias
1794   ConstPetscScalarKokkosView  b;
1795   PetscScalarKokkosView       x;
1796   PetscIntKokkosView         &rowperm  = factors->rowperm;
1797   PetscBool                   identity = rowperm.extent(0) ? PETSC_FALSE : PETSC_TRUE;
1798 
1799   PetscFunctionBegin;
1800   PetscCall(PetscLogGpuTimeBegin());
1801   PetscCall(MatSeqAIJKokkosSolveCheck(A));          // for UX = T
1802   PetscCall(MatSeqAIJKokkosTransposeSolveCheck(A)); // for U^T Y = B
1803   PetscCall(VecGetKokkosView(bb, &b));
1804   PetscCall(VecGetKokkosViewWrite(xx, &x));
1805 
1806   // Solve U^T Y = B
1807   if (identity) { // Reorder b with the row permutation
1808     B = PetscScalarKokkosView(const_cast<PetscScalar *>(b.data()), b.extent(0));
1809     Y = factors->workVector;
1810   } else {
1811     B = factors->workVector;
1812     PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { B(i) = b(rowperm(i)); }));
1813     Y = x;
1814   }
1815   PetscCallCXX(sptrsv_solve(exec, &factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d, B, Y));
1816 
1817   // Solve diag(D) Y' = Y.
1818   // Actually just do Y' = Y*D since D is already inverted in MatCholeskyFactorNumeric_SeqAIJ(). It is basically a vector element-wise multiplication.
1819   PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { Y(i) = Y(i) * D(i); }));
1820 
1821   // Solve UX = Y
1822   if (identity) {
1823     X = x;
1824   } else {
1825     X = factors->workVector; // B is not needed anymore
1826   }
1827   PetscCallCXX(sptrsv_solve(exec, &factors->khU, factors->iU_d, factors->jU_d, factors->aU_d, Y, X));
1828 
1829   // Reorder X with the inverse column (row) permutation
1830   if (!identity) {
1831     PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { x(rowperm(i)) = X(i); }));
1832   }
1833 
1834   PetscCall(VecRestoreKokkosView(bb, &b));
1835   PetscCall(VecRestoreKokkosViewWrite(xx, &x));
1836   PetscCall(PetscLogGpuTimeEnd());
1837   PetscFunctionReturn(PETSC_SUCCESS);
1838 }
1839 
1840 // Solve Ax = b, with RAC = LU, where R and C are row and col permutation matrices on A respectively.
1841 // R and C are represented by rowperm and colperm in factors.
1842 // If R or C is identity (i.e, no reordering), then rowperm or colperm is empty.
1843 static PetscErrorCode MatSolve_SeqAIJKokkos_LU(Mat A, Vec bb, Vec xx)
1844 {
1845   auto                        exec    = PetscGetKokkosExecutionSpace();
1846   Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1847   PetscInt                    m       = A->rmap->n;
1848   PetscScalarKokkosView       X, Y, B; // alias
1849   ConstPetscScalarKokkosView  b;
1850   PetscScalarKokkosView       x;
1851   PetscIntKokkosView         &rowperm      = factors->rowperm;
1852   PetscIntKokkosView         &colperm      = factors->colperm;
1853   PetscBool                   row_identity = rowperm.extent(0) ? PETSC_FALSE : PETSC_TRUE;
1854   PetscBool                   col_identity = colperm.extent(0) ? PETSC_FALSE : PETSC_TRUE;
1855 
1856   PetscFunctionBegin;
1857   PetscCall(PetscLogGpuTimeBegin());
1858   PetscCall(MatSeqAIJKokkosSolveCheck(A));
1859   PetscCall(VecGetKokkosView(bb, &b));
1860   PetscCall(VecGetKokkosViewWrite(xx, &x));
1861 
1862   // Solve L Y = B (i.e., L (U C^- x) = R b).  R b indicates applying the row permutation on b.
1863   if (row_identity) {
1864     B = PetscScalarKokkosView(const_cast<PetscScalar *>(b.data()), b.extent(0));
1865     Y = factors->workVector;
1866   } else {
1867     B = factors->workVector;
1868     PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { B(i) = b(rowperm(i)); }));
1869     Y = x;
1870   }
1871   PetscCallCXX(sptrsv_solve(exec, &factors->khL, factors->iL_d, factors->jL_d, factors->aL_d, B, Y));
1872 
1873   // Solve U C^- x = Y
1874   if (col_identity) {
1875     X = x;
1876   } else {
1877     X = factors->workVector;
1878   }
1879   PetscCallCXX(sptrsv_solve(exec, &factors->khU, factors->iU_d, factors->jU_d, factors->aU_d, Y, X));
1880 
1881   // x = C X; Reorder X with the inverse col permutation
1882   if (!col_identity) {
1883     PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { x(colperm(i)) = X(i); }));
1884   }
1885 
1886   PetscCall(VecRestoreKokkosView(bb, &b));
1887   PetscCall(VecRestoreKokkosViewWrite(xx, &x));
1888   PetscCall(PetscLogGpuTimeEnd());
1889   PetscFunctionReturn(PETSC_SUCCESS);
1890 }
1891 
1892 // Solve A^T x = b, with RAC = LU, where R and C are row and col permutation matrices on A respectively.
1893 // R and C are represented by rowperm and colperm in factors.
1894 // If R or C is identity (i.e, no reordering), then rowperm or colperm is empty.
1895 // A = R^-1 L U C^-1, so A^T = C^-T U^T L^T R^-T. But since C^- = C^T, R^- = R^T, we have A^T = C U^T L^T R.
1896 static PetscErrorCode MatSolveTranspose_SeqAIJKokkos_LU(Mat A, Vec bb, Vec xx)
1897 {
1898   auto                        exec    = PetscGetKokkosExecutionSpace();
1899   Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1900   PetscInt                    m       = A->rmap->n;
1901   PetscScalarKokkosView       X, Y, B; // alias
1902   ConstPetscScalarKokkosView  b;
1903   PetscScalarKokkosView       x;
1904   PetscIntKokkosView         &rowperm      = factors->rowperm;
1905   PetscIntKokkosView         &colperm      = factors->colperm;
1906   PetscBool                   row_identity = rowperm.extent(0) ? PETSC_FALSE : PETSC_TRUE;
1907   PetscBool                   col_identity = colperm.extent(0) ? PETSC_FALSE : PETSC_TRUE;
1908 
1909   PetscFunctionBegin;
1910   PetscCall(PetscLogGpuTimeBegin());
1911   PetscCall(MatSeqAIJKokkosTransposeSolveCheck(A)); // Update L^T, U^T if needed, and do sptrsv symbolic for L^T, U^T
1912   PetscCall(VecGetKokkosView(bb, &b));
1913   PetscCall(VecGetKokkosViewWrite(xx, &x));
1914 
1915   // Solve U^T Y = B (i.e., U^T (L^T R x) = C^- b).  Note C^- b = C^T b, which means applying the column permutation on b.
1916   if (col_identity) { // Reorder b with the col permutation
1917     B = PetscScalarKokkosView(const_cast<PetscScalar *>(b.data()), b.extent(0));
1918     Y = factors->workVector;
1919   } else {
1920     B = factors->workVector;
1921     PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { B(i) = b(colperm(i)); }));
1922     Y = x;
1923   }
1924   PetscCallCXX(sptrsv_solve(exec, &factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d, B, Y));
1925 
1926   // Solve L^T X = Y
1927   if (row_identity) {
1928     X = x;
1929   } else {
1930     X = factors->workVector;
1931   }
1932   PetscCallCXX(sptrsv_solve(exec, &factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d, Y, X));
1933 
1934   // x = R^- X = R^T X; Reorder X with the inverse row permutation
1935   if (!row_identity) {
1936     PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { x(rowperm(i)) = X(i); }));
1937   }
1938 
1939   PetscCall(VecRestoreKokkosView(bb, &b));
1940   PetscCall(VecRestoreKokkosViewWrite(xx, &x));
1941   PetscCall(PetscLogGpuTimeEnd());
1942   PetscFunctionReturn(PETSC_SUCCESS);
1943 }
1944 
1945 static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info)
1946 {
1947   PetscFunctionBegin;
1948   PetscCall(MatSeqAIJKokkosSyncHost(A));
1949   PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info));
1950 
1951   if (!info->solveonhost) { // if solve on host, then we don't need to copy L, U to device
1952     Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr;
1953     Mat_SeqAIJ                 *b       = static_cast<Mat_SeqAIJ *>(B->data);
1954     const PetscInt             *Bi = b->i, *Bj = b->j, *Bdiag = b->diag;
1955     const MatScalar            *Ba = b->a;
1956     PetscInt                    m = B->rmap->n, n = B->cmap->n;
1957 
1958     if (factors->iL_h.extent(0) == 0) { // Allocate memory and copy the L, U structure for the first time
1959       // Allocate memory and copy the structure
1960       factors->iL_h = MatRowMapKokkosViewHost(NoInit("iL_h"), m + 1);
1961       factors->jL_h = MatColIdxKokkosViewHost(NoInit("jL_h"), (Bi[m] - Bi[0]) + m); // + the diagonal entries
1962       factors->aL_h = MatScalarKokkosViewHost(NoInit("aL_h"), (Bi[m] - Bi[0]) + m);
1963       factors->iU_h = MatRowMapKokkosViewHost(NoInit("iU_h"), m + 1);
1964       factors->jU_h = MatColIdxKokkosViewHost(NoInit("jU_h"), (Bdiag[0] - Bdiag[m]));
1965       factors->aU_h = MatScalarKokkosViewHost(NoInit("aU_h"), (Bdiag[0] - Bdiag[m]));
1966 
1967       PetscInt *Li = factors->iL_h.data();
1968       PetscInt *Lj = factors->jL_h.data();
1969       PetscInt *Ui = factors->iU_h.data();
1970       PetscInt *Uj = factors->jU_h.data();
1971 
1972       Li[0] = Ui[0] = 0;
1973       for (PetscInt i = 0; i < m; i++) {
1974         PetscInt llen = Bi[i + 1] - Bi[i];       // exclusive of the diagonal entry
1975         PetscInt ulen = Bdiag[i] - Bdiag[i + 1]; // inclusive of the diagonal entry
1976 
1977         PetscArraycpy(Lj + Li[i], Bj + Bi[i], llen); // entries of L on the left of the diagonal
1978         Lj[Li[i] + llen] = i;                        // diagonal entry of L
1979 
1980         Uj[Ui[i]] = i;                                                  // diagonal entry of U
1981         PetscArraycpy(Uj + Ui[i] + 1, Bj + Bdiag[i + 1] + 1, ulen - 1); // entries of U on  the right of the diagonal
1982 
1983         Li[i + 1] = Li[i] + llen + 1;
1984         Ui[i + 1] = Ui[i] + ulen;
1985       }
1986 
1987       factors->iL_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->iL_h);
1988       factors->jL_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->jL_h);
1989       factors->iU_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->iU_h);
1990       factors->jU_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->jU_h);
1991       factors->aL_d = Kokkos::create_mirror_view(DefaultMemorySpace(), factors->aL_h);
1992       factors->aU_d = Kokkos::create_mirror_view(DefaultMemorySpace(), factors->aU_h);
1993 
1994       // Copy row/col permutation to device
1995       IS        rowperm = ((Mat_SeqAIJ *)B->data)->row;
1996       PetscBool row_identity;
1997       PetscCall(ISIdentity(rowperm, &row_identity));
1998       if (!row_identity) {
1999         const PetscInt *ip;
2000 
2001         PetscCall(ISGetIndices(rowperm, &ip));
2002         factors->rowperm = PetscIntKokkosView(NoInit("rowperm"), m);
2003         PetscCallCXX(Kokkos::deep_copy(factors->rowperm, PetscIntKokkosViewHost(const_cast<PetscInt *>(ip), m)));
2004         PetscCall(ISRestoreIndices(rowperm, &ip));
2005         PetscCall(PetscLogCpuToGpu(m * sizeof(PetscInt)));
2006       }
2007 
2008       IS        colperm = ((Mat_SeqAIJ *)B->data)->col;
2009       PetscBool col_identity;
2010       PetscCall(ISIdentity(colperm, &col_identity));
2011       if (!col_identity) {
2012         const PetscInt *ip;
2013 
2014         PetscCall(ISGetIndices(colperm, &ip));
2015         factors->colperm = PetscIntKokkosView(NoInit("colperm"), n);
2016         PetscCallCXX(Kokkos::deep_copy(factors->colperm, PetscIntKokkosViewHost(const_cast<PetscInt *>(ip), n)));
2017         PetscCall(ISRestoreIndices(colperm, &ip));
2018         PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
2019       }
2020 
2021       /* Create sptrsv handles for L, U and their transpose */
2022 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
2023       auto sptrsv_alg = SPTRSVAlgorithm::SPTRSV_CUSPARSE;
2024 #else
2025       auto sptrsv_alg = SPTRSVAlgorithm::SEQLVLSCHD_TP1;
2026 #endif
2027       factors->khL.create_sptrsv_handle(sptrsv_alg, m, true /* L is lower tri */);
2028       factors->khU.create_sptrsv_handle(sptrsv_alg, m, false /* U is not lower tri */);
2029       factors->khLt.create_sptrsv_handle(sptrsv_alg, m, false /* L^T is not lower tri */);
2030       factors->khUt.create_sptrsv_handle(sptrsv_alg, m, true /* U^T is lower tri */);
2031     }
2032 
2033     // Copy the value
2034     for (PetscInt i = 0; i < m; i++) {
2035       PetscInt        llen = Bi[i + 1] - Bi[i];
2036       PetscInt        ulen = Bdiag[i] - Bdiag[i + 1];
2037       const PetscInt *Li   = factors->iL_h.data();
2038       const PetscInt *Ui   = factors->iU_h.data();
2039 
2040       PetscScalar *La = factors->aL_h.data();
2041       PetscScalar *Ua = factors->aU_h.data();
2042 
2043       PetscArraycpy(La + Li[i], Ba + Bi[i], llen); // entries of L
2044       La[Li[i] + llen] = 1.0;                      // diagonal entry
2045 
2046       Ua[Ui[i]] = 1.0 / Ba[Bdiag[i]];                                 // diagonal entry
2047       PetscArraycpy(Ua + Ui[i] + 1, Ba + Bdiag[i + 1] + 1, ulen - 1); // entries of U
2048     }
2049 
2050     PetscCallCXX(Kokkos::deep_copy(factors->aL_d, factors->aL_h));
2051     PetscCallCXX(Kokkos::deep_copy(factors->aU_d, factors->aU_h));
2052     // Once the factors' values have changed, we need to update their transpose and redo sptrsv symbolic
2053     factors->transpose_updated         = PETSC_FALSE;
2054     factors->sptrsv_symbolic_completed = PETSC_FALSE;
2055 
2056     B->ops->solve          = MatSolve_SeqAIJKokkos_LU;
2057     B->ops->solvetranspose = MatSolveTranspose_SeqAIJKokkos_LU;
2058   }
2059 
2060   B->ops->matsolve          = NULL;
2061   B->ops->matsolvetranspose = NULL;
2062   PetscFunctionReturn(PETSC_SUCCESS);
2063 }
2064 
2065 static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos_ILU0(Mat B, Mat A, const MatFactorInfo *info)
2066 {
2067   Mat_SeqAIJKokkos           *aijkok   = (Mat_SeqAIJKokkos *)A->spptr;
2068   Mat_SeqAIJKokkosTriFactors *factors  = (Mat_SeqAIJKokkosTriFactors *)B->spptr;
2069   PetscInt                    fill_lev = info->levels;
2070 
2071   PetscFunctionBegin;
2072   PetscCall(PetscLogGpuTimeBegin());
2073   PetscCheck(!info->factoronhost, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "MatFactorInfo.factoronhost should be false");
2074   PetscCall(MatSeqAIJKokkosSyncDevice(A));
2075 
2076   auto a_d = aijkok->a_dual.view_device();
2077   auto i_d = aijkok->i_dual.view_device();
2078   auto j_d = aijkok->j_dual.view_device();
2079 
2080   PetscCallCXX(spiluk_numeric(&factors->kh, fill_lev, i_d, j_d, a_d, factors->iL_d, factors->jL_d, factors->aL_d, factors->iU_d, factors->jU_d, factors->aU_d));
2081 
2082   B->assembled              = PETSC_TRUE;
2083   B->preallocated           = PETSC_TRUE;
2084   B->ops->solve             = MatSolve_SeqAIJKokkos_LU;
2085   B->ops->solvetranspose    = MatSolveTranspose_SeqAIJKokkos_LU;
2086   B->ops->matsolve          = NULL;
2087   B->ops->matsolvetranspose = NULL;
2088 
2089   /* Once the factors' value changed, we need to update their transpose and sptrsv handle */
2090   factors->transpose_updated         = PETSC_FALSE;
2091   factors->sptrsv_symbolic_completed = PETSC_FALSE;
2092   /* TODO: log flops, but how to know that? */
2093   PetscCall(PetscLogGpuTimeEnd());
2094   PetscFunctionReturn(PETSC_SUCCESS);
2095 }
2096 
2097 // Use KK's spiluk_symbolic() to do ILU0 symbolic factorization, with no row/col reordering
2098 static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos_ILU0(Mat B, Mat A, IS, IS, const MatFactorInfo *info)
2099 {
2100   Mat_SeqAIJKokkos           *aijkok;
2101   Mat_SeqAIJ                 *b;
2102   Mat_SeqAIJKokkosTriFactors *factors  = (Mat_SeqAIJKokkosTriFactors *)B->spptr;
2103   PetscInt                    fill_lev = info->levels;
2104   PetscInt                    nnzA     = ((Mat_SeqAIJ *)A->data)->nz, nnzL, nnzU;
2105   PetscInt                    n        = A->rmap->n;
2106 
2107   PetscFunctionBegin;
2108   PetscCheck(!info->factoronhost, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "MatFactorInfo's factoronhost should be false as we are doing it on device right now");
2109   PetscCall(MatSeqAIJKokkosSyncDevice(A));
2110 
2111   /* Create a spiluk handle and then do symbolic factorization */
2112   nnzL = nnzU = PetscRealIntMultTruncate(info->fill, nnzA);
2113   factors->kh.create_spiluk_handle(SPILUKAlgorithm::SEQLVLSCHD_TP1, n, nnzL, nnzU);
2114 
2115   auto spiluk_handle = factors->kh.get_spiluk_handle();
2116 
2117   Kokkos::realloc(factors->iL_d, n + 1); /* Free old arrays and realloc */
2118   Kokkos::realloc(factors->jL_d, spiluk_handle->get_nnzL());
2119   Kokkos::realloc(factors->iU_d, n + 1);
2120   Kokkos::realloc(factors->jU_d, spiluk_handle->get_nnzU());
2121 
2122   aijkok   = (Mat_SeqAIJKokkos *)A->spptr;
2123   auto i_d = aijkok->i_dual.view_device();
2124   auto j_d = aijkok->j_dual.view_device();
2125   PetscCallCXX(spiluk_symbolic(&factors->kh, fill_lev, i_d, j_d, factors->iL_d, factors->jL_d, factors->iU_d, factors->jU_d));
2126   /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */
2127 
2128   Kokkos::resize(factors->jL_d, spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */
2129   Kokkos::resize(factors->jU_d, spiluk_handle->get_nnzU());
2130   Kokkos::realloc(factors->aL_d, spiluk_handle->get_nnzL()); /* No need to retain old value */
2131   Kokkos::realloc(factors->aU_d, spiluk_handle->get_nnzU());
2132 
2133   /* TODO: add options to select sptrsv algorithms */
2134   /* Create sptrsv handles for L, U and their transpose */
2135 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
2136   auto sptrsv_alg = SPTRSVAlgorithm::SPTRSV_CUSPARSE;
2137 #else
2138   auto sptrsv_alg = SPTRSVAlgorithm::SEQLVLSCHD_TP1;
2139 #endif
2140 
2141   factors->khL.create_sptrsv_handle(sptrsv_alg, n, true /* L is lower tri */);
2142   factors->khU.create_sptrsv_handle(sptrsv_alg, n, false /* U is not lower tri */);
2143   factors->khLt.create_sptrsv_handle(sptrsv_alg, n, false /* L^T is not lower tri */);
2144   factors->khUt.create_sptrsv_handle(sptrsv_alg, n, true /* U^T is lower tri */);
2145 
2146   /* Fill fields of the factor matrix B */
2147   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(B, MAT_SKIP_ALLOCATION, NULL));
2148   b     = (Mat_SeqAIJ *)B->data;
2149   b->nz = b->maxnz          = spiluk_handle->get_nnzL() + spiluk_handle->get_nnzU();
2150   B->info.fill_ratio_given  = info->fill;
2151   B->info.fill_ratio_needed = nnzA > 0 ? ((PetscReal)b->nz) / ((PetscReal)nnzA) : 1.0;
2152 
2153   B->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJKokkos_ILU0;
2154   PetscFunctionReturn(PETSC_SUCCESS);
2155 }
2156 
2157 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
2158 {
2159   PetscFunctionBegin;
2160   PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
2161   PetscCheck(!B->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr");
2162   PetscCallCXX(B->spptr = new Mat_SeqAIJKokkosTriFactors(B->rmap->n));
2163   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos;
2164   PetscFunctionReturn(PETSC_SUCCESS);
2165 }
2166 
2167 static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
2168 {
2169   PetscBool row_identity = PETSC_FALSE, col_identity = PETSC_FALSE;
2170 
2171   PetscFunctionBegin;
2172   if (!info->factoronhost) {
2173     PetscCall(ISIdentity(isrow, &row_identity));
2174     PetscCall(ISIdentity(iscol, &col_identity));
2175   }
2176 
2177   PetscCheck(!B->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr");
2178   PetscCallCXX(B->spptr = new Mat_SeqAIJKokkosTriFactors(B->rmap->n));
2179 
2180   if (!info->factoronhost && !info->levels && row_identity && col_identity) { // if level 0 and no reordering
2181     PetscCall(MatILUFactorSymbolic_SeqAIJKokkos_ILU0(B, A, isrow, iscol, info));
2182   } else {
2183     PetscCall(MatILUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info)); // otherwise, use PETSc's ILU on host
2184     B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos;
2185   }
2186   PetscFunctionReturn(PETSC_SUCCESS);
2187 }
2188 
2189 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info)
2190 {
2191   PetscFunctionBegin;
2192   PetscCall(MatSeqAIJKokkosSyncHost(A));
2193   PetscCall(MatCholeskyFactorNumeric_SeqAIJ(B, A, info));
2194 
2195   if (!info->solveonhost) { // if solve on host, then we don't need to copy L, U to device
2196     Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr;
2197     Mat_SeqAIJ                 *b       = static_cast<Mat_SeqAIJ *>(B->data);
2198     const PetscInt             *Bi = b->i, *Bj = b->j, *Bdiag = b->diag;
2199     const MatScalar            *Ba = b->a;
2200     PetscInt                    m  = B->rmap->n;
2201 
2202     if (factors->iU_h.extent(0) == 0) { // First time of numeric factorization
2203       // Allocate memory and copy the structure
2204       factors->iU_h = PetscIntKokkosViewHost(const_cast<PetscInt *>(Bi), m + 1); // wrap Bi as iU_h
2205       factors->jU_h = MatColIdxKokkosViewHost(NoInit("jU_h"), Bi[m]);
2206       factors->aU_h = MatScalarKokkosViewHost(NoInit("aU_h"), Bi[m]);
2207       factors->D_h  = MatScalarKokkosViewHost(NoInit("D_h"), m);
2208       factors->aU_d = Kokkos::create_mirror_view(DefaultMemorySpace(), factors->aU_h);
2209       factors->D_d  = Kokkos::create_mirror_view(DefaultMemorySpace(), factors->D_h);
2210 
2211       // Build jU_h from the skewed Aj
2212       PetscInt *Uj = factors->jU_h.data();
2213       for (PetscInt i = 0; i < m; i++) {
2214         PetscInt ulen = Bi[i + 1] - Bi[i];
2215         Uj[Bi[i]]     = i;                                              // diagonal entry
2216         PetscCall(PetscArraycpy(Uj + Bi[i] + 1, Bj + Bi[i], ulen - 1)); // entries of U on the right of the diagonal
2217       }
2218 
2219       // Copy iU, jU to device
2220       PetscCallCXX(factors->iU_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->iU_h));
2221       PetscCallCXX(factors->jU_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->jU_h));
2222 
2223       // Copy row/col permutation to device
2224       IS        rowperm = ((Mat_SeqAIJ *)B->data)->row;
2225       PetscBool row_identity;
2226       PetscCall(ISIdentity(rowperm, &row_identity));
2227       if (!row_identity) {
2228         const PetscInt *ip;
2229 
2230         PetscCall(ISGetIndices(rowperm, &ip));
2231         PetscCallCXX(factors->rowperm = PetscIntKokkosView(NoInit("rowperm"), m));
2232         PetscCallCXX(Kokkos::deep_copy(factors->rowperm, PetscIntKokkosViewHost(const_cast<PetscInt *>(ip), m)));
2233         PetscCall(ISRestoreIndices(rowperm, &ip));
2234         PetscCall(PetscLogCpuToGpu(m * sizeof(PetscInt)));
2235       }
2236 
2237       // Create sptrsv handles for U and U^T
2238 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
2239       auto sptrsv_alg = SPTRSVAlgorithm::SPTRSV_CUSPARSE;
2240 #else
2241       auto sptrsv_alg = SPTRSVAlgorithm::SEQLVLSCHD_TP1;
2242 #endif
2243       factors->khU.create_sptrsv_handle(sptrsv_alg, m, false /* U is not lower tri */);
2244       factors->khUt.create_sptrsv_handle(sptrsv_alg, m, true /* U^T is lower tri */);
2245     }
2246     // These pointers were set MatCholeskyFactorNumeric_SeqAIJ(), so we always need to update them
2247     B->ops->solve          = MatSolve_SeqAIJKokkos_Cholesky;
2248     B->ops->solvetranspose = MatSolve_SeqAIJKokkos_Cholesky;
2249 
2250     // Copy the value
2251     PetscScalar *Ua = factors->aU_h.data();
2252     PetscScalar *D  = factors->D_h.data();
2253     for (PetscInt i = 0; i < m; i++) {
2254       D[i]      = Ba[Bdiag[i]];     // actually Aa[Adiag[i]] is the inverse of the diagonal
2255       Ua[Bi[i]] = (PetscScalar)1.0; // set the unit diagonal for U
2256       for (PetscInt k = 0; k < Bi[i + 1] - Bi[i] - 1; k++) Ua[Bi[i] + 1 + k] = -Ba[Bi[i] + k];
2257     }
2258     PetscCallCXX(Kokkos::deep_copy(factors->aU_d, factors->aU_h));
2259     PetscCallCXX(Kokkos::deep_copy(factors->D_d, factors->D_h));
2260 
2261     factors->sptrsv_symbolic_completed = PETSC_FALSE; // When numeric value changed, we must do these again
2262     factors->transpose_updated         = PETSC_FALSE;
2263   }
2264 
2265   B->ops->matsolve          = NULL;
2266   B->ops->matsolvetranspose = NULL;
2267   PetscFunctionReturn(PETSC_SUCCESS);
2268 }
2269 
2270 static PetscErrorCode MatICCFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS perm, const MatFactorInfo *info)
2271 {
2272   PetscFunctionBegin;
2273   if (info->solveonhost) {
2274     // If solve on host, we have to change the type, as eventually we need to call MatSolve_SeqSBAIJ_1_NaturalOrdering() etc.
2275     PetscCall(MatSetType(B, MATSEQSBAIJ));
2276     PetscCall(MatSeqSBAIJSetPreallocation(B, 1, MAT_SKIP_ALLOCATION, NULL));
2277   }
2278 
2279   PetscCall(MatICCFactorSymbolic_SeqAIJ(B, A, perm, info));
2280 
2281   if (!info->solveonhost) {
2282     // If solve on device, B is still a MATSEQAIJKOKKOS, so we are good to allocate B->spptr
2283     PetscCheck(!B->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr");
2284     PetscCallCXX(B->spptr = new Mat_SeqAIJKokkosTriFactors(B->rmap->n));
2285     B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJKokkos;
2286   }
2287   PetscFunctionReturn(PETSC_SUCCESS);
2288 }
2289 
2290 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS perm, const MatFactorInfo *info)
2291 {
2292   PetscFunctionBegin;
2293   if (info->solveonhost) {
2294     // If solve on host, we have to change the type, as eventually we need to call MatSolve_SeqSBAIJ_1_NaturalOrdering() etc.
2295     PetscCall(MatSetType(B, MATSEQSBAIJ));
2296     PetscCall(MatSeqSBAIJSetPreallocation(B, 1, MAT_SKIP_ALLOCATION, NULL));
2297   }
2298 
2299   PetscCall(MatCholeskyFactorSymbolic_SeqAIJ(B, A, perm, info)); // it sets B's two ISes ((Mat_SeqAIJ*)B->data)->{row, col} to perm
2300 
2301   if (!info->solveonhost) {
2302     // If solve on device, B is still a MATSEQAIJKOKKOS, so we are good to allocate B->spptr
2303     PetscCheck(!B->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr");
2304     PetscCallCXX(B->spptr = new Mat_SeqAIJKokkosTriFactors(B->rmap->n));
2305     B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJKokkos;
2306   }
2307   PetscFunctionReturn(PETSC_SUCCESS);
2308 }
2309 
2310 // The _Kokkos suffix means we will use Kokkos as a solver for the SeqAIJKokkos matrix
2311 static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos_Kokkos(Mat A, MatSolverType *type)
2312 {
2313   PetscFunctionBegin;
2314   *type = MATSOLVERKOKKOS;
2315   PetscFunctionReturn(PETSC_SUCCESS);
2316 }
2317 
2318 /*MC
2319   MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices
2320   on a single GPU of type, `MATSEQAIJKOKKOS`, `MATAIJKOKKOS`.
2321 
2322   Level: beginner
2323 
2324 .seealso: [](ch_matrices), `Mat`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJKokkos()`, `MATAIJKOKKOS`, `MatKokkosSetFormat()`, `MatKokkosStorageFormat`, `MatKokkosFormatOperation`
2325 M*/
2326 PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A, MatFactorType ftype, Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */
2327 {
2328   PetscInt n = A->rmap->n;
2329   MPI_Comm comm;
2330 
2331   PetscFunctionBegin;
2332   PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
2333   PetscCall(MatCreate(comm, B));
2334   PetscCall(MatSetSizes(*B, n, n, n, n));
2335   PetscCall(MatSetBlockSizesFromMats(*B, A, A));
2336   (*B)->factortype = ftype;
2337   PetscCall(MatSetType(*B, MATSEQAIJKOKKOS));
2338   PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL));
2339   PetscCheck(!(*B)->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr");
2340 
2341   if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
2342     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJKokkos;
2343     (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos;
2344     PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU]));
2345     PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILU]));
2346     PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILUDT]));
2347   } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
2348     (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJKokkos;
2349     (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJKokkos;
2350     PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]));
2351     PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ICC]));
2352   } else SETERRQ(comm, PETSC_ERR_SUP, "MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]);
2353 
2354   // The factorization can use the ordering provided in MatLUFactorSymbolic(), MatCholeskyFactorSymbolic() etc, though we do it on host
2355   (*B)->canuseordering = PETSC_TRUE;
2356   PetscCall(PetscObjectComposeFunction((PetscObject)*B, "MatFactorGetSolverType_C", MatFactorGetSolverType_SeqAIJKokkos_Kokkos));
2357   PetscFunctionReturn(PETSC_SUCCESS);
2358 }
2359 
2360 PETSC_INTERN PetscErrorCode MatSolverTypeRegister_Kokkos(void)
2361 {
2362   PetscFunctionBegin;
2363   PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_LU, MatGetFactor_SeqAIJKokkos_Kokkos));
2364   PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_CHOLESKY, MatGetFactor_SeqAIJKokkos_Kokkos));
2365   PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_ILU, MatGetFactor_SeqAIJKokkos_Kokkos));
2366   PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_ICC, MatGetFactor_SeqAIJKokkos_Kokkos));
2367   PetscFunctionReturn(PETSC_SUCCESS);
2368 }
2369 
2370 /* Utility to print out a KokkosCsrMatrix for debugging */
2371 PETSC_INTERN PetscErrorCode PrintCsrMatrix(const KokkosCsrMatrix &csrmat)
2372 {
2373   const auto        &iv = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), csrmat.graph.row_map);
2374   const auto        &jv = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), csrmat.graph.entries);
2375   const auto        &av = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), csrmat.values);
2376   const PetscInt    *i  = iv.data();
2377   const PetscInt    *j  = jv.data();
2378   const PetscScalar *a  = av.data();
2379   PetscInt           m = csrmat.numRows(), n = csrmat.numCols(), nnz = csrmat.nnz();
2380 
2381   PetscFunctionBegin;
2382   PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT " x %" PetscInt_FMT " SeqAIJKokkos, with %" PetscInt_FMT " nonzeros\n", m, n, nnz));
2383   for (PetscInt k = 0; k < m; k++) {
2384     PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT ": ", k));
2385     for (PetscInt p = i[k]; p < i[k + 1]; p++) PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT "(%.1f), ", j[p], (double)PetscRealPart(a[p])));
2386     PetscCall(PetscPrintf(PETSC_COMM_SELF, "\n"));
2387   }
2388   PetscFunctionReturn(PETSC_SUCCESS);
2389 }
2390