xref: /petsc/src/mat/impls/aij/seq/kokkos/aijkok.kokkos.cxx (revision d1c799ffc2c2dd0945dfd53da7d3f7c32cb9db4c)
1 #include <petsc_kokkos.hpp>
2 #include <petscvec_kokkos.hpp>
3 #include <petscmat_kokkos.hpp>
4 #include <petscpkg_version.h>
5 #include <petsc/private/petscimpl.h>
6 #include <petsc/private/sfimpl.h>
7 #include <petsc/private/kokkosimpl.hpp>
8 #include <petscsys.h>
9 
10 #include <Kokkos_Core.hpp>
11 #include <KokkosBlas.hpp>
12 #include <KokkosSparse_CrsMatrix.hpp>
13 
14 // To suppress compiler warnings:
15 // /path/include/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp:434:63:
16 // warning: 'cusparseStatus_t cusparseDbsrmm(cusparseHandle_t, cusparseDirection_t, cusparseOperation_t,
17 // cusparseOperation_t, int, int, int, int, const double*, cusparseMatDescr_t, const double*, const int*, const int*,
18 // int, const double*, int, const double*, double*, int)' is deprecated: please use cusparseSpMM instead [-Wdeprecated-declarations]
19 PETSC_PRAGMA_DIAGNOSTIC_IGNORED_BEGIN("-Wdeprecated-declarations")
20 #include <KokkosSparse_spmv.hpp>
21 PETSC_PRAGMA_DIAGNOSTIC_IGNORED_END()
22 
23 #include <KokkosSparse_spiluk.hpp>
24 #include <KokkosSparse_sptrsv.hpp>
25 #include <KokkosSparse_spgemm.hpp>
26 #include <KokkosSparse_spadd.hpp>
27 #include <KokkosBatched_LU_Decl.hpp>
28 #include <KokkosBatched_InverseLU_Decl.hpp>
29 
30 #include <../src/mat/impls/aij/seq/kokkos/aijkok.hpp>
31 
32 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_GE(3, 7, 0)
33   #include <KokkosSparse_Utils.hpp>
34 using KokkosSparse::sort_crs_matrix;
35 using KokkosSparse::Impl::transpose_matrix;
36 #else
37   #include <KokkosKernels_Sorting.hpp>
38 using KokkosKernels::sort_crs_matrix;
39 using KokkosKernels::Impl::transpose_matrix;
40 #endif
41 
42 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_GE(4, 6, 0)
43 using KokkosSparse::spiluk_symbolic;
44 using KokkosSparse::spiluk_numeric;
45 using KokkosSparse::sptrsv_symbolic;
46 using KokkosSparse::sptrsv_solve;
47 using KokkosSparse::Experimental::SPTRSVAlgorithm;
48 using KokkosSparse::Experimental::SPILUKAlgorithm;
49 #else
50 using KokkosSparse::Experimental::spiluk_symbolic;
51 using KokkosSparse::Experimental::spiluk_numeric;
52 using KokkosSparse::Experimental::sptrsv_symbolic;
53 using KokkosSparse::Experimental::sptrsv_solve;
54 using KokkosSparse::Experimental::SPTRSVAlgorithm;
55 using KokkosSparse::Experimental::SPILUKAlgorithm;
56 #endif
57 
58 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */
59 
60 /* MatAssemblyEnd_SeqAIJKokkos() happens when we finalized nonzeros of the matrix, either after
61    we assembled the matrix on host, or after we directly produced the matrix data on device (ex., through MatMatMult).
62    In the latter case, it is important to set a_dual's sync state correctly.
63  */
64 static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A, MatAssemblyType mode)
65 {
66   Mat_SeqAIJ       *aijseq;
67   Mat_SeqAIJKokkos *aijkok;
68 
69   PetscFunctionBegin;
70   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS);
71   PetscCall(MatAssemblyEnd_SeqAIJ(A, mode));
72 
73   aijseq = static_cast<Mat_SeqAIJ *>(A->data);
74   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
75 
76   /* If aijkok does not exist, we just copy i, j to device.
77      If aijkok already exists, but the device's nonzero pattern does not match with the host's, we assume the latest data is on host.
78      In both cases, we build a new aijkok structure.
79   */
80   if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { /* aijkok might not exist yet or nonzero pattern has changed */
81     if (aijkok && aijkok->host_aij_allocated_by_kokkos) {   /* Avoid accidently freeing much needed a,i,j on host when deleting aijkok */
82       PetscCall(PetscShmgetAllocateArray(aijkok->nrows() + 1, sizeof(PetscInt), (void **)&aijseq->i));
83       PetscCall(PetscShmgetAllocateArray(aijkok->nnz(), sizeof(PetscInt), (void **)&aijseq->j));
84       PetscCall(PetscShmgetAllocateArray(aijkok->nnz(), sizeof(PetscInt), (void **)&aijseq->a));
85       PetscCall(PetscArraycpy(aijseq->i, aijkok->i_host_data(), aijkok->nrows() + 1));
86       PetscCall(PetscArraycpy(aijseq->j, aijkok->j_host_data(), aijkok->nnz()));
87       PetscCall(PetscArraycpy(aijseq->a, aijkok->a_host_data(), aijkok->nnz()));
88       aijseq->free_a  = PETSC_TRUE;
89       aijseq->free_ij = PETSC_TRUE;
90       /* This arises from MatCreateSeqAIJKokkosWithKokkosCsrMatrix() used in MatMatMult, where
91          we have the CsrMatrix on device first and then copy to host, followed by
92          MatSetMPIAIJWithSplitSeqAIJ() with garray = NULL.
93          One could improve it by not using NULL garray.
94       */
95     }
96     delete aijkok;
97     aijkok   = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aijseq, A->nonzerostate, PETSC_FALSE /*don't copy mat values to device*/);
98     A->spptr = aijkok;
99   } else if (A->rmap->n && aijkok->diag_dual.extent(0) == 0) { // MatProduct might directly produce AIJ on device, but not the diag.
100     MatRowMapKokkosViewHost diag_h(aijseq->diag, A->rmap->n);
101     auto                    diag_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), diag_h);
102     aijkok->diag_dual              = MatRowMapKokkosDualView(diag_d, diag_h);
103   }
104   PetscFunctionReturn(PETSC_SUCCESS);
105 }
106 
107 /* Sync CSR data to device if not yet */
108 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A)
109 {
110   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
111 
112   PetscFunctionBegin;
113   PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Can't sync factorized matrix from host to device");
114   PetscCheck(aijkok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
115   if (aijkok->a_dual.need_sync_device()) {
116     aijkok->a_dual.sync_device();
117     aijkok->transpose_updated = PETSC_FALSE; /* values of the transpose is out-of-date */
118     aijkok->hermitian_updated = PETSC_FALSE;
119   }
120   PetscFunctionReturn(PETSC_SUCCESS);
121 }
122 
123 /* Mark the CSR data on device as modified */
124 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosModifyDevice(Mat A)
125 {
126   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
127 
128   PetscFunctionBegin;
129   PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Not supported for factorized matries");
130   aijkok->a_dual.clear_sync_state();
131   aijkok->a_dual.modify_device();
132   aijkok->transpose_updated = PETSC_FALSE;
133   aijkok->hermitian_updated = PETSC_FALSE;
134   PetscCall(MatSeqAIJInvalidateDiagonal(A));
135   PetscCall(PetscObjectStateIncrease((PetscObject)A));
136   PetscFunctionReturn(PETSC_SUCCESS);
137 }
138 
139 static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A)
140 {
141   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
142   auto              exec   = PetscGetKokkosExecutionSpace();
143 
144   PetscFunctionBegin;
145   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
146   /* We do not expect one needs factors on host  */
147   PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Can't sync factorized matrix from device to host");
148   PetscCheck(aijkok, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing AIJKOK");
149   PetscCall(KokkosDualViewSync<HostMirrorMemorySpace>(aijkok->a_dual, exec));
150   PetscFunctionReturn(PETSC_SUCCESS);
151 }
152 
153 static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A, PetscScalar *array[])
154 {
155   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
156 
157   PetscFunctionBegin;
158   /* aijkok contains valid pointers only if the host's nonzerostate matches with the device's.
159     Calling MatSeqAIJSetPreallocation() or MatSetValues() on host, where aijseq->{i,j,a} might be
160     reallocated, will lead to stale {i,j,a}_dual in aijkok. In both operations, the hosts's nonzerostate
161     must have been updated. The stale aijkok will be rebuilt during MatAssemblyEnd.
162   */
163   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
164     auto exec = PetscGetKokkosExecutionSpace();
165     PetscCallCXX(aijkok->a_dual.sync_host(exec));
166     PetscCallCXX(exec.fence());
167     *array = aijkok->a_dual.view_host().data();
168   } else { /* Happens when calling MatSetValues on a newly created matrix */
169     *array = static_cast<Mat_SeqAIJ *>(A->data)->a;
170   }
171   PetscFunctionReturn(PETSC_SUCCESS);
172 }
173 
174 static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJKokkos(Mat A, PetscScalar *array[])
175 {
176   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
177 
178   PetscFunctionBegin;
179   if (aijkok && A->nonzerostate == aijkok->nonzerostate) aijkok->a_dual.modify_host();
180   PetscFunctionReturn(PETSC_SUCCESS);
181 }
182 
183 static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[])
184 {
185   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
186 
187   PetscFunctionBegin;
188   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
189     auto exec = PetscGetKokkosExecutionSpace();
190     PetscCallCXX(aijkok->a_dual.sync_host(exec));
191     PetscCallCXX(exec.fence());
192     *array = aijkok->a_dual.view_host().data();
193   } else {
194     *array = static_cast<Mat_SeqAIJ *>(A->data)->a;
195   }
196   PetscFunctionReturn(PETSC_SUCCESS);
197 }
198 
199 static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[])
200 {
201   PetscFunctionBegin;
202   *array = NULL;
203   PetscFunctionReturn(PETSC_SUCCESS);
204 }
205 
206 static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[])
207 {
208   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
209 
210   PetscFunctionBegin;
211   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
212     *array = aijkok->a_dual.view_host().data();
213   } else { /* Ex. happens with MatZeroEntries on a preallocated but not assembled matrix */
214     *array = static_cast<Mat_SeqAIJ *>(A->data)->a;
215   }
216   PetscFunctionReturn(PETSC_SUCCESS);
217 }
218 
219 static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[])
220 {
221   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
222 
223   PetscFunctionBegin;
224   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
225     aijkok->a_dual.clear_sync_state();
226     aijkok->a_dual.modify_host();
227   }
228   PetscFunctionReturn(PETSC_SUCCESS);
229 }
230 
231 static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJKokkos(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype)
232 {
233   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
234 
235   PetscFunctionBegin;
236   PetscCheck(aijkok != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "aijkok is NULL");
237 
238   if (i) *i = aijkok->i_device_data();
239   if (j) *j = aijkok->j_device_data();
240   if (a) {
241     aijkok->a_dual.sync_device();
242     *a = aijkok->a_device_data();
243   }
244   if (mtype) *mtype = PETSC_MEMTYPE_KOKKOS;
245   PetscFunctionReturn(PETSC_SUCCESS);
246 }
247 
248 /*
249   Generate the sparsity pattern of a MatSeqAIJKokkos matrix's transpose on device.
250 
251   Input Parameter:
252 .  A       - the MATSEQAIJKOKKOS matrix
253 
254   Output Parameters:
255 +  perm_d - the permutation array on device, which connects Ta(i) = Aa(perm(i))
256 -  T_d    - the transpose on device, whose value array is allocated but not initialized
257 */
258 static PetscErrorCode MatSeqAIJKokkosGenerateTransposeStructure(Mat A, MatRowMapKokkosView &perm_d, KokkosCsrMatrix &T_d)
259 {
260   Mat_SeqAIJ             *aseq = static_cast<Mat_SeqAIJ *>(A->data);
261   PetscInt                nz = aseq->nz, m = A->rmap->N, n = A->cmap->n;
262   const PetscInt         *Ai = aseq->i, *Aj = aseq->j;
263   MatRowMapKokkosViewHost Ti_h(NoInit("Ti"), n + 1);
264   MatRowMapType          *Ti = Ti_h.data();
265   MatColIdxKokkosViewHost Tj_h(NoInit("Tj"), nz);
266   MatRowMapKokkosViewHost perm_h(NoInit("permutation"), nz);
267   PetscInt               *Tj   = Tj_h.data();
268   PetscInt               *perm = perm_h.data();
269   PetscInt               *offset;
270 
271   PetscFunctionBegin;
272   // Populate Ti
273   PetscCallCXX(Kokkos::deep_copy(Ti_h, 0));
274   Ti++;
275   for (PetscInt i = 0; i < nz; i++) Ti[Aj[i]]++;
276   Ti--;
277   for (PetscInt i = 0; i < n; i++) Ti[i + 1] += Ti[i];
278 
279   // Populate Tj and the permutation array
280   PetscCall(PetscCalloc1(n, &offset)); // offset in each T row to fill in its column indices
281   for (PetscInt i = 0; i < m; i++) {
282     for (PetscInt j = Ai[i]; j < Ai[i + 1]; j++) { // A's (i,j) is T's (j,i)
283       PetscInt r    = Aj[j];                       // row r of T
284       PetscInt disp = Ti[r] + offset[r];
285 
286       Tj[disp]   = i; // col i of T
287       perm[disp] = j;
288       offset[r]++;
289     }
290   }
291   PetscCall(PetscFree(offset));
292 
293   // Sort each row of T, along with the permutation array
294   for (PetscInt i = 0; i < n; i++) PetscCall(PetscSortIntWithArray(Ti[i + 1] - Ti[i], Tj + Ti[i], perm + Ti[i]));
295 
296   // Output perm and T on device
297   auto Ti_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), Ti_h);
298   auto Tj_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), Tj_h);
299   PetscCallCXX(T_d = KokkosCsrMatrix("csrmatT", n, m, nz, MatScalarKokkosView("Ta", nz), Ti_d, Tj_d));
300   PetscCallCXX(perm_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), perm_h));
301   PetscFunctionReturn(PETSC_SUCCESS);
302 }
303 
304 // Generate the transpose on device and cache it internally
305 // Note: KK transpose_matrix() does not have support symbolic/numeric transpose, so we do it on our own
306 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGenerateTranspose_Private(Mat A, KokkosCsrMatrix *csrmatT)
307 {
308   Mat_SeqAIJ       *aseq = static_cast<Mat_SeqAIJ *>(A->data);
309   Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
310   PetscInt          nz = aseq->nz, m = A->rmap->N, n = A->cmap->n;
311   KokkosCsrMatrix  &T = akok->csrmatT;
312 
313   PetscFunctionBegin;
314   PetscCheck(akok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
315   PetscCallCXX(akok->a_dual.sync_device()); // Sync A's values since we are going to access them on device
316 
317   const auto &Aa = akok->a_dual.view_device();
318 
319   if (A->symmetric == PETSC_BOOL3_TRUE) {
320     *csrmatT = akok->csrmat;
321   } else {
322     // See if we already have a cached transpose and its value is up to date
323     if (T.numRows() == n && T.numCols() == m) {  // this indicates csrmatT had been generated before, otherwise T has 0 rows/cols after construction
324       if (!akok->transpose_updated) {            // if the value is out of date, update the cached version
325         const auto &perm = akok->transpose_perm; // get the permutation array
326         auto       &Ta   = T.values;
327 
328         PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { Ta(i) = Aa(perm(i)); }));
329       }
330     } else { // Generate T of size n x m for the first time
331       MatRowMapKokkosView perm;
332 
333       PetscCall(MatSeqAIJKokkosGenerateTransposeStructure(A, perm, T));
334       akok->transpose_perm = perm; // cache the perm in this matrix for reuse
335       PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { T.values(i) = Aa(perm(i)); }));
336     }
337     akok->transpose_updated = PETSC_TRUE;
338     *csrmatT                = akok->csrmatT;
339   }
340   PetscFunctionReturn(PETSC_SUCCESS);
341 }
342 
343 // Generate the Hermitian on device and cache it internally
344 static PetscErrorCode MatSeqAIJKokkosGenerateHermitian_Private(Mat A, KokkosCsrMatrix *csrmatH)
345 {
346   Mat_SeqAIJ       *aseq = static_cast<Mat_SeqAIJ *>(A->data);
347   Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
348   PetscInt          nz = aseq->nz, m = A->rmap->N, n = A->cmap->n;
349   KokkosCsrMatrix  &T = akok->csrmatH;
350 
351   PetscFunctionBegin;
352   PetscCheck(akok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
353   PetscCallCXX(akok->a_dual.sync_device()); // Sync A's values since we are going to access them on device
354 
355   const auto &Aa = akok->a_dual.view_device();
356 
357   if (A->hermitian == PETSC_BOOL3_TRUE) {
358     *csrmatH = akok->csrmat;
359   } else {
360     // See if we already have a cached hermitian and its value is up to date
361     if (T.numRows() == n && T.numCols() == m) {  // this indicates csrmatT had been generated before, otherwise T has 0 rows/cols after construction
362       if (!akok->hermitian_updated) {            // if the value is out of date, update the cached version
363         const auto &perm = akok->transpose_perm; // get the permutation array
364         auto       &Ta   = T.values;
365 
366         PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { Ta(i) = PetscConj(Aa(perm(i))); }));
367       }
368     } else { // Generate T of size n x m for the first time
369       MatRowMapKokkosView perm;
370 
371       PetscCall(MatSeqAIJKokkosGenerateTransposeStructure(A, perm, T));
372       akok->transpose_perm = perm; // cache the perm in this matrix for reuse
373       PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { T.values(i) = PetscConj(Aa(perm(i))); }));
374     }
375     akok->hermitian_updated = PETSC_TRUE;
376     *csrmatH                = akok->csrmatH;
377   }
378   PetscFunctionReturn(PETSC_SUCCESS);
379 }
380 
381 /* y = A x */
382 static PetscErrorCode MatMult_SeqAIJKokkos(Mat A, Vec xx, Vec yy)
383 {
384   Mat_SeqAIJKokkos          *aijkok;
385   ConstPetscScalarKokkosView xv;
386   PetscScalarKokkosView      yv;
387 
388   PetscFunctionBegin;
389   PetscCall(PetscLogGpuTimeBegin());
390   PetscCall(MatSeqAIJKokkosSyncDevice(A));
391   PetscCall(VecGetKokkosView(xx, &xv));
392   PetscCall(VecGetKokkosViewWrite(yy, &yv));
393   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
394   PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), "N", 1.0 /*alpha*/, aijkok->csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A x + beta y */
395   PetscCall(VecRestoreKokkosView(xx, &xv));
396   PetscCall(VecRestoreKokkosViewWrite(yy, &yv));
397   /* 2.0*nnz - numRows seems more accurate here but assumes there are no zero-rows. So a little sloppy here. */
398   PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz()));
399   PetscCall(PetscLogGpuTimeEnd());
400   PetscFunctionReturn(PETSC_SUCCESS);
401 }
402 
403 /* y = A^T x */
404 static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy)
405 {
406   Mat_SeqAIJKokkos          *aijkok;
407   const char                *mode;
408   ConstPetscScalarKokkosView xv;
409   PetscScalarKokkosView      yv;
410   KokkosCsrMatrix            csrmat;
411 
412   PetscFunctionBegin;
413   PetscCall(PetscLogGpuTimeBegin());
414   PetscCall(MatSeqAIJKokkosSyncDevice(A));
415   PetscCall(VecGetKokkosView(xx, &xv));
416   PetscCall(VecGetKokkosViewWrite(yy, &yv));
417   if (A->form_explicit_transpose) {
418     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat));
419     mode = "N";
420   } else {
421     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
422     csrmat = aijkok->csrmat;
423     mode   = "T";
424   }
425   PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^T x + beta y */
426   PetscCall(VecRestoreKokkosView(xx, &xv));
427   PetscCall(VecRestoreKokkosViewWrite(yy, &yv));
428   PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
429   PetscCall(PetscLogGpuTimeEnd());
430   PetscFunctionReturn(PETSC_SUCCESS);
431 }
432 
433 /* y = A^H x */
434 static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy)
435 {
436   Mat_SeqAIJKokkos          *aijkok;
437   const char                *mode;
438   ConstPetscScalarKokkosView xv;
439   PetscScalarKokkosView      yv;
440   KokkosCsrMatrix            csrmat;
441 
442   PetscFunctionBegin;
443   PetscCall(PetscLogGpuTimeBegin());
444   PetscCall(MatSeqAIJKokkosSyncDevice(A));
445   PetscCall(VecGetKokkosView(xx, &xv));
446   PetscCall(VecGetKokkosViewWrite(yy, &yv));
447   if (A->form_explicit_transpose) {
448     PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat));
449     mode = "N";
450   } else {
451     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
452     csrmat = aijkok->csrmat;
453     mode   = "C";
454   }
455   PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^H x + beta y */
456   PetscCall(VecRestoreKokkosView(xx, &xv));
457   PetscCall(VecRestoreKokkosViewWrite(yy, &yv));
458   PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
459   PetscCall(PetscLogGpuTimeEnd());
460   PetscFunctionReturn(PETSC_SUCCESS);
461 }
462 
463 /* z = A x + y */
464 static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz)
465 {
466   Mat_SeqAIJKokkos          *aijkok;
467   ConstPetscScalarKokkosView xv;
468   PetscScalarKokkosView      zv;
469 
470   PetscFunctionBegin;
471   PetscCall(PetscLogGpuTimeBegin());
472   PetscCall(MatSeqAIJKokkosSyncDevice(A));
473   if (zz != yy) PetscCall(VecCopy(yy, zz)); // depending on yy's sync flags, zz might get its latest data on host
474   PetscCall(VecGetKokkosView(xx, &xv));
475   PetscCall(VecGetKokkosView(zz, &zv)); // do after VecCopy(yy, zz) to get the latest data on device
476   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
477   PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), "N", 1.0 /*alpha*/, aijkok->csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A x + beta z */
478   PetscCall(VecRestoreKokkosView(xx, &xv));
479   PetscCall(VecRestoreKokkosView(zz, &zv));
480   PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz()));
481   PetscCall(PetscLogGpuTimeEnd());
482   PetscFunctionReturn(PETSC_SUCCESS);
483 }
484 
485 /* z = A^T x + y */
486 static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz)
487 {
488   Mat_SeqAIJKokkos          *aijkok;
489   const char                *mode;
490   ConstPetscScalarKokkosView xv;
491   PetscScalarKokkosView      zv;
492   KokkosCsrMatrix            csrmat;
493 
494   PetscFunctionBegin;
495   PetscCall(PetscLogGpuTimeBegin());
496   PetscCall(MatSeqAIJKokkosSyncDevice(A));
497   if (zz != yy) PetscCall(VecCopy(yy, zz));
498   PetscCall(VecGetKokkosView(xx, &xv));
499   PetscCall(VecGetKokkosView(zz, &zv));
500   if (A->form_explicit_transpose) {
501     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat));
502     mode = "N";
503   } else {
504     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
505     csrmat = aijkok->csrmat;
506     mode   = "T";
507   }
508   PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^T x + beta z */
509   PetscCall(VecRestoreKokkosView(xx, &xv));
510   PetscCall(VecRestoreKokkosView(zz, &zv));
511   PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
512   PetscCall(PetscLogGpuTimeEnd());
513   PetscFunctionReturn(PETSC_SUCCESS);
514 }
515 
516 /* z = A^H x + y */
517 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz)
518 {
519   Mat_SeqAIJKokkos          *aijkok;
520   const char                *mode;
521   ConstPetscScalarKokkosView xv;
522   PetscScalarKokkosView      zv;
523   KokkosCsrMatrix            csrmat;
524 
525   PetscFunctionBegin;
526   PetscCall(PetscLogGpuTimeBegin());
527   PetscCall(MatSeqAIJKokkosSyncDevice(A));
528   if (zz != yy) PetscCall(VecCopy(yy, zz));
529   PetscCall(VecGetKokkosView(xx, &xv));
530   PetscCall(VecGetKokkosView(zz, &zv));
531   if (A->form_explicit_transpose) {
532     PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat));
533     mode = "N";
534   } else {
535     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
536     csrmat = aijkok->csrmat;
537     mode   = "C";
538   }
539   PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^H x + beta z */
540   PetscCall(VecRestoreKokkosView(xx, &xv));
541   PetscCall(VecRestoreKokkosView(zz, &zv));
542   PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
543   PetscCall(PetscLogGpuTimeEnd());
544   PetscFunctionReturn(PETSC_SUCCESS);
545 }
546 
547 static PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A, MatOption op, PetscBool flg)
548 {
549   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
550 
551   PetscFunctionBegin;
552   switch (op) {
553   case MAT_FORM_EXPLICIT_TRANSPOSE:
554     /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
555     if (A->form_explicit_transpose && !flg && aijkok) PetscCall(aijkok->DestroyMatTranspose());
556     A->form_explicit_transpose = flg;
557     break;
558   default:
559     PetscCall(MatSetOption_SeqAIJ(A, op, flg));
560     break;
561   }
562   PetscFunctionReturn(PETSC_SUCCESS);
563 }
564 
565 /* Depending on reuse, either build a new mat, or use the existing mat */
566 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat *newmat)
567 {
568   Mat_SeqAIJ *aseq;
569 
570   PetscFunctionBegin;
571   PetscCall(PetscKokkosInitializeCheck());
572   if (reuse == MAT_INITIAL_MATRIX) { /* Build a brand new mat */
573     PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat));
574     PetscCall(MatSetType(*newmat, mtype));
575   } else if (reuse == MAT_REUSE_MATRIX) {                 /* Reuse the mat created before */
576     PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN)); /* newmat is already a SeqAIJKokkos */
577   } else if (reuse == MAT_INPLACE_MATRIX) {               /* newmat is A */
578     PetscCheck(A == *newmat, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "A != *newmat with MAT_INPLACE_MATRIX");
579     PetscCall(PetscFree(A->defaultvectype));
580     PetscCall(PetscStrallocpy(VECKOKKOS, &A->defaultvectype)); /* Allocate and copy the string */
581     PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATSEQAIJKOKKOS));
582     PetscCall(MatSetOps_SeqAIJKokkos(A));
583     aseq = static_cast<Mat_SeqAIJ *>(A->data);
584     if (A->assembled) { /* Copy i, j (but not values) to device for an assembled matrix if not yet */
585       PetscCheck(!A->spptr, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Expect NULL (Mat_SeqAIJKokkos*)A->spptr");
586       A->spptr = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aseq, A->nonzerostate, PETSC_FALSE);
587     }
588   }
589   PetscFunctionReturn(PETSC_SUCCESS);
590 }
591 
592 /* MatDuplicate always creates a new matrix. MatDuplicate can be called either on an assembled matrix or
593    an unassembled matrix, even though MAT_COPY_VALUES is not allowed for unassembled matrix.
594  */
595 static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A, MatDuplicateOption dupOption, Mat *B)
596 {
597   Mat_SeqAIJ       *bseq;
598   Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr), *bkok;
599   Mat               mat;
600 
601   PetscFunctionBegin;
602   /* Do not copy values on host as A's latest values might be on device. We don't want to do sync blindly */
603   PetscCall(MatDuplicate_SeqAIJ(A, MAT_DO_NOT_COPY_VALUES, B));
604   mat = *B;
605   if (A->assembled) {
606     bseq = static_cast<Mat_SeqAIJ *>(mat->data);
607     bkok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, bseq, mat->nonzerostate, PETSC_FALSE);
608     bkok->a_dual.clear_sync_state(); /* Clear B's sync state as it will be decided below */
609     /* Now copy values to B if needed */
610     if (dupOption == MAT_COPY_VALUES) {
611       if (akok->a_dual.need_sync_device()) {
612         Kokkos::deep_copy(bkok->a_dual.view_host(), akok->a_dual.view_host());
613         bkok->a_dual.modify_host();
614       } else { /* If device has the latest data, we only copy data on device */
615         Kokkos::deep_copy(bkok->a_dual.view_device(), akok->a_dual.view_device());
616         bkok->a_dual.modify_device();
617       }
618     } else { /* MAT_DO_NOT_COPY_VALUES or MAT_SHARE_NONZERO_PATTERN. B's values should be zeroed */
619       /* B's values on host should be already zeroed by MatDuplicate_SeqAIJ() */
620       bkok->a_dual.modify_host();
621     }
622     mat->spptr = bkok;
623   }
624 
625   PetscCall(PetscFree(mat->defaultvectype));
626   PetscCall(PetscStrallocpy(VECKOKKOS, &mat->defaultvectype)); /* Allocate and copy the string */
627   PetscCall(PetscObjectChangeTypeName((PetscObject)mat, MATSEQAIJKOKKOS));
628   PetscCall(MatSetOps_SeqAIJKokkos(mat));
629   PetscFunctionReturn(PETSC_SUCCESS);
630 }
631 
632 static PetscErrorCode MatTranspose_SeqAIJKokkos(Mat A, MatReuse reuse, Mat *B)
633 {
634   Mat               At;
635   KokkosCsrMatrix   internT;
636   Mat_SeqAIJKokkos *atkok, *bkok;
637 
638   PetscFunctionBegin;
639   if (reuse == MAT_REUSE_MATRIX) PetscCall(MatTransposeCheckNonzeroState_Private(A, *B));
640   PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &internT)); /* Generate a transpose internally */
641   if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_INPLACE_MATRIX) {
642     /* Deep copy internT, as we want to isolate the internal transpose */
643     PetscCallCXX(atkok = new Mat_SeqAIJKokkos(KokkosCsrMatrix("csrmat", internT)));
644     PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PetscObjectComm((PetscObject)A), atkok, &At));
645     if (reuse == MAT_INITIAL_MATRIX) *B = At;
646     else PetscCall(MatHeaderReplace(A, &At)); /* Replace A with At inplace */
647   } else {                                    /* MAT_REUSE_MATRIX, just need to copy values to B on device */
648     if ((*B)->assembled) {
649       bkok = static_cast<Mat_SeqAIJKokkos *>((*B)->spptr);
650       PetscCallCXX(Kokkos::deep_copy(bkok->a_dual.view_device(), internT.values));
651       PetscCall(MatSeqAIJKokkosModifyDevice(*B));
652     } else if ((*B)->preallocated) { /* It is ok for B to be only preallocated, as needed in MatTranspose_MPIAIJ */
653       Mat_SeqAIJ             *bseq = static_cast<Mat_SeqAIJ *>((*B)->data);
654       MatScalarKokkosViewHost a_h(bseq->a, internT.nnz()); /* bseq->nz = 0 if unassembled */
655       MatColIdxKokkosViewHost j_h(bseq->j, internT.nnz());
656       PetscCallCXX(Kokkos::deep_copy(a_h, internT.values));
657       PetscCallCXX(Kokkos::deep_copy(j_h, internT.graph.entries));
658     } else SETERRQ(PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "B must be assembled or preallocated");
659   }
660   PetscFunctionReturn(PETSC_SUCCESS);
661 }
662 
663 static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A)
664 {
665   Mat_SeqAIJKokkos *aijkok;
666 
667   PetscFunctionBegin;
668   if (A->factortype == MAT_FACTOR_NONE) {
669     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
670     delete aijkok;
671   } else {
672     delete static_cast<Mat_SeqAIJKokkosTriFactors *>(A->spptr);
673   }
674   A->spptr = NULL;
675   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatFactorGetSolverType_C", NULL));
676   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
677   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
678 #if defined(PETSC_HAVE_HYPRE)
679   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijkokkos_hypre_C", NULL));
680 #endif
681   PetscCall(MatDestroy_SeqAIJ(A));
682   PetscFunctionReturn(PETSC_SUCCESS);
683 }
684 
685 /*MC
686    MATSEQAIJKOKKOS - MATAIJKOKKOS = "(seq)aijkokkos" - A matrix type to be used for sparse matrices with Kokkos
687 
688    A matrix type using Kokkos-Kernels CrsMatrix type for portability across different device types
689 
690    Options Database Key:
691 .  -mat_type aijkokkos - sets the matrix type to `MATSEQAIJKOKKOS` during a call to `MatSetFromOptions()`
692 
693   Level: beginner
694 
695 .seealso: [](ch_matrices), `Mat`, `MatCreateSeqAIJKokkos()`, `MATMPIAIJKOKKOS`
696 M*/
697 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A)
698 {
699   PetscFunctionBegin;
700   PetscCall(PetscKokkosInitializeCheck());
701   PetscCall(MatCreate_SeqAIJ(A));
702   PetscCall(MatConvert_SeqAIJ_SeqAIJKokkos(A, MATSEQAIJKOKKOS, MAT_INPLACE_MATRIX, &A));
703   PetscFunctionReturn(PETSC_SUCCESS);
704 }
705 
706 /* Merge A, B into a matrix C. A is put before B. C's size would be A->rmap->n by (A->cmap->n + B->cmap->n) */
707 PetscErrorCode MatSeqAIJKokkosMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C)
708 {
709   Mat_SeqAIJ         *a, *b;
710   Mat_SeqAIJKokkos   *akok, *bkok, *ckok;
711   MatScalarKokkosView aa, ba, ca;
712   MatRowMapKokkosView ai, bi, ci;
713   MatColIdxKokkosView aj, bj, cj;
714   PetscInt            m, n, nnz, aN;
715 
716   PetscFunctionBegin;
717   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
718   PetscValidHeaderSpecific(B, MAT_CLASSID, 2);
719   PetscAssertPointer(C, 4);
720   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
721   PetscCheckTypeName(B, MATSEQAIJKOKKOS);
722   PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n);
723   PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported");
724 
725   PetscCall(MatSeqAIJKokkosSyncDevice(A));
726   PetscCall(MatSeqAIJKokkosSyncDevice(B));
727   a    = static_cast<Mat_SeqAIJ *>(A->data);
728   b    = static_cast<Mat_SeqAIJ *>(B->data);
729   akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
730   bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
731   aa   = akok->a_dual.view_device();
732   ai   = akok->i_dual.view_device();
733   ba   = bkok->a_dual.view_device();
734   bi   = bkok->i_dual.view_device();
735   m    = A->rmap->n; /* M, N and nnz of C */
736   n    = A->cmap->n + B->cmap->n;
737   nnz  = a->nz + b->nz;
738   aN   = A->cmap->n; /* N of A */
739   if (reuse == MAT_INITIAL_MATRIX) {
740     aj           = akok->j_dual.view_device();
741     bj           = bkok->j_dual.view_device();
742     auto ca_dual = MatScalarKokkosDualView("a", aa.extent(0) + ba.extent(0));
743     auto ci_dual = MatRowMapKokkosDualView("i", ai.extent(0));
744     auto cj_dual = MatColIdxKokkosDualView("j", aj.extent(0) + bj.extent(0));
745     ca           = ca_dual.view_device();
746     ci           = ci_dual.view_device();
747     cj           = cj_dual.view_device();
748 
749     /* Concatenate A and B in parallel using Kokkos hierarchical parallelism */
750     Kokkos::parallel_for(
751       Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
752         PetscInt i       = t.league_rank(); /* row i */
753         PetscInt coffset = ai(i) + bi(i), alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i);
754 
755         Kokkos::single(Kokkos::PerTeam(t), [=]() { /* this side effect only happens once per whole team */
756                                                    ci(i) = coffset;
757                                                    if (i == m - 1) ci(m) = ai(m) + bi(m);
758         });
759 
760         Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) {
761           if (k < alen) {
762             ca(coffset + k) = aa(ai(i) + k);
763             cj(coffset + k) = aj(ai(i) + k);
764           } else {
765             ca(coffset + k) = ba(bi(i) + k - alen);
766             cj(coffset + k) = bj(bi(i) + k - alen) + aN; /* Entries in B get new column indices in C */
767           }
768         });
769       });
770     ca_dual.modify_device();
771     ci_dual.modify_device();
772     cj_dual.modify_device();
773     PetscCallCXX(ckok = new Mat_SeqAIJKokkos(m, n, nnz, ci_dual, cj_dual, ca_dual));
774     PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, ckok, C));
775   } else if (reuse == MAT_REUSE_MATRIX) {
776     PetscValidHeaderSpecific(*C, MAT_CLASSID, 4);
777     PetscCheckTypeName(*C, MATSEQAIJKOKKOS);
778     ckok = static_cast<Mat_SeqAIJKokkos *>((*C)->spptr);
779     ca   = ckok->a_dual.view_device();
780     ci   = ckok->i_dual.view_device();
781 
782     Kokkos::parallel_for(
783       Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
784         PetscInt i    = t.league_rank(); /* row i */
785         PetscInt alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i);
786         Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) {
787           if (k < alen) ca(ci(i) + k) = aa(ai(i) + k);
788           else ca(ci(i) + k) = ba(bi(i) + k - alen);
789         });
790       });
791     PetscCall(MatSeqAIJKokkosModifyDevice(*C));
792   }
793   PetscFunctionReturn(PETSC_SUCCESS);
794 }
795 
796 static PetscErrorCode MatProductDataDestroy_SeqAIJKokkos(void *pdata)
797 {
798   PetscFunctionBegin;
799   delete static_cast<MatProductData_SeqAIJKokkos *>(pdata);
800   PetscFunctionReturn(PETSC_SUCCESS);
801 }
802 
803 static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C)
804 {
805   Mat_Product                 *product = C->product;
806   Mat                          A, B;
807   bool                         transA, transB; /* use bool, since KK needs this type */
808   Mat_SeqAIJKokkos            *akok, *bkok, *ckok;
809   Mat_SeqAIJ                  *c;
810   MatProductData_SeqAIJKokkos *pdata;
811   KokkosCsrMatrix              csrmatA, csrmatB;
812 
813   PetscFunctionBegin;
814   MatCheckProduct(C, 1);
815   PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Product data empty");
816   pdata = static_cast<MatProductData_SeqAIJKokkos *>(C->product->data);
817 
818   // See if numeric has already been done in symbolic (e.g., user calls MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C)).
819   // If yes, skip the numeric, but reset the flag so that next time when user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C),
820   // we still do numeric.
821   if (pdata->reusesym) { // numeric reuses results from symbolic
822     pdata->reusesym = PETSC_FALSE;
823     PetscFunctionReturn(PETSC_SUCCESS);
824   }
825 
826   switch (product->type) {
827   case MATPRODUCT_AB:
828     transA = false;
829     transB = false;
830     break;
831   case MATPRODUCT_AtB:
832     transA = true;
833     transB = false;
834     break;
835   case MATPRODUCT_ABt:
836     transA = false;
837     transB = true;
838     break;
839   default:
840     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]);
841   }
842 
843   A = product->A;
844   B = product->B;
845   PetscCall(MatSeqAIJKokkosSyncDevice(A));
846   PetscCall(MatSeqAIJKokkosSyncDevice(B));
847   akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
848   bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
849   ckok = static_cast<Mat_SeqAIJKokkos *>(C->spptr);
850 
851   PetscCheck(ckok, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Device data structure spptr is empty");
852 
853   csrmatA = akok->csrmat;
854   csrmatB = bkok->csrmat;
855 
856   /* TODO: Once KK spgemm implements transpose, we can get rid of the explicit transpose here */
857   if (transA) {
858     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA));
859     transA = false;
860   }
861 
862   if (transB) {
863     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB));
864     transB = false;
865   }
866   PetscCall(PetscLogGpuTimeBegin());
867   PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, csrmatA, transA, csrmatB, transB, ckok->csrmat));
868 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(4, 0, 0)
869   auto spgemmHandle = pdata->kh.get_spgemm_handle();
870   if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(ckok->csrmat)); /* without sort, mat_tests-ex62_14_seqaijkokkos fails */
871 #endif
872 
873   PetscCall(PetscLogGpuTimeEnd());
874   PetscCall(MatSeqAIJKokkosModifyDevice(C));
875   /* shorter version of MatAssemblyEnd_SeqAIJ */
876   c = (Mat_SeqAIJ *)C->data;
877   PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz));
878   PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n"));
879   PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax));
880   c->reallocs         = 0;
881   C->info.mallocs     = 0;
882   C->info.nz_unneeded = 0;
883   C->assembled = C->was_assembled = PETSC_TRUE;
884   C->num_ass++;
885   PetscFunctionReturn(PETSC_SUCCESS);
886 }
887 
888 static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C)
889 {
890   Mat_Product                 *product = C->product;
891   MatProductType               ptype;
892   Mat                          A, B;
893   bool                         transA, transB;
894   Mat_SeqAIJKokkos            *akok, *bkok, *ckok;
895   MatProductData_SeqAIJKokkos *pdata;
896   MPI_Comm                     comm;
897   KokkosCsrMatrix              csrmatA, csrmatB, csrmatC;
898 
899   PetscFunctionBegin;
900   MatCheckProduct(C, 1);
901   PetscCall(PetscObjectGetComm((PetscObject)C, &comm));
902   PetscCheck(!product->data, comm, PETSC_ERR_PLIB, "Product data not empty");
903   A = product->A;
904   B = product->B;
905   PetscCall(MatSeqAIJKokkosSyncDevice(A));
906   PetscCall(MatSeqAIJKokkosSyncDevice(B));
907   akok    = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
908   bkok    = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
909   csrmatA = akok->csrmat;
910   csrmatB = bkok->csrmat;
911 
912   ptype = product->type;
913   // Take advantage of the symmetry if true
914   if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
915     ptype                                          = MATPRODUCT_AB;
916     product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE;
917   }
918   if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
919     ptype                                          = MATPRODUCT_AB;
920     product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE;
921   }
922 
923   switch (ptype) {
924   case MATPRODUCT_AB:
925     transA = false;
926     transB = false;
927     PetscCall(MatSetBlockSizesFromMats(C, A, B));
928     break;
929   case MATPRODUCT_AtB:
930     transA = true;
931     transB = false;
932     if (A->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, A->cmap->bs));
933     if (B->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->cmap->bs));
934     break;
935   case MATPRODUCT_ABt:
936     transA = false;
937     transB = true;
938     if (A->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, A->rmap->bs));
939     if (B->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->rmap->bs));
940     break;
941   default:
942     SETERRQ(comm, PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]);
943   }
944   PetscCallCXX(product->data = pdata = new MatProductData_SeqAIJKokkos());
945   pdata->reusesym = product->api_user;
946 
947   /* TODO: add command line options to select spgemm algorithms */
948   auto spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_DEFAULT; /* default alg is TPL if enabled, otherwise KK */
949 
950   /* CUDA-10.2's spgemm has bugs. We prefer the SpGEMMreuse APIs introduced in cuda-11.4 */
951 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
952   #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
953   spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_KK;
954   #endif
955 #endif
956   PetscCallCXX(pdata->kh.create_spgemm_handle(spgemm_alg));
957 
958   PetscCall(PetscLogGpuTimeBegin());
959   /* TODO: Get rid of the explicit transpose once KK-spgemm implements the transpose option */
960   if (transA) {
961     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA));
962     transA = false;
963   }
964 
965   if (transB) {
966     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB));
967     transB = false;
968   }
969 
970   PetscCallCXX(KokkosSparse::spgemm_symbolic(pdata->kh, csrmatA, transA, csrmatB, transB, csrmatC));
971   /* spgemm_symbolic() only populates C's rowmap, but not C's column indices.
972     So we have to do a fake spgemm_numeric() here to get csrmatC.j_d setup, before
973     calling new Mat_SeqAIJKokkos().
974     TODO: Remove the fake spgemm_numeric() after KK fixed this problem.
975   */
976   PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, csrmatA, transA, csrmatB, transB, csrmatC));
977 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(4, 0, 0)
978   /* Query if KK outputs a sorted matrix. If not, we need to sort it */
979   auto spgemmHandle = pdata->kh.get_spgemm_handle();
980   if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(csrmatC)); /* sort_option defaults to -1 in KK!*/
981 #endif
982   PetscCall(PetscLogGpuTimeEnd());
983 
984   PetscCallCXX(ckok = new Mat_SeqAIJKokkos(csrmatC));
985   PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(C, ckok));
986   C->product->destroy = MatProductDataDestroy_SeqAIJKokkos;
987   PetscFunctionReturn(PETSC_SUCCESS);
988 }
989 
990 /* handles sparse matrix matrix ops */
991 static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat)
992 {
993   Mat_Product *product = mat->product;
994   PetscBool    Biskok = PETSC_FALSE, Ciskok = PETSC_TRUE;
995 
996   PetscFunctionBegin;
997   MatCheckProduct(mat, 1);
998   PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJKOKKOS, &Biskok));
999   if (product->type == MATPRODUCT_ABC) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJKOKKOS, &Ciskok));
1000   if (Biskok && Ciskok) {
1001     switch (product->type) {
1002     case MATPRODUCT_AB:
1003     case MATPRODUCT_AtB:
1004     case MATPRODUCT_ABt:
1005       mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos;
1006       break;
1007     case MATPRODUCT_PtAP:
1008     case MATPRODUCT_RARt:
1009     case MATPRODUCT_ABC:
1010       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
1011       break;
1012     default:
1013       break;
1014     }
1015   } else { /* fallback for AIJ */
1016     PetscCall(MatProductSetFromOptions_SeqAIJ(mat));
1017   }
1018   PetscFunctionReturn(PETSC_SUCCESS);
1019 }
1020 
1021 static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a)
1022 {
1023   Mat_SeqAIJKokkos *aijkok;
1024 
1025   PetscFunctionBegin;
1026   PetscCall(PetscLogGpuTimeBegin());
1027   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1028   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1029   KokkosBlas::scal(PetscGetKokkosExecutionSpace(), aijkok->a_dual.view_device(), a, aijkok->a_dual.view_device());
1030   PetscCall(MatSeqAIJKokkosModifyDevice(A));
1031   PetscCall(PetscLogGpuFlops(aijkok->a_dual.extent(0)));
1032   PetscCall(PetscLogGpuTimeEnd());
1033   PetscFunctionReturn(PETSC_SUCCESS);
1034 }
1035 
1036 // add a to A's diagonal (if A is square) or main diagonal (if A is rectangular)
1037 static PetscErrorCode MatShift_SeqAIJKokkos(Mat A, PetscScalar a)
1038 {
1039   Mat_SeqAIJ *aijseq = static_cast<Mat_SeqAIJ *>(A->data);
1040 
1041   PetscFunctionBegin;
1042   if (A->assembled && aijseq->diagonaldense) { // no missing diagonals
1043     PetscInt n = PetscMin(A->rmap->n, A->cmap->n);
1044 
1045     PetscCall(PetscLogGpuTimeBegin());
1046     PetscCall(MatSeqAIJKokkosSyncDevice(A));
1047     const auto  aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1048     const auto &Aa     = aijkok->a_dual.view_device();
1049     const auto &Adiag  = aijkok->diag_dual.view_device();
1050     PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) { Aa(Adiag(i)) += a; }));
1051     PetscCall(MatSeqAIJKokkosModifyDevice(A));
1052     PetscCall(PetscLogGpuFlops(n));
1053     PetscCall(PetscLogGpuTimeEnd());
1054   } else { // need reassembly, very slow!
1055     PetscCall(MatShift_Basic(A, a));
1056   }
1057   PetscFunctionReturn(PETSC_SUCCESS);
1058 }
1059 
1060 static PetscErrorCode MatDiagonalSet_SeqAIJKokkos(Mat Y, Vec D, InsertMode is)
1061 {
1062   Mat_SeqAIJ *aijseq = static_cast<Mat_SeqAIJ *>(Y->data);
1063 
1064   PetscFunctionBegin;
1065   if (Y->assembled && aijseq->diagonaldense) { // no missing diagonals
1066     ConstPetscScalarKokkosView dv;
1067     PetscInt                   n, nv;
1068 
1069     PetscCall(PetscLogGpuTimeBegin());
1070     PetscCall(MatSeqAIJKokkosSyncDevice(Y));
1071     PetscCall(VecGetKokkosView(D, &dv));
1072     PetscCall(VecGetLocalSize(D, &nv));
1073     n = PetscMin(Y->rmap->n, Y->cmap->n);
1074     PetscCheck(n == nv, PetscObjectComm((PetscObject)Y), PETSC_ERR_ARG_SIZ, "Matrix size and vector size do not match");
1075 
1076     const auto  aijkok = static_cast<Mat_SeqAIJKokkos *>(Y->spptr);
1077     const auto &Aa     = aijkok->a_dual.view_device();
1078     const auto &Adiag  = aijkok->diag_dual.view_device();
1079     PetscCallCXX(Kokkos::parallel_for(
1080       Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) {
1081         if (is == INSERT_VALUES) Aa(Adiag(i)) = dv(i);
1082         else Aa(Adiag(i)) += dv(i);
1083       }));
1084     PetscCall(VecRestoreKokkosView(D, &dv));
1085     PetscCall(MatSeqAIJKokkosModifyDevice(Y));
1086     PetscCall(PetscLogGpuFlops(n));
1087     PetscCall(PetscLogGpuTimeEnd());
1088   } else { // need reassembly, very slow!
1089     PetscCall(MatDiagonalSet_Default(Y, D, is));
1090   }
1091   PetscFunctionReturn(PETSC_SUCCESS);
1092 }
1093 
1094 static PetscErrorCode MatDiagonalScale_SeqAIJKokkos(Mat A, Vec ll, Vec rr)
1095 {
1096   Mat_SeqAIJ                *aijseq = static_cast<Mat_SeqAIJ *>(A->data);
1097   PetscInt                   m = A->rmap->n, n = A->cmap->n, nz = aijseq->nz;
1098   ConstPetscScalarKokkosView lv, rv;
1099 
1100   PetscFunctionBegin;
1101   PetscCall(PetscLogGpuTimeBegin());
1102   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1103   const auto  aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1104   const auto &Aa     = aijkok->a_dual.view_device();
1105   const auto &Ai     = aijkok->i_dual.view_device();
1106   const auto &Aj     = aijkok->j_dual.view_device();
1107   if (ll) {
1108     PetscCall(VecGetLocalSize(ll, &m));
1109     PetscCheck(m == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling vector wrong length");
1110     PetscCall(VecGetKokkosView(ll, &lv));
1111     PetscCallCXX(Kokkos::parallel_for( // for each row
1112       Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
1113         PetscInt i   = t.league_rank(); // row i
1114         PetscInt len = Ai(i + 1) - Ai(i);
1115         // scale entries on the row
1116         Kokkos::parallel_for(Kokkos::TeamThreadRange(t, len), [&](PetscInt j) { Aa(Ai(i) + j) *= lv(i); });
1117       }));
1118     PetscCall(VecRestoreKokkosView(ll, &lv));
1119     PetscCall(PetscLogGpuFlops(nz));
1120   }
1121   if (rr) {
1122     PetscCall(VecGetLocalSize(rr, &n));
1123     PetscCheck(n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Right scaling vector wrong length");
1124     PetscCall(VecGetKokkosView(rr, &rv));
1125     PetscCallCXX(Kokkos::parallel_for( // for each nonzero
1126       Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt k) { Aa(k) *= rv(Aj(k)); }));
1127     PetscCall(VecRestoreKokkosView(rr, &lv));
1128     PetscCall(PetscLogGpuFlops(nz));
1129   }
1130   PetscCall(MatSeqAIJKokkosModifyDevice(A));
1131   PetscCall(PetscLogGpuTimeEnd());
1132   PetscFunctionReturn(PETSC_SUCCESS);
1133 }
1134 
1135 static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A)
1136 {
1137   Mat_SeqAIJKokkos *aijkok;
1138 
1139   PetscFunctionBegin;
1140   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1141   if (aijkok) { /* Only zero the device if data is already there */
1142     KokkosBlas::fill(PetscGetKokkosExecutionSpace(), aijkok->a_dual.view_device(), 0.0);
1143     PetscCall(MatSeqAIJKokkosModifyDevice(A));
1144   } else { /* Might be preallocated but not assembled */
1145     PetscCall(MatZeroEntries_SeqAIJ(A));
1146   }
1147   PetscFunctionReturn(PETSC_SUCCESS);
1148 }
1149 
1150 static PetscErrorCode MatGetDiagonal_SeqAIJKokkos(Mat A, Vec x)
1151 {
1152   Mat_SeqAIJKokkos     *aijkok;
1153   PetscInt              n;
1154   PetscScalarKokkosView xv;
1155 
1156   PetscFunctionBegin;
1157   PetscCall(VecGetLocalSize(x, &n));
1158   PetscCheck(n == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector");
1159   PetscCheck(A->factortype == MAT_FACTOR_NONE, PETSC_COMM_SELF, PETSC_ERR_SUP, "MatGetDiagonal_SeqAIJKokkos not supported on factored matrices");
1160 
1161   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1162   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1163 
1164   const auto &Aa    = aijkok->a_dual.view_device();
1165   const auto &Ai    = aijkok->i_dual.view_device();
1166   const auto &Adiag = aijkok->diag_dual.view_device();
1167 
1168   PetscCall(VecGetKokkosViewWrite(x, &xv));
1169   Kokkos::parallel_for(
1170     Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) {
1171       if (Adiag(i) < Ai(i + 1)) xv(i) = Aa(Adiag(i));
1172       else xv(i) = 0;
1173     });
1174   PetscCall(VecRestoreKokkosViewWrite(x, &xv));
1175   PetscFunctionReturn(PETSC_SUCCESS);
1176 }
1177 
1178 /* Get a Kokkos View from a mat of type MatSeqAIJKokkos */
1179 PetscErrorCode MatSeqAIJGetKokkosView(Mat A, ConstMatScalarKokkosView *kv)
1180 {
1181   Mat_SeqAIJKokkos *aijkok;
1182 
1183   PetscFunctionBegin;
1184   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1185   PetscAssertPointer(kv, 2);
1186   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1187   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1188   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1189   *kv    = aijkok->a_dual.view_device();
1190   PetscFunctionReturn(PETSC_SUCCESS);
1191 }
1192 
1193 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, ConstMatScalarKokkosView *kv)
1194 {
1195   PetscFunctionBegin;
1196   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1197   PetscAssertPointer(kv, 2);
1198   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1199   PetscFunctionReturn(PETSC_SUCCESS);
1200 }
1201 
1202 PetscErrorCode MatSeqAIJGetKokkosView(Mat A, MatScalarKokkosView *kv)
1203 {
1204   Mat_SeqAIJKokkos *aijkok;
1205 
1206   PetscFunctionBegin;
1207   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1208   PetscAssertPointer(kv, 2);
1209   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1210   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1211   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1212   *kv    = aijkok->a_dual.view_device();
1213   PetscFunctionReturn(PETSC_SUCCESS);
1214 }
1215 
1216 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, MatScalarKokkosView *kv)
1217 {
1218   PetscFunctionBegin;
1219   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1220   PetscAssertPointer(kv, 2);
1221   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1222   PetscCall(MatSeqAIJKokkosModifyDevice(A));
1223   PetscFunctionReturn(PETSC_SUCCESS);
1224 }
1225 
1226 PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A, MatScalarKokkosView *kv)
1227 {
1228   Mat_SeqAIJKokkos *aijkok;
1229 
1230   PetscFunctionBegin;
1231   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1232   PetscAssertPointer(kv, 2);
1233   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1234   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1235   *kv    = aijkok->a_dual.view_device();
1236   PetscFunctionReturn(PETSC_SUCCESS);
1237 }
1238 
1239 PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A, MatScalarKokkosView *kv)
1240 {
1241   PetscFunctionBegin;
1242   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1243   PetscAssertPointer(kv, 2);
1244   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1245   PetscCall(MatSeqAIJKokkosModifyDevice(A));
1246   PetscFunctionReturn(PETSC_SUCCESS);
1247 }
1248 
1249 PetscErrorCode MatCreateSeqAIJKokkosWithKokkosViews(MPI_Comm comm, PetscInt m, PetscInt n, Kokkos::View<PetscInt *> &i_d, Kokkos::View<PetscInt *> &j_d, Kokkos::View<PetscScalar *> &a_d, Mat *A)
1250 {
1251   Mat_SeqAIJKokkos *akok;
1252 
1253   PetscFunctionBegin;
1254   auto exec = PetscGetKokkosExecutionSpace();
1255   // Create host copies of the input aij
1256   auto i_h = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), i_d);
1257   auto j_h = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), j_d);
1258   // Don't copy the vals to the host now
1259   auto a_h = Kokkos::create_mirror_view(HostMirrorMemorySpace(), a_d);
1260 
1261   MatScalarKokkosDualView a_dual = MatScalarKokkosDualView(a_d, a_h);
1262   // Note we have modified device data so it will copy lazily
1263   a_dual.modify_device();
1264   MatRowMapKokkosDualView i_dual = MatRowMapKokkosDualView(i_d, i_h);
1265   MatColIdxKokkosDualView j_dual = MatColIdxKokkosDualView(j_d, j_h);
1266 
1267   PetscCallCXX(akok = new Mat_SeqAIJKokkos(m, n, j_dual.extent(0), i_dual, j_dual, a_dual));
1268   PetscCall(MatCreate(comm, A));
1269   PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok));
1270   PetscFunctionReturn(PETSC_SUCCESS);
1271 }
1272 
1273 /* Computes Y += alpha X */
1274 static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y, PetscScalar alpha, Mat X, MatStructure pattern)
1275 {
1276   Mat_SeqAIJ              *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data;
1277   Mat_SeqAIJKokkos        *xkok, *ykok, *zkok;
1278   ConstMatScalarKokkosView Xa;
1279   MatScalarKokkosView      Ya;
1280   auto                     exec = PetscGetKokkosExecutionSpace();
1281 
1282   PetscFunctionBegin;
1283   PetscCheckTypeName(Y, MATSEQAIJKOKKOS);
1284   PetscCheckTypeName(X, MATSEQAIJKOKKOS);
1285   PetscCall(MatSeqAIJKokkosSyncDevice(Y));
1286   PetscCall(MatSeqAIJKokkosSyncDevice(X));
1287   PetscCall(PetscLogGpuTimeBegin());
1288 
1289   if (pattern != SAME_NONZERO_PATTERN && x->nz == y->nz) {
1290     PetscBool e;
1291     PetscCall(PetscArraycmp(x->i, y->i, Y->rmap->n + 1, &e));
1292     if (e) {
1293       PetscCall(PetscArraycmp(x->j, y->j, y->nz, &e));
1294       if (e) pattern = SAME_NONZERO_PATTERN;
1295     }
1296   }
1297 
1298   /* cusparseDcsrgeam2() computes C = alpha A + beta B. If one knew sparsity pattern of C, one can skip
1299     cusparseScsrgeam2_bufferSizeExt() / cusparseXcsrgeam2Nnz(), and directly call cusparseScsrgeam2().
1300     If X is SUBSET_NONZERO_PATTERN of Y, we could take advantage of this cusparse feature. However,
1301     KokkosSparse::spadd(alpha,A,beta,B,C) has symbolic and numeric phases, MatAXPY does not.
1302   */
1303   ykok = static_cast<Mat_SeqAIJKokkos *>(Y->spptr);
1304   xkok = static_cast<Mat_SeqAIJKokkos *>(X->spptr);
1305   Xa   = xkok->a_dual.view_device();
1306   Ya   = ykok->a_dual.view_device();
1307 
1308   if (pattern == SAME_NONZERO_PATTERN) {
1309     KokkosBlas::axpy(exec, alpha, Xa, Ya);
1310     PetscCall(MatSeqAIJKokkosModifyDevice(Y));
1311   } else if (pattern == SUBSET_NONZERO_PATTERN) {
1312     MatRowMapKokkosView Xi = xkok->i_dual.view_device(), Yi = ykok->i_dual.view_device();
1313     MatColIdxKokkosView Xj = xkok->j_dual.view_device(), Yj = ykok->j_dual.view_device();
1314 
1315     Kokkos::parallel_for(
1316       Kokkos::TeamPolicy<>(exec, Y->rmap->n, 1), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
1317         PetscInt i = t.league_rank(); // row i
1318         Kokkos::single(Kokkos::PerTeam(t), [=]() {
1319           // Only one thread works in a team
1320           PetscInt p, q = Yi(i);
1321           for (p = Xi(i); p < Xi(i + 1); p++) {          // For each nonzero on row i of X,
1322             while (Xj(p) != Yj(q) && q < Yi(i + 1)) q++; // find the matching nonzero on row i of Y.
1323             if (Xj(p) == Yj(q)) {                        // Found it
1324               Ya(q) += alpha * Xa(p);
1325               q++;
1326             } else {
1327             // If not found, it indicates the input is wrong (X is not a SUBSET_NONZERO_PATTERN of Y).
1328             // Just insert a NaN at the beginning of row i if it is not empty, to make the result wrong.
1329 #if PETSC_PKG_KOKKOS_VERSION_GE(3, 7, 0)
1330               if (Yi(i) != Yi(i + 1)) Ya(Yi(i)) = Kokkos::ArithTraits<PetscScalar>::nan();
1331 #else
1332               if (Yi(i) != Yi(i + 1)) Ya(Yi(i)) = Kokkos::Experimental::nan("1");
1333 #endif
1334             }
1335           }
1336         });
1337       });
1338     PetscCall(MatSeqAIJKokkosModifyDevice(Y));
1339   } else { // different nonzero patterns
1340     Mat             Z;
1341     KokkosCsrMatrix zcsr;
1342     KernelHandle    kh;
1343     kh.create_spadd_handle(true); // X, Y are sorted
1344     KokkosSparse::spadd_symbolic(&kh, xkok->csrmat, ykok->csrmat, zcsr);
1345     KokkosSparse::spadd_numeric(&kh, alpha, xkok->csrmat, (PetscScalar)1.0, ykok->csrmat, zcsr);
1346     zkok = new Mat_SeqAIJKokkos(zcsr);
1347     PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, zkok, &Z));
1348     PetscCall(MatHeaderReplace(Y, &Z));
1349     kh.destroy_spadd_handle();
1350   }
1351   PetscCall(PetscLogGpuTimeEnd());
1352   PetscCall(PetscLogGpuFlops(xkok->a_dual.extent(0) * 2)); // Because we scaled X and then added it to Y
1353   PetscFunctionReturn(PETSC_SUCCESS);
1354 }
1355 
1356 struct MatCOOStruct_SeqAIJKokkos {
1357   PetscCount           n;
1358   PetscCount           Atot;
1359   PetscInt             nz;
1360   PetscCountKokkosView jmap;
1361   PetscCountKokkosView perm;
1362 
1363   MatCOOStruct_SeqAIJKokkos(const MatCOOStruct_SeqAIJ *coo_h)
1364   {
1365     nz   = coo_h->nz;
1366     n    = coo_h->n;
1367     Atot = coo_h->Atot;
1368     jmap = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), PetscCountKokkosViewHost(coo_h->jmap, nz + 1));
1369     perm = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), PetscCountKokkosViewHost(coo_h->perm, Atot));
1370   }
1371 };
1372 
1373 static PetscErrorCode MatCOOStructDestroy_SeqAIJKokkos(void **data)
1374 {
1375   PetscFunctionBegin;
1376   PetscCallCXX(delete static_cast<MatCOOStruct_SeqAIJKokkos *>(*data));
1377   PetscFunctionReturn(PETSC_SUCCESS);
1378 }
1379 
1380 static PetscErrorCode MatSetPreallocationCOO_SeqAIJKokkos(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
1381 {
1382   Mat_SeqAIJKokkos          *akok;
1383   Mat_SeqAIJ                *aseq;
1384   PetscContainer             container_h;
1385   MatCOOStruct_SeqAIJ       *coo_h;
1386   MatCOOStruct_SeqAIJKokkos *coo_d;
1387 
1388   PetscFunctionBegin;
1389   PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, coo_i, coo_j));
1390   aseq = static_cast<Mat_SeqAIJ *>(mat->data);
1391   akok = static_cast<Mat_SeqAIJKokkos *>(mat->spptr);
1392   delete akok;
1393   mat->spptr = akok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, aseq, mat->nonzerostate + 1, PETSC_FALSE);
1394   PetscCall(MatZeroEntries_SeqAIJKokkos(mat));
1395 
1396   // Copy the COO struct to device
1397   PetscCall(PetscObjectQuery((PetscObject)mat, "__PETSc_MatCOOStruct_Host", (PetscObject *)&container_h));
1398   PetscCall(PetscContainerGetPointer(container_h, (void **)&coo_h));
1399   PetscCallCXX(coo_d = new MatCOOStruct_SeqAIJKokkos(coo_h));
1400 
1401   // Put the COO struct in a container and then attach that to the matrix
1402   PetscCall(PetscObjectContainerCompose((PetscObject)mat, "__PETSc_MatCOOStruct_Device", coo_d, MatCOOStructDestroy_SeqAIJKokkos));
1403   PetscFunctionReturn(PETSC_SUCCESS);
1404 }
1405 
1406 static PetscErrorCode MatSetValuesCOO_SeqAIJKokkos(Mat A, const PetscScalar v[], InsertMode imode)
1407 {
1408   MatScalarKokkosView        Aa;
1409   ConstMatScalarKokkosView   kv;
1410   PetscMemType               memtype;
1411   PetscContainer             container;
1412   MatCOOStruct_SeqAIJKokkos *coo;
1413 
1414   PetscFunctionBegin;
1415   PetscCall(PetscObjectQuery((PetscObject)A, "__PETSc_MatCOOStruct_Device", (PetscObject *)&container));
1416   PetscCall(PetscContainerGetPointer(container, (void **)&coo));
1417 
1418   const auto &n    = coo->n;
1419   const auto &Annz = coo->nz;
1420   const auto &jmap = coo->jmap;
1421   const auto &perm = coo->perm;
1422 
1423   PetscCall(PetscGetMemType(v, &memtype));
1424   if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */
1425     kv = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), ConstMatScalarKokkosViewHost(v, n));
1426   } else {
1427     kv = ConstMatScalarKokkosView(v, n); /* Directly use v[]'s memory */
1428   }
1429 
1430   if (imode == INSERT_VALUES) PetscCall(MatSeqAIJGetKokkosViewWrite(A, &Aa)); /* write matrix values */
1431   else PetscCall(MatSeqAIJGetKokkosView(A, &Aa));                             /* read & write matrix values */
1432 
1433   PetscCall(PetscLogGpuTimeBegin());
1434   Kokkos::parallel_for(
1435     Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, Annz), KOKKOS_LAMBDA(const PetscCount i) {
1436       PetscScalar sum = 0.0;
1437       for (PetscCount k = jmap(i); k < jmap(i + 1); k++) sum += kv(perm(k));
1438       Aa(i) = (imode == INSERT_VALUES ? 0.0 : Aa(i)) + sum;
1439     });
1440   PetscCall(PetscLogGpuTimeEnd());
1441 
1442   if (imode == INSERT_VALUES) PetscCall(MatSeqAIJRestoreKokkosViewWrite(A, &Aa));
1443   else PetscCall(MatSeqAIJRestoreKokkosView(A, &Aa));
1444   PetscFunctionReturn(PETSC_SUCCESS);
1445 }
1446 
1447 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A)
1448 {
1449   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
1450 
1451   PetscFunctionBegin;
1452   A->offloadmask = PETSC_OFFLOAD_KOKKOS; /* We do not really use this flag */
1453   A->boundtocpu  = PETSC_FALSE;
1454 
1455   A->ops->assemblyend               = MatAssemblyEnd_SeqAIJKokkos;
1456   A->ops->destroy                   = MatDestroy_SeqAIJKokkos;
1457   A->ops->duplicate                 = MatDuplicate_SeqAIJKokkos;
1458   A->ops->axpy                      = MatAXPY_SeqAIJKokkos;
1459   A->ops->scale                     = MatScale_SeqAIJKokkos;
1460   A->ops->zeroentries               = MatZeroEntries_SeqAIJKokkos;
1461   A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJKokkos;
1462   A->ops->mult                      = MatMult_SeqAIJKokkos;
1463   A->ops->multadd                   = MatMultAdd_SeqAIJKokkos;
1464   A->ops->multtranspose             = MatMultTranspose_SeqAIJKokkos;
1465   A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJKokkos;
1466   A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJKokkos;
1467   A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos;
1468   A->ops->productnumeric            = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos;
1469   A->ops->transpose                 = MatTranspose_SeqAIJKokkos;
1470   A->ops->setoption                 = MatSetOption_SeqAIJKokkos;
1471   A->ops->getdiagonal               = MatGetDiagonal_SeqAIJKokkos;
1472   A->ops->shift                     = MatShift_SeqAIJKokkos;
1473   A->ops->diagonalset               = MatDiagonalSet_SeqAIJKokkos;
1474   A->ops->diagonalscale             = MatDiagonalScale_SeqAIJKokkos;
1475   a->ops->getarray                  = MatSeqAIJGetArray_SeqAIJKokkos;
1476   a->ops->restorearray              = MatSeqAIJRestoreArray_SeqAIJKokkos;
1477   a->ops->getarrayread              = MatSeqAIJGetArrayRead_SeqAIJKokkos;
1478   a->ops->restorearrayread          = MatSeqAIJRestoreArrayRead_SeqAIJKokkos;
1479   a->ops->getarraywrite             = MatSeqAIJGetArrayWrite_SeqAIJKokkos;
1480   a->ops->restorearraywrite         = MatSeqAIJRestoreArrayWrite_SeqAIJKokkos;
1481   a->ops->getcsrandmemtype          = MatSeqAIJGetCSRAndMemType_SeqAIJKokkos;
1482 
1483   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJKokkos));
1484   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJKokkos));
1485 #if defined(PETSC_HAVE_HYPRE)
1486   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijkokkos_hypre_C", MatConvert_AIJ_HYPRE));
1487 #endif
1488   PetscFunctionReturn(PETSC_SUCCESS);
1489 }
1490 
1491 /*
1492    Extract the (prescribled) diagonal blocks of the matrix and then invert them
1493 
1494   Input Parameters:
1495 +  A       - the MATSEQAIJKOKKOS matrix
1496 .  bs      - block sizes in 'csr' format, i.e., the i-th block has size bs(i+1) - bs(i)
1497 .  bs2     - square of block sizes in 'csr' format, i.e., the i-th block should be stored at offset bs2(i) in diagVal[]
1498 .  blkMap  - map row ids to block ids, i.e., row i belongs to the block blkMap(i)
1499 -  work    - a pre-allocated work buffer (as big as diagVal) for use by this routine
1500 
1501   Output Parameter:
1502 .  diagVal - the (pre-allocated) buffer to store the inverted blocks (each block is stored in column-major order)
1503 */
1504 PETSC_INTERN PetscErrorCode MatInvertVariableBlockDiagonal_SeqAIJKokkos(Mat A, const PetscIntKokkosView &bs, const PetscIntKokkosView &bs2, const PetscIntKokkosView &blkMap, PetscScalarKokkosView &work, PetscScalarKokkosView &diagVal)
1505 {
1506   Mat_SeqAIJKokkos *akok    = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1507   PetscInt          nblocks = bs.extent(0) - 1;
1508 
1509   PetscFunctionBegin;
1510   PetscCall(MatSeqAIJKokkosSyncDevice(A)); // Since we'll access A's value on device
1511 
1512   // Pull out the diagonal blocks of the matrix and then invert the blocks
1513   auto Aa    = akok->a_dual.view_device();
1514   auto Ai    = akok->i_dual.view_device();
1515   auto Aj    = akok->j_dual.view_device();
1516   auto Adiag = akok->diag_dual.view_device();
1517   // TODO: how to tune the team size?
1518 #if defined(KOKKOS_ENABLE_UNIFIED_MEMORY)
1519   auto ts = Kokkos::AUTO();
1520 #else
1521   auto ts = 16; // improved performance 30% over Kokkos::AUTO() with CUDA, but failed with "Kokkos::abort: Requested Team Size is too large!" on CPUs
1522 #endif
1523   PetscCallCXX(Kokkos::parallel_for(
1524     Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), nblocks, ts), KOKKOS_LAMBDA(const KokkosTeamMemberType &teamMember) {
1525       const PetscInt bid    = teamMember.league_rank();                                                   // block id
1526       const PetscInt rstart = bs(bid);                                                                    // this block starts from this row
1527       const PetscInt m      = bs(bid + 1) - bs(bid);                                                      // size of this block
1528       const auto    &B      = Kokkos::View<PetscScalar **, Kokkos::LayoutLeft>(&diagVal(bs2(bid)), m, m); // column-major order
1529       const auto    &W      = PetscScalarKokkosView(&work(bs2(bid)), m * m);
1530 
1531       Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, m), [=](const PetscInt &r) { // r-th row in B
1532         PetscInt i = rstart + r;                                                            // i-th row in A
1533 
1534         if (Ai(i) <= Adiag(i) && Adiag(i) < Ai(i + 1)) { // if the diagonal exists (common case)
1535           PetscInt first = Adiag(i) - r;                 // we start to check nonzeros from here along this row
1536 
1537           for (PetscInt c = 0; c < m; c++) {                   // walk n steps to see what column indices we will meet
1538             if (first + c < Ai(i) || first + c >= Ai(i + 1)) { // this entry (first+c) is out of range of this row, in other words, its value is zero
1539               B(r, c) = 0.0;
1540             } else if (Aj(first + c) == rstart + c) { // this entry is right on the (rstart+c) column
1541               B(r, c) = Aa(first + c);
1542             } else { // this entry does not show up in the CSR
1543               B(r, c) = 0.0;
1544             }
1545           }
1546         } else { // rare case that the diagonal does not exist
1547           const PetscInt begin = Ai(i);
1548           const PetscInt end   = Ai(i + 1);
1549           for (PetscInt c = 0; c < m; c++) B(r, c) = 0.0;
1550           for (PetscInt j = begin; j < end; j++) { // scan the whole row; could use binary search but this is a rare case so we did not.
1551             if (rstart <= Aj(j) && Aj(j) < rstart + m) B(r, Aj(j) - rstart) = Aa(j);
1552             else if (Aj(j) >= rstart + m) break;
1553           }
1554         }
1555       });
1556 
1557       // LU-decompose B (w/o pivoting) and then invert B
1558       KokkosBatched::TeamLU<KokkosTeamMemberType, KokkosBatched::Algo::LU::Unblocked>::invoke(teamMember, B, 0.0);
1559       KokkosBatched::TeamInverseLU<KokkosTeamMemberType, KokkosBatched::Algo::InverseLU::Unblocked>::invoke(teamMember, B, W);
1560     }));
1561   // PetscLogGpuFlops() is done in the caller PCSetUp_VPBJacobi_Kokkos as we don't want to compute the flops in kernels
1562   PetscFunctionReturn(PETSC_SUCCESS);
1563 }
1564 
1565 PETSC_INTERN PetscErrorCode MatSetSeqAIJKokkosWithCSRMatrix(Mat A, Mat_SeqAIJKokkos *akok)
1566 {
1567   Mat_SeqAIJ *aseq;
1568   PetscInt    i, m, n;
1569   auto        exec = PetscGetKokkosExecutionSpace();
1570 
1571   PetscFunctionBegin;
1572   PetscCheck(!A->spptr, PETSC_COMM_SELF, PETSC_ERR_PLIB, "A->spptr is supposed to be empty");
1573 
1574   m = akok->nrows();
1575   n = akok->ncols();
1576   PetscCall(MatSetSizes(A, m, n, m, n));
1577   PetscCall(MatSetType(A, MATSEQAIJKOKKOS));
1578 
1579   /* Set up data structures of A as a MATSEQAIJ */
1580   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(A, MAT_SKIP_ALLOCATION, NULL));
1581   aseq = (Mat_SeqAIJ *)A->data;
1582 
1583   PetscCallCXX(akok->i_dual.sync_host(exec)); /* We always need sync'ed i, j on host */
1584   PetscCallCXX(akok->j_dual.sync_host(exec));
1585   PetscCallCXX(exec.fence());
1586 
1587   aseq->i       = akok->i_host_data();
1588   aseq->j       = akok->j_host_data();
1589   aseq->a       = akok->a_host_data();
1590   aseq->nonew   = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/
1591   aseq->free_a  = PETSC_FALSE;
1592   aseq->free_ij = PETSC_FALSE;
1593   aseq->nz      = akok->nnz();
1594   aseq->maxnz   = aseq->nz;
1595 
1596   PetscCall(PetscMalloc1(m, &aseq->imax));
1597   PetscCall(PetscMalloc1(m, &aseq->ilen));
1598   for (i = 0; i < m; i++) aseq->ilen[i] = aseq->imax[i] = aseq->i[i + 1] - aseq->i[i];
1599 
1600   /* It is critical to set the nonzerostate, as we use it to check if sparsity pattern (hence data) has changed on host in MatAssemblyEnd */
1601   akok->nonzerostate = A->nonzerostate;
1602   A->spptr           = akok; /* Set A->spptr before MatAssembly so that A->spptr won't be allocated again there */
1603   PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY));
1604   PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY));
1605   PetscFunctionReturn(PETSC_SUCCESS);
1606 }
1607 
1608 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGetKokkosCsrMatrix(Mat A, KokkosCsrMatrix *csr)
1609 {
1610   PetscFunctionBegin;
1611   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1612   *csr = static_cast<Mat_SeqAIJKokkos *>(A->spptr)->csrmat;
1613   PetscFunctionReturn(PETSC_SUCCESS);
1614 }
1615 
1616 PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithKokkosCsrMatrix(MPI_Comm comm, KokkosCsrMatrix csr, Mat *A)
1617 {
1618   Mat_SeqAIJKokkos *akok;
1619 
1620   PetscFunctionBegin;
1621   PetscCallCXX(akok = new Mat_SeqAIJKokkos(csr));
1622   PetscCall(MatCreate(comm, A));
1623   PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok));
1624   PetscFunctionReturn(PETSC_SUCCESS);
1625 }
1626 
1627 /* Crete a SEQAIJKOKKOS matrix with a Mat_SeqAIJKokkos data structure
1628 
1629    Note we have names like MatSeqAIJSetPreallocationCSR, so I use capitalized CSR
1630  */
1631 PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithCSRMatrix(MPI_Comm comm, Mat_SeqAIJKokkos *akok, Mat *A)
1632 {
1633   PetscFunctionBegin;
1634   PetscCall(MatCreate(comm, A));
1635   PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok));
1636   PetscFunctionReturn(PETSC_SUCCESS);
1637 }
1638 
1639 /*@C
1640   MatCreateSeqAIJKokkos - Creates a sparse matrix in `MATSEQAIJKOKKOS` (compressed row) format
1641   (the default parallel PETSc format). This matrix will ultimately be handled by
1642   Kokkos for calculations.
1643 
1644   Collective
1645 
1646   Input Parameters:
1647 + comm - MPI communicator, set to `PETSC_COMM_SELF`
1648 . m    - number of rows
1649 . n    - number of columns
1650 . nz   - number of nonzeros per row (same for all rows), ignored if `nnz` is provided
1651 - nnz  - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL`
1652 
1653   Output Parameter:
1654 . A - the matrix
1655 
1656   Level: intermediate
1657 
1658   Notes:
1659   It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
1660   MatXXXXSetPreallocation() paradgm instead of this routine directly.
1661   [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]
1662 
1663   The AIJ format, also called
1664   compressed row storage, is fully compatible with standard Fortran
1665   storage.  That is, the stored row and column indices can begin at
1666   either one (as in Fortran) or zero.
1667 
1668   Specify the preallocated storage with either `nz` or `nnz` (not both).
1669   Set `nz` = `PETSC_DEFAULT` and `nnz` = `NULL` for PETSc to control dynamic memory
1670   allocation.
1671 
1672 .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`
1673 @*/
1674 PetscErrorCode MatCreateSeqAIJKokkos(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A)
1675 {
1676   PetscFunctionBegin;
1677   PetscCall(PetscKokkosInitializeCheck());
1678   PetscCall(MatCreate(comm, A));
1679   PetscCall(MatSetSizes(*A, m, n, m, n));
1680   PetscCall(MatSetType(*A, MATSEQAIJKOKKOS));
1681   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz));
1682   PetscFunctionReturn(PETSC_SUCCESS);
1683 }
1684 
1685 // After matrix numeric factorization, there are still steps to do before triangular solve can be called.
1686 // For example, for transpose solve, we might need to compute the transpose matrices if the solver does not support it (such as KK, while cusparse does).
1687 // In cusparse, one has to call cusparseSpSV_analysis() with updated triangular matrix values before calling cusparseSpSV_solve().
1688 // Simiarily, in KK sptrsv_symbolic() has to be called before sptrsv_solve(). We put these steps in MatSeqAIJKokkos{Transpose}SolveCheck.
1689 static PetscErrorCode MatSeqAIJKokkosSolveCheck(Mat A)
1690 {
1691   Mat_SeqAIJKokkosTriFactors *factors   = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1692   const PetscBool             has_lower = factors->iL_d.extent(0) ? PETSC_TRUE : PETSC_FALSE; // false with Choleksy
1693   const PetscBool             has_upper = factors->iU_d.extent(0) ? PETSC_TRUE : PETSC_FALSE; // true with LU and Choleksy
1694 
1695   PetscFunctionBegin;
1696   if (!factors->sptrsv_symbolic_completed) { // If sptrsv_symbolic was not called yet
1697     if (has_upper) PetscCallCXX(sptrsv_symbolic(&factors->khU, factors->iU_d, factors->jU_d, factors->aU_d));
1698     if (has_lower) PetscCallCXX(sptrsv_symbolic(&factors->khL, factors->iL_d, factors->jL_d, factors->aL_d));
1699     factors->sptrsv_symbolic_completed = PETSC_TRUE;
1700   }
1701   PetscFunctionReturn(PETSC_SUCCESS);
1702 }
1703 
1704 static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A)
1705 {
1706   const PetscInt              n         = A->rmap->n;
1707   Mat_SeqAIJKokkosTriFactors *factors   = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1708   const PetscBool             has_lower = factors->iL_d.extent(0) ? PETSC_TRUE : PETSC_FALSE; // false with Choleksy
1709   const PetscBool             has_upper = factors->iU_d.extent(0) ? PETSC_TRUE : PETSC_FALSE; // true with LU or Choleksy
1710 
1711   PetscFunctionBegin;
1712   if (!factors->transpose_updated) {
1713     if (has_upper) {
1714       if (!factors->iUt_d.extent(0)) {                                 // Allocate Ut on device if not yet
1715         factors->iUt_d = MatRowMapKokkosView("factors->iUt_d", n + 1); // KK requires this view to be initialized to 0 to call transpose_matrix
1716         factors->jUt_d = MatColIdxKokkosView(NoInit("factors->jUt_d"), factors->jU_d.extent(0));
1717         factors->aUt_d = MatScalarKokkosView(NoInit("factors->aUt_d"), factors->aU_d.extent(0));
1718       }
1719 
1720       if (factors->iU_h.extent(0)) { // If U is on host (factorization was done on host), we also compute the transpose on host
1721         if (!factors->U) {
1722           Mat_SeqAIJ *seq;
1723 
1724           PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF, n, n, factors->iU_h.data(), factors->jU_h.data(), factors->aU_h.data(), &factors->U));
1725           PetscCall(MatTranspose(factors->U, MAT_INITIAL_MATRIX, &factors->Ut));
1726 
1727           seq            = static_cast<Mat_SeqAIJ *>(factors->Ut->data);
1728           factors->iUt_h = MatRowMapKokkosViewHost(seq->i, n + 1);
1729           factors->jUt_h = MatColIdxKokkosViewHost(seq->j, seq->nz);
1730           factors->aUt_h = MatScalarKokkosViewHost(seq->a, seq->nz);
1731         } else {
1732           PetscCall(MatTranspose(factors->U, MAT_REUSE_MATRIX, &factors->Ut)); // Matrix Ut' data is aliased with {i, j, a}Ut_h
1733         }
1734         // Copy Ut from host to device
1735         PetscCallCXX(Kokkos::deep_copy(factors->iUt_d, factors->iUt_h));
1736         PetscCallCXX(Kokkos::deep_copy(factors->jUt_d, factors->jUt_h));
1737         PetscCallCXX(Kokkos::deep_copy(factors->aUt_d, factors->aUt_h));
1738       } else { // If U was computed on device, we also compute the transpose there
1739         // TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices. We have to sort the indices, until KK provides finer control options.
1740         PetscCallCXX(transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iU_d,
1741                                                                                                                                                                                                                                factors->jU_d, factors->aU_d,
1742                                                                                                                                                                                                                                factors->iUt_d, factors->jUt_d,
1743                                                                                                                                                                                                                                factors->aUt_d));
1744         PetscCallCXX(sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iUt_d, factors->jUt_d, factors->aUt_d));
1745       }
1746       PetscCallCXX(sptrsv_symbolic(&factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d));
1747     }
1748 
1749     // do the same for L with LU
1750     if (has_lower) {
1751       if (!factors->iLt_d.extent(0)) {                                 // Allocate Lt on device if not yet
1752         factors->iLt_d = MatRowMapKokkosView("factors->iLt_d", n + 1); // KK requires this view to be initialized to 0 to call transpose_matrix
1753         factors->jLt_d = MatColIdxKokkosView(NoInit("factors->jLt_d"), factors->jL_d.extent(0));
1754         factors->aLt_d = MatScalarKokkosView(NoInit("factors->aLt_d"), factors->aL_d.extent(0));
1755       }
1756 
1757       if (factors->iL_h.extent(0)) { // If L is on host, we also compute the transpose on host
1758         if (!factors->L) {
1759           Mat_SeqAIJ *seq;
1760 
1761           PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF, n, n, factors->iL_h.data(), factors->jL_h.data(), factors->aL_h.data(), &factors->L));
1762           PetscCall(MatTranspose(factors->L, MAT_INITIAL_MATRIX, &factors->Lt));
1763 
1764           seq            = static_cast<Mat_SeqAIJ *>(factors->Lt->data);
1765           factors->iLt_h = MatRowMapKokkosViewHost(seq->i, n + 1);
1766           factors->jLt_h = MatColIdxKokkosViewHost(seq->j, seq->nz);
1767           factors->aLt_h = MatScalarKokkosViewHost(seq->a, seq->nz);
1768         } else {
1769           PetscCall(MatTranspose(factors->L, MAT_REUSE_MATRIX, &factors->Lt)); // Matrix Lt' data is aliased with {i, j, a}Lt_h
1770         }
1771         // Copy Lt from host to device
1772         PetscCallCXX(Kokkos::deep_copy(factors->iLt_d, factors->iLt_h));
1773         PetscCallCXX(Kokkos::deep_copy(factors->jLt_d, factors->jLt_h));
1774         PetscCallCXX(Kokkos::deep_copy(factors->aLt_d, factors->aLt_h));
1775       } else { // If L was computed on device, we also compute the transpose there
1776         // TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices. We have to sort the indices, until KK provides finer control options.
1777         PetscCallCXX(transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iL_d,
1778                                                                                                                                                                                                                                factors->jL_d, factors->aL_d,
1779                                                                                                                                                                                                                                factors->iLt_d, factors->jLt_d,
1780                                                                                                                                                                                                                                factors->aLt_d));
1781         PetscCallCXX(sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iLt_d, factors->jLt_d, factors->aLt_d));
1782       }
1783       PetscCallCXX(sptrsv_symbolic(&factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d));
1784     }
1785 
1786     factors->transpose_updated = PETSC_TRUE;
1787   }
1788   PetscFunctionReturn(PETSC_SUCCESS);
1789 }
1790 
1791 // Solve Ax = b, with RAR = U^T D U, where R is the row (and col) permutation matrix on A.
1792 // R is represented by rowperm in factors. If R is identity (i.e, no reordering), then rowperm is empty.
1793 static PetscErrorCode MatSolve_SeqAIJKokkos_Cholesky(Mat A, Vec bb, Vec xx)
1794 {
1795   auto                        exec    = PetscGetKokkosExecutionSpace();
1796   Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1797   PetscInt                    m       = A->rmap->n;
1798   PetscScalarKokkosView       D       = factors->D_d;
1799   PetscScalarKokkosView       X, Y, B; // alias
1800   ConstPetscScalarKokkosView  b;
1801   PetscScalarKokkosView       x;
1802   PetscIntKokkosView         &rowperm  = factors->rowperm;
1803   PetscBool                   identity = rowperm.extent(0) ? PETSC_FALSE : PETSC_TRUE;
1804 
1805   PetscFunctionBegin;
1806   PetscCall(PetscLogGpuTimeBegin());
1807   PetscCall(MatSeqAIJKokkosSolveCheck(A));          // for UX = T
1808   PetscCall(MatSeqAIJKokkosTransposeSolveCheck(A)); // for U^T Y = B
1809   PetscCall(VecGetKokkosView(bb, &b));
1810   PetscCall(VecGetKokkosViewWrite(xx, &x));
1811 
1812   // Solve U^T Y = B
1813   if (identity) { // Reorder b with the row permutation
1814     B = PetscScalarKokkosView(const_cast<PetscScalar *>(b.data()), b.extent(0));
1815     Y = factors->workVector;
1816   } else {
1817     B = factors->workVector;
1818     PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { B(i) = b(rowperm(i)); }));
1819     Y = x;
1820   }
1821   PetscCallCXX(sptrsv_solve(exec, &factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d, B, Y));
1822 
1823   // Solve diag(D) Y' = Y.
1824   // Actually just do Y' = Y*D since D is already inverted in MatCholeskyFactorNumeric_SeqAIJ(). It is basically a vector element-wise multiplication.
1825   PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { Y(i) = Y(i) * D(i); }));
1826 
1827   // Solve UX = Y
1828   if (identity) {
1829     X = x;
1830   } else {
1831     X = factors->workVector; // B is not needed anymore
1832   }
1833   PetscCallCXX(sptrsv_solve(exec, &factors->khU, factors->iU_d, factors->jU_d, factors->aU_d, Y, X));
1834 
1835   // Reorder X with the inverse column (row) permutation
1836   if (!identity) {
1837     PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { x(rowperm(i)) = X(i); }));
1838   }
1839 
1840   PetscCall(VecRestoreKokkosView(bb, &b));
1841   PetscCall(VecRestoreKokkosViewWrite(xx, &x));
1842   PetscCall(PetscLogGpuTimeEnd());
1843   PetscFunctionReturn(PETSC_SUCCESS);
1844 }
1845 
1846 // Solve Ax = b, with RAC = LU, where R and C are row and col permutation matrices on A respectively.
1847 // R and C are represented by rowperm and colperm in factors.
1848 // If R or C is identity (i.e, no reordering), then rowperm or colperm is empty.
1849 static PetscErrorCode MatSolve_SeqAIJKokkos_LU(Mat A, Vec bb, Vec xx)
1850 {
1851   auto                        exec    = PetscGetKokkosExecutionSpace();
1852   Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1853   PetscInt                    m       = A->rmap->n;
1854   PetscScalarKokkosView       X, Y, B; // alias
1855   ConstPetscScalarKokkosView  b;
1856   PetscScalarKokkosView       x;
1857   PetscIntKokkosView         &rowperm      = factors->rowperm;
1858   PetscIntKokkosView         &colperm      = factors->colperm;
1859   PetscBool                   row_identity = rowperm.extent(0) ? PETSC_FALSE : PETSC_TRUE;
1860   PetscBool                   col_identity = colperm.extent(0) ? PETSC_FALSE : PETSC_TRUE;
1861 
1862   PetscFunctionBegin;
1863   PetscCall(PetscLogGpuTimeBegin());
1864   PetscCall(MatSeqAIJKokkosSolveCheck(A));
1865   PetscCall(VecGetKokkosView(bb, &b));
1866   PetscCall(VecGetKokkosViewWrite(xx, &x));
1867 
1868   // Solve L Y = B (i.e., L (U C^- x) = R b).  R b indicates applying the row permutation on b.
1869   if (row_identity) {
1870     B = PetscScalarKokkosView(const_cast<PetscScalar *>(b.data()), b.extent(0));
1871     Y = factors->workVector;
1872   } else {
1873     B = factors->workVector;
1874     PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { B(i) = b(rowperm(i)); }));
1875     Y = x;
1876   }
1877   PetscCallCXX(sptrsv_solve(exec, &factors->khL, factors->iL_d, factors->jL_d, factors->aL_d, B, Y));
1878 
1879   // Solve U C^- x = Y
1880   if (col_identity) {
1881     X = x;
1882   } else {
1883     X = factors->workVector;
1884   }
1885   PetscCallCXX(sptrsv_solve(exec, &factors->khU, factors->iU_d, factors->jU_d, factors->aU_d, Y, X));
1886 
1887   // x = C X; Reorder X with the inverse col permutation
1888   if (!col_identity) {
1889     PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { x(colperm(i)) = X(i); }));
1890   }
1891 
1892   PetscCall(VecRestoreKokkosView(bb, &b));
1893   PetscCall(VecRestoreKokkosViewWrite(xx, &x));
1894   PetscCall(PetscLogGpuTimeEnd());
1895   PetscFunctionReturn(PETSC_SUCCESS);
1896 }
1897 
1898 // Solve A^T x = b, with RAC = LU, where R and C are row and col permutation matrices on A respectively.
1899 // R and C are represented by rowperm and colperm in factors.
1900 // If R or C is identity (i.e, no reordering), then rowperm or colperm is empty.
1901 // A = R^-1 L U C^-1, so A^T = C^-T U^T L^T R^-T. But since C^- = C^T, R^- = R^T, we have A^T = C U^T L^T R.
1902 static PetscErrorCode MatSolveTranspose_SeqAIJKokkos_LU(Mat A, Vec bb, Vec xx)
1903 {
1904   auto                        exec    = PetscGetKokkosExecutionSpace();
1905   Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1906   PetscInt                    m       = A->rmap->n;
1907   PetscScalarKokkosView       X, Y, B; // alias
1908   ConstPetscScalarKokkosView  b;
1909   PetscScalarKokkosView       x;
1910   PetscIntKokkosView         &rowperm      = factors->rowperm;
1911   PetscIntKokkosView         &colperm      = factors->colperm;
1912   PetscBool                   row_identity = rowperm.extent(0) ? PETSC_FALSE : PETSC_TRUE;
1913   PetscBool                   col_identity = colperm.extent(0) ? PETSC_FALSE : PETSC_TRUE;
1914 
1915   PetscFunctionBegin;
1916   PetscCall(PetscLogGpuTimeBegin());
1917   PetscCall(MatSeqAIJKokkosTransposeSolveCheck(A)); // Update L^T, U^T if needed, and do sptrsv symbolic for L^T, U^T
1918   PetscCall(VecGetKokkosView(bb, &b));
1919   PetscCall(VecGetKokkosViewWrite(xx, &x));
1920 
1921   // Solve U^T Y = B (i.e., U^T (L^T R x) = C^- b).  Note C^- b = C^T b, which means applying the column permutation on b.
1922   if (col_identity) { // Reorder b with the col permutation
1923     B = PetscScalarKokkosView(const_cast<PetscScalar *>(b.data()), b.extent(0));
1924     Y = factors->workVector;
1925   } else {
1926     B = factors->workVector;
1927     PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { B(i) = b(colperm(i)); }));
1928     Y = x;
1929   }
1930   PetscCallCXX(sptrsv_solve(exec, &factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d, B, Y));
1931 
1932   // Solve L^T X = Y
1933   if (row_identity) {
1934     X = x;
1935   } else {
1936     X = factors->workVector;
1937   }
1938   PetscCallCXX(sptrsv_solve(exec, &factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d, Y, X));
1939 
1940   // x = R^- X = R^T X; Reorder X with the inverse row permutation
1941   if (!row_identity) {
1942     PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { x(rowperm(i)) = X(i); }));
1943   }
1944 
1945   PetscCall(VecRestoreKokkosView(bb, &b));
1946   PetscCall(VecRestoreKokkosViewWrite(xx, &x));
1947   PetscCall(PetscLogGpuTimeEnd());
1948   PetscFunctionReturn(PETSC_SUCCESS);
1949 }
1950 
1951 static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info)
1952 {
1953   PetscFunctionBegin;
1954   PetscCall(MatSeqAIJKokkosSyncHost(A));
1955   PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info));
1956 
1957   if (!info->solveonhost) { // if solve on host, then we don't need to copy L, U to device
1958     Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr;
1959     Mat_SeqAIJ                 *b       = static_cast<Mat_SeqAIJ *>(B->data);
1960     const PetscInt             *Bi = b->i, *Bj = b->j, *Bdiag = b->diag;
1961     const MatScalar            *Ba = b->a;
1962     PetscInt                    m = B->rmap->n, n = B->cmap->n;
1963 
1964     if (factors->iL_h.extent(0) == 0) { // Allocate memory and copy the L, U structure for the first time
1965       // Allocate memory and copy the structure
1966       factors->iL_h = MatRowMapKokkosViewHost(NoInit("iL_h"), m + 1);
1967       factors->jL_h = MatColIdxKokkosViewHost(NoInit("jL_h"), (Bi[m] - Bi[0]) + m); // + the diagonal entries
1968       factors->aL_h = MatScalarKokkosViewHost(NoInit("aL_h"), (Bi[m] - Bi[0]) + m);
1969       factors->iU_h = MatRowMapKokkosViewHost(NoInit("iU_h"), m + 1);
1970       factors->jU_h = MatColIdxKokkosViewHost(NoInit("jU_h"), (Bdiag[0] - Bdiag[m]));
1971       factors->aU_h = MatScalarKokkosViewHost(NoInit("aU_h"), (Bdiag[0] - Bdiag[m]));
1972 
1973       PetscInt *Li = factors->iL_h.data();
1974       PetscInt *Lj = factors->jL_h.data();
1975       PetscInt *Ui = factors->iU_h.data();
1976       PetscInt *Uj = factors->jU_h.data();
1977 
1978       Li[0] = Ui[0] = 0;
1979       for (PetscInt i = 0; i < m; i++) {
1980         PetscInt llen = Bi[i + 1] - Bi[i];       // exclusive of the diagonal entry
1981         PetscInt ulen = Bdiag[i] - Bdiag[i + 1]; // inclusive of the diagonal entry
1982 
1983         PetscArraycpy(Lj + Li[i], Bj + Bi[i], llen); // entries of L on the left of the diagonal
1984         Lj[Li[i] + llen] = i;                        // diagonal entry of L
1985 
1986         Uj[Ui[i]] = i;                                                  // diagonal entry of U
1987         PetscArraycpy(Uj + Ui[i] + 1, Bj + Bdiag[i + 1] + 1, ulen - 1); // entries of U on  the right of the diagonal
1988 
1989         Li[i + 1] = Li[i] + llen + 1;
1990         Ui[i + 1] = Ui[i] + ulen;
1991       }
1992 
1993       factors->iL_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->iL_h);
1994       factors->jL_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->jL_h);
1995       factors->iU_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->iU_h);
1996       factors->jU_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->jU_h);
1997       factors->aL_d = Kokkos::create_mirror_view(DefaultMemorySpace(), factors->aL_h);
1998       factors->aU_d = Kokkos::create_mirror_view(DefaultMemorySpace(), factors->aU_h);
1999 
2000       // Copy row/col permutation to device
2001       IS        rowperm = ((Mat_SeqAIJ *)B->data)->row;
2002       PetscBool row_identity;
2003       PetscCall(ISIdentity(rowperm, &row_identity));
2004       if (!row_identity) {
2005         const PetscInt *ip;
2006 
2007         PetscCall(ISGetIndices(rowperm, &ip));
2008         factors->rowperm = PetscIntKokkosView(NoInit("rowperm"), m);
2009         PetscCallCXX(Kokkos::deep_copy(factors->rowperm, PetscIntKokkosViewHost(const_cast<PetscInt *>(ip), m)));
2010         PetscCall(ISRestoreIndices(rowperm, &ip));
2011         PetscCall(PetscLogCpuToGpu(m * sizeof(PetscInt)));
2012       }
2013 
2014       IS        colperm = ((Mat_SeqAIJ *)B->data)->col;
2015       PetscBool col_identity;
2016       PetscCall(ISIdentity(colperm, &col_identity));
2017       if (!col_identity) {
2018         const PetscInt *ip;
2019 
2020         PetscCall(ISGetIndices(colperm, &ip));
2021         factors->colperm = PetscIntKokkosView(NoInit("colperm"), n);
2022         PetscCallCXX(Kokkos::deep_copy(factors->colperm, PetscIntKokkosViewHost(const_cast<PetscInt *>(ip), n)));
2023         PetscCall(ISRestoreIndices(colperm, &ip));
2024         PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
2025       }
2026 
2027       /* Create sptrsv handles for L, U and their transpose */
2028 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
2029       auto sptrsv_alg = SPTRSVAlgorithm::SPTRSV_CUSPARSE;
2030 #else
2031       auto sptrsv_alg = SPTRSVAlgorithm::SEQLVLSCHD_TP1;
2032 #endif
2033       factors->khL.create_sptrsv_handle(sptrsv_alg, m, true /* L is lower tri */);
2034       factors->khU.create_sptrsv_handle(sptrsv_alg, m, false /* U is not lower tri */);
2035       factors->khLt.create_sptrsv_handle(sptrsv_alg, m, false /* L^T is not lower tri */);
2036       factors->khUt.create_sptrsv_handle(sptrsv_alg, m, true /* U^T is lower tri */);
2037     }
2038 
2039     // Copy the value
2040     for (PetscInt i = 0; i < m; i++) {
2041       PetscInt        llen = Bi[i + 1] - Bi[i];
2042       PetscInt        ulen = Bdiag[i] - Bdiag[i + 1];
2043       const PetscInt *Li   = factors->iL_h.data();
2044       const PetscInt *Ui   = factors->iU_h.data();
2045 
2046       PetscScalar *La = factors->aL_h.data();
2047       PetscScalar *Ua = factors->aU_h.data();
2048 
2049       PetscArraycpy(La + Li[i], Ba + Bi[i], llen); // entries of L
2050       La[Li[i] + llen] = 1.0;                      // diagonal entry
2051 
2052       Ua[Ui[i]] = 1.0 / Ba[Bdiag[i]];                                 // diagonal entry
2053       PetscArraycpy(Ua + Ui[i] + 1, Ba + Bdiag[i + 1] + 1, ulen - 1); // entries of U
2054     }
2055 
2056     PetscCallCXX(Kokkos::deep_copy(factors->aL_d, factors->aL_h));
2057     PetscCallCXX(Kokkos::deep_copy(factors->aU_d, factors->aU_h));
2058     // Once the factors' values have changed, we need to update their transpose and redo sptrsv symbolic
2059     factors->transpose_updated         = PETSC_FALSE;
2060     factors->sptrsv_symbolic_completed = PETSC_FALSE;
2061 
2062     B->ops->solve          = MatSolve_SeqAIJKokkos_LU;
2063     B->ops->solvetranspose = MatSolveTranspose_SeqAIJKokkos_LU;
2064   }
2065 
2066   B->ops->matsolve          = NULL;
2067   B->ops->matsolvetranspose = NULL;
2068   PetscFunctionReturn(PETSC_SUCCESS);
2069 }
2070 
2071 static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos_ILU0(Mat B, Mat A, const MatFactorInfo *info)
2072 {
2073   Mat_SeqAIJKokkos           *aijkok   = (Mat_SeqAIJKokkos *)A->spptr;
2074   Mat_SeqAIJKokkosTriFactors *factors  = (Mat_SeqAIJKokkosTriFactors *)B->spptr;
2075   PetscInt                    fill_lev = info->levels;
2076 
2077   PetscFunctionBegin;
2078   PetscCall(PetscLogGpuTimeBegin());
2079   PetscCheck(!info->factoronhost, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "MatFactorInfo.factoronhost should be false");
2080   PetscCall(MatSeqAIJKokkosSyncDevice(A));
2081 
2082   auto a_d = aijkok->a_dual.view_device();
2083   auto i_d = aijkok->i_dual.view_device();
2084   auto j_d = aijkok->j_dual.view_device();
2085 
2086   PetscCallCXX(spiluk_numeric(&factors->kh, fill_lev, i_d, j_d, a_d, factors->iL_d, factors->jL_d, factors->aL_d, factors->iU_d, factors->jU_d, factors->aU_d));
2087 
2088   B->assembled              = PETSC_TRUE;
2089   B->preallocated           = PETSC_TRUE;
2090   B->ops->solve             = MatSolve_SeqAIJKokkos_LU;
2091   B->ops->solvetranspose    = MatSolveTranspose_SeqAIJKokkos_LU;
2092   B->ops->matsolve          = NULL;
2093   B->ops->matsolvetranspose = NULL;
2094 
2095   /* Once the factors' value changed, we need to update their transpose and sptrsv handle */
2096   factors->transpose_updated         = PETSC_FALSE;
2097   factors->sptrsv_symbolic_completed = PETSC_FALSE;
2098   /* TODO: log flops, but how to know that? */
2099   PetscCall(PetscLogGpuTimeEnd());
2100   PetscFunctionReturn(PETSC_SUCCESS);
2101 }
2102 
2103 // Use KK's spiluk_symbolic() to do ILU0 symbolic factorization, with no row/col reordering
2104 static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos_ILU0(Mat B, Mat A, IS, IS, const MatFactorInfo *info)
2105 {
2106   Mat_SeqAIJKokkos           *aijkok;
2107   Mat_SeqAIJ                 *b;
2108   Mat_SeqAIJKokkosTriFactors *factors  = (Mat_SeqAIJKokkosTriFactors *)B->spptr;
2109   PetscInt                    fill_lev = info->levels;
2110   PetscInt                    nnzA     = ((Mat_SeqAIJ *)A->data)->nz, nnzL, nnzU;
2111   PetscInt                    n        = A->rmap->n;
2112 
2113   PetscFunctionBegin;
2114   PetscCheck(!info->factoronhost, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "MatFactorInfo's factoronhost should be false as we are doing it on device right now");
2115   PetscCall(MatSeqAIJKokkosSyncDevice(A));
2116 
2117   /* Create a spiluk handle and then do symbolic factorization */
2118   nnzL = nnzU = PetscRealIntMultTruncate(info->fill, nnzA);
2119   factors->kh.create_spiluk_handle(SPILUKAlgorithm::SEQLVLSCHD_TP1, n, nnzL, nnzU);
2120 
2121   auto spiluk_handle = factors->kh.get_spiluk_handle();
2122 
2123   Kokkos::realloc(factors->iL_d, n + 1); /* Free old arrays and realloc */
2124   Kokkos::realloc(factors->jL_d, spiluk_handle->get_nnzL());
2125   Kokkos::realloc(factors->iU_d, n + 1);
2126   Kokkos::realloc(factors->jU_d, spiluk_handle->get_nnzU());
2127 
2128   aijkok   = (Mat_SeqAIJKokkos *)A->spptr;
2129   auto i_d = aijkok->i_dual.view_device();
2130   auto j_d = aijkok->j_dual.view_device();
2131   PetscCallCXX(spiluk_symbolic(&factors->kh, fill_lev, i_d, j_d, factors->iL_d, factors->jL_d, factors->iU_d, factors->jU_d));
2132   /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */
2133 
2134   Kokkos::resize(factors->jL_d, spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */
2135   Kokkos::resize(factors->jU_d, spiluk_handle->get_nnzU());
2136   Kokkos::realloc(factors->aL_d, spiluk_handle->get_nnzL()); /* No need to retain old value */
2137   Kokkos::realloc(factors->aU_d, spiluk_handle->get_nnzU());
2138 
2139   /* TODO: add options to select sptrsv algorithms */
2140   /* Create sptrsv handles for L, U and their transpose */
2141 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
2142   auto sptrsv_alg = SPTRSVAlgorithm::SPTRSV_CUSPARSE;
2143 #else
2144   auto sptrsv_alg = SPTRSVAlgorithm::SEQLVLSCHD_TP1;
2145 #endif
2146 
2147   factors->khL.create_sptrsv_handle(sptrsv_alg, n, true /* L is lower tri */);
2148   factors->khU.create_sptrsv_handle(sptrsv_alg, n, false /* U is not lower tri */);
2149   factors->khLt.create_sptrsv_handle(sptrsv_alg, n, false /* L^T is not lower tri */);
2150   factors->khUt.create_sptrsv_handle(sptrsv_alg, n, true /* U^T is lower tri */);
2151 
2152   /* Fill fields of the factor matrix B */
2153   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(B, MAT_SKIP_ALLOCATION, NULL));
2154   b     = (Mat_SeqAIJ *)B->data;
2155   b->nz = b->maxnz          = spiluk_handle->get_nnzL() + spiluk_handle->get_nnzU();
2156   B->info.fill_ratio_given  = info->fill;
2157   B->info.fill_ratio_needed = nnzA > 0 ? ((PetscReal)b->nz) / ((PetscReal)nnzA) : 1.0;
2158 
2159   B->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJKokkos_ILU0;
2160   PetscFunctionReturn(PETSC_SUCCESS);
2161 }
2162 
2163 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
2164 {
2165   PetscFunctionBegin;
2166   PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
2167   PetscCheck(!B->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr");
2168   PetscCallCXX(B->spptr = new Mat_SeqAIJKokkosTriFactors(B->rmap->n));
2169   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos;
2170   PetscFunctionReturn(PETSC_SUCCESS);
2171 }
2172 
2173 static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
2174 {
2175   PetscBool row_identity = PETSC_FALSE, col_identity = PETSC_FALSE;
2176 
2177   PetscFunctionBegin;
2178   if (!info->factoronhost) {
2179     PetscCall(ISIdentity(isrow, &row_identity));
2180     PetscCall(ISIdentity(iscol, &col_identity));
2181   }
2182 
2183   PetscCheck(!B->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr");
2184   PetscCallCXX(B->spptr = new Mat_SeqAIJKokkosTriFactors(B->rmap->n));
2185 
2186   if (!info->factoronhost && !info->levels && row_identity && col_identity) { // if level 0 and no reordering
2187     PetscCall(MatILUFactorSymbolic_SeqAIJKokkos_ILU0(B, A, isrow, iscol, info));
2188   } else {
2189     PetscCall(MatILUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info)); // otherwise, use PETSc's ILU on host
2190     B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos;
2191   }
2192   PetscFunctionReturn(PETSC_SUCCESS);
2193 }
2194 
2195 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info)
2196 {
2197   PetscFunctionBegin;
2198   PetscCall(MatSeqAIJKokkosSyncHost(A));
2199   PetscCall(MatCholeskyFactorNumeric_SeqAIJ(B, A, info));
2200 
2201   if (!info->solveonhost) { // if solve on host, then we don't need to copy L, U to device
2202     Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr;
2203     Mat_SeqAIJ                 *b       = static_cast<Mat_SeqAIJ *>(B->data);
2204     const PetscInt             *Bi = b->i, *Bj = b->j, *Bdiag = b->diag;
2205     const MatScalar            *Ba = b->a;
2206     PetscInt                    m  = B->rmap->n;
2207 
2208     if (factors->iU_h.extent(0) == 0) { // First time of numeric factorization
2209       // Allocate memory and copy the structure
2210       factors->iU_h = PetscIntKokkosViewHost(const_cast<PetscInt *>(Bi), m + 1); // wrap Bi as iU_h
2211       factors->jU_h = MatColIdxKokkosViewHost(NoInit("jU_h"), Bi[m]);
2212       factors->aU_h = MatScalarKokkosViewHost(NoInit("aU_h"), Bi[m]);
2213       factors->D_h  = MatScalarKokkosViewHost(NoInit("D_h"), m);
2214       factors->aU_d = Kokkos::create_mirror_view(DefaultMemorySpace(), factors->aU_h);
2215       factors->D_d  = Kokkos::create_mirror_view(DefaultMemorySpace(), factors->D_h);
2216 
2217       // Build jU_h from the skewed Aj
2218       PetscInt *Uj = factors->jU_h.data();
2219       for (PetscInt i = 0; i < m; i++) {
2220         PetscInt ulen = Bi[i + 1] - Bi[i];
2221         Uj[Bi[i]]     = i;                                              // diagonal entry
2222         PetscCall(PetscArraycpy(Uj + Bi[i] + 1, Bj + Bi[i], ulen - 1)); // entries of U on the right of the diagonal
2223       }
2224 
2225       // Copy iU, jU to device
2226       PetscCallCXX(factors->iU_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->iU_h));
2227       PetscCallCXX(factors->jU_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->jU_h));
2228 
2229       // Copy row/col permutation to device
2230       IS        rowperm = ((Mat_SeqAIJ *)B->data)->row;
2231       PetscBool row_identity;
2232       PetscCall(ISIdentity(rowperm, &row_identity));
2233       if (!row_identity) {
2234         const PetscInt *ip;
2235 
2236         PetscCall(ISGetIndices(rowperm, &ip));
2237         PetscCallCXX(factors->rowperm = PetscIntKokkosView(NoInit("rowperm"), m));
2238         PetscCallCXX(Kokkos::deep_copy(factors->rowperm, PetscIntKokkosViewHost(const_cast<PetscInt *>(ip), m)));
2239         PetscCall(ISRestoreIndices(rowperm, &ip));
2240         PetscCall(PetscLogCpuToGpu(m * sizeof(PetscInt)));
2241       }
2242 
2243       // Create sptrsv handles for U and U^T
2244 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
2245       auto sptrsv_alg = SPTRSVAlgorithm::SPTRSV_CUSPARSE;
2246 #else
2247       auto sptrsv_alg = SPTRSVAlgorithm::SEQLVLSCHD_TP1;
2248 #endif
2249       factors->khU.create_sptrsv_handle(sptrsv_alg, m, false /* U is not lower tri */);
2250       factors->khUt.create_sptrsv_handle(sptrsv_alg, m, true /* U^T is lower tri */);
2251     }
2252     // These pointers were set MatCholeskyFactorNumeric_SeqAIJ(), so we always need to update them
2253     B->ops->solve          = MatSolve_SeqAIJKokkos_Cholesky;
2254     B->ops->solvetranspose = MatSolve_SeqAIJKokkos_Cholesky;
2255 
2256     // Copy the value
2257     PetscScalar *Ua = factors->aU_h.data();
2258     PetscScalar *D  = factors->D_h.data();
2259     for (PetscInt i = 0; i < m; i++) {
2260       D[i]      = Ba[Bdiag[i]];     // actually Aa[Adiag[i]] is the inverse of the diagonal
2261       Ua[Bi[i]] = (PetscScalar)1.0; // set the unit diagonal for U
2262       for (PetscInt k = 0; k < Bi[i + 1] - Bi[i] - 1; k++) Ua[Bi[i] + 1 + k] = -Ba[Bi[i] + k];
2263     }
2264     PetscCallCXX(Kokkos::deep_copy(factors->aU_d, factors->aU_h));
2265     PetscCallCXX(Kokkos::deep_copy(factors->D_d, factors->D_h));
2266 
2267     factors->sptrsv_symbolic_completed = PETSC_FALSE; // When numeric value changed, we must do these again
2268     factors->transpose_updated         = PETSC_FALSE;
2269   }
2270 
2271   B->ops->matsolve          = NULL;
2272   B->ops->matsolvetranspose = NULL;
2273   PetscFunctionReturn(PETSC_SUCCESS);
2274 }
2275 
2276 static PetscErrorCode MatICCFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS perm, const MatFactorInfo *info)
2277 {
2278   PetscFunctionBegin;
2279   if (info->solveonhost) {
2280     // If solve on host, we have to change the type, as eventually we need to call MatSolve_SeqSBAIJ_1_NaturalOrdering() etc.
2281     PetscCall(MatSetType(B, MATSEQSBAIJ));
2282     PetscCall(MatSeqSBAIJSetPreallocation(B, 1, MAT_SKIP_ALLOCATION, NULL));
2283   }
2284 
2285   PetscCall(MatICCFactorSymbolic_SeqAIJ(B, A, perm, info));
2286 
2287   if (!info->solveonhost) {
2288     // If solve on device, B is still a MATSEQAIJKOKKOS, so we are good to allocate B->spptr
2289     PetscCheck(!B->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr");
2290     PetscCallCXX(B->spptr = new Mat_SeqAIJKokkosTriFactors(B->rmap->n));
2291     B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJKokkos;
2292   }
2293   PetscFunctionReturn(PETSC_SUCCESS);
2294 }
2295 
2296 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS perm, const MatFactorInfo *info)
2297 {
2298   PetscFunctionBegin;
2299   if (info->solveonhost) {
2300     // If solve on host, we have to change the type, as eventually we need to call MatSolve_SeqSBAIJ_1_NaturalOrdering() etc.
2301     PetscCall(MatSetType(B, MATSEQSBAIJ));
2302     PetscCall(MatSeqSBAIJSetPreallocation(B, 1, MAT_SKIP_ALLOCATION, NULL));
2303   }
2304 
2305   PetscCall(MatCholeskyFactorSymbolic_SeqAIJ(B, A, perm, info)); // it sets B's two ISes ((Mat_SeqAIJ*)B->data)->{row, col} to perm
2306 
2307   if (!info->solveonhost) {
2308     // If solve on device, B is still a MATSEQAIJKOKKOS, so we are good to allocate B->spptr
2309     PetscCheck(!B->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr");
2310     PetscCallCXX(B->spptr = new Mat_SeqAIJKokkosTriFactors(B->rmap->n));
2311     B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJKokkos;
2312   }
2313   PetscFunctionReturn(PETSC_SUCCESS);
2314 }
2315 
2316 // The _Kokkos suffix means we will use Kokkos as a solver for the SeqAIJKokkos matrix
2317 static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos_Kokkos(Mat A, MatSolverType *type)
2318 {
2319   PetscFunctionBegin;
2320   *type = MATSOLVERKOKKOS;
2321   PetscFunctionReturn(PETSC_SUCCESS);
2322 }
2323 
2324 /*MC
2325   MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices
2326   on a single GPU of type, `MATSEQAIJKOKKOS`, `MATAIJKOKKOS`.
2327 
2328   Level: beginner
2329 
2330 .seealso: [](ch_matrices), `Mat`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJKokkos()`, `MATAIJKOKKOS`, `MatKokkosSetFormat()`, `MatKokkosStorageFormat`, `MatKokkosFormatOperation`
2331 M*/
2332 PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A, MatFactorType ftype, Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */
2333 {
2334   PetscInt n = A->rmap->n;
2335   MPI_Comm comm;
2336 
2337   PetscFunctionBegin;
2338   PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
2339   PetscCall(MatCreate(comm, B));
2340   PetscCall(MatSetSizes(*B, n, n, n, n));
2341   PetscCall(MatSetBlockSizesFromMats(*B, A, A));
2342   (*B)->factortype = ftype;
2343   PetscCall(MatSetType(*B, MATSEQAIJKOKKOS));
2344   PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL));
2345   PetscCheck(!(*B)->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr");
2346 
2347   if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
2348     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJKokkos;
2349     (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos;
2350     PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU]));
2351     PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILU]));
2352     PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILUDT]));
2353   } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
2354     (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJKokkos;
2355     (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJKokkos;
2356     PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]));
2357     PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ICC]));
2358   } else SETERRQ(comm, PETSC_ERR_SUP, "MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]);
2359 
2360   // The factorization can use the ordering provided in MatLUFactorSymbolic(), MatCholeskyFactorSymbolic() etc, though we do it on host
2361   (*B)->canuseordering = PETSC_TRUE;
2362   PetscCall(PetscObjectComposeFunction((PetscObject)*B, "MatFactorGetSolverType_C", MatFactorGetSolverType_SeqAIJKokkos_Kokkos));
2363   PetscFunctionReturn(PETSC_SUCCESS);
2364 }
2365 
2366 PETSC_INTERN PetscErrorCode MatSolverTypeRegister_Kokkos(void)
2367 {
2368   PetscFunctionBegin;
2369   PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_LU, MatGetFactor_SeqAIJKokkos_Kokkos));
2370   PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_CHOLESKY, MatGetFactor_SeqAIJKokkos_Kokkos));
2371   PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_ILU, MatGetFactor_SeqAIJKokkos_Kokkos));
2372   PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_ICC, MatGetFactor_SeqAIJKokkos_Kokkos));
2373   PetscFunctionReturn(PETSC_SUCCESS);
2374 }
2375 
2376 /* Utility to print out a KokkosCsrMatrix for debugging */
2377 PETSC_INTERN PetscErrorCode PrintCsrMatrix(const KokkosCsrMatrix &csrmat)
2378 {
2379   const auto        &iv = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), csrmat.graph.row_map);
2380   const auto        &jv = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), csrmat.graph.entries);
2381   const auto        &av = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), csrmat.values);
2382   const PetscInt    *i  = iv.data();
2383   const PetscInt    *j  = jv.data();
2384   const PetscScalar *a  = av.data();
2385   PetscInt           m = csrmat.numRows(), n = csrmat.numCols(), nnz = csrmat.nnz();
2386 
2387   PetscFunctionBegin;
2388   PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT " x %" PetscInt_FMT " SeqAIJKokkos, with %" PetscInt_FMT " nonzeros\n", m, n, nnz));
2389   for (PetscInt k = 0; k < m; k++) {
2390     PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT ": ", k));
2391     for (PetscInt p = i[k]; p < i[k + 1]; p++) PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT "(%.1f), ", j[p], (double)PetscRealPart(a[p])));
2392     PetscCall(PetscPrintf(PETSC_COMM_SELF, "\n"));
2393   }
2394   PetscFunctionReturn(PETSC_SUCCESS);
2395 }
2396