xref: /petsc/src/mat/impls/aij/seq/kokkos/aijkok.kokkos.cxx (revision 7e1a0bbe36d2be40a00a95404ece00db4857f70d)
1 #include <petsc_kokkos.hpp>
2 #include <petscvec_kokkos.hpp>
3 #include <petscmat_kokkos.hpp>
4 #include <petscpkg_version.h>
5 #include <petsc/private/petscimpl.h>
6 #include <petsc/private/sfimpl.h>
7 #include <petsc/private/kokkosimpl.hpp>
8 #include <petscsys.h>
9 
10 #include <Kokkos_Core.hpp>
11 #include <KokkosBlas.hpp>
12 #include <KokkosSparse_CrsMatrix.hpp>
13 
14 // To suppress compiler warnings:
15 // /path/include/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp:434:63:
16 // warning: 'cusparseStatus_t cusparseDbsrmm(cusparseHandle_t, cusparseDirection_t, cusparseOperation_t,
17 // cusparseOperation_t, int, int, int, int, const double*, cusparseMatDescr_t, const double*, const int*, const int*,
18 // int, const double*, int, const double*, double*, int)' is deprecated: please use cusparseSpMM instead [-Wdeprecated-declarations]
19 #define DISABLE_CUSPARSE_DEPRECATED
20 #include <KokkosSparse_spmv.hpp>
21 
22 #include <KokkosSparse_spiluk.hpp>
23 #include <KokkosSparse_sptrsv.hpp>
24 #include <KokkosSparse_spgemm.hpp>
25 #include <KokkosSparse_spadd.hpp>
26 #include <KokkosBatched_LU_Decl.hpp>
27 #include <KokkosBatched_InverseLU_Decl.hpp>
28 
29 #include <../src/mat/impls/aij/seq/kokkos/aijkok.hpp>
30 
31 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_GE(3, 7, 0)
32   #include <KokkosSparse_Utils.hpp>
33 using KokkosSparse::sort_crs_matrix;
34 using KokkosSparse::Impl::transpose_matrix;
35 #else
36   #include <KokkosKernels_Sorting.hpp>
37 using KokkosKernels::sort_crs_matrix;
38 using KokkosKernels::Impl::transpose_matrix;
39 #endif
40 
41 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_GE(4, 6, 0)
42 using KokkosSparse::spiluk_symbolic;
43 using KokkosSparse::spiluk_numeric;
44 using KokkosSparse::sptrsv_symbolic;
45 using KokkosSparse::sptrsv_solve;
46 using KokkosSparse::Experimental::SPTRSVAlgorithm;
47 using KokkosSparse::Experimental::SPILUKAlgorithm;
48 #else
49 using KokkosSparse::Experimental::spiluk_symbolic;
50 using KokkosSparse::Experimental::spiluk_numeric;
51 using KokkosSparse::Experimental::sptrsv_symbolic;
52 using KokkosSparse::Experimental::sptrsv_solve;
53 using KokkosSparse::Experimental::SPTRSVAlgorithm;
54 using KokkosSparse::Experimental::SPILUKAlgorithm;
55 #endif
56 
57 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */
58 
59 /* MatAssemblyEnd_SeqAIJKokkos() happens when we finalized nonzeros of the matrix, either after
60    we assembled the matrix on host, or after we directly produced the matrix data on device (ex., through MatMatMult).
61    In the latter case, it is important to set a_dual's sync state correctly.
62  */
63 static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A, MatAssemblyType mode)
64 {
65   Mat_SeqAIJ       *aijseq;
66   Mat_SeqAIJKokkos *aijkok;
67 
68   PetscFunctionBegin;
69   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS);
70   PetscCall(MatAssemblyEnd_SeqAIJ(A, mode));
71 
72   aijseq = static_cast<Mat_SeqAIJ *>(A->data);
73   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
74 
75   /* If aijkok does not exist, we just copy i, j to device.
76      If aijkok already exists, but the device's nonzero pattern does not match with the host's, we assume the latest data is on host.
77      In both cases, we build a new aijkok structure.
78   */
79   if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { /* aijkok might not exist yet or nonzero pattern has changed */
80     if (aijkok && aijkok->host_aij_allocated_by_kokkos) {   /* Avoid accidentally freeing much needed a,i,j on host when deleting aijkok */
81       PetscCall(PetscShmgetAllocateArray(aijkok->nrows() + 1, sizeof(PetscInt), (void **)&aijseq->i));
82       PetscCall(PetscShmgetAllocateArray(aijkok->nnz(), sizeof(PetscInt), (void **)&aijseq->j));
83       PetscCall(PetscShmgetAllocateArray(aijkok->nnz(), sizeof(PetscInt), (void **)&aijseq->a));
84       PetscCall(PetscArraycpy(aijseq->i, aijkok->i_host_data(), aijkok->nrows() + 1));
85       PetscCall(PetscArraycpy(aijseq->j, aijkok->j_host_data(), aijkok->nnz()));
86       PetscCall(PetscArraycpy(aijseq->a, aijkok->a_host_data(), aijkok->nnz()));
87       aijseq->free_a  = PETSC_TRUE;
88       aijseq->free_ij = PETSC_TRUE;
89       /* This arises from MatCreateSeqAIJKokkosWithKokkosCsrMatrix() used in MatMatMult, where
90          we have the CsrMatrix on device first and then copy to host, followed by
91          MatSetMPIAIJWithSplitSeqAIJ() with garray = NULL.
92          One could improve it by not using NULL garray.
93       */
94     }
95     delete aijkok;
96     aijkok   = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aijseq, A->nonzerostate, PETSC_FALSE /*don't copy mat values to device*/);
97     A->spptr = aijkok;
98   } else if (A->rmap->n && aijkok->diag_dual.extent(0) == 0) { // MatProduct might directly produce AIJ on device, but not the diag.
99     MatRowMapKokkosViewHost diag_h(aijseq->diag, A->rmap->n);
100     auto                    diag_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), diag_h);
101     aijkok->diag_dual              = MatRowMapKokkosDualView(diag_d, diag_h);
102   }
103   PetscFunctionReturn(PETSC_SUCCESS);
104 }
105 
106 /* Sync CSR data to device if not yet */
107 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A)
108 {
109   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
110 
111   PetscFunctionBegin;
112   PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Can't sync factorized matrix from host to device");
113   PetscCheck(aijkok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
114   if (aijkok->a_dual.need_sync_device()) {
115     PetscCall(KokkosDualViewSyncDevice(aijkok->a_dual, PetscGetKokkosExecutionSpace()));
116     aijkok->transpose_updated = PETSC_FALSE; /* values of the transpose is out-of-date */
117     aijkok->hermitian_updated = PETSC_FALSE;
118   }
119   PetscFunctionReturn(PETSC_SUCCESS);
120 }
121 
122 /* Mark the CSR data on device as modified */
123 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosModifyDevice(Mat A)
124 {
125   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
126 
127   PetscFunctionBegin;
128   PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Not supported for factorized matries");
129   aijkok->a_dual.clear_sync_state();
130   aijkok->a_dual.modify_device();
131   aijkok->transpose_updated = PETSC_FALSE;
132   aijkok->hermitian_updated = PETSC_FALSE;
133   PetscCall(MatSeqAIJInvalidateDiagonal(A));
134   PetscCall(PetscObjectStateIncrease((PetscObject)A));
135   PetscFunctionReturn(PETSC_SUCCESS);
136 }
137 
138 static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A)
139 {
140   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
141   auto              exec   = PetscGetKokkosExecutionSpace();
142 
143   PetscFunctionBegin;
144   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
145   /* We do not expect one needs factors on host  */
146   PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Can't sync factorized matrix from device to host");
147   PetscCheck(aijkok, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing AIJKOK");
148   PetscCall(KokkosDualViewSyncHost(aijkok->a_dual, exec));
149   PetscFunctionReturn(PETSC_SUCCESS);
150 }
151 
152 static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A, PetscScalar *array[])
153 {
154   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
155 
156   PetscFunctionBegin;
157   /* aijkok contains valid pointers only if the host's nonzerostate matches with the device's.
158     Calling MatSeqAIJSetPreallocation() or MatSetValues() on host, where aijseq->{i,j,a} might be
159     reallocated, will lead to stale {i,j,a}_dual in aijkok. In both operations, the hosts's nonzerostate
160     must have been updated. The stale aijkok will be rebuilt during MatAssemblyEnd.
161   */
162   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
163     PetscCall(KokkosDualViewSyncHost(aijkok->a_dual, PetscGetKokkosExecutionSpace()));
164     *array = aijkok->a_dual.view_host().data();
165   } else { /* Happens when calling MatSetValues on a newly created matrix */
166     *array = static_cast<Mat_SeqAIJ *>(A->data)->a;
167   }
168   PetscFunctionReturn(PETSC_SUCCESS);
169 }
170 
171 static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJKokkos(Mat A, PetscScalar *array[])
172 {
173   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
174 
175   PetscFunctionBegin;
176   if (aijkok && A->nonzerostate == aijkok->nonzerostate) aijkok->a_dual.modify_host();
177   PetscFunctionReturn(PETSC_SUCCESS);
178 }
179 
180 static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[])
181 {
182   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
183 
184   PetscFunctionBegin;
185   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
186     PetscCall(KokkosDualViewSyncHost(aijkok->a_dual, PetscGetKokkosExecutionSpace()));
187     *array = aijkok->a_dual.view_host().data();
188   } else {
189     *array = static_cast<Mat_SeqAIJ *>(A->data)->a;
190   }
191   PetscFunctionReturn(PETSC_SUCCESS);
192 }
193 
194 static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[])
195 {
196   PetscFunctionBegin;
197   *array = NULL;
198   PetscFunctionReturn(PETSC_SUCCESS);
199 }
200 
201 static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[])
202 {
203   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
204 
205   PetscFunctionBegin;
206   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
207     *array = aijkok->a_dual.view_host().data();
208   } else { /* Ex. happens with MatZeroEntries on a preallocated but not assembled matrix */
209     *array = static_cast<Mat_SeqAIJ *>(A->data)->a;
210   }
211   PetscFunctionReturn(PETSC_SUCCESS);
212 }
213 
214 static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[])
215 {
216   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
217 
218   PetscFunctionBegin;
219   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
220     aijkok->a_dual.clear_sync_state();
221     aijkok->a_dual.modify_host();
222   }
223   PetscFunctionReturn(PETSC_SUCCESS);
224 }
225 
226 static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJKokkos(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype)
227 {
228   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
229 
230   PetscFunctionBegin;
231   PetscCheck(aijkok != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "aijkok is NULL");
232 
233   if (i) *i = aijkok->i_device_data();
234   if (j) *j = aijkok->j_device_data();
235   if (a) {
236     PetscCall(KokkosDualViewSyncDevice(aijkok->a_dual, PetscGetKokkosExecutionSpace()));
237     *a = aijkok->a_device_data();
238   }
239   if (mtype) *mtype = PETSC_MEMTYPE_KOKKOS;
240   PetscFunctionReturn(PETSC_SUCCESS);
241 }
242 
243 static PetscErrorCode MatGetCurrentMemType_SeqAIJKokkos(PETSC_UNUSED Mat A, PetscMemType *mtype)
244 {
245   PetscFunctionBegin;
246   *mtype = PETSC_MEMTYPE_KOKKOS;
247   PetscFunctionReturn(PETSC_SUCCESS);
248 }
249 
250 /*
251   Generate the sparsity pattern of a MatSeqAIJKokkos matrix's transpose on device.
252 
253   Input Parameter:
254 .  A       - the MATSEQAIJKOKKOS matrix
255 
256   Output Parameters:
257 +  perm_d - the permutation array on device, which connects Ta(i) = Aa(perm(i))
258 -  T_d    - the transpose on device, whose value array is allocated but not initialized
259 */
260 static PetscErrorCode MatSeqAIJKokkosGenerateTransposeStructure(Mat A, MatRowMapKokkosView &perm_d, KokkosCsrMatrix &T_d)
261 {
262   Mat_SeqAIJ             *aseq = static_cast<Mat_SeqAIJ *>(A->data);
263   PetscInt                nz = aseq->nz, m = A->rmap->N, n = A->cmap->n;
264   const PetscInt         *Ai = aseq->i, *Aj = aseq->j;
265   MatRowMapKokkosViewHost Ti_h(NoInit("Ti"), n + 1);
266   MatRowMapType          *Ti = Ti_h.data();
267   MatColIdxKokkosViewHost Tj_h(NoInit("Tj"), nz);
268   MatRowMapKokkosViewHost perm_h(NoInit("permutation"), nz);
269   PetscInt               *Tj   = Tj_h.data();
270   PetscInt               *perm = perm_h.data();
271   PetscInt               *offset;
272 
273   PetscFunctionBegin;
274   // Populate Ti
275   PetscCallCXX(Kokkos::deep_copy(Ti_h, 0));
276   Ti++;
277   for (PetscInt i = 0; i < nz; i++) Ti[Aj[i]]++;
278   Ti--;
279   for (PetscInt i = 0; i < n; i++) Ti[i + 1] += Ti[i];
280 
281   // Populate Tj and the permutation array
282   PetscCall(PetscCalloc1(n, &offset)); // offset in each T row to fill in its column indices
283   for (PetscInt i = 0; i < m; i++) {
284     for (PetscInt j = Ai[i]; j < Ai[i + 1]; j++) { // A's (i,j) is T's (j,i)
285       PetscInt r    = Aj[j];                       // row r of T
286       PetscInt disp = Ti[r] + offset[r];
287 
288       Tj[disp]   = i; // col i of T
289       perm[disp] = j;
290       offset[r]++;
291     }
292   }
293   PetscCall(PetscFree(offset));
294 
295   // Sort each row of T, along with the permutation array
296   for (PetscInt i = 0; i < n; i++) PetscCall(PetscSortIntWithArray(Ti[i + 1] - Ti[i], Tj + Ti[i], perm + Ti[i]));
297 
298   // Output perm and T on device
299   auto Ti_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), Ti_h);
300   auto Tj_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), Tj_h);
301   PetscCallCXX(T_d = KokkosCsrMatrix("csrmatT", n, m, nz, MatScalarKokkosView("Ta", nz), Ti_d, Tj_d));
302   PetscCallCXX(perm_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), perm_h));
303   PetscFunctionReturn(PETSC_SUCCESS);
304 }
305 
306 // Generate the transpose on device and cache it internally
307 // Note: KK transpose_matrix() does not have support symbolic/numeric transpose, so we do it on our own
308 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGenerateTranspose_Private(Mat A, KokkosCsrMatrix *csrmatT)
309 {
310   Mat_SeqAIJ       *aseq = static_cast<Mat_SeqAIJ *>(A->data);
311   Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
312   PetscInt          nz = aseq->nz, m = A->rmap->N, n = A->cmap->n;
313   KokkosCsrMatrix  &T = akok->csrmatT;
314 
315   PetscFunctionBegin;
316   PetscCheck(akok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
317   PetscCall(KokkosDualViewSyncDevice(akok->a_dual, PetscGetKokkosExecutionSpace())); // Sync A's values since we are going to access them on device
318 
319   const auto &Aa = akok->a_dual.view_device();
320 
321   if (A->symmetric == PETSC_BOOL3_TRUE) {
322     *csrmatT = akok->csrmat;
323   } else {
324     // See if we already have a cached transpose and its value is up to date
325     if (T.numRows() == n && T.numCols() == m) {  // this indicates csrmatT had been generated before, otherwise T has 0 rows/cols after construction
326       if (!akok->transpose_updated) {            // if the value is out of date, update the cached version
327         const auto &perm = akok->transpose_perm; // get the permutation array
328         auto       &Ta   = T.values;
329 
330         PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { Ta(i) = Aa(perm(i)); }));
331       }
332     } else { // Generate T of size n x m for the first time
333       MatRowMapKokkosView perm;
334 
335       PetscCall(MatSeqAIJKokkosGenerateTransposeStructure(A, perm, T));
336       akok->transpose_perm = perm; // cache the perm in this matrix for reuse
337       PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { T.values(i) = Aa(perm(i)); }));
338     }
339     akok->transpose_updated = PETSC_TRUE;
340     *csrmatT                = akok->csrmatT;
341   }
342   PetscFunctionReturn(PETSC_SUCCESS);
343 }
344 
345 // Generate the Hermitian on device and cache it internally
346 static PetscErrorCode MatSeqAIJKokkosGenerateHermitian_Private(Mat A, KokkosCsrMatrix *csrmatH)
347 {
348   Mat_SeqAIJ       *aseq = static_cast<Mat_SeqAIJ *>(A->data);
349   Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
350   PetscInt          nz = aseq->nz, m = A->rmap->N, n = A->cmap->n;
351   KokkosCsrMatrix  &T = akok->csrmatH;
352 
353   PetscFunctionBegin;
354   PetscCheck(akok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
355   PetscCall(KokkosDualViewSyncDevice(akok->a_dual, PetscGetKokkosExecutionSpace())); // Sync A's values since we are going to access them on device
356 
357   const auto &Aa = akok->a_dual.view_device();
358 
359   if (A->hermitian == PETSC_BOOL3_TRUE) {
360     *csrmatH = akok->csrmat;
361   } else {
362     // See if we already have a cached hermitian and its value is up to date
363     if (T.numRows() == n && T.numCols() == m) {  // this indicates csrmatT had been generated before, otherwise T has 0 rows/cols after construction
364       if (!akok->hermitian_updated) {            // if the value is out of date, update the cached version
365         const auto &perm = akok->transpose_perm; // get the permutation array
366         auto       &Ta   = T.values;
367 
368         PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { Ta(i) = PetscConj(Aa(perm(i))); }));
369       }
370     } else { // Generate T of size n x m for the first time
371       MatRowMapKokkosView perm;
372 
373       PetscCall(MatSeqAIJKokkosGenerateTransposeStructure(A, perm, T));
374       akok->transpose_perm = perm; // cache the perm in this matrix for reuse
375       PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { T.values(i) = PetscConj(Aa(perm(i))); }));
376     }
377     akok->hermitian_updated = PETSC_TRUE;
378     *csrmatH                = akok->csrmatH;
379   }
380   PetscFunctionReturn(PETSC_SUCCESS);
381 }
382 
383 /* y = A x */
384 static PetscErrorCode MatMult_SeqAIJKokkos(Mat A, Vec xx, Vec yy)
385 {
386   Mat_SeqAIJKokkos          *aijkok;
387   ConstPetscScalarKokkosView xv;
388   PetscScalarKokkosView      yv;
389 
390   PetscFunctionBegin;
391   PetscCall(PetscLogGpuTimeBegin());
392   PetscCall(MatSeqAIJKokkosSyncDevice(A));
393   PetscCall(VecGetKokkosView(xx, &xv));
394   PetscCall(VecGetKokkosViewWrite(yy, &yv));
395   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
396   PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), "N", 1.0 /*alpha*/, aijkok->csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A x + beta y */
397   PetscCall(VecRestoreKokkosView(xx, &xv));
398   PetscCall(VecRestoreKokkosViewWrite(yy, &yv));
399   /* 2.0*nnz - numRows seems more accurate here but assumes there are no zero-rows. So a little sloppy here. */
400   PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz()));
401   PetscCall(PetscLogGpuTimeEnd());
402   PetscFunctionReturn(PETSC_SUCCESS);
403 }
404 
405 /* y = A^T x */
406 static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy)
407 {
408   Mat_SeqAIJKokkos          *aijkok;
409   const char                *mode;
410   ConstPetscScalarKokkosView xv;
411   PetscScalarKokkosView      yv;
412   KokkosCsrMatrix            csrmat;
413 
414   PetscFunctionBegin;
415   PetscCall(PetscLogGpuTimeBegin());
416   PetscCall(MatSeqAIJKokkosSyncDevice(A));
417   PetscCall(VecGetKokkosView(xx, &xv));
418   PetscCall(VecGetKokkosViewWrite(yy, &yv));
419   if (A->form_explicit_transpose) {
420     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat));
421     mode = "N";
422   } else {
423     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
424     csrmat = aijkok->csrmat;
425     mode   = "T";
426   }
427   PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^T x + beta y */
428   PetscCall(VecRestoreKokkosView(xx, &xv));
429   PetscCall(VecRestoreKokkosViewWrite(yy, &yv));
430   PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
431   PetscCall(PetscLogGpuTimeEnd());
432   PetscFunctionReturn(PETSC_SUCCESS);
433 }
434 
435 /* y = A^H x */
436 static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy)
437 {
438   Mat_SeqAIJKokkos          *aijkok;
439   const char                *mode;
440   ConstPetscScalarKokkosView xv;
441   PetscScalarKokkosView      yv;
442   KokkosCsrMatrix            csrmat;
443 
444   PetscFunctionBegin;
445   PetscCall(PetscLogGpuTimeBegin());
446   PetscCall(MatSeqAIJKokkosSyncDevice(A));
447   PetscCall(VecGetKokkosView(xx, &xv));
448   PetscCall(VecGetKokkosViewWrite(yy, &yv));
449   if (A->form_explicit_transpose) {
450     PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat));
451     mode = "N";
452   } else {
453     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
454     csrmat = aijkok->csrmat;
455     mode   = "C";
456   }
457   PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^H x + beta y */
458   PetscCall(VecRestoreKokkosView(xx, &xv));
459   PetscCall(VecRestoreKokkosViewWrite(yy, &yv));
460   PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
461   PetscCall(PetscLogGpuTimeEnd());
462   PetscFunctionReturn(PETSC_SUCCESS);
463 }
464 
465 /* z = A x + y */
466 static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz)
467 {
468   Mat_SeqAIJKokkos          *aijkok;
469   ConstPetscScalarKokkosView xv;
470   PetscScalarKokkosView      zv;
471 
472   PetscFunctionBegin;
473   PetscCall(PetscLogGpuTimeBegin());
474   PetscCall(MatSeqAIJKokkosSyncDevice(A));
475   if (zz != yy) PetscCall(VecCopy(yy, zz)); // depending on yy's sync flags, zz might get its latest data on host
476   PetscCall(VecGetKokkosView(xx, &xv));
477   PetscCall(VecGetKokkosView(zz, &zv)); // do after VecCopy(yy, zz) to get the latest data on device
478   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
479   PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), "N", 1.0 /*alpha*/, aijkok->csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A x + beta z */
480   PetscCall(VecRestoreKokkosView(xx, &xv));
481   PetscCall(VecRestoreKokkosView(zz, &zv));
482   PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz()));
483   PetscCall(PetscLogGpuTimeEnd());
484   PetscFunctionReturn(PETSC_SUCCESS);
485 }
486 
487 /* z = A^T x + y */
488 static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz)
489 {
490   Mat_SeqAIJKokkos          *aijkok;
491   const char                *mode;
492   ConstPetscScalarKokkosView xv;
493   PetscScalarKokkosView      zv;
494   KokkosCsrMatrix            csrmat;
495 
496   PetscFunctionBegin;
497   PetscCall(PetscLogGpuTimeBegin());
498   PetscCall(MatSeqAIJKokkosSyncDevice(A));
499   if (zz != yy) PetscCall(VecCopy(yy, zz));
500   PetscCall(VecGetKokkosView(xx, &xv));
501   PetscCall(VecGetKokkosView(zz, &zv));
502   if (A->form_explicit_transpose) {
503     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat));
504     mode = "N";
505   } else {
506     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
507     csrmat = aijkok->csrmat;
508     mode   = "T";
509   }
510   PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^T x + beta z */
511   PetscCall(VecRestoreKokkosView(xx, &xv));
512   PetscCall(VecRestoreKokkosView(zz, &zv));
513   PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
514   PetscCall(PetscLogGpuTimeEnd());
515   PetscFunctionReturn(PETSC_SUCCESS);
516 }
517 
518 /* z = A^H x + y */
519 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz)
520 {
521   Mat_SeqAIJKokkos          *aijkok;
522   const char                *mode;
523   ConstPetscScalarKokkosView xv;
524   PetscScalarKokkosView      zv;
525   KokkosCsrMatrix            csrmat;
526 
527   PetscFunctionBegin;
528   PetscCall(PetscLogGpuTimeBegin());
529   PetscCall(MatSeqAIJKokkosSyncDevice(A));
530   if (zz != yy) PetscCall(VecCopy(yy, zz));
531   PetscCall(VecGetKokkosView(xx, &xv));
532   PetscCall(VecGetKokkosView(zz, &zv));
533   if (A->form_explicit_transpose) {
534     PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat));
535     mode = "N";
536   } else {
537     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
538     csrmat = aijkok->csrmat;
539     mode   = "C";
540   }
541   PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^H x + beta z */
542   PetscCall(VecRestoreKokkosView(xx, &xv));
543   PetscCall(VecRestoreKokkosView(zz, &zv));
544   PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
545   PetscCall(PetscLogGpuTimeEnd());
546   PetscFunctionReturn(PETSC_SUCCESS);
547 }
548 
549 static PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A, MatOption op, PetscBool flg)
550 {
551   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
552 
553   PetscFunctionBegin;
554   switch (op) {
555   case MAT_FORM_EXPLICIT_TRANSPOSE:
556     /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
557     if (A->form_explicit_transpose && !flg && aijkok) PetscCall(aijkok->DestroyMatTranspose());
558     A->form_explicit_transpose = flg;
559     break;
560   default:
561     PetscCall(MatSetOption_SeqAIJ(A, op, flg));
562     break;
563   }
564   PetscFunctionReturn(PETSC_SUCCESS);
565 }
566 
567 /* Depending on reuse, either build a new mat, or use the existing mat */
568 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat *newmat)
569 {
570   Mat_SeqAIJ *aseq;
571 
572   PetscFunctionBegin;
573   PetscCall(PetscKokkosInitializeCheck());
574   if (reuse == MAT_INITIAL_MATRIX) { /* Build a brand new mat */
575     PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat));
576     PetscCall(MatSetType(*newmat, mtype));
577   } else if (reuse == MAT_REUSE_MATRIX) {                 /* Reuse the mat created before */
578     PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN)); /* newmat is already a SeqAIJKokkos */
579   } else if (reuse == MAT_INPLACE_MATRIX) {               /* newmat is A */
580     PetscCheck(A == *newmat, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "A != *newmat with MAT_INPLACE_MATRIX");
581     PetscCall(PetscFree(A->defaultvectype));
582     PetscCall(PetscStrallocpy(VECKOKKOS, &A->defaultvectype)); /* Allocate and copy the string */
583     PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATSEQAIJKOKKOS));
584     PetscCall(MatSetOps_SeqAIJKokkos(A));
585     aseq = static_cast<Mat_SeqAIJ *>(A->data);
586     if (A->assembled) { /* Copy i, j (but not values) to device for an assembled matrix if not yet */
587       PetscCheck(!A->spptr, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Expect NULL (Mat_SeqAIJKokkos*)A->spptr");
588       A->spptr = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aseq, A->nonzerostate, PETSC_FALSE);
589     }
590   }
591   PetscFunctionReturn(PETSC_SUCCESS);
592 }
593 
594 /* MatDuplicate always creates a new matrix. MatDuplicate can be called either on an assembled matrix or
595    an unassembled matrix, even though MAT_COPY_VALUES is not allowed for unassembled matrix.
596  */
597 static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A, MatDuplicateOption dupOption, Mat *B)
598 {
599   Mat_SeqAIJ       *bseq;
600   Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr), *bkok;
601   Mat               mat;
602 
603   PetscFunctionBegin;
604   /* Do not copy values on host as A's latest values might be on device. We don't want to do sync blindly */
605   PetscCall(MatDuplicate_SeqAIJ(A, MAT_DO_NOT_COPY_VALUES, B));
606   mat = *B;
607   if (A->assembled) {
608     bseq = static_cast<Mat_SeqAIJ *>(mat->data);
609     bkok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, bseq, mat->nonzerostate, PETSC_FALSE);
610     bkok->a_dual.clear_sync_state(); /* Clear B's sync state as it will be decided below */
611     /* Now copy values to B if needed */
612     if (dupOption == MAT_COPY_VALUES) {
613       if (akok->a_dual.need_sync_device()) {
614         Kokkos::deep_copy(bkok->a_dual.view_host(), akok->a_dual.view_host());
615         bkok->a_dual.modify_host();
616       } else { /* If device has the latest data, we only copy data on device */
617         Kokkos::deep_copy(bkok->a_dual.view_device(), akok->a_dual.view_device());
618         bkok->a_dual.modify_device();
619       }
620     } else { /* MAT_DO_NOT_COPY_VALUES or MAT_SHARE_NONZERO_PATTERN. B's values should be zeroed */
621       /* B's values on host should be already zeroed by MatDuplicate_SeqAIJ() */
622       bkok->a_dual.modify_host();
623     }
624     mat->spptr = bkok;
625   }
626 
627   PetscCall(PetscFree(mat->defaultvectype));
628   PetscCall(PetscStrallocpy(VECKOKKOS, &mat->defaultvectype)); /* Allocate and copy the string */
629   PetscCall(PetscObjectChangeTypeName((PetscObject)mat, MATSEQAIJKOKKOS));
630   PetscCall(MatSetOps_SeqAIJKokkos(mat));
631   PetscFunctionReturn(PETSC_SUCCESS);
632 }
633 
634 static PetscErrorCode MatTranspose_SeqAIJKokkos(Mat A, MatReuse reuse, Mat *B)
635 {
636   Mat               At;
637   KokkosCsrMatrix   internT;
638   Mat_SeqAIJKokkos *atkok, *bkok;
639 
640   PetscFunctionBegin;
641   if (reuse == MAT_REUSE_MATRIX) PetscCall(MatTransposeCheckNonzeroState_Private(A, *B));
642   PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &internT)); /* Generate a transpose internally */
643   if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_INPLACE_MATRIX) {
644     /* Deep copy internT, as we want to isolate the internal transpose */
645     PetscCallCXX(atkok = new Mat_SeqAIJKokkos(KokkosCsrMatrix("csrmat", internT)));
646     PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PetscObjectComm((PetscObject)A), atkok, &At));
647     if (reuse == MAT_INITIAL_MATRIX) *B = At;
648     else PetscCall(MatHeaderReplace(A, &At)); /* Replace A with At inplace */
649   } else {                                    /* MAT_REUSE_MATRIX, just need to copy values to B on device */
650     if ((*B)->assembled) {
651       bkok = static_cast<Mat_SeqAIJKokkos *>((*B)->spptr);
652       PetscCallCXX(Kokkos::deep_copy(bkok->a_dual.view_device(), internT.values));
653       PetscCall(MatSeqAIJKokkosModifyDevice(*B));
654     } else if ((*B)->preallocated) { /* It is ok for B to be only preallocated, as needed in MatTranspose_MPIAIJ */
655       Mat_SeqAIJ             *bseq = static_cast<Mat_SeqAIJ *>((*B)->data);
656       MatScalarKokkosViewHost a_h(bseq->a, internT.nnz()); /* bseq->nz = 0 if unassembled */
657       MatColIdxKokkosViewHost j_h(bseq->j, internT.nnz());
658       PetscCallCXX(Kokkos::deep_copy(a_h, internT.values));
659       PetscCallCXX(Kokkos::deep_copy(j_h, internT.graph.entries));
660     } else SETERRQ(PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "B must be assembled or preallocated");
661   }
662   PetscFunctionReturn(PETSC_SUCCESS);
663 }
664 
665 static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A)
666 {
667   Mat_SeqAIJKokkos *aijkok;
668 
669   PetscFunctionBegin;
670   if (A->factortype == MAT_FACTOR_NONE) {
671     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
672     delete aijkok;
673   } else {
674     delete static_cast<Mat_SeqAIJKokkosTriFactors *>(A->spptr);
675   }
676   A->spptr = NULL;
677   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatFactorGetSolverType_C", NULL));
678   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
679   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
680 #if defined(PETSC_HAVE_HYPRE)
681   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijkokkos_hypre_C", NULL));
682 #endif
683   PetscCall(MatDestroy_SeqAIJ(A));
684   PetscFunctionReturn(PETSC_SUCCESS);
685 }
686 
687 /*MC
688    MATSEQAIJKOKKOS - MATAIJKOKKOS = "(seq)aijkokkos" - A matrix type to be used for sparse matrices with Kokkos
689 
690    A matrix type using Kokkos-Kernels CrsMatrix type for portability across different device types
691 
692    Options Database Key:
693 .  -mat_type aijkokkos - sets the matrix type to `MATSEQAIJKOKKOS` during a call to `MatSetFromOptions()`
694 
695   Level: beginner
696 
697 .seealso: [](ch_matrices), `Mat`, `MatCreateSeqAIJKokkos()`, `MATMPIAIJKOKKOS`
698 M*/
699 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A)
700 {
701   PetscFunctionBegin;
702   PetscCall(PetscKokkosInitializeCheck());
703   PetscCall(MatCreate_SeqAIJ(A));
704   PetscCall(MatConvert_SeqAIJ_SeqAIJKokkos(A, MATSEQAIJKOKKOS, MAT_INPLACE_MATRIX, &A));
705   PetscFunctionReturn(PETSC_SUCCESS);
706 }
707 
708 /* Merge A, B into a matrix C. A is put before B. C's size would be A->rmap->n by (A->cmap->n + B->cmap->n) */
709 PetscErrorCode MatSeqAIJKokkosMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C)
710 {
711   Mat_SeqAIJ         *a, *b;
712   Mat_SeqAIJKokkos   *akok, *bkok, *ckok;
713   MatScalarKokkosView aa, ba, ca;
714   MatRowMapKokkosView ai, bi, ci;
715   MatColIdxKokkosView aj, bj, cj;
716   PetscInt            m, n, nnz, aN;
717 
718   PetscFunctionBegin;
719   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
720   PetscValidHeaderSpecific(B, MAT_CLASSID, 2);
721   PetscAssertPointer(C, 4);
722   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
723   PetscCheckTypeName(B, MATSEQAIJKOKKOS);
724   PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n);
725   PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported");
726 
727   PetscCall(MatSeqAIJKokkosSyncDevice(A));
728   PetscCall(MatSeqAIJKokkosSyncDevice(B));
729   a    = static_cast<Mat_SeqAIJ *>(A->data);
730   b    = static_cast<Mat_SeqAIJ *>(B->data);
731   akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
732   bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
733   aa   = akok->a_dual.view_device();
734   ai   = akok->i_dual.view_device();
735   ba   = bkok->a_dual.view_device();
736   bi   = bkok->i_dual.view_device();
737   m    = A->rmap->n; /* M, N and nnz of C */
738   n    = A->cmap->n + B->cmap->n;
739   nnz  = a->nz + b->nz;
740   aN   = A->cmap->n; /* N of A */
741   if (reuse == MAT_INITIAL_MATRIX) {
742     aj      = akok->j_dual.view_device();
743     bj      = bkok->j_dual.view_device();
744     auto ca = MatScalarKokkosView("a", aa.extent(0) + ba.extent(0));
745     auto ci = MatRowMapKokkosView("i", ai.extent(0));
746     auto cj = MatColIdxKokkosView("j", aj.extent(0) + bj.extent(0));
747 
748     /* Concatenate A and B in parallel using Kokkos hierarchical parallelism */
749     Kokkos::parallel_for(
750       Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
751         PetscInt i       = t.league_rank(); /* row i */
752         PetscInt coffset = ai(i) + bi(i), alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i);
753 
754         Kokkos::single(Kokkos::PerTeam(t), [=]() { /* this side effect only happens once per whole team */
755                                                    ci(i) = coffset;
756                                                    if (i == m - 1) ci(m) = ai(m) + bi(m);
757         });
758 
759         Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) {
760           if (k < alen) {
761             ca(coffset + k) = aa(ai(i) + k);
762             cj(coffset + k) = aj(ai(i) + k);
763           } else {
764             ca(coffset + k) = ba(bi(i) + k - alen);
765             cj(coffset + k) = bj(bi(i) + k - alen) + aN; /* Entries in B get new column indices in C */
766           }
767         });
768       });
769     PetscCallCXX(ckok = new Mat_SeqAIJKokkos(m, n, nnz, ci, cj, ca));
770     PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, ckok, C));
771   } else if (reuse == MAT_REUSE_MATRIX) {
772     PetscValidHeaderSpecific(*C, MAT_CLASSID, 4);
773     PetscCheckTypeName(*C, MATSEQAIJKOKKOS);
774     ckok = static_cast<Mat_SeqAIJKokkos *>((*C)->spptr);
775     ca   = ckok->a_dual.view_device();
776     ci   = ckok->i_dual.view_device();
777 
778     Kokkos::parallel_for(
779       Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
780         PetscInt i    = t.league_rank(); /* row i */
781         PetscInt alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i);
782         Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) {
783           if (k < alen) ca(ci(i) + k) = aa(ai(i) + k);
784           else ca(ci(i) + k) = ba(bi(i) + k - alen);
785         });
786       });
787     PetscCall(MatSeqAIJKokkosModifyDevice(*C));
788   }
789   PetscFunctionReturn(PETSC_SUCCESS);
790 }
791 
792 static PetscErrorCode MatProductDataDestroy_SeqAIJKokkos(void *pdata)
793 {
794   PetscFunctionBegin;
795   delete static_cast<MatProductData_SeqAIJKokkos *>(pdata);
796   PetscFunctionReturn(PETSC_SUCCESS);
797 }
798 
799 static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C)
800 {
801   Mat_Product                 *product = C->product;
802   Mat                          A, B;
803   bool                         transA, transB; /* use bool, since KK needs this type */
804   Mat_SeqAIJKokkos            *akok, *bkok, *ckok;
805   Mat_SeqAIJ                  *c;
806   MatProductData_SeqAIJKokkos *pdata;
807   KokkosCsrMatrix              csrmatA, csrmatB;
808 
809   PetscFunctionBegin;
810   MatCheckProduct(C, 1);
811   PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Product data empty");
812   pdata = static_cast<MatProductData_SeqAIJKokkos *>(C->product->data);
813 
814   // See if numeric has already been done in symbolic (e.g., user calls MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C)).
815   // If yes, skip the numeric, but reset the flag so that next time when user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C),
816   // we still do numeric.
817   if (pdata->reusesym) { // numeric reuses results from symbolic
818     pdata->reusesym = PETSC_FALSE;
819     PetscFunctionReturn(PETSC_SUCCESS);
820   }
821 
822   switch (product->type) {
823   case MATPRODUCT_AB:
824     transA = false;
825     transB = false;
826     break;
827   case MATPRODUCT_AtB:
828     transA = true;
829     transB = false;
830     break;
831   case MATPRODUCT_ABt:
832     transA = false;
833     transB = true;
834     break;
835   default:
836     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]);
837   }
838 
839   A = product->A;
840   B = product->B;
841   PetscCall(MatSeqAIJKokkosSyncDevice(A));
842   PetscCall(MatSeqAIJKokkosSyncDevice(B));
843   akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
844   bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
845   ckok = static_cast<Mat_SeqAIJKokkos *>(C->spptr);
846 
847   PetscCheck(ckok, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Device data structure spptr is empty");
848 
849   csrmatA = akok->csrmat;
850   csrmatB = bkok->csrmat;
851 
852   /* TODO: Once KK spgemm implements transpose, we can get rid of the explicit transpose here */
853   if (transA) {
854     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA));
855     transA = false;
856   }
857 
858   if (transB) {
859     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB));
860     transB = false;
861   }
862   PetscCall(PetscLogGpuTimeBegin());
863   PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, csrmatA, transA, csrmatB, transB, ckok->csrmat));
864 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(4, 0, 0)
865   auto spgemmHandle = pdata->kh.get_spgemm_handle();
866   if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(ckok->csrmat)); /* without sort, mat_tests-ex62_14_seqaijkokkos fails */
867 #endif
868 
869   PetscCall(PetscLogGpuTimeEnd());
870   PetscCall(MatSeqAIJKokkosModifyDevice(C));
871   /* shorter version of MatAssemblyEnd_SeqAIJ */
872   c = (Mat_SeqAIJ *)C->data;
873   PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz));
874   PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n"));
875   PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax));
876   c->reallocs         = 0;
877   C->info.mallocs     = 0;
878   C->info.nz_unneeded = 0;
879   C->assembled = C->was_assembled = PETSC_TRUE;
880   C->num_ass++;
881   PetscFunctionReturn(PETSC_SUCCESS);
882 }
883 
884 static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C)
885 {
886   Mat_Product                 *product = C->product;
887   MatProductType               ptype;
888   Mat                          A, B;
889   bool                         transA, transB;
890   Mat_SeqAIJKokkos            *akok, *bkok, *ckok;
891   MatProductData_SeqAIJKokkos *pdata;
892   MPI_Comm                     comm;
893   KokkosCsrMatrix              csrmatA, csrmatB, csrmatC;
894 
895   PetscFunctionBegin;
896   MatCheckProduct(C, 1);
897   PetscCall(PetscObjectGetComm((PetscObject)C, &comm));
898   PetscCheck(!product->data, comm, PETSC_ERR_PLIB, "Product data not empty");
899   A = product->A;
900   B = product->B;
901   PetscCall(MatSeqAIJKokkosSyncDevice(A));
902   PetscCall(MatSeqAIJKokkosSyncDevice(B));
903   akok    = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
904   bkok    = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
905   csrmatA = akok->csrmat;
906   csrmatB = bkok->csrmat;
907 
908   ptype = product->type;
909   // Take advantage of the symmetry if true
910   if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
911     ptype                                          = MATPRODUCT_AB;
912     product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE;
913   }
914   if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
915     ptype                                          = MATPRODUCT_AB;
916     product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE;
917   }
918 
919   switch (ptype) {
920   case MATPRODUCT_AB:
921     transA = false;
922     transB = false;
923     PetscCall(MatSetBlockSizesFromMats(C, A, B));
924     break;
925   case MATPRODUCT_AtB:
926     transA = true;
927     transB = false;
928     if (A->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, A->cmap->bs));
929     if (B->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->cmap->bs));
930     break;
931   case MATPRODUCT_ABt:
932     transA = false;
933     transB = true;
934     if (A->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, A->rmap->bs));
935     if (B->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->rmap->bs));
936     break;
937   default:
938     SETERRQ(comm, PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]);
939   }
940   PetscCallCXX(product->data = pdata = new MatProductData_SeqAIJKokkos());
941   pdata->reusesym = product->api_user;
942 
943   /* TODO: add command line options to select spgemm algorithms */
944   auto spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_DEFAULT; /* default alg is TPL if enabled, otherwise KK */
945 
946   /* CUDA-10.2's spgemm has bugs. We prefer the SpGEMMreuse APIs introduced in cuda-11.4 */
947 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
948   #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
949   spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_KK;
950   #endif
951 #endif
952   PetscCallCXX(pdata->kh.create_spgemm_handle(spgemm_alg));
953 
954   PetscCall(PetscLogGpuTimeBegin());
955   /* TODO: Get rid of the explicit transpose once KK-spgemm implements the transpose option */
956   if (transA) {
957     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA));
958     transA = false;
959   }
960 
961   if (transB) {
962     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB));
963     transB = false;
964   }
965 
966   PetscCallCXX(KokkosSparse::spgemm_symbolic(pdata->kh, csrmatA, transA, csrmatB, transB, csrmatC));
967   /* spgemm_symbolic() only populates C's rowmap, but not C's column indices.
968     So we have to do a fake spgemm_numeric() here to get csrmatC.j_d setup, before
969     calling new Mat_SeqAIJKokkos().
970     TODO: Remove the fake spgemm_numeric() after KK fixed this problem.
971   */
972   PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, csrmatA, transA, csrmatB, transB, csrmatC));
973 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(4, 0, 0)
974   /* Query if KK outputs a sorted matrix. If not, we need to sort it */
975   auto spgemmHandle = pdata->kh.get_spgemm_handle();
976   if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(csrmatC)); /* sort_option defaults to -1 in KK!*/
977 #endif
978   PetscCall(PetscLogGpuTimeEnd());
979 
980   PetscCallCXX(ckok = new Mat_SeqAIJKokkos(csrmatC));
981   PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(C, ckok));
982   C->product->destroy = MatProductDataDestroy_SeqAIJKokkos;
983   PetscFunctionReturn(PETSC_SUCCESS);
984 }
985 
986 /* handles sparse matrix matrix ops */
987 static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat)
988 {
989   Mat_Product *product = mat->product;
990   PetscBool    Biskok = PETSC_FALSE, Ciskok = PETSC_TRUE;
991 
992   PetscFunctionBegin;
993   MatCheckProduct(mat, 1);
994   PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJKOKKOS, &Biskok));
995   if (product->type == MATPRODUCT_ABC) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJKOKKOS, &Ciskok));
996   if (Biskok && Ciskok) {
997     switch (product->type) {
998     case MATPRODUCT_AB:
999     case MATPRODUCT_AtB:
1000     case MATPRODUCT_ABt:
1001       mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos;
1002       break;
1003     case MATPRODUCT_PtAP:
1004     case MATPRODUCT_RARt:
1005     case MATPRODUCT_ABC:
1006       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
1007       break;
1008     default:
1009       break;
1010     }
1011   } else { /* fallback for AIJ */
1012     PetscCall(MatProductSetFromOptions_SeqAIJ(mat));
1013   }
1014   PetscFunctionReturn(PETSC_SUCCESS);
1015 }
1016 
1017 static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a)
1018 {
1019   Mat_SeqAIJKokkos *aijkok;
1020 
1021   PetscFunctionBegin;
1022   PetscCall(PetscLogGpuTimeBegin());
1023   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1024   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1025   KokkosBlas::scal(PetscGetKokkosExecutionSpace(), aijkok->a_dual.view_device(), a, aijkok->a_dual.view_device());
1026   PetscCall(MatSeqAIJKokkosModifyDevice(A));
1027   PetscCall(PetscLogGpuFlops(aijkok->a_dual.extent(0)));
1028   PetscCall(PetscLogGpuTimeEnd());
1029   PetscFunctionReturn(PETSC_SUCCESS);
1030 }
1031 
1032 // add a to A's diagonal (if A is square) or main diagonal (if A is rectangular)
1033 static PetscErrorCode MatShift_SeqAIJKokkos(Mat A, PetscScalar a)
1034 {
1035   Mat_SeqAIJ *aijseq = static_cast<Mat_SeqAIJ *>(A->data);
1036 
1037   PetscFunctionBegin;
1038   if (A->assembled && aijseq->diagonaldense) { // no missing diagonals
1039     PetscInt n = PetscMin(A->rmap->n, A->cmap->n);
1040 
1041     PetscCall(PetscLogGpuTimeBegin());
1042     PetscCall(MatSeqAIJKokkosSyncDevice(A));
1043     const auto  aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1044     const auto &Aa     = aijkok->a_dual.view_device();
1045     const auto &Adiag  = aijkok->diag_dual.view_device();
1046     PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) { Aa(Adiag(i)) += a; }));
1047     PetscCall(MatSeqAIJKokkosModifyDevice(A));
1048     PetscCall(PetscLogGpuFlops(n));
1049     PetscCall(PetscLogGpuTimeEnd());
1050   } else { // need reassembly, very slow!
1051     PetscCall(MatShift_Basic(A, a));
1052   }
1053   PetscFunctionReturn(PETSC_SUCCESS);
1054 }
1055 
1056 static PetscErrorCode MatDiagonalSet_SeqAIJKokkos(Mat Y, Vec D, InsertMode is)
1057 {
1058   Mat_SeqAIJ *aijseq = static_cast<Mat_SeqAIJ *>(Y->data);
1059 
1060   PetscFunctionBegin;
1061   if (Y->assembled && aijseq->diagonaldense) { // no missing diagonals
1062     ConstPetscScalarKokkosView dv;
1063     PetscInt                   n, nv;
1064 
1065     PetscCall(PetscLogGpuTimeBegin());
1066     PetscCall(MatSeqAIJKokkosSyncDevice(Y));
1067     PetscCall(VecGetKokkosView(D, &dv));
1068     PetscCall(VecGetLocalSize(D, &nv));
1069     n = PetscMin(Y->rmap->n, Y->cmap->n);
1070     PetscCheck(n == nv, PetscObjectComm((PetscObject)Y), PETSC_ERR_ARG_SIZ, "Matrix size and vector size do not match");
1071 
1072     const auto  aijkok = static_cast<Mat_SeqAIJKokkos *>(Y->spptr);
1073     const auto &Aa     = aijkok->a_dual.view_device();
1074     const auto &Adiag  = aijkok->diag_dual.view_device();
1075     PetscCallCXX(Kokkos::parallel_for(
1076       Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) {
1077         if (is == INSERT_VALUES) Aa(Adiag(i)) = dv(i);
1078         else Aa(Adiag(i)) += dv(i);
1079       }));
1080     PetscCall(VecRestoreKokkosView(D, &dv));
1081     PetscCall(MatSeqAIJKokkosModifyDevice(Y));
1082     PetscCall(PetscLogGpuFlops(n));
1083     PetscCall(PetscLogGpuTimeEnd());
1084   } else { // need reassembly, very slow!
1085     PetscCall(MatDiagonalSet_Default(Y, D, is));
1086   }
1087   PetscFunctionReturn(PETSC_SUCCESS);
1088 }
1089 
1090 static PetscErrorCode MatDiagonalScale_SeqAIJKokkos(Mat A, Vec ll, Vec rr)
1091 {
1092   Mat_SeqAIJ                *aijseq = static_cast<Mat_SeqAIJ *>(A->data);
1093   PetscInt                   m = A->rmap->n, n = A->cmap->n, nz = aijseq->nz;
1094   ConstPetscScalarKokkosView lv, rv;
1095 
1096   PetscFunctionBegin;
1097   PetscCall(PetscLogGpuTimeBegin());
1098   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1099   const auto  aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1100   const auto &Aa     = aijkok->a_dual.view_device();
1101   const auto &Ai     = aijkok->i_dual.view_device();
1102   const auto &Aj     = aijkok->j_dual.view_device();
1103   if (ll) {
1104     PetscCall(VecGetLocalSize(ll, &m));
1105     PetscCheck(m == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling vector wrong length");
1106     PetscCall(VecGetKokkosView(ll, &lv));
1107     PetscCallCXX(Kokkos::parallel_for( // for each row
1108       Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
1109         PetscInt i   = t.league_rank(); // row i
1110         PetscInt len = Ai(i + 1) - Ai(i);
1111         // scale entries on the row
1112         Kokkos::parallel_for(Kokkos::TeamThreadRange(t, len), [&](PetscInt j) { Aa(Ai(i) + j) *= lv(i); });
1113       }));
1114     PetscCall(VecRestoreKokkosView(ll, &lv));
1115     PetscCall(PetscLogGpuFlops(nz));
1116   }
1117   if (rr) {
1118     PetscCall(VecGetLocalSize(rr, &n));
1119     PetscCheck(n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Right scaling vector wrong length");
1120     PetscCall(VecGetKokkosView(rr, &rv));
1121     PetscCallCXX(Kokkos::parallel_for( // for each nonzero
1122       Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt k) { Aa(k) *= rv(Aj(k)); }));
1123     PetscCall(VecRestoreKokkosView(rr, &lv));
1124     PetscCall(PetscLogGpuFlops(nz));
1125   }
1126   PetscCall(MatSeqAIJKokkosModifyDevice(A));
1127   PetscCall(PetscLogGpuTimeEnd());
1128   PetscFunctionReturn(PETSC_SUCCESS);
1129 }
1130 
1131 static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A)
1132 {
1133   Mat_SeqAIJKokkos *aijkok;
1134 
1135   PetscFunctionBegin;
1136   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1137   if (aijkok) { /* Only zero the device if data is already there */
1138     KokkosBlas::fill(PetscGetKokkosExecutionSpace(), aijkok->a_dual.view_device(), 0.0);
1139     PetscCall(MatSeqAIJKokkosModifyDevice(A));
1140   } else { /* Might be preallocated but not assembled */
1141     PetscCall(MatZeroEntries_SeqAIJ(A));
1142   }
1143   PetscFunctionReturn(PETSC_SUCCESS);
1144 }
1145 
1146 static PetscErrorCode MatGetDiagonal_SeqAIJKokkos(Mat A, Vec x)
1147 {
1148   Mat_SeqAIJKokkos     *aijkok;
1149   PetscInt              n;
1150   PetscScalarKokkosView xv;
1151 
1152   PetscFunctionBegin;
1153   PetscCall(VecGetLocalSize(x, &n));
1154   PetscCheck(n == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector");
1155   PetscCheck(A->factortype == MAT_FACTOR_NONE, PETSC_COMM_SELF, PETSC_ERR_SUP, "MatGetDiagonal_SeqAIJKokkos not supported on factored matrices");
1156 
1157   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1158   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1159 
1160   const auto &Aa    = aijkok->a_dual.view_device();
1161   const auto &Ai    = aijkok->i_dual.view_device();
1162   const auto &Adiag = aijkok->diag_dual.view_device();
1163 
1164   PetscCall(VecGetKokkosViewWrite(x, &xv));
1165   Kokkos::parallel_for(
1166     Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) {
1167       if (Adiag(i) < Ai(i + 1)) xv(i) = Aa(Adiag(i));
1168       else xv(i) = 0;
1169     });
1170   PetscCall(VecRestoreKokkosViewWrite(x, &xv));
1171   PetscFunctionReturn(PETSC_SUCCESS);
1172 }
1173 
1174 /* Get a Kokkos View from a mat of type MatSeqAIJKokkos */
1175 PetscErrorCode MatSeqAIJGetKokkosView(Mat A, ConstMatScalarKokkosView *kv)
1176 {
1177   Mat_SeqAIJKokkos *aijkok;
1178 
1179   PetscFunctionBegin;
1180   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1181   PetscAssertPointer(kv, 2);
1182   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1183   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1184   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1185   *kv    = aijkok->a_dual.view_device();
1186   PetscFunctionReturn(PETSC_SUCCESS);
1187 }
1188 
1189 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, ConstMatScalarKokkosView *kv)
1190 {
1191   PetscFunctionBegin;
1192   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1193   PetscAssertPointer(kv, 2);
1194   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1195   PetscFunctionReturn(PETSC_SUCCESS);
1196 }
1197 
1198 PetscErrorCode MatSeqAIJGetKokkosView(Mat A, MatScalarKokkosView *kv)
1199 {
1200   Mat_SeqAIJKokkos *aijkok;
1201 
1202   PetscFunctionBegin;
1203   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1204   PetscAssertPointer(kv, 2);
1205   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1206   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1207   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1208   *kv    = aijkok->a_dual.view_device();
1209   PetscFunctionReturn(PETSC_SUCCESS);
1210 }
1211 
1212 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, MatScalarKokkosView *kv)
1213 {
1214   PetscFunctionBegin;
1215   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1216   PetscAssertPointer(kv, 2);
1217   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1218   PetscCall(MatSeqAIJKokkosModifyDevice(A));
1219   PetscFunctionReturn(PETSC_SUCCESS);
1220 }
1221 
1222 PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A, MatScalarKokkosView *kv)
1223 {
1224   Mat_SeqAIJKokkos *aijkok;
1225 
1226   PetscFunctionBegin;
1227   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1228   PetscAssertPointer(kv, 2);
1229   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1230   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1231   *kv    = aijkok->a_dual.view_device();
1232   PetscFunctionReturn(PETSC_SUCCESS);
1233 }
1234 
1235 PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A, MatScalarKokkosView *kv)
1236 {
1237   PetscFunctionBegin;
1238   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1239   PetscAssertPointer(kv, 2);
1240   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1241   PetscCall(MatSeqAIJKokkosModifyDevice(A));
1242   PetscFunctionReturn(PETSC_SUCCESS);
1243 }
1244 
1245 PetscErrorCode MatCreateSeqAIJKokkosWithKokkosViews(MPI_Comm comm, PetscInt m, PetscInt n, Kokkos::View<PetscInt *> &i_d, Kokkos::View<PetscInt *> &j_d, Kokkos::View<PetscScalar *> &a_d, Mat *A)
1246 {
1247   Mat_SeqAIJKokkos *akok;
1248 
1249   PetscFunctionBegin;
1250   PetscCallCXX(akok = new Mat_SeqAIJKokkos(m, n, j_d.extent(0), i_d, j_d, a_d));
1251   PetscCall(MatCreate(comm, A));
1252   PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok));
1253   PetscFunctionReturn(PETSC_SUCCESS);
1254 }
1255 
1256 /* Computes Y += alpha X */
1257 static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y, PetscScalar alpha, Mat X, MatStructure pattern)
1258 {
1259   Mat_SeqAIJ              *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data;
1260   Mat_SeqAIJKokkos        *xkok, *ykok, *zkok;
1261   ConstMatScalarKokkosView Xa;
1262   MatScalarKokkosView      Ya;
1263   auto                     exec = PetscGetKokkosExecutionSpace();
1264 
1265   PetscFunctionBegin;
1266   PetscCheckTypeName(Y, MATSEQAIJKOKKOS);
1267   PetscCheckTypeName(X, MATSEQAIJKOKKOS);
1268   PetscCall(MatSeqAIJKokkosSyncDevice(Y));
1269   PetscCall(MatSeqAIJKokkosSyncDevice(X));
1270   PetscCall(PetscLogGpuTimeBegin());
1271 
1272   if (pattern != SAME_NONZERO_PATTERN && x->nz == y->nz) {
1273     PetscBool e;
1274     PetscCall(PetscArraycmp(x->i, y->i, Y->rmap->n + 1, &e));
1275     if (e) {
1276       PetscCall(PetscArraycmp(x->j, y->j, y->nz, &e));
1277       if (e) pattern = SAME_NONZERO_PATTERN;
1278     }
1279   }
1280 
1281   /* cusparseDcsrgeam2() computes C = alpha A + beta B. If one knew sparsity pattern of C, one can skip
1282     cusparseScsrgeam2_bufferSizeExt() / cusparseXcsrgeam2Nnz(), and directly call cusparseScsrgeam2().
1283     If X is SUBSET_NONZERO_PATTERN of Y, we could take advantage of this cusparse feature. However,
1284     KokkosSparse::spadd(alpha,A,beta,B,C) has symbolic and numeric phases, MatAXPY does not.
1285   */
1286   ykok = static_cast<Mat_SeqAIJKokkos *>(Y->spptr);
1287   xkok = static_cast<Mat_SeqAIJKokkos *>(X->spptr);
1288   Xa   = xkok->a_dual.view_device();
1289   Ya   = ykok->a_dual.view_device();
1290 
1291   if (pattern == SAME_NONZERO_PATTERN) {
1292     KokkosBlas::axpy(exec, alpha, Xa, Ya);
1293     PetscCall(MatSeqAIJKokkosModifyDevice(Y));
1294   } else if (pattern == SUBSET_NONZERO_PATTERN) {
1295     MatRowMapKokkosView Xi = xkok->i_dual.view_device(), Yi = ykok->i_dual.view_device();
1296     MatColIdxKokkosView Xj = xkok->j_dual.view_device(), Yj = ykok->j_dual.view_device();
1297 
1298     Kokkos::parallel_for(
1299       Kokkos::TeamPolicy<>(exec, Y->rmap->n, 1), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
1300         PetscInt i = t.league_rank(); // row i
1301         Kokkos::single(Kokkos::PerTeam(t), [=]() {
1302           // Only one thread works in a team
1303           PetscInt p, q = Yi(i);
1304           for (p = Xi(i); p < Xi(i + 1); p++) {          // For each nonzero on row i of X,
1305             while (Xj(p) != Yj(q) && q < Yi(i + 1)) q++; // find the matching nonzero on row i of Y.
1306             if (Xj(p) == Yj(q)) {                        // Found it
1307               Ya(q) += alpha * Xa(p);
1308               q++;
1309             } else {
1310             // If not found, it indicates the input is wrong (X is not a SUBSET_NONZERO_PATTERN of Y).
1311             // Just insert a NaN at the beginning of row i if it is not empty, to make the result wrong.
1312 #if PETSC_PKG_KOKKOS_VERSION_GE(3, 7, 0)
1313               if (Yi(i) != Yi(i + 1)) Ya(Yi(i)) = Kokkos::ArithTraits<PetscScalar>::nan();
1314 #else
1315               if (Yi(i) != Yi(i + 1)) Ya(Yi(i)) = Kokkos::Experimental::nan("1");
1316 #endif
1317             }
1318           }
1319         });
1320       });
1321     PetscCall(MatSeqAIJKokkosModifyDevice(Y));
1322   } else { // different nonzero patterns
1323     Mat             Z;
1324     KokkosCsrMatrix zcsr;
1325     KernelHandle    kh;
1326     kh.create_spadd_handle(true); // X, Y are sorted
1327     KokkosSparse::spadd_symbolic(&kh, xkok->csrmat, ykok->csrmat, zcsr);
1328     KokkosSparse::spadd_numeric(&kh, alpha, xkok->csrmat, (PetscScalar)1.0, ykok->csrmat, zcsr);
1329     zkok = new Mat_SeqAIJKokkos(zcsr);
1330     PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, zkok, &Z));
1331     PetscCall(MatHeaderReplace(Y, &Z));
1332     kh.destroy_spadd_handle();
1333   }
1334   PetscCall(PetscLogGpuTimeEnd());
1335   PetscCall(PetscLogGpuFlops(xkok->a_dual.extent(0) * 2)); // Because we scaled X and then added it to Y
1336   PetscFunctionReturn(PETSC_SUCCESS);
1337 }
1338 
1339 struct MatCOOStruct_SeqAIJKokkos {
1340   PetscCount           n;
1341   PetscCount           Atot;
1342   PetscInt             nz;
1343   PetscCountKokkosView jmap;
1344   PetscCountKokkosView perm;
1345 
1346   MatCOOStruct_SeqAIJKokkos(const MatCOOStruct_SeqAIJ *coo_h)
1347   {
1348     nz   = coo_h->nz;
1349     n    = coo_h->n;
1350     Atot = coo_h->Atot;
1351     jmap = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), PetscCountKokkosViewHost(coo_h->jmap, nz + 1));
1352     perm = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), PetscCountKokkosViewHost(coo_h->perm, Atot));
1353   }
1354 };
1355 
1356 static PetscErrorCode MatCOOStructDestroy_SeqAIJKokkos(void **data)
1357 {
1358   PetscFunctionBegin;
1359   PetscCallCXX(delete static_cast<MatCOOStruct_SeqAIJKokkos *>(*data));
1360   PetscFunctionReturn(PETSC_SUCCESS);
1361 }
1362 
1363 static PetscErrorCode MatSetPreallocationCOO_SeqAIJKokkos(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
1364 {
1365   Mat_SeqAIJKokkos          *akok;
1366   Mat_SeqAIJ                *aseq;
1367   PetscContainer             container_h;
1368   MatCOOStruct_SeqAIJ       *coo_h;
1369   MatCOOStruct_SeqAIJKokkos *coo_d;
1370 
1371   PetscFunctionBegin;
1372   PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, coo_i, coo_j));
1373   aseq = static_cast<Mat_SeqAIJ *>(mat->data);
1374   akok = static_cast<Mat_SeqAIJKokkos *>(mat->spptr);
1375   delete akok;
1376   mat->spptr = akok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, aseq, mat->nonzerostate + 1, PETSC_FALSE);
1377   PetscCall(MatZeroEntries_SeqAIJKokkos(mat));
1378 
1379   // Copy the COO struct to device
1380   PetscCall(PetscObjectQuery((PetscObject)mat, "__PETSc_MatCOOStruct_Host", (PetscObject *)&container_h));
1381   PetscCall(PetscContainerGetPointer(container_h, (void **)&coo_h));
1382   PetscCallCXX(coo_d = new MatCOOStruct_SeqAIJKokkos(coo_h));
1383 
1384   // Put the COO struct in a container and then attach that to the matrix
1385   PetscCall(PetscObjectContainerCompose((PetscObject)mat, "__PETSc_MatCOOStruct_Device", coo_d, MatCOOStructDestroy_SeqAIJKokkos));
1386   PetscFunctionReturn(PETSC_SUCCESS);
1387 }
1388 
1389 static PetscErrorCode MatSetValuesCOO_SeqAIJKokkos(Mat A, const PetscScalar v[], InsertMode imode)
1390 {
1391   MatScalarKokkosView        Aa;
1392   ConstMatScalarKokkosView   kv;
1393   PetscMemType               memtype;
1394   PetscContainer             container;
1395   MatCOOStruct_SeqAIJKokkos *coo;
1396 
1397   PetscFunctionBegin;
1398   PetscCall(PetscObjectQuery((PetscObject)A, "__PETSc_MatCOOStruct_Device", (PetscObject *)&container));
1399   PetscCall(PetscContainerGetPointer(container, (void **)&coo));
1400 
1401   const auto &n    = coo->n;
1402   const auto &Annz = coo->nz;
1403   const auto &jmap = coo->jmap;
1404   const auto &perm = coo->perm;
1405 
1406   PetscCall(PetscGetMemType(v, &memtype));
1407   if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */
1408     kv = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), ConstMatScalarKokkosViewHost(v, n));
1409   } else {
1410     kv = ConstMatScalarKokkosView(v, n); /* Directly use v[]'s memory */
1411   }
1412 
1413   if (imode == INSERT_VALUES) PetscCall(MatSeqAIJGetKokkosViewWrite(A, &Aa)); /* write matrix values */
1414   else PetscCall(MatSeqAIJGetKokkosView(A, &Aa));                             /* read & write matrix values */
1415 
1416   PetscCall(PetscLogGpuTimeBegin());
1417   Kokkos::parallel_for(
1418     Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, Annz), KOKKOS_LAMBDA(const PetscCount i) {
1419       PetscScalar sum = 0.0;
1420       for (PetscCount k = jmap(i); k < jmap(i + 1); k++) sum += kv(perm(k));
1421       Aa(i) = (imode == INSERT_VALUES ? 0.0 : Aa(i)) + sum;
1422     });
1423   PetscCall(PetscLogGpuTimeEnd());
1424 
1425   if (imode == INSERT_VALUES) PetscCall(MatSeqAIJRestoreKokkosViewWrite(A, &Aa));
1426   else PetscCall(MatSeqAIJRestoreKokkosView(A, &Aa));
1427   PetscFunctionReturn(PETSC_SUCCESS);
1428 }
1429 
1430 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A)
1431 {
1432   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
1433 
1434   PetscFunctionBegin;
1435   A->offloadmask = PETSC_OFFLOAD_KOKKOS; /* We do not really use this flag */
1436   A->boundtocpu  = PETSC_FALSE;
1437 
1438   A->ops->assemblyend               = MatAssemblyEnd_SeqAIJKokkos;
1439   A->ops->destroy                   = MatDestroy_SeqAIJKokkos;
1440   A->ops->duplicate                 = MatDuplicate_SeqAIJKokkos;
1441   A->ops->axpy                      = MatAXPY_SeqAIJKokkos;
1442   A->ops->scale                     = MatScale_SeqAIJKokkos;
1443   A->ops->zeroentries               = MatZeroEntries_SeqAIJKokkos;
1444   A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJKokkos;
1445   A->ops->mult                      = MatMult_SeqAIJKokkos;
1446   A->ops->multadd                   = MatMultAdd_SeqAIJKokkos;
1447   A->ops->multtranspose             = MatMultTranspose_SeqAIJKokkos;
1448   A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJKokkos;
1449   A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJKokkos;
1450   A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos;
1451   A->ops->productnumeric            = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos;
1452   A->ops->transpose                 = MatTranspose_SeqAIJKokkos;
1453   A->ops->setoption                 = MatSetOption_SeqAIJKokkos;
1454   A->ops->getdiagonal               = MatGetDiagonal_SeqAIJKokkos;
1455   A->ops->shift                     = MatShift_SeqAIJKokkos;
1456   A->ops->diagonalset               = MatDiagonalSet_SeqAIJKokkos;
1457   A->ops->diagonalscale             = MatDiagonalScale_SeqAIJKokkos;
1458   A->ops->getcurrentmemtype         = MatGetCurrentMemType_SeqAIJKokkos;
1459   a->ops->getarray                  = MatSeqAIJGetArray_SeqAIJKokkos;
1460   a->ops->restorearray              = MatSeqAIJRestoreArray_SeqAIJKokkos;
1461   a->ops->getarrayread              = MatSeqAIJGetArrayRead_SeqAIJKokkos;
1462   a->ops->restorearrayread          = MatSeqAIJRestoreArrayRead_SeqAIJKokkos;
1463   a->ops->getarraywrite             = MatSeqAIJGetArrayWrite_SeqAIJKokkos;
1464   a->ops->restorearraywrite         = MatSeqAIJRestoreArrayWrite_SeqAIJKokkos;
1465   a->ops->getcsrandmemtype          = MatSeqAIJGetCSRAndMemType_SeqAIJKokkos;
1466 
1467   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJKokkos));
1468   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJKokkos));
1469 #if defined(PETSC_HAVE_HYPRE)
1470   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijkokkos_hypre_C", MatConvert_AIJ_HYPRE));
1471 #endif
1472   PetscFunctionReturn(PETSC_SUCCESS);
1473 }
1474 
1475 /*
1476    Extract the (prescribled) diagonal blocks of the matrix and then invert them
1477 
1478   Input Parameters:
1479 +  A       - the MATSEQAIJKOKKOS matrix
1480 .  bs      - block sizes in 'csr' format, i.e., the i-th block has size bs(i+1) - bs(i)
1481 .  bs2     - square of block sizes in 'csr' format, i.e., the i-th block should be stored at offset bs2(i) in diagVal[]
1482 .  blkMap  - map row ids to block ids, i.e., row i belongs to the block blkMap(i)
1483 -  work    - a pre-allocated work buffer (as big as diagVal) for use by this routine
1484 
1485   Output Parameter:
1486 .  diagVal - the (pre-allocated) buffer to store the inverted blocks (each block is stored in column-major order)
1487 */
1488 PETSC_INTERN PetscErrorCode MatInvertVariableBlockDiagonal_SeqAIJKokkos(Mat A, const PetscIntKokkosView &bs, const PetscIntKokkosView &bs2, const PetscIntKokkosView &blkMap, PetscScalarKokkosView &work, PetscScalarKokkosView &diagVal)
1489 {
1490   Mat_SeqAIJKokkos *akok    = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1491   PetscInt          nblocks = bs.extent(0) - 1;
1492 
1493   PetscFunctionBegin;
1494   PetscCall(MatSeqAIJKokkosSyncDevice(A)); // Since we'll access A's value on device
1495 
1496   // Pull out the diagonal blocks of the matrix and then invert the blocks
1497   auto Aa    = akok->a_dual.view_device();
1498   auto Ai    = akok->i_dual.view_device();
1499   auto Aj    = akok->j_dual.view_device();
1500   auto Adiag = akok->diag_dual.view_device();
1501   // TODO: how to tune the team size?
1502 #if defined(KOKKOS_ENABLE_UNIFIED_MEMORY)
1503   auto ts = Kokkos::AUTO();
1504 #else
1505   auto ts = 16; // improved performance 30% over Kokkos::AUTO() with CUDA, but failed with "Kokkos::abort: Requested Team Size is too large!" on CPUs
1506 #endif
1507   PetscCallCXX(Kokkos::parallel_for(
1508     Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), nblocks, ts), KOKKOS_LAMBDA(const KokkosTeamMemberType &teamMember) {
1509       const PetscInt bid    = teamMember.league_rank();                                                   // block id
1510       const PetscInt rstart = bs(bid);                                                                    // this block starts from this row
1511       const PetscInt m      = bs(bid + 1) - bs(bid);                                                      // size of this block
1512       const auto    &B      = Kokkos::View<PetscScalar **, Kokkos::LayoutLeft>(&diagVal(bs2(bid)), m, m); // column-major order
1513       const auto    &W      = PetscScalarKokkosView(&work(bs2(bid)), m * m);
1514 
1515       Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, m), [=](const PetscInt &r) { // r-th row in B
1516         PetscInt i = rstart + r;                                                            // i-th row in A
1517 
1518         if (Ai(i) <= Adiag(i) && Adiag(i) < Ai(i + 1)) { // if the diagonal exists (common case)
1519           PetscInt first = Adiag(i) - r;                 // we start to check nonzeros from here along this row
1520 
1521           for (PetscInt c = 0; c < m; c++) {                   // walk n steps to see what column indices we will meet
1522             if (first + c < Ai(i) || first + c >= Ai(i + 1)) { // this entry (first+c) is out of range of this row, in other words, its value is zero
1523               B(r, c) = 0.0;
1524             } else if (Aj(first + c) == rstart + c) { // this entry is right on the (rstart+c) column
1525               B(r, c) = Aa(first + c);
1526             } else { // this entry does not show up in the CSR
1527               B(r, c) = 0.0;
1528             }
1529           }
1530         } else { // rare case that the diagonal does not exist
1531           const PetscInt begin = Ai(i);
1532           const PetscInt end   = Ai(i + 1);
1533           for (PetscInt c = 0; c < m; c++) B(r, c) = 0.0;
1534           for (PetscInt j = begin; j < end; j++) { // scan the whole row; could use binary search but this is a rare case so we did not.
1535             if (rstart <= Aj(j) && Aj(j) < rstart + m) B(r, Aj(j) - rstart) = Aa(j);
1536             else if (Aj(j) >= rstart + m) break;
1537           }
1538         }
1539       });
1540 
1541       // LU-decompose B (w/o pivoting) and then invert B
1542       KokkosBatched::TeamLU<KokkosTeamMemberType, KokkosBatched::Algo::LU::Unblocked>::invoke(teamMember, B, 0.0);
1543       KokkosBatched::TeamInverseLU<KokkosTeamMemberType, KokkosBatched::Algo::InverseLU::Unblocked>::invoke(teamMember, B, W);
1544     }));
1545   // PetscLogGpuFlops() is done in the caller PCSetUp_VPBJacobi_Kokkos as we don't want to compute the flops in kernels
1546   PetscFunctionReturn(PETSC_SUCCESS);
1547 }
1548 
1549 PETSC_INTERN PetscErrorCode MatSetSeqAIJKokkosWithCSRMatrix(Mat A, Mat_SeqAIJKokkos *akok)
1550 {
1551   Mat_SeqAIJ *aseq;
1552   PetscInt    i, m, n;
1553   auto        exec = PetscGetKokkosExecutionSpace();
1554 
1555   PetscFunctionBegin;
1556   PetscCheck(!A->spptr, PETSC_COMM_SELF, PETSC_ERR_PLIB, "A->spptr is supposed to be empty");
1557 
1558   m = akok->nrows();
1559   n = akok->ncols();
1560   PetscCall(MatSetSizes(A, m, n, m, n));
1561   PetscCall(MatSetType(A, MATSEQAIJKOKKOS));
1562 
1563   /* Set up data structures of A as a MATSEQAIJ */
1564   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(A, MAT_SKIP_ALLOCATION, NULL));
1565   aseq = (Mat_SeqAIJ *)A->data;
1566 
1567   PetscCall(KokkosDualViewSyncHost(akok->i_dual, exec)); /* We always need sync'ed i, j on host */
1568   PetscCall(KokkosDualViewSyncHost(akok->j_dual, exec));
1569 
1570   aseq->i       = akok->i_host_data();
1571   aseq->j       = akok->j_host_data();
1572   aseq->a       = akok->a_host_data();
1573   aseq->nonew   = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/
1574   aseq->free_a  = PETSC_FALSE;
1575   aseq->free_ij = PETSC_FALSE;
1576   aseq->nz      = akok->nnz();
1577   aseq->maxnz   = aseq->nz;
1578 
1579   PetscCall(PetscMalloc1(m, &aseq->imax));
1580   PetscCall(PetscMalloc1(m, &aseq->ilen));
1581   for (i = 0; i < m; i++) aseq->ilen[i] = aseq->imax[i] = aseq->i[i + 1] - aseq->i[i];
1582 
1583   /* It is critical to set the nonzerostate, as we use it to check if sparsity pattern (hence data) has changed on host in MatAssemblyEnd */
1584   akok->nonzerostate = A->nonzerostate;
1585   A->spptr           = akok; /* Set A->spptr before MatAssembly so that A->spptr won't be allocated again there */
1586   PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY));
1587   PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY));
1588   PetscFunctionReturn(PETSC_SUCCESS);
1589 }
1590 
1591 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGetKokkosCsrMatrix(Mat A, KokkosCsrMatrix *csr)
1592 {
1593   PetscFunctionBegin;
1594   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1595   *csr = static_cast<Mat_SeqAIJKokkos *>(A->spptr)->csrmat;
1596   PetscFunctionReturn(PETSC_SUCCESS);
1597 }
1598 
1599 PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithKokkosCsrMatrix(MPI_Comm comm, KokkosCsrMatrix csr, Mat *A)
1600 {
1601   Mat_SeqAIJKokkos *akok;
1602 
1603   PetscFunctionBegin;
1604   PetscCallCXX(akok = new Mat_SeqAIJKokkos(csr));
1605   PetscCall(MatCreate(comm, A));
1606   PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok));
1607   PetscFunctionReturn(PETSC_SUCCESS);
1608 }
1609 
1610 /* Crete a SEQAIJKOKKOS matrix with a Mat_SeqAIJKokkos data structure
1611 
1612    Note we have names like MatSeqAIJSetPreallocationCSR, so I use capitalized CSR
1613  */
1614 PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithCSRMatrix(MPI_Comm comm, Mat_SeqAIJKokkos *akok, Mat *A)
1615 {
1616   PetscFunctionBegin;
1617   PetscCall(MatCreate(comm, A));
1618   PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok));
1619   PetscFunctionReturn(PETSC_SUCCESS);
1620 }
1621 
1622 /*@C
1623   MatCreateSeqAIJKokkos - Creates a sparse matrix in `MATSEQAIJKOKKOS` (compressed row) format
1624   (the default parallel PETSc format). This matrix will ultimately be handled by
1625   Kokkos for calculations.
1626 
1627   Collective
1628 
1629   Input Parameters:
1630 + comm - MPI communicator, set to `PETSC_COMM_SELF`
1631 . m    - number of rows
1632 . n    - number of columns
1633 . nz   - number of nonzeros per row (same for all rows), ignored if `nnz` is provided
1634 - nnz  - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL`
1635 
1636   Output Parameter:
1637 . A - the matrix
1638 
1639   Level: intermediate
1640 
1641   Notes:
1642   It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
1643   MatXXXXSetPreallocation() paradgm instead of this routine directly.
1644   [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]
1645 
1646   The AIJ format, also called
1647   compressed row storage, is fully compatible with standard Fortran
1648   storage.  That is, the stored row and column indices can begin at
1649   either one (as in Fortran) or zero.
1650 
1651   Specify the preallocated storage with either `nz` or `nnz` (not both).
1652   Set `nz` = `PETSC_DEFAULT` and `nnz` = `NULL` for PETSc to control dynamic memory
1653   allocation.
1654 
1655 .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`
1656 @*/
1657 PetscErrorCode MatCreateSeqAIJKokkos(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A)
1658 {
1659   PetscFunctionBegin;
1660   PetscCall(PetscKokkosInitializeCheck());
1661   PetscCall(MatCreate(comm, A));
1662   PetscCall(MatSetSizes(*A, m, n, m, n));
1663   PetscCall(MatSetType(*A, MATSEQAIJKOKKOS));
1664   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz));
1665   PetscFunctionReturn(PETSC_SUCCESS);
1666 }
1667 
1668 // After matrix numeric factorization, there are still steps to do before triangular solve can be called.
1669 // For example, for transpose solve, we might need to compute the transpose matrices if the solver does not support it (such as KK, while cusparse does).
1670 // In cusparse, one has to call cusparseSpSV_analysis() with updated triangular matrix values before calling cusparseSpSV_solve().
1671 // Simiarily, in KK sptrsv_symbolic() has to be called before sptrsv_solve(). We put these steps in MatSeqAIJKokkos{Transpose}SolveCheck.
1672 static PetscErrorCode MatSeqAIJKokkosSolveCheck(Mat A)
1673 {
1674   Mat_SeqAIJKokkosTriFactors *factors   = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1675   const PetscBool             has_lower = factors->iL_d.extent(0) ? PETSC_TRUE : PETSC_FALSE; // false with Choleksy
1676   const PetscBool             has_upper = factors->iU_d.extent(0) ? PETSC_TRUE : PETSC_FALSE; // true with LU and Choleksy
1677 
1678   PetscFunctionBegin;
1679   if (!factors->sptrsv_symbolic_completed) { // If sptrsv_symbolic was not called yet
1680     if (has_upper) PetscCallCXX(sptrsv_symbolic(&factors->khU, factors->iU_d, factors->jU_d, factors->aU_d));
1681     if (has_lower) PetscCallCXX(sptrsv_symbolic(&factors->khL, factors->iL_d, factors->jL_d, factors->aL_d));
1682     factors->sptrsv_symbolic_completed = PETSC_TRUE;
1683   }
1684   PetscFunctionReturn(PETSC_SUCCESS);
1685 }
1686 
1687 static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A)
1688 {
1689   const PetscInt              n         = A->rmap->n;
1690   Mat_SeqAIJKokkosTriFactors *factors   = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1691   const PetscBool             has_lower = factors->iL_d.extent(0) ? PETSC_TRUE : PETSC_FALSE; // false with Choleksy
1692   const PetscBool             has_upper = factors->iU_d.extent(0) ? PETSC_TRUE : PETSC_FALSE; // true with LU or Choleksy
1693 
1694   PetscFunctionBegin;
1695   if (!factors->transpose_updated) {
1696     if (has_upper) {
1697       if (!factors->iUt_d.extent(0)) {                                 // Allocate Ut on device if not yet
1698         factors->iUt_d = MatRowMapKokkosView("factors->iUt_d", n + 1); // KK requires this view to be initialized to 0 to call transpose_matrix
1699         factors->jUt_d = MatColIdxKokkosView(NoInit("factors->jUt_d"), factors->jU_d.extent(0));
1700         factors->aUt_d = MatScalarKokkosView(NoInit("factors->aUt_d"), factors->aU_d.extent(0));
1701       }
1702 
1703       if (factors->iU_h.extent(0)) { // If U is on host (factorization was done on host), we also compute the transpose on host
1704         if (!factors->U) {
1705           Mat_SeqAIJ *seq;
1706 
1707           PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF, n, n, factors->iU_h.data(), factors->jU_h.data(), factors->aU_h.data(), &factors->U));
1708           PetscCall(MatTranspose(factors->U, MAT_INITIAL_MATRIX, &factors->Ut));
1709 
1710           seq            = static_cast<Mat_SeqAIJ *>(factors->Ut->data);
1711           factors->iUt_h = MatRowMapKokkosViewHost(seq->i, n + 1);
1712           factors->jUt_h = MatColIdxKokkosViewHost(seq->j, seq->nz);
1713           factors->aUt_h = MatScalarKokkosViewHost(seq->a, seq->nz);
1714         } else {
1715           PetscCall(MatTranspose(factors->U, MAT_REUSE_MATRIX, &factors->Ut)); // Matrix Ut' data is aliased with {i, j, a}Ut_h
1716         }
1717         // Copy Ut from host to device
1718         PetscCallCXX(Kokkos::deep_copy(factors->iUt_d, factors->iUt_h));
1719         PetscCallCXX(Kokkos::deep_copy(factors->jUt_d, factors->jUt_h));
1720         PetscCallCXX(Kokkos::deep_copy(factors->aUt_d, factors->aUt_h));
1721       } else { // If U was computed on device, we also compute the transpose there
1722         // TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices. We have to sort the indices, until KK provides finer control options.
1723         PetscCallCXX(transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iU_d,
1724                                                                                                                                                                                                                                factors->jU_d, factors->aU_d,
1725                                                                                                                                                                                                                                factors->iUt_d, factors->jUt_d,
1726                                                                                                                                                                                                                                factors->aUt_d));
1727         PetscCallCXX(sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iUt_d, factors->jUt_d, factors->aUt_d));
1728       }
1729       PetscCallCXX(sptrsv_symbolic(&factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d));
1730     }
1731 
1732     // do the same for L with LU
1733     if (has_lower) {
1734       if (!factors->iLt_d.extent(0)) {                                 // Allocate Lt on device if not yet
1735         factors->iLt_d = MatRowMapKokkosView("factors->iLt_d", n + 1); // KK requires this view to be initialized to 0 to call transpose_matrix
1736         factors->jLt_d = MatColIdxKokkosView(NoInit("factors->jLt_d"), factors->jL_d.extent(0));
1737         factors->aLt_d = MatScalarKokkosView(NoInit("factors->aLt_d"), factors->aL_d.extent(0));
1738       }
1739 
1740       if (factors->iL_h.extent(0)) { // If L is on host, we also compute the transpose on host
1741         if (!factors->L) {
1742           Mat_SeqAIJ *seq;
1743 
1744           PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF, n, n, factors->iL_h.data(), factors->jL_h.data(), factors->aL_h.data(), &factors->L));
1745           PetscCall(MatTranspose(factors->L, MAT_INITIAL_MATRIX, &factors->Lt));
1746 
1747           seq            = static_cast<Mat_SeqAIJ *>(factors->Lt->data);
1748           factors->iLt_h = MatRowMapKokkosViewHost(seq->i, n + 1);
1749           factors->jLt_h = MatColIdxKokkosViewHost(seq->j, seq->nz);
1750           factors->aLt_h = MatScalarKokkosViewHost(seq->a, seq->nz);
1751         } else {
1752           PetscCall(MatTranspose(factors->L, MAT_REUSE_MATRIX, &factors->Lt)); // Matrix Lt' data is aliased with {i, j, a}Lt_h
1753         }
1754         // Copy Lt from host to device
1755         PetscCallCXX(Kokkos::deep_copy(factors->iLt_d, factors->iLt_h));
1756         PetscCallCXX(Kokkos::deep_copy(factors->jLt_d, factors->jLt_h));
1757         PetscCallCXX(Kokkos::deep_copy(factors->aLt_d, factors->aLt_h));
1758       } else { // If L was computed on device, we also compute the transpose there
1759         // TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices. We have to sort the indices, until KK provides finer control options.
1760         PetscCallCXX(transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iL_d,
1761                                                                                                                                                                                                                                factors->jL_d, factors->aL_d,
1762                                                                                                                                                                                                                                factors->iLt_d, factors->jLt_d,
1763                                                                                                                                                                                                                                factors->aLt_d));
1764         PetscCallCXX(sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iLt_d, factors->jLt_d, factors->aLt_d));
1765       }
1766       PetscCallCXX(sptrsv_symbolic(&factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d));
1767     }
1768 
1769     factors->transpose_updated = PETSC_TRUE;
1770   }
1771   PetscFunctionReturn(PETSC_SUCCESS);
1772 }
1773 
1774 // Solve Ax = b, with RAR = U^T D U, where R is the row (and col) permutation matrix on A.
1775 // R is represented by rowperm in factors. If R is identity (i.e, no reordering), then rowperm is empty.
1776 static PetscErrorCode MatSolve_SeqAIJKokkos_Cholesky(Mat A, Vec bb, Vec xx)
1777 {
1778   auto                        exec    = PetscGetKokkosExecutionSpace();
1779   Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1780   PetscInt                    m       = A->rmap->n;
1781   PetscScalarKokkosView       D       = factors->D_d;
1782   PetscScalarKokkosView       X, Y, B; // alias
1783   ConstPetscScalarKokkosView  b;
1784   PetscScalarKokkosView       x;
1785   PetscIntKokkosView         &rowperm  = factors->rowperm;
1786   PetscBool                   identity = rowperm.extent(0) ? PETSC_FALSE : PETSC_TRUE;
1787 
1788   PetscFunctionBegin;
1789   PetscCall(PetscLogGpuTimeBegin());
1790   PetscCall(MatSeqAIJKokkosSolveCheck(A));          // for UX = T
1791   PetscCall(MatSeqAIJKokkosTransposeSolveCheck(A)); // for U^T Y = B
1792   PetscCall(VecGetKokkosView(bb, &b));
1793   PetscCall(VecGetKokkosViewWrite(xx, &x));
1794 
1795   // Solve U^T Y = B
1796   if (identity) { // Reorder b with the row permutation
1797     B = PetscScalarKokkosView(const_cast<PetscScalar *>(b.data()), b.extent(0));
1798     Y = factors->workVector;
1799   } else {
1800     B = factors->workVector;
1801     PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { B(i) = b(rowperm(i)); }));
1802     Y = x;
1803   }
1804   PetscCallCXX(sptrsv_solve(exec, &factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d, B, Y));
1805 
1806   // Solve diag(D) Y' = Y.
1807   // Actually just do Y' = Y*D since D is already inverted in MatCholeskyFactorNumeric_SeqAIJ(). It is basically a vector element-wise multiplication.
1808   PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { Y(i) = Y(i) * D(i); }));
1809 
1810   // Solve UX = Y
1811   if (identity) {
1812     X = x;
1813   } else {
1814     X = factors->workVector; // B is not needed anymore
1815   }
1816   PetscCallCXX(sptrsv_solve(exec, &factors->khU, factors->iU_d, factors->jU_d, factors->aU_d, Y, X));
1817 
1818   // Reorder X with the inverse column (row) permutation
1819   if (!identity) PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { x(rowperm(i)) = X(i); }));
1820 
1821   PetscCall(VecRestoreKokkosView(bb, &b));
1822   PetscCall(VecRestoreKokkosViewWrite(xx, &x));
1823   PetscCall(PetscLogGpuTimeEnd());
1824   PetscFunctionReturn(PETSC_SUCCESS);
1825 }
1826 
1827 // Solve Ax = b, with RAC = LU, where R and C are row and col permutation matrices on A respectively.
1828 // R and C are represented by rowperm and colperm in factors.
1829 // If R or C is identity (i.e, no reordering), then rowperm or colperm is empty.
1830 static PetscErrorCode MatSolve_SeqAIJKokkos_LU(Mat A, Vec bb, Vec xx)
1831 {
1832   auto                        exec    = PetscGetKokkosExecutionSpace();
1833   Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1834   PetscInt                    m       = A->rmap->n;
1835   PetscScalarKokkosView       X, Y, B; // alias
1836   ConstPetscScalarKokkosView  b;
1837   PetscScalarKokkosView       x;
1838   PetscIntKokkosView         &rowperm      = factors->rowperm;
1839   PetscIntKokkosView         &colperm      = factors->colperm;
1840   PetscBool                   row_identity = rowperm.extent(0) ? PETSC_FALSE : PETSC_TRUE;
1841   PetscBool                   col_identity = colperm.extent(0) ? PETSC_FALSE : PETSC_TRUE;
1842 
1843   PetscFunctionBegin;
1844   PetscCall(PetscLogGpuTimeBegin());
1845   PetscCall(MatSeqAIJKokkosSolveCheck(A));
1846   PetscCall(VecGetKokkosView(bb, &b));
1847   PetscCall(VecGetKokkosViewWrite(xx, &x));
1848 
1849   // Solve L Y = B (i.e., L (U C^- x) = R b).  R b indicates applying the row permutation on b.
1850   if (row_identity) {
1851     B = PetscScalarKokkosView(const_cast<PetscScalar *>(b.data()), b.extent(0));
1852     Y = factors->workVector;
1853   } else {
1854     B = factors->workVector;
1855     PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { B(i) = b(rowperm(i)); }));
1856     Y = x;
1857   }
1858   PetscCallCXX(sptrsv_solve(exec, &factors->khL, factors->iL_d, factors->jL_d, factors->aL_d, B, Y));
1859 
1860   // Solve U C^- x = Y
1861   if (col_identity) {
1862     X = x;
1863   } else {
1864     X = factors->workVector;
1865   }
1866   PetscCallCXX(sptrsv_solve(exec, &factors->khU, factors->iU_d, factors->jU_d, factors->aU_d, Y, X));
1867 
1868   // x = C X; Reorder X with the inverse col permutation
1869   if (!col_identity) PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { x(colperm(i)) = X(i); }));
1870 
1871   PetscCall(VecRestoreKokkosView(bb, &b));
1872   PetscCall(VecRestoreKokkosViewWrite(xx, &x));
1873   PetscCall(PetscLogGpuTimeEnd());
1874   PetscFunctionReturn(PETSC_SUCCESS);
1875 }
1876 
1877 // Solve A^T x = b, with RAC = LU, where R and C are row and col permutation matrices on A respectively.
1878 // R and C are represented by rowperm and colperm in factors.
1879 // If R or C is identity (i.e, no reordering), then rowperm or colperm is empty.
1880 // A = R^-1 L U C^-1, so A^T = C^-T U^T L^T R^-T. But since C^- = C^T, R^- = R^T, we have A^T = C U^T L^T R.
1881 static PetscErrorCode MatSolveTranspose_SeqAIJKokkos_LU(Mat A, Vec bb, Vec xx)
1882 {
1883   auto                        exec    = PetscGetKokkosExecutionSpace();
1884   Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1885   PetscInt                    m       = A->rmap->n;
1886   PetscScalarKokkosView       X, Y, B; // alias
1887   ConstPetscScalarKokkosView  b;
1888   PetscScalarKokkosView       x;
1889   PetscIntKokkosView         &rowperm      = factors->rowperm;
1890   PetscIntKokkosView         &colperm      = factors->colperm;
1891   PetscBool                   row_identity = rowperm.extent(0) ? PETSC_FALSE : PETSC_TRUE;
1892   PetscBool                   col_identity = colperm.extent(0) ? PETSC_FALSE : PETSC_TRUE;
1893 
1894   PetscFunctionBegin;
1895   PetscCall(PetscLogGpuTimeBegin());
1896   PetscCall(MatSeqAIJKokkosTransposeSolveCheck(A)); // Update L^T, U^T if needed, and do sptrsv symbolic for L^T, U^T
1897   PetscCall(VecGetKokkosView(bb, &b));
1898   PetscCall(VecGetKokkosViewWrite(xx, &x));
1899 
1900   // Solve U^T Y = B (i.e., U^T (L^T R x) = C^- b).  Note C^- b = C^T b, which means applying the column permutation on b.
1901   if (col_identity) { // Reorder b with the col permutation
1902     B = PetscScalarKokkosView(const_cast<PetscScalar *>(b.data()), b.extent(0));
1903     Y = factors->workVector;
1904   } else {
1905     B = factors->workVector;
1906     PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { B(i) = b(colperm(i)); }));
1907     Y = x;
1908   }
1909   PetscCallCXX(sptrsv_solve(exec, &factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d, B, Y));
1910 
1911   // Solve L^T X = Y
1912   if (row_identity) {
1913     X = x;
1914   } else {
1915     X = factors->workVector;
1916   }
1917   PetscCallCXX(sptrsv_solve(exec, &factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d, Y, X));
1918 
1919   // x = R^- X = R^T X; Reorder X with the inverse row permutation
1920   if (!row_identity) PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { x(rowperm(i)) = X(i); }));
1921 
1922   PetscCall(VecRestoreKokkosView(bb, &b));
1923   PetscCall(VecRestoreKokkosViewWrite(xx, &x));
1924   PetscCall(PetscLogGpuTimeEnd());
1925   PetscFunctionReturn(PETSC_SUCCESS);
1926 }
1927 
1928 static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info)
1929 {
1930   PetscFunctionBegin;
1931   PetscCall(MatSeqAIJKokkosSyncHost(A));
1932   PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info));
1933 
1934   if (!info->solveonhost) { // if solve on host, then we don't need to copy L, U to device
1935     Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr;
1936     Mat_SeqAIJ                 *b       = static_cast<Mat_SeqAIJ *>(B->data);
1937     const PetscInt             *Bi = b->i, *Bj = b->j, *Bdiag = b->diag;
1938     const MatScalar            *Ba = b->a;
1939     PetscInt                    m = B->rmap->n, n = B->cmap->n;
1940 
1941     if (factors->iL_h.extent(0) == 0) { // Allocate memory and copy the L, U structure for the first time
1942       // Allocate memory and copy the structure
1943       factors->iL_h = MatRowMapKokkosViewHost(NoInit("iL_h"), m + 1);
1944       factors->jL_h = MatColIdxKokkosViewHost(NoInit("jL_h"), (Bi[m] - Bi[0]) + m); // + the diagonal entries
1945       factors->aL_h = MatScalarKokkosViewHost(NoInit("aL_h"), (Bi[m] - Bi[0]) + m);
1946       factors->iU_h = MatRowMapKokkosViewHost(NoInit("iU_h"), m + 1);
1947       factors->jU_h = MatColIdxKokkosViewHost(NoInit("jU_h"), (Bdiag[0] - Bdiag[m]));
1948       factors->aU_h = MatScalarKokkosViewHost(NoInit("aU_h"), (Bdiag[0] - Bdiag[m]));
1949 
1950       PetscInt *Li = factors->iL_h.data();
1951       PetscInt *Lj = factors->jL_h.data();
1952       PetscInt *Ui = factors->iU_h.data();
1953       PetscInt *Uj = factors->jU_h.data();
1954 
1955       Li[0] = Ui[0] = 0;
1956       for (PetscInt i = 0; i < m; i++) {
1957         PetscInt llen = Bi[i + 1] - Bi[i];       // exclusive of the diagonal entry
1958         PetscInt ulen = Bdiag[i] - Bdiag[i + 1]; // inclusive of the diagonal entry
1959 
1960         PetscCall(PetscArraycpy(Lj + Li[i], Bj + Bi[i], llen)); // entries of L on the left of the diagonal
1961         Lj[Li[i] + llen] = i;                                   // diagonal entry of L
1962 
1963         Uj[Ui[i]] = i;                                                             // diagonal entry of U
1964         PetscCall(PetscArraycpy(Uj + Ui[i] + 1, Bj + Bdiag[i + 1] + 1, ulen - 1)); // entries of U on  the right of the diagonal
1965 
1966         Li[i + 1] = Li[i] + llen + 1;
1967         Ui[i + 1] = Ui[i] + ulen;
1968       }
1969 
1970       factors->iL_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->iL_h);
1971       factors->jL_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->jL_h);
1972       factors->iU_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->iU_h);
1973       factors->jU_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->jU_h);
1974       factors->aL_d = Kokkos::create_mirror_view(DefaultMemorySpace(), factors->aL_h);
1975       factors->aU_d = Kokkos::create_mirror_view(DefaultMemorySpace(), factors->aU_h);
1976 
1977       // Copy row/col permutation to device
1978       IS        rowperm = ((Mat_SeqAIJ *)B->data)->row;
1979       PetscBool row_identity;
1980       PetscCall(ISIdentity(rowperm, &row_identity));
1981       if (!row_identity) {
1982         const PetscInt *ip;
1983 
1984         PetscCall(ISGetIndices(rowperm, &ip));
1985         factors->rowperm = PetscIntKokkosView(NoInit("rowperm"), m);
1986         PetscCallCXX(Kokkos::deep_copy(factors->rowperm, PetscIntKokkosViewHost(const_cast<PetscInt *>(ip), m)));
1987         PetscCall(ISRestoreIndices(rowperm, &ip));
1988         PetscCall(PetscLogCpuToGpu(m * sizeof(PetscInt)));
1989       }
1990 
1991       IS        colperm = ((Mat_SeqAIJ *)B->data)->col;
1992       PetscBool col_identity;
1993       PetscCall(ISIdentity(colperm, &col_identity));
1994       if (!col_identity) {
1995         const PetscInt *ip;
1996 
1997         PetscCall(ISGetIndices(colperm, &ip));
1998         factors->colperm = PetscIntKokkosView(NoInit("colperm"), n);
1999         PetscCallCXX(Kokkos::deep_copy(factors->colperm, PetscIntKokkosViewHost(const_cast<PetscInt *>(ip), n)));
2000         PetscCall(ISRestoreIndices(colperm, &ip));
2001         PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
2002       }
2003 
2004       /* Create sptrsv handles for L, U and their transpose */
2005 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
2006       auto sptrsv_alg = SPTRSVAlgorithm::SPTRSV_CUSPARSE;
2007 #else
2008       auto sptrsv_alg = SPTRSVAlgorithm::SEQLVLSCHD_TP1;
2009 #endif
2010       factors->khL.create_sptrsv_handle(sptrsv_alg, m, true /* L is lower tri */);
2011       factors->khU.create_sptrsv_handle(sptrsv_alg, m, false /* U is not lower tri */);
2012       factors->khLt.create_sptrsv_handle(sptrsv_alg, m, false /* L^T is not lower tri */);
2013       factors->khUt.create_sptrsv_handle(sptrsv_alg, m, true /* U^T is lower tri */);
2014     }
2015 
2016     // Copy the value
2017     for (PetscInt i = 0; i < m; i++) {
2018       PetscInt        llen = Bi[i + 1] - Bi[i];
2019       PetscInt        ulen = Bdiag[i] - Bdiag[i + 1];
2020       const PetscInt *Li   = factors->iL_h.data();
2021       const PetscInt *Ui   = factors->iU_h.data();
2022 
2023       PetscScalar *La = factors->aL_h.data();
2024       PetscScalar *Ua = factors->aU_h.data();
2025 
2026       PetscCall(PetscArraycpy(La + Li[i], Ba + Bi[i], llen)); // entries of L
2027       La[Li[i] + llen] = 1.0;                                 // diagonal entry
2028 
2029       Ua[Ui[i]] = 1.0 / Ba[Bdiag[i]];                                            // diagonal entry
2030       PetscCall(PetscArraycpy(Ua + Ui[i] + 1, Ba + Bdiag[i + 1] + 1, ulen - 1)); // entries of U
2031     }
2032 
2033     PetscCallCXX(Kokkos::deep_copy(factors->aL_d, factors->aL_h));
2034     PetscCallCXX(Kokkos::deep_copy(factors->aU_d, factors->aU_h));
2035     // Once the factors' values have changed, we need to update their transpose and redo sptrsv symbolic
2036     factors->transpose_updated         = PETSC_FALSE;
2037     factors->sptrsv_symbolic_completed = PETSC_FALSE;
2038 
2039     B->ops->solve          = MatSolve_SeqAIJKokkos_LU;
2040     B->ops->solvetranspose = MatSolveTranspose_SeqAIJKokkos_LU;
2041   }
2042 
2043   B->ops->matsolve          = NULL;
2044   B->ops->matsolvetranspose = NULL;
2045   PetscFunctionReturn(PETSC_SUCCESS);
2046 }
2047 
2048 static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos_ILU0(Mat B, Mat A, const MatFactorInfo *info)
2049 {
2050   Mat_SeqAIJKokkos           *aijkok   = (Mat_SeqAIJKokkos *)A->spptr;
2051   Mat_SeqAIJKokkosTriFactors *factors  = (Mat_SeqAIJKokkosTriFactors *)B->spptr;
2052   PetscInt                    fill_lev = info->levels;
2053 
2054   PetscFunctionBegin;
2055   PetscCall(PetscLogGpuTimeBegin());
2056   PetscCheck(!info->factoronhost, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "MatFactorInfo.factoronhost should be false");
2057   PetscCall(MatSeqAIJKokkosSyncDevice(A));
2058 
2059   auto a_d = aijkok->a_dual.view_device();
2060   auto i_d = aijkok->i_dual.view_device();
2061   auto j_d = aijkok->j_dual.view_device();
2062 
2063   PetscCallCXX(spiluk_numeric(&factors->kh, fill_lev, i_d, j_d, a_d, factors->iL_d, factors->jL_d, factors->aL_d, factors->iU_d, factors->jU_d, factors->aU_d));
2064 
2065   B->assembled              = PETSC_TRUE;
2066   B->preallocated           = PETSC_TRUE;
2067   B->ops->solve             = MatSolve_SeqAIJKokkos_LU;
2068   B->ops->solvetranspose    = MatSolveTranspose_SeqAIJKokkos_LU;
2069   B->ops->matsolve          = NULL;
2070   B->ops->matsolvetranspose = NULL;
2071 
2072   /* Once the factors' value changed, we need to update their transpose and sptrsv handle */
2073   factors->transpose_updated         = PETSC_FALSE;
2074   factors->sptrsv_symbolic_completed = PETSC_FALSE;
2075   /* TODO: log flops, but how to know that? */
2076   PetscCall(PetscLogGpuTimeEnd());
2077   PetscFunctionReturn(PETSC_SUCCESS);
2078 }
2079 
2080 // Use KK's spiluk_symbolic() to do ILU0 symbolic factorization, with no row/col reordering
2081 static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos_ILU0(Mat B, Mat A, IS, IS, const MatFactorInfo *info)
2082 {
2083   Mat_SeqAIJKokkos           *aijkok;
2084   Mat_SeqAIJ                 *b;
2085   Mat_SeqAIJKokkosTriFactors *factors  = (Mat_SeqAIJKokkosTriFactors *)B->spptr;
2086   PetscInt                    fill_lev = info->levels;
2087   PetscInt                    nnzA     = ((Mat_SeqAIJ *)A->data)->nz, nnzL, nnzU;
2088   PetscInt                    n        = A->rmap->n;
2089 
2090   PetscFunctionBegin;
2091   PetscCheck(!info->factoronhost, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "MatFactorInfo's factoronhost should be false as we are doing it on device right now");
2092   PetscCall(MatSeqAIJKokkosSyncDevice(A));
2093 
2094   /* Create a spiluk handle and then do symbolic factorization */
2095   nnzL = nnzU = PetscRealIntMultTruncate(info->fill, nnzA);
2096   factors->kh.create_spiluk_handle(SPILUKAlgorithm::SEQLVLSCHD_TP1, n, nnzL, nnzU);
2097 
2098   auto spiluk_handle = factors->kh.get_spiluk_handle();
2099 
2100   Kokkos::realloc(factors->iL_d, n + 1); /* Free old arrays and realloc */
2101   Kokkos::realloc(factors->jL_d, spiluk_handle->get_nnzL());
2102   Kokkos::realloc(factors->iU_d, n + 1);
2103   Kokkos::realloc(factors->jU_d, spiluk_handle->get_nnzU());
2104 
2105   aijkok   = (Mat_SeqAIJKokkos *)A->spptr;
2106   auto i_d = aijkok->i_dual.view_device();
2107   auto j_d = aijkok->j_dual.view_device();
2108   PetscCallCXX(spiluk_symbolic(&factors->kh, fill_lev, i_d, j_d, factors->iL_d, factors->jL_d, factors->iU_d, factors->jU_d));
2109   /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */
2110 
2111   Kokkos::resize(factors->jL_d, spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */
2112   Kokkos::resize(factors->jU_d, spiluk_handle->get_nnzU());
2113   Kokkos::realloc(factors->aL_d, spiluk_handle->get_nnzL()); /* No need to retain old value */
2114   Kokkos::realloc(factors->aU_d, spiluk_handle->get_nnzU());
2115 
2116   /* TODO: add options to select sptrsv algorithms */
2117   /* Create sptrsv handles for L, U and their transpose */
2118 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
2119   auto sptrsv_alg = SPTRSVAlgorithm::SPTRSV_CUSPARSE;
2120 #else
2121   auto sptrsv_alg = SPTRSVAlgorithm::SEQLVLSCHD_TP1;
2122 #endif
2123 
2124   factors->khL.create_sptrsv_handle(sptrsv_alg, n, true /* L is lower tri */);
2125   factors->khU.create_sptrsv_handle(sptrsv_alg, n, false /* U is not lower tri */);
2126   factors->khLt.create_sptrsv_handle(sptrsv_alg, n, false /* L^T is not lower tri */);
2127   factors->khUt.create_sptrsv_handle(sptrsv_alg, n, true /* U^T is lower tri */);
2128 
2129   /* Fill fields of the factor matrix B */
2130   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(B, MAT_SKIP_ALLOCATION, NULL));
2131   b     = (Mat_SeqAIJ *)B->data;
2132   b->nz = b->maxnz          = spiluk_handle->get_nnzL() + spiluk_handle->get_nnzU();
2133   B->info.fill_ratio_given  = info->fill;
2134   B->info.fill_ratio_needed = nnzA > 0 ? ((PetscReal)b->nz) / ((PetscReal)nnzA) : 1.0;
2135 
2136   B->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJKokkos_ILU0;
2137   PetscFunctionReturn(PETSC_SUCCESS);
2138 }
2139 
2140 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
2141 {
2142   PetscFunctionBegin;
2143   PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
2144   PetscCheck(!B->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr");
2145   PetscCallCXX(B->spptr = new Mat_SeqAIJKokkosTriFactors(B->rmap->n));
2146   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos;
2147   PetscFunctionReturn(PETSC_SUCCESS);
2148 }
2149 
2150 static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
2151 {
2152   PetscBool row_identity = PETSC_FALSE, col_identity = PETSC_FALSE;
2153 
2154   PetscFunctionBegin;
2155   if (!info->factoronhost) {
2156     PetscCall(ISIdentity(isrow, &row_identity));
2157     PetscCall(ISIdentity(iscol, &col_identity));
2158   }
2159 
2160   PetscCheck(!B->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr");
2161   PetscCallCXX(B->spptr = new Mat_SeqAIJKokkosTriFactors(B->rmap->n));
2162 
2163   if (!info->factoronhost && !info->levels && row_identity && col_identity) { // if level 0 and no reordering
2164     PetscCall(MatILUFactorSymbolic_SeqAIJKokkos_ILU0(B, A, isrow, iscol, info));
2165   } else {
2166     PetscCall(MatILUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info)); // otherwise, use PETSc's ILU on host
2167     B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos;
2168   }
2169   PetscFunctionReturn(PETSC_SUCCESS);
2170 }
2171 
2172 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info)
2173 {
2174   PetscFunctionBegin;
2175   PetscCall(MatSeqAIJKokkosSyncHost(A));
2176   PetscCall(MatCholeskyFactorNumeric_SeqAIJ(B, A, info));
2177 
2178   if (!info->solveonhost) { // if solve on host, then we don't need to copy L, U to device
2179     Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr;
2180     Mat_SeqAIJ                 *b       = static_cast<Mat_SeqAIJ *>(B->data);
2181     const PetscInt             *Bi = b->i, *Bj = b->j, *Bdiag = b->diag;
2182     const MatScalar            *Ba = b->a;
2183     PetscInt                    m  = B->rmap->n;
2184 
2185     if (factors->iU_h.extent(0) == 0) { // First time of numeric factorization
2186       // Allocate memory and copy the structure
2187       factors->iU_h = PetscIntKokkosViewHost(const_cast<PetscInt *>(Bi), m + 1); // wrap Bi as iU_h
2188       factors->jU_h = MatColIdxKokkosViewHost(NoInit("jU_h"), Bi[m]);
2189       factors->aU_h = MatScalarKokkosViewHost(NoInit("aU_h"), Bi[m]);
2190       factors->D_h  = MatScalarKokkosViewHost(NoInit("D_h"), m);
2191       factors->aU_d = Kokkos::create_mirror_view(DefaultMemorySpace(), factors->aU_h);
2192       factors->D_d  = Kokkos::create_mirror_view(DefaultMemorySpace(), factors->D_h);
2193 
2194       // Build jU_h from the skewed Aj
2195       PetscInt *Uj = factors->jU_h.data();
2196       for (PetscInt i = 0; i < m; i++) {
2197         PetscInt ulen = Bi[i + 1] - Bi[i];
2198         Uj[Bi[i]]     = i;                                              // diagonal entry
2199         PetscCall(PetscArraycpy(Uj + Bi[i] + 1, Bj + Bi[i], ulen - 1)); // entries of U on the right of the diagonal
2200       }
2201 
2202       // Copy iU, jU to device
2203       PetscCallCXX(factors->iU_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->iU_h));
2204       PetscCallCXX(factors->jU_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->jU_h));
2205 
2206       // Copy row/col permutation to device
2207       IS        rowperm = ((Mat_SeqAIJ *)B->data)->row;
2208       PetscBool row_identity;
2209       PetscCall(ISIdentity(rowperm, &row_identity));
2210       if (!row_identity) {
2211         const PetscInt *ip;
2212 
2213         PetscCall(ISGetIndices(rowperm, &ip));
2214         PetscCallCXX(factors->rowperm = PetscIntKokkosView(NoInit("rowperm"), m));
2215         PetscCallCXX(Kokkos::deep_copy(factors->rowperm, PetscIntKokkosViewHost(const_cast<PetscInt *>(ip), m)));
2216         PetscCall(ISRestoreIndices(rowperm, &ip));
2217         PetscCall(PetscLogCpuToGpu(m * sizeof(PetscInt)));
2218       }
2219 
2220       // Create sptrsv handles for U and U^T
2221 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
2222       auto sptrsv_alg = SPTRSVAlgorithm::SPTRSV_CUSPARSE;
2223 #else
2224       auto sptrsv_alg = SPTRSVAlgorithm::SEQLVLSCHD_TP1;
2225 #endif
2226       factors->khU.create_sptrsv_handle(sptrsv_alg, m, false /* U is not lower tri */);
2227       factors->khUt.create_sptrsv_handle(sptrsv_alg, m, true /* U^T is lower tri */);
2228     }
2229     // These pointers were set MatCholeskyFactorNumeric_SeqAIJ(), so we always need to update them
2230     B->ops->solve          = MatSolve_SeqAIJKokkos_Cholesky;
2231     B->ops->solvetranspose = MatSolve_SeqAIJKokkos_Cholesky;
2232 
2233     // Copy the value
2234     PetscScalar *Ua = factors->aU_h.data();
2235     PetscScalar *D  = factors->D_h.data();
2236     for (PetscInt i = 0; i < m; i++) {
2237       D[i]      = Ba[Bdiag[i]];     // actually Aa[Adiag[i]] is the inverse of the diagonal
2238       Ua[Bi[i]] = (PetscScalar)1.0; // set the unit diagonal for U
2239       for (PetscInt k = 0; k < Bi[i + 1] - Bi[i] - 1; k++) Ua[Bi[i] + 1 + k] = -Ba[Bi[i] + k];
2240     }
2241     PetscCallCXX(Kokkos::deep_copy(factors->aU_d, factors->aU_h));
2242     PetscCallCXX(Kokkos::deep_copy(factors->D_d, factors->D_h));
2243 
2244     factors->sptrsv_symbolic_completed = PETSC_FALSE; // When numeric value changed, we must do these again
2245     factors->transpose_updated         = PETSC_FALSE;
2246   }
2247 
2248   B->ops->matsolve          = NULL;
2249   B->ops->matsolvetranspose = NULL;
2250   PetscFunctionReturn(PETSC_SUCCESS);
2251 }
2252 
2253 static PetscErrorCode MatICCFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS perm, const MatFactorInfo *info)
2254 {
2255   PetscFunctionBegin;
2256   if (info->solveonhost) {
2257     // If solve on host, we have to change the type, as eventually we need to call MatSolve_SeqSBAIJ_1_NaturalOrdering() etc.
2258     PetscCall(MatSetType(B, MATSEQSBAIJ));
2259     PetscCall(MatSeqSBAIJSetPreallocation(B, 1, MAT_SKIP_ALLOCATION, NULL));
2260   }
2261 
2262   PetscCall(MatICCFactorSymbolic_SeqAIJ(B, A, perm, info));
2263 
2264   if (!info->solveonhost) {
2265     // If solve on device, B is still a MATSEQAIJKOKKOS, so we are good to allocate B->spptr
2266     PetscCheck(!B->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr");
2267     PetscCallCXX(B->spptr = new Mat_SeqAIJKokkosTriFactors(B->rmap->n));
2268     B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJKokkos;
2269   }
2270   PetscFunctionReturn(PETSC_SUCCESS);
2271 }
2272 
2273 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS perm, const MatFactorInfo *info)
2274 {
2275   PetscFunctionBegin;
2276   if (info->solveonhost) {
2277     // If solve on host, we have to change the type, as eventually we need to call MatSolve_SeqSBAIJ_1_NaturalOrdering() etc.
2278     PetscCall(MatSetType(B, MATSEQSBAIJ));
2279     PetscCall(MatSeqSBAIJSetPreallocation(B, 1, MAT_SKIP_ALLOCATION, NULL));
2280   }
2281 
2282   PetscCall(MatCholeskyFactorSymbolic_SeqAIJ(B, A, perm, info)); // it sets B's two ISes ((Mat_SeqAIJ*)B->data)->{row, col} to perm
2283 
2284   if (!info->solveonhost) {
2285     // If solve on device, B is still a MATSEQAIJKOKKOS, so we are good to allocate B->spptr
2286     PetscCheck(!B->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr");
2287     PetscCallCXX(B->spptr = new Mat_SeqAIJKokkosTriFactors(B->rmap->n));
2288     B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJKokkos;
2289   }
2290   PetscFunctionReturn(PETSC_SUCCESS);
2291 }
2292 
2293 // The _Kokkos suffix means we will use Kokkos as a solver for the SeqAIJKokkos matrix
2294 static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos_Kokkos(Mat A, MatSolverType *type)
2295 {
2296   PetscFunctionBegin;
2297   *type = MATSOLVERKOKKOS;
2298   PetscFunctionReturn(PETSC_SUCCESS);
2299 }
2300 
2301 /*MC
2302   MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices
2303   on a single GPU of type, `MATSEQAIJKOKKOS`, `MATAIJKOKKOS`.
2304 
2305   Level: beginner
2306 
2307 .seealso: [](ch_matrices), `Mat`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJKokkos()`, `MATAIJKOKKOS`
2308 M*/
2309 PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A, MatFactorType ftype, Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */
2310 {
2311   PetscInt n = A->rmap->n;
2312   MPI_Comm comm;
2313 
2314   PetscFunctionBegin;
2315   PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
2316   PetscCall(MatCreate(comm, B));
2317   PetscCall(MatSetSizes(*B, n, n, n, n));
2318   PetscCall(MatSetBlockSizesFromMats(*B, A, A));
2319   (*B)->factortype = ftype;
2320   PetscCall(MatSetType(*B, MATSEQAIJKOKKOS));
2321   PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL));
2322   PetscCheck(!(*B)->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr");
2323 
2324   if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
2325     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJKokkos;
2326     (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos;
2327     PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU]));
2328     PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILU]));
2329     PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILUDT]));
2330   } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
2331     (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJKokkos;
2332     (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJKokkos;
2333     PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]));
2334     PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ICC]));
2335   } else SETERRQ(comm, PETSC_ERR_SUP, "MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]);
2336 
2337   // The factorization can use the ordering provided in MatLUFactorSymbolic(), MatCholeskyFactorSymbolic() etc, though we do it on host
2338   (*B)->canuseordering = PETSC_TRUE;
2339   PetscCall(PetscObjectComposeFunction((PetscObject)*B, "MatFactorGetSolverType_C", MatFactorGetSolverType_SeqAIJKokkos_Kokkos));
2340   PetscFunctionReturn(PETSC_SUCCESS);
2341 }
2342 
2343 PETSC_INTERN PetscErrorCode MatSolverTypeRegister_Kokkos(void)
2344 {
2345   PetscFunctionBegin;
2346   PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_LU, MatGetFactor_SeqAIJKokkos_Kokkos));
2347   PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_CHOLESKY, MatGetFactor_SeqAIJKokkos_Kokkos));
2348   PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_ILU, MatGetFactor_SeqAIJKokkos_Kokkos));
2349   PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_ICC, MatGetFactor_SeqAIJKokkos_Kokkos));
2350   PetscFunctionReturn(PETSC_SUCCESS);
2351 }
2352 
2353 /* Utility to print out a KokkosCsrMatrix for debugging */
2354 PETSC_INTERN PetscErrorCode PrintCsrMatrix(const KokkosCsrMatrix &csrmat)
2355 {
2356   const auto        &iv = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), csrmat.graph.row_map);
2357   const auto        &jv = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), csrmat.graph.entries);
2358   const auto        &av = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), csrmat.values);
2359   const PetscInt    *i  = iv.data();
2360   const PetscInt    *j  = jv.data();
2361   const PetscScalar *a  = av.data();
2362   PetscInt           m = csrmat.numRows(), n = csrmat.numCols(), nnz = csrmat.nnz();
2363 
2364   PetscFunctionBegin;
2365   PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT " x %" PetscInt_FMT " SeqAIJKokkos, with %" PetscInt_FMT " nonzeros\n", m, n, nnz));
2366   for (PetscInt k = 0; k < m; k++) {
2367     PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT ": ", k));
2368     for (PetscInt p = i[k]; p < i[k + 1]; p++) PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT "(%.1f), ", j[p], (double)PetscRealPart(a[p])));
2369     PetscCall(PetscPrintf(PETSC_COMM_SELF, "\n"));
2370   }
2371   PetscFunctionReturn(PETSC_SUCCESS);
2372 }
2373