xref: /petsc/src/mat/impls/aij/seq/kokkos/aijkok.kokkos.cxx (revision 7b6b5789786701047fcfdde31f6cf09bc2f63ee9)
1 #include <petsc_kokkos.hpp>
2 #include <petscvec_kokkos.hpp>
3 #include <petscmat_kokkos.hpp>
4 #include <petscpkg_version.h>
5 #include <petsc/private/petscimpl.h>
6 #include <petsc/private/sfimpl.h>
7 #include <petsc/private/kokkosimpl.hpp>
8 #include <petscsys.h>
9 
10 #include <Kokkos_Core.hpp>
11 #include <KokkosBlas.hpp>
12 #include <KokkosSparse_CrsMatrix.hpp>
13 
14 // To suppress compiler warnings:
15 // /path/include/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp:434:63:
16 // warning: 'cusparseStatus_t cusparseDbsrmm(cusparseHandle_t, cusparseDirection_t, cusparseOperation_t,
17 // cusparseOperation_t, int, int, int, int, const double*, cusparseMatDescr_t, const double*, const int*, const int*,
18 // int, const double*, int, const double*, double*, int)' is deprecated: please use cusparseSpMM instead [-Wdeprecated-declarations]
19 PETSC_PRAGMA_DIAGNOSTIC_IGNORED_BEGIN("-Wdeprecated-declarations")
20 #include <KokkosSparse_spmv.hpp>
21 PETSC_PRAGMA_DIAGNOSTIC_IGNORED_END()
22 
23 #include <KokkosSparse_spiluk.hpp>
24 #include <KokkosSparse_sptrsv.hpp>
25 #include <KokkosSparse_spgemm.hpp>
26 #include <KokkosSparse_spadd.hpp>
27 #include <KokkosBatched_LU_Decl.hpp>
28 #include <KokkosBatched_InverseLU_Decl.hpp>
29 
30 #include <../src/mat/impls/aij/seq/kokkos/aijkok.hpp>
31 
32 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_GE(3, 7, 0)
33   #include <KokkosSparse_Utils.hpp>
34 using KokkosSparse::sort_crs_matrix;
35 using KokkosSparse::Impl::transpose_matrix;
36 #else
37   #include <KokkosKernels_Sorting.hpp>
38 using KokkosKernels::sort_crs_matrix;
39 using KokkosKernels::Impl::transpose_matrix;
40 #endif
41 
42 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_GE(4, 6, 0)
43 using KokkosSparse::spiluk_symbolic;
44 using KokkosSparse::spiluk_numeric;
45 using KokkosSparse::sptrsv_symbolic;
46 using KokkosSparse::sptrsv_solve;
47 using KokkosSparse::Experimental::SPTRSVAlgorithm;
48 using KokkosSparse::Experimental::SPILUKAlgorithm;
49 #else
50 using KokkosSparse::Experimental::spiluk_symbolic;
51 using KokkosSparse::Experimental::spiluk_numeric;
52 using KokkosSparse::Experimental::sptrsv_symbolic;
53 using KokkosSparse::Experimental::sptrsv_solve;
54 using KokkosSparse::Experimental::SPTRSVAlgorithm;
55 using KokkosSparse::Experimental::SPILUKAlgorithm;
56 #endif
57 
58 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */
59 
60 /* MatAssemblyEnd_SeqAIJKokkos() happens when we finalized nonzeros of the matrix, either after
61    we assembled the matrix on host, or after we directly produced the matrix data on device (ex., through MatMatMult).
62    In the latter case, it is important to set a_dual's sync state correctly.
63  */
64 static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A, MatAssemblyType mode)
65 {
66   Mat_SeqAIJ       *aijseq;
67   Mat_SeqAIJKokkos *aijkok;
68 
69   PetscFunctionBegin;
70   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS);
71   PetscCall(MatAssemblyEnd_SeqAIJ(A, mode));
72 
73   aijseq = static_cast<Mat_SeqAIJ *>(A->data);
74   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
75 
76   /* If aijkok does not exist, we just copy i, j to device.
77      If aijkok already exists, but the device's nonzero pattern does not match with the host's, we assume the latest data is on host.
78      In both cases, we build a new aijkok structure.
79   */
80   if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { /* aijkok might not exist yet or nonzero pattern has changed */
81     if (aijkok && aijkok->host_aij_allocated_by_kokkos) {   /* Avoid accidently freeing much needed a,i,j on host when deleting aijkok */
82       PetscCall(PetscShmgetAllocateArray(aijkok->nrows() + 1, sizeof(PetscInt), (void **)&aijseq->i));
83       PetscCall(PetscShmgetAllocateArray(aijkok->nnz(), sizeof(PetscInt), (void **)&aijseq->j));
84       PetscCall(PetscShmgetAllocateArray(aijkok->nnz(), sizeof(PetscInt), (void **)&aijseq->a));
85       PetscCall(PetscArraycpy(aijseq->i, aijkok->i_host_data(), aijkok->nrows() + 1));
86       PetscCall(PetscArraycpy(aijseq->j, aijkok->j_host_data(), aijkok->nnz()));
87       PetscCall(PetscArraycpy(aijseq->a, aijkok->a_host_data(), aijkok->nnz()));
88       aijseq->free_a  = PETSC_TRUE;
89       aijseq->free_ij = PETSC_TRUE;
90       /* This arises from MatCreateSeqAIJKokkosWithKokkosCsrMatrix() used in MatMatMult, where
91          we have the CsrMatrix on device first and then copy to host, followed by
92          MatSetMPIAIJWithSplitSeqAIJ() with garray = NULL.
93          One could improve it by not using NULL garray.
94       */
95     }
96     delete aijkok;
97     aijkok   = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aijseq, A->nonzerostate, PETSC_FALSE /*don't copy mat values to device*/);
98     A->spptr = aijkok;
99   } else if (A->rmap->n && aijkok->diag_dual.extent(0) == 0) { // MatProduct might directly produce AIJ on device, but not the diag.
100     MatRowMapKokkosViewHost diag_h(aijseq->diag, A->rmap->n);
101     auto                    diag_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), diag_h);
102     aijkok->diag_dual              = MatRowMapKokkosDualView(diag_d, diag_h);
103   }
104   PetscFunctionReturn(PETSC_SUCCESS);
105 }
106 
107 /* Sync CSR data to device if not yet */
108 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A)
109 {
110   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
111 
112   PetscFunctionBegin;
113   PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Can't sync factorized matrix from host to device");
114   PetscCheck(aijkok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
115   if (aijkok->a_dual.need_sync_device()) {
116     PetscCall(KokkosDualViewSyncDevice(aijkok->a_dual, PetscGetKokkosExecutionSpace()));
117     aijkok->transpose_updated = PETSC_FALSE; /* values of the transpose is out-of-date */
118     aijkok->hermitian_updated = PETSC_FALSE;
119   }
120   PetscFunctionReturn(PETSC_SUCCESS);
121 }
122 
123 /* Mark the CSR data on device as modified */
124 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosModifyDevice(Mat A)
125 {
126   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
127 
128   PetscFunctionBegin;
129   PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Not supported for factorized matries");
130   aijkok->a_dual.clear_sync_state();
131   aijkok->a_dual.modify_device();
132   aijkok->transpose_updated = PETSC_FALSE;
133   aijkok->hermitian_updated = PETSC_FALSE;
134   PetscCall(MatSeqAIJInvalidateDiagonal(A));
135   PetscCall(PetscObjectStateIncrease((PetscObject)A));
136   PetscFunctionReturn(PETSC_SUCCESS);
137 }
138 
139 static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A)
140 {
141   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
142   auto              exec   = PetscGetKokkosExecutionSpace();
143 
144   PetscFunctionBegin;
145   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
146   /* We do not expect one needs factors on host  */
147   PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Can't sync factorized matrix from device to host");
148   PetscCheck(aijkok, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing AIJKOK");
149   PetscCall(KokkosDualViewSyncHost(aijkok->a_dual, exec));
150   PetscFunctionReturn(PETSC_SUCCESS);
151 }
152 
153 static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A, PetscScalar *array[])
154 {
155   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
156 
157   PetscFunctionBegin;
158   /* aijkok contains valid pointers only if the host's nonzerostate matches with the device's.
159     Calling MatSeqAIJSetPreallocation() or MatSetValues() on host, where aijseq->{i,j,a} might be
160     reallocated, will lead to stale {i,j,a}_dual in aijkok. In both operations, the hosts's nonzerostate
161     must have been updated. The stale aijkok will be rebuilt during MatAssemblyEnd.
162   */
163   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
164     PetscCall(KokkosDualViewSyncHost(aijkok->a_dual, PetscGetKokkosExecutionSpace()));
165     *array = aijkok->a_dual.view_host().data();
166   } else { /* Happens when calling MatSetValues on a newly created matrix */
167     *array = static_cast<Mat_SeqAIJ *>(A->data)->a;
168   }
169   PetscFunctionReturn(PETSC_SUCCESS);
170 }
171 
172 static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJKokkos(Mat A, PetscScalar *array[])
173 {
174   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
175 
176   PetscFunctionBegin;
177   if (aijkok && A->nonzerostate == aijkok->nonzerostate) aijkok->a_dual.modify_host();
178   PetscFunctionReturn(PETSC_SUCCESS);
179 }
180 
181 static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[])
182 {
183   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
184 
185   PetscFunctionBegin;
186   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
187     PetscCall(KokkosDualViewSyncHost(aijkok->a_dual, PetscGetKokkosExecutionSpace()));
188     *array = aijkok->a_dual.view_host().data();
189   } else {
190     *array = static_cast<Mat_SeqAIJ *>(A->data)->a;
191   }
192   PetscFunctionReturn(PETSC_SUCCESS);
193 }
194 
195 static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[])
196 {
197   PetscFunctionBegin;
198   *array = NULL;
199   PetscFunctionReturn(PETSC_SUCCESS);
200 }
201 
202 static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[])
203 {
204   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
205 
206   PetscFunctionBegin;
207   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
208     *array = aijkok->a_dual.view_host().data();
209   } else { /* Ex. happens with MatZeroEntries on a preallocated but not assembled matrix */
210     *array = static_cast<Mat_SeqAIJ *>(A->data)->a;
211   }
212   PetscFunctionReturn(PETSC_SUCCESS);
213 }
214 
215 static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[])
216 {
217   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
218 
219   PetscFunctionBegin;
220   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
221     aijkok->a_dual.clear_sync_state();
222     aijkok->a_dual.modify_host();
223   }
224   PetscFunctionReturn(PETSC_SUCCESS);
225 }
226 
227 static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJKokkos(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype)
228 {
229   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
230 
231   PetscFunctionBegin;
232   PetscCheck(aijkok != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "aijkok is NULL");
233 
234   if (i) *i = aijkok->i_device_data();
235   if (j) *j = aijkok->j_device_data();
236   if (a) {
237     PetscCall(KokkosDualViewSyncDevice(aijkok->a_dual, PetscGetKokkosExecutionSpace()));
238     *a = aijkok->a_device_data();
239   }
240   if (mtype) *mtype = PETSC_MEMTYPE_KOKKOS;
241   PetscFunctionReturn(PETSC_SUCCESS);
242 }
243 
244 /*
245   Generate the sparsity pattern of a MatSeqAIJKokkos matrix's transpose on device.
246 
247   Input Parameter:
248 .  A       - the MATSEQAIJKOKKOS matrix
249 
250   Output Parameters:
251 +  perm_d - the permutation array on device, which connects Ta(i) = Aa(perm(i))
252 -  T_d    - the transpose on device, whose value array is allocated but not initialized
253 */
254 static PetscErrorCode MatSeqAIJKokkosGenerateTransposeStructure(Mat A, MatRowMapKokkosView &perm_d, KokkosCsrMatrix &T_d)
255 {
256   Mat_SeqAIJ             *aseq = static_cast<Mat_SeqAIJ *>(A->data);
257   PetscInt                nz = aseq->nz, m = A->rmap->N, n = A->cmap->n;
258   const PetscInt         *Ai = aseq->i, *Aj = aseq->j;
259   MatRowMapKokkosViewHost Ti_h(NoInit("Ti"), n + 1);
260   MatRowMapType          *Ti = Ti_h.data();
261   MatColIdxKokkosViewHost Tj_h(NoInit("Tj"), nz);
262   MatRowMapKokkosViewHost perm_h(NoInit("permutation"), nz);
263   PetscInt               *Tj   = Tj_h.data();
264   PetscInt               *perm = perm_h.data();
265   PetscInt               *offset;
266 
267   PetscFunctionBegin;
268   // Populate Ti
269   PetscCallCXX(Kokkos::deep_copy(Ti_h, 0));
270   Ti++;
271   for (PetscInt i = 0; i < nz; i++) Ti[Aj[i]]++;
272   Ti--;
273   for (PetscInt i = 0; i < n; i++) Ti[i + 1] += Ti[i];
274 
275   // Populate Tj and the permutation array
276   PetscCall(PetscCalloc1(n, &offset)); // offset in each T row to fill in its column indices
277   for (PetscInt i = 0; i < m; i++) {
278     for (PetscInt j = Ai[i]; j < Ai[i + 1]; j++) { // A's (i,j) is T's (j,i)
279       PetscInt r    = Aj[j];                       // row r of T
280       PetscInt disp = Ti[r] + offset[r];
281 
282       Tj[disp]   = i; // col i of T
283       perm[disp] = j;
284       offset[r]++;
285     }
286   }
287   PetscCall(PetscFree(offset));
288 
289   // Sort each row of T, along with the permutation array
290   for (PetscInt i = 0; i < n; i++) PetscCall(PetscSortIntWithArray(Ti[i + 1] - Ti[i], Tj + Ti[i], perm + Ti[i]));
291 
292   // Output perm and T on device
293   auto Ti_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), Ti_h);
294   auto Tj_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), Tj_h);
295   PetscCallCXX(T_d = KokkosCsrMatrix("csrmatT", n, m, nz, MatScalarKokkosView("Ta", nz), Ti_d, Tj_d));
296   PetscCallCXX(perm_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), perm_h));
297   PetscFunctionReturn(PETSC_SUCCESS);
298 }
299 
300 // Generate the transpose on device and cache it internally
301 // Note: KK transpose_matrix() does not have support symbolic/numeric transpose, so we do it on our own
302 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGenerateTranspose_Private(Mat A, KokkosCsrMatrix *csrmatT)
303 {
304   Mat_SeqAIJ       *aseq = static_cast<Mat_SeqAIJ *>(A->data);
305   Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
306   PetscInt          nz = aseq->nz, m = A->rmap->N, n = A->cmap->n;
307   KokkosCsrMatrix  &T = akok->csrmatT;
308 
309   PetscFunctionBegin;
310   PetscCheck(akok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
311   PetscCall(KokkosDualViewSyncDevice(akok->a_dual, PetscGetKokkosExecutionSpace())); // Sync A's values since we are going to access them on device
312 
313   const auto &Aa = akok->a_dual.view_device();
314 
315   if (A->symmetric == PETSC_BOOL3_TRUE) {
316     *csrmatT = akok->csrmat;
317   } else {
318     // See if we already have a cached transpose and its value is up to date
319     if (T.numRows() == n && T.numCols() == m) {  // this indicates csrmatT had been generated before, otherwise T has 0 rows/cols after construction
320       if (!akok->transpose_updated) {            // if the value is out of date, update the cached version
321         const auto &perm = akok->transpose_perm; // get the permutation array
322         auto       &Ta   = T.values;
323 
324         PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { Ta(i) = Aa(perm(i)); }));
325       }
326     } else { // Generate T of size n x m for the first time
327       MatRowMapKokkosView perm;
328 
329       PetscCall(MatSeqAIJKokkosGenerateTransposeStructure(A, perm, T));
330       akok->transpose_perm = perm; // cache the perm in this matrix for reuse
331       PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { T.values(i) = Aa(perm(i)); }));
332     }
333     akok->transpose_updated = PETSC_TRUE;
334     *csrmatT                = akok->csrmatT;
335   }
336   PetscFunctionReturn(PETSC_SUCCESS);
337 }
338 
339 // Generate the Hermitian on device and cache it internally
340 static PetscErrorCode MatSeqAIJKokkosGenerateHermitian_Private(Mat A, KokkosCsrMatrix *csrmatH)
341 {
342   Mat_SeqAIJ       *aseq = static_cast<Mat_SeqAIJ *>(A->data);
343   Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
344   PetscInt          nz = aseq->nz, m = A->rmap->N, n = A->cmap->n;
345   KokkosCsrMatrix  &T = akok->csrmatH;
346 
347   PetscFunctionBegin;
348   PetscCheck(akok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
349   PetscCall(KokkosDualViewSyncDevice(akok->a_dual, PetscGetKokkosExecutionSpace())); // Sync A's values since we are going to access them on device
350 
351   const auto &Aa = akok->a_dual.view_device();
352 
353   if (A->hermitian == PETSC_BOOL3_TRUE) {
354     *csrmatH = akok->csrmat;
355   } else {
356     // See if we already have a cached hermitian and its value is up to date
357     if (T.numRows() == n && T.numCols() == m) {  // this indicates csrmatT had been generated before, otherwise T has 0 rows/cols after construction
358       if (!akok->hermitian_updated) {            // if the value is out of date, update the cached version
359         const auto &perm = akok->transpose_perm; // get the permutation array
360         auto       &Ta   = T.values;
361 
362         PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { Ta(i) = PetscConj(Aa(perm(i))); }));
363       }
364     } else { // Generate T of size n x m for the first time
365       MatRowMapKokkosView perm;
366 
367       PetscCall(MatSeqAIJKokkosGenerateTransposeStructure(A, perm, T));
368       akok->transpose_perm = perm; // cache the perm in this matrix for reuse
369       PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { T.values(i) = PetscConj(Aa(perm(i))); }));
370     }
371     akok->hermitian_updated = PETSC_TRUE;
372     *csrmatH                = akok->csrmatH;
373   }
374   PetscFunctionReturn(PETSC_SUCCESS);
375 }
376 
377 /* y = A x */
378 static PetscErrorCode MatMult_SeqAIJKokkos(Mat A, Vec xx, Vec yy)
379 {
380   Mat_SeqAIJKokkos          *aijkok;
381   ConstPetscScalarKokkosView xv;
382   PetscScalarKokkosView      yv;
383 
384   PetscFunctionBegin;
385   PetscCall(PetscLogGpuTimeBegin());
386   PetscCall(MatSeqAIJKokkosSyncDevice(A));
387   PetscCall(VecGetKokkosView(xx, &xv));
388   PetscCall(VecGetKokkosViewWrite(yy, &yv));
389   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
390   PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), "N", 1.0 /*alpha*/, aijkok->csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A x + beta y */
391   PetscCall(VecRestoreKokkosView(xx, &xv));
392   PetscCall(VecRestoreKokkosViewWrite(yy, &yv));
393   /* 2.0*nnz - numRows seems more accurate here but assumes there are no zero-rows. So a little sloppy here. */
394   PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz()));
395   PetscCall(PetscLogGpuTimeEnd());
396   PetscFunctionReturn(PETSC_SUCCESS);
397 }
398 
399 /* y = A^T x */
400 static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy)
401 {
402   Mat_SeqAIJKokkos          *aijkok;
403   const char                *mode;
404   ConstPetscScalarKokkosView xv;
405   PetscScalarKokkosView      yv;
406   KokkosCsrMatrix            csrmat;
407 
408   PetscFunctionBegin;
409   PetscCall(PetscLogGpuTimeBegin());
410   PetscCall(MatSeqAIJKokkosSyncDevice(A));
411   PetscCall(VecGetKokkosView(xx, &xv));
412   PetscCall(VecGetKokkosViewWrite(yy, &yv));
413   if (A->form_explicit_transpose) {
414     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat));
415     mode = "N";
416   } else {
417     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
418     csrmat = aijkok->csrmat;
419     mode   = "T";
420   }
421   PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^T x + beta y */
422   PetscCall(VecRestoreKokkosView(xx, &xv));
423   PetscCall(VecRestoreKokkosViewWrite(yy, &yv));
424   PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
425   PetscCall(PetscLogGpuTimeEnd());
426   PetscFunctionReturn(PETSC_SUCCESS);
427 }
428 
429 /* y = A^H x */
430 static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy)
431 {
432   Mat_SeqAIJKokkos          *aijkok;
433   const char                *mode;
434   ConstPetscScalarKokkosView xv;
435   PetscScalarKokkosView      yv;
436   KokkosCsrMatrix            csrmat;
437 
438   PetscFunctionBegin;
439   PetscCall(PetscLogGpuTimeBegin());
440   PetscCall(MatSeqAIJKokkosSyncDevice(A));
441   PetscCall(VecGetKokkosView(xx, &xv));
442   PetscCall(VecGetKokkosViewWrite(yy, &yv));
443   if (A->form_explicit_transpose) {
444     PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat));
445     mode = "N";
446   } else {
447     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
448     csrmat = aijkok->csrmat;
449     mode   = "C";
450   }
451   PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^H x + beta y */
452   PetscCall(VecRestoreKokkosView(xx, &xv));
453   PetscCall(VecRestoreKokkosViewWrite(yy, &yv));
454   PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
455   PetscCall(PetscLogGpuTimeEnd());
456   PetscFunctionReturn(PETSC_SUCCESS);
457 }
458 
459 /* z = A x + y */
460 static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz)
461 {
462   Mat_SeqAIJKokkos          *aijkok;
463   ConstPetscScalarKokkosView xv;
464   PetscScalarKokkosView      zv;
465 
466   PetscFunctionBegin;
467   PetscCall(PetscLogGpuTimeBegin());
468   PetscCall(MatSeqAIJKokkosSyncDevice(A));
469   if (zz != yy) PetscCall(VecCopy(yy, zz)); // depending on yy's sync flags, zz might get its latest data on host
470   PetscCall(VecGetKokkosView(xx, &xv));
471   PetscCall(VecGetKokkosView(zz, &zv)); // do after VecCopy(yy, zz) to get the latest data on device
472   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
473   PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), "N", 1.0 /*alpha*/, aijkok->csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A x + beta z */
474   PetscCall(VecRestoreKokkosView(xx, &xv));
475   PetscCall(VecRestoreKokkosView(zz, &zv));
476   PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz()));
477   PetscCall(PetscLogGpuTimeEnd());
478   PetscFunctionReturn(PETSC_SUCCESS);
479 }
480 
481 /* z = A^T x + y */
482 static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz)
483 {
484   Mat_SeqAIJKokkos          *aijkok;
485   const char                *mode;
486   ConstPetscScalarKokkosView xv;
487   PetscScalarKokkosView      zv;
488   KokkosCsrMatrix            csrmat;
489 
490   PetscFunctionBegin;
491   PetscCall(PetscLogGpuTimeBegin());
492   PetscCall(MatSeqAIJKokkosSyncDevice(A));
493   if (zz != yy) PetscCall(VecCopy(yy, zz));
494   PetscCall(VecGetKokkosView(xx, &xv));
495   PetscCall(VecGetKokkosView(zz, &zv));
496   if (A->form_explicit_transpose) {
497     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat));
498     mode = "N";
499   } else {
500     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
501     csrmat = aijkok->csrmat;
502     mode   = "T";
503   }
504   PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^T x + beta z */
505   PetscCall(VecRestoreKokkosView(xx, &xv));
506   PetscCall(VecRestoreKokkosView(zz, &zv));
507   PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
508   PetscCall(PetscLogGpuTimeEnd());
509   PetscFunctionReturn(PETSC_SUCCESS);
510 }
511 
512 /* z = A^H x + y */
513 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz)
514 {
515   Mat_SeqAIJKokkos          *aijkok;
516   const char                *mode;
517   ConstPetscScalarKokkosView xv;
518   PetscScalarKokkosView      zv;
519   KokkosCsrMatrix            csrmat;
520 
521   PetscFunctionBegin;
522   PetscCall(PetscLogGpuTimeBegin());
523   PetscCall(MatSeqAIJKokkosSyncDevice(A));
524   if (zz != yy) PetscCall(VecCopy(yy, zz));
525   PetscCall(VecGetKokkosView(xx, &xv));
526   PetscCall(VecGetKokkosView(zz, &zv));
527   if (A->form_explicit_transpose) {
528     PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat));
529     mode = "N";
530   } else {
531     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
532     csrmat = aijkok->csrmat;
533     mode   = "C";
534   }
535   PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^H x + beta z */
536   PetscCall(VecRestoreKokkosView(xx, &xv));
537   PetscCall(VecRestoreKokkosView(zz, &zv));
538   PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
539   PetscCall(PetscLogGpuTimeEnd());
540   PetscFunctionReturn(PETSC_SUCCESS);
541 }
542 
543 static PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A, MatOption op, PetscBool flg)
544 {
545   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
546 
547   PetscFunctionBegin;
548   switch (op) {
549   case MAT_FORM_EXPLICIT_TRANSPOSE:
550     /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
551     if (A->form_explicit_transpose && !flg && aijkok) PetscCall(aijkok->DestroyMatTranspose());
552     A->form_explicit_transpose = flg;
553     break;
554   default:
555     PetscCall(MatSetOption_SeqAIJ(A, op, flg));
556     break;
557   }
558   PetscFunctionReturn(PETSC_SUCCESS);
559 }
560 
561 /* Depending on reuse, either build a new mat, or use the existing mat */
562 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat *newmat)
563 {
564   Mat_SeqAIJ *aseq;
565 
566   PetscFunctionBegin;
567   PetscCall(PetscKokkosInitializeCheck());
568   if (reuse == MAT_INITIAL_MATRIX) { /* Build a brand new mat */
569     PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat));
570     PetscCall(MatSetType(*newmat, mtype));
571   } else if (reuse == MAT_REUSE_MATRIX) {                 /* Reuse the mat created before */
572     PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN)); /* newmat is already a SeqAIJKokkos */
573   } else if (reuse == MAT_INPLACE_MATRIX) {               /* newmat is A */
574     PetscCheck(A == *newmat, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "A != *newmat with MAT_INPLACE_MATRIX");
575     PetscCall(PetscFree(A->defaultvectype));
576     PetscCall(PetscStrallocpy(VECKOKKOS, &A->defaultvectype)); /* Allocate and copy the string */
577     PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATSEQAIJKOKKOS));
578     PetscCall(MatSetOps_SeqAIJKokkos(A));
579     aseq = static_cast<Mat_SeqAIJ *>(A->data);
580     if (A->assembled) { /* Copy i, j (but not values) to device for an assembled matrix if not yet */
581       PetscCheck(!A->spptr, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Expect NULL (Mat_SeqAIJKokkos*)A->spptr");
582       A->spptr = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aseq, A->nonzerostate, PETSC_FALSE);
583     }
584   }
585   PetscFunctionReturn(PETSC_SUCCESS);
586 }
587 
588 /* MatDuplicate always creates a new matrix. MatDuplicate can be called either on an assembled matrix or
589    an unassembled matrix, even though MAT_COPY_VALUES is not allowed for unassembled matrix.
590  */
591 static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A, MatDuplicateOption dupOption, Mat *B)
592 {
593   Mat_SeqAIJ       *bseq;
594   Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr), *bkok;
595   Mat               mat;
596 
597   PetscFunctionBegin;
598   /* Do not copy values on host as A's latest values might be on device. We don't want to do sync blindly */
599   PetscCall(MatDuplicate_SeqAIJ(A, MAT_DO_NOT_COPY_VALUES, B));
600   mat = *B;
601   if (A->assembled) {
602     bseq = static_cast<Mat_SeqAIJ *>(mat->data);
603     bkok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, bseq, mat->nonzerostate, PETSC_FALSE);
604     bkok->a_dual.clear_sync_state(); /* Clear B's sync state as it will be decided below */
605     /* Now copy values to B if needed */
606     if (dupOption == MAT_COPY_VALUES) {
607       if (akok->a_dual.need_sync_device()) {
608         Kokkos::deep_copy(bkok->a_dual.view_host(), akok->a_dual.view_host());
609         bkok->a_dual.modify_host();
610       } else { /* If device has the latest data, we only copy data on device */
611         Kokkos::deep_copy(bkok->a_dual.view_device(), akok->a_dual.view_device());
612         bkok->a_dual.modify_device();
613       }
614     } else { /* MAT_DO_NOT_COPY_VALUES or MAT_SHARE_NONZERO_PATTERN. B's values should be zeroed */
615       /* B's values on host should be already zeroed by MatDuplicate_SeqAIJ() */
616       bkok->a_dual.modify_host();
617     }
618     mat->spptr = bkok;
619   }
620 
621   PetscCall(PetscFree(mat->defaultvectype));
622   PetscCall(PetscStrallocpy(VECKOKKOS, &mat->defaultvectype)); /* Allocate and copy the string */
623   PetscCall(PetscObjectChangeTypeName((PetscObject)mat, MATSEQAIJKOKKOS));
624   PetscCall(MatSetOps_SeqAIJKokkos(mat));
625   PetscFunctionReturn(PETSC_SUCCESS);
626 }
627 
628 static PetscErrorCode MatTranspose_SeqAIJKokkos(Mat A, MatReuse reuse, Mat *B)
629 {
630   Mat               At;
631   KokkosCsrMatrix   internT;
632   Mat_SeqAIJKokkos *atkok, *bkok;
633 
634   PetscFunctionBegin;
635   if (reuse == MAT_REUSE_MATRIX) PetscCall(MatTransposeCheckNonzeroState_Private(A, *B));
636   PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &internT)); /* Generate a transpose internally */
637   if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_INPLACE_MATRIX) {
638     /* Deep copy internT, as we want to isolate the internal transpose */
639     PetscCallCXX(atkok = new Mat_SeqAIJKokkos(KokkosCsrMatrix("csrmat", internT)));
640     PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PetscObjectComm((PetscObject)A), atkok, &At));
641     if (reuse == MAT_INITIAL_MATRIX) *B = At;
642     else PetscCall(MatHeaderReplace(A, &At)); /* Replace A with At inplace */
643   } else {                                    /* MAT_REUSE_MATRIX, just need to copy values to B on device */
644     if ((*B)->assembled) {
645       bkok = static_cast<Mat_SeqAIJKokkos *>((*B)->spptr);
646       PetscCallCXX(Kokkos::deep_copy(bkok->a_dual.view_device(), internT.values));
647       PetscCall(MatSeqAIJKokkosModifyDevice(*B));
648     } else if ((*B)->preallocated) { /* It is ok for B to be only preallocated, as needed in MatTranspose_MPIAIJ */
649       Mat_SeqAIJ             *bseq = static_cast<Mat_SeqAIJ *>((*B)->data);
650       MatScalarKokkosViewHost a_h(bseq->a, internT.nnz()); /* bseq->nz = 0 if unassembled */
651       MatColIdxKokkosViewHost j_h(bseq->j, internT.nnz());
652       PetscCallCXX(Kokkos::deep_copy(a_h, internT.values));
653       PetscCallCXX(Kokkos::deep_copy(j_h, internT.graph.entries));
654     } else SETERRQ(PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "B must be assembled or preallocated");
655   }
656   PetscFunctionReturn(PETSC_SUCCESS);
657 }
658 
659 static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A)
660 {
661   Mat_SeqAIJKokkos *aijkok;
662 
663   PetscFunctionBegin;
664   if (A->factortype == MAT_FACTOR_NONE) {
665     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
666     delete aijkok;
667   } else {
668     delete static_cast<Mat_SeqAIJKokkosTriFactors *>(A->spptr);
669   }
670   A->spptr = NULL;
671   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatFactorGetSolverType_C", NULL));
672   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
673   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
674 #if defined(PETSC_HAVE_HYPRE)
675   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijkokkos_hypre_C", NULL));
676 #endif
677   PetscCall(MatDestroy_SeqAIJ(A));
678   PetscFunctionReturn(PETSC_SUCCESS);
679 }
680 
681 /*MC
682    MATSEQAIJKOKKOS - MATAIJKOKKOS = "(seq)aijkokkos" - A matrix type to be used for sparse matrices with Kokkos
683 
684    A matrix type using Kokkos-Kernels CrsMatrix type for portability across different device types
685 
686    Options Database Key:
687 .  -mat_type aijkokkos - sets the matrix type to `MATSEQAIJKOKKOS` during a call to `MatSetFromOptions()`
688 
689   Level: beginner
690 
691 .seealso: [](ch_matrices), `Mat`, `MatCreateSeqAIJKokkos()`, `MATMPIAIJKOKKOS`
692 M*/
693 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A)
694 {
695   PetscFunctionBegin;
696   PetscCall(PetscKokkosInitializeCheck());
697   PetscCall(MatCreate_SeqAIJ(A));
698   PetscCall(MatConvert_SeqAIJ_SeqAIJKokkos(A, MATSEQAIJKOKKOS, MAT_INPLACE_MATRIX, &A));
699   PetscFunctionReturn(PETSC_SUCCESS);
700 }
701 
702 /* Merge A, B into a matrix C. A is put before B. C's size would be A->rmap->n by (A->cmap->n + B->cmap->n) */
703 PetscErrorCode MatSeqAIJKokkosMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C)
704 {
705   Mat_SeqAIJ         *a, *b;
706   Mat_SeqAIJKokkos   *akok, *bkok, *ckok;
707   MatScalarKokkosView aa, ba, ca;
708   MatRowMapKokkosView ai, bi, ci;
709   MatColIdxKokkosView aj, bj, cj;
710   PetscInt            m, n, nnz, aN;
711 
712   PetscFunctionBegin;
713   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
714   PetscValidHeaderSpecific(B, MAT_CLASSID, 2);
715   PetscAssertPointer(C, 4);
716   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
717   PetscCheckTypeName(B, MATSEQAIJKOKKOS);
718   PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n);
719   PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported");
720 
721   PetscCall(MatSeqAIJKokkosSyncDevice(A));
722   PetscCall(MatSeqAIJKokkosSyncDevice(B));
723   a    = static_cast<Mat_SeqAIJ *>(A->data);
724   b    = static_cast<Mat_SeqAIJ *>(B->data);
725   akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
726   bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
727   aa   = akok->a_dual.view_device();
728   ai   = akok->i_dual.view_device();
729   ba   = bkok->a_dual.view_device();
730   bi   = bkok->i_dual.view_device();
731   m    = A->rmap->n; /* M, N and nnz of C */
732   n    = A->cmap->n + B->cmap->n;
733   nnz  = a->nz + b->nz;
734   aN   = A->cmap->n; /* N of A */
735   if (reuse == MAT_INITIAL_MATRIX) {
736     aj           = akok->j_dual.view_device();
737     bj           = bkok->j_dual.view_device();
738     auto ca_dual = MatScalarKokkosDualView("a", aa.extent(0) + ba.extent(0));
739     auto ci_dual = MatRowMapKokkosDualView("i", ai.extent(0));
740     auto cj_dual = MatColIdxKokkosDualView("j", aj.extent(0) + bj.extent(0));
741     ca           = ca_dual.view_device();
742     ci           = ci_dual.view_device();
743     cj           = cj_dual.view_device();
744 
745     /* Concatenate A and B in parallel using Kokkos hierarchical parallelism */
746     Kokkos::parallel_for(
747       Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
748         PetscInt i       = t.league_rank(); /* row i */
749         PetscInt coffset = ai(i) + bi(i), alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i);
750 
751         Kokkos::single(Kokkos::PerTeam(t), [=]() { /* this side effect only happens once per whole team */
752                                                    ci(i) = coffset;
753                                                    if (i == m - 1) ci(m) = ai(m) + bi(m);
754         });
755 
756         Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) {
757           if (k < alen) {
758             ca(coffset + k) = aa(ai(i) + k);
759             cj(coffset + k) = aj(ai(i) + k);
760           } else {
761             ca(coffset + k) = ba(bi(i) + k - alen);
762             cj(coffset + k) = bj(bi(i) + k - alen) + aN; /* Entries in B get new column indices in C */
763           }
764         });
765       });
766     ca_dual.modify_device();
767     ci_dual.modify_device();
768     cj_dual.modify_device();
769     PetscCallCXX(ckok = new Mat_SeqAIJKokkos(m, n, nnz, ci_dual, cj_dual, ca_dual));
770     PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, ckok, C));
771   } else if (reuse == MAT_REUSE_MATRIX) {
772     PetscValidHeaderSpecific(*C, MAT_CLASSID, 4);
773     PetscCheckTypeName(*C, MATSEQAIJKOKKOS);
774     ckok = static_cast<Mat_SeqAIJKokkos *>((*C)->spptr);
775     ca   = ckok->a_dual.view_device();
776     ci   = ckok->i_dual.view_device();
777 
778     Kokkos::parallel_for(
779       Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
780         PetscInt i    = t.league_rank(); /* row i */
781         PetscInt alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i);
782         Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) {
783           if (k < alen) ca(ci(i) + k) = aa(ai(i) + k);
784           else ca(ci(i) + k) = ba(bi(i) + k - alen);
785         });
786       });
787     PetscCall(MatSeqAIJKokkosModifyDevice(*C));
788   }
789   PetscFunctionReturn(PETSC_SUCCESS);
790 }
791 
792 static PetscErrorCode MatProductDataDestroy_SeqAIJKokkos(void *pdata)
793 {
794   PetscFunctionBegin;
795   delete static_cast<MatProductData_SeqAIJKokkos *>(pdata);
796   PetscFunctionReturn(PETSC_SUCCESS);
797 }
798 
799 static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C)
800 {
801   Mat_Product                 *product = C->product;
802   Mat                          A, B;
803   bool                         transA, transB; /* use bool, since KK needs this type */
804   Mat_SeqAIJKokkos            *akok, *bkok, *ckok;
805   Mat_SeqAIJ                  *c;
806   MatProductData_SeqAIJKokkos *pdata;
807   KokkosCsrMatrix              csrmatA, csrmatB;
808 
809   PetscFunctionBegin;
810   MatCheckProduct(C, 1);
811   PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Product data empty");
812   pdata = static_cast<MatProductData_SeqAIJKokkos *>(C->product->data);
813 
814   // See if numeric has already been done in symbolic (e.g., user calls MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C)).
815   // If yes, skip the numeric, but reset the flag so that next time when user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C),
816   // we still do numeric.
817   if (pdata->reusesym) { // numeric reuses results from symbolic
818     pdata->reusesym = PETSC_FALSE;
819     PetscFunctionReturn(PETSC_SUCCESS);
820   }
821 
822   switch (product->type) {
823   case MATPRODUCT_AB:
824     transA = false;
825     transB = false;
826     break;
827   case MATPRODUCT_AtB:
828     transA = true;
829     transB = false;
830     break;
831   case MATPRODUCT_ABt:
832     transA = false;
833     transB = true;
834     break;
835   default:
836     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]);
837   }
838 
839   A = product->A;
840   B = product->B;
841   PetscCall(MatSeqAIJKokkosSyncDevice(A));
842   PetscCall(MatSeqAIJKokkosSyncDevice(B));
843   akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
844   bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
845   ckok = static_cast<Mat_SeqAIJKokkos *>(C->spptr);
846 
847   PetscCheck(ckok, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Device data structure spptr is empty");
848 
849   csrmatA = akok->csrmat;
850   csrmatB = bkok->csrmat;
851 
852   /* TODO: Once KK spgemm implements transpose, we can get rid of the explicit transpose here */
853   if (transA) {
854     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA));
855     transA = false;
856   }
857 
858   if (transB) {
859     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB));
860     transB = false;
861   }
862   PetscCall(PetscLogGpuTimeBegin());
863   PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, csrmatA, transA, csrmatB, transB, ckok->csrmat));
864 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(4, 0, 0)
865   auto spgemmHandle = pdata->kh.get_spgemm_handle();
866   if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(ckok->csrmat)); /* without sort, mat_tests-ex62_14_seqaijkokkos fails */
867 #endif
868 
869   PetscCall(PetscLogGpuTimeEnd());
870   PetscCall(MatSeqAIJKokkosModifyDevice(C));
871   /* shorter version of MatAssemblyEnd_SeqAIJ */
872   c = (Mat_SeqAIJ *)C->data;
873   PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz));
874   PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n"));
875   PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax));
876   c->reallocs         = 0;
877   C->info.mallocs     = 0;
878   C->info.nz_unneeded = 0;
879   C->assembled = C->was_assembled = PETSC_TRUE;
880   C->num_ass++;
881   PetscFunctionReturn(PETSC_SUCCESS);
882 }
883 
884 static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C)
885 {
886   Mat_Product                 *product = C->product;
887   MatProductType               ptype;
888   Mat                          A, B;
889   bool                         transA, transB;
890   Mat_SeqAIJKokkos            *akok, *bkok, *ckok;
891   MatProductData_SeqAIJKokkos *pdata;
892   MPI_Comm                     comm;
893   KokkosCsrMatrix              csrmatA, csrmatB, csrmatC;
894 
895   PetscFunctionBegin;
896   MatCheckProduct(C, 1);
897   PetscCall(PetscObjectGetComm((PetscObject)C, &comm));
898   PetscCheck(!product->data, comm, PETSC_ERR_PLIB, "Product data not empty");
899   A = product->A;
900   B = product->B;
901   PetscCall(MatSeqAIJKokkosSyncDevice(A));
902   PetscCall(MatSeqAIJKokkosSyncDevice(B));
903   akok    = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
904   bkok    = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
905   csrmatA = akok->csrmat;
906   csrmatB = bkok->csrmat;
907 
908   ptype = product->type;
909   // Take advantage of the symmetry if true
910   if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
911     ptype                                          = MATPRODUCT_AB;
912     product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE;
913   }
914   if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
915     ptype                                          = MATPRODUCT_AB;
916     product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE;
917   }
918 
919   switch (ptype) {
920   case MATPRODUCT_AB:
921     transA = false;
922     transB = false;
923     PetscCall(MatSetBlockSizesFromMats(C, A, B));
924     break;
925   case MATPRODUCT_AtB:
926     transA = true;
927     transB = false;
928     if (A->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, A->cmap->bs));
929     if (B->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->cmap->bs));
930     break;
931   case MATPRODUCT_ABt:
932     transA = false;
933     transB = true;
934     if (A->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, A->rmap->bs));
935     if (B->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->rmap->bs));
936     break;
937   default:
938     SETERRQ(comm, PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]);
939   }
940   PetscCallCXX(product->data = pdata = new MatProductData_SeqAIJKokkos());
941   pdata->reusesym = product->api_user;
942 
943   /* TODO: add command line options to select spgemm algorithms */
944   auto spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_DEFAULT; /* default alg is TPL if enabled, otherwise KK */
945 
946   /* CUDA-10.2's spgemm has bugs. We prefer the SpGEMMreuse APIs introduced in cuda-11.4 */
947 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
948   #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
949   spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_KK;
950   #endif
951 #endif
952   PetscCallCXX(pdata->kh.create_spgemm_handle(spgemm_alg));
953 
954   PetscCall(PetscLogGpuTimeBegin());
955   /* TODO: Get rid of the explicit transpose once KK-spgemm implements the transpose option */
956   if (transA) {
957     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA));
958     transA = false;
959   }
960 
961   if (transB) {
962     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB));
963     transB = false;
964   }
965 
966   PetscCallCXX(KokkosSparse::spgemm_symbolic(pdata->kh, csrmatA, transA, csrmatB, transB, csrmatC));
967   /* spgemm_symbolic() only populates C's rowmap, but not C's column indices.
968     So we have to do a fake spgemm_numeric() here to get csrmatC.j_d setup, before
969     calling new Mat_SeqAIJKokkos().
970     TODO: Remove the fake spgemm_numeric() after KK fixed this problem.
971   */
972   PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, csrmatA, transA, csrmatB, transB, csrmatC));
973 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(4, 0, 0)
974   /* Query if KK outputs a sorted matrix. If not, we need to sort it */
975   auto spgemmHandle = pdata->kh.get_spgemm_handle();
976   if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(csrmatC)); /* sort_option defaults to -1 in KK!*/
977 #endif
978   PetscCall(PetscLogGpuTimeEnd());
979 
980   PetscCallCXX(ckok = new Mat_SeqAIJKokkos(csrmatC));
981   PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(C, ckok));
982   C->product->destroy = MatProductDataDestroy_SeqAIJKokkos;
983   PetscFunctionReturn(PETSC_SUCCESS);
984 }
985 
986 /* handles sparse matrix matrix ops */
987 static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat)
988 {
989   Mat_Product *product = mat->product;
990   PetscBool    Biskok = PETSC_FALSE, Ciskok = PETSC_TRUE;
991 
992   PetscFunctionBegin;
993   MatCheckProduct(mat, 1);
994   PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJKOKKOS, &Biskok));
995   if (product->type == MATPRODUCT_ABC) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJKOKKOS, &Ciskok));
996   if (Biskok && Ciskok) {
997     switch (product->type) {
998     case MATPRODUCT_AB:
999     case MATPRODUCT_AtB:
1000     case MATPRODUCT_ABt:
1001       mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos;
1002       break;
1003     case MATPRODUCT_PtAP:
1004     case MATPRODUCT_RARt:
1005     case MATPRODUCT_ABC:
1006       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
1007       break;
1008     default:
1009       break;
1010     }
1011   } else { /* fallback for AIJ */
1012     PetscCall(MatProductSetFromOptions_SeqAIJ(mat));
1013   }
1014   PetscFunctionReturn(PETSC_SUCCESS);
1015 }
1016 
1017 static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a)
1018 {
1019   Mat_SeqAIJKokkos *aijkok;
1020 
1021   PetscFunctionBegin;
1022   PetscCall(PetscLogGpuTimeBegin());
1023   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1024   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1025   KokkosBlas::scal(PetscGetKokkosExecutionSpace(), aijkok->a_dual.view_device(), a, aijkok->a_dual.view_device());
1026   PetscCall(MatSeqAIJKokkosModifyDevice(A));
1027   PetscCall(PetscLogGpuFlops(aijkok->a_dual.extent(0)));
1028   PetscCall(PetscLogGpuTimeEnd());
1029   PetscFunctionReturn(PETSC_SUCCESS);
1030 }
1031 
1032 // add a to A's diagonal (if A is square) or main diagonal (if A is rectangular)
1033 static PetscErrorCode MatShift_SeqAIJKokkos(Mat A, PetscScalar a)
1034 {
1035   Mat_SeqAIJ *aijseq = static_cast<Mat_SeqAIJ *>(A->data);
1036 
1037   PetscFunctionBegin;
1038   if (A->assembled && aijseq->diagonaldense) { // no missing diagonals
1039     PetscInt n = PetscMin(A->rmap->n, A->cmap->n);
1040 
1041     PetscCall(PetscLogGpuTimeBegin());
1042     PetscCall(MatSeqAIJKokkosSyncDevice(A));
1043     const auto  aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1044     const auto &Aa     = aijkok->a_dual.view_device();
1045     const auto &Adiag  = aijkok->diag_dual.view_device();
1046     PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) { Aa(Adiag(i)) += a; }));
1047     PetscCall(MatSeqAIJKokkosModifyDevice(A));
1048     PetscCall(PetscLogGpuFlops(n));
1049     PetscCall(PetscLogGpuTimeEnd());
1050   } else { // need reassembly, very slow!
1051     PetscCall(MatShift_Basic(A, a));
1052   }
1053   PetscFunctionReturn(PETSC_SUCCESS);
1054 }
1055 
1056 static PetscErrorCode MatDiagonalSet_SeqAIJKokkos(Mat Y, Vec D, InsertMode is)
1057 {
1058   Mat_SeqAIJ *aijseq = static_cast<Mat_SeqAIJ *>(Y->data);
1059 
1060   PetscFunctionBegin;
1061   if (Y->assembled && aijseq->diagonaldense) { // no missing diagonals
1062     ConstPetscScalarKokkosView dv;
1063     PetscInt                   n, nv;
1064 
1065     PetscCall(PetscLogGpuTimeBegin());
1066     PetscCall(MatSeqAIJKokkosSyncDevice(Y));
1067     PetscCall(VecGetKokkosView(D, &dv));
1068     PetscCall(VecGetLocalSize(D, &nv));
1069     n = PetscMin(Y->rmap->n, Y->cmap->n);
1070     PetscCheck(n == nv, PetscObjectComm((PetscObject)Y), PETSC_ERR_ARG_SIZ, "Matrix size and vector size do not match");
1071 
1072     const auto  aijkok = static_cast<Mat_SeqAIJKokkos *>(Y->spptr);
1073     const auto &Aa     = aijkok->a_dual.view_device();
1074     const auto &Adiag  = aijkok->diag_dual.view_device();
1075     PetscCallCXX(Kokkos::parallel_for(
1076       Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) {
1077         if (is == INSERT_VALUES) Aa(Adiag(i)) = dv(i);
1078         else Aa(Adiag(i)) += dv(i);
1079       }));
1080     PetscCall(VecRestoreKokkosView(D, &dv));
1081     PetscCall(MatSeqAIJKokkosModifyDevice(Y));
1082     PetscCall(PetscLogGpuFlops(n));
1083     PetscCall(PetscLogGpuTimeEnd());
1084   } else { // need reassembly, very slow!
1085     PetscCall(MatDiagonalSet_Default(Y, D, is));
1086   }
1087   PetscFunctionReturn(PETSC_SUCCESS);
1088 }
1089 
1090 static PetscErrorCode MatDiagonalScale_SeqAIJKokkos(Mat A, Vec ll, Vec rr)
1091 {
1092   Mat_SeqAIJ                *aijseq = static_cast<Mat_SeqAIJ *>(A->data);
1093   PetscInt                   m = A->rmap->n, n = A->cmap->n, nz = aijseq->nz;
1094   ConstPetscScalarKokkosView lv, rv;
1095 
1096   PetscFunctionBegin;
1097   PetscCall(PetscLogGpuTimeBegin());
1098   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1099   const auto  aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1100   const auto &Aa     = aijkok->a_dual.view_device();
1101   const auto &Ai     = aijkok->i_dual.view_device();
1102   const auto &Aj     = aijkok->j_dual.view_device();
1103   if (ll) {
1104     PetscCall(VecGetLocalSize(ll, &m));
1105     PetscCheck(m == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling vector wrong length");
1106     PetscCall(VecGetKokkosView(ll, &lv));
1107     PetscCallCXX(Kokkos::parallel_for( // for each row
1108       Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
1109         PetscInt i   = t.league_rank(); // row i
1110         PetscInt len = Ai(i + 1) - Ai(i);
1111         // scale entries on the row
1112         Kokkos::parallel_for(Kokkos::TeamThreadRange(t, len), [&](PetscInt j) { Aa(Ai(i) + j) *= lv(i); });
1113       }));
1114     PetscCall(VecRestoreKokkosView(ll, &lv));
1115     PetscCall(PetscLogGpuFlops(nz));
1116   }
1117   if (rr) {
1118     PetscCall(VecGetLocalSize(rr, &n));
1119     PetscCheck(n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Right scaling vector wrong length");
1120     PetscCall(VecGetKokkosView(rr, &rv));
1121     PetscCallCXX(Kokkos::parallel_for( // for each nonzero
1122       Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt k) { Aa(k) *= rv(Aj(k)); }));
1123     PetscCall(VecRestoreKokkosView(rr, &lv));
1124     PetscCall(PetscLogGpuFlops(nz));
1125   }
1126   PetscCall(MatSeqAIJKokkosModifyDevice(A));
1127   PetscCall(PetscLogGpuTimeEnd());
1128   PetscFunctionReturn(PETSC_SUCCESS);
1129 }
1130 
1131 static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A)
1132 {
1133   Mat_SeqAIJKokkos *aijkok;
1134 
1135   PetscFunctionBegin;
1136   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1137   if (aijkok) { /* Only zero the device if data is already there */
1138     KokkosBlas::fill(PetscGetKokkosExecutionSpace(), aijkok->a_dual.view_device(), 0.0);
1139     PetscCall(MatSeqAIJKokkosModifyDevice(A));
1140   } else { /* Might be preallocated but not assembled */
1141     PetscCall(MatZeroEntries_SeqAIJ(A));
1142   }
1143   PetscFunctionReturn(PETSC_SUCCESS);
1144 }
1145 
1146 static PetscErrorCode MatGetDiagonal_SeqAIJKokkos(Mat A, Vec x)
1147 {
1148   Mat_SeqAIJKokkos     *aijkok;
1149   PetscInt              n;
1150   PetscScalarKokkosView xv;
1151 
1152   PetscFunctionBegin;
1153   PetscCall(VecGetLocalSize(x, &n));
1154   PetscCheck(n == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector");
1155   PetscCheck(A->factortype == MAT_FACTOR_NONE, PETSC_COMM_SELF, PETSC_ERR_SUP, "MatGetDiagonal_SeqAIJKokkos not supported on factored matrices");
1156 
1157   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1158   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1159 
1160   const auto &Aa    = aijkok->a_dual.view_device();
1161   const auto &Ai    = aijkok->i_dual.view_device();
1162   const auto &Adiag = aijkok->diag_dual.view_device();
1163 
1164   PetscCall(VecGetKokkosViewWrite(x, &xv));
1165   Kokkos::parallel_for(
1166     Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) {
1167       if (Adiag(i) < Ai(i + 1)) xv(i) = Aa(Adiag(i));
1168       else xv(i) = 0;
1169     });
1170   PetscCall(VecRestoreKokkosViewWrite(x, &xv));
1171   PetscFunctionReturn(PETSC_SUCCESS);
1172 }
1173 
1174 /* Get a Kokkos View from a mat of type MatSeqAIJKokkos */
1175 PetscErrorCode MatSeqAIJGetKokkosView(Mat A, ConstMatScalarKokkosView *kv)
1176 {
1177   Mat_SeqAIJKokkos *aijkok;
1178 
1179   PetscFunctionBegin;
1180   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1181   PetscAssertPointer(kv, 2);
1182   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1183   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1184   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1185   *kv    = aijkok->a_dual.view_device();
1186   PetscFunctionReturn(PETSC_SUCCESS);
1187 }
1188 
1189 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, ConstMatScalarKokkosView *kv)
1190 {
1191   PetscFunctionBegin;
1192   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1193   PetscAssertPointer(kv, 2);
1194   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1195   PetscFunctionReturn(PETSC_SUCCESS);
1196 }
1197 
1198 PetscErrorCode MatSeqAIJGetKokkosView(Mat A, MatScalarKokkosView *kv)
1199 {
1200   Mat_SeqAIJKokkos *aijkok;
1201 
1202   PetscFunctionBegin;
1203   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1204   PetscAssertPointer(kv, 2);
1205   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1206   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1207   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1208   *kv    = aijkok->a_dual.view_device();
1209   PetscFunctionReturn(PETSC_SUCCESS);
1210 }
1211 
1212 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, MatScalarKokkosView *kv)
1213 {
1214   PetscFunctionBegin;
1215   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1216   PetscAssertPointer(kv, 2);
1217   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1218   PetscCall(MatSeqAIJKokkosModifyDevice(A));
1219   PetscFunctionReturn(PETSC_SUCCESS);
1220 }
1221 
1222 PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A, MatScalarKokkosView *kv)
1223 {
1224   Mat_SeqAIJKokkos *aijkok;
1225 
1226   PetscFunctionBegin;
1227   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1228   PetscAssertPointer(kv, 2);
1229   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1230   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1231   *kv    = aijkok->a_dual.view_device();
1232   PetscFunctionReturn(PETSC_SUCCESS);
1233 }
1234 
1235 PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A, MatScalarKokkosView *kv)
1236 {
1237   PetscFunctionBegin;
1238   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1239   PetscAssertPointer(kv, 2);
1240   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1241   PetscCall(MatSeqAIJKokkosModifyDevice(A));
1242   PetscFunctionReturn(PETSC_SUCCESS);
1243 }
1244 
1245 PetscErrorCode MatCreateSeqAIJKokkosWithKokkosViews(MPI_Comm comm, PetscInt m, PetscInt n, Kokkos::View<PetscInt *> &i_d, Kokkos::View<PetscInt *> &j_d, Kokkos::View<PetscScalar *> &a_d, Mat *A)
1246 {
1247   Mat_SeqAIJKokkos *akok;
1248 
1249   PetscFunctionBegin;
1250   auto exec = PetscGetKokkosExecutionSpace();
1251   // Create host copies of the input aij
1252   auto i_h = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), i_d);
1253   auto j_h = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), j_d);
1254   // Don't copy the vals to the host now
1255   auto a_h = Kokkos::create_mirror_view(HostMirrorMemorySpace(), a_d);
1256 
1257   MatScalarKokkosDualView a_dual = MatScalarKokkosDualView(a_d, a_h);
1258   // Note we have modified device data so it will copy lazily
1259   a_dual.modify_device();
1260   MatRowMapKokkosDualView i_dual = MatRowMapKokkosDualView(i_d, i_h);
1261   MatColIdxKokkosDualView j_dual = MatColIdxKokkosDualView(j_d, j_h);
1262 
1263   PetscCallCXX(akok = new Mat_SeqAIJKokkos(m, n, j_dual.extent(0), i_dual, j_dual, a_dual));
1264   PetscCall(MatCreate(comm, A));
1265   PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok));
1266   PetscFunctionReturn(PETSC_SUCCESS);
1267 }
1268 
1269 /* Computes Y += alpha X */
1270 static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y, PetscScalar alpha, Mat X, MatStructure pattern)
1271 {
1272   Mat_SeqAIJ              *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data;
1273   Mat_SeqAIJKokkos        *xkok, *ykok, *zkok;
1274   ConstMatScalarKokkosView Xa;
1275   MatScalarKokkosView      Ya;
1276   auto                     exec = PetscGetKokkosExecutionSpace();
1277 
1278   PetscFunctionBegin;
1279   PetscCheckTypeName(Y, MATSEQAIJKOKKOS);
1280   PetscCheckTypeName(X, MATSEQAIJKOKKOS);
1281   PetscCall(MatSeqAIJKokkosSyncDevice(Y));
1282   PetscCall(MatSeqAIJKokkosSyncDevice(X));
1283   PetscCall(PetscLogGpuTimeBegin());
1284 
1285   if (pattern != SAME_NONZERO_PATTERN && x->nz == y->nz) {
1286     PetscBool e;
1287     PetscCall(PetscArraycmp(x->i, y->i, Y->rmap->n + 1, &e));
1288     if (e) {
1289       PetscCall(PetscArraycmp(x->j, y->j, y->nz, &e));
1290       if (e) pattern = SAME_NONZERO_PATTERN;
1291     }
1292   }
1293 
1294   /* cusparseDcsrgeam2() computes C = alpha A + beta B. If one knew sparsity pattern of C, one can skip
1295     cusparseScsrgeam2_bufferSizeExt() / cusparseXcsrgeam2Nnz(), and directly call cusparseScsrgeam2().
1296     If X is SUBSET_NONZERO_PATTERN of Y, we could take advantage of this cusparse feature. However,
1297     KokkosSparse::spadd(alpha,A,beta,B,C) has symbolic and numeric phases, MatAXPY does not.
1298   */
1299   ykok = static_cast<Mat_SeqAIJKokkos *>(Y->spptr);
1300   xkok = static_cast<Mat_SeqAIJKokkos *>(X->spptr);
1301   Xa   = xkok->a_dual.view_device();
1302   Ya   = ykok->a_dual.view_device();
1303 
1304   if (pattern == SAME_NONZERO_PATTERN) {
1305     KokkosBlas::axpy(exec, alpha, Xa, Ya);
1306     PetscCall(MatSeqAIJKokkosModifyDevice(Y));
1307   } else if (pattern == SUBSET_NONZERO_PATTERN) {
1308     MatRowMapKokkosView Xi = xkok->i_dual.view_device(), Yi = ykok->i_dual.view_device();
1309     MatColIdxKokkosView Xj = xkok->j_dual.view_device(), Yj = ykok->j_dual.view_device();
1310 
1311     Kokkos::parallel_for(
1312       Kokkos::TeamPolicy<>(exec, Y->rmap->n, 1), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
1313         PetscInt i = t.league_rank(); // row i
1314         Kokkos::single(Kokkos::PerTeam(t), [=]() {
1315           // Only one thread works in a team
1316           PetscInt p, q = Yi(i);
1317           for (p = Xi(i); p < Xi(i + 1); p++) {          // For each nonzero on row i of X,
1318             while (Xj(p) != Yj(q) && q < Yi(i + 1)) q++; // find the matching nonzero on row i of Y.
1319             if (Xj(p) == Yj(q)) {                        // Found it
1320               Ya(q) += alpha * Xa(p);
1321               q++;
1322             } else {
1323             // If not found, it indicates the input is wrong (X is not a SUBSET_NONZERO_PATTERN of Y).
1324             // Just insert a NaN at the beginning of row i if it is not empty, to make the result wrong.
1325 #if PETSC_PKG_KOKKOS_VERSION_GE(3, 7, 0)
1326               if (Yi(i) != Yi(i + 1)) Ya(Yi(i)) = Kokkos::ArithTraits<PetscScalar>::nan();
1327 #else
1328               if (Yi(i) != Yi(i + 1)) Ya(Yi(i)) = Kokkos::Experimental::nan("1");
1329 #endif
1330             }
1331           }
1332         });
1333       });
1334     PetscCall(MatSeqAIJKokkosModifyDevice(Y));
1335   } else { // different nonzero patterns
1336     Mat             Z;
1337     KokkosCsrMatrix zcsr;
1338     KernelHandle    kh;
1339     kh.create_spadd_handle(true); // X, Y are sorted
1340     KokkosSparse::spadd_symbolic(&kh, xkok->csrmat, ykok->csrmat, zcsr);
1341     KokkosSparse::spadd_numeric(&kh, alpha, xkok->csrmat, (PetscScalar)1.0, ykok->csrmat, zcsr);
1342     zkok = new Mat_SeqAIJKokkos(zcsr);
1343     PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, zkok, &Z));
1344     PetscCall(MatHeaderReplace(Y, &Z));
1345     kh.destroy_spadd_handle();
1346   }
1347   PetscCall(PetscLogGpuTimeEnd());
1348   PetscCall(PetscLogGpuFlops(xkok->a_dual.extent(0) * 2)); // Because we scaled X and then added it to Y
1349   PetscFunctionReturn(PETSC_SUCCESS);
1350 }
1351 
1352 struct MatCOOStruct_SeqAIJKokkos {
1353   PetscCount           n;
1354   PetscCount           Atot;
1355   PetscInt             nz;
1356   PetscCountKokkosView jmap;
1357   PetscCountKokkosView perm;
1358 
1359   MatCOOStruct_SeqAIJKokkos(const MatCOOStruct_SeqAIJ *coo_h)
1360   {
1361     nz   = coo_h->nz;
1362     n    = coo_h->n;
1363     Atot = coo_h->Atot;
1364     jmap = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), PetscCountKokkosViewHost(coo_h->jmap, nz + 1));
1365     perm = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), PetscCountKokkosViewHost(coo_h->perm, Atot));
1366   }
1367 };
1368 
1369 static PetscErrorCode MatCOOStructDestroy_SeqAIJKokkos(void **data)
1370 {
1371   PetscFunctionBegin;
1372   PetscCallCXX(delete static_cast<MatCOOStruct_SeqAIJKokkos *>(*data));
1373   PetscFunctionReturn(PETSC_SUCCESS);
1374 }
1375 
1376 static PetscErrorCode MatSetPreallocationCOO_SeqAIJKokkos(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
1377 {
1378   Mat_SeqAIJKokkos          *akok;
1379   Mat_SeqAIJ                *aseq;
1380   PetscContainer             container_h;
1381   MatCOOStruct_SeqAIJ       *coo_h;
1382   MatCOOStruct_SeqAIJKokkos *coo_d;
1383 
1384   PetscFunctionBegin;
1385   PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, coo_i, coo_j));
1386   aseq = static_cast<Mat_SeqAIJ *>(mat->data);
1387   akok = static_cast<Mat_SeqAIJKokkos *>(mat->spptr);
1388   delete akok;
1389   mat->spptr = akok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, aseq, mat->nonzerostate + 1, PETSC_FALSE);
1390   PetscCall(MatZeroEntries_SeqAIJKokkos(mat));
1391 
1392   // Copy the COO struct to device
1393   PetscCall(PetscObjectQuery((PetscObject)mat, "__PETSc_MatCOOStruct_Host", (PetscObject *)&container_h));
1394   PetscCall(PetscContainerGetPointer(container_h, (void **)&coo_h));
1395   PetscCallCXX(coo_d = new MatCOOStruct_SeqAIJKokkos(coo_h));
1396 
1397   // Put the COO struct in a container and then attach that to the matrix
1398   PetscCall(PetscObjectContainerCompose((PetscObject)mat, "__PETSc_MatCOOStruct_Device", coo_d, MatCOOStructDestroy_SeqAIJKokkos));
1399   PetscFunctionReturn(PETSC_SUCCESS);
1400 }
1401 
1402 static PetscErrorCode MatSetValuesCOO_SeqAIJKokkos(Mat A, const PetscScalar v[], InsertMode imode)
1403 {
1404   MatScalarKokkosView        Aa;
1405   ConstMatScalarKokkosView   kv;
1406   PetscMemType               memtype;
1407   PetscContainer             container;
1408   MatCOOStruct_SeqAIJKokkos *coo;
1409 
1410   PetscFunctionBegin;
1411   PetscCall(PetscObjectQuery((PetscObject)A, "__PETSc_MatCOOStruct_Device", (PetscObject *)&container));
1412   PetscCall(PetscContainerGetPointer(container, (void **)&coo));
1413 
1414   const auto &n    = coo->n;
1415   const auto &Annz = coo->nz;
1416   const auto &jmap = coo->jmap;
1417   const auto &perm = coo->perm;
1418 
1419   PetscCall(PetscGetMemType(v, &memtype));
1420   if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */
1421     kv = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), ConstMatScalarKokkosViewHost(v, n));
1422   } else {
1423     kv = ConstMatScalarKokkosView(v, n); /* Directly use v[]'s memory */
1424   }
1425 
1426   if (imode == INSERT_VALUES) PetscCall(MatSeqAIJGetKokkosViewWrite(A, &Aa)); /* write matrix values */
1427   else PetscCall(MatSeqAIJGetKokkosView(A, &Aa));                             /* read & write matrix values */
1428 
1429   PetscCall(PetscLogGpuTimeBegin());
1430   Kokkos::parallel_for(
1431     Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, Annz), KOKKOS_LAMBDA(const PetscCount i) {
1432       PetscScalar sum = 0.0;
1433       for (PetscCount k = jmap(i); k < jmap(i + 1); k++) sum += kv(perm(k));
1434       Aa(i) = (imode == INSERT_VALUES ? 0.0 : Aa(i)) + sum;
1435     });
1436   PetscCall(PetscLogGpuTimeEnd());
1437 
1438   if (imode == INSERT_VALUES) PetscCall(MatSeqAIJRestoreKokkosViewWrite(A, &Aa));
1439   else PetscCall(MatSeqAIJRestoreKokkosView(A, &Aa));
1440   PetscFunctionReturn(PETSC_SUCCESS);
1441 }
1442 
1443 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A)
1444 {
1445   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
1446 
1447   PetscFunctionBegin;
1448   A->offloadmask = PETSC_OFFLOAD_KOKKOS; /* We do not really use this flag */
1449   A->boundtocpu  = PETSC_FALSE;
1450 
1451   A->ops->assemblyend               = MatAssemblyEnd_SeqAIJKokkos;
1452   A->ops->destroy                   = MatDestroy_SeqAIJKokkos;
1453   A->ops->duplicate                 = MatDuplicate_SeqAIJKokkos;
1454   A->ops->axpy                      = MatAXPY_SeqAIJKokkos;
1455   A->ops->scale                     = MatScale_SeqAIJKokkos;
1456   A->ops->zeroentries               = MatZeroEntries_SeqAIJKokkos;
1457   A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJKokkos;
1458   A->ops->mult                      = MatMult_SeqAIJKokkos;
1459   A->ops->multadd                   = MatMultAdd_SeqAIJKokkos;
1460   A->ops->multtranspose             = MatMultTranspose_SeqAIJKokkos;
1461   A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJKokkos;
1462   A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJKokkos;
1463   A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos;
1464   A->ops->productnumeric            = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos;
1465   A->ops->transpose                 = MatTranspose_SeqAIJKokkos;
1466   A->ops->setoption                 = MatSetOption_SeqAIJKokkos;
1467   A->ops->getdiagonal               = MatGetDiagonal_SeqAIJKokkos;
1468   A->ops->shift                     = MatShift_SeqAIJKokkos;
1469   A->ops->diagonalset               = MatDiagonalSet_SeqAIJKokkos;
1470   A->ops->diagonalscale             = MatDiagonalScale_SeqAIJKokkos;
1471   a->ops->getarray                  = MatSeqAIJGetArray_SeqAIJKokkos;
1472   a->ops->restorearray              = MatSeqAIJRestoreArray_SeqAIJKokkos;
1473   a->ops->getarrayread              = MatSeqAIJGetArrayRead_SeqAIJKokkos;
1474   a->ops->restorearrayread          = MatSeqAIJRestoreArrayRead_SeqAIJKokkos;
1475   a->ops->getarraywrite             = MatSeqAIJGetArrayWrite_SeqAIJKokkos;
1476   a->ops->restorearraywrite         = MatSeqAIJRestoreArrayWrite_SeqAIJKokkos;
1477   a->ops->getcsrandmemtype          = MatSeqAIJGetCSRAndMemType_SeqAIJKokkos;
1478 
1479   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJKokkos));
1480   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJKokkos));
1481 #if defined(PETSC_HAVE_HYPRE)
1482   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijkokkos_hypre_C", MatConvert_AIJ_HYPRE));
1483 #endif
1484   PetscFunctionReturn(PETSC_SUCCESS);
1485 }
1486 
1487 /*
1488    Extract the (prescribled) diagonal blocks of the matrix and then invert them
1489 
1490   Input Parameters:
1491 +  A       - the MATSEQAIJKOKKOS matrix
1492 .  bs      - block sizes in 'csr' format, i.e., the i-th block has size bs(i+1) - bs(i)
1493 .  bs2     - square of block sizes in 'csr' format, i.e., the i-th block should be stored at offset bs2(i) in diagVal[]
1494 .  blkMap  - map row ids to block ids, i.e., row i belongs to the block blkMap(i)
1495 -  work    - a pre-allocated work buffer (as big as diagVal) for use by this routine
1496 
1497   Output Parameter:
1498 .  diagVal - the (pre-allocated) buffer to store the inverted blocks (each block is stored in column-major order)
1499 */
1500 PETSC_INTERN PetscErrorCode MatInvertVariableBlockDiagonal_SeqAIJKokkos(Mat A, const PetscIntKokkosView &bs, const PetscIntKokkosView &bs2, const PetscIntKokkosView &blkMap, PetscScalarKokkosView &work, PetscScalarKokkosView &diagVal)
1501 {
1502   Mat_SeqAIJKokkos *akok    = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1503   PetscInt          nblocks = bs.extent(0) - 1;
1504 
1505   PetscFunctionBegin;
1506   PetscCall(MatSeqAIJKokkosSyncDevice(A)); // Since we'll access A's value on device
1507 
1508   // Pull out the diagonal blocks of the matrix and then invert the blocks
1509   auto Aa    = akok->a_dual.view_device();
1510   auto Ai    = akok->i_dual.view_device();
1511   auto Aj    = akok->j_dual.view_device();
1512   auto Adiag = akok->diag_dual.view_device();
1513   // TODO: how to tune the team size?
1514 #if defined(KOKKOS_ENABLE_UNIFIED_MEMORY)
1515   auto ts = Kokkos::AUTO();
1516 #else
1517   auto ts = 16; // improved performance 30% over Kokkos::AUTO() with CUDA, but failed with "Kokkos::abort: Requested Team Size is too large!" on CPUs
1518 #endif
1519   PetscCallCXX(Kokkos::parallel_for(
1520     Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), nblocks, ts), KOKKOS_LAMBDA(const KokkosTeamMemberType &teamMember) {
1521       const PetscInt bid    = teamMember.league_rank();                                                   // block id
1522       const PetscInt rstart = bs(bid);                                                                    // this block starts from this row
1523       const PetscInt m      = bs(bid + 1) - bs(bid);                                                      // size of this block
1524       const auto    &B      = Kokkos::View<PetscScalar **, Kokkos::LayoutLeft>(&diagVal(bs2(bid)), m, m); // column-major order
1525       const auto    &W      = PetscScalarKokkosView(&work(bs2(bid)), m * m);
1526 
1527       Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, m), [=](const PetscInt &r) { // r-th row in B
1528         PetscInt i = rstart + r;                                                            // i-th row in A
1529 
1530         if (Ai(i) <= Adiag(i) && Adiag(i) < Ai(i + 1)) { // if the diagonal exists (common case)
1531           PetscInt first = Adiag(i) - r;                 // we start to check nonzeros from here along this row
1532 
1533           for (PetscInt c = 0; c < m; c++) {                   // walk n steps to see what column indices we will meet
1534             if (first + c < Ai(i) || first + c >= Ai(i + 1)) { // this entry (first+c) is out of range of this row, in other words, its value is zero
1535               B(r, c) = 0.0;
1536             } else if (Aj(first + c) == rstart + c) { // this entry is right on the (rstart+c) column
1537               B(r, c) = Aa(first + c);
1538             } else { // this entry does not show up in the CSR
1539               B(r, c) = 0.0;
1540             }
1541           }
1542         } else { // rare case that the diagonal does not exist
1543           const PetscInt begin = Ai(i);
1544           const PetscInt end   = Ai(i + 1);
1545           for (PetscInt c = 0; c < m; c++) B(r, c) = 0.0;
1546           for (PetscInt j = begin; j < end; j++) { // scan the whole row; could use binary search but this is a rare case so we did not.
1547             if (rstart <= Aj(j) && Aj(j) < rstart + m) B(r, Aj(j) - rstart) = Aa(j);
1548             else if (Aj(j) >= rstart + m) break;
1549           }
1550         }
1551       });
1552 
1553       // LU-decompose B (w/o pivoting) and then invert B
1554       KokkosBatched::TeamLU<KokkosTeamMemberType, KokkosBatched::Algo::LU::Unblocked>::invoke(teamMember, B, 0.0);
1555       KokkosBatched::TeamInverseLU<KokkosTeamMemberType, KokkosBatched::Algo::InverseLU::Unblocked>::invoke(teamMember, B, W);
1556     }));
1557   // PetscLogGpuFlops() is done in the caller PCSetUp_VPBJacobi_Kokkos as we don't want to compute the flops in kernels
1558   PetscFunctionReturn(PETSC_SUCCESS);
1559 }
1560 
1561 PETSC_INTERN PetscErrorCode MatSetSeqAIJKokkosWithCSRMatrix(Mat A, Mat_SeqAIJKokkos *akok)
1562 {
1563   Mat_SeqAIJ *aseq;
1564   PetscInt    i, m, n;
1565   auto        exec = PetscGetKokkosExecutionSpace();
1566 
1567   PetscFunctionBegin;
1568   PetscCheck(!A->spptr, PETSC_COMM_SELF, PETSC_ERR_PLIB, "A->spptr is supposed to be empty");
1569 
1570   m = akok->nrows();
1571   n = akok->ncols();
1572   PetscCall(MatSetSizes(A, m, n, m, n));
1573   PetscCall(MatSetType(A, MATSEQAIJKOKKOS));
1574 
1575   /* Set up data structures of A as a MATSEQAIJ */
1576   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(A, MAT_SKIP_ALLOCATION, NULL));
1577   aseq = (Mat_SeqAIJ *)A->data;
1578 
1579   PetscCall(KokkosDualViewSyncHost(akok->i_dual, exec)); /* We always need sync'ed i, j on host */
1580   PetscCall(KokkosDualViewSyncHost(akok->j_dual, exec));
1581 
1582   aseq->i       = akok->i_host_data();
1583   aseq->j       = akok->j_host_data();
1584   aseq->a       = akok->a_host_data();
1585   aseq->nonew   = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/
1586   aseq->free_a  = PETSC_FALSE;
1587   aseq->free_ij = PETSC_FALSE;
1588   aseq->nz      = akok->nnz();
1589   aseq->maxnz   = aseq->nz;
1590 
1591   PetscCall(PetscMalloc1(m, &aseq->imax));
1592   PetscCall(PetscMalloc1(m, &aseq->ilen));
1593   for (i = 0; i < m; i++) aseq->ilen[i] = aseq->imax[i] = aseq->i[i + 1] - aseq->i[i];
1594 
1595   /* It is critical to set the nonzerostate, as we use it to check if sparsity pattern (hence data) has changed on host in MatAssemblyEnd */
1596   akok->nonzerostate = A->nonzerostate;
1597   A->spptr           = akok; /* Set A->spptr before MatAssembly so that A->spptr won't be allocated again there */
1598   PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY));
1599   PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY));
1600   PetscFunctionReturn(PETSC_SUCCESS);
1601 }
1602 
1603 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGetKokkosCsrMatrix(Mat A, KokkosCsrMatrix *csr)
1604 {
1605   PetscFunctionBegin;
1606   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1607   *csr = static_cast<Mat_SeqAIJKokkos *>(A->spptr)->csrmat;
1608   PetscFunctionReturn(PETSC_SUCCESS);
1609 }
1610 
1611 PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithKokkosCsrMatrix(MPI_Comm comm, KokkosCsrMatrix csr, Mat *A)
1612 {
1613   Mat_SeqAIJKokkos *akok;
1614 
1615   PetscFunctionBegin;
1616   PetscCallCXX(akok = new Mat_SeqAIJKokkos(csr));
1617   PetscCall(MatCreate(comm, A));
1618   PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok));
1619   PetscFunctionReturn(PETSC_SUCCESS);
1620 }
1621 
1622 /* Crete a SEQAIJKOKKOS matrix with a Mat_SeqAIJKokkos data structure
1623 
1624    Note we have names like MatSeqAIJSetPreallocationCSR, so I use capitalized CSR
1625  */
1626 PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithCSRMatrix(MPI_Comm comm, Mat_SeqAIJKokkos *akok, Mat *A)
1627 {
1628   PetscFunctionBegin;
1629   PetscCall(MatCreate(comm, A));
1630   PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok));
1631   PetscFunctionReturn(PETSC_SUCCESS);
1632 }
1633 
1634 /*@C
1635   MatCreateSeqAIJKokkos - Creates a sparse matrix in `MATSEQAIJKOKKOS` (compressed row) format
1636   (the default parallel PETSc format). This matrix will ultimately be handled by
1637   Kokkos for calculations.
1638 
1639   Collective
1640 
1641   Input Parameters:
1642 + comm - MPI communicator, set to `PETSC_COMM_SELF`
1643 . m    - number of rows
1644 . n    - number of columns
1645 . nz   - number of nonzeros per row (same for all rows), ignored if `nnz` is provided
1646 - nnz  - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL`
1647 
1648   Output Parameter:
1649 . A - the matrix
1650 
1651   Level: intermediate
1652 
1653   Notes:
1654   It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
1655   MatXXXXSetPreallocation() paradgm instead of this routine directly.
1656   [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]
1657 
1658   The AIJ format, also called
1659   compressed row storage, is fully compatible with standard Fortran
1660   storage.  That is, the stored row and column indices can begin at
1661   either one (as in Fortran) or zero.
1662 
1663   Specify the preallocated storage with either `nz` or `nnz` (not both).
1664   Set `nz` = `PETSC_DEFAULT` and `nnz` = `NULL` for PETSc to control dynamic memory
1665   allocation.
1666 
1667 .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`
1668 @*/
1669 PetscErrorCode MatCreateSeqAIJKokkos(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A)
1670 {
1671   PetscFunctionBegin;
1672   PetscCall(PetscKokkosInitializeCheck());
1673   PetscCall(MatCreate(comm, A));
1674   PetscCall(MatSetSizes(*A, m, n, m, n));
1675   PetscCall(MatSetType(*A, MATSEQAIJKOKKOS));
1676   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz));
1677   PetscFunctionReturn(PETSC_SUCCESS);
1678 }
1679 
1680 // After matrix numeric factorization, there are still steps to do before triangular solve can be called.
1681 // For example, for transpose solve, we might need to compute the transpose matrices if the solver does not support it (such as KK, while cusparse does).
1682 // In cusparse, one has to call cusparseSpSV_analysis() with updated triangular matrix values before calling cusparseSpSV_solve().
1683 // Simiarily, in KK sptrsv_symbolic() has to be called before sptrsv_solve(). We put these steps in MatSeqAIJKokkos{Transpose}SolveCheck.
1684 static PetscErrorCode MatSeqAIJKokkosSolveCheck(Mat A)
1685 {
1686   Mat_SeqAIJKokkosTriFactors *factors   = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1687   const PetscBool             has_lower = factors->iL_d.extent(0) ? PETSC_TRUE : PETSC_FALSE; // false with Choleksy
1688   const PetscBool             has_upper = factors->iU_d.extent(0) ? PETSC_TRUE : PETSC_FALSE; // true with LU and Choleksy
1689 
1690   PetscFunctionBegin;
1691   if (!factors->sptrsv_symbolic_completed) { // If sptrsv_symbolic was not called yet
1692     if (has_upper) PetscCallCXX(sptrsv_symbolic(&factors->khU, factors->iU_d, factors->jU_d, factors->aU_d));
1693     if (has_lower) PetscCallCXX(sptrsv_symbolic(&factors->khL, factors->iL_d, factors->jL_d, factors->aL_d));
1694     factors->sptrsv_symbolic_completed = PETSC_TRUE;
1695   }
1696   PetscFunctionReturn(PETSC_SUCCESS);
1697 }
1698 
1699 static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A)
1700 {
1701   const PetscInt              n         = A->rmap->n;
1702   Mat_SeqAIJKokkosTriFactors *factors   = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1703   const PetscBool             has_lower = factors->iL_d.extent(0) ? PETSC_TRUE : PETSC_FALSE; // false with Choleksy
1704   const PetscBool             has_upper = factors->iU_d.extent(0) ? PETSC_TRUE : PETSC_FALSE; // true with LU or Choleksy
1705 
1706   PetscFunctionBegin;
1707   if (!factors->transpose_updated) {
1708     if (has_upper) {
1709       if (!factors->iUt_d.extent(0)) {                                 // Allocate Ut on device if not yet
1710         factors->iUt_d = MatRowMapKokkosView("factors->iUt_d", n + 1); // KK requires this view to be initialized to 0 to call transpose_matrix
1711         factors->jUt_d = MatColIdxKokkosView(NoInit("factors->jUt_d"), factors->jU_d.extent(0));
1712         factors->aUt_d = MatScalarKokkosView(NoInit("factors->aUt_d"), factors->aU_d.extent(0));
1713       }
1714 
1715       if (factors->iU_h.extent(0)) { // If U is on host (factorization was done on host), we also compute the transpose on host
1716         if (!factors->U) {
1717           Mat_SeqAIJ *seq;
1718 
1719           PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF, n, n, factors->iU_h.data(), factors->jU_h.data(), factors->aU_h.data(), &factors->U));
1720           PetscCall(MatTranspose(factors->U, MAT_INITIAL_MATRIX, &factors->Ut));
1721 
1722           seq            = static_cast<Mat_SeqAIJ *>(factors->Ut->data);
1723           factors->iUt_h = MatRowMapKokkosViewHost(seq->i, n + 1);
1724           factors->jUt_h = MatColIdxKokkosViewHost(seq->j, seq->nz);
1725           factors->aUt_h = MatScalarKokkosViewHost(seq->a, seq->nz);
1726         } else {
1727           PetscCall(MatTranspose(factors->U, MAT_REUSE_MATRIX, &factors->Ut)); // Matrix Ut' data is aliased with {i, j, a}Ut_h
1728         }
1729         // Copy Ut from host to device
1730         PetscCallCXX(Kokkos::deep_copy(factors->iUt_d, factors->iUt_h));
1731         PetscCallCXX(Kokkos::deep_copy(factors->jUt_d, factors->jUt_h));
1732         PetscCallCXX(Kokkos::deep_copy(factors->aUt_d, factors->aUt_h));
1733       } else { // If U was computed on device, we also compute the transpose there
1734         // TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices. We have to sort the indices, until KK provides finer control options.
1735         PetscCallCXX(transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iU_d,
1736                                                                                                                                                                                                                                factors->jU_d, factors->aU_d,
1737                                                                                                                                                                                                                                factors->iUt_d, factors->jUt_d,
1738                                                                                                                                                                                                                                factors->aUt_d));
1739         PetscCallCXX(sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iUt_d, factors->jUt_d, factors->aUt_d));
1740       }
1741       PetscCallCXX(sptrsv_symbolic(&factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d));
1742     }
1743 
1744     // do the same for L with LU
1745     if (has_lower) {
1746       if (!factors->iLt_d.extent(0)) {                                 // Allocate Lt on device if not yet
1747         factors->iLt_d = MatRowMapKokkosView("factors->iLt_d", n + 1); // KK requires this view to be initialized to 0 to call transpose_matrix
1748         factors->jLt_d = MatColIdxKokkosView(NoInit("factors->jLt_d"), factors->jL_d.extent(0));
1749         factors->aLt_d = MatScalarKokkosView(NoInit("factors->aLt_d"), factors->aL_d.extent(0));
1750       }
1751 
1752       if (factors->iL_h.extent(0)) { // If L is on host, we also compute the transpose on host
1753         if (!factors->L) {
1754           Mat_SeqAIJ *seq;
1755 
1756           PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF, n, n, factors->iL_h.data(), factors->jL_h.data(), factors->aL_h.data(), &factors->L));
1757           PetscCall(MatTranspose(factors->L, MAT_INITIAL_MATRIX, &factors->Lt));
1758 
1759           seq            = static_cast<Mat_SeqAIJ *>(factors->Lt->data);
1760           factors->iLt_h = MatRowMapKokkosViewHost(seq->i, n + 1);
1761           factors->jLt_h = MatColIdxKokkosViewHost(seq->j, seq->nz);
1762           factors->aLt_h = MatScalarKokkosViewHost(seq->a, seq->nz);
1763         } else {
1764           PetscCall(MatTranspose(factors->L, MAT_REUSE_MATRIX, &factors->Lt)); // Matrix Lt' data is aliased with {i, j, a}Lt_h
1765         }
1766         // Copy Lt from host to device
1767         PetscCallCXX(Kokkos::deep_copy(factors->iLt_d, factors->iLt_h));
1768         PetscCallCXX(Kokkos::deep_copy(factors->jLt_d, factors->jLt_h));
1769         PetscCallCXX(Kokkos::deep_copy(factors->aLt_d, factors->aLt_h));
1770       } else { // If L was computed on device, we also compute the transpose there
1771         // TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices. We have to sort the indices, until KK provides finer control options.
1772         PetscCallCXX(transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iL_d,
1773                                                                                                                                                                                                                                factors->jL_d, factors->aL_d,
1774                                                                                                                                                                                                                                factors->iLt_d, factors->jLt_d,
1775                                                                                                                                                                                                                                factors->aLt_d));
1776         PetscCallCXX(sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iLt_d, factors->jLt_d, factors->aLt_d));
1777       }
1778       PetscCallCXX(sptrsv_symbolic(&factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d));
1779     }
1780 
1781     factors->transpose_updated = PETSC_TRUE;
1782   }
1783   PetscFunctionReturn(PETSC_SUCCESS);
1784 }
1785 
1786 // Solve Ax = b, with RAR = U^T D U, where R is the row (and col) permutation matrix on A.
1787 // R is represented by rowperm in factors. If R is identity (i.e, no reordering), then rowperm is empty.
1788 static PetscErrorCode MatSolve_SeqAIJKokkos_Cholesky(Mat A, Vec bb, Vec xx)
1789 {
1790   auto                        exec    = PetscGetKokkosExecutionSpace();
1791   Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1792   PetscInt                    m       = A->rmap->n;
1793   PetscScalarKokkosView       D       = factors->D_d;
1794   PetscScalarKokkosView       X, Y, B; // alias
1795   ConstPetscScalarKokkosView  b;
1796   PetscScalarKokkosView       x;
1797   PetscIntKokkosView         &rowperm  = factors->rowperm;
1798   PetscBool                   identity = rowperm.extent(0) ? PETSC_FALSE : PETSC_TRUE;
1799 
1800   PetscFunctionBegin;
1801   PetscCall(PetscLogGpuTimeBegin());
1802   PetscCall(MatSeqAIJKokkosSolveCheck(A));          // for UX = T
1803   PetscCall(MatSeqAIJKokkosTransposeSolveCheck(A)); // for U^T Y = B
1804   PetscCall(VecGetKokkosView(bb, &b));
1805   PetscCall(VecGetKokkosViewWrite(xx, &x));
1806 
1807   // Solve U^T Y = B
1808   if (identity) { // Reorder b with the row permutation
1809     B = PetscScalarKokkosView(const_cast<PetscScalar *>(b.data()), b.extent(0));
1810     Y = factors->workVector;
1811   } else {
1812     B = factors->workVector;
1813     PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { B(i) = b(rowperm(i)); }));
1814     Y = x;
1815   }
1816   PetscCallCXX(sptrsv_solve(exec, &factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d, B, Y));
1817 
1818   // Solve diag(D) Y' = Y.
1819   // Actually just do Y' = Y*D since D is already inverted in MatCholeskyFactorNumeric_SeqAIJ(). It is basically a vector element-wise multiplication.
1820   PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { Y(i) = Y(i) * D(i); }));
1821 
1822   // Solve UX = Y
1823   if (identity) {
1824     X = x;
1825   } else {
1826     X = factors->workVector; // B is not needed anymore
1827   }
1828   PetscCallCXX(sptrsv_solve(exec, &factors->khU, factors->iU_d, factors->jU_d, factors->aU_d, Y, X));
1829 
1830   // Reorder X with the inverse column (row) permutation
1831   if (!identity) {
1832     PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { x(rowperm(i)) = X(i); }));
1833   }
1834 
1835   PetscCall(VecRestoreKokkosView(bb, &b));
1836   PetscCall(VecRestoreKokkosViewWrite(xx, &x));
1837   PetscCall(PetscLogGpuTimeEnd());
1838   PetscFunctionReturn(PETSC_SUCCESS);
1839 }
1840 
1841 // Solve Ax = b, with RAC = LU, where R and C are row and col permutation matrices on A respectively.
1842 // R and C are represented by rowperm and colperm in factors.
1843 // If R or C is identity (i.e, no reordering), then rowperm or colperm is empty.
1844 static PetscErrorCode MatSolve_SeqAIJKokkos_LU(Mat A, Vec bb, Vec xx)
1845 {
1846   auto                        exec    = PetscGetKokkosExecutionSpace();
1847   Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1848   PetscInt                    m       = A->rmap->n;
1849   PetscScalarKokkosView       X, Y, B; // alias
1850   ConstPetscScalarKokkosView  b;
1851   PetscScalarKokkosView       x;
1852   PetscIntKokkosView         &rowperm      = factors->rowperm;
1853   PetscIntKokkosView         &colperm      = factors->colperm;
1854   PetscBool                   row_identity = rowperm.extent(0) ? PETSC_FALSE : PETSC_TRUE;
1855   PetscBool                   col_identity = colperm.extent(0) ? PETSC_FALSE : PETSC_TRUE;
1856 
1857   PetscFunctionBegin;
1858   PetscCall(PetscLogGpuTimeBegin());
1859   PetscCall(MatSeqAIJKokkosSolveCheck(A));
1860   PetscCall(VecGetKokkosView(bb, &b));
1861   PetscCall(VecGetKokkosViewWrite(xx, &x));
1862 
1863   // Solve L Y = B (i.e., L (U C^- x) = R b).  R b indicates applying the row permutation on b.
1864   if (row_identity) {
1865     B = PetscScalarKokkosView(const_cast<PetscScalar *>(b.data()), b.extent(0));
1866     Y = factors->workVector;
1867   } else {
1868     B = factors->workVector;
1869     PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { B(i) = b(rowperm(i)); }));
1870     Y = x;
1871   }
1872   PetscCallCXX(sptrsv_solve(exec, &factors->khL, factors->iL_d, factors->jL_d, factors->aL_d, B, Y));
1873 
1874   // Solve U C^- x = Y
1875   if (col_identity) {
1876     X = x;
1877   } else {
1878     X = factors->workVector;
1879   }
1880   PetscCallCXX(sptrsv_solve(exec, &factors->khU, factors->iU_d, factors->jU_d, factors->aU_d, Y, X));
1881 
1882   // x = C X; Reorder X with the inverse col permutation
1883   if (!col_identity) {
1884     PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { x(colperm(i)) = X(i); }));
1885   }
1886 
1887   PetscCall(VecRestoreKokkosView(bb, &b));
1888   PetscCall(VecRestoreKokkosViewWrite(xx, &x));
1889   PetscCall(PetscLogGpuTimeEnd());
1890   PetscFunctionReturn(PETSC_SUCCESS);
1891 }
1892 
1893 // Solve A^T x = b, with RAC = LU, where R and C are row and col permutation matrices on A respectively.
1894 // R and C are represented by rowperm and colperm in factors.
1895 // If R or C is identity (i.e, no reordering), then rowperm or colperm is empty.
1896 // A = R^-1 L U C^-1, so A^T = C^-T U^T L^T R^-T. But since C^- = C^T, R^- = R^T, we have A^T = C U^T L^T R.
1897 static PetscErrorCode MatSolveTranspose_SeqAIJKokkos_LU(Mat A, Vec bb, Vec xx)
1898 {
1899   auto                        exec    = PetscGetKokkosExecutionSpace();
1900   Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1901   PetscInt                    m       = A->rmap->n;
1902   PetscScalarKokkosView       X, Y, B; // alias
1903   ConstPetscScalarKokkosView  b;
1904   PetscScalarKokkosView       x;
1905   PetscIntKokkosView         &rowperm      = factors->rowperm;
1906   PetscIntKokkosView         &colperm      = factors->colperm;
1907   PetscBool                   row_identity = rowperm.extent(0) ? PETSC_FALSE : PETSC_TRUE;
1908   PetscBool                   col_identity = colperm.extent(0) ? PETSC_FALSE : PETSC_TRUE;
1909 
1910   PetscFunctionBegin;
1911   PetscCall(PetscLogGpuTimeBegin());
1912   PetscCall(MatSeqAIJKokkosTransposeSolveCheck(A)); // Update L^T, U^T if needed, and do sptrsv symbolic for L^T, U^T
1913   PetscCall(VecGetKokkosView(bb, &b));
1914   PetscCall(VecGetKokkosViewWrite(xx, &x));
1915 
1916   // Solve U^T Y = B (i.e., U^T (L^T R x) = C^- b).  Note C^- b = C^T b, which means applying the column permutation on b.
1917   if (col_identity) { // Reorder b with the col permutation
1918     B = PetscScalarKokkosView(const_cast<PetscScalar *>(b.data()), b.extent(0));
1919     Y = factors->workVector;
1920   } else {
1921     B = factors->workVector;
1922     PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { B(i) = b(colperm(i)); }));
1923     Y = x;
1924   }
1925   PetscCallCXX(sptrsv_solve(exec, &factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d, B, Y));
1926 
1927   // Solve L^T X = Y
1928   if (row_identity) {
1929     X = x;
1930   } else {
1931     X = factors->workVector;
1932   }
1933   PetscCallCXX(sptrsv_solve(exec, &factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d, Y, X));
1934 
1935   // x = R^- X = R^T X; Reorder X with the inverse row permutation
1936   if (!row_identity) {
1937     PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { x(rowperm(i)) = X(i); }));
1938   }
1939 
1940   PetscCall(VecRestoreKokkosView(bb, &b));
1941   PetscCall(VecRestoreKokkosViewWrite(xx, &x));
1942   PetscCall(PetscLogGpuTimeEnd());
1943   PetscFunctionReturn(PETSC_SUCCESS);
1944 }
1945 
1946 static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info)
1947 {
1948   PetscFunctionBegin;
1949   PetscCall(MatSeqAIJKokkosSyncHost(A));
1950   PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info));
1951 
1952   if (!info->solveonhost) { // if solve on host, then we don't need to copy L, U to device
1953     Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr;
1954     Mat_SeqAIJ                 *b       = static_cast<Mat_SeqAIJ *>(B->data);
1955     const PetscInt             *Bi = b->i, *Bj = b->j, *Bdiag = b->diag;
1956     const MatScalar            *Ba = b->a;
1957     PetscInt                    m = B->rmap->n, n = B->cmap->n;
1958 
1959     if (factors->iL_h.extent(0) == 0) { // Allocate memory and copy the L, U structure for the first time
1960       // Allocate memory and copy the structure
1961       factors->iL_h = MatRowMapKokkosViewHost(NoInit("iL_h"), m + 1);
1962       factors->jL_h = MatColIdxKokkosViewHost(NoInit("jL_h"), (Bi[m] - Bi[0]) + m); // + the diagonal entries
1963       factors->aL_h = MatScalarKokkosViewHost(NoInit("aL_h"), (Bi[m] - Bi[0]) + m);
1964       factors->iU_h = MatRowMapKokkosViewHost(NoInit("iU_h"), m + 1);
1965       factors->jU_h = MatColIdxKokkosViewHost(NoInit("jU_h"), (Bdiag[0] - Bdiag[m]));
1966       factors->aU_h = MatScalarKokkosViewHost(NoInit("aU_h"), (Bdiag[0] - Bdiag[m]));
1967 
1968       PetscInt *Li = factors->iL_h.data();
1969       PetscInt *Lj = factors->jL_h.data();
1970       PetscInt *Ui = factors->iU_h.data();
1971       PetscInt *Uj = factors->jU_h.data();
1972 
1973       Li[0] = Ui[0] = 0;
1974       for (PetscInt i = 0; i < m; i++) {
1975         PetscInt llen = Bi[i + 1] - Bi[i];       // exclusive of the diagonal entry
1976         PetscInt ulen = Bdiag[i] - Bdiag[i + 1]; // inclusive of the diagonal entry
1977 
1978         PetscArraycpy(Lj + Li[i], Bj + Bi[i], llen); // entries of L on the left of the diagonal
1979         Lj[Li[i] + llen] = i;                        // diagonal entry of L
1980 
1981         Uj[Ui[i]] = i;                                                  // diagonal entry of U
1982         PetscArraycpy(Uj + Ui[i] + 1, Bj + Bdiag[i + 1] + 1, ulen - 1); // entries of U on  the right of the diagonal
1983 
1984         Li[i + 1] = Li[i] + llen + 1;
1985         Ui[i + 1] = Ui[i] + ulen;
1986       }
1987 
1988       factors->iL_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->iL_h);
1989       factors->jL_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->jL_h);
1990       factors->iU_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->iU_h);
1991       factors->jU_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->jU_h);
1992       factors->aL_d = Kokkos::create_mirror_view(DefaultMemorySpace(), factors->aL_h);
1993       factors->aU_d = Kokkos::create_mirror_view(DefaultMemorySpace(), factors->aU_h);
1994 
1995       // Copy row/col permutation to device
1996       IS        rowperm = ((Mat_SeqAIJ *)B->data)->row;
1997       PetscBool row_identity;
1998       PetscCall(ISIdentity(rowperm, &row_identity));
1999       if (!row_identity) {
2000         const PetscInt *ip;
2001 
2002         PetscCall(ISGetIndices(rowperm, &ip));
2003         factors->rowperm = PetscIntKokkosView(NoInit("rowperm"), m);
2004         PetscCallCXX(Kokkos::deep_copy(factors->rowperm, PetscIntKokkosViewHost(const_cast<PetscInt *>(ip), m)));
2005         PetscCall(ISRestoreIndices(rowperm, &ip));
2006         PetscCall(PetscLogCpuToGpu(m * sizeof(PetscInt)));
2007       }
2008 
2009       IS        colperm = ((Mat_SeqAIJ *)B->data)->col;
2010       PetscBool col_identity;
2011       PetscCall(ISIdentity(colperm, &col_identity));
2012       if (!col_identity) {
2013         const PetscInt *ip;
2014 
2015         PetscCall(ISGetIndices(colperm, &ip));
2016         factors->colperm = PetscIntKokkosView(NoInit("colperm"), n);
2017         PetscCallCXX(Kokkos::deep_copy(factors->colperm, PetscIntKokkosViewHost(const_cast<PetscInt *>(ip), n)));
2018         PetscCall(ISRestoreIndices(colperm, &ip));
2019         PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
2020       }
2021 
2022       /* Create sptrsv handles for L, U and their transpose */
2023 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
2024       auto sptrsv_alg = SPTRSVAlgorithm::SPTRSV_CUSPARSE;
2025 #else
2026       auto sptrsv_alg = SPTRSVAlgorithm::SEQLVLSCHD_TP1;
2027 #endif
2028       factors->khL.create_sptrsv_handle(sptrsv_alg, m, true /* L is lower tri */);
2029       factors->khU.create_sptrsv_handle(sptrsv_alg, m, false /* U is not lower tri */);
2030       factors->khLt.create_sptrsv_handle(sptrsv_alg, m, false /* L^T is not lower tri */);
2031       factors->khUt.create_sptrsv_handle(sptrsv_alg, m, true /* U^T is lower tri */);
2032     }
2033 
2034     // Copy the value
2035     for (PetscInt i = 0; i < m; i++) {
2036       PetscInt        llen = Bi[i + 1] - Bi[i];
2037       PetscInt        ulen = Bdiag[i] - Bdiag[i + 1];
2038       const PetscInt *Li   = factors->iL_h.data();
2039       const PetscInt *Ui   = factors->iU_h.data();
2040 
2041       PetscScalar *La = factors->aL_h.data();
2042       PetscScalar *Ua = factors->aU_h.data();
2043 
2044       PetscArraycpy(La + Li[i], Ba + Bi[i], llen); // entries of L
2045       La[Li[i] + llen] = 1.0;                      // diagonal entry
2046 
2047       Ua[Ui[i]] = 1.0 / Ba[Bdiag[i]];                                 // diagonal entry
2048       PetscArraycpy(Ua + Ui[i] + 1, Ba + Bdiag[i + 1] + 1, ulen - 1); // entries of U
2049     }
2050 
2051     PetscCallCXX(Kokkos::deep_copy(factors->aL_d, factors->aL_h));
2052     PetscCallCXX(Kokkos::deep_copy(factors->aU_d, factors->aU_h));
2053     // Once the factors' values have changed, we need to update their transpose and redo sptrsv symbolic
2054     factors->transpose_updated         = PETSC_FALSE;
2055     factors->sptrsv_symbolic_completed = PETSC_FALSE;
2056 
2057     B->ops->solve          = MatSolve_SeqAIJKokkos_LU;
2058     B->ops->solvetranspose = MatSolveTranspose_SeqAIJKokkos_LU;
2059   }
2060 
2061   B->ops->matsolve          = NULL;
2062   B->ops->matsolvetranspose = NULL;
2063   PetscFunctionReturn(PETSC_SUCCESS);
2064 }
2065 
2066 static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos_ILU0(Mat B, Mat A, const MatFactorInfo *info)
2067 {
2068   Mat_SeqAIJKokkos           *aijkok   = (Mat_SeqAIJKokkos *)A->spptr;
2069   Mat_SeqAIJKokkosTriFactors *factors  = (Mat_SeqAIJKokkosTriFactors *)B->spptr;
2070   PetscInt                    fill_lev = info->levels;
2071 
2072   PetscFunctionBegin;
2073   PetscCall(PetscLogGpuTimeBegin());
2074   PetscCheck(!info->factoronhost, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "MatFactorInfo.factoronhost should be false");
2075   PetscCall(MatSeqAIJKokkosSyncDevice(A));
2076 
2077   auto a_d = aijkok->a_dual.view_device();
2078   auto i_d = aijkok->i_dual.view_device();
2079   auto j_d = aijkok->j_dual.view_device();
2080 
2081   PetscCallCXX(spiluk_numeric(&factors->kh, fill_lev, i_d, j_d, a_d, factors->iL_d, factors->jL_d, factors->aL_d, factors->iU_d, factors->jU_d, factors->aU_d));
2082 
2083   B->assembled              = PETSC_TRUE;
2084   B->preallocated           = PETSC_TRUE;
2085   B->ops->solve             = MatSolve_SeqAIJKokkos_LU;
2086   B->ops->solvetranspose    = MatSolveTranspose_SeqAIJKokkos_LU;
2087   B->ops->matsolve          = NULL;
2088   B->ops->matsolvetranspose = NULL;
2089 
2090   /* Once the factors' value changed, we need to update their transpose and sptrsv handle */
2091   factors->transpose_updated         = PETSC_FALSE;
2092   factors->sptrsv_symbolic_completed = PETSC_FALSE;
2093   /* TODO: log flops, but how to know that? */
2094   PetscCall(PetscLogGpuTimeEnd());
2095   PetscFunctionReturn(PETSC_SUCCESS);
2096 }
2097 
2098 // Use KK's spiluk_symbolic() to do ILU0 symbolic factorization, with no row/col reordering
2099 static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos_ILU0(Mat B, Mat A, IS, IS, const MatFactorInfo *info)
2100 {
2101   Mat_SeqAIJKokkos           *aijkok;
2102   Mat_SeqAIJ                 *b;
2103   Mat_SeqAIJKokkosTriFactors *factors  = (Mat_SeqAIJKokkosTriFactors *)B->spptr;
2104   PetscInt                    fill_lev = info->levels;
2105   PetscInt                    nnzA     = ((Mat_SeqAIJ *)A->data)->nz, nnzL, nnzU;
2106   PetscInt                    n        = A->rmap->n;
2107 
2108   PetscFunctionBegin;
2109   PetscCheck(!info->factoronhost, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "MatFactorInfo's factoronhost should be false as we are doing it on device right now");
2110   PetscCall(MatSeqAIJKokkosSyncDevice(A));
2111 
2112   /* Create a spiluk handle and then do symbolic factorization */
2113   nnzL = nnzU = PetscRealIntMultTruncate(info->fill, nnzA);
2114   factors->kh.create_spiluk_handle(SPILUKAlgorithm::SEQLVLSCHD_TP1, n, nnzL, nnzU);
2115 
2116   auto spiluk_handle = factors->kh.get_spiluk_handle();
2117 
2118   Kokkos::realloc(factors->iL_d, n + 1); /* Free old arrays and realloc */
2119   Kokkos::realloc(factors->jL_d, spiluk_handle->get_nnzL());
2120   Kokkos::realloc(factors->iU_d, n + 1);
2121   Kokkos::realloc(factors->jU_d, spiluk_handle->get_nnzU());
2122 
2123   aijkok   = (Mat_SeqAIJKokkos *)A->spptr;
2124   auto i_d = aijkok->i_dual.view_device();
2125   auto j_d = aijkok->j_dual.view_device();
2126   PetscCallCXX(spiluk_symbolic(&factors->kh, fill_lev, i_d, j_d, factors->iL_d, factors->jL_d, factors->iU_d, factors->jU_d));
2127   /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */
2128 
2129   Kokkos::resize(factors->jL_d, spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */
2130   Kokkos::resize(factors->jU_d, spiluk_handle->get_nnzU());
2131   Kokkos::realloc(factors->aL_d, spiluk_handle->get_nnzL()); /* No need to retain old value */
2132   Kokkos::realloc(factors->aU_d, spiluk_handle->get_nnzU());
2133 
2134   /* TODO: add options to select sptrsv algorithms */
2135   /* Create sptrsv handles for L, U and their transpose */
2136 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
2137   auto sptrsv_alg = SPTRSVAlgorithm::SPTRSV_CUSPARSE;
2138 #else
2139   auto sptrsv_alg = SPTRSVAlgorithm::SEQLVLSCHD_TP1;
2140 #endif
2141 
2142   factors->khL.create_sptrsv_handle(sptrsv_alg, n, true /* L is lower tri */);
2143   factors->khU.create_sptrsv_handle(sptrsv_alg, n, false /* U is not lower tri */);
2144   factors->khLt.create_sptrsv_handle(sptrsv_alg, n, false /* L^T is not lower tri */);
2145   factors->khUt.create_sptrsv_handle(sptrsv_alg, n, true /* U^T is lower tri */);
2146 
2147   /* Fill fields of the factor matrix B */
2148   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(B, MAT_SKIP_ALLOCATION, NULL));
2149   b     = (Mat_SeqAIJ *)B->data;
2150   b->nz = b->maxnz          = spiluk_handle->get_nnzL() + spiluk_handle->get_nnzU();
2151   B->info.fill_ratio_given  = info->fill;
2152   B->info.fill_ratio_needed = nnzA > 0 ? ((PetscReal)b->nz) / ((PetscReal)nnzA) : 1.0;
2153 
2154   B->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJKokkos_ILU0;
2155   PetscFunctionReturn(PETSC_SUCCESS);
2156 }
2157 
2158 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
2159 {
2160   PetscFunctionBegin;
2161   PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
2162   PetscCheck(!B->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr");
2163   PetscCallCXX(B->spptr = new Mat_SeqAIJKokkosTriFactors(B->rmap->n));
2164   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos;
2165   PetscFunctionReturn(PETSC_SUCCESS);
2166 }
2167 
2168 static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
2169 {
2170   PetscBool row_identity = PETSC_FALSE, col_identity = PETSC_FALSE;
2171 
2172   PetscFunctionBegin;
2173   if (!info->factoronhost) {
2174     PetscCall(ISIdentity(isrow, &row_identity));
2175     PetscCall(ISIdentity(iscol, &col_identity));
2176   }
2177 
2178   PetscCheck(!B->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr");
2179   PetscCallCXX(B->spptr = new Mat_SeqAIJKokkosTriFactors(B->rmap->n));
2180 
2181   if (!info->factoronhost && !info->levels && row_identity && col_identity) { // if level 0 and no reordering
2182     PetscCall(MatILUFactorSymbolic_SeqAIJKokkos_ILU0(B, A, isrow, iscol, info));
2183   } else {
2184     PetscCall(MatILUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info)); // otherwise, use PETSc's ILU on host
2185     B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos;
2186   }
2187   PetscFunctionReturn(PETSC_SUCCESS);
2188 }
2189 
2190 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info)
2191 {
2192   PetscFunctionBegin;
2193   PetscCall(MatSeqAIJKokkosSyncHost(A));
2194   PetscCall(MatCholeskyFactorNumeric_SeqAIJ(B, A, info));
2195 
2196   if (!info->solveonhost) { // if solve on host, then we don't need to copy L, U to device
2197     Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr;
2198     Mat_SeqAIJ                 *b       = static_cast<Mat_SeqAIJ *>(B->data);
2199     const PetscInt             *Bi = b->i, *Bj = b->j, *Bdiag = b->diag;
2200     const MatScalar            *Ba = b->a;
2201     PetscInt                    m  = B->rmap->n;
2202 
2203     if (factors->iU_h.extent(0) == 0) { // First time of numeric factorization
2204       // Allocate memory and copy the structure
2205       factors->iU_h = PetscIntKokkosViewHost(const_cast<PetscInt *>(Bi), m + 1); // wrap Bi as iU_h
2206       factors->jU_h = MatColIdxKokkosViewHost(NoInit("jU_h"), Bi[m]);
2207       factors->aU_h = MatScalarKokkosViewHost(NoInit("aU_h"), Bi[m]);
2208       factors->D_h  = MatScalarKokkosViewHost(NoInit("D_h"), m);
2209       factors->aU_d = Kokkos::create_mirror_view(DefaultMemorySpace(), factors->aU_h);
2210       factors->D_d  = Kokkos::create_mirror_view(DefaultMemorySpace(), factors->D_h);
2211 
2212       // Build jU_h from the skewed Aj
2213       PetscInt *Uj = factors->jU_h.data();
2214       for (PetscInt i = 0; i < m; i++) {
2215         PetscInt ulen = Bi[i + 1] - Bi[i];
2216         Uj[Bi[i]]     = i;                                              // diagonal entry
2217         PetscCall(PetscArraycpy(Uj + Bi[i] + 1, Bj + Bi[i], ulen - 1)); // entries of U on the right of the diagonal
2218       }
2219 
2220       // Copy iU, jU to device
2221       PetscCallCXX(factors->iU_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->iU_h));
2222       PetscCallCXX(factors->jU_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->jU_h));
2223 
2224       // Copy row/col permutation to device
2225       IS        rowperm = ((Mat_SeqAIJ *)B->data)->row;
2226       PetscBool row_identity;
2227       PetscCall(ISIdentity(rowperm, &row_identity));
2228       if (!row_identity) {
2229         const PetscInt *ip;
2230 
2231         PetscCall(ISGetIndices(rowperm, &ip));
2232         PetscCallCXX(factors->rowperm = PetscIntKokkosView(NoInit("rowperm"), m));
2233         PetscCallCXX(Kokkos::deep_copy(factors->rowperm, PetscIntKokkosViewHost(const_cast<PetscInt *>(ip), m)));
2234         PetscCall(ISRestoreIndices(rowperm, &ip));
2235         PetscCall(PetscLogCpuToGpu(m * sizeof(PetscInt)));
2236       }
2237 
2238       // Create sptrsv handles for U and U^T
2239 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
2240       auto sptrsv_alg = SPTRSVAlgorithm::SPTRSV_CUSPARSE;
2241 #else
2242       auto sptrsv_alg = SPTRSVAlgorithm::SEQLVLSCHD_TP1;
2243 #endif
2244       factors->khU.create_sptrsv_handle(sptrsv_alg, m, false /* U is not lower tri */);
2245       factors->khUt.create_sptrsv_handle(sptrsv_alg, m, true /* U^T is lower tri */);
2246     }
2247     // These pointers were set MatCholeskyFactorNumeric_SeqAIJ(), so we always need to update them
2248     B->ops->solve          = MatSolve_SeqAIJKokkos_Cholesky;
2249     B->ops->solvetranspose = MatSolve_SeqAIJKokkos_Cholesky;
2250 
2251     // Copy the value
2252     PetscScalar *Ua = factors->aU_h.data();
2253     PetscScalar *D  = factors->D_h.data();
2254     for (PetscInt i = 0; i < m; i++) {
2255       D[i]      = Ba[Bdiag[i]];     // actually Aa[Adiag[i]] is the inverse of the diagonal
2256       Ua[Bi[i]] = (PetscScalar)1.0; // set the unit diagonal for U
2257       for (PetscInt k = 0; k < Bi[i + 1] - Bi[i] - 1; k++) Ua[Bi[i] + 1 + k] = -Ba[Bi[i] + k];
2258     }
2259     PetscCallCXX(Kokkos::deep_copy(factors->aU_d, factors->aU_h));
2260     PetscCallCXX(Kokkos::deep_copy(factors->D_d, factors->D_h));
2261 
2262     factors->sptrsv_symbolic_completed = PETSC_FALSE; // When numeric value changed, we must do these again
2263     factors->transpose_updated         = PETSC_FALSE;
2264   }
2265 
2266   B->ops->matsolve          = NULL;
2267   B->ops->matsolvetranspose = NULL;
2268   PetscFunctionReturn(PETSC_SUCCESS);
2269 }
2270 
2271 static PetscErrorCode MatICCFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS perm, const MatFactorInfo *info)
2272 {
2273   PetscFunctionBegin;
2274   if (info->solveonhost) {
2275     // If solve on host, we have to change the type, as eventually we need to call MatSolve_SeqSBAIJ_1_NaturalOrdering() etc.
2276     PetscCall(MatSetType(B, MATSEQSBAIJ));
2277     PetscCall(MatSeqSBAIJSetPreallocation(B, 1, MAT_SKIP_ALLOCATION, NULL));
2278   }
2279 
2280   PetscCall(MatICCFactorSymbolic_SeqAIJ(B, A, perm, info));
2281 
2282   if (!info->solveonhost) {
2283     // If solve on device, B is still a MATSEQAIJKOKKOS, so we are good to allocate B->spptr
2284     PetscCheck(!B->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr");
2285     PetscCallCXX(B->spptr = new Mat_SeqAIJKokkosTriFactors(B->rmap->n));
2286     B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJKokkos;
2287   }
2288   PetscFunctionReturn(PETSC_SUCCESS);
2289 }
2290 
2291 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS perm, const MatFactorInfo *info)
2292 {
2293   PetscFunctionBegin;
2294   if (info->solveonhost) {
2295     // If solve on host, we have to change the type, as eventually we need to call MatSolve_SeqSBAIJ_1_NaturalOrdering() etc.
2296     PetscCall(MatSetType(B, MATSEQSBAIJ));
2297     PetscCall(MatSeqSBAIJSetPreallocation(B, 1, MAT_SKIP_ALLOCATION, NULL));
2298   }
2299 
2300   PetscCall(MatCholeskyFactorSymbolic_SeqAIJ(B, A, perm, info)); // it sets B's two ISes ((Mat_SeqAIJ*)B->data)->{row, col} to perm
2301 
2302   if (!info->solveonhost) {
2303     // If solve on device, B is still a MATSEQAIJKOKKOS, so we are good to allocate B->spptr
2304     PetscCheck(!B->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr");
2305     PetscCallCXX(B->spptr = new Mat_SeqAIJKokkosTriFactors(B->rmap->n));
2306     B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJKokkos;
2307   }
2308   PetscFunctionReturn(PETSC_SUCCESS);
2309 }
2310 
2311 // The _Kokkos suffix means we will use Kokkos as a solver for the SeqAIJKokkos matrix
2312 static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos_Kokkos(Mat A, MatSolverType *type)
2313 {
2314   PetscFunctionBegin;
2315   *type = MATSOLVERKOKKOS;
2316   PetscFunctionReturn(PETSC_SUCCESS);
2317 }
2318 
2319 /*MC
2320   MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices
2321   on a single GPU of type, `MATSEQAIJKOKKOS`, `MATAIJKOKKOS`.
2322 
2323   Level: beginner
2324 
2325 .seealso: [](ch_matrices), `Mat`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJKokkos()`, `MATAIJKOKKOS`, `MatKokkosSetFormat()`, `MatKokkosStorageFormat`, `MatKokkosFormatOperation`
2326 M*/
2327 PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A, MatFactorType ftype, Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */
2328 {
2329   PetscInt n = A->rmap->n;
2330   MPI_Comm comm;
2331 
2332   PetscFunctionBegin;
2333   PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
2334   PetscCall(MatCreate(comm, B));
2335   PetscCall(MatSetSizes(*B, n, n, n, n));
2336   PetscCall(MatSetBlockSizesFromMats(*B, A, A));
2337   (*B)->factortype = ftype;
2338   PetscCall(MatSetType(*B, MATSEQAIJKOKKOS));
2339   PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL));
2340   PetscCheck(!(*B)->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr");
2341 
2342   if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
2343     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJKokkos;
2344     (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos;
2345     PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU]));
2346     PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILU]));
2347     PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILUDT]));
2348   } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
2349     (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJKokkos;
2350     (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJKokkos;
2351     PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]));
2352     PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ICC]));
2353   } else SETERRQ(comm, PETSC_ERR_SUP, "MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]);
2354 
2355   // The factorization can use the ordering provided in MatLUFactorSymbolic(), MatCholeskyFactorSymbolic() etc, though we do it on host
2356   (*B)->canuseordering = PETSC_TRUE;
2357   PetscCall(PetscObjectComposeFunction((PetscObject)*B, "MatFactorGetSolverType_C", MatFactorGetSolverType_SeqAIJKokkos_Kokkos));
2358   PetscFunctionReturn(PETSC_SUCCESS);
2359 }
2360 
2361 PETSC_INTERN PetscErrorCode MatSolverTypeRegister_Kokkos(void)
2362 {
2363   PetscFunctionBegin;
2364   PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_LU, MatGetFactor_SeqAIJKokkos_Kokkos));
2365   PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_CHOLESKY, MatGetFactor_SeqAIJKokkos_Kokkos));
2366   PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_ILU, MatGetFactor_SeqAIJKokkos_Kokkos));
2367   PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_ICC, MatGetFactor_SeqAIJKokkos_Kokkos));
2368   PetscFunctionReturn(PETSC_SUCCESS);
2369 }
2370 
2371 /* Utility to print out a KokkosCsrMatrix for debugging */
2372 PETSC_INTERN PetscErrorCode PrintCsrMatrix(const KokkosCsrMatrix &csrmat)
2373 {
2374   const auto        &iv = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), csrmat.graph.row_map);
2375   const auto        &jv = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), csrmat.graph.entries);
2376   const auto        &av = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), csrmat.values);
2377   const PetscInt    *i  = iv.data();
2378   const PetscInt    *j  = jv.data();
2379   const PetscScalar *a  = av.data();
2380   PetscInt           m = csrmat.numRows(), n = csrmat.numCols(), nnz = csrmat.nnz();
2381 
2382   PetscFunctionBegin;
2383   PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT " x %" PetscInt_FMT " SeqAIJKokkos, with %" PetscInt_FMT " nonzeros\n", m, n, nnz));
2384   for (PetscInt k = 0; k < m; k++) {
2385     PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT ": ", k));
2386     for (PetscInt p = i[k]; p < i[k + 1]; p++) PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT "(%.1f), ", j[p], (double)PetscRealPart(a[p])));
2387     PetscCall(PetscPrintf(PETSC_COMM_SELF, "\n"));
2388   }
2389   PetscFunctionReturn(PETSC_SUCCESS);
2390 }
2391