xref: /petsc/src/mat/impls/aij/seq/kokkos/aijkok.kokkos.cxx (revision 60ace51db09f3c7b46f7cbf9cebe599fd6de82d8)
1 #include <petsc_kokkos.hpp>
2 #include <petscvec_kokkos.hpp>
3 #include <petscmat_kokkos.hpp>
4 #include <petscpkg_version.h>
5 #include <petsc/private/petscimpl.h>
6 #include <petsc/private/sfimpl.h>
7 #include <petsc/private/kokkosimpl.hpp>
8 #include <petscsys.h>
9 
10 #include <Kokkos_Core.hpp>
11 #include <KokkosBlas.hpp>
12 #include <KokkosSparse_CrsMatrix.hpp>
13 
14 // To suppress compiler warnings:
15 // /path/include/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp:434:63:
16 // warning: 'cusparseStatus_t cusparseDbsrmm(cusparseHandle_t, cusparseDirection_t, cusparseOperation_t,
17 // cusparseOperation_t, int, int, int, int, const double*, cusparseMatDescr_t, const double*, const int*, const int*,
18 // int, const double*, int, const double*, double*, int)' is deprecated: please use cusparseSpMM instead [-Wdeprecated-declarations]
19 PETSC_PRAGMA_DIAGNOSTIC_IGNORED_BEGIN("-Wdeprecated-declarations")
20 #include <KokkosSparse_spmv.hpp>
21 PETSC_PRAGMA_DIAGNOSTIC_IGNORED_END()
22 
23 #include <KokkosSparse_spiluk.hpp>
24 #include <KokkosSparse_sptrsv.hpp>
25 #include <KokkosSparse_spgemm.hpp>
26 #include <KokkosSparse_spadd.hpp>
27 #include <KokkosBatched_LU_Decl.hpp>
28 #include <KokkosBatched_InverseLU_Decl.hpp>
29 
30 #include <../src/mat/impls/aij/seq/kokkos/aijkok.hpp>
31 
32 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_GE(3, 7, 0)
33   #include <KokkosSparse_Utils.hpp>
34 using KokkosSparse::sort_crs_matrix;
35 using KokkosSparse::Impl::transpose_matrix;
36 #else
37   #include <KokkosKernels_Sorting.hpp>
38 using KokkosKernels::sort_crs_matrix;
39 using KokkosKernels::Impl::transpose_matrix;
40 #endif
41 
42 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_GE(4, 6, 0)
43 using KokkosSparse::spiluk_symbolic;
44 using KokkosSparse::spiluk_numeric;
45 using KokkosSparse::sptrsv_symbolic;
46 using KokkosSparse::sptrsv_solve;
47 using KokkosSparse::Experimental::SPTRSVAlgorithm;
48 using KokkosSparse::Experimental::SPILUKAlgorithm;
49 #else
50 using KokkosSparse::Experimental::spiluk_symbolic;
51 using KokkosSparse::Experimental::spiluk_numeric;
52 using KokkosSparse::Experimental::sptrsv_symbolic;
53 using KokkosSparse::Experimental::sptrsv_solve;
54 using KokkosSparse::Experimental::SPTRSVAlgorithm;
55 using KokkosSparse::Experimental::SPILUKAlgorithm;
56 #endif
57 
58 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */
59 
60 /* MatAssemblyEnd_SeqAIJKokkos() happens when we finalized nonzeros of the matrix, either after
61    we assembled the matrix on host, or after we directly produced the matrix data on device (ex., through MatMatMult).
62    In the latter case, it is important to set a_dual's sync state correctly.
63  */
64 static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A, MatAssemblyType mode)
65 {
66   Mat_SeqAIJ       *aijseq;
67   Mat_SeqAIJKokkos *aijkok;
68 
69   PetscFunctionBegin;
70   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS);
71   PetscCall(MatAssemblyEnd_SeqAIJ(A, mode));
72 
73   aijseq = static_cast<Mat_SeqAIJ *>(A->data);
74   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
75 
76   /* If aijkok does not exist, we just copy i, j to device.
77      If aijkok already exists, but the device's nonzero pattern does not match with the host's, we assume the latest data is on host.
78      In both cases, we build a new aijkok structure.
79   */
80   if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { /* aijkok might not exist yet or nonzero pattern has changed */
81     if (aijkok && aijkok->host_aij_allocated_by_kokkos) {   /* Avoid accidently freeing much needed a,i,j on host when deleting aijkok */
82       PetscCall(PetscShmgetAllocateArray(aijkok->nrows() + 1, sizeof(PetscInt), (void **)&aijseq->i));
83       PetscCall(PetscShmgetAllocateArray(aijkok->nnz(), sizeof(PetscInt), (void **)&aijseq->j));
84       PetscCall(PetscShmgetAllocateArray(aijkok->nnz(), sizeof(PetscInt), (void **)&aijseq->a));
85       PetscCall(PetscArraycpy(aijseq->i, aijkok->i_host_data(), aijkok->nrows() + 1));
86       PetscCall(PetscArraycpy(aijseq->j, aijkok->j_host_data(), aijkok->nnz()));
87       PetscCall(PetscArraycpy(aijseq->a, aijkok->a_host_data(), aijkok->nnz()));
88       aijseq->free_a  = PETSC_TRUE;
89       aijseq->free_ij = PETSC_TRUE;
90       /* This arises from MatCreateSeqAIJKokkosWithKokkosCsrMatrix() used in MatMatMult, where
91          we have the CsrMatrix on device first and then copy to host, followed by
92          MatSetMPIAIJWithSplitSeqAIJ() with garray = NULL.
93          One could improve it by not using NULL garray.
94       */
95     }
96     delete aijkok;
97     aijkok   = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aijseq, A->nonzerostate, PETSC_FALSE /*don't copy mat values to device*/);
98     A->spptr = aijkok;
99   } else if (A->rmap->n && aijkok->diag_dual.extent(0) == 0) { // MatProduct might directly produce AIJ on device, but not the diag.
100     MatRowMapKokkosViewHost diag_h(aijseq->diag, A->rmap->n);
101     auto                    diag_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), diag_h);
102     aijkok->diag_dual              = MatRowMapKokkosDualView(diag_d, diag_h);
103   }
104   PetscFunctionReturn(PETSC_SUCCESS);
105 }
106 
107 /* Sync CSR data to device if not yet */
108 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A)
109 {
110   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
111 
112   PetscFunctionBegin;
113   PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Can't sync factorized matrix from host to device");
114   PetscCheck(aijkok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
115   if (aijkok->a_dual.need_sync_device()) {
116     PetscCall(KokkosDualViewSyncDevice(aijkok->a_dual, PetscGetKokkosExecutionSpace()));
117     aijkok->transpose_updated = PETSC_FALSE; /* values of the transpose is out-of-date */
118     aijkok->hermitian_updated = PETSC_FALSE;
119   }
120   PetscFunctionReturn(PETSC_SUCCESS);
121 }
122 
123 /* Mark the CSR data on device as modified */
124 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosModifyDevice(Mat A)
125 {
126   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
127 
128   PetscFunctionBegin;
129   PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Not supported for factorized matries");
130   aijkok->a_dual.clear_sync_state();
131   aijkok->a_dual.modify_device();
132   aijkok->transpose_updated = PETSC_FALSE;
133   aijkok->hermitian_updated = PETSC_FALSE;
134   PetscCall(MatSeqAIJInvalidateDiagonal(A));
135   PetscCall(PetscObjectStateIncrease((PetscObject)A));
136   PetscFunctionReturn(PETSC_SUCCESS);
137 }
138 
139 static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A)
140 {
141   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
142   auto              exec   = PetscGetKokkosExecutionSpace();
143 
144   PetscFunctionBegin;
145   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
146   /* We do not expect one needs factors on host  */
147   PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Can't sync factorized matrix from device to host");
148   PetscCheck(aijkok, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing AIJKOK");
149   PetscCall(KokkosDualViewSyncHost(aijkok->a_dual, exec));
150   PetscFunctionReturn(PETSC_SUCCESS);
151 }
152 
153 static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A, PetscScalar *array[])
154 {
155   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
156 
157   PetscFunctionBegin;
158   /* aijkok contains valid pointers only if the host's nonzerostate matches with the device's.
159     Calling MatSeqAIJSetPreallocation() or MatSetValues() on host, where aijseq->{i,j,a} might be
160     reallocated, will lead to stale {i,j,a}_dual in aijkok. In both operations, the hosts's nonzerostate
161     must have been updated. The stale aijkok will be rebuilt during MatAssemblyEnd.
162   */
163   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
164     PetscCall(KokkosDualViewSyncHost(aijkok->a_dual, PetscGetKokkosExecutionSpace()));
165     *array = aijkok->a_dual.view_host().data();
166   } else { /* Happens when calling MatSetValues on a newly created matrix */
167     *array = static_cast<Mat_SeqAIJ *>(A->data)->a;
168   }
169   PetscFunctionReturn(PETSC_SUCCESS);
170 }
171 
172 static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJKokkos(Mat A, PetscScalar *array[])
173 {
174   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
175 
176   PetscFunctionBegin;
177   if (aijkok && A->nonzerostate == aijkok->nonzerostate) aijkok->a_dual.modify_host();
178   PetscFunctionReturn(PETSC_SUCCESS);
179 }
180 
181 static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[])
182 {
183   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
184 
185   PetscFunctionBegin;
186   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
187     PetscCall(KokkosDualViewSyncHost(aijkok->a_dual, PetscGetKokkosExecutionSpace()));
188     *array = aijkok->a_dual.view_host().data();
189   } else {
190     *array = static_cast<Mat_SeqAIJ *>(A->data)->a;
191   }
192   PetscFunctionReturn(PETSC_SUCCESS);
193 }
194 
195 static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[])
196 {
197   PetscFunctionBegin;
198   *array = NULL;
199   PetscFunctionReturn(PETSC_SUCCESS);
200 }
201 
202 static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[])
203 {
204   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
205 
206   PetscFunctionBegin;
207   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
208     *array = aijkok->a_dual.view_host().data();
209   } else { /* Ex. happens with MatZeroEntries on a preallocated but not assembled matrix */
210     *array = static_cast<Mat_SeqAIJ *>(A->data)->a;
211   }
212   PetscFunctionReturn(PETSC_SUCCESS);
213 }
214 
215 static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[])
216 {
217   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
218 
219   PetscFunctionBegin;
220   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
221     aijkok->a_dual.clear_sync_state();
222     aijkok->a_dual.modify_host();
223   }
224   PetscFunctionReturn(PETSC_SUCCESS);
225 }
226 
227 static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJKokkos(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype)
228 {
229   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
230 
231   PetscFunctionBegin;
232   PetscCheck(aijkok != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "aijkok is NULL");
233 
234   if (i) *i = aijkok->i_device_data();
235   if (j) *j = aijkok->j_device_data();
236   if (a) {
237     PetscCall(KokkosDualViewSyncDevice(aijkok->a_dual, PetscGetKokkosExecutionSpace()));
238     *a = aijkok->a_device_data();
239   }
240   if (mtype) *mtype = PETSC_MEMTYPE_KOKKOS;
241   PetscFunctionReturn(PETSC_SUCCESS);
242 }
243 
244 static PetscErrorCode MatGetCurrentMemType_SeqAIJKokkos(PETSC_UNUSED Mat A, PetscMemType *mtype)
245 {
246   PetscFunctionBegin;
247   *mtype = PETSC_MEMTYPE_KOKKOS;
248   PetscFunctionReturn(PETSC_SUCCESS);
249 }
250 
251 /*
252   Generate the sparsity pattern of a MatSeqAIJKokkos matrix's transpose on device.
253 
254   Input Parameter:
255 .  A       - the MATSEQAIJKOKKOS matrix
256 
257   Output Parameters:
258 +  perm_d - the permutation array on device, which connects Ta(i) = Aa(perm(i))
259 -  T_d    - the transpose on device, whose value array is allocated but not initialized
260 */
261 static PetscErrorCode MatSeqAIJKokkosGenerateTransposeStructure(Mat A, MatRowMapKokkosView &perm_d, KokkosCsrMatrix &T_d)
262 {
263   Mat_SeqAIJ             *aseq = static_cast<Mat_SeqAIJ *>(A->data);
264   PetscInt                nz = aseq->nz, m = A->rmap->N, n = A->cmap->n;
265   const PetscInt         *Ai = aseq->i, *Aj = aseq->j;
266   MatRowMapKokkosViewHost Ti_h(NoInit("Ti"), n + 1);
267   MatRowMapType          *Ti = Ti_h.data();
268   MatColIdxKokkosViewHost Tj_h(NoInit("Tj"), nz);
269   MatRowMapKokkosViewHost perm_h(NoInit("permutation"), nz);
270   PetscInt               *Tj   = Tj_h.data();
271   PetscInt               *perm = perm_h.data();
272   PetscInt               *offset;
273 
274   PetscFunctionBegin;
275   // Populate Ti
276   PetscCallCXX(Kokkos::deep_copy(Ti_h, 0));
277   Ti++;
278   for (PetscInt i = 0; i < nz; i++) Ti[Aj[i]]++;
279   Ti--;
280   for (PetscInt i = 0; i < n; i++) Ti[i + 1] += Ti[i];
281 
282   // Populate Tj and the permutation array
283   PetscCall(PetscCalloc1(n, &offset)); // offset in each T row to fill in its column indices
284   for (PetscInt i = 0; i < m; i++) {
285     for (PetscInt j = Ai[i]; j < Ai[i + 1]; j++) { // A's (i,j) is T's (j,i)
286       PetscInt r    = Aj[j];                       // row r of T
287       PetscInt disp = Ti[r] + offset[r];
288 
289       Tj[disp]   = i; // col i of T
290       perm[disp] = j;
291       offset[r]++;
292     }
293   }
294   PetscCall(PetscFree(offset));
295 
296   // Sort each row of T, along with the permutation array
297   for (PetscInt i = 0; i < n; i++) PetscCall(PetscSortIntWithArray(Ti[i + 1] - Ti[i], Tj + Ti[i], perm + Ti[i]));
298 
299   // Output perm and T on device
300   auto Ti_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), Ti_h);
301   auto Tj_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), Tj_h);
302   PetscCallCXX(T_d = KokkosCsrMatrix("csrmatT", n, m, nz, MatScalarKokkosView("Ta", nz), Ti_d, Tj_d));
303   PetscCallCXX(perm_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), perm_h));
304   PetscFunctionReturn(PETSC_SUCCESS);
305 }
306 
307 // Generate the transpose on device and cache it internally
308 // Note: KK transpose_matrix() does not have support symbolic/numeric transpose, so we do it on our own
309 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGenerateTranspose_Private(Mat A, KokkosCsrMatrix *csrmatT)
310 {
311   Mat_SeqAIJ       *aseq = static_cast<Mat_SeqAIJ *>(A->data);
312   Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
313   PetscInt          nz = aseq->nz, m = A->rmap->N, n = A->cmap->n;
314   KokkosCsrMatrix  &T = akok->csrmatT;
315 
316   PetscFunctionBegin;
317   PetscCheck(akok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
318   PetscCall(KokkosDualViewSyncDevice(akok->a_dual, PetscGetKokkosExecutionSpace())); // Sync A's values since we are going to access them on device
319 
320   const auto &Aa = akok->a_dual.view_device();
321 
322   if (A->symmetric == PETSC_BOOL3_TRUE) {
323     *csrmatT = akok->csrmat;
324   } else {
325     // See if we already have a cached transpose and its value is up to date
326     if (T.numRows() == n && T.numCols() == m) {  // this indicates csrmatT had been generated before, otherwise T has 0 rows/cols after construction
327       if (!akok->transpose_updated) {            // if the value is out of date, update the cached version
328         const auto &perm = akok->transpose_perm; // get the permutation array
329         auto       &Ta   = T.values;
330 
331         PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { Ta(i) = Aa(perm(i)); }));
332       }
333     } else { // Generate T of size n x m for the first time
334       MatRowMapKokkosView perm;
335 
336       PetscCall(MatSeqAIJKokkosGenerateTransposeStructure(A, perm, T));
337       akok->transpose_perm = perm; // cache the perm in this matrix for reuse
338       PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { T.values(i) = Aa(perm(i)); }));
339     }
340     akok->transpose_updated = PETSC_TRUE;
341     *csrmatT                = akok->csrmatT;
342   }
343   PetscFunctionReturn(PETSC_SUCCESS);
344 }
345 
346 // Generate the Hermitian on device and cache it internally
347 static PetscErrorCode MatSeqAIJKokkosGenerateHermitian_Private(Mat A, KokkosCsrMatrix *csrmatH)
348 {
349   Mat_SeqAIJ       *aseq = static_cast<Mat_SeqAIJ *>(A->data);
350   Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
351   PetscInt          nz = aseq->nz, m = A->rmap->N, n = A->cmap->n;
352   KokkosCsrMatrix  &T = akok->csrmatH;
353 
354   PetscFunctionBegin;
355   PetscCheck(akok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
356   PetscCall(KokkosDualViewSyncDevice(akok->a_dual, PetscGetKokkosExecutionSpace())); // Sync A's values since we are going to access them on device
357 
358   const auto &Aa = akok->a_dual.view_device();
359 
360   if (A->hermitian == PETSC_BOOL3_TRUE) {
361     *csrmatH = akok->csrmat;
362   } else {
363     // See if we already have a cached hermitian and its value is up to date
364     if (T.numRows() == n && T.numCols() == m) {  // this indicates csrmatT had been generated before, otherwise T has 0 rows/cols after construction
365       if (!akok->hermitian_updated) {            // if the value is out of date, update the cached version
366         const auto &perm = akok->transpose_perm; // get the permutation array
367         auto       &Ta   = T.values;
368 
369         PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { Ta(i) = PetscConj(Aa(perm(i))); }));
370       }
371     } else { // Generate T of size n x m for the first time
372       MatRowMapKokkosView perm;
373 
374       PetscCall(MatSeqAIJKokkosGenerateTransposeStructure(A, perm, T));
375       akok->transpose_perm = perm; // cache the perm in this matrix for reuse
376       PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { T.values(i) = PetscConj(Aa(perm(i))); }));
377     }
378     akok->hermitian_updated = PETSC_TRUE;
379     *csrmatH                = akok->csrmatH;
380   }
381   PetscFunctionReturn(PETSC_SUCCESS);
382 }
383 
384 /* y = A x */
385 static PetscErrorCode MatMult_SeqAIJKokkos(Mat A, Vec xx, Vec yy)
386 {
387   Mat_SeqAIJKokkos          *aijkok;
388   ConstPetscScalarKokkosView xv;
389   PetscScalarKokkosView      yv;
390 
391   PetscFunctionBegin;
392   PetscCall(PetscLogGpuTimeBegin());
393   PetscCall(MatSeqAIJKokkosSyncDevice(A));
394   PetscCall(VecGetKokkosView(xx, &xv));
395   PetscCall(VecGetKokkosViewWrite(yy, &yv));
396   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
397   PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), "N", 1.0 /*alpha*/, aijkok->csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A x + beta y */
398   PetscCall(VecRestoreKokkosView(xx, &xv));
399   PetscCall(VecRestoreKokkosViewWrite(yy, &yv));
400   /* 2.0*nnz - numRows seems more accurate here but assumes there are no zero-rows. So a little sloppy here. */
401   PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz()));
402   PetscCall(PetscLogGpuTimeEnd());
403   PetscFunctionReturn(PETSC_SUCCESS);
404 }
405 
406 /* y = A^T x */
407 static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy)
408 {
409   Mat_SeqAIJKokkos          *aijkok;
410   const char                *mode;
411   ConstPetscScalarKokkosView xv;
412   PetscScalarKokkosView      yv;
413   KokkosCsrMatrix            csrmat;
414 
415   PetscFunctionBegin;
416   PetscCall(PetscLogGpuTimeBegin());
417   PetscCall(MatSeqAIJKokkosSyncDevice(A));
418   PetscCall(VecGetKokkosView(xx, &xv));
419   PetscCall(VecGetKokkosViewWrite(yy, &yv));
420   if (A->form_explicit_transpose) {
421     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat));
422     mode = "N";
423   } else {
424     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
425     csrmat = aijkok->csrmat;
426     mode   = "T";
427   }
428   PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^T x + beta y */
429   PetscCall(VecRestoreKokkosView(xx, &xv));
430   PetscCall(VecRestoreKokkosViewWrite(yy, &yv));
431   PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
432   PetscCall(PetscLogGpuTimeEnd());
433   PetscFunctionReturn(PETSC_SUCCESS);
434 }
435 
436 /* y = A^H x */
437 static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy)
438 {
439   Mat_SeqAIJKokkos          *aijkok;
440   const char                *mode;
441   ConstPetscScalarKokkosView xv;
442   PetscScalarKokkosView      yv;
443   KokkosCsrMatrix            csrmat;
444 
445   PetscFunctionBegin;
446   PetscCall(PetscLogGpuTimeBegin());
447   PetscCall(MatSeqAIJKokkosSyncDevice(A));
448   PetscCall(VecGetKokkosView(xx, &xv));
449   PetscCall(VecGetKokkosViewWrite(yy, &yv));
450   if (A->form_explicit_transpose) {
451     PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat));
452     mode = "N";
453   } else {
454     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
455     csrmat = aijkok->csrmat;
456     mode   = "C";
457   }
458   PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^H x + beta y */
459   PetscCall(VecRestoreKokkosView(xx, &xv));
460   PetscCall(VecRestoreKokkosViewWrite(yy, &yv));
461   PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
462   PetscCall(PetscLogGpuTimeEnd());
463   PetscFunctionReturn(PETSC_SUCCESS);
464 }
465 
466 /* z = A x + y */
467 static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz)
468 {
469   Mat_SeqAIJKokkos          *aijkok;
470   ConstPetscScalarKokkosView xv;
471   PetscScalarKokkosView      zv;
472 
473   PetscFunctionBegin;
474   PetscCall(PetscLogGpuTimeBegin());
475   PetscCall(MatSeqAIJKokkosSyncDevice(A));
476   if (zz != yy) PetscCall(VecCopy(yy, zz)); // depending on yy's sync flags, zz might get its latest data on host
477   PetscCall(VecGetKokkosView(xx, &xv));
478   PetscCall(VecGetKokkosView(zz, &zv)); // do after VecCopy(yy, zz) to get the latest data on device
479   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
480   PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), "N", 1.0 /*alpha*/, aijkok->csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A x + beta z */
481   PetscCall(VecRestoreKokkosView(xx, &xv));
482   PetscCall(VecRestoreKokkosView(zz, &zv));
483   PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz()));
484   PetscCall(PetscLogGpuTimeEnd());
485   PetscFunctionReturn(PETSC_SUCCESS);
486 }
487 
488 /* z = A^T x + y */
489 static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz)
490 {
491   Mat_SeqAIJKokkos          *aijkok;
492   const char                *mode;
493   ConstPetscScalarKokkosView xv;
494   PetscScalarKokkosView      zv;
495   KokkosCsrMatrix            csrmat;
496 
497   PetscFunctionBegin;
498   PetscCall(PetscLogGpuTimeBegin());
499   PetscCall(MatSeqAIJKokkosSyncDevice(A));
500   if (zz != yy) PetscCall(VecCopy(yy, zz));
501   PetscCall(VecGetKokkosView(xx, &xv));
502   PetscCall(VecGetKokkosView(zz, &zv));
503   if (A->form_explicit_transpose) {
504     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat));
505     mode = "N";
506   } else {
507     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
508     csrmat = aijkok->csrmat;
509     mode   = "T";
510   }
511   PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^T x + beta z */
512   PetscCall(VecRestoreKokkosView(xx, &xv));
513   PetscCall(VecRestoreKokkosView(zz, &zv));
514   PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
515   PetscCall(PetscLogGpuTimeEnd());
516   PetscFunctionReturn(PETSC_SUCCESS);
517 }
518 
519 /* z = A^H x + y */
520 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz)
521 {
522   Mat_SeqAIJKokkos          *aijkok;
523   const char                *mode;
524   ConstPetscScalarKokkosView xv;
525   PetscScalarKokkosView      zv;
526   KokkosCsrMatrix            csrmat;
527 
528   PetscFunctionBegin;
529   PetscCall(PetscLogGpuTimeBegin());
530   PetscCall(MatSeqAIJKokkosSyncDevice(A));
531   if (zz != yy) PetscCall(VecCopy(yy, zz));
532   PetscCall(VecGetKokkosView(xx, &xv));
533   PetscCall(VecGetKokkosView(zz, &zv));
534   if (A->form_explicit_transpose) {
535     PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat));
536     mode = "N";
537   } else {
538     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
539     csrmat = aijkok->csrmat;
540     mode   = "C";
541   }
542   PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^H x + beta z */
543   PetscCall(VecRestoreKokkosView(xx, &xv));
544   PetscCall(VecRestoreKokkosView(zz, &zv));
545   PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
546   PetscCall(PetscLogGpuTimeEnd());
547   PetscFunctionReturn(PETSC_SUCCESS);
548 }
549 
550 static PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A, MatOption op, PetscBool flg)
551 {
552   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
553 
554   PetscFunctionBegin;
555   switch (op) {
556   case MAT_FORM_EXPLICIT_TRANSPOSE:
557     /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
558     if (A->form_explicit_transpose && !flg && aijkok) PetscCall(aijkok->DestroyMatTranspose());
559     A->form_explicit_transpose = flg;
560     break;
561   default:
562     PetscCall(MatSetOption_SeqAIJ(A, op, flg));
563     break;
564   }
565   PetscFunctionReturn(PETSC_SUCCESS);
566 }
567 
568 /* Depending on reuse, either build a new mat, or use the existing mat */
569 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat *newmat)
570 {
571   Mat_SeqAIJ *aseq;
572 
573   PetscFunctionBegin;
574   PetscCall(PetscKokkosInitializeCheck());
575   if (reuse == MAT_INITIAL_MATRIX) { /* Build a brand new mat */
576     PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat));
577     PetscCall(MatSetType(*newmat, mtype));
578   } else if (reuse == MAT_REUSE_MATRIX) {                 /* Reuse the mat created before */
579     PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN)); /* newmat is already a SeqAIJKokkos */
580   } else if (reuse == MAT_INPLACE_MATRIX) {               /* newmat is A */
581     PetscCheck(A == *newmat, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "A != *newmat with MAT_INPLACE_MATRIX");
582     PetscCall(PetscFree(A->defaultvectype));
583     PetscCall(PetscStrallocpy(VECKOKKOS, &A->defaultvectype)); /* Allocate and copy the string */
584     PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATSEQAIJKOKKOS));
585     PetscCall(MatSetOps_SeqAIJKokkos(A));
586     aseq = static_cast<Mat_SeqAIJ *>(A->data);
587     if (A->assembled) { /* Copy i, j (but not values) to device for an assembled matrix if not yet */
588       PetscCheck(!A->spptr, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Expect NULL (Mat_SeqAIJKokkos*)A->spptr");
589       A->spptr = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aseq, A->nonzerostate, PETSC_FALSE);
590     }
591   }
592   PetscFunctionReturn(PETSC_SUCCESS);
593 }
594 
595 /* MatDuplicate always creates a new matrix. MatDuplicate can be called either on an assembled matrix or
596    an unassembled matrix, even though MAT_COPY_VALUES is not allowed for unassembled matrix.
597  */
598 static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A, MatDuplicateOption dupOption, Mat *B)
599 {
600   Mat_SeqAIJ       *bseq;
601   Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr), *bkok;
602   Mat               mat;
603 
604   PetscFunctionBegin;
605   /* Do not copy values on host as A's latest values might be on device. We don't want to do sync blindly */
606   PetscCall(MatDuplicate_SeqAIJ(A, MAT_DO_NOT_COPY_VALUES, B));
607   mat = *B;
608   if (A->assembled) {
609     bseq = static_cast<Mat_SeqAIJ *>(mat->data);
610     bkok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, bseq, mat->nonzerostate, PETSC_FALSE);
611     bkok->a_dual.clear_sync_state(); /* Clear B's sync state as it will be decided below */
612     /* Now copy values to B if needed */
613     if (dupOption == MAT_COPY_VALUES) {
614       if (akok->a_dual.need_sync_device()) {
615         Kokkos::deep_copy(bkok->a_dual.view_host(), akok->a_dual.view_host());
616         bkok->a_dual.modify_host();
617       } else { /* If device has the latest data, we only copy data on device */
618         Kokkos::deep_copy(bkok->a_dual.view_device(), akok->a_dual.view_device());
619         bkok->a_dual.modify_device();
620       }
621     } else { /* MAT_DO_NOT_COPY_VALUES or MAT_SHARE_NONZERO_PATTERN. B's values should be zeroed */
622       /* B's values on host should be already zeroed by MatDuplicate_SeqAIJ() */
623       bkok->a_dual.modify_host();
624     }
625     mat->spptr = bkok;
626   }
627 
628   PetscCall(PetscFree(mat->defaultvectype));
629   PetscCall(PetscStrallocpy(VECKOKKOS, &mat->defaultvectype)); /* Allocate and copy the string */
630   PetscCall(PetscObjectChangeTypeName((PetscObject)mat, MATSEQAIJKOKKOS));
631   PetscCall(MatSetOps_SeqAIJKokkos(mat));
632   PetscFunctionReturn(PETSC_SUCCESS);
633 }
634 
635 static PetscErrorCode MatTranspose_SeqAIJKokkos(Mat A, MatReuse reuse, Mat *B)
636 {
637   Mat               At;
638   KokkosCsrMatrix   internT;
639   Mat_SeqAIJKokkos *atkok, *bkok;
640 
641   PetscFunctionBegin;
642   if (reuse == MAT_REUSE_MATRIX) PetscCall(MatTransposeCheckNonzeroState_Private(A, *B));
643   PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &internT)); /* Generate a transpose internally */
644   if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_INPLACE_MATRIX) {
645     /* Deep copy internT, as we want to isolate the internal transpose */
646     PetscCallCXX(atkok = new Mat_SeqAIJKokkos(KokkosCsrMatrix("csrmat", internT)));
647     PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PetscObjectComm((PetscObject)A), atkok, &At));
648     if (reuse == MAT_INITIAL_MATRIX) *B = At;
649     else PetscCall(MatHeaderReplace(A, &At)); /* Replace A with At inplace */
650   } else {                                    /* MAT_REUSE_MATRIX, just need to copy values to B on device */
651     if ((*B)->assembled) {
652       bkok = static_cast<Mat_SeqAIJKokkos *>((*B)->spptr);
653       PetscCallCXX(Kokkos::deep_copy(bkok->a_dual.view_device(), internT.values));
654       PetscCall(MatSeqAIJKokkosModifyDevice(*B));
655     } else if ((*B)->preallocated) { /* It is ok for B to be only preallocated, as needed in MatTranspose_MPIAIJ */
656       Mat_SeqAIJ             *bseq = static_cast<Mat_SeqAIJ *>((*B)->data);
657       MatScalarKokkosViewHost a_h(bseq->a, internT.nnz()); /* bseq->nz = 0 if unassembled */
658       MatColIdxKokkosViewHost j_h(bseq->j, internT.nnz());
659       PetscCallCXX(Kokkos::deep_copy(a_h, internT.values));
660       PetscCallCXX(Kokkos::deep_copy(j_h, internT.graph.entries));
661     } else SETERRQ(PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "B must be assembled or preallocated");
662   }
663   PetscFunctionReturn(PETSC_SUCCESS);
664 }
665 
666 static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A)
667 {
668   Mat_SeqAIJKokkos *aijkok;
669 
670   PetscFunctionBegin;
671   if (A->factortype == MAT_FACTOR_NONE) {
672     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
673     delete aijkok;
674   } else {
675     delete static_cast<Mat_SeqAIJKokkosTriFactors *>(A->spptr);
676   }
677   A->spptr = NULL;
678   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatFactorGetSolverType_C", NULL));
679   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
680   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
681 #if defined(PETSC_HAVE_HYPRE)
682   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijkokkos_hypre_C", NULL));
683 #endif
684   PetscCall(MatDestroy_SeqAIJ(A));
685   PetscFunctionReturn(PETSC_SUCCESS);
686 }
687 
688 /*MC
689    MATSEQAIJKOKKOS - MATAIJKOKKOS = "(seq)aijkokkos" - A matrix type to be used for sparse matrices with Kokkos
690 
691    A matrix type using Kokkos-Kernels CrsMatrix type for portability across different device types
692 
693    Options Database Key:
694 .  -mat_type aijkokkos - sets the matrix type to `MATSEQAIJKOKKOS` during a call to `MatSetFromOptions()`
695 
696   Level: beginner
697 
698 .seealso: [](ch_matrices), `Mat`, `MatCreateSeqAIJKokkos()`, `MATMPIAIJKOKKOS`
699 M*/
700 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A)
701 {
702   PetscFunctionBegin;
703   PetscCall(PetscKokkosInitializeCheck());
704   PetscCall(MatCreate_SeqAIJ(A));
705   PetscCall(MatConvert_SeqAIJ_SeqAIJKokkos(A, MATSEQAIJKOKKOS, MAT_INPLACE_MATRIX, &A));
706   PetscFunctionReturn(PETSC_SUCCESS);
707 }
708 
709 /* Merge A, B into a matrix C. A is put before B. C's size would be A->rmap->n by (A->cmap->n + B->cmap->n) */
710 PetscErrorCode MatSeqAIJKokkosMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C)
711 {
712   Mat_SeqAIJ         *a, *b;
713   Mat_SeqAIJKokkos   *akok, *bkok, *ckok;
714   MatScalarKokkosView aa, ba, ca;
715   MatRowMapKokkosView ai, bi, ci;
716   MatColIdxKokkosView aj, bj, cj;
717   PetscInt            m, n, nnz, aN;
718 
719   PetscFunctionBegin;
720   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
721   PetscValidHeaderSpecific(B, MAT_CLASSID, 2);
722   PetscAssertPointer(C, 4);
723   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
724   PetscCheckTypeName(B, MATSEQAIJKOKKOS);
725   PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n);
726   PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported");
727 
728   PetscCall(MatSeqAIJKokkosSyncDevice(A));
729   PetscCall(MatSeqAIJKokkosSyncDevice(B));
730   a    = static_cast<Mat_SeqAIJ *>(A->data);
731   b    = static_cast<Mat_SeqAIJ *>(B->data);
732   akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
733   bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
734   aa   = akok->a_dual.view_device();
735   ai   = akok->i_dual.view_device();
736   ba   = bkok->a_dual.view_device();
737   bi   = bkok->i_dual.view_device();
738   m    = A->rmap->n; /* M, N and nnz of C */
739   n    = A->cmap->n + B->cmap->n;
740   nnz  = a->nz + b->nz;
741   aN   = A->cmap->n; /* N of A */
742   if (reuse == MAT_INITIAL_MATRIX) {
743     aj           = akok->j_dual.view_device();
744     bj           = bkok->j_dual.view_device();
745     auto ca_dual = MatScalarKokkosDualView("a", aa.extent(0) + ba.extent(0));
746     auto ci_dual = MatRowMapKokkosDualView("i", ai.extent(0));
747     auto cj_dual = MatColIdxKokkosDualView("j", aj.extent(0) + bj.extent(0));
748     ca           = ca_dual.view_device();
749     ci           = ci_dual.view_device();
750     cj           = cj_dual.view_device();
751 
752     /* Concatenate A and B in parallel using Kokkos hierarchical parallelism */
753     Kokkos::parallel_for(
754       Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
755         PetscInt i       = t.league_rank(); /* row i */
756         PetscInt coffset = ai(i) + bi(i), alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i);
757 
758         Kokkos::single(Kokkos::PerTeam(t), [=]() { /* this side effect only happens once per whole team */
759                                                    ci(i) = coffset;
760                                                    if (i == m - 1) ci(m) = ai(m) + bi(m);
761         });
762 
763         Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) {
764           if (k < alen) {
765             ca(coffset + k) = aa(ai(i) + k);
766             cj(coffset + k) = aj(ai(i) + k);
767           } else {
768             ca(coffset + k) = ba(bi(i) + k - alen);
769             cj(coffset + k) = bj(bi(i) + k - alen) + aN; /* Entries in B get new column indices in C */
770           }
771         });
772       });
773     ca_dual.modify_device();
774     ci_dual.modify_device();
775     cj_dual.modify_device();
776     PetscCallCXX(ckok = new Mat_SeqAIJKokkos(m, n, nnz, ci_dual, cj_dual, ca_dual));
777     PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, ckok, C));
778   } else if (reuse == MAT_REUSE_MATRIX) {
779     PetscValidHeaderSpecific(*C, MAT_CLASSID, 4);
780     PetscCheckTypeName(*C, MATSEQAIJKOKKOS);
781     ckok = static_cast<Mat_SeqAIJKokkos *>((*C)->spptr);
782     ca   = ckok->a_dual.view_device();
783     ci   = ckok->i_dual.view_device();
784 
785     Kokkos::parallel_for(
786       Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
787         PetscInt i    = t.league_rank(); /* row i */
788         PetscInt alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i);
789         Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) {
790           if (k < alen) ca(ci(i) + k) = aa(ai(i) + k);
791           else ca(ci(i) + k) = ba(bi(i) + k - alen);
792         });
793       });
794     PetscCall(MatSeqAIJKokkosModifyDevice(*C));
795   }
796   PetscFunctionReturn(PETSC_SUCCESS);
797 }
798 
799 static PetscErrorCode MatProductDataDestroy_SeqAIJKokkos(void *pdata)
800 {
801   PetscFunctionBegin;
802   delete static_cast<MatProductData_SeqAIJKokkos *>(pdata);
803   PetscFunctionReturn(PETSC_SUCCESS);
804 }
805 
806 static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C)
807 {
808   Mat_Product                 *product = C->product;
809   Mat                          A, B;
810   bool                         transA, transB; /* use bool, since KK needs this type */
811   Mat_SeqAIJKokkos            *akok, *bkok, *ckok;
812   Mat_SeqAIJ                  *c;
813   MatProductData_SeqAIJKokkos *pdata;
814   KokkosCsrMatrix              csrmatA, csrmatB;
815 
816   PetscFunctionBegin;
817   MatCheckProduct(C, 1);
818   PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Product data empty");
819   pdata = static_cast<MatProductData_SeqAIJKokkos *>(C->product->data);
820 
821   // See if numeric has already been done in symbolic (e.g., user calls MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C)).
822   // If yes, skip the numeric, but reset the flag so that next time when user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C),
823   // we still do numeric.
824   if (pdata->reusesym) { // numeric reuses results from symbolic
825     pdata->reusesym = PETSC_FALSE;
826     PetscFunctionReturn(PETSC_SUCCESS);
827   }
828 
829   switch (product->type) {
830   case MATPRODUCT_AB:
831     transA = false;
832     transB = false;
833     break;
834   case MATPRODUCT_AtB:
835     transA = true;
836     transB = false;
837     break;
838   case MATPRODUCT_ABt:
839     transA = false;
840     transB = true;
841     break;
842   default:
843     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]);
844   }
845 
846   A = product->A;
847   B = product->B;
848   PetscCall(MatSeqAIJKokkosSyncDevice(A));
849   PetscCall(MatSeqAIJKokkosSyncDevice(B));
850   akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
851   bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
852   ckok = static_cast<Mat_SeqAIJKokkos *>(C->spptr);
853 
854   PetscCheck(ckok, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Device data structure spptr is empty");
855 
856   csrmatA = akok->csrmat;
857   csrmatB = bkok->csrmat;
858 
859   /* TODO: Once KK spgemm implements transpose, we can get rid of the explicit transpose here */
860   if (transA) {
861     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA));
862     transA = false;
863   }
864 
865   if (transB) {
866     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB));
867     transB = false;
868   }
869   PetscCall(PetscLogGpuTimeBegin());
870   PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, csrmatA, transA, csrmatB, transB, ckok->csrmat));
871 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(4, 0, 0)
872   auto spgemmHandle = pdata->kh.get_spgemm_handle();
873   if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(ckok->csrmat)); /* without sort, mat_tests-ex62_14_seqaijkokkos fails */
874 #endif
875 
876   PetscCall(PetscLogGpuTimeEnd());
877   PetscCall(MatSeqAIJKokkosModifyDevice(C));
878   /* shorter version of MatAssemblyEnd_SeqAIJ */
879   c = (Mat_SeqAIJ *)C->data;
880   PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz));
881   PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n"));
882   PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax));
883   c->reallocs         = 0;
884   C->info.mallocs     = 0;
885   C->info.nz_unneeded = 0;
886   C->assembled = C->was_assembled = PETSC_TRUE;
887   C->num_ass++;
888   PetscFunctionReturn(PETSC_SUCCESS);
889 }
890 
891 static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C)
892 {
893   Mat_Product                 *product = C->product;
894   MatProductType               ptype;
895   Mat                          A, B;
896   bool                         transA, transB;
897   Mat_SeqAIJKokkos            *akok, *bkok, *ckok;
898   MatProductData_SeqAIJKokkos *pdata;
899   MPI_Comm                     comm;
900   KokkosCsrMatrix              csrmatA, csrmatB, csrmatC;
901 
902   PetscFunctionBegin;
903   MatCheckProduct(C, 1);
904   PetscCall(PetscObjectGetComm((PetscObject)C, &comm));
905   PetscCheck(!product->data, comm, PETSC_ERR_PLIB, "Product data not empty");
906   A = product->A;
907   B = product->B;
908   PetscCall(MatSeqAIJKokkosSyncDevice(A));
909   PetscCall(MatSeqAIJKokkosSyncDevice(B));
910   akok    = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
911   bkok    = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
912   csrmatA = akok->csrmat;
913   csrmatB = bkok->csrmat;
914 
915   ptype = product->type;
916   // Take advantage of the symmetry if true
917   if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
918     ptype                                          = MATPRODUCT_AB;
919     product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE;
920   }
921   if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
922     ptype                                          = MATPRODUCT_AB;
923     product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE;
924   }
925 
926   switch (ptype) {
927   case MATPRODUCT_AB:
928     transA = false;
929     transB = false;
930     PetscCall(MatSetBlockSizesFromMats(C, A, B));
931     break;
932   case MATPRODUCT_AtB:
933     transA = true;
934     transB = false;
935     if (A->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, A->cmap->bs));
936     if (B->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->cmap->bs));
937     break;
938   case MATPRODUCT_ABt:
939     transA = false;
940     transB = true;
941     if (A->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, A->rmap->bs));
942     if (B->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->rmap->bs));
943     break;
944   default:
945     SETERRQ(comm, PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]);
946   }
947   PetscCallCXX(product->data = pdata = new MatProductData_SeqAIJKokkos());
948   pdata->reusesym = product->api_user;
949 
950   /* TODO: add command line options to select spgemm algorithms */
951   auto spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_DEFAULT; /* default alg is TPL if enabled, otherwise KK */
952 
953   /* CUDA-10.2's spgemm has bugs. We prefer the SpGEMMreuse APIs introduced in cuda-11.4 */
954 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
955   #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
956   spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_KK;
957   #endif
958 #endif
959   PetscCallCXX(pdata->kh.create_spgemm_handle(spgemm_alg));
960 
961   PetscCall(PetscLogGpuTimeBegin());
962   /* TODO: Get rid of the explicit transpose once KK-spgemm implements the transpose option */
963   if (transA) {
964     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA));
965     transA = false;
966   }
967 
968   if (transB) {
969     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB));
970     transB = false;
971   }
972 
973   PetscCallCXX(KokkosSparse::spgemm_symbolic(pdata->kh, csrmatA, transA, csrmatB, transB, csrmatC));
974   /* spgemm_symbolic() only populates C's rowmap, but not C's column indices.
975     So we have to do a fake spgemm_numeric() here to get csrmatC.j_d setup, before
976     calling new Mat_SeqAIJKokkos().
977     TODO: Remove the fake spgemm_numeric() after KK fixed this problem.
978   */
979   PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, csrmatA, transA, csrmatB, transB, csrmatC));
980 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(4, 0, 0)
981   /* Query if KK outputs a sorted matrix. If not, we need to sort it */
982   auto spgemmHandle = pdata->kh.get_spgemm_handle();
983   if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(csrmatC)); /* sort_option defaults to -1 in KK!*/
984 #endif
985   PetscCall(PetscLogGpuTimeEnd());
986 
987   PetscCallCXX(ckok = new Mat_SeqAIJKokkos(csrmatC));
988   PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(C, ckok));
989   C->product->destroy = MatProductDataDestroy_SeqAIJKokkos;
990   PetscFunctionReturn(PETSC_SUCCESS);
991 }
992 
993 /* handles sparse matrix matrix ops */
994 static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat)
995 {
996   Mat_Product *product = mat->product;
997   PetscBool    Biskok = PETSC_FALSE, Ciskok = PETSC_TRUE;
998 
999   PetscFunctionBegin;
1000   MatCheckProduct(mat, 1);
1001   PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJKOKKOS, &Biskok));
1002   if (product->type == MATPRODUCT_ABC) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJKOKKOS, &Ciskok));
1003   if (Biskok && Ciskok) {
1004     switch (product->type) {
1005     case MATPRODUCT_AB:
1006     case MATPRODUCT_AtB:
1007     case MATPRODUCT_ABt:
1008       mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos;
1009       break;
1010     case MATPRODUCT_PtAP:
1011     case MATPRODUCT_RARt:
1012     case MATPRODUCT_ABC:
1013       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
1014       break;
1015     default:
1016       break;
1017     }
1018   } else { /* fallback for AIJ */
1019     PetscCall(MatProductSetFromOptions_SeqAIJ(mat));
1020   }
1021   PetscFunctionReturn(PETSC_SUCCESS);
1022 }
1023 
1024 static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a)
1025 {
1026   Mat_SeqAIJKokkos *aijkok;
1027 
1028   PetscFunctionBegin;
1029   PetscCall(PetscLogGpuTimeBegin());
1030   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1031   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1032   KokkosBlas::scal(PetscGetKokkosExecutionSpace(), aijkok->a_dual.view_device(), a, aijkok->a_dual.view_device());
1033   PetscCall(MatSeqAIJKokkosModifyDevice(A));
1034   PetscCall(PetscLogGpuFlops(aijkok->a_dual.extent(0)));
1035   PetscCall(PetscLogGpuTimeEnd());
1036   PetscFunctionReturn(PETSC_SUCCESS);
1037 }
1038 
1039 // add a to A's diagonal (if A is square) or main diagonal (if A is rectangular)
1040 static PetscErrorCode MatShift_SeqAIJKokkos(Mat A, PetscScalar a)
1041 {
1042   Mat_SeqAIJ *aijseq = static_cast<Mat_SeqAIJ *>(A->data);
1043 
1044   PetscFunctionBegin;
1045   if (A->assembled && aijseq->diagonaldense) { // no missing diagonals
1046     PetscInt n = PetscMin(A->rmap->n, A->cmap->n);
1047 
1048     PetscCall(PetscLogGpuTimeBegin());
1049     PetscCall(MatSeqAIJKokkosSyncDevice(A));
1050     const auto  aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1051     const auto &Aa     = aijkok->a_dual.view_device();
1052     const auto &Adiag  = aijkok->diag_dual.view_device();
1053     PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) { Aa(Adiag(i)) += a; }));
1054     PetscCall(MatSeqAIJKokkosModifyDevice(A));
1055     PetscCall(PetscLogGpuFlops(n));
1056     PetscCall(PetscLogGpuTimeEnd());
1057   } else { // need reassembly, very slow!
1058     PetscCall(MatShift_Basic(A, a));
1059   }
1060   PetscFunctionReturn(PETSC_SUCCESS);
1061 }
1062 
1063 static PetscErrorCode MatDiagonalSet_SeqAIJKokkos(Mat Y, Vec D, InsertMode is)
1064 {
1065   Mat_SeqAIJ *aijseq = static_cast<Mat_SeqAIJ *>(Y->data);
1066 
1067   PetscFunctionBegin;
1068   if (Y->assembled && aijseq->diagonaldense) { // no missing diagonals
1069     ConstPetscScalarKokkosView dv;
1070     PetscInt                   n, nv;
1071 
1072     PetscCall(PetscLogGpuTimeBegin());
1073     PetscCall(MatSeqAIJKokkosSyncDevice(Y));
1074     PetscCall(VecGetKokkosView(D, &dv));
1075     PetscCall(VecGetLocalSize(D, &nv));
1076     n = PetscMin(Y->rmap->n, Y->cmap->n);
1077     PetscCheck(n == nv, PetscObjectComm((PetscObject)Y), PETSC_ERR_ARG_SIZ, "Matrix size and vector size do not match");
1078 
1079     const auto  aijkok = static_cast<Mat_SeqAIJKokkos *>(Y->spptr);
1080     const auto &Aa     = aijkok->a_dual.view_device();
1081     const auto &Adiag  = aijkok->diag_dual.view_device();
1082     PetscCallCXX(Kokkos::parallel_for(
1083       Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) {
1084         if (is == INSERT_VALUES) Aa(Adiag(i)) = dv(i);
1085         else Aa(Adiag(i)) += dv(i);
1086       }));
1087     PetscCall(VecRestoreKokkosView(D, &dv));
1088     PetscCall(MatSeqAIJKokkosModifyDevice(Y));
1089     PetscCall(PetscLogGpuFlops(n));
1090     PetscCall(PetscLogGpuTimeEnd());
1091   } else { // need reassembly, very slow!
1092     PetscCall(MatDiagonalSet_Default(Y, D, is));
1093   }
1094   PetscFunctionReturn(PETSC_SUCCESS);
1095 }
1096 
1097 static PetscErrorCode MatDiagonalScale_SeqAIJKokkos(Mat A, Vec ll, Vec rr)
1098 {
1099   Mat_SeqAIJ                *aijseq = static_cast<Mat_SeqAIJ *>(A->data);
1100   PetscInt                   m = A->rmap->n, n = A->cmap->n, nz = aijseq->nz;
1101   ConstPetscScalarKokkosView lv, rv;
1102 
1103   PetscFunctionBegin;
1104   PetscCall(PetscLogGpuTimeBegin());
1105   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1106   const auto  aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1107   const auto &Aa     = aijkok->a_dual.view_device();
1108   const auto &Ai     = aijkok->i_dual.view_device();
1109   const auto &Aj     = aijkok->j_dual.view_device();
1110   if (ll) {
1111     PetscCall(VecGetLocalSize(ll, &m));
1112     PetscCheck(m == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling vector wrong length");
1113     PetscCall(VecGetKokkosView(ll, &lv));
1114     PetscCallCXX(Kokkos::parallel_for( // for each row
1115       Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
1116         PetscInt i   = t.league_rank(); // row i
1117         PetscInt len = Ai(i + 1) - Ai(i);
1118         // scale entries on the row
1119         Kokkos::parallel_for(Kokkos::TeamThreadRange(t, len), [&](PetscInt j) { Aa(Ai(i) + j) *= lv(i); });
1120       }));
1121     PetscCall(VecRestoreKokkosView(ll, &lv));
1122     PetscCall(PetscLogGpuFlops(nz));
1123   }
1124   if (rr) {
1125     PetscCall(VecGetLocalSize(rr, &n));
1126     PetscCheck(n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Right scaling vector wrong length");
1127     PetscCall(VecGetKokkosView(rr, &rv));
1128     PetscCallCXX(Kokkos::parallel_for( // for each nonzero
1129       Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt k) { Aa(k) *= rv(Aj(k)); }));
1130     PetscCall(VecRestoreKokkosView(rr, &lv));
1131     PetscCall(PetscLogGpuFlops(nz));
1132   }
1133   PetscCall(MatSeqAIJKokkosModifyDevice(A));
1134   PetscCall(PetscLogGpuTimeEnd());
1135   PetscFunctionReturn(PETSC_SUCCESS);
1136 }
1137 
1138 static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A)
1139 {
1140   Mat_SeqAIJKokkos *aijkok;
1141 
1142   PetscFunctionBegin;
1143   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1144   if (aijkok) { /* Only zero the device if data is already there */
1145     KokkosBlas::fill(PetscGetKokkosExecutionSpace(), aijkok->a_dual.view_device(), 0.0);
1146     PetscCall(MatSeqAIJKokkosModifyDevice(A));
1147   } else { /* Might be preallocated but not assembled */
1148     PetscCall(MatZeroEntries_SeqAIJ(A));
1149   }
1150   PetscFunctionReturn(PETSC_SUCCESS);
1151 }
1152 
1153 static PetscErrorCode MatGetDiagonal_SeqAIJKokkos(Mat A, Vec x)
1154 {
1155   Mat_SeqAIJKokkos     *aijkok;
1156   PetscInt              n;
1157   PetscScalarKokkosView xv;
1158 
1159   PetscFunctionBegin;
1160   PetscCall(VecGetLocalSize(x, &n));
1161   PetscCheck(n == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector");
1162   PetscCheck(A->factortype == MAT_FACTOR_NONE, PETSC_COMM_SELF, PETSC_ERR_SUP, "MatGetDiagonal_SeqAIJKokkos not supported on factored matrices");
1163 
1164   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1165   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1166 
1167   const auto &Aa    = aijkok->a_dual.view_device();
1168   const auto &Ai    = aijkok->i_dual.view_device();
1169   const auto &Adiag = aijkok->diag_dual.view_device();
1170 
1171   PetscCall(VecGetKokkosViewWrite(x, &xv));
1172   Kokkos::parallel_for(
1173     Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) {
1174       if (Adiag(i) < Ai(i + 1)) xv(i) = Aa(Adiag(i));
1175       else xv(i) = 0;
1176     });
1177   PetscCall(VecRestoreKokkosViewWrite(x, &xv));
1178   PetscFunctionReturn(PETSC_SUCCESS);
1179 }
1180 
1181 /* Get a Kokkos View from a mat of type MatSeqAIJKokkos */
1182 PetscErrorCode MatSeqAIJGetKokkosView(Mat A, ConstMatScalarKokkosView *kv)
1183 {
1184   Mat_SeqAIJKokkos *aijkok;
1185 
1186   PetscFunctionBegin;
1187   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1188   PetscAssertPointer(kv, 2);
1189   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1190   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1191   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1192   *kv    = aijkok->a_dual.view_device();
1193   PetscFunctionReturn(PETSC_SUCCESS);
1194 }
1195 
1196 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, ConstMatScalarKokkosView *kv)
1197 {
1198   PetscFunctionBegin;
1199   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1200   PetscAssertPointer(kv, 2);
1201   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1202   PetscFunctionReturn(PETSC_SUCCESS);
1203 }
1204 
1205 PetscErrorCode MatSeqAIJGetKokkosView(Mat A, MatScalarKokkosView *kv)
1206 {
1207   Mat_SeqAIJKokkos *aijkok;
1208 
1209   PetscFunctionBegin;
1210   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1211   PetscAssertPointer(kv, 2);
1212   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1213   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1214   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1215   *kv    = aijkok->a_dual.view_device();
1216   PetscFunctionReturn(PETSC_SUCCESS);
1217 }
1218 
1219 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, MatScalarKokkosView *kv)
1220 {
1221   PetscFunctionBegin;
1222   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1223   PetscAssertPointer(kv, 2);
1224   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1225   PetscCall(MatSeqAIJKokkosModifyDevice(A));
1226   PetscFunctionReturn(PETSC_SUCCESS);
1227 }
1228 
1229 PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A, MatScalarKokkosView *kv)
1230 {
1231   Mat_SeqAIJKokkos *aijkok;
1232 
1233   PetscFunctionBegin;
1234   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1235   PetscAssertPointer(kv, 2);
1236   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1237   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1238   *kv    = aijkok->a_dual.view_device();
1239   PetscFunctionReturn(PETSC_SUCCESS);
1240 }
1241 
1242 PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A, MatScalarKokkosView *kv)
1243 {
1244   PetscFunctionBegin;
1245   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1246   PetscAssertPointer(kv, 2);
1247   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1248   PetscCall(MatSeqAIJKokkosModifyDevice(A));
1249   PetscFunctionReturn(PETSC_SUCCESS);
1250 }
1251 
1252 PetscErrorCode MatCreateSeqAIJKokkosWithKokkosViews(MPI_Comm comm, PetscInt m, PetscInt n, Kokkos::View<PetscInt *> &i_d, Kokkos::View<PetscInt *> &j_d, Kokkos::View<PetscScalar *> &a_d, Mat *A)
1253 {
1254   Mat_SeqAIJKokkos *akok;
1255 
1256   PetscFunctionBegin;
1257   // Create host copies of the input aij
1258   auto i_h = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), i_d);
1259   auto j_h = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), j_d);
1260   // Don't copy the vals to the host now
1261   auto a_h = Kokkos::create_mirror_view(HostMirrorMemorySpace(), a_d);
1262 
1263   MatScalarKokkosDualView a_dual = MatScalarKokkosDualView(a_d, a_h);
1264   // Note we have modified device data so it will copy lazily
1265   a_dual.modify_device();
1266   MatRowMapKokkosDualView i_dual = MatRowMapKokkosDualView(i_d, i_h);
1267   MatColIdxKokkosDualView j_dual = MatColIdxKokkosDualView(j_d, j_h);
1268 
1269   PetscCallCXX(akok = new Mat_SeqAIJKokkos(m, n, j_dual.extent(0), i_dual, j_dual, a_dual));
1270   PetscCall(MatCreate(comm, A));
1271   PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok));
1272   PetscFunctionReturn(PETSC_SUCCESS);
1273 }
1274 
1275 /* Computes Y += alpha X */
1276 static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y, PetscScalar alpha, Mat X, MatStructure pattern)
1277 {
1278   Mat_SeqAIJ              *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data;
1279   Mat_SeqAIJKokkos        *xkok, *ykok, *zkok;
1280   ConstMatScalarKokkosView Xa;
1281   MatScalarKokkosView      Ya;
1282   auto                     exec = PetscGetKokkosExecutionSpace();
1283 
1284   PetscFunctionBegin;
1285   PetscCheckTypeName(Y, MATSEQAIJKOKKOS);
1286   PetscCheckTypeName(X, MATSEQAIJKOKKOS);
1287   PetscCall(MatSeqAIJKokkosSyncDevice(Y));
1288   PetscCall(MatSeqAIJKokkosSyncDevice(X));
1289   PetscCall(PetscLogGpuTimeBegin());
1290 
1291   if (pattern != SAME_NONZERO_PATTERN && x->nz == y->nz) {
1292     PetscBool e;
1293     PetscCall(PetscArraycmp(x->i, y->i, Y->rmap->n + 1, &e));
1294     if (e) {
1295       PetscCall(PetscArraycmp(x->j, y->j, y->nz, &e));
1296       if (e) pattern = SAME_NONZERO_PATTERN;
1297     }
1298   }
1299 
1300   /* cusparseDcsrgeam2() computes C = alpha A + beta B. If one knew sparsity pattern of C, one can skip
1301     cusparseScsrgeam2_bufferSizeExt() / cusparseXcsrgeam2Nnz(), and directly call cusparseScsrgeam2().
1302     If X is SUBSET_NONZERO_PATTERN of Y, we could take advantage of this cusparse feature. However,
1303     KokkosSparse::spadd(alpha,A,beta,B,C) has symbolic and numeric phases, MatAXPY does not.
1304   */
1305   ykok = static_cast<Mat_SeqAIJKokkos *>(Y->spptr);
1306   xkok = static_cast<Mat_SeqAIJKokkos *>(X->spptr);
1307   Xa   = xkok->a_dual.view_device();
1308   Ya   = ykok->a_dual.view_device();
1309 
1310   if (pattern == SAME_NONZERO_PATTERN) {
1311     KokkosBlas::axpy(exec, alpha, Xa, Ya);
1312     PetscCall(MatSeqAIJKokkosModifyDevice(Y));
1313   } else if (pattern == SUBSET_NONZERO_PATTERN) {
1314     MatRowMapKokkosView Xi = xkok->i_dual.view_device(), Yi = ykok->i_dual.view_device();
1315     MatColIdxKokkosView Xj = xkok->j_dual.view_device(), Yj = ykok->j_dual.view_device();
1316 
1317     Kokkos::parallel_for(
1318       Kokkos::TeamPolicy<>(exec, Y->rmap->n, 1), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
1319         PetscInt i = t.league_rank(); // row i
1320         Kokkos::single(Kokkos::PerTeam(t), [=]() {
1321           // Only one thread works in a team
1322           PetscInt p, q = Yi(i);
1323           for (p = Xi(i); p < Xi(i + 1); p++) {          // For each nonzero on row i of X,
1324             while (Xj(p) != Yj(q) && q < Yi(i + 1)) q++; // find the matching nonzero on row i of Y.
1325             if (Xj(p) == Yj(q)) {                        // Found it
1326               Ya(q) += alpha * Xa(p);
1327               q++;
1328             } else {
1329             // If not found, it indicates the input is wrong (X is not a SUBSET_NONZERO_PATTERN of Y).
1330             // Just insert a NaN at the beginning of row i if it is not empty, to make the result wrong.
1331 #if PETSC_PKG_KOKKOS_VERSION_GE(3, 7, 0)
1332               if (Yi(i) != Yi(i + 1)) Ya(Yi(i)) = Kokkos::ArithTraits<PetscScalar>::nan();
1333 #else
1334               if (Yi(i) != Yi(i + 1)) Ya(Yi(i)) = Kokkos::Experimental::nan("1");
1335 #endif
1336             }
1337           }
1338         });
1339       });
1340     PetscCall(MatSeqAIJKokkosModifyDevice(Y));
1341   } else { // different nonzero patterns
1342     Mat             Z;
1343     KokkosCsrMatrix zcsr;
1344     KernelHandle    kh;
1345     kh.create_spadd_handle(true); // X, Y are sorted
1346     KokkosSparse::spadd_symbolic(&kh, xkok->csrmat, ykok->csrmat, zcsr);
1347     KokkosSparse::spadd_numeric(&kh, alpha, xkok->csrmat, (PetscScalar)1.0, ykok->csrmat, zcsr);
1348     zkok = new Mat_SeqAIJKokkos(zcsr);
1349     PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, zkok, &Z));
1350     PetscCall(MatHeaderReplace(Y, &Z));
1351     kh.destroy_spadd_handle();
1352   }
1353   PetscCall(PetscLogGpuTimeEnd());
1354   PetscCall(PetscLogGpuFlops(xkok->a_dual.extent(0) * 2)); // Because we scaled X and then added it to Y
1355   PetscFunctionReturn(PETSC_SUCCESS);
1356 }
1357 
1358 struct MatCOOStruct_SeqAIJKokkos {
1359   PetscCount           n;
1360   PetscCount           Atot;
1361   PetscInt             nz;
1362   PetscCountKokkosView jmap;
1363   PetscCountKokkosView perm;
1364 
1365   MatCOOStruct_SeqAIJKokkos(const MatCOOStruct_SeqAIJ *coo_h)
1366   {
1367     nz   = coo_h->nz;
1368     n    = coo_h->n;
1369     Atot = coo_h->Atot;
1370     jmap = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), PetscCountKokkosViewHost(coo_h->jmap, nz + 1));
1371     perm = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), PetscCountKokkosViewHost(coo_h->perm, Atot));
1372   }
1373 };
1374 
1375 static PetscErrorCode MatCOOStructDestroy_SeqAIJKokkos(void **data)
1376 {
1377   PetscFunctionBegin;
1378   PetscCallCXX(delete static_cast<MatCOOStruct_SeqAIJKokkos *>(*data));
1379   PetscFunctionReturn(PETSC_SUCCESS);
1380 }
1381 
1382 static PetscErrorCode MatSetPreallocationCOO_SeqAIJKokkos(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
1383 {
1384   Mat_SeqAIJKokkos          *akok;
1385   Mat_SeqAIJ                *aseq;
1386   PetscContainer             container_h;
1387   MatCOOStruct_SeqAIJ       *coo_h;
1388   MatCOOStruct_SeqAIJKokkos *coo_d;
1389 
1390   PetscFunctionBegin;
1391   PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, coo_i, coo_j));
1392   aseq = static_cast<Mat_SeqAIJ *>(mat->data);
1393   akok = static_cast<Mat_SeqAIJKokkos *>(mat->spptr);
1394   delete akok;
1395   mat->spptr = akok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, aseq, mat->nonzerostate + 1, PETSC_FALSE);
1396   PetscCall(MatZeroEntries_SeqAIJKokkos(mat));
1397 
1398   // Copy the COO struct to device
1399   PetscCall(PetscObjectQuery((PetscObject)mat, "__PETSc_MatCOOStruct_Host", (PetscObject *)&container_h));
1400   PetscCall(PetscContainerGetPointer(container_h, (void **)&coo_h));
1401   PetscCallCXX(coo_d = new MatCOOStruct_SeqAIJKokkos(coo_h));
1402 
1403   // Put the COO struct in a container and then attach that to the matrix
1404   PetscCall(PetscObjectContainerCompose((PetscObject)mat, "__PETSc_MatCOOStruct_Device", coo_d, MatCOOStructDestroy_SeqAIJKokkos));
1405   PetscFunctionReturn(PETSC_SUCCESS);
1406 }
1407 
1408 static PetscErrorCode MatSetValuesCOO_SeqAIJKokkos(Mat A, const PetscScalar v[], InsertMode imode)
1409 {
1410   MatScalarKokkosView        Aa;
1411   ConstMatScalarKokkosView   kv;
1412   PetscMemType               memtype;
1413   PetscContainer             container;
1414   MatCOOStruct_SeqAIJKokkos *coo;
1415 
1416   PetscFunctionBegin;
1417   PetscCall(PetscObjectQuery((PetscObject)A, "__PETSc_MatCOOStruct_Device", (PetscObject *)&container));
1418   PetscCall(PetscContainerGetPointer(container, (void **)&coo));
1419 
1420   const auto &n    = coo->n;
1421   const auto &Annz = coo->nz;
1422   const auto &jmap = coo->jmap;
1423   const auto &perm = coo->perm;
1424 
1425   PetscCall(PetscGetMemType(v, &memtype));
1426   if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */
1427     kv = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), ConstMatScalarKokkosViewHost(v, n));
1428   } else {
1429     kv = ConstMatScalarKokkosView(v, n); /* Directly use v[]'s memory */
1430   }
1431 
1432   if (imode == INSERT_VALUES) PetscCall(MatSeqAIJGetKokkosViewWrite(A, &Aa)); /* write matrix values */
1433   else PetscCall(MatSeqAIJGetKokkosView(A, &Aa));                             /* read & write matrix values */
1434 
1435   PetscCall(PetscLogGpuTimeBegin());
1436   Kokkos::parallel_for(
1437     Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, Annz), KOKKOS_LAMBDA(const PetscCount i) {
1438       PetscScalar sum = 0.0;
1439       for (PetscCount k = jmap(i); k < jmap(i + 1); k++) sum += kv(perm(k));
1440       Aa(i) = (imode == INSERT_VALUES ? 0.0 : Aa(i)) + sum;
1441     });
1442   PetscCall(PetscLogGpuTimeEnd());
1443 
1444   if (imode == INSERT_VALUES) PetscCall(MatSeqAIJRestoreKokkosViewWrite(A, &Aa));
1445   else PetscCall(MatSeqAIJRestoreKokkosView(A, &Aa));
1446   PetscFunctionReturn(PETSC_SUCCESS);
1447 }
1448 
1449 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A)
1450 {
1451   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
1452 
1453   PetscFunctionBegin;
1454   A->offloadmask = PETSC_OFFLOAD_KOKKOS; /* We do not really use this flag */
1455   A->boundtocpu  = PETSC_FALSE;
1456 
1457   A->ops->assemblyend               = MatAssemblyEnd_SeqAIJKokkos;
1458   A->ops->destroy                   = MatDestroy_SeqAIJKokkos;
1459   A->ops->duplicate                 = MatDuplicate_SeqAIJKokkos;
1460   A->ops->axpy                      = MatAXPY_SeqAIJKokkos;
1461   A->ops->scale                     = MatScale_SeqAIJKokkos;
1462   A->ops->zeroentries               = MatZeroEntries_SeqAIJKokkos;
1463   A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJKokkos;
1464   A->ops->mult                      = MatMult_SeqAIJKokkos;
1465   A->ops->multadd                   = MatMultAdd_SeqAIJKokkos;
1466   A->ops->multtranspose             = MatMultTranspose_SeqAIJKokkos;
1467   A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJKokkos;
1468   A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJKokkos;
1469   A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos;
1470   A->ops->productnumeric            = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos;
1471   A->ops->transpose                 = MatTranspose_SeqAIJKokkos;
1472   A->ops->setoption                 = MatSetOption_SeqAIJKokkos;
1473   A->ops->getdiagonal               = MatGetDiagonal_SeqAIJKokkos;
1474   A->ops->shift                     = MatShift_SeqAIJKokkos;
1475   A->ops->diagonalset               = MatDiagonalSet_SeqAIJKokkos;
1476   A->ops->diagonalscale             = MatDiagonalScale_SeqAIJKokkos;
1477   A->ops->getcurrentmemtype         = MatGetCurrentMemType_SeqAIJKokkos;
1478   a->ops->getarray                  = MatSeqAIJGetArray_SeqAIJKokkos;
1479   a->ops->restorearray              = MatSeqAIJRestoreArray_SeqAIJKokkos;
1480   a->ops->getarrayread              = MatSeqAIJGetArrayRead_SeqAIJKokkos;
1481   a->ops->restorearrayread          = MatSeqAIJRestoreArrayRead_SeqAIJKokkos;
1482   a->ops->getarraywrite             = MatSeqAIJGetArrayWrite_SeqAIJKokkos;
1483   a->ops->restorearraywrite         = MatSeqAIJRestoreArrayWrite_SeqAIJKokkos;
1484   a->ops->getcsrandmemtype          = MatSeqAIJGetCSRAndMemType_SeqAIJKokkos;
1485 
1486   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJKokkos));
1487   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJKokkos));
1488 #if defined(PETSC_HAVE_HYPRE)
1489   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijkokkos_hypre_C", MatConvert_AIJ_HYPRE));
1490 #endif
1491   PetscFunctionReturn(PETSC_SUCCESS);
1492 }
1493 
1494 /*
1495    Extract the (prescribled) diagonal blocks of the matrix and then invert them
1496 
1497   Input Parameters:
1498 +  A       - the MATSEQAIJKOKKOS matrix
1499 .  bs      - block sizes in 'csr' format, i.e., the i-th block has size bs(i+1) - bs(i)
1500 .  bs2     - square of block sizes in 'csr' format, i.e., the i-th block should be stored at offset bs2(i) in diagVal[]
1501 .  blkMap  - map row ids to block ids, i.e., row i belongs to the block blkMap(i)
1502 -  work    - a pre-allocated work buffer (as big as diagVal) for use by this routine
1503 
1504   Output Parameter:
1505 .  diagVal - the (pre-allocated) buffer to store the inverted blocks (each block is stored in column-major order)
1506 */
1507 PETSC_INTERN PetscErrorCode MatInvertVariableBlockDiagonal_SeqAIJKokkos(Mat A, const PetscIntKokkosView &bs, const PetscIntKokkosView &bs2, const PetscIntKokkosView &blkMap, PetscScalarKokkosView &work, PetscScalarKokkosView &diagVal)
1508 {
1509   Mat_SeqAIJKokkos *akok    = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1510   PetscInt          nblocks = bs.extent(0) - 1;
1511 
1512   PetscFunctionBegin;
1513   PetscCall(MatSeqAIJKokkosSyncDevice(A)); // Since we'll access A's value on device
1514 
1515   // Pull out the diagonal blocks of the matrix and then invert the blocks
1516   auto Aa    = akok->a_dual.view_device();
1517   auto Ai    = akok->i_dual.view_device();
1518   auto Aj    = akok->j_dual.view_device();
1519   auto Adiag = akok->diag_dual.view_device();
1520   // TODO: how to tune the team size?
1521 #if defined(KOKKOS_ENABLE_UNIFIED_MEMORY)
1522   auto ts = Kokkos::AUTO();
1523 #else
1524   auto ts = 16; // improved performance 30% over Kokkos::AUTO() with CUDA, but failed with "Kokkos::abort: Requested Team Size is too large!" on CPUs
1525 #endif
1526   PetscCallCXX(Kokkos::parallel_for(
1527     Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), nblocks, ts), KOKKOS_LAMBDA(const KokkosTeamMemberType &teamMember) {
1528       const PetscInt bid    = teamMember.league_rank();                                                   // block id
1529       const PetscInt rstart = bs(bid);                                                                    // this block starts from this row
1530       const PetscInt m      = bs(bid + 1) - bs(bid);                                                      // size of this block
1531       const auto    &B      = Kokkos::View<PetscScalar **, Kokkos::LayoutLeft>(&diagVal(bs2(bid)), m, m); // column-major order
1532       const auto    &W      = PetscScalarKokkosView(&work(bs2(bid)), m * m);
1533 
1534       Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, m), [=](const PetscInt &r) { // r-th row in B
1535         PetscInt i = rstart + r;                                                            // i-th row in A
1536 
1537         if (Ai(i) <= Adiag(i) && Adiag(i) < Ai(i + 1)) { // if the diagonal exists (common case)
1538           PetscInt first = Adiag(i) - r;                 // we start to check nonzeros from here along this row
1539 
1540           for (PetscInt c = 0; c < m; c++) {                   // walk n steps to see what column indices we will meet
1541             if (first + c < Ai(i) || first + c >= Ai(i + 1)) { // this entry (first+c) is out of range of this row, in other words, its value is zero
1542               B(r, c) = 0.0;
1543             } else if (Aj(first + c) == rstart + c) { // this entry is right on the (rstart+c) column
1544               B(r, c) = Aa(first + c);
1545             } else { // this entry does not show up in the CSR
1546               B(r, c) = 0.0;
1547             }
1548           }
1549         } else { // rare case that the diagonal does not exist
1550           const PetscInt begin = Ai(i);
1551           const PetscInt end   = Ai(i + 1);
1552           for (PetscInt c = 0; c < m; c++) B(r, c) = 0.0;
1553           for (PetscInt j = begin; j < end; j++) { // scan the whole row; could use binary search but this is a rare case so we did not.
1554             if (rstart <= Aj(j) && Aj(j) < rstart + m) B(r, Aj(j) - rstart) = Aa(j);
1555             else if (Aj(j) >= rstart + m) break;
1556           }
1557         }
1558       });
1559 
1560       // LU-decompose B (w/o pivoting) and then invert B
1561       KokkosBatched::TeamLU<KokkosTeamMemberType, KokkosBatched::Algo::LU::Unblocked>::invoke(teamMember, B, 0.0);
1562       KokkosBatched::TeamInverseLU<KokkosTeamMemberType, KokkosBatched::Algo::InverseLU::Unblocked>::invoke(teamMember, B, W);
1563     }));
1564   // PetscLogGpuFlops() is done in the caller PCSetUp_VPBJacobi_Kokkos as we don't want to compute the flops in kernels
1565   PetscFunctionReturn(PETSC_SUCCESS);
1566 }
1567 
1568 PETSC_INTERN PetscErrorCode MatSetSeqAIJKokkosWithCSRMatrix(Mat A, Mat_SeqAIJKokkos *akok)
1569 {
1570   Mat_SeqAIJ *aseq;
1571   PetscInt    i, m, n;
1572   auto        exec = PetscGetKokkosExecutionSpace();
1573 
1574   PetscFunctionBegin;
1575   PetscCheck(!A->spptr, PETSC_COMM_SELF, PETSC_ERR_PLIB, "A->spptr is supposed to be empty");
1576 
1577   m = akok->nrows();
1578   n = akok->ncols();
1579   PetscCall(MatSetSizes(A, m, n, m, n));
1580   PetscCall(MatSetType(A, MATSEQAIJKOKKOS));
1581 
1582   /* Set up data structures of A as a MATSEQAIJ */
1583   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(A, MAT_SKIP_ALLOCATION, NULL));
1584   aseq = (Mat_SeqAIJ *)A->data;
1585 
1586   PetscCall(KokkosDualViewSyncHost(akok->i_dual, exec)); /* We always need sync'ed i, j on host */
1587   PetscCall(KokkosDualViewSyncHost(akok->j_dual, exec));
1588 
1589   aseq->i       = akok->i_host_data();
1590   aseq->j       = akok->j_host_data();
1591   aseq->a       = akok->a_host_data();
1592   aseq->nonew   = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/
1593   aseq->free_a  = PETSC_FALSE;
1594   aseq->free_ij = PETSC_FALSE;
1595   aseq->nz      = akok->nnz();
1596   aseq->maxnz   = aseq->nz;
1597 
1598   PetscCall(PetscMalloc1(m, &aseq->imax));
1599   PetscCall(PetscMalloc1(m, &aseq->ilen));
1600   for (i = 0; i < m; i++) aseq->ilen[i] = aseq->imax[i] = aseq->i[i + 1] - aseq->i[i];
1601 
1602   /* It is critical to set the nonzerostate, as we use it to check if sparsity pattern (hence data) has changed on host in MatAssemblyEnd */
1603   akok->nonzerostate = A->nonzerostate;
1604   A->spptr           = akok; /* Set A->spptr before MatAssembly so that A->spptr won't be allocated again there */
1605   PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY));
1606   PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY));
1607   PetscFunctionReturn(PETSC_SUCCESS);
1608 }
1609 
1610 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGetKokkosCsrMatrix(Mat A, KokkosCsrMatrix *csr)
1611 {
1612   PetscFunctionBegin;
1613   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1614   *csr = static_cast<Mat_SeqAIJKokkos *>(A->spptr)->csrmat;
1615   PetscFunctionReturn(PETSC_SUCCESS);
1616 }
1617 
1618 PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithKokkosCsrMatrix(MPI_Comm comm, KokkosCsrMatrix csr, Mat *A)
1619 {
1620   Mat_SeqAIJKokkos *akok;
1621 
1622   PetscFunctionBegin;
1623   PetscCallCXX(akok = new Mat_SeqAIJKokkos(csr));
1624   PetscCall(MatCreate(comm, A));
1625   PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok));
1626   PetscFunctionReturn(PETSC_SUCCESS);
1627 }
1628 
1629 /* Crete a SEQAIJKOKKOS matrix with a Mat_SeqAIJKokkos data structure
1630 
1631    Note we have names like MatSeqAIJSetPreallocationCSR, so I use capitalized CSR
1632  */
1633 PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithCSRMatrix(MPI_Comm comm, Mat_SeqAIJKokkos *akok, Mat *A)
1634 {
1635   PetscFunctionBegin;
1636   PetscCall(MatCreate(comm, A));
1637   PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok));
1638   PetscFunctionReturn(PETSC_SUCCESS);
1639 }
1640 
1641 /*@C
1642   MatCreateSeqAIJKokkos - Creates a sparse matrix in `MATSEQAIJKOKKOS` (compressed row) format
1643   (the default parallel PETSc format). This matrix will ultimately be handled by
1644   Kokkos for calculations.
1645 
1646   Collective
1647 
1648   Input Parameters:
1649 + comm - MPI communicator, set to `PETSC_COMM_SELF`
1650 . m    - number of rows
1651 . n    - number of columns
1652 . nz   - number of nonzeros per row (same for all rows), ignored if `nnz` is provided
1653 - nnz  - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL`
1654 
1655   Output Parameter:
1656 . A - the matrix
1657 
1658   Level: intermediate
1659 
1660   Notes:
1661   It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
1662   MatXXXXSetPreallocation() paradgm instead of this routine directly.
1663   [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]
1664 
1665   The AIJ format, also called
1666   compressed row storage, is fully compatible with standard Fortran
1667   storage.  That is, the stored row and column indices can begin at
1668   either one (as in Fortran) or zero.
1669 
1670   Specify the preallocated storage with either `nz` or `nnz` (not both).
1671   Set `nz` = `PETSC_DEFAULT` and `nnz` = `NULL` for PETSc to control dynamic memory
1672   allocation.
1673 
1674 .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`
1675 @*/
1676 PetscErrorCode MatCreateSeqAIJKokkos(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A)
1677 {
1678   PetscFunctionBegin;
1679   PetscCall(PetscKokkosInitializeCheck());
1680   PetscCall(MatCreate(comm, A));
1681   PetscCall(MatSetSizes(*A, m, n, m, n));
1682   PetscCall(MatSetType(*A, MATSEQAIJKOKKOS));
1683   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz));
1684   PetscFunctionReturn(PETSC_SUCCESS);
1685 }
1686 
1687 // After matrix numeric factorization, there are still steps to do before triangular solve can be called.
1688 // For example, for transpose solve, we might need to compute the transpose matrices if the solver does not support it (such as KK, while cusparse does).
1689 // In cusparse, one has to call cusparseSpSV_analysis() with updated triangular matrix values before calling cusparseSpSV_solve().
1690 // Simiarily, in KK sptrsv_symbolic() has to be called before sptrsv_solve(). We put these steps in MatSeqAIJKokkos{Transpose}SolveCheck.
1691 static PetscErrorCode MatSeqAIJKokkosSolveCheck(Mat A)
1692 {
1693   Mat_SeqAIJKokkosTriFactors *factors   = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1694   const PetscBool             has_lower = factors->iL_d.extent(0) ? PETSC_TRUE : PETSC_FALSE; // false with Choleksy
1695   const PetscBool             has_upper = factors->iU_d.extent(0) ? PETSC_TRUE : PETSC_FALSE; // true with LU and Choleksy
1696 
1697   PetscFunctionBegin;
1698   if (!factors->sptrsv_symbolic_completed) { // If sptrsv_symbolic was not called yet
1699     if (has_upper) PetscCallCXX(sptrsv_symbolic(&factors->khU, factors->iU_d, factors->jU_d, factors->aU_d));
1700     if (has_lower) PetscCallCXX(sptrsv_symbolic(&factors->khL, factors->iL_d, factors->jL_d, factors->aL_d));
1701     factors->sptrsv_symbolic_completed = PETSC_TRUE;
1702   }
1703   PetscFunctionReturn(PETSC_SUCCESS);
1704 }
1705 
1706 static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A)
1707 {
1708   const PetscInt              n         = A->rmap->n;
1709   Mat_SeqAIJKokkosTriFactors *factors   = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1710   const PetscBool             has_lower = factors->iL_d.extent(0) ? PETSC_TRUE : PETSC_FALSE; // false with Choleksy
1711   const PetscBool             has_upper = factors->iU_d.extent(0) ? PETSC_TRUE : PETSC_FALSE; // true with LU or Choleksy
1712 
1713   PetscFunctionBegin;
1714   if (!factors->transpose_updated) {
1715     if (has_upper) {
1716       if (!factors->iUt_d.extent(0)) {                                 // Allocate Ut on device if not yet
1717         factors->iUt_d = MatRowMapKokkosView("factors->iUt_d", n + 1); // KK requires this view to be initialized to 0 to call transpose_matrix
1718         factors->jUt_d = MatColIdxKokkosView(NoInit("factors->jUt_d"), factors->jU_d.extent(0));
1719         factors->aUt_d = MatScalarKokkosView(NoInit("factors->aUt_d"), factors->aU_d.extent(0));
1720       }
1721 
1722       if (factors->iU_h.extent(0)) { // If U is on host (factorization was done on host), we also compute the transpose on host
1723         if (!factors->U) {
1724           Mat_SeqAIJ *seq;
1725 
1726           PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF, n, n, factors->iU_h.data(), factors->jU_h.data(), factors->aU_h.data(), &factors->U));
1727           PetscCall(MatTranspose(factors->U, MAT_INITIAL_MATRIX, &factors->Ut));
1728 
1729           seq            = static_cast<Mat_SeqAIJ *>(factors->Ut->data);
1730           factors->iUt_h = MatRowMapKokkosViewHost(seq->i, n + 1);
1731           factors->jUt_h = MatColIdxKokkosViewHost(seq->j, seq->nz);
1732           factors->aUt_h = MatScalarKokkosViewHost(seq->a, seq->nz);
1733         } else {
1734           PetscCall(MatTranspose(factors->U, MAT_REUSE_MATRIX, &factors->Ut)); // Matrix Ut' data is aliased with {i, j, a}Ut_h
1735         }
1736         // Copy Ut from host to device
1737         PetscCallCXX(Kokkos::deep_copy(factors->iUt_d, factors->iUt_h));
1738         PetscCallCXX(Kokkos::deep_copy(factors->jUt_d, factors->jUt_h));
1739         PetscCallCXX(Kokkos::deep_copy(factors->aUt_d, factors->aUt_h));
1740       } else { // If U was computed on device, we also compute the transpose there
1741         // TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices. We have to sort the indices, until KK provides finer control options.
1742         PetscCallCXX(transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iU_d,
1743                                                                                                                                                                                                                                factors->jU_d, factors->aU_d,
1744                                                                                                                                                                                                                                factors->iUt_d, factors->jUt_d,
1745                                                                                                                                                                                                                                factors->aUt_d));
1746         PetscCallCXX(sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iUt_d, factors->jUt_d, factors->aUt_d));
1747       }
1748       PetscCallCXX(sptrsv_symbolic(&factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d));
1749     }
1750 
1751     // do the same for L with LU
1752     if (has_lower) {
1753       if (!factors->iLt_d.extent(0)) {                                 // Allocate Lt on device if not yet
1754         factors->iLt_d = MatRowMapKokkosView("factors->iLt_d", n + 1); // KK requires this view to be initialized to 0 to call transpose_matrix
1755         factors->jLt_d = MatColIdxKokkosView(NoInit("factors->jLt_d"), factors->jL_d.extent(0));
1756         factors->aLt_d = MatScalarKokkosView(NoInit("factors->aLt_d"), factors->aL_d.extent(0));
1757       }
1758 
1759       if (factors->iL_h.extent(0)) { // If L is on host, we also compute the transpose on host
1760         if (!factors->L) {
1761           Mat_SeqAIJ *seq;
1762 
1763           PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF, n, n, factors->iL_h.data(), factors->jL_h.data(), factors->aL_h.data(), &factors->L));
1764           PetscCall(MatTranspose(factors->L, MAT_INITIAL_MATRIX, &factors->Lt));
1765 
1766           seq            = static_cast<Mat_SeqAIJ *>(factors->Lt->data);
1767           factors->iLt_h = MatRowMapKokkosViewHost(seq->i, n + 1);
1768           factors->jLt_h = MatColIdxKokkosViewHost(seq->j, seq->nz);
1769           factors->aLt_h = MatScalarKokkosViewHost(seq->a, seq->nz);
1770         } else {
1771           PetscCall(MatTranspose(factors->L, MAT_REUSE_MATRIX, &factors->Lt)); // Matrix Lt' data is aliased with {i, j, a}Lt_h
1772         }
1773         // Copy Lt from host to device
1774         PetscCallCXX(Kokkos::deep_copy(factors->iLt_d, factors->iLt_h));
1775         PetscCallCXX(Kokkos::deep_copy(factors->jLt_d, factors->jLt_h));
1776         PetscCallCXX(Kokkos::deep_copy(factors->aLt_d, factors->aLt_h));
1777       } else { // If L was computed on device, we also compute the transpose there
1778         // TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices. We have to sort the indices, until KK provides finer control options.
1779         PetscCallCXX(transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iL_d,
1780                                                                                                                                                                                                                                factors->jL_d, factors->aL_d,
1781                                                                                                                                                                                                                                factors->iLt_d, factors->jLt_d,
1782                                                                                                                                                                                                                                factors->aLt_d));
1783         PetscCallCXX(sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iLt_d, factors->jLt_d, factors->aLt_d));
1784       }
1785       PetscCallCXX(sptrsv_symbolic(&factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d));
1786     }
1787 
1788     factors->transpose_updated = PETSC_TRUE;
1789   }
1790   PetscFunctionReturn(PETSC_SUCCESS);
1791 }
1792 
1793 // Solve Ax = b, with RAR = U^T D U, where R is the row (and col) permutation matrix on A.
1794 // R is represented by rowperm in factors. If R is identity (i.e, no reordering), then rowperm is empty.
1795 static PetscErrorCode MatSolve_SeqAIJKokkos_Cholesky(Mat A, Vec bb, Vec xx)
1796 {
1797   auto                        exec    = PetscGetKokkosExecutionSpace();
1798   Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1799   PetscInt                    m       = A->rmap->n;
1800   PetscScalarKokkosView       D       = factors->D_d;
1801   PetscScalarKokkosView       X, Y, B; // alias
1802   ConstPetscScalarKokkosView  b;
1803   PetscScalarKokkosView       x;
1804   PetscIntKokkosView         &rowperm  = factors->rowperm;
1805   PetscBool                   identity = rowperm.extent(0) ? PETSC_FALSE : PETSC_TRUE;
1806 
1807   PetscFunctionBegin;
1808   PetscCall(PetscLogGpuTimeBegin());
1809   PetscCall(MatSeqAIJKokkosSolveCheck(A));          // for UX = T
1810   PetscCall(MatSeqAIJKokkosTransposeSolveCheck(A)); // for U^T Y = B
1811   PetscCall(VecGetKokkosView(bb, &b));
1812   PetscCall(VecGetKokkosViewWrite(xx, &x));
1813 
1814   // Solve U^T Y = B
1815   if (identity) { // Reorder b with the row permutation
1816     B = PetscScalarKokkosView(const_cast<PetscScalar *>(b.data()), b.extent(0));
1817     Y = factors->workVector;
1818   } else {
1819     B = factors->workVector;
1820     PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { B(i) = b(rowperm(i)); }));
1821     Y = x;
1822   }
1823   PetscCallCXX(sptrsv_solve(exec, &factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d, B, Y));
1824 
1825   // Solve diag(D) Y' = Y.
1826   // Actually just do Y' = Y*D since D is already inverted in MatCholeskyFactorNumeric_SeqAIJ(). It is basically a vector element-wise multiplication.
1827   PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { Y(i) = Y(i) * D(i); }));
1828 
1829   // Solve UX = Y
1830   if (identity) {
1831     X = x;
1832   } else {
1833     X = factors->workVector; // B is not needed anymore
1834   }
1835   PetscCallCXX(sptrsv_solve(exec, &factors->khU, factors->iU_d, factors->jU_d, factors->aU_d, Y, X));
1836 
1837   // Reorder X with the inverse column (row) permutation
1838   if (!identity) {
1839     PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { x(rowperm(i)) = X(i); }));
1840   }
1841 
1842   PetscCall(VecRestoreKokkosView(bb, &b));
1843   PetscCall(VecRestoreKokkosViewWrite(xx, &x));
1844   PetscCall(PetscLogGpuTimeEnd());
1845   PetscFunctionReturn(PETSC_SUCCESS);
1846 }
1847 
1848 // Solve Ax = b, with RAC = LU, where R and C are row and col permutation matrices on A respectively.
1849 // R and C are represented by rowperm and colperm in factors.
1850 // If R or C is identity (i.e, no reordering), then rowperm or colperm is empty.
1851 static PetscErrorCode MatSolve_SeqAIJKokkos_LU(Mat A, Vec bb, Vec xx)
1852 {
1853   auto                        exec    = PetscGetKokkosExecutionSpace();
1854   Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1855   PetscInt                    m       = A->rmap->n;
1856   PetscScalarKokkosView       X, Y, B; // alias
1857   ConstPetscScalarKokkosView  b;
1858   PetscScalarKokkosView       x;
1859   PetscIntKokkosView         &rowperm      = factors->rowperm;
1860   PetscIntKokkosView         &colperm      = factors->colperm;
1861   PetscBool                   row_identity = rowperm.extent(0) ? PETSC_FALSE : PETSC_TRUE;
1862   PetscBool                   col_identity = colperm.extent(0) ? PETSC_FALSE : PETSC_TRUE;
1863 
1864   PetscFunctionBegin;
1865   PetscCall(PetscLogGpuTimeBegin());
1866   PetscCall(MatSeqAIJKokkosSolveCheck(A));
1867   PetscCall(VecGetKokkosView(bb, &b));
1868   PetscCall(VecGetKokkosViewWrite(xx, &x));
1869 
1870   // Solve L Y = B (i.e., L (U C^- x) = R b).  R b indicates applying the row permutation on b.
1871   if (row_identity) {
1872     B = PetscScalarKokkosView(const_cast<PetscScalar *>(b.data()), b.extent(0));
1873     Y = factors->workVector;
1874   } else {
1875     B = factors->workVector;
1876     PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { B(i) = b(rowperm(i)); }));
1877     Y = x;
1878   }
1879   PetscCallCXX(sptrsv_solve(exec, &factors->khL, factors->iL_d, factors->jL_d, factors->aL_d, B, Y));
1880 
1881   // Solve U C^- x = Y
1882   if (col_identity) {
1883     X = x;
1884   } else {
1885     X = factors->workVector;
1886   }
1887   PetscCallCXX(sptrsv_solve(exec, &factors->khU, factors->iU_d, factors->jU_d, factors->aU_d, Y, X));
1888 
1889   // x = C X; Reorder X with the inverse col permutation
1890   if (!col_identity) {
1891     PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { x(colperm(i)) = X(i); }));
1892   }
1893 
1894   PetscCall(VecRestoreKokkosView(bb, &b));
1895   PetscCall(VecRestoreKokkosViewWrite(xx, &x));
1896   PetscCall(PetscLogGpuTimeEnd());
1897   PetscFunctionReturn(PETSC_SUCCESS);
1898 }
1899 
1900 // Solve A^T x = b, with RAC = LU, where R and C are row and col permutation matrices on A respectively.
1901 // R and C are represented by rowperm and colperm in factors.
1902 // If R or C is identity (i.e, no reordering), then rowperm or colperm is empty.
1903 // A = R^-1 L U C^-1, so A^T = C^-T U^T L^T R^-T. But since C^- = C^T, R^- = R^T, we have A^T = C U^T L^T R.
1904 static PetscErrorCode MatSolveTranspose_SeqAIJKokkos_LU(Mat A, Vec bb, Vec xx)
1905 {
1906   auto                        exec    = PetscGetKokkosExecutionSpace();
1907   Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1908   PetscInt                    m       = A->rmap->n;
1909   PetscScalarKokkosView       X, Y, B; // alias
1910   ConstPetscScalarKokkosView  b;
1911   PetscScalarKokkosView       x;
1912   PetscIntKokkosView         &rowperm      = factors->rowperm;
1913   PetscIntKokkosView         &colperm      = factors->colperm;
1914   PetscBool                   row_identity = rowperm.extent(0) ? PETSC_FALSE : PETSC_TRUE;
1915   PetscBool                   col_identity = colperm.extent(0) ? PETSC_FALSE : PETSC_TRUE;
1916 
1917   PetscFunctionBegin;
1918   PetscCall(PetscLogGpuTimeBegin());
1919   PetscCall(MatSeqAIJKokkosTransposeSolveCheck(A)); // Update L^T, U^T if needed, and do sptrsv symbolic for L^T, U^T
1920   PetscCall(VecGetKokkosView(bb, &b));
1921   PetscCall(VecGetKokkosViewWrite(xx, &x));
1922 
1923   // Solve U^T Y = B (i.e., U^T (L^T R x) = C^- b).  Note C^- b = C^T b, which means applying the column permutation on b.
1924   if (col_identity) { // Reorder b with the col permutation
1925     B = PetscScalarKokkosView(const_cast<PetscScalar *>(b.data()), b.extent(0));
1926     Y = factors->workVector;
1927   } else {
1928     B = factors->workVector;
1929     PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { B(i) = b(colperm(i)); }));
1930     Y = x;
1931   }
1932   PetscCallCXX(sptrsv_solve(exec, &factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d, B, Y));
1933 
1934   // Solve L^T X = Y
1935   if (row_identity) {
1936     X = x;
1937   } else {
1938     X = factors->workVector;
1939   }
1940   PetscCallCXX(sptrsv_solve(exec, &factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d, Y, X));
1941 
1942   // x = R^- X = R^T X; Reorder X with the inverse row permutation
1943   if (!row_identity) {
1944     PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { x(rowperm(i)) = X(i); }));
1945   }
1946 
1947   PetscCall(VecRestoreKokkosView(bb, &b));
1948   PetscCall(VecRestoreKokkosViewWrite(xx, &x));
1949   PetscCall(PetscLogGpuTimeEnd());
1950   PetscFunctionReturn(PETSC_SUCCESS);
1951 }
1952 
1953 static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info)
1954 {
1955   PetscFunctionBegin;
1956   PetscCall(MatSeqAIJKokkosSyncHost(A));
1957   PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info));
1958 
1959   if (!info->solveonhost) { // if solve on host, then we don't need to copy L, U to device
1960     Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr;
1961     Mat_SeqAIJ                 *b       = static_cast<Mat_SeqAIJ *>(B->data);
1962     const PetscInt             *Bi = b->i, *Bj = b->j, *Bdiag = b->diag;
1963     const MatScalar            *Ba = b->a;
1964     PetscInt                    m = B->rmap->n, n = B->cmap->n;
1965 
1966     if (factors->iL_h.extent(0) == 0) { // Allocate memory and copy the L, U structure for the first time
1967       // Allocate memory and copy the structure
1968       factors->iL_h = MatRowMapKokkosViewHost(NoInit("iL_h"), m + 1);
1969       factors->jL_h = MatColIdxKokkosViewHost(NoInit("jL_h"), (Bi[m] - Bi[0]) + m); // + the diagonal entries
1970       factors->aL_h = MatScalarKokkosViewHost(NoInit("aL_h"), (Bi[m] - Bi[0]) + m);
1971       factors->iU_h = MatRowMapKokkosViewHost(NoInit("iU_h"), m + 1);
1972       factors->jU_h = MatColIdxKokkosViewHost(NoInit("jU_h"), (Bdiag[0] - Bdiag[m]));
1973       factors->aU_h = MatScalarKokkosViewHost(NoInit("aU_h"), (Bdiag[0] - Bdiag[m]));
1974 
1975       PetscInt *Li = factors->iL_h.data();
1976       PetscInt *Lj = factors->jL_h.data();
1977       PetscInt *Ui = factors->iU_h.data();
1978       PetscInt *Uj = factors->jU_h.data();
1979 
1980       Li[0] = Ui[0] = 0;
1981       for (PetscInt i = 0; i < m; i++) {
1982         PetscInt llen = Bi[i + 1] - Bi[i];       // exclusive of the diagonal entry
1983         PetscInt ulen = Bdiag[i] - Bdiag[i + 1]; // inclusive of the diagonal entry
1984 
1985         PetscArraycpy(Lj + Li[i], Bj + Bi[i], llen); // entries of L on the left of the diagonal
1986         Lj[Li[i] + llen] = i;                        // diagonal entry of L
1987 
1988         Uj[Ui[i]] = i;                                                  // diagonal entry of U
1989         PetscArraycpy(Uj + Ui[i] + 1, Bj + Bdiag[i + 1] + 1, ulen - 1); // entries of U on  the right of the diagonal
1990 
1991         Li[i + 1] = Li[i] + llen + 1;
1992         Ui[i + 1] = Ui[i] + ulen;
1993       }
1994 
1995       factors->iL_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->iL_h);
1996       factors->jL_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->jL_h);
1997       factors->iU_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->iU_h);
1998       factors->jU_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->jU_h);
1999       factors->aL_d = Kokkos::create_mirror_view(DefaultMemorySpace(), factors->aL_h);
2000       factors->aU_d = Kokkos::create_mirror_view(DefaultMemorySpace(), factors->aU_h);
2001 
2002       // Copy row/col permutation to device
2003       IS        rowperm = ((Mat_SeqAIJ *)B->data)->row;
2004       PetscBool row_identity;
2005       PetscCall(ISIdentity(rowperm, &row_identity));
2006       if (!row_identity) {
2007         const PetscInt *ip;
2008 
2009         PetscCall(ISGetIndices(rowperm, &ip));
2010         factors->rowperm = PetscIntKokkosView(NoInit("rowperm"), m);
2011         PetscCallCXX(Kokkos::deep_copy(factors->rowperm, PetscIntKokkosViewHost(const_cast<PetscInt *>(ip), m)));
2012         PetscCall(ISRestoreIndices(rowperm, &ip));
2013         PetscCall(PetscLogCpuToGpu(m * sizeof(PetscInt)));
2014       }
2015 
2016       IS        colperm = ((Mat_SeqAIJ *)B->data)->col;
2017       PetscBool col_identity;
2018       PetscCall(ISIdentity(colperm, &col_identity));
2019       if (!col_identity) {
2020         const PetscInt *ip;
2021 
2022         PetscCall(ISGetIndices(colperm, &ip));
2023         factors->colperm = PetscIntKokkosView(NoInit("colperm"), n);
2024         PetscCallCXX(Kokkos::deep_copy(factors->colperm, PetscIntKokkosViewHost(const_cast<PetscInt *>(ip), n)));
2025         PetscCall(ISRestoreIndices(colperm, &ip));
2026         PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
2027       }
2028 
2029       /* Create sptrsv handles for L, U and their transpose */
2030 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
2031       auto sptrsv_alg = SPTRSVAlgorithm::SPTRSV_CUSPARSE;
2032 #else
2033       auto sptrsv_alg = SPTRSVAlgorithm::SEQLVLSCHD_TP1;
2034 #endif
2035       factors->khL.create_sptrsv_handle(sptrsv_alg, m, true /* L is lower tri */);
2036       factors->khU.create_sptrsv_handle(sptrsv_alg, m, false /* U is not lower tri */);
2037       factors->khLt.create_sptrsv_handle(sptrsv_alg, m, false /* L^T is not lower tri */);
2038       factors->khUt.create_sptrsv_handle(sptrsv_alg, m, true /* U^T is lower tri */);
2039     }
2040 
2041     // Copy the value
2042     for (PetscInt i = 0; i < m; i++) {
2043       PetscInt        llen = Bi[i + 1] - Bi[i];
2044       PetscInt        ulen = Bdiag[i] - Bdiag[i + 1];
2045       const PetscInt *Li   = factors->iL_h.data();
2046       const PetscInt *Ui   = factors->iU_h.data();
2047 
2048       PetscScalar *La = factors->aL_h.data();
2049       PetscScalar *Ua = factors->aU_h.data();
2050 
2051       PetscArraycpy(La + Li[i], Ba + Bi[i], llen); // entries of L
2052       La[Li[i] + llen] = 1.0;                      // diagonal entry
2053 
2054       Ua[Ui[i]] = 1.0 / Ba[Bdiag[i]];                                 // diagonal entry
2055       PetscArraycpy(Ua + Ui[i] + 1, Ba + Bdiag[i + 1] + 1, ulen - 1); // entries of U
2056     }
2057 
2058     PetscCallCXX(Kokkos::deep_copy(factors->aL_d, factors->aL_h));
2059     PetscCallCXX(Kokkos::deep_copy(factors->aU_d, factors->aU_h));
2060     // Once the factors' values have changed, we need to update their transpose and redo sptrsv symbolic
2061     factors->transpose_updated         = PETSC_FALSE;
2062     factors->sptrsv_symbolic_completed = PETSC_FALSE;
2063 
2064     B->ops->solve          = MatSolve_SeqAIJKokkos_LU;
2065     B->ops->solvetranspose = MatSolveTranspose_SeqAIJKokkos_LU;
2066   }
2067 
2068   B->ops->matsolve          = NULL;
2069   B->ops->matsolvetranspose = NULL;
2070   PetscFunctionReturn(PETSC_SUCCESS);
2071 }
2072 
2073 static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos_ILU0(Mat B, Mat A, const MatFactorInfo *info)
2074 {
2075   Mat_SeqAIJKokkos           *aijkok   = (Mat_SeqAIJKokkos *)A->spptr;
2076   Mat_SeqAIJKokkosTriFactors *factors  = (Mat_SeqAIJKokkosTriFactors *)B->spptr;
2077   PetscInt                    fill_lev = info->levels;
2078 
2079   PetscFunctionBegin;
2080   PetscCall(PetscLogGpuTimeBegin());
2081   PetscCheck(!info->factoronhost, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "MatFactorInfo.factoronhost should be false");
2082   PetscCall(MatSeqAIJKokkosSyncDevice(A));
2083 
2084   auto a_d = aijkok->a_dual.view_device();
2085   auto i_d = aijkok->i_dual.view_device();
2086   auto j_d = aijkok->j_dual.view_device();
2087 
2088   PetscCallCXX(spiluk_numeric(&factors->kh, fill_lev, i_d, j_d, a_d, factors->iL_d, factors->jL_d, factors->aL_d, factors->iU_d, factors->jU_d, factors->aU_d));
2089 
2090   B->assembled              = PETSC_TRUE;
2091   B->preallocated           = PETSC_TRUE;
2092   B->ops->solve             = MatSolve_SeqAIJKokkos_LU;
2093   B->ops->solvetranspose    = MatSolveTranspose_SeqAIJKokkos_LU;
2094   B->ops->matsolve          = NULL;
2095   B->ops->matsolvetranspose = NULL;
2096 
2097   /* Once the factors' value changed, we need to update their transpose and sptrsv handle */
2098   factors->transpose_updated         = PETSC_FALSE;
2099   factors->sptrsv_symbolic_completed = PETSC_FALSE;
2100   /* TODO: log flops, but how to know that? */
2101   PetscCall(PetscLogGpuTimeEnd());
2102   PetscFunctionReturn(PETSC_SUCCESS);
2103 }
2104 
2105 // Use KK's spiluk_symbolic() to do ILU0 symbolic factorization, with no row/col reordering
2106 static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos_ILU0(Mat B, Mat A, IS, IS, const MatFactorInfo *info)
2107 {
2108   Mat_SeqAIJKokkos           *aijkok;
2109   Mat_SeqAIJ                 *b;
2110   Mat_SeqAIJKokkosTriFactors *factors  = (Mat_SeqAIJKokkosTriFactors *)B->spptr;
2111   PetscInt                    fill_lev = info->levels;
2112   PetscInt                    nnzA     = ((Mat_SeqAIJ *)A->data)->nz, nnzL, nnzU;
2113   PetscInt                    n        = A->rmap->n;
2114 
2115   PetscFunctionBegin;
2116   PetscCheck(!info->factoronhost, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "MatFactorInfo's factoronhost should be false as we are doing it on device right now");
2117   PetscCall(MatSeqAIJKokkosSyncDevice(A));
2118 
2119   /* Create a spiluk handle and then do symbolic factorization */
2120   nnzL = nnzU = PetscRealIntMultTruncate(info->fill, nnzA);
2121   factors->kh.create_spiluk_handle(SPILUKAlgorithm::SEQLVLSCHD_TP1, n, nnzL, nnzU);
2122 
2123   auto spiluk_handle = factors->kh.get_spiluk_handle();
2124 
2125   Kokkos::realloc(factors->iL_d, n + 1); /* Free old arrays and realloc */
2126   Kokkos::realloc(factors->jL_d, spiluk_handle->get_nnzL());
2127   Kokkos::realloc(factors->iU_d, n + 1);
2128   Kokkos::realloc(factors->jU_d, spiluk_handle->get_nnzU());
2129 
2130   aijkok   = (Mat_SeqAIJKokkos *)A->spptr;
2131   auto i_d = aijkok->i_dual.view_device();
2132   auto j_d = aijkok->j_dual.view_device();
2133   PetscCallCXX(spiluk_symbolic(&factors->kh, fill_lev, i_d, j_d, factors->iL_d, factors->jL_d, factors->iU_d, factors->jU_d));
2134   /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */
2135 
2136   Kokkos::resize(factors->jL_d, spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */
2137   Kokkos::resize(factors->jU_d, spiluk_handle->get_nnzU());
2138   Kokkos::realloc(factors->aL_d, spiluk_handle->get_nnzL()); /* No need to retain old value */
2139   Kokkos::realloc(factors->aU_d, spiluk_handle->get_nnzU());
2140 
2141   /* TODO: add options to select sptrsv algorithms */
2142   /* Create sptrsv handles for L, U and their transpose */
2143 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
2144   auto sptrsv_alg = SPTRSVAlgorithm::SPTRSV_CUSPARSE;
2145 #else
2146   auto sptrsv_alg = SPTRSVAlgorithm::SEQLVLSCHD_TP1;
2147 #endif
2148 
2149   factors->khL.create_sptrsv_handle(sptrsv_alg, n, true /* L is lower tri */);
2150   factors->khU.create_sptrsv_handle(sptrsv_alg, n, false /* U is not lower tri */);
2151   factors->khLt.create_sptrsv_handle(sptrsv_alg, n, false /* L^T is not lower tri */);
2152   factors->khUt.create_sptrsv_handle(sptrsv_alg, n, true /* U^T is lower tri */);
2153 
2154   /* Fill fields of the factor matrix B */
2155   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(B, MAT_SKIP_ALLOCATION, NULL));
2156   b     = (Mat_SeqAIJ *)B->data;
2157   b->nz = b->maxnz          = spiluk_handle->get_nnzL() + spiluk_handle->get_nnzU();
2158   B->info.fill_ratio_given  = info->fill;
2159   B->info.fill_ratio_needed = nnzA > 0 ? ((PetscReal)b->nz) / ((PetscReal)nnzA) : 1.0;
2160 
2161   B->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJKokkos_ILU0;
2162   PetscFunctionReturn(PETSC_SUCCESS);
2163 }
2164 
2165 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
2166 {
2167   PetscFunctionBegin;
2168   PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
2169   PetscCheck(!B->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr");
2170   PetscCallCXX(B->spptr = new Mat_SeqAIJKokkosTriFactors(B->rmap->n));
2171   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos;
2172   PetscFunctionReturn(PETSC_SUCCESS);
2173 }
2174 
2175 static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
2176 {
2177   PetscBool row_identity = PETSC_FALSE, col_identity = PETSC_FALSE;
2178 
2179   PetscFunctionBegin;
2180   if (!info->factoronhost) {
2181     PetscCall(ISIdentity(isrow, &row_identity));
2182     PetscCall(ISIdentity(iscol, &col_identity));
2183   }
2184 
2185   PetscCheck(!B->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr");
2186   PetscCallCXX(B->spptr = new Mat_SeqAIJKokkosTriFactors(B->rmap->n));
2187 
2188   if (!info->factoronhost && !info->levels && row_identity && col_identity) { // if level 0 and no reordering
2189     PetscCall(MatILUFactorSymbolic_SeqAIJKokkos_ILU0(B, A, isrow, iscol, info));
2190   } else {
2191     PetscCall(MatILUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info)); // otherwise, use PETSc's ILU on host
2192     B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos;
2193   }
2194   PetscFunctionReturn(PETSC_SUCCESS);
2195 }
2196 
2197 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info)
2198 {
2199   PetscFunctionBegin;
2200   PetscCall(MatSeqAIJKokkosSyncHost(A));
2201   PetscCall(MatCholeskyFactorNumeric_SeqAIJ(B, A, info));
2202 
2203   if (!info->solveonhost) { // if solve on host, then we don't need to copy L, U to device
2204     Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr;
2205     Mat_SeqAIJ                 *b       = static_cast<Mat_SeqAIJ *>(B->data);
2206     const PetscInt             *Bi = b->i, *Bj = b->j, *Bdiag = b->diag;
2207     const MatScalar            *Ba = b->a;
2208     PetscInt                    m  = B->rmap->n;
2209 
2210     if (factors->iU_h.extent(0) == 0) { // First time of numeric factorization
2211       // Allocate memory and copy the structure
2212       factors->iU_h = PetscIntKokkosViewHost(const_cast<PetscInt *>(Bi), m + 1); // wrap Bi as iU_h
2213       factors->jU_h = MatColIdxKokkosViewHost(NoInit("jU_h"), Bi[m]);
2214       factors->aU_h = MatScalarKokkosViewHost(NoInit("aU_h"), Bi[m]);
2215       factors->D_h  = MatScalarKokkosViewHost(NoInit("D_h"), m);
2216       factors->aU_d = Kokkos::create_mirror_view(DefaultMemorySpace(), factors->aU_h);
2217       factors->D_d  = Kokkos::create_mirror_view(DefaultMemorySpace(), factors->D_h);
2218 
2219       // Build jU_h from the skewed Aj
2220       PetscInt *Uj = factors->jU_h.data();
2221       for (PetscInt i = 0; i < m; i++) {
2222         PetscInt ulen = Bi[i + 1] - Bi[i];
2223         Uj[Bi[i]]     = i;                                              // diagonal entry
2224         PetscCall(PetscArraycpy(Uj + Bi[i] + 1, Bj + Bi[i], ulen - 1)); // entries of U on the right of the diagonal
2225       }
2226 
2227       // Copy iU, jU to device
2228       PetscCallCXX(factors->iU_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->iU_h));
2229       PetscCallCXX(factors->jU_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->jU_h));
2230 
2231       // Copy row/col permutation to device
2232       IS        rowperm = ((Mat_SeqAIJ *)B->data)->row;
2233       PetscBool row_identity;
2234       PetscCall(ISIdentity(rowperm, &row_identity));
2235       if (!row_identity) {
2236         const PetscInt *ip;
2237 
2238         PetscCall(ISGetIndices(rowperm, &ip));
2239         PetscCallCXX(factors->rowperm = PetscIntKokkosView(NoInit("rowperm"), m));
2240         PetscCallCXX(Kokkos::deep_copy(factors->rowperm, PetscIntKokkosViewHost(const_cast<PetscInt *>(ip), m)));
2241         PetscCall(ISRestoreIndices(rowperm, &ip));
2242         PetscCall(PetscLogCpuToGpu(m * sizeof(PetscInt)));
2243       }
2244 
2245       // Create sptrsv handles for U and U^T
2246 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
2247       auto sptrsv_alg = SPTRSVAlgorithm::SPTRSV_CUSPARSE;
2248 #else
2249       auto sptrsv_alg = SPTRSVAlgorithm::SEQLVLSCHD_TP1;
2250 #endif
2251       factors->khU.create_sptrsv_handle(sptrsv_alg, m, false /* U is not lower tri */);
2252       factors->khUt.create_sptrsv_handle(sptrsv_alg, m, true /* U^T is lower tri */);
2253     }
2254     // These pointers were set MatCholeskyFactorNumeric_SeqAIJ(), so we always need to update them
2255     B->ops->solve          = MatSolve_SeqAIJKokkos_Cholesky;
2256     B->ops->solvetranspose = MatSolve_SeqAIJKokkos_Cholesky;
2257 
2258     // Copy the value
2259     PetscScalar *Ua = factors->aU_h.data();
2260     PetscScalar *D  = factors->D_h.data();
2261     for (PetscInt i = 0; i < m; i++) {
2262       D[i]      = Ba[Bdiag[i]];     // actually Aa[Adiag[i]] is the inverse of the diagonal
2263       Ua[Bi[i]] = (PetscScalar)1.0; // set the unit diagonal for U
2264       for (PetscInt k = 0; k < Bi[i + 1] - Bi[i] - 1; k++) Ua[Bi[i] + 1 + k] = -Ba[Bi[i] + k];
2265     }
2266     PetscCallCXX(Kokkos::deep_copy(factors->aU_d, factors->aU_h));
2267     PetscCallCXX(Kokkos::deep_copy(factors->D_d, factors->D_h));
2268 
2269     factors->sptrsv_symbolic_completed = PETSC_FALSE; // When numeric value changed, we must do these again
2270     factors->transpose_updated         = PETSC_FALSE;
2271   }
2272 
2273   B->ops->matsolve          = NULL;
2274   B->ops->matsolvetranspose = NULL;
2275   PetscFunctionReturn(PETSC_SUCCESS);
2276 }
2277 
2278 static PetscErrorCode MatICCFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS perm, const MatFactorInfo *info)
2279 {
2280   PetscFunctionBegin;
2281   if (info->solveonhost) {
2282     // If solve on host, we have to change the type, as eventually we need to call MatSolve_SeqSBAIJ_1_NaturalOrdering() etc.
2283     PetscCall(MatSetType(B, MATSEQSBAIJ));
2284     PetscCall(MatSeqSBAIJSetPreallocation(B, 1, MAT_SKIP_ALLOCATION, NULL));
2285   }
2286 
2287   PetscCall(MatICCFactorSymbolic_SeqAIJ(B, A, perm, info));
2288 
2289   if (!info->solveonhost) {
2290     // If solve on device, B is still a MATSEQAIJKOKKOS, so we are good to allocate B->spptr
2291     PetscCheck(!B->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr");
2292     PetscCallCXX(B->spptr = new Mat_SeqAIJKokkosTriFactors(B->rmap->n));
2293     B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJKokkos;
2294   }
2295   PetscFunctionReturn(PETSC_SUCCESS);
2296 }
2297 
2298 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS perm, const MatFactorInfo *info)
2299 {
2300   PetscFunctionBegin;
2301   if (info->solveonhost) {
2302     // If solve on host, we have to change the type, as eventually we need to call MatSolve_SeqSBAIJ_1_NaturalOrdering() etc.
2303     PetscCall(MatSetType(B, MATSEQSBAIJ));
2304     PetscCall(MatSeqSBAIJSetPreallocation(B, 1, MAT_SKIP_ALLOCATION, NULL));
2305   }
2306 
2307   PetscCall(MatCholeskyFactorSymbolic_SeqAIJ(B, A, perm, info)); // it sets B's two ISes ((Mat_SeqAIJ*)B->data)->{row, col} to perm
2308 
2309   if (!info->solveonhost) {
2310     // If solve on device, B is still a MATSEQAIJKOKKOS, so we are good to allocate B->spptr
2311     PetscCheck(!B->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr");
2312     PetscCallCXX(B->spptr = new Mat_SeqAIJKokkosTriFactors(B->rmap->n));
2313     B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJKokkos;
2314   }
2315   PetscFunctionReturn(PETSC_SUCCESS);
2316 }
2317 
2318 // The _Kokkos suffix means we will use Kokkos as a solver for the SeqAIJKokkos matrix
2319 static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos_Kokkos(Mat A, MatSolverType *type)
2320 {
2321   PetscFunctionBegin;
2322   *type = MATSOLVERKOKKOS;
2323   PetscFunctionReturn(PETSC_SUCCESS);
2324 }
2325 
2326 /*MC
2327   MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices
2328   on a single GPU of type, `MATSEQAIJKOKKOS`, `MATAIJKOKKOS`.
2329 
2330   Level: beginner
2331 
2332 .seealso: [](ch_matrices), `Mat`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJKokkos()`, `MATAIJKOKKOS`, `MatKokkosSetFormat()`, `MatKokkosStorageFormat`, `MatKokkosFormatOperation`
2333 M*/
2334 PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A, MatFactorType ftype, Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */
2335 {
2336   PetscInt n = A->rmap->n;
2337   MPI_Comm comm;
2338 
2339   PetscFunctionBegin;
2340   PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
2341   PetscCall(MatCreate(comm, B));
2342   PetscCall(MatSetSizes(*B, n, n, n, n));
2343   PetscCall(MatSetBlockSizesFromMats(*B, A, A));
2344   (*B)->factortype = ftype;
2345   PetscCall(MatSetType(*B, MATSEQAIJKOKKOS));
2346   PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL));
2347   PetscCheck(!(*B)->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr");
2348 
2349   if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
2350     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJKokkos;
2351     (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos;
2352     PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU]));
2353     PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILU]));
2354     PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILUDT]));
2355   } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
2356     (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJKokkos;
2357     (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJKokkos;
2358     PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]));
2359     PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ICC]));
2360   } else SETERRQ(comm, PETSC_ERR_SUP, "MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]);
2361 
2362   // The factorization can use the ordering provided in MatLUFactorSymbolic(), MatCholeskyFactorSymbolic() etc, though we do it on host
2363   (*B)->canuseordering = PETSC_TRUE;
2364   PetscCall(PetscObjectComposeFunction((PetscObject)*B, "MatFactorGetSolverType_C", MatFactorGetSolverType_SeqAIJKokkos_Kokkos));
2365   PetscFunctionReturn(PETSC_SUCCESS);
2366 }
2367 
2368 PETSC_INTERN PetscErrorCode MatSolverTypeRegister_Kokkos(void)
2369 {
2370   PetscFunctionBegin;
2371   PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_LU, MatGetFactor_SeqAIJKokkos_Kokkos));
2372   PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_CHOLESKY, MatGetFactor_SeqAIJKokkos_Kokkos));
2373   PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_ILU, MatGetFactor_SeqAIJKokkos_Kokkos));
2374   PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_ICC, MatGetFactor_SeqAIJKokkos_Kokkos));
2375   PetscFunctionReturn(PETSC_SUCCESS);
2376 }
2377 
2378 /* Utility to print out a KokkosCsrMatrix for debugging */
2379 PETSC_INTERN PetscErrorCode PrintCsrMatrix(const KokkosCsrMatrix &csrmat)
2380 {
2381   const auto        &iv = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), csrmat.graph.row_map);
2382   const auto        &jv = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), csrmat.graph.entries);
2383   const auto        &av = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), csrmat.values);
2384   const PetscInt    *i  = iv.data();
2385   const PetscInt    *j  = jv.data();
2386   const PetscScalar *a  = av.data();
2387   PetscInt           m = csrmat.numRows(), n = csrmat.numCols(), nnz = csrmat.nnz();
2388 
2389   PetscFunctionBegin;
2390   PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT " x %" PetscInt_FMT " SeqAIJKokkos, with %" PetscInt_FMT " nonzeros\n", m, n, nnz));
2391   for (PetscInt k = 0; k < m; k++) {
2392     PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT ": ", k));
2393     for (PetscInt p = i[k]; p < i[k + 1]; p++) PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT "(%.1f), ", j[p], (double)PetscRealPart(a[p])));
2394     PetscCall(PetscPrintf(PETSC_COMM_SELF, "\n"));
2395   }
2396   PetscFunctionReturn(PETSC_SUCCESS);
2397 }
2398