xref: /petsc/src/mat/impls/aij/seq/kokkos/aijkok.kokkos.cxx (revision 07425a8d4172ec73b7b53c5ce4d6ba1b92fe45cf)
1 #include <petsc_kokkos.hpp>
2 #include <petscvec_kokkos.hpp>
3 #include <petscmat_kokkos.hpp>
4 #include <petscpkg_version.h>
5 #include <petsc/private/petscimpl.h>
6 #include <petsc/private/sfimpl.h>
7 #include <petsc/private/kokkosimpl.hpp>
8 #include <petscsys.h>
9 
10 #include <Kokkos_Core.hpp>
11 #include <KokkosBlas.hpp>
12 #include <KokkosSparse_CrsMatrix.hpp>
13 
14 // To suppress compiler warnings:
15 // /path/include/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp:434:63:
16 // warning: 'cusparseStatus_t cusparseDbsrmm(cusparseHandle_t, cusparseDirection_t, cusparseOperation_t,
17 // cusparseOperation_t, int, int, int, int, const double*, cusparseMatDescr_t, const double*, const int*, const int*,
18 // int, const double*, int, const double*, double*, int)' is deprecated: please use cusparseSpMM instead [-Wdeprecated-declarations]
19 #define DISABLE_CUSPARSE_DEPRECATED
20 #include <KokkosSparse_spmv.hpp>
21 
22 #include <KokkosSparse_spiluk.hpp>
23 #include <KokkosSparse_sptrsv.hpp>
24 #include <KokkosSparse_spgemm.hpp>
25 #include <KokkosSparse_spadd.hpp>
26 #include <KokkosBatched_LU_Decl.hpp>
27 #include <KokkosBatched_InverseLU_Decl.hpp>
28 
29 #include <../src/mat/impls/aij/seq/kokkos/aijkok.hpp>
30 
31 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_GE(3, 7, 0)
32   #include <KokkosSparse_Utils.hpp>
33 using KokkosSparse::sort_crs_matrix;
34 using KokkosSparse::Impl::transpose_matrix;
35 #else
36   #include <KokkosKernels_Sorting.hpp>
37 using KokkosKernels::sort_crs_matrix;
38 using KokkosKernels::Impl::transpose_matrix;
39 #endif
40 
41 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_GE(4, 6, 0)
42 using KokkosSparse::spiluk_symbolic;
43 using KokkosSparse::spiluk_numeric;
44 using KokkosSparse::sptrsv_symbolic;
45 using KokkosSparse::sptrsv_solve;
46 using KokkosSparse::Experimental::SPTRSVAlgorithm;
47 using KokkosSparse::Experimental::SPILUKAlgorithm;
48 #else
49 using KokkosSparse::Experimental::spiluk_symbolic;
50 using KokkosSparse::Experimental::spiluk_numeric;
51 using KokkosSparse::Experimental::sptrsv_symbolic;
52 using KokkosSparse::Experimental::sptrsv_solve;
53 using KokkosSparse::Experimental::SPTRSVAlgorithm;
54 using KokkosSparse::Experimental::SPILUKAlgorithm;
55 #endif
56 
57 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */
58 
59 /* MatAssemblyEnd_SeqAIJKokkos() happens when we finalized nonzeros of the matrix, either after
60    we assembled the matrix on host, or after we directly produced the matrix data on device (ex., through MatMatMult).
61    In the latter case, it is important to set a_dual's sync state correctly.
62  */
63 static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A, MatAssemblyType mode)
64 {
65   Mat_SeqAIJ       *aijseq;
66   Mat_SeqAIJKokkos *aijkok;
67 
68   PetscFunctionBegin;
69   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS);
70   PetscCall(MatAssemblyEnd_SeqAIJ(A, mode));
71 
72   aijseq = static_cast<Mat_SeqAIJ *>(A->data);
73   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
74 
75   /* If aijkok does not exist, we just copy i, j to device.
76      If aijkok already exists, but the device's nonzero pattern does not match with the host's, we assume the latest data is on host.
77      In both cases, we build a new aijkok structure.
78   */
79   if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { /* aijkok might not exist yet or nonzero pattern has changed */
80     if (aijkok && aijkok->host_aij_allocated_by_kokkos) {   /* Avoid accidentally freeing much needed a,i,j on host when deleting aijkok */
81       PetscCall(PetscShmgetAllocateArray(aijkok->nrows() + 1, sizeof(PetscInt), (void **)&aijseq->i));
82       PetscCall(PetscShmgetAllocateArray(aijkok->nnz(), sizeof(PetscInt), (void **)&aijseq->j));
83       PetscCall(PetscShmgetAllocateArray(aijkok->nnz(), sizeof(PetscInt), (void **)&aijseq->a));
84       PetscCall(PetscArraycpy(aijseq->i, aijkok->i_host_data(), aijkok->nrows() + 1));
85       PetscCall(PetscArraycpy(aijseq->j, aijkok->j_host_data(), aijkok->nnz()));
86       PetscCall(PetscArraycpy(aijseq->a, aijkok->a_host_data(), aijkok->nnz()));
87       aijseq->free_a  = PETSC_TRUE;
88       aijseq->free_ij = PETSC_TRUE;
89       /* This arises from MatCreateSeqAIJKokkosWithKokkosCsrMatrix() used in MatMatMult, where
90          we have the CsrMatrix on device first and then copy to host, followed by
91          MatSetMPIAIJWithSplitSeqAIJ() with garray = NULL.
92          One could improve it by not using NULL garray.
93       */
94     }
95     delete aijkok;
96     aijkok   = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aijseq, A->nonzerostate, PETSC_FALSE /*don't copy mat values to device*/);
97     A->spptr = aijkok;
98   } else if (A->rmap->n && aijkok->diag_dual.extent(0) == 0) { // MatProduct might directly produce AIJ on device, but not the diag.
99     MatRowMapKokkosViewHost diag_h(aijseq->diag, A->rmap->n);
100     auto                    diag_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), diag_h);
101     aijkok->diag_dual              = MatRowMapKokkosDualView(diag_d, diag_h);
102   }
103   PetscFunctionReturn(PETSC_SUCCESS);
104 }
105 
106 /* Sync CSR data to device if not yet */
107 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A)
108 {
109   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
110 
111   PetscFunctionBegin;
112   PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Can't sync factorized matrix from host to device");
113   PetscCheck(aijkok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
114   if (aijkok->a_dual.need_sync_device()) {
115     PetscCall(KokkosDualViewSyncDevice(aijkok->a_dual, PetscGetKokkosExecutionSpace()));
116     aijkok->transpose_updated = PETSC_FALSE; /* values of the transpose is out-of-date */
117     aijkok->hermitian_updated = PETSC_FALSE;
118   }
119   PetscFunctionReturn(PETSC_SUCCESS);
120 }
121 
122 /* Mark the CSR data on device as modified */
123 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosModifyDevice(Mat A)
124 {
125   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
126 
127   PetscFunctionBegin;
128   PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Not supported for factorized matries");
129   aijkok->a_dual.clear_sync_state();
130   aijkok->a_dual.modify_device();
131   aijkok->transpose_updated = PETSC_FALSE;
132   aijkok->hermitian_updated = PETSC_FALSE;
133   PetscCall(PetscObjectStateIncrease((PetscObject)A));
134   PetscFunctionReturn(PETSC_SUCCESS);
135 }
136 
137 static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A)
138 {
139   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
140   auto              exec   = PetscGetKokkosExecutionSpace();
141 
142   PetscFunctionBegin;
143   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
144   /* We do not expect one needs factors on host  */
145   PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Can't sync factorized matrix from device to host");
146   PetscCheck(aijkok, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing AIJKOK");
147   PetscCall(KokkosDualViewSyncHost(aijkok->a_dual, exec));
148   PetscFunctionReturn(PETSC_SUCCESS);
149 }
150 
151 static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A, PetscScalar *array[])
152 {
153   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
154 
155   PetscFunctionBegin;
156   /* aijkok contains valid pointers only if the host's nonzerostate matches with the device's.
157     Calling MatSeqAIJSetPreallocation() or MatSetValues() on host, where aijseq->{i,j,a} might be
158     reallocated, will lead to stale {i,j,a}_dual in aijkok. In both operations, the hosts's nonzerostate
159     must have been updated. The stale aijkok will be rebuilt during MatAssemblyEnd.
160   */
161   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
162     PetscCall(KokkosDualViewSyncHost(aijkok->a_dual, PetscGetKokkosExecutionSpace()));
163     *array = aijkok->a_dual.view_host().data();
164   } else { /* Happens when calling MatSetValues on a newly created matrix */
165     *array = static_cast<Mat_SeqAIJ *>(A->data)->a;
166   }
167   PetscFunctionReturn(PETSC_SUCCESS);
168 }
169 
170 static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJKokkos(Mat A, PetscScalar *array[])
171 {
172   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
173 
174   PetscFunctionBegin;
175   if (aijkok && A->nonzerostate == aijkok->nonzerostate) aijkok->a_dual.modify_host();
176   PetscFunctionReturn(PETSC_SUCCESS);
177 }
178 
179 static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[])
180 {
181   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
182 
183   PetscFunctionBegin;
184   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
185     PetscCall(KokkosDualViewSyncHost(aijkok->a_dual, PetscGetKokkosExecutionSpace()));
186     *array = aijkok->a_dual.view_host().data();
187   } else {
188     *array = static_cast<Mat_SeqAIJ *>(A->data)->a;
189   }
190   PetscFunctionReturn(PETSC_SUCCESS);
191 }
192 
193 static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[])
194 {
195   PetscFunctionBegin;
196   *array = NULL;
197   PetscFunctionReturn(PETSC_SUCCESS);
198 }
199 
200 static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[])
201 {
202   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
203 
204   PetscFunctionBegin;
205   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
206     *array = aijkok->a_dual.view_host().data();
207   } else { /* Ex. happens with MatZeroEntries on a preallocated but not assembled matrix */
208     *array = static_cast<Mat_SeqAIJ *>(A->data)->a;
209   }
210   PetscFunctionReturn(PETSC_SUCCESS);
211 }
212 
213 static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[])
214 {
215   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
216 
217   PetscFunctionBegin;
218   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
219     aijkok->a_dual.clear_sync_state();
220     aijkok->a_dual.modify_host();
221   }
222   PetscFunctionReturn(PETSC_SUCCESS);
223 }
224 
225 static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJKokkos(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype)
226 {
227   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
228 
229   PetscFunctionBegin;
230   PetscCheck(aijkok != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "aijkok is NULL");
231 
232   if (i) *i = aijkok->i_device_data();
233   if (j) *j = aijkok->j_device_data();
234   if (a) {
235     PetscCall(KokkosDualViewSyncDevice(aijkok->a_dual, PetscGetKokkosExecutionSpace()));
236     *a = aijkok->a_device_data();
237   }
238   if (mtype) *mtype = PETSC_MEMTYPE_KOKKOS;
239   PetscFunctionReturn(PETSC_SUCCESS);
240 }
241 
242 static PetscErrorCode MatGetCurrentMemType_SeqAIJKokkos(PETSC_UNUSED Mat A, PetscMemType *mtype)
243 {
244   PetscFunctionBegin;
245   *mtype = PETSC_MEMTYPE_KOKKOS;
246   PetscFunctionReturn(PETSC_SUCCESS);
247 }
248 
249 /*
250   Generate the sparsity pattern of a MatSeqAIJKokkos matrix's transpose on device.
251 
252   Input Parameter:
253 .  A       - the MATSEQAIJKOKKOS matrix
254 
255   Output Parameters:
256 +  perm_d - the permutation array on device, which connects Ta(i) = Aa(perm(i))
257 -  T_d    - the transpose on device, whose value array is allocated but not initialized
258 */
259 static PetscErrorCode MatSeqAIJKokkosGenerateTransposeStructure(Mat A, MatRowMapKokkosView &perm_d, KokkosCsrMatrix &T_d)
260 {
261   Mat_SeqAIJ             *aseq = static_cast<Mat_SeqAIJ *>(A->data);
262   PetscInt                nz = aseq->nz, m = A->rmap->N, n = A->cmap->n;
263   const PetscInt         *Ai = aseq->i, *Aj = aseq->j;
264   MatRowMapKokkosViewHost Ti_h(NoInit("Ti"), n + 1);
265   MatRowMapType          *Ti = Ti_h.data();
266   MatColIdxKokkosViewHost Tj_h(NoInit("Tj"), nz);
267   MatRowMapKokkosViewHost perm_h(NoInit("permutation"), nz);
268   PetscInt               *Tj   = Tj_h.data();
269   PetscInt               *perm = perm_h.data();
270   PetscInt               *offset;
271 
272   PetscFunctionBegin;
273   // Populate Ti
274   PetscCallCXX(Kokkos::deep_copy(Ti_h, 0));
275   Ti++;
276   for (PetscInt i = 0; i < nz; i++) Ti[Aj[i]]++;
277   Ti--;
278   for (PetscInt i = 0; i < n; i++) Ti[i + 1] += Ti[i];
279 
280   // Populate Tj and the permutation array
281   PetscCall(PetscCalloc1(n, &offset)); // offset in each T row to fill in its column indices
282   for (PetscInt i = 0; i < m; i++) {
283     for (PetscInt j = Ai[i]; j < Ai[i + 1]; j++) { // A's (i,j) is T's (j,i)
284       PetscInt r    = Aj[j];                       // row r of T
285       PetscInt disp = Ti[r] + offset[r];
286 
287       Tj[disp]   = i; // col i of T
288       perm[disp] = j;
289       offset[r]++;
290     }
291   }
292   PetscCall(PetscFree(offset));
293 
294   // Sort each row of T, along with the permutation array
295   for (PetscInt i = 0; i < n; i++) PetscCall(PetscSortIntWithArray(Ti[i + 1] - Ti[i], Tj + Ti[i], perm + Ti[i]));
296 
297   // Output perm and T on device
298   auto Ti_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), Ti_h);
299   auto Tj_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), Tj_h);
300   PetscCallCXX(T_d = KokkosCsrMatrix("csrmatT", n, m, nz, MatScalarKokkosView("Ta", nz), Ti_d, Tj_d));
301   PetscCallCXX(perm_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), perm_h));
302   PetscFunctionReturn(PETSC_SUCCESS);
303 }
304 
305 // Generate the transpose on device and cache it internally
306 // Note: KK transpose_matrix() does not have support symbolic/numeric transpose, so we do it on our own
307 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGenerateTranspose_Private(Mat A, KokkosCsrMatrix *csrmatT)
308 {
309   Mat_SeqAIJ       *aseq = static_cast<Mat_SeqAIJ *>(A->data);
310   Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
311   PetscInt          nz = aseq->nz, m = A->rmap->N, n = A->cmap->n;
312   KokkosCsrMatrix  &T = akok->csrmatT;
313 
314   PetscFunctionBegin;
315   PetscCheck(akok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
316   PetscCall(KokkosDualViewSyncDevice(akok->a_dual, PetscGetKokkosExecutionSpace())); // Sync A's values since we are going to access them on device
317 
318   const auto &Aa = akok->a_dual.view_device();
319 
320   if (A->symmetric == PETSC_BOOL3_TRUE) {
321     *csrmatT = akok->csrmat;
322   } else {
323     // See if we already have a cached transpose and its value is up to date
324     if (T.numRows() == n && T.numCols() == m) {  // this indicates csrmatT had been generated before, otherwise T has 0 rows/cols after construction
325       if (!akok->transpose_updated) {            // if the value is out of date, update the cached version
326         const auto &perm = akok->transpose_perm; // get the permutation array
327         auto       &Ta   = T.values;
328 
329         PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { Ta(i) = Aa(perm(i)); }));
330       }
331     } else { // Generate T of size n x m for the first time
332       MatRowMapKokkosView perm;
333 
334       PetscCall(MatSeqAIJKokkosGenerateTransposeStructure(A, perm, T));
335       akok->transpose_perm = perm; // cache the perm in this matrix for reuse
336       PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { T.values(i) = Aa(perm(i)); }));
337     }
338     akok->transpose_updated = PETSC_TRUE;
339     *csrmatT                = akok->csrmatT;
340   }
341   PetscFunctionReturn(PETSC_SUCCESS);
342 }
343 
344 // Generate the Hermitian on device and cache it internally
345 static PetscErrorCode MatSeqAIJKokkosGenerateHermitian_Private(Mat A, KokkosCsrMatrix *csrmatH)
346 {
347   Mat_SeqAIJ       *aseq = static_cast<Mat_SeqAIJ *>(A->data);
348   Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
349   PetscInt          nz = aseq->nz, m = A->rmap->N, n = A->cmap->n;
350   KokkosCsrMatrix  &T = akok->csrmatH;
351 
352   PetscFunctionBegin;
353   PetscCheck(akok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
354   PetscCall(KokkosDualViewSyncDevice(akok->a_dual, PetscGetKokkosExecutionSpace())); // Sync A's values since we are going to access them on device
355 
356   const auto &Aa = akok->a_dual.view_device();
357 
358   if (A->hermitian == PETSC_BOOL3_TRUE) {
359     *csrmatH = akok->csrmat;
360   } else {
361     // See if we already have a cached hermitian and its value is up to date
362     if (T.numRows() == n && T.numCols() == m) {  // this indicates csrmatT had been generated before, otherwise T has 0 rows/cols after construction
363       if (!akok->hermitian_updated) {            // if the value is out of date, update the cached version
364         const auto &perm = akok->transpose_perm; // get the permutation array
365         auto       &Ta   = T.values;
366 
367         PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { Ta(i) = PetscConj(Aa(perm(i))); }));
368       }
369     } else { // Generate T of size n x m for the first time
370       MatRowMapKokkosView perm;
371 
372       PetscCall(MatSeqAIJKokkosGenerateTransposeStructure(A, perm, T));
373       akok->transpose_perm = perm; // cache the perm in this matrix for reuse
374       PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { T.values(i) = PetscConj(Aa(perm(i))); }));
375     }
376     akok->hermitian_updated = PETSC_TRUE;
377     *csrmatH                = akok->csrmatH;
378   }
379   PetscFunctionReturn(PETSC_SUCCESS);
380 }
381 
382 /* y = A x */
383 static PetscErrorCode MatMult_SeqAIJKokkos(Mat A, Vec xx, Vec yy)
384 {
385   Mat_SeqAIJKokkos          *aijkok;
386   ConstPetscScalarKokkosView xv;
387   PetscScalarKokkosView      yv;
388 
389   PetscFunctionBegin;
390   PetscCall(PetscLogGpuTimeBegin());
391   PetscCall(MatSeqAIJKokkosSyncDevice(A));
392   PetscCall(VecGetKokkosView(xx, &xv));
393   PetscCall(VecGetKokkosViewWrite(yy, &yv));
394   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
395   PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), "N", 1.0 /*alpha*/, aijkok->csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A x + beta y */
396   PetscCall(VecRestoreKokkosView(xx, &xv));
397   PetscCall(VecRestoreKokkosViewWrite(yy, &yv));
398   /* 2.0*nnz - numRows seems more accurate here but assumes there are no zero-rows. So a little sloppy here. */
399   PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz()));
400   PetscCall(PetscLogGpuTimeEnd());
401   PetscFunctionReturn(PETSC_SUCCESS);
402 }
403 
404 /* y = A^T x */
405 static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy)
406 {
407   Mat_SeqAIJKokkos          *aijkok;
408   const char                *mode;
409   ConstPetscScalarKokkosView xv;
410   PetscScalarKokkosView      yv;
411   KokkosCsrMatrix            csrmat;
412 
413   PetscFunctionBegin;
414   PetscCall(PetscLogGpuTimeBegin());
415   PetscCall(MatSeqAIJKokkosSyncDevice(A));
416   PetscCall(VecGetKokkosView(xx, &xv));
417   PetscCall(VecGetKokkosViewWrite(yy, &yv));
418   if (A->form_explicit_transpose) {
419     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat));
420     mode = "N";
421   } else {
422     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
423     csrmat = aijkok->csrmat;
424     mode   = "T";
425   }
426   PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^T x + beta y */
427   PetscCall(VecRestoreKokkosView(xx, &xv));
428   PetscCall(VecRestoreKokkosViewWrite(yy, &yv));
429   PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
430   PetscCall(PetscLogGpuTimeEnd());
431   PetscFunctionReturn(PETSC_SUCCESS);
432 }
433 
434 /* y = A^H x */
435 static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy)
436 {
437   Mat_SeqAIJKokkos          *aijkok;
438   const char                *mode;
439   ConstPetscScalarKokkosView xv;
440   PetscScalarKokkosView      yv;
441   KokkosCsrMatrix            csrmat;
442 
443   PetscFunctionBegin;
444   PetscCall(PetscLogGpuTimeBegin());
445   PetscCall(MatSeqAIJKokkosSyncDevice(A));
446   PetscCall(VecGetKokkosView(xx, &xv));
447   PetscCall(VecGetKokkosViewWrite(yy, &yv));
448   if (A->form_explicit_transpose) {
449     PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat));
450     mode = "N";
451   } else {
452     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
453     csrmat = aijkok->csrmat;
454     mode   = "C";
455   }
456   PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^H x + beta y */
457   PetscCall(VecRestoreKokkosView(xx, &xv));
458   PetscCall(VecRestoreKokkosViewWrite(yy, &yv));
459   PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
460   PetscCall(PetscLogGpuTimeEnd());
461   PetscFunctionReturn(PETSC_SUCCESS);
462 }
463 
464 /* z = A x + y */
465 static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz)
466 {
467   Mat_SeqAIJKokkos          *aijkok;
468   ConstPetscScalarKokkosView xv;
469   PetscScalarKokkosView      zv;
470 
471   PetscFunctionBegin;
472   PetscCall(PetscLogGpuTimeBegin());
473   PetscCall(MatSeqAIJKokkosSyncDevice(A));
474   if (zz != yy) PetscCall(VecCopy(yy, zz)); // depending on yy's sync flags, zz might get its latest data on host
475   PetscCall(VecGetKokkosView(xx, &xv));
476   PetscCall(VecGetKokkosView(zz, &zv)); // do after VecCopy(yy, zz) to get the latest data on device
477   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
478   PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), "N", 1.0 /*alpha*/, aijkok->csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A x + beta z */
479   PetscCall(VecRestoreKokkosView(xx, &xv));
480   PetscCall(VecRestoreKokkosView(zz, &zv));
481   PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz()));
482   PetscCall(PetscLogGpuTimeEnd());
483   PetscFunctionReturn(PETSC_SUCCESS);
484 }
485 
486 /* z = A^T x + y */
487 static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz)
488 {
489   Mat_SeqAIJKokkos          *aijkok;
490   const char                *mode;
491   ConstPetscScalarKokkosView xv;
492   PetscScalarKokkosView      zv;
493   KokkosCsrMatrix            csrmat;
494 
495   PetscFunctionBegin;
496   PetscCall(PetscLogGpuTimeBegin());
497   PetscCall(MatSeqAIJKokkosSyncDevice(A));
498   if (zz != yy) PetscCall(VecCopy(yy, zz));
499   PetscCall(VecGetKokkosView(xx, &xv));
500   PetscCall(VecGetKokkosView(zz, &zv));
501   if (A->form_explicit_transpose) {
502     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat));
503     mode = "N";
504   } else {
505     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
506     csrmat = aijkok->csrmat;
507     mode   = "T";
508   }
509   PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^T x + beta z */
510   PetscCall(VecRestoreKokkosView(xx, &xv));
511   PetscCall(VecRestoreKokkosView(zz, &zv));
512   PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
513   PetscCall(PetscLogGpuTimeEnd());
514   PetscFunctionReturn(PETSC_SUCCESS);
515 }
516 
517 /* z = A^H x + y */
518 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz)
519 {
520   Mat_SeqAIJKokkos          *aijkok;
521   const char                *mode;
522   ConstPetscScalarKokkosView xv;
523   PetscScalarKokkosView      zv;
524   KokkosCsrMatrix            csrmat;
525 
526   PetscFunctionBegin;
527   PetscCall(PetscLogGpuTimeBegin());
528   PetscCall(MatSeqAIJKokkosSyncDevice(A));
529   if (zz != yy) PetscCall(VecCopy(yy, zz));
530   PetscCall(VecGetKokkosView(xx, &xv));
531   PetscCall(VecGetKokkosView(zz, &zv));
532   if (A->form_explicit_transpose) {
533     PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat));
534     mode = "N";
535   } else {
536     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
537     csrmat = aijkok->csrmat;
538     mode   = "C";
539   }
540   PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^H x + beta z */
541   PetscCall(VecRestoreKokkosView(xx, &xv));
542   PetscCall(VecRestoreKokkosView(zz, &zv));
543   PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
544   PetscCall(PetscLogGpuTimeEnd());
545   PetscFunctionReturn(PETSC_SUCCESS);
546 }
547 
548 static PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A, MatOption op, PetscBool flg)
549 {
550   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
551 
552   PetscFunctionBegin;
553   switch (op) {
554   case MAT_FORM_EXPLICIT_TRANSPOSE:
555     /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
556     if (A->form_explicit_transpose && !flg && aijkok) PetscCall(aijkok->DestroyMatTranspose());
557     A->form_explicit_transpose = flg;
558     break;
559   default:
560     PetscCall(MatSetOption_SeqAIJ(A, op, flg));
561     break;
562   }
563   PetscFunctionReturn(PETSC_SUCCESS);
564 }
565 
566 /* Depending on reuse, either build a new mat, or use the existing mat */
567 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat *newmat)
568 {
569   Mat_SeqAIJ *aseq;
570 
571   PetscFunctionBegin;
572   PetscCall(PetscKokkosInitializeCheck());
573   if (reuse == MAT_INITIAL_MATRIX) { /* Build a brand new mat */
574     PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat));
575     PetscCall(MatSetType(*newmat, mtype));
576   } else if (reuse == MAT_REUSE_MATRIX) {                 /* Reuse the mat created before */
577     PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN)); /* newmat is already a SeqAIJKokkos */
578   } else if (reuse == MAT_INPLACE_MATRIX) {               /* newmat is A */
579     PetscCheck(A == *newmat, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "A != *newmat with MAT_INPLACE_MATRIX");
580     PetscCall(PetscFree(A->defaultvectype));
581     PetscCall(PetscStrallocpy(VECKOKKOS, &A->defaultvectype)); /* Allocate and copy the string */
582     PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATSEQAIJKOKKOS));
583     PetscCall(MatSetOps_SeqAIJKokkos(A));
584     aseq = static_cast<Mat_SeqAIJ *>(A->data);
585     if (A->assembled) { /* Copy i, j (but not values) to device for an assembled matrix if not yet */
586       PetscCheck(!A->spptr, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Expect NULL (Mat_SeqAIJKokkos*)A->spptr");
587       A->spptr = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aseq, A->nonzerostate, PETSC_FALSE);
588     }
589   }
590   PetscFunctionReturn(PETSC_SUCCESS);
591 }
592 
593 /* MatDuplicate always creates a new matrix. MatDuplicate can be called either on an assembled matrix or
594    an unassembled matrix, even though MAT_COPY_VALUES is not allowed for unassembled matrix.
595  */
596 static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A, MatDuplicateOption dupOption, Mat *B)
597 {
598   Mat_SeqAIJ       *bseq;
599   Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr), *bkok;
600   Mat               mat;
601 
602   PetscFunctionBegin;
603   /* Do not copy values on host as A's latest values might be on device. We don't want to do sync blindly */
604   PetscCall(MatDuplicate_SeqAIJ(A, MAT_DO_NOT_COPY_VALUES, B));
605   mat = *B;
606   if (A->assembled) {
607     bseq = static_cast<Mat_SeqAIJ *>(mat->data);
608     bkok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, bseq, mat->nonzerostate, PETSC_FALSE);
609     bkok->a_dual.clear_sync_state(); /* Clear B's sync state as it will be decided below */
610     /* Now copy values to B if needed */
611     if (dupOption == MAT_COPY_VALUES) {
612       if (akok->a_dual.need_sync_device()) {
613         Kokkos::deep_copy(bkok->a_dual.view_host(), akok->a_dual.view_host());
614         bkok->a_dual.modify_host();
615       } else { /* If device has the latest data, we only copy data on device */
616         Kokkos::deep_copy(bkok->a_dual.view_device(), akok->a_dual.view_device());
617         bkok->a_dual.modify_device();
618       }
619     } else { /* MAT_DO_NOT_COPY_VALUES or MAT_SHARE_NONZERO_PATTERN. B's values should be zeroed */
620       /* B's values on host should be already zeroed by MatDuplicate_SeqAIJ() */
621       bkok->a_dual.modify_host();
622     }
623     mat->spptr = bkok;
624   }
625 
626   PetscCall(PetscFree(mat->defaultvectype));
627   PetscCall(PetscStrallocpy(VECKOKKOS, &mat->defaultvectype)); /* Allocate and copy the string */
628   PetscCall(PetscObjectChangeTypeName((PetscObject)mat, MATSEQAIJKOKKOS));
629   PetscCall(MatSetOps_SeqAIJKokkos(mat));
630   PetscFunctionReturn(PETSC_SUCCESS);
631 }
632 
633 static PetscErrorCode MatTranspose_SeqAIJKokkos(Mat A, MatReuse reuse, Mat *B)
634 {
635   Mat               At;
636   KokkosCsrMatrix   internT;
637   Mat_SeqAIJKokkos *atkok, *bkok;
638 
639   PetscFunctionBegin;
640   if (reuse == MAT_REUSE_MATRIX) PetscCall(MatTransposeCheckNonzeroState_Private(A, *B));
641   PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &internT)); /* Generate a transpose internally */
642   if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_INPLACE_MATRIX) {
643     /* Deep copy internT, as we want to isolate the internal transpose */
644     PetscCallCXX(atkok = new Mat_SeqAIJKokkos(KokkosCsrMatrix("csrmat", internT)));
645     PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PetscObjectComm((PetscObject)A), atkok, &At));
646     if (reuse == MAT_INITIAL_MATRIX) *B = At;
647     else PetscCall(MatHeaderReplace(A, &At)); /* Replace A with At inplace */
648   } else {                                    /* MAT_REUSE_MATRIX, just need to copy values to B on device */
649     if ((*B)->assembled) {
650       bkok = static_cast<Mat_SeqAIJKokkos *>((*B)->spptr);
651       PetscCallCXX(Kokkos::deep_copy(bkok->a_dual.view_device(), internT.values));
652       PetscCall(MatSeqAIJKokkosModifyDevice(*B));
653     } else if ((*B)->preallocated) { /* It is ok for B to be only preallocated, as needed in MatTranspose_MPIAIJ */
654       Mat_SeqAIJ             *bseq = static_cast<Mat_SeqAIJ *>((*B)->data);
655       MatScalarKokkosViewHost a_h(bseq->a, internT.nnz()); /* bseq->nz = 0 if unassembled */
656       MatColIdxKokkosViewHost j_h(bseq->j, internT.nnz());
657       PetscCallCXX(Kokkos::deep_copy(a_h, internT.values));
658       PetscCallCXX(Kokkos::deep_copy(j_h, internT.graph.entries));
659     } else SETERRQ(PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "B must be assembled or preallocated");
660   }
661   PetscFunctionReturn(PETSC_SUCCESS);
662 }
663 
664 static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A)
665 {
666   Mat_SeqAIJKokkos *aijkok;
667 
668   PetscFunctionBegin;
669   if (A->factortype == MAT_FACTOR_NONE) {
670     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
671     delete aijkok;
672   } else {
673     delete static_cast<Mat_SeqAIJKokkosTriFactors *>(A->spptr);
674   }
675   A->spptr = NULL;
676   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatFactorGetSolverType_C", NULL));
677   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
678   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
679 #if defined(PETSC_HAVE_HYPRE)
680   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijkokkos_hypre_C", NULL));
681 #endif
682   PetscCall(MatDestroy_SeqAIJ(A));
683   PetscFunctionReturn(PETSC_SUCCESS);
684 }
685 
686 /*MC
687    MATSEQAIJKOKKOS - MATAIJKOKKOS = "(seq)aijkokkos" - A matrix type to be used for sparse matrices with Kokkos
688 
689    A matrix type using Kokkos-Kernels CrsMatrix type for portability across different device types
690 
691    Options Database Key:
692 .  -mat_type aijkokkos - sets the matrix type to `MATSEQAIJKOKKOS` during a call to `MatSetFromOptions()`
693 
694   Level: beginner
695 
696 .seealso: [](ch_matrices), `Mat`, `MatCreateSeqAIJKokkos()`, `MATMPIAIJKOKKOS`
697 M*/
698 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A)
699 {
700   PetscFunctionBegin;
701   PetscCall(PetscKokkosInitializeCheck());
702   PetscCall(MatCreate_SeqAIJ(A));
703   PetscCall(MatConvert_SeqAIJ_SeqAIJKokkos(A, MATSEQAIJKOKKOS, MAT_INPLACE_MATRIX, &A));
704   PetscFunctionReturn(PETSC_SUCCESS);
705 }
706 
707 /* Merge A, B into a matrix C. A is put before B. C's size would be A->rmap->n by (A->cmap->n + B->cmap->n) */
708 PetscErrorCode MatSeqAIJKokkosMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C)
709 {
710   Mat_SeqAIJ         *a, *b;
711   Mat_SeqAIJKokkos   *akok, *bkok, *ckok;
712   MatScalarKokkosView aa, ba, ca;
713   MatRowMapKokkosView ai, bi, ci;
714   MatColIdxKokkosView aj, bj, cj;
715   PetscInt            m, n, nnz, aN;
716 
717   PetscFunctionBegin;
718   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
719   PetscValidHeaderSpecific(B, MAT_CLASSID, 2);
720   PetscAssertPointer(C, 4);
721   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
722   PetscCheckTypeName(B, MATSEQAIJKOKKOS);
723   PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n);
724   PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported");
725 
726   PetscCall(MatSeqAIJKokkosSyncDevice(A));
727   PetscCall(MatSeqAIJKokkosSyncDevice(B));
728   a    = static_cast<Mat_SeqAIJ *>(A->data);
729   b    = static_cast<Mat_SeqAIJ *>(B->data);
730   akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
731   bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
732   aa   = akok->a_dual.view_device();
733   ai   = akok->i_dual.view_device();
734   ba   = bkok->a_dual.view_device();
735   bi   = bkok->i_dual.view_device();
736   m    = A->rmap->n; /* M, N and nnz of C */
737   n    = A->cmap->n + B->cmap->n;
738   nnz  = a->nz + b->nz;
739   aN   = A->cmap->n; /* N of A */
740   if (reuse == MAT_INITIAL_MATRIX) {
741     aj      = akok->j_dual.view_device();
742     bj      = bkok->j_dual.view_device();
743     auto ca = MatScalarKokkosView("a", aa.extent(0) + ba.extent(0));
744     auto ci = MatRowMapKokkosView("i", ai.extent(0));
745     auto cj = MatColIdxKokkosView("j", aj.extent(0) + bj.extent(0));
746 
747     /* Concatenate A and B in parallel using Kokkos hierarchical parallelism */
748     Kokkos::parallel_for(
749       Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
750         PetscInt i       = t.league_rank(); /* row i */
751         PetscInt coffset = ai(i) + bi(i), alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i);
752 
753         Kokkos::single(Kokkos::PerTeam(t), [=]() { /* this side effect only happens once per whole team */
754                                                    ci(i) = coffset;
755                                                    if (i == m - 1) ci(m) = ai(m) + bi(m);
756         });
757 
758         Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) {
759           if (k < alen) {
760             ca(coffset + k) = aa(ai(i) + k);
761             cj(coffset + k) = aj(ai(i) + k);
762           } else {
763             ca(coffset + k) = ba(bi(i) + k - alen);
764             cj(coffset + k) = bj(bi(i) + k - alen) + aN; /* Entries in B get new column indices in C */
765           }
766         });
767       });
768     PetscCallCXX(ckok = new Mat_SeqAIJKokkos(m, n, nnz, ci, cj, ca));
769     PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, ckok, C));
770   } else if (reuse == MAT_REUSE_MATRIX) {
771     PetscValidHeaderSpecific(*C, MAT_CLASSID, 4);
772     PetscCheckTypeName(*C, MATSEQAIJKOKKOS);
773     ckok = static_cast<Mat_SeqAIJKokkos *>((*C)->spptr);
774     ca   = ckok->a_dual.view_device();
775     ci   = ckok->i_dual.view_device();
776 
777     Kokkos::parallel_for(
778       Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
779         PetscInt i    = t.league_rank(); /* row i */
780         PetscInt alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i);
781         Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) {
782           if (k < alen) ca(ci(i) + k) = aa(ai(i) + k);
783           else ca(ci(i) + k) = ba(bi(i) + k - alen);
784         });
785       });
786     PetscCall(MatSeqAIJKokkosModifyDevice(*C));
787   }
788   PetscFunctionReturn(PETSC_SUCCESS);
789 }
790 
791 static PetscErrorCode MatProductCtxDestroy_SeqAIJKokkos(void **pdata)
792 {
793   PetscFunctionBegin;
794   delete *reinterpret_cast<MatProductCtx_SeqAIJKokkos **>(pdata);
795   PetscFunctionReturn(PETSC_SUCCESS);
796 }
797 
798 static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C)
799 {
800   Mat_Product                *product = C->product;
801   Mat                         A, B;
802   bool                        transA, transB; /* use bool, since KK needs this type */
803   Mat_SeqAIJKokkos           *akok, *bkok, *ckok;
804   Mat_SeqAIJ                 *c;
805   MatProductCtx_SeqAIJKokkos *pdata;
806   KokkosCsrMatrix             csrmatA, csrmatB;
807 
808   PetscFunctionBegin;
809   MatCheckProduct(C, 1);
810   PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Product data empty");
811   pdata = static_cast<MatProductCtx_SeqAIJKokkos *>(C->product->data);
812 
813   // See if numeric has already been done in symbolic (e.g., user calls MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C)).
814   // If yes, skip the numeric, but reset the flag so that next time when user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C),
815   // we still do numeric.
816   if (pdata->reusesym) { // numeric reuses results from symbolic
817     pdata->reusesym = PETSC_FALSE;
818     PetscFunctionReturn(PETSC_SUCCESS);
819   }
820 
821   switch (product->type) {
822   case MATPRODUCT_AB:
823     transA = false;
824     transB = false;
825     break;
826   case MATPRODUCT_AtB:
827     transA = true;
828     transB = false;
829     break;
830   case MATPRODUCT_ABt:
831     transA = false;
832     transB = true;
833     break;
834   default:
835     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]);
836   }
837 
838   A = product->A;
839   B = product->B;
840   PetscCall(MatSeqAIJKokkosSyncDevice(A));
841   PetscCall(MatSeqAIJKokkosSyncDevice(B));
842   akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
843   bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
844   ckok = static_cast<Mat_SeqAIJKokkos *>(C->spptr);
845 
846   PetscCheck(ckok, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Device data structure spptr is empty");
847 
848   csrmatA = akok->csrmat;
849   csrmatB = bkok->csrmat;
850 
851   /* TODO: Once KK spgemm implements transpose, we can get rid of the explicit transpose here */
852   if (transA) {
853     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA));
854     transA = false;
855   }
856 
857   if (transB) {
858     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB));
859     transB = false;
860   }
861   PetscCall(PetscLogGpuTimeBegin());
862   PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, csrmatA, transA, csrmatB, transB, ckok->csrmat));
863 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(4, 0, 0)
864   auto spgemmHandle = pdata->kh.get_spgemm_handle();
865   if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(ckok->csrmat)); /* without sort, mat_tests-ex62_14_seqaijkokkos fails */
866 #endif
867 
868   PetscCall(PetscLogGpuTimeEnd());
869   PetscCall(MatSeqAIJKokkosModifyDevice(C));
870   /* shorter version of MatAssemblyEnd_SeqAIJ */
871   c = (Mat_SeqAIJ *)C->data;
872   PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz));
873   PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n"));
874   PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax));
875   c->reallocs         = 0;
876   C->info.mallocs     = 0;
877   C->info.nz_unneeded = 0;
878   C->assembled = C->was_assembled = PETSC_TRUE;
879   C->num_ass++;
880   PetscFunctionReturn(PETSC_SUCCESS);
881 }
882 
883 static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C)
884 {
885   Mat_Product                *product = C->product;
886   MatProductType              ptype;
887   Mat                         A, B;
888   bool                        transA, transB;
889   Mat_SeqAIJKokkos           *akok, *bkok, *ckok;
890   MatProductCtx_SeqAIJKokkos *pdata;
891   MPI_Comm                    comm;
892   KokkosCsrMatrix             csrmatA, csrmatB, csrmatC;
893 
894   PetscFunctionBegin;
895   MatCheckProduct(C, 1);
896   PetscCall(PetscObjectGetComm((PetscObject)C, &comm));
897   PetscCheck(!product->data, comm, PETSC_ERR_PLIB, "Product data not empty");
898   A = product->A;
899   B = product->B;
900   PetscCall(MatSeqAIJKokkosSyncDevice(A));
901   PetscCall(MatSeqAIJKokkosSyncDevice(B));
902   akok    = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
903   bkok    = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
904   csrmatA = akok->csrmat;
905   csrmatB = bkok->csrmat;
906 
907   ptype = product->type;
908   // Take advantage of the symmetry if true
909   if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
910     ptype                                          = MATPRODUCT_AB;
911     product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE;
912   }
913   if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
914     ptype                                          = MATPRODUCT_AB;
915     product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE;
916   }
917 
918   switch (ptype) {
919   case MATPRODUCT_AB:
920     transA = false;
921     transB = false;
922     PetscCall(MatSetBlockSizesFromMats(C, A, B));
923     break;
924   case MATPRODUCT_AtB:
925     transA = true;
926     transB = false;
927     if (A->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, A->cmap->bs));
928     if (B->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->cmap->bs));
929     break;
930   case MATPRODUCT_ABt:
931     transA = false;
932     transB = true;
933     if (A->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, A->rmap->bs));
934     if (B->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->rmap->bs));
935     break;
936   default:
937     SETERRQ(comm, PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]);
938   }
939   PetscCallCXX(product->data = pdata = new MatProductCtx_SeqAIJKokkos());
940   pdata->reusesym = product->api_user;
941 
942   /* TODO: add command line options to select spgemm algorithms */
943   auto spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_DEFAULT; /* default alg is TPL if enabled, otherwise KK */
944 
945   /* CUDA-10.2's spgemm has bugs. We prefer the SpGEMMreuse APIs introduced in cuda-11.4 */
946 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
947   #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
948   spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_KK;
949   #endif
950 #endif
951   PetscCallCXX(pdata->kh.create_spgemm_handle(spgemm_alg));
952 
953   PetscCall(PetscLogGpuTimeBegin());
954   /* TODO: Get rid of the explicit transpose once KK-spgemm implements the transpose option */
955   if (transA) {
956     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA));
957     transA = false;
958   }
959 
960   if (transB) {
961     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB));
962     transB = false;
963   }
964 
965   PetscCallCXX(KokkosSparse::spgemm_symbolic(pdata->kh, csrmatA, transA, csrmatB, transB, csrmatC));
966   /* spgemm_symbolic() only populates C's rowmap, but not C's column indices.
967     So we have to do a fake spgemm_numeric() here to get csrmatC.j_d setup, before
968     calling new Mat_SeqAIJKokkos().
969     TODO: Remove the fake spgemm_numeric() after KK fixed this problem.
970   */
971   PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, csrmatA, transA, csrmatB, transB, csrmatC));
972 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(4, 0, 0)
973   /* Query if KK outputs a sorted matrix. If not, we need to sort it */
974   auto spgemmHandle = pdata->kh.get_spgemm_handle();
975   if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(csrmatC)); /* sort_option defaults to -1 in KK!*/
976 #endif
977   PetscCall(PetscLogGpuTimeEnd());
978 
979   PetscCallCXX(ckok = new Mat_SeqAIJKokkos(csrmatC));
980   PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(C, ckok));
981   C->product->destroy = MatProductCtxDestroy_SeqAIJKokkos;
982   PetscFunctionReturn(PETSC_SUCCESS);
983 }
984 
985 /* handles sparse matrix matrix ops */
986 static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat)
987 {
988   Mat_Product *product = mat->product;
989   PetscBool    Biskok = PETSC_FALSE, Ciskok = PETSC_TRUE;
990 
991   PetscFunctionBegin;
992   MatCheckProduct(mat, 1);
993   PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJKOKKOS, &Biskok));
994   if (product->type == MATPRODUCT_ABC) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJKOKKOS, &Ciskok));
995   if (Biskok && Ciskok) {
996     switch (product->type) {
997     case MATPRODUCT_AB:
998     case MATPRODUCT_AtB:
999     case MATPRODUCT_ABt:
1000       mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos;
1001       break;
1002     case MATPRODUCT_PtAP:
1003     case MATPRODUCT_RARt:
1004     case MATPRODUCT_ABC:
1005       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
1006       break;
1007     default:
1008       break;
1009     }
1010   } else { /* fallback for AIJ */
1011     PetscCall(MatProductSetFromOptions_SeqAIJ(mat));
1012   }
1013   PetscFunctionReturn(PETSC_SUCCESS);
1014 }
1015 
1016 static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a)
1017 {
1018   Mat_SeqAIJKokkos *aijkok;
1019 
1020   PetscFunctionBegin;
1021   PetscCall(PetscLogGpuTimeBegin());
1022   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1023   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1024   KokkosBlas::scal(PetscGetKokkosExecutionSpace(), aijkok->a_dual.view_device(), a, aijkok->a_dual.view_device());
1025   PetscCall(MatSeqAIJKokkosModifyDevice(A));
1026   PetscCall(PetscLogGpuFlops(aijkok->a_dual.extent(0)));
1027   PetscCall(PetscLogGpuTimeEnd());
1028   PetscFunctionReturn(PETSC_SUCCESS);
1029 }
1030 
1031 // add a to A's diagonal (if A is square) or main diagonal (if A is rectangular)
1032 static PetscErrorCode MatShift_SeqAIJKokkos(Mat A, PetscScalar a)
1033 {
1034   Mat_SeqAIJ *aijseq = static_cast<Mat_SeqAIJ *>(A->data);
1035 
1036   PetscFunctionBegin;
1037   if (A->assembled && aijseq->diagDense) { // no missing diagonals
1038     PetscInt n = PetscMin(A->rmap->n, A->cmap->n);
1039 
1040     PetscCall(PetscLogGpuTimeBegin());
1041     PetscCall(MatSeqAIJKokkosSyncDevice(A));
1042     const auto  aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1043     const auto &Aa     = aijkok->a_dual.view_device();
1044     const auto &Adiag  = aijkok->diag_dual.view_device();
1045     PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) { Aa(Adiag(i)) += a; }));
1046     PetscCall(MatSeqAIJKokkosModifyDevice(A));
1047     PetscCall(PetscLogGpuFlops(n));
1048     PetscCall(PetscLogGpuTimeEnd());
1049   } else { // need reassembly, very slow!
1050     PetscCall(MatShift_Basic(A, a));
1051   }
1052   PetscFunctionReturn(PETSC_SUCCESS);
1053 }
1054 
1055 static PetscErrorCode MatDiagonalSet_SeqAIJKokkos(Mat Y, Vec D, InsertMode is)
1056 {
1057   Mat_SeqAIJ *aijseq = static_cast<Mat_SeqAIJ *>(Y->data);
1058 
1059   PetscFunctionBegin;
1060   if (Y->assembled && aijseq->diagDense) { // no missing diagonals
1061     ConstPetscScalarKokkosView dv;
1062     PetscInt                   n, nv;
1063 
1064     PetscCall(PetscLogGpuTimeBegin());
1065     PetscCall(MatSeqAIJKokkosSyncDevice(Y));
1066     PetscCall(VecGetKokkosView(D, &dv));
1067     PetscCall(VecGetLocalSize(D, &nv));
1068     n = PetscMin(Y->rmap->n, Y->cmap->n);
1069     PetscCheck(n == nv, PetscObjectComm((PetscObject)Y), PETSC_ERR_ARG_SIZ, "Matrix size and vector size do not match");
1070 
1071     const auto  aijkok = static_cast<Mat_SeqAIJKokkos *>(Y->spptr);
1072     const auto &Aa     = aijkok->a_dual.view_device();
1073     const auto &Adiag  = aijkok->diag_dual.view_device();
1074     PetscCallCXX(Kokkos::parallel_for(
1075       Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) {
1076         if (is == INSERT_VALUES) Aa(Adiag(i)) = dv(i);
1077         else Aa(Adiag(i)) += dv(i);
1078       }));
1079     PetscCall(VecRestoreKokkosView(D, &dv));
1080     PetscCall(MatSeqAIJKokkosModifyDevice(Y));
1081     PetscCall(PetscLogGpuFlops(n));
1082     PetscCall(PetscLogGpuTimeEnd());
1083   } else { // need reassembly, very slow!
1084     PetscCall(MatDiagonalSet_Default(Y, D, is));
1085   }
1086   PetscFunctionReturn(PETSC_SUCCESS);
1087 }
1088 
1089 static PetscErrorCode MatDiagonalScale_SeqAIJKokkos(Mat A, Vec ll, Vec rr)
1090 {
1091   Mat_SeqAIJ                *aijseq = static_cast<Mat_SeqAIJ *>(A->data);
1092   PetscInt                   m = A->rmap->n, n = A->cmap->n, nz = aijseq->nz;
1093   ConstPetscScalarKokkosView lv, rv;
1094 
1095   PetscFunctionBegin;
1096   PetscCall(PetscLogGpuTimeBegin());
1097   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1098   const auto  aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1099   const auto &Aa     = aijkok->a_dual.view_device();
1100   const auto &Ai     = aijkok->i_dual.view_device();
1101   const auto &Aj     = aijkok->j_dual.view_device();
1102   if (ll) {
1103     PetscCall(VecGetLocalSize(ll, &m));
1104     PetscCheck(m == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling vector wrong length");
1105     PetscCall(VecGetKokkosView(ll, &lv));
1106     PetscCallCXX(Kokkos::parallel_for( // for each row
1107       Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
1108         PetscInt i   = t.league_rank(); // row i
1109         PetscInt len = Ai(i + 1) - Ai(i);
1110         // scale entries on the row
1111         Kokkos::parallel_for(Kokkos::TeamThreadRange(t, len), [&](PetscInt j) { Aa(Ai(i) + j) *= lv(i); });
1112       }));
1113     PetscCall(VecRestoreKokkosView(ll, &lv));
1114     PetscCall(PetscLogGpuFlops(nz));
1115   }
1116   if (rr) {
1117     PetscCall(VecGetLocalSize(rr, &n));
1118     PetscCheck(n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Right scaling vector wrong length");
1119     PetscCall(VecGetKokkosView(rr, &rv));
1120     PetscCallCXX(Kokkos::parallel_for( // for each nonzero
1121       Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt k) { Aa(k) *= rv(Aj(k)); }));
1122     PetscCall(VecRestoreKokkosView(rr, &lv));
1123     PetscCall(PetscLogGpuFlops(nz));
1124   }
1125   PetscCall(MatSeqAIJKokkosModifyDevice(A));
1126   PetscCall(PetscLogGpuTimeEnd());
1127   PetscFunctionReturn(PETSC_SUCCESS);
1128 }
1129 
1130 static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A)
1131 {
1132   Mat_SeqAIJKokkos *aijkok;
1133 
1134   PetscFunctionBegin;
1135   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1136   if (aijkok) { /* Only zero the device if data is already there */
1137     KokkosBlas::fill(PetscGetKokkosExecutionSpace(), aijkok->a_dual.view_device(), 0.0);
1138     PetscCall(MatSeqAIJKokkosModifyDevice(A));
1139   } else { /* Might be preallocated but not assembled */
1140     PetscCall(MatZeroEntries_SeqAIJ(A));
1141   }
1142   PetscFunctionReturn(PETSC_SUCCESS);
1143 }
1144 
1145 static PetscErrorCode MatGetDiagonal_SeqAIJKokkos(Mat A, Vec x)
1146 {
1147   Mat_SeqAIJKokkos     *aijkok;
1148   PetscInt              n;
1149   PetscScalarKokkosView xv;
1150 
1151   PetscFunctionBegin;
1152   PetscCall(VecGetLocalSize(x, &n));
1153   PetscCheck(n == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector");
1154   PetscCheck(A->factortype == MAT_FACTOR_NONE, PETSC_COMM_SELF, PETSC_ERR_SUP, "MatGetDiagonal_SeqAIJKokkos not supported on factored matrices");
1155 
1156   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1157   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1158 
1159   const auto &Aa    = aijkok->a_dual.view_device();
1160   const auto &Ai    = aijkok->i_dual.view_device();
1161   const auto &Adiag = aijkok->diag_dual.view_device();
1162 
1163   PetscCall(VecGetKokkosViewWrite(x, &xv));
1164   Kokkos::parallel_for(
1165     Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) {
1166       if (Adiag(i) < Ai(i + 1)) xv(i) = Aa(Adiag(i));
1167       else xv(i) = 0;
1168     });
1169   PetscCall(VecRestoreKokkosViewWrite(x, &xv));
1170   PetscFunctionReturn(PETSC_SUCCESS);
1171 }
1172 
1173 /* Get a Kokkos View from a mat of type MatSeqAIJKokkos */
1174 PetscErrorCode MatSeqAIJGetKokkosView(Mat A, ConstMatScalarKokkosView *kv)
1175 {
1176   Mat_SeqAIJKokkos *aijkok;
1177 
1178   PetscFunctionBegin;
1179   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1180   PetscAssertPointer(kv, 2);
1181   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1182   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1183   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1184   *kv    = aijkok->a_dual.view_device();
1185   PetscFunctionReturn(PETSC_SUCCESS);
1186 }
1187 
1188 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, ConstMatScalarKokkosView *kv)
1189 {
1190   PetscFunctionBegin;
1191   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1192   PetscAssertPointer(kv, 2);
1193   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1194   PetscFunctionReturn(PETSC_SUCCESS);
1195 }
1196 
1197 PetscErrorCode MatSeqAIJGetKokkosView(Mat A, MatScalarKokkosView *kv)
1198 {
1199   Mat_SeqAIJKokkos *aijkok;
1200 
1201   PetscFunctionBegin;
1202   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1203   PetscAssertPointer(kv, 2);
1204   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1205   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1206   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1207   *kv    = aijkok->a_dual.view_device();
1208   PetscFunctionReturn(PETSC_SUCCESS);
1209 }
1210 
1211 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, MatScalarKokkosView *kv)
1212 {
1213   PetscFunctionBegin;
1214   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1215   PetscAssertPointer(kv, 2);
1216   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1217   PetscCall(MatSeqAIJKokkosModifyDevice(A));
1218   PetscFunctionReturn(PETSC_SUCCESS);
1219 }
1220 
1221 PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A, MatScalarKokkosView *kv)
1222 {
1223   Mat_SeqAIJKokkos *aijkok;
1224 
1225   PetscFunctionBegin;
1226   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1227   PetscAssertPointer(kv, 2);
1228   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1229   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1230   *kv    = aijkok->a_dual.view_device();
1231   PetscFunctionReturn(PETSC_SUCCESS);
1232 }
1233 
1234 PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A, MatScalarKokkosView *kv)
1235 {
1236   PetscFunctionBegin;
1237   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1238   PetscAssertPointer(kv, 2);
1239   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1240   PetscCall(MatSeqAIJKokkosModifyDevice(A));
1241   PetscFunctionReturn(PETSC_SUCCESS);
1242 }
1243 
1244 PetscErrorCode MatCreateSeqAIJKokkosWithKokkosViews(MPI_Comm comm, PetscInt m, PetscInt n, Kokkos::View<PetscInt *> &i_d, Kokkos::View<PetscInt *> &j_d, Kokkos::View<PetscScalar *> &a_d, Mat *A)
1245 {
1246   Mat_SeqAIJKokkos *akok;
1247 
1248   PetscFunctionBegin;
1249   PetscCallCXX(akok = new Mat_SeqAIJKokkos(m, n, j_d.extent(0), i_d, j_d, a_d));
1250   PetscCall(MatCreate(comm, A));
1251   PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok));
1252   PetscFunctionReturn(PETSC_SUCCESS);
1253 }
1254 
1255 /* Computes Y += alpha X */
1256 static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y, PetscScalar alpha, Mat X, MatStructure pattern)
1257 {
1258   Mat_SeqAIJ              *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data;
1259   Mat_SeqAIJKokkos        *xkok, *ykok, *zkok;
1260   ConstMatScalarKokkosView Xa;
1261   MatScalarKokkosView      Ya;
1262   auto                     exec = PetscGetKokkosExecutionSpace();
1263 
1264   PetscFunctionBegin;
1265   PetscCheckTypeName(Y, MATSEQAIJKOKKOS);
1266   PetscCheckTypeName(X, MATSEQAIJKOKKOS);
1267   PetscCall(MatSeqAIJKokkosSyncDevice(Y));
1268   PetscCall(MatSeqAIJKokkosSyncDevice(X));
1269   PetscCall(PetscLogGpuTimeBegin());
1270 
1271   if (pattern != SAME_NONZERO_PATTERN && x->nz == y->nz) {
1272     PetscBool e;
1273     PetscCall(PetscArraycmp(x->i, y->i, Y->rmap->n + 1, &e));
1274     if (e) {
1275       PetscCall(PetscArraycmp(x->j, y->j, y->nz, &e));
1276       if (e) pattern = SAME_NONZERO_PATTERN;
1277     }
1278   }
1279 
1280   /* cusparseDcsrgeam2() computes C = alpha A + beta B. If one knew sparsity pattern of C, one can skip
1281     cusparseScsrgeam2_bufferSizeExt() / cusparseXcsrgeam2Nnz(), and directly call cusparseScsrgeam2().
1282     If X is SUBSET_NONZERO_PATTERN of Y, we could take advantage of this cusparse feature. However,
1283     KokkosSparse::spadd(alpha,A,beta,B,C) has symbolic and numeric phases, MatAXPY does not.
1284   */
1285   ykok = static_cast<Mat_SeqAIJKokkos *>(Y->spptr);
1286   xkok = static_cast<Mat_SeqAIJKokkos *>(X->spptr);
1287   Xa   = xkok->a_dual.view_device();
1288   Ya   = ykok->a_dual.view_device();
1289 
1290   if (pattern == SAME_NONZERO_PATTERN) {
1291     KokkosBlas::axpy(exec, alpha, Xa, Ya);
1292     PetscCall(MatSeqAIJKokkosModifyDevice(Y));
1293   } else if (pattern == SUBSET_NONZERO_PATTERN) {
1294     MatRowMapKokkosView Xi = xkok->i_dual.view_device(), Yi = ykok->i_dual.view_device();
1295     MatColIdxKokkosView Xj = xkok->j_dual.view_device(), Yj = ykok->j_dual.view_device();
1296 
1297     Kokkos::parallel_for(
1298       Kokkos::TeamPolicy<>(exec, Y->rmap->n, 1), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
1299         PetscInt i = t.league_rank(); // row i
1300         Kokkos::single(Kokkos::PerTeam(t), [=]() {
1301           // Only one thread works in a team
1302           PetscInt p, q = Yi(i);
1303           for (p = Xi(i); p < Xi(i + 1); p++) {          // For each nonzero on row i of X,
1304             while (Xj(p) != Yj(q) && q < Yi(i + 1)) q++; // find the matching nonzero on row i of Y.
1305             if (Xj(p) == Yj(q)) {                        // Found it
1306               Ya(q) += alpha * Xa(p);
1307               q++;
1308             } else {
1309             // If not found, it indicates the input is wrong (X is not a SUBSET_NONZERO_PATTERN of Y).
1310             // Just insert a NaN at the beginning of row i if it is not empty, to make the result wrong.
1311 #if PETSC_PKG_KOKKOS_VERSION_GE(3, 7, 0)
1312               if (Yi(i) != Yi(i + 1)) Ya(Yi(i)) = Kokkos::ArithTraits<PetscScalar>::nan();
1313 #else
1314               if (Yi(i) != Yi(i + 1)) Ya(Yi(i)) = Kokkos::Experimental::nan("1");
1315 #endif
1316             }
1317           }
1318         });
1319       });
1320     PetscCall(MatSeqAIJKokkosModifyDevice(Y));
1321   } else { // different nonzero patterns
1322     Mat             Z;
1323     KokkosCsrMatrix zcsr;
1324     KernelHandle    kh;
1325     kh.create_spadd_handle(true); // X, Y are sorted
1326     KokkosSparse::spadd_symbolic(&kh, xkok->csrmat, ykok->csrmat, zcsr);
1327     KokkosSparse::spadd_numeric(&kh, alpha, xkok->csrmat, (PetscScalar)1.0, ykok->csrmat, zcsr);
1328     zkok = new Mat_SeqAIJKokkos(zcsr);
1329     PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, zkok, &Z));
1330     PetscCall(MatHeaderReplace(Y, &Z));
1331     kh.destroy_spadd_handle();
1332   }
1333   PetscCall(PetscLogGpuTimeEnd());
1334   PetscCall(PetscLogGpuFlops(xkok->a_dual.extent(0) * 2)); // Because we scaled X and then added it to Y
1335   PetscFunctionReturn(PETSC_SUCCESS);
1336 }
1337 
1338 struct MatCOOStruct_SeqAIJKokkos {
1339   PetscCount           n;
1340   PetscCount           Atot;
1341   PetscInt             nz;
1342   PetscCountKokkosView jmap;
1343   PetscCountKokkosView perm;
1344 
1345   MatCOOStruct_SeqAIJKokkos(const MatCOOStruct_SeqAIJ *coo_h)
1346   {
1347     nz   = coo_h->nz;
1348     n    = coo_h->n;
1349     Atot = coo_h->Atot;
1350     jmap = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), PetscCountKokkosViewHost(coo_h->jmap, nz + 1));
1351     perm = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), PetscCountKokkosViewHost(coo_h->perm, Atot));
1352   }
1353 };
1354 
1355 static PetscErrorCode MatCOOStructDestroy_SeqAIJKokkos(void **data)
1356 {
1357   PetscFunctionBegin;
1358   PetscCallCXX(delete static_cast<MatCOOStruct_SeqAIJKokkos *>(*data));
1359   PetscFunctionReturn(PETSC_SUCCESS);
1360 }
1361 
1362 static PetscErrorCode MatSetPreallocationCOO_SeqAIJKokkos(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
1363 {
1364   Mat_SeqAIJKokkos          *akok;
1365   Mat_SeqAIJ                *aseq;
1366   PetscContainer             container_h;
1367   MatCOOStruct_SeqAIJ       *coo_h;
1368   MatCOOStruct_SeqAIJKokkos *coo_d;
1369 
1370   PetscFunctionBegin;
1371   PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, coo_i, coo_j));
1372   aseq = static_cast<Mat_SeqAIJ *>(mat->data);
1373   akok = static_cast<Mat_SeqAIJKokkos *>(mat->spptr);
1374   delete akok;
1375   mat->spptr = akok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, aseq, mat->nonzerostate + 1, PETSC_FALSE);
1376   PetscCall(MatZeroEntries_SeqAIJKokkos(mat));
1377 
1378   // Copy the COO struct to device
1379   PetscCall(PetscObjectQuery((PetscObject)mat, "__PETSc_MatCOOStruct_Host", (PetscObject *)&container_h));
1380   PetscCall(PetscContainerGetPointer(container_h, (void **)&coo_h));
1381   PetscCallCXX(coo_d = new MatCOOStruct_SeqAIJKokkos(coo_h));
1382 
1383   // Put the COO struct in a container and then attach that to the matrix
1384   PetscCall(PetscObjectContainerCompose((PetscObject)mat, "__PETSc_MatCOOStruct_Device", coo_d, MatCOOStructDestroy_SeqAIJKokkos));
1385   PetscFunctionReturn(PETSC_SUCCESS);
1386 }
1387 
1388 static PetscErrorCode MatSetValuesCOO_SeqAIJKokkos(Mat A, const PetscScalar v[], InsertMode imode)
1389 {
1390   MatScalarKokkosView        Aa;
1391   ConstMatScalarKokkosView   kv;
1392   PetscMemType               memtype;
1393   PetscContainer             container;
1394   MatCOOStruct_SeqAIJKokkos *coo;
1395 
1396   PetscFunctionBegin;
1397   PetscCall(PetscObjectQuery((PetscObject)A, "__PETSc_MatCOOStruct_Device", (PetscObject *)&container));
1398   PetscCall(PetscContainerGetPointer(container, (void **)&coo));
1399 
1400   const auto &n    = coo->n;
1401   const auto &Annz = coo->nz;
1402   const auto &jmap = coo->jmap;
1403   const auto &perm = coo->perm;
1404 
1405   PetscCall(PetscGetMemType(v, &memtype));
1406   if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */
1407     kv = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), ConstMatScalarKokkosViewHost(v, n));
1408   } else {
1409     kv = ConstMatScalarKokkosView(v, n); /* Directly use v[]'s memory */
1410   }
1411 
1412   if (imode == INSERT_VALUES) PetscCall(MatSeqAIJGetKokkosViewWrite(A, &Aa)); /* write matrix values */
1413   else PetscCall(MatSeqAIJGetKokkosView(A, &Aa));                             /* read & write matrix values */
1414 
1415   PetscCall(PetscLogGpuTimeBegin());
1416   Kokkos::parallel_for(
1417     Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, Annz), KOKKOS_LAMBDA(const PetscCount i) {
1418       PetscScalar sum = 0.0;
1419       for (PetscCount k = jmap(i); k < jmap(i + 1); k++) sum += kv(perm(k));
1420       Aa(i) = (imode == INSERT_VALUES ? 0.0 : Aa(i)) + sum;
1421     });
1422   PetscCall(PetscLogGpuTimeEnd());
1423 
1424   if (imode == INSERT_VALUES) PetscCall(MatSeqAIJRestoreKokkosViewWrite(A, &Aa));
1425   else PetscCall(MatSeqAIJRestoreKokkosView(A, &Aa));
1426   PetscFunctionReturn(PETSC_SUCCESS);
1427 }
1428 
1429 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A)
1430 {
1431   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
1432 
1433   PetscFunctionBegin;
1434   A->offloadmask = PETSC_OFFLOAD_KOKKOS; /* We do not really use this flag */
1435   A->boundtocpu  = PETSC_FALSE;
1436 
1437   A->ops->assemblyend               = MatAssemblyEnd_SeqAIJKokkos;
1438   A->ops->destroy                   = MatDestroy_SeqAIJKokkos;
1439   A->ops->duplicate                 = MatDuplicate_SeqAIJKokkos;
1440   A->ops->axpy                      = MatAXPY_SeqAIJKokkos;
1441   A->ops->scale                     = MatScale_SeqAIJKokkos;
1442   A->ops->zeroentries               = MatZeroEntries_SeqAIJKokkos;
1443   A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJKokkos;
1444   A->ops->mult                      = MatMult_SeqAIJKokkos;
1445   A->ops->multadd                   = MatMultAdd_SeqAIJKokkos;
1446   A->ops->multtranspose             = MatMultTranspose_SeqAIJKokkos;
1447   A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJKokkos;
1448   A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJKokkos;
1449   A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos;
1450   A->ops->productnumeric            = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos;
1451   A->ops->transpose                 = MatTranspose_SeqAIJKokkos;
1452   A->ops->setoption                 = MatSetOption_SeqAIJKokkos;
1453   A->ops->getdiagonal               = MatGetDiagonal_SeqAIJKokkos;
1454   A->ops->shift                     = MatShift_SeqAIJKokkos;
1455   A->ops->diagonalset               = MatDiagonalSet_SeqAIJKokkos;
1456   A->ops->diagonalscale             = MatDiagonalScale_SeqAIJKokkos;
1457   A->ops->getcurrentmemtype         = MatGetCurrentMemType_SeqAIJKokkos;
1458   a->ops->getarray                  = MatSeqAIJGetArray_SeqAIJKokkos;
1459   a->ops->restorearray              = MatSeqAIJRestoreArray_SeqAIJKokkos;
1460   a->ops->getarrayread              = MatSeqAIJGetArrayRead_SeqAIJKokkos;
1461   a->ops->restorearrayread          = MatSeqAIJRestoreArrayRead_SeqAIJKokkos;
1462   a->ops->getarraywrite             = MatSeqAIJGetArrayWrite_SeqAIJKokkos;
1463   a->ops->restorearraywrite         = MatSeqAIJRestoreArrayWrite_SeqAIJKokkos;
1464   a->ops->getcsrandmemtype          = MatSeqAIJGetCSRAndMemType_SeqAIJKokkos;
1465 
1466   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJKokkos));
1467   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJKokkos));
1468 #if defined(PETSC_HAVE_HYPRE)
1469   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijkokkos_hypre_C", MatConvert_AIJ_HYPRE));
1470 #endif
1471   PetscFunctionReturn(PETSC_SUCCESS);
1472 }
1473 
1474 /*
1475    Extract the (prescribled) diagonal blocks of the matrix and then invert them
1476 
1477   Input Parameters:
1478 +  A       - the MATSEQAIJKOKKOS matrix
1479 .  bs      - block sizes in 'csr' format, i.e., the i-th block has size bs(i+1) - bs(i)
1480 .  bs2     - square of block sizes in 'csr' format, i.e., the i-th block should be stored at offset bs2(i) in diagVal[]
1481 .  blkMap  - map row ids to block ids, i.e., row i belongs to the block blkMap(i)
1482 -  work    - a pre-allocated work buffer (as big as diagVal) for use by this routine
1483 
1484   Output Parameter:
1485 .  diagVal - the (pre-allocated) buffer to store the inverted blocks (each block is stored in column-major order)
1486 */
1487 PETSC_INTERN PetscErrorCode MatInvertVariableBlockDiagonal_SeqAIJKokkos(Mat A, const PetscIntKokkosView &bs, const PetscIntKokkosView &bs2, const PetscIntKokkosView &blkMap, PetscScalarKokkosView &work, PetscScalarKokkosView &diagVal)
1488 {
1489   Mat_SeqAIJKokkos *akok    = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1490   PetscInt          nblocks = bs.extent(0) - 1;
1491 
1492   PetscFunctionBegin;
1493   PetscCall(MatSeqAIJKokkosSyncDevice(A)); // Since we'll access A's value on device
1494 
1495   // Pull out the diagonal blocks of the matrix and then invert the blocks
1496   auto Aa    = akok->a_dual.view_device();
1497   auto Ai    = akok->i_dual.view_device();
1498   auto Aj    = akok->j_dual.view_device();
1499   auto Adiag = akok->diag_dual.view_device();
1500   // TODO: how to tune the team size?
1501 #if defined(KOKKOS_ENABLE_UNIFIED_MEMORY)
1502   auto ts = Kokkos::AUTO();
1503 #else
1504   auto ts = 16; // improved performance 30% over Kokkos::AUTO() with CUDA, but failed with "Kokkos::abort: Requested Team Size is too large!" on CPUs
1505 #endif
1506   PetscCallCXX(Kokkos::parallel_for(
1507     Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), nblocks, ts), KOKKOS_LAMBDA(const KokkosTeamMemberType &teamMember) {
1508       const PetscInt bid    = teamMember.league_rank();                                                   // block id
1509       const PetscInt rstart = bs(bid);                                                                    // this block starts from this row
1510       const PetscInt m      = bs(bid + 1) - bs(bid);                                                      // size of this block
1511       const auto    &B      = Kokkos::View<PetscScalar **, Kokkos::LayoutLeft>(&diagVal(bs2(bid)), m, m); // column-major order
1512       const auto    &W      = PetscScalarKokkosView(&work(bs2(bid)), m * m);
1513 
1514       Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, m), [=](const PetscInt &r) { // r-th row in B
1515         PetscInt i = rstart + r;                                                            // i-th row in A
1516 
1517         if (Ai(i) <= Adiag(i) && Adiag(i) < Ai(i + 1)) { // if the diagonal exists (common case)
1518           PetscInt first = Adiag(i) - r;                 // we start to check nonzeros from here along this row
1519 
1520           for (PetscInt c = 0; c < m; c++) {                   // walk n steps to see what column indices we will meet
1521             if (first + c < Ai(i) || first + c >= Ai(i + 1)) { // this entry (first+c) is out of range of this row, in other words, its value is zero
1522               B(r, c) = 0.0;
1523             } else if (Aj(first + c) == rstart + c) { // this entry is right on the (rstart+c) column
1524               B(r, c) = Aa(first + c);
1525             } else { // this entry does not show up in the CSR
1526               B(r, c) = 0.0;
1527             }
1528           }
1529         } else { // rare case that the diagonal does not exist
1530           const PetscInt begin = Ai(i);
1531           const PetscInt end   = Ai(i + 1);
1532           for (PetscInt c = 0; c < m; c++) B(r, c) = 0.0;
1533           for (PetscInt j = begin; j < end; j++) { // scan the whole row; could use binary search but this is a rare case so we did not.
1534             if (rstart <= Aj(j) && Aj(j) < rstart + m) B(r, Aj(j) - rstart) = Aa(j);
1535             else if (Aj(j) >= rstart + m) break;
1536           }
1537         }
1538       });
1539 
1540       // LU-decompose B (w/o pivoting) and then invert B
1541       KokkosBatched::TeamLU<KokkosTeamMemberType, KokkosBatched::Algo::LU::Unblocked>::invoke(teamMember, B, 0.0);
1542       KokkosBatched::TeamInverseLU<KokkosTeamMemberType, KokkosBatched::Algo::InverseLU::Unblocked>::invoke(teamMember, B, W);
1543     }));
1544   // PetscLogGpuFlops() is done in the caller PCSetUp_VPBJacobi_Kokkos as we don't want to compute the flops in kernels
1545   PetscFunctionReturn(PETSC_SUCCESS);
1546 }
1547 
1548 PETSC_INTERN PetscErrorCode MatSetSeqAIJKokkosWithCSRMatrix(Mat A, Mat_SeqAIJKokkos *akok)
1549 {
1550   Mat_SeqAIJ *aseq;
1551   PetscInt    i, m, n;
1552   auto        exec = PetscGetKokkosExecutionSpace();
1553 
1554   PetscFunctionBegin;
1555   PetscCheck(!A->spptr, PETSC_COMM_SELF, PETSC_ERR_PLIB, "A->spptr is supposed to be empty");
1556 
1557   m = akok->nrows();
1558   n = akok->ncols();
1559   PetscCall(MatSetSizes(A, m, n, m, n));
1560   PetscCall(MatSetType(A, MATSEQAIJKOKKOS));
1561 
1562   /* Set up data structures of A as a MATSEQAIJ */
1563   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(A, MAT_SKIP_ALLOCATION, NULL));
1564   aseq = (Mat_SeqAIJ *)A->data;
1565 
1566   PetscCall(KokkosDualViewSyncHost(akok->i_dual, exec)); /* We always need sync'ed i, j on host */
1567   PetscCall(KokkosDualViewSyncHost(akok->j_dual, exec));
1568 
1569   aseq->i       = akok->i_host_data();
1570   aseq->j       = akok->j_host_data();
1571   aseq->a       = akok->a_host_data();
1572   aseq->nonew   = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/
1573   aseq->free_a  = PETSC_FALSE;
1574   aseq->free_ij = PETSC_FALSE;
1575   aseq->nz      = akok->nnz();
1576   aseq->maxnz   = aseq->nz;
1577 
1578   PetscCall(PetscMalloc1(m, &aseq->imax));
1579   PetscCall(PetscMalloc1(m, &aseq->ilen));
1580   for (i = 0; i < m; i++) aseq->ilen[i] = aseq->imax[i] = aseq->i[i + 1] - aseq->i[i];
1581 
1582   /* It is critical to set the nonzerostate, as we use it to check if sparsity pattern (hence data) has changed on host in MatAssemblyEnd */
1583   akok->nonzerostate = A->nonzerostate;
1584   A->spptr           = akok; /* Set A->spptr before MatAssembly so that A->spptr won't be allocated again there */
1585   PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY));
1586   PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY));
1587   PetscFunctionReturn(PETSC_SUCCESS);
1588 }
1589 
1590 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGetKokkosCsrMatrix(Mat A, KokkosCsrMatrix *csr)
1591 {
1592   PetscFunctionBegin;
1593   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1594   *csr = static_cast<Mat_SeqAIJKokkos *>(A->spptr)->csrmat;
1595   PetscFunctionReturn(PETSC_SUCCESS);
1596 }
1597 
1598 PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithKokkosCsrMatrix(MPI_Comm comm, KokkosCsrMatrix csr, Mat *A)
1599 {
1600   Mat_SeqAIJKokkos *akok;
1601 
1602   PetscFunctionBegin;
1603   PetscCallCXX(akok = new Mat_SeqAIJKokkos(csr));
1604   PetscCall(MatCreate(comm, A));
1605   PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok));
1606   PetscFunctionReturn(PETSC_SUCCESS);
1607 }
1608 
1609 /* Crete a SEQAIJKOKKOS matrix with a Mat_SeqAIJKokkos data structure
1610 
1611    Note we have names like MatSeqAIJSetPreallocationCSR, so I use capitalized CSR
1612  */
1613 PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithCSRMatrix(MPI_Comm comm, Mat_SeqAIJKokkos *akok, Mat *A)
1614 {
1615   PetscFunctionBegin;
1616   PetscCall(MatCreate(comm, A));
1617   PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok));
1618   PetscFunctionReturn(PETSC_SUCCESS);
1619 }
1620 
1621 /*@C
1622   MatCreateSeqAIJKokkos - Creates a sparse matrix in `MATSEQAIJKOKKOS` (compressed row) format
1623   (the default parallel PETSc format). This matrix will ultimately be handled by
1624   Kokkos for calculations.
1625 
1626   Collective
1627 
1628   Input Parameters:
1629 + comm - MPI communicator, set to `PETSC_COMM_SELF`
1630 . m    - number of rows
1631 . n    - number of columns
1632 . nz   - number of nonzeros per row (same for all rows), ignored if `nnz` is provided
1633 - nnz  - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL`
1634 
1635   Output Parameter:
1636 . A - the matrix
1637 
1638   Level: intermediate
1639 
1640   Notes:
1641   It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
1642   MatXXXXSetPreallocation() paradgm instead of this routine directly.
1643   [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]
1644 
1645   The AIJ format, also called
1646   compressed row storage, is fully compatible with standard Fortran
1647   storage.  That is, the stored row and column indices can begin at
1648   either one (as in Fortran) or zero.
1649 
1650   Specify the preallocated storage with either `nz` or `nnz` (not both).
1651   Set `nz` = `PETSC_DEFAULT` and `nnz` = `NULL` for PETSc to control dynamic memory
1652   allocation.
1653 
1654 .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`
1655 @*/
1656 PetscErrorCode MatCreateSeqAIJKokkos(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A)
1657 {
1658   PetscFunctionBegin;
1659   PetscCall(PetscKokkosInitializeCheck());
1660   PetscCall(MatCreate(comm, A));
1661   PetscCall(MatSetSizes(*A, m, n, m, n));
1662   PetscCall(MatSetType(*A, MATSEQAIJKOKKOS));
1663   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz));
1664   PetscFunctionReturn(PETSC_SUCCESS);
1665 }
1666 
1667 // After matrix numeric factorization, there are still steps to do before triangular solve can be called.
1668 // For example, for transpose solve, we might need to compute the transpose matrices if the solver does not support it (such as KK, while cusparse does).
1669 // In cusparse, one has to call cusparseSpSV_analysis() with updated triangular matrix values before calling cusparseSpSV_solve().
1670 // Simiarily, in KK sptrsv_symbolic() has to be called before sptrsv_solve(). We put these steps in MatSeqAIJKokkos{Transpose}SolveCheck.
1671 static PetscErrorCode MatSeqAIJKokkosSolveCheck(Mat A)
1672 {
1673   Mat_SeqAIJKokkosTriFactors *factors   = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1674   const PetscBool             has_lower = factors->iL_d.extent(0) ? PETSC_TRUE : PETSC_FALSE; // false with Choleksy
1675   const PetscBool             has_upper = factors->iU_d.extent(0) ? PETSC_TRUE : PETSC_FALSE; // true with LU and Choleksy
1676 
1677   PetscFunctionBegin;
1678   if (!factors->sptrsv_symbolic_completed) { // If sptrsv_symbolic was not called yet
1679     if (has_upper) PetscCallCXX(sptrsv_symbolic(&factors->khU, factors->iU_d, factors->jU_d, factors->aU_d));
1680     if (has_lower) PetscCallCXX(sptrsv_symbolic(&factors->khL, factors->iL_d, factors->jL_d, factors->aL_d));
1681     factors->sptrsv_symbolic_completed = PETSC_TRUE;
1682   }
1683   PetscFunctionReturn(PETSC_SUCCESS);
1684 }
1685 
1686 static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A)
1687 {
1688   const PetscInt              n         = A->rmap->n;
1689   Mat_SeqAIJKokkosTriFactors *factors   = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1690   const PetscBool             has_lower = factors->iL_d.extent(0) ? PETSC_TRUE : PETSC_FALSE; // false with Choleksy
1691   const PetscBool             has_upper = factors->iU_d.extent(0) ? PETSC_TRUE : PETSC_FALSE; // true with LU or Choleksy
1692 
1693   PetscFunctionBegin;
1694   if (!factors->transpose_updated) {
1695     if (has_upper) {
1696       if (!factors->iUt_d.extent(0)) {                                 // Allocate Ut on device if not yet
1697         factors->iUt_d = MatRowMapKokkosView("factors->iUt_d", n + 1); // KK requires this view to be initialized to 0 to call transpose_matrix
1698         factors->jUt_d = MatColIdxKokkosView(NoInit("factors->jUt_d"), factors->jU_d.extent(0));
1699         factors->aUt_d = MatScalarKokkosView(NoInit("factors->aUt_d"), factors->aU_d.extent(0));
1700       }
1701 
1702       if (factors->iU_h.extent(0)) { // If U is on host (factorization was done on host), we also compute the transpose on host
1703         if (!factors->U) {
1704           Mat_SeqAIJ *seq;
1705 
1706           PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF, n, n, factors->iU_h.data(), factors->jU_h.data(), factors->aU_h.data(), &factors->U));
1707           PetscCall(MatTranspose(factors->U, MAT_INITIAL_MATRIX, &factors->Ut));
1708 
1709           seq            = static_cast<Mat_SeqAIJ *>(factors->Ut->data);
1710           factors->iUt_h = MatRowMapKokkosViewHost(seq->i, n + 1);
1711           factors->jUt_h = MatColIdxKokkosViewHost(seq->j, seq->nz);
1712           factors->aUt_h = MatScalarKokkosViewHost(seq->a, seq->nz);
1713         } else {
1714           PetscCall(MatTranspose(factors->U, MAT_REUSE_MATRIX, &factors->Ut)); // Matrix Ut' data is aliased with {i, j, a}Ut_h
1715         }
1716         // Copy Ut from host to device
1717         PetscCallCXX(Kokkos::deep_copy(factors->iUt_d, factors->iUt_h));
1718         PetscCallCXX(Kokkos::deep_copy(factors->jUt_d, factors->jUt_h));
1719         PetscCallCXX(Kokkos::deep_copy(factors->aUt_d, factors->aUt_h));
1720       } else { // If U was computed on device, we also compute the transpose there
1721         // TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices. We have to sort the indices, until KK provides finer control options.
1722         PetscCallCXX(transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iU_d,
1723                                                                                                                                                                                                                                factors->jU_d, factors->aU_d,
1724                                                                                                                                                                                                                                factors->iUt_d, factors->jUt_d,
1725                                                                                                                                                                                                                                factors->aUt_d));
1726         PetscCallCXX(sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iUt_d, factors->jUt_d, factors->aUt_d));
1727       }
1728       PetscCallCXX(sptrsv_symbolic(&factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d));
1729     }
1730 
1731     // do the same for L with LU
1732     if (has_lower) {
1733       if (!factors->iLt_d.extent(0)) {                                 // Allocate Lt on device if not yet
1734         factors->iLt_d = MatRowMapKokkosView("factors->iLt_d", n + 1); // KK requires this view to be initialized to 0 to call transpose_matrix
1735         factors->jLt_d = MatColIdxKokkosView(NoInit("factors->jLt_d"), factors->jL_d.extent(0));
1736         factors->aLt_d = MatScalarKokkosView(NoInit("factors->aLt_d"), factors->aL_d.extent(0));
1737       }
1738 
1739       if (factors->iL_h.extent(0)) { // If L is on host, we also compute the transpose on host
1740         if (!factors->L) {
1741           Mat_SeqAIJ *seq;
1742 
1743           PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF, n, n, factors->iL_h.data(), factors->jL_h.data(), factors->aL_h.data(), &factors->L));
1744           PetscCall(MatTranspose(factors->L, MAT_INITIAL_MATRIX, &factors->Lt));
1745 
1746           seq            = static_cast<Mat_SeqAIJ *>(factors->Lt->data);
1747           factors->iLt_h = MatRowMapKokkosViewHost(seq->i, n + 1);
1748           factors->jLt_h = MatColIdxKokkosViewHost(seq->j, seq->nz);
1749           factors->aLt_h = MatScalarKokkosViewHost(seq->a, seq->nz);
1750         } else {
1751           PetscCall(MatTranspose(factors->L, MAT_REUSE_MATRIX, &factors->Lt)); // Matrix Lt' data is aliased with {i, j, a}Lt_h
1752         }
1753         // Copy Lt from host to device
1754         PetscCallCXX(Kokkos::deep_copy(factors->iLt_d, factors->iLt_h));
1755         PetscCallCXX(Kokkos::deep_copy(factors->jLt_d, factors->jLt_h));
1756         PetscCallCXX(Kokkos::deep_copy(factors->aLt_d, factors->aLt_h));
1757       } else { // If L was computed on device, we also compute the transpose there
1758         // TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices. We have to sort the indices, until KK provides finer control options.
1759         PetscCallCXX(transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iL_d,
1760                                                                                                                                                                                                                                factors->jL_d, factors->aL_d,
1761                                                                                                                                                                                                                                factors->iLt_d, factors->jLt_d,
1762                                                                                                                                                                                                                                factors->aLt_d));
1763         PetscCallCXX(sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iLt_d, factors->jLt_d, factors->aLt_d));
1764       }
1765       PetscCallCXX(sptrsv_symbolic(&factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d));
1766     }
1767 
1768     factors->transpose_updated = PETSC_TRUE;
1769   }
1770   PetscFunctionReturn(PETSC_SUCCESS);
1771 }
1772 
1773 // Solve Ax = b, with RAR = U^T D U, where R is the row (and col) permutation matrix on A.
1774 // R is represented by rowperm in factors. If R is identity (i.e, no reordering), then rowperm is empty.
1775 static PetscErrorCode MatSolve_SeqAIJKokkos_Cholesky(Mat A, Vec bb, Vec xx)
1776 {
1777   auto                        exec    = PetscGetKokkosExecutionSpace();
1778   Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1779   PetscInt                    m       = A->rmap->n;
1780   PetscScalarKokkosView       D       = factors->D_d;
1781   PetscScalarKokkosView       X, Y, B; // alias
1782   ConstPetscScalarKokkosView  b;
1783   PetscScalarKokkosView       x;
1784   PetscIntKokkosView         &rowperm  = factors->rowperm;
1785   PetscBool                   identity = rowperm.extent(0) ? PETSC_FALSE : PETSC_TRUE;
1786 
1787   PetscFunctionBegin;
1788   PetscCall(PetscLogGpuTimeBegin());
1789   PetscCall(MatSeqAIJKokkosSolveCheck(A));          // for UX = T
1790   PetscCall(MatSeqAIJKokkosTransposeSolveCheck(A)); // for U^T Y = B
1791   PetscCall(VecGetKokkosView(bb, &b));
1792   PetscCall(VecGetKokkosViewWrite(xx, &x));
1793 
1794   // Solve U^T Y = B
1795   if (identity) { // Reorder b with the row permutation
1796     B = PetscScalarKokkosView(const_cast<PetscScalar *>(b.data()), b.extent(0));
1797     Y = factors->workVector;
1798   } else {
1799     B = factors->workVector;
1800     PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { B(i) = b(rowperm(i)); }));
1801     Y = x;
1802   }
1803   PetscCallCXX(sptrsv_solve(exec, &factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d, B, Y));
1804 
1805   // Solve diag(D) Y' = Y.
1806   // Actually just do Y' = Y*D since D is already inverted in MatCholeskyFactorNumeric_SeqAIJ(). It is basically a vector element-wise multiplication.
1807   PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { Y(i) = Y(i) * D(i); }));
1808 
1809   // Solve UX = Y
1810   if (identity) {
1811     X = x;
1812   } else {
1813     X = factors->workVector; // B is not needed anymore
1814   }
1815   PetscCallCXX(sptrsv_solve(exec, &factors->khU, factors->iU_d, factors->jU_d, factors->aU_d, Y, X));
1816 
1817   // Reorder X with the inverse column (row) permutation
1818   if (!identity) PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { x(rowperm(i)) = X(i); }));
1819 
1820   PetscCall(VecRestoreKokkosView(bb, &b));
1821   PetscCall(VecRestoreKokkosViewWrite(xx, &x));
1822   PetscCall(PetscLogGpuTimeEnd());
1823   PetscFunctionReturn(PETSC_SUCCESS);
1824 }
1825 
1826 // Solve Ax = b, with RAC = LU, where R and C are row and col permutation matrices on A respectively.
1827 // R and C are represented by rowperm and colperm in factors.
1828 // If R or C is identity (i.e, no reordering), then rowperm or colperm is empty.
1829 static PetscErrorCode MatSolve_SeqAIJKokkos_LU(Mat A, Vec bb, Vec xx)
1830 {
1831   auto                        exec    = PetscGetKokkosExecutionSpace();
1832   Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1833   PetscInt                    m       = A->rmap->n;
1834   PetscScalarKokkosView       X, Y, B; // alias
1835   ConstPetscScalarKokkosView  b;
1836   PetscScalarKokkosView       x;
1837   PetscIntKokkosView         &rowperm      = factors->rowperm;
1838   PetscIntKokkosView         &colperm      = factors->colperm;
1839   PetscBool                   row_identity = rowperm.extent(0) ? PETSC_FALSE : PETSC_TRUE;
1840   PetscBool                   col_identity = colperm.extent(0) ? PETSC_FALSE : PETSC_TRUE;
1841 
1842   PetscFunctionBegin;
1843   PetscCall(PetscLogGpuTimeBegin());
1844   PetscCall(MatSeqAIJKokkosSolveCheck(A));
1845   PetscCall(VecGetKokkosView(bb, &b));
1846   PetscCall(VecGetKokkosViewWrite(xx, &x));
1847 
1848   // Solve L Y = B (i.e., L (U C^- x) = R b).  R b indicates applying the row permutation on b.
1849   if (row_identity) {
1850     B = PetscScalarKokkosView(const_cast<PetscScalar *>(b.data()), b.extent(0));
1851     Y = factors->workVector;
1852   } else {
1853     B = factors->workVector;
1854     PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { B(i) = b(rowperm(i)); }));
1855     Y = x;
1856   }
1857   PetscCallCXX(sptrsv_solve(exec, &factors->khL, factors->iL_d, factors->jL_d, factors->aL_d, B, Y));
1858 
1859   // Solve U C^- x = Y
1860   if (col_identity) {
1861     X = x;
1862   } else {
1863     X = factors->workVector;
1864   }
1865   PetscCallCXX(sptrsv_solve(exec, &factors->khU, factors->iU_d, factors->jU_d, factors->aU_d, Y, X));
1866 
1867   // x = C X; Reorder X with the inverse col permutation
1868   if (!col_identity) PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { x(colperm(i)) = X(i); }));
1869 
1870   PetscCall(VecRestoreKokkosView(bb, &b));
1871   PetscCall(VecRestoreKokkosViewWrite(xx, &x));
1872   PetscCall(PetscLogGpuTimeEnd());
1873   PetscFunctionReturn(PETSC_SUCCESS);
1874 }
1875 
1876 // Solve A^T x = b, with RAC = LU, where R and C are row and col permutation matrices on A respectively.
1877 // R and C are represented by rowperm and colperm in factors.
1878 // If R or C is identity (i.e, no reordering), then rowperm or colperm is empty.
1879 // A = R^-1 L U C^-1, so A^T = C^-T U^T L^T R^-T. But since C^- = C^T, R^- = R^T, we have A^T = C U^T L^T R.
1880 static PetscErrorCode MatSolveTranspose_SeqAIJKokkos_LU(Mat A, Vec bb, Vec xx)
1881 {
1882   auto                        exec    = PetscGetKokkosExecutionSpace();
1883   Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1884   PetscInt                    m       = A->rmap->n;
1885   PetscScalarKokkosView       X, Y, B; // alias
1886   ConstPetscScalarKokkosView  b;
1887   PetscScalarKokkosView       x;
1888   PetscIntKokkosView         &rowperm      = factors->rowperm;
1889   PetscIntKokkosView         &colperm      = factors->colperm;
1890   PetscBool                   row_identity = rowperm.extent(0) ? PETSC_FALSE : PETSC_TRUE;
1891   PetscBool                   col_identity = colperm.extent(0) ? PETSC_FALSE : PETSC_TRUE;
1892 
1893   PetscFunctionBegin;
1894   PetscCall(PetscLogGpuTimeBegin());
1895   PetscCall(MatSeqAIJKokkosTransposeSolveCheck(A)); // Update L^T, U^T if needed, and do sptrsv symbolic for L^T, U^T
1896   PetscCall(VecGetKokkosView(bb, &b));
1897   PetscCall(VecGetKokkosViewWrite(xx, &x));
1898 
1899   // Solve U^T Y = B (i.e., U^T (L^T R x) = C^- b).  Note C^- b = C^T b, which means applying the column permutation on b.
1900   if (col_identity) { // Reorder b with the col permutation
1901     B = PetscScalarKokkosView(const_cast<PetscScalar *>(b.data()), b.extent(0));
1902     Y = factors->workVector;
1903   } else {
1904     B = factors->workVector;
1905     PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { B(i) = b(colperm(i)); }));
1906     Y = x;
1907   }
1908   PetscCallCXX(sptrsv_solve(exec, &factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d, B, Y));
1909 
1910   // Solve L^T X = Y
1911   if (row_identity) {
1912     X = x;
1913   } else {
1914     X = factors->workVector;
1915   }
1916   PetscCallCXX(sptrsv_solve(exec, &factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d, Y, X));
1917 
1918   // x = R^- X = R^T X; Reorder X with the inverse row permutation
1919   if (!row_identity) PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { x(rowperm(i)) = X(i); }));
1920 
1921   PetscCall(VecRestoreKokkosView(bb, &b));
1922   PetscCall(VecRestoreKokkosViewWrite(xx, &x));
1923   PetscCall(PetscLogGpuTimeEnd());
1924   PetscFunctionReturn(PETSC_SUCCESS);
1925 }
1926 
1927 static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info)
1928 {
1929   PetscFunctionBegin;
1930   PetscCall(MatSeqAIJKokkosSyncHost(A));
1931   PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info));
1932 
1933   if (!info->solveonhost) { // if solve on host, then we don't need to copy L, U to device
1934     Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr;
1935     Mat_SeqAIJ                 *b       = static_cast<Mat_SeqAIJ *>(B->data);
1936     const PetscInt             *Bi = b->i, *Bj = b->j, *Bdiag = b->diag;
1937     const MatScalar            *Ba = b->a;
1938     PetscInt                    m = B->rmap->n, n = B->cmap->n;
1939 
1940     if (factors->iL_h.extent(0) == 0) { // Allocate memory and copy the L, U structure for the first time
1941       // Allocate memory and copy the structure
1942       factors->iL_h = MatRowMapKokkosViewHost(NoInit("iL_h"), m + 1);
1943       factors->jL_h = MatColIdxKokkosViewHost(NoInit("jL_h"), (Bi[m] - Bi[0]) + m); // + the diagonal entries
1944       factors->aL_h = MatScalarKokkosViewHost(NoInit("aL_h"), (Bi[m] - Bi[0]) + m);
1945       factors->iU_h = MatRowMapKokkosViewHost(NoInit("iU_h"), m + 1);
1946       factors->jU_h = MatColIdxKokkosViewHost(NoInit("jU_h"), (Bdiag[0] - Bdiag[m]));
1947       factors->aU_h = MatScalarKokkosViewHost(NoInit("aU_h"), (Bdiag[0] - Bdiag[m]));
1948 
1949       PetscInt *Li = factors->iL_h.data();
1950       PetscInt *Lj = factors->jL_h.data();
1951       PetscInt *Ui = factors->iU_h.data();
1952       PetscInt *Uj = factors->jU_h.data();
1953 
1954       Li[0] = Ui[0] = 0;
1955       for (PetscInt i = 0; i < m; i++) {
1956         PetscInt llen = Bi[i + 1] - Bi[i];       // exclusive of the diagonal entry
1957         PetscInt ulen = Bdiag[i] - Bdiag[i + 1]; // inclusive of the diagonal entry
1958 
1959         PetscCall(PetscArraycpy(Lj + Li[i], Bj + Bi[i], llen)); // entries of L on the left of the diagonal
1960         Lj[Li[i] + llen] = i;                                   // diagonal entry of L
1961 
1962         Uj[Ui[i]] = i;                                                             // diagonal entry of U
1963         PetscCall(PetscArraycpy(Uj + Ui[i] + 1, Bj + Bdiag[i + 1] + 1, ulen - 1)); // entries of U on  the right of the diagonal
1964 
1965         Li[i + 1] = Li[i] + llen + 1;
1966         Ui[i + 1] = Ui[i] + ulen;
1967       }
1968 
1969       factors->iL_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->iL_h);
1970       factors->jL_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->jL_h);
1971       factors->iU_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->iU_h);
1972       factors->jU_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->jU_h);
1973       factors->aL_d = Kokkos::create_mirror_view(DefaultMemorySpace(), factors->aL_h);
1974       factors->aU_d = Kokkos::create_mirror_view(DefaultMemorySpace(), factors->aU_h);
1975 
1976       // Copy row/col permutation to device
1977       IS        rowperm = ((Mat_SeqAIJ *)B->data)->row;
1978       PetscBool row_identity;
1979       PetscCall(ISIdentity(rowperm, &row_identity));
1980       if (!row_identity) {
1981         const PetscInt *ip;
1982 
1983         PetscCall(ISGetIndices(rowperm, &ip));
1984         factors->rowperm = PetscIntKokkosView(NoInit("rowperm"), m);
1985         PetscCallCXX(Kokkos::deep_copy(factors->rowperm, PetscIntKokkosViewHost(const_cast<PetscInt *>(ip), m)));
1986         PetscCall(ISRestoreIndices(rowperm, &ip));
1987         PetscCall(PetscLogCpuToGpu(m * sizeof(PetscInt)));
1988       }
1989 
1990       IS        colperm = ((Mat_SeqAIJ *)B->data)->col;
1991       PetscBool col_identity;
1992       PetscCall(ISIdentity(colperm, &col_identity));
1993       if (!col_identity) {
1994         const PetscInt *ip;
1995 
1996         PetscCall(ISGetIndices(colperm, &ip));
1997         factors->colperm = PetscIntKokkosView(NoInit("colperm"), n);
1998         PetscCallCXX(Kokkos::deep_copy(factors->colperm, PetscIntKokkosViewHost(const_cast<PetscInt *>(ip), n)));
1999         PetscCall(ISRestoreIndices(colperm, &ip));
2000         PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
2001       }
2002 
2003       /* Create sptrsv handles for L, U and their transpose */
2004 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
2005       auto sptrsv_alg = SPTRSVAlgorithm::SPTRSV_CUSPARSE;
2006 #else
2007       auto sptrsv_alg = SPTRSVAlgorithm::SEQLVLSCHD_TP1;
2008 #endif
2009       factors->khL.create_sptrsv_handle(sptrsv_alg, m, true /* L is lower tri */);
2010       factors->khU.create_sptrsv_handle(sptrsv_alg, m, false /* U is not lower tri */);
2011       factors->khLt.create_sptrsv_handle(sptrsv_alg, m, false /* L^T is not lower tri */);
2012       factors->khUt.create_sptrsv_handle(sptrsv_alg, m, true /* U^T is lower tri */);
2013     }
2014 
2015     // Copy the value
2016     for (PetscInt i = 0; i < m; i++) {
2017       PetscInt        llen = Bi[i + 1] - Bi[i];
2018       PetscInt        ulen = Bdiag[i] - Bdiag[i + 1];
2019       const PetscInt *Li   = factors->iL_h.data();
2020       const PetscInt *Ui   = factors->iU_h.data();
2021 
2022       PetscScalar *La = factors->aL_h.data();
2023       PetscScalar *Ua = factors->aU_h.data();
2024 
2025       PetscCall(PetscArraycpy(La + Li[i], Ba + Bi[i], llen)); // entries of L
2026       La[Li[i] + llen] = 1.0;                                 // diagonal entry
2027 
2028       Ua[Ui[i]] = 1.0 / Ba[Bdiag[i]];                                            // diagonal entry
2029       PetscCall(PetscArraycpy(Ua + Ui[i] + 1, Ba + Bdiag[i + 1] + 1, ulen - 1)); // entries of U
2030     }
2031 
2032     PetscCallCXX(Kokkos::deep_copy(factors->aL_d, factors->aL_h));
2033     PetscCallCXX(Kokkos::deep_copy(factors->aU_d, factors->aU_h));
2034     // Once the factors' values have changed, we need to update their transpose and redo sptrsv symbolic
2035     factors->transpose_updated         = PETSC_FALSE;
2036     factors->sptrsv_symbolic_completed = PETSC_FALSE;
2037 
2038     B->ops->solve          = MatSolve_SeqAIJKokkos_LU;
2039     B->ops->solvetranspose = MatSolveTranspose_SeqAIJKokkos_LU;
2040   }
2041 
2042   B->ops->matsolve          = NULL;
2043   B->ops->matsolvetranspose = NULL;
2044   PetscFunctionReturn(PETSC_SUCCESS);
2045 }
2046 
2047 static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos_ILU0(Mat B, Mat A, const MatFactorInfo *info)
2048 {
2049   Mat_SeqAIJKokkos           *aijkok   = (Mat_SeqAIJKokkos *)A->spptr;
2050   Mat_SeqAIJKokkosTriFactors *factors  = (Mat_SeqAIJKokkosTriFactors *)B->spptr;
2051   PetscInt                    fill_lev = info->levels;
2052 
2053   PetscFunctionBegin;
2054   PetscCall(PetscLogGpuTimeBegin());
2055   PetscCheck(!info->factoronhost, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "MatFactorInfo.factoronhost should be false");
2056   PetscCall(MatSeqAIJKokkosSyncDevice(A));
2057 
2058   auto a_d = aijkok->a_dual.view_device();
2059   auto i_d = aijkok->i_dual.view_device();
2060   auto j_d = aijkok->j_dual.view_device();
2061 
2062   PetscCallCXX(spiluk_numeric(&factors->kh, fill_lev, i_d, j_d, a_d, factors->iL_d, factors->jL_d, factors->aL_d, factors->iU_d, factors->jU_d, factors->aU_d));
2063 
2064   B->assembled              = PETSC_TRUE;
2065   B->preallocated           = PETSC_TRUE;
2066   B->ops->solve             = MatSolve_SeqAIJKokkos_LU;
2067   B->ops->solvetranspose    = MatSolveTranspose_SeqAIJKokkos_LU;
2068   B->ops->matsolve          = NULL;
2069   B->ops->matsolvetranspose = NULL;
2070 
2071   /* Once the factors' value changed, we need to update their transpose and sptrsv handle */
2072   factors->transpose_updated         = PETSC_FALSE;
2073   factors->sptrsv_symbolic_completed = PETSC_FALSE;
2074   /* TODO: log flops, but how to know that? */
2075   PetscCall(PetscLogGpuTimeEnd());
2076   PetscFunctionReturn(PETSC_SUCCESS);
2077 }
2078 
2079 // Use KK's spiluk_symbolic() to do ILU0 symbolic factorization, with no row/col reordering
2080 static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos_ILU0(Mat B, Mat A, IS, IS, const MatFactorInfo *info)
2081 {
2082   Mat_SeqAIJKokkos           *aijkok;
2083   Mat_SeqAIJ                 *b;
2084   Mat_SeqAIJKokkosTriFactors *factors  = (Mat_SeqAIJKokkosTriFactors *)B->spptr;
2085   PetscInt                    fill_lev = info->levels;
2086   PetscInt                    nnzA     = ((Mat_SeqAIJ *)A->data)->nz, nnzL, nnzU;
2087   PetscInt                    n        = A->rmap->n;
2088 
2089   PetscFunctionBegin;
2090   PetscCheck(!info->factoronhost, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "MatFactorInfo's factoronhost should be false as we are doing it on device right now");
2091   PetscCall(MatSeqAIJKokkosSyncDevice(A));
2092 
2093   /* Create a spiluk handle and then do symbolic factorization */
2094   nnzL = nnzU = PetscRealIntMultTruncate(info->fill, nnzA);
2095   factors->kh.create_spiluk_handle(SPILUKAlgorithm::SEQLVLSCHD_TP1, n, nnzL, nnzU);
2096 
2097   auto spiluk_handle = factors->kh.get_spiluk_handle();
2098 
2099   Kokkos::realloc(factors->iL_d, n + 1); /* Free old arrays and realloc */
2100   Kokkos::realloc(factors->jL_d, spiluk_handle->get_nnzL());
2101   Kokkos::realloc(factors->iU_d, n + 1);
2102   Kokkos::realloc(factors->jU_d, spiluk_handle->get_nnzU());
2103 
2104   aijkok   = (Mat_SeqAIJKokkos *)A->spptr;
2105   auto i_d = aijkok->i_dual.view_device();
2106   auto j_d = aijkok->j_dual.view_device();
2107   PetscCallCXX(spiluk_symbolic(&factors->kh, fill_lev, i_d, j_d, factors->iL_d, factors->jL_d, factors->iU_d, factors->jU_d));
2108   /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */
2109 
2110   Kokkos::resize(factors->jL_d, spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */
2111   Kokkos::resize(factors->jU_d, spiluk_handle->get_nnzU());
2112   Kokkos::realloc(factors->aL_d, spiluk_handle->get_nnzL()); /* No need to retain old value */
2113   Kokkos::realloc(factors->aU_d, spiluk_handle->get_nnzU());
2114 
2115   /* TODO: add options to select sptrsv algorithms */
2116   /* Create sptrsv handles for L, U and their transpose */
2117 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
2118   auto sptrsv_alg = SPTRSVAlgorithm::SPTRSV_CUSPARSE;
2119 #else
2120   auto sptrsv_alg = SPTRSVAlgorithm::SEQLVLSCHD_TP1;
2121 #endif
2122 
2123   factors->khL.create_sptrsv_handle(sptrsv_alg, n, true /* L is lower tri */);
2124   factors->khU.create_sptrsv_handle(sptrsv_alg, n, false /* U is not lower tri */);
2125   factors->khLt.create_sptrsv_handle(sptrsv_alg, n, false /* L^T is not lower tri */);
2126   factors->khUt.create_sptrsv_handle(sptrsv_alg, n, true /* U^T is lower tri */);
2127 
2128   /* Fill fields of the factor matrix B */
2129   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(B, MAT_SKIP_ALLOCATION, NULL));
2130   b     = (Mat_SeqAIJ *)B->data;
2131   b->nz = b->maxnz          = spiluk_handle->get_nnzL() + spiluk_handle->get_nnzU();
2132   B->info.fill_ratio_given  = info->fill;
2133   B->info.fill_ratio_needed = nnzA > 0 ? ((PetscReal)b->nz) / ((PetscReal)nnzA) : 1.0;
2134 
2135   B->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJKokkos_ILU0;
2136   PetscFunctionReturn(PETSC_SUCCESS);
2137 }
2138 
2139 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
2140 {
2141   PetscFunctionBegin;
2142   PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
2143   PetscCheck(!B->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr");
2144   PetscCallCXX(B->spptr = new Mat_SeqAIJKokkosTriFactors(B->rmap->n));
2145   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos;
2146   PetscFunctionReturn(PETSC_SUCCESS);
2147 }
2148 
2149 static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
2150 {
2151   PetscBool row_identity = PETSC_FALSE, col_identity = PETSC_FALSE;
2152 
2153   PetscFunctionBegin;
2154   if (!info->factoronhost) {
2155     PetscCall(ISIdentity(isrow, &row_identity));
2156     PetscCall(ISIdentity(iscol, &col_identity));
2157   }
2158 
2159   PetscCheck(!B->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr");
2160   PetscCallCXX(B->spptr = new Mat_SeqAIJKokkosTriFactors(B->rmap->n));
2161 
2162   if (!info->factoronhost && !info->levels && row_identity && col_identity) { // if level 0 and no reordering
2163     PetscCall(MatILUFactorSymbolic_SeqAIJKokkos_ILU0(B, A, isrow, iscol, info));
2164   } else {
2165     PetscCall(MatILUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info)); // otherwise, use PETSc's ILU on host
2166     B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos;
2167   }
2168   PetscFunctionReturn(PETSC_SUCCESS);
2169 }
2170 
2171 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info)
2172 {
2173   PetscFunctionBegin;
2174   PetscCall(MatSeqAIJKokkosSyncHost(A));
2175   PetscCall(MatCholeskyFactorNumeric_SeqAIJ(B, A, info));
2176 
2177   if (!info->solveonhost) { // if solve on host, then we don't need to copy L, U to device
2178     Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr;
2179     Mat_SeqAIJ                 *b       = static_cast<Mat_SeqAIJ *>(B->data);
2180     const PetscInt             *Bi = b->i, *Bj = b->j, *Bdiag = b->diag;
2181     const MatScalar            *Ba = b->a;
2182     PetscInt                    m  = B->rmap->n;
2183 
2184     if (factors->iU_h.extent(0) == 0) { // First time of numeric factorization
2185       // Allocate memory and copy the structure
2186       factors->iU_h = PetscIntKokkosViewHost(const_cast<PetscInt *>(Bi), m + 1); // wrap Bi as iU_h
2187       factors->jU_h = MatColIdxKokkosViewHost(NoInit("jU_h"), Bi[m]);
2188       factors->aU_h = MatScalarKokkosViewHost(NoInit("aU_h"), Bi[m]);
2189       factors->D_h  = MatScalarKokkosViewHost(NoInit("D_h"), m);
2190       factors->aU_d = Kokkos::create_mirror_view(DefaultMemorySpace(), factors->aU_h);
2191       factors->D_d  = Kokkos::create_mirror_view(DefaultMemorySpace(), factors->D_h);
2192 
2193       // Build jU_h from the skewed Aj
2194       PetscInt *Uj = factors->jU_h.data();
2195       for (PetscInt i = 0; i < m; i++) {
2196         PetscInt ulen = Bi[i + 1] - Bi[i];
2197         Uj[Bi[i]]     = i;                                              // diagonal entry
2198         PetscCall(PetscArraycpy(Uj + Bi[i] + 1, Bj + Bi[i], ulen - 1)); // entries of U on the right of the diagonal
2199       }
2200 
2201       // Copy iU, jU to device
2202       PetscCallCXX(factors->iU_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->iU_h));
2203       PetscCallCXX(factors->jU_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->jU_h));
2204 
2205       // Copy row/col permutation to device
2206       IS        rowperm = ((Mat_SeqAIJ *)B->data)->row;
2207       PetscBool row_identity;
2208       PetscCall(ISIdentity(rowperm, &row_identity));
2209       if (!row_identity) {
2210         const PetscInt *ip;
2211 
2212         PetscCall(ISGetIndices(rowperm, &ip));
2213         PetscCallCXX(factors->rowperm = PetscIntKokkosView(NoInit("rowperm"), m));
2214         PetscCallCXX(Kokkos::deep_copy(factors->rowperm, PetscIntKokkosViewHost(const_cast<PetscInt *>(ip), m)));
2215         PetscCall(ISRestoreIndices(rowperm, &ip));
2216         PetscCall(PetscLogCpuToGpu(m * sizeof(PetscInt)));
2217       }
2218 
2219       // Create sptrsv handles for U and U^T
2220 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
2221       auto sptrsv_alg = SPTRSVAlgorithm::SPTRSV_CUSPARSE;
2222 #else
2223       auto sptrsv_alg = SPTRSVAlgorithm::SEQLVLSCHD_TP1;
2224 #endif
2225       factors->khU.create_sptrsv_handle(sptrsv_alg, m, false /* U is not lower tri */);
2226       factors->khUt.create_sptrsv_handle(sptrsv_alg, m, true /* U^T is lower tri */);
2227     }
2228     // These pointers were set MatCholeskyFactorNumeric_SeqAIJ(), so we always need to update them
2229     B->ops->solve          = MatSolve_SeqAIJKokkos_Cholesky;
2230     B->ops->solvetranspose = MatSolve_SeqAIJKokkos_Cholesky;
2231 
2232     // Copy the value
2233     PetscScalar *Ua = factors->aU_h.data();
2234     PetscScalar *D  = factors->D_h.data();
2235     for (PetscInt i = 0; i < m; i++) {
2236       D[i]      = Ba[Bdiag[i]];     // actually Aa[Adiag[i]] is the inverse of the diagonal
2237       Ua[Bi[i]] = (PetscScalar)1.0; // set the unit diagonal for U
2238       for (PetscInt k = 0; k < Bi[i + 1] - Bi[i] - 1; k++) Ua[Bi[i] + 1 + k] = -Ba[Bi[i] + k];
2239     }
2240     PetscCallCXX(Kokkos::deep_copy(factors->aU_d, factors->aU_h));
2241     PetscCallCXX(Kokkos::deep_copy(factors->D_d, factors->D_h));
2242 
2243     factors->sptrsv_symbolic_completed = PETSC_FALSE; // When numeric value changed, we must do these again
2244     factors->transpose_updated         = PETSC_FALSE;
2245   }
2246 
2247   B->ops->matsolve          = NULL;
2248   B->ops->matsolvetranspose = NULL;
2249   PetscFunctionReturn(PETSC_SUCCESS);
2250 }
2251 
2252 static PetscErrorCode MatICCFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS perm, const MatFactorInfo *info)
2253 {
2254   PetscFunctionBegin;
2255   if (info->solveonhost) {
2256     // If solve on host, we have to change the type, as eventually we need to call MatSolve_SeqSBAIJ_1_NaturalOrdering() etc.
2257     PetscCall(MatSetType(B, MATSEQSBAIJ));
2258     PetscCall(MatSeqSBAIJSetPreallocation(B, 1, MAT_SKIP_ALLOCATION, NULL));
2259   }
2260 
2261   PetscCall(MatICCFactorSymbolic_SeqAIJ(B, A, perm, info));
2262 
2263   if (!info->solveonhost) {
2264     // If solve on device, B is still a MATSEQAIJKOKKOS, so we are good to allocate B->spptr
2265     PetscCheck(!B->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr");
2266     PetscCallCXX(B->spptr = new Mat_SeqAIJKokkosTriFactors(B->rmap->n));
2267     B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJKokkos;
2268   }
2269   PetscFunctionReturn(PETSC_SUCCESS);
2270 }
2271 
2272 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS perm, const MatFactorInfo *info)
2273 {
2274   PetscFunctionBegin;
2275   if (info->solveonhost) {
2276     // If solve on host, we have to change the type, as eventually we need to call MatSolve_SeqSBAIJ_1_NaturalOrdering() etc.
2277     PetscCall(MatSetType(B, MATSEQSBAIJ));
2278     PetscCall(MatSeqSBAIJSetPreallocation(B, 1, MAT_SKIP_ALLOCATION, NULL));
2279   }
2280 
2281   PetscCall(MatCholeskyFactorSymbolic_SeqAIJ(B, A, perm, info)); // it sets B's two ISes ((Mat_SeqAIJ*)B->data)->{row, col} to perm
2282 
2283   if (!info->solveonhost) {
2284     // If solve on device, B is still a MATSEQAIJKOKKOS, so we are good to allocate B->spptr
2285     PetscCheck(!B->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr");
2286     PetscCallCXX(B->spptr = new Mat_SeqAIJKokkosTriFactors(B->rmap->n));
2287     B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJKokkos;
2288   }
2289   PetscFunctionReturn(PETSC_SUCCESS);
2290 }
2291 
2292 // The _Kokkos suffix means we will use Kokkos as a solver for the SeqAIJKokkos matrix
2293 static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos_Kokkos(Mat A, MatSolverType *type)
2294 {
2295   PetscFunctionBegin;
2296   *type = MATSOLVERKOKKOS;
2297   PetscFunctionReturn(PETSC_SUCCESS);
2298 }
2299 
2300 /*MC
2301   MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices
2302   on a single GPU of type, `MATSEQAIJKOKKOS`, `MATAIJKOKKOS`.
2303 
2304   Level: beginner
2305 
2306 .seealso: [](ch_matrices), `Mat`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJKokkos()`, `MATAIJKOKKOS`
2307 M*/
2308 PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A, MatFactorType ftype, Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */
2309 {
2310   PetscInt n = A->rmap->n;
2311   MPI_Comm comm;
2312 
2313   PetscFunctionBegin;
2314   PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
2315   PetscCall(MatCreate(comm, B));
2316   PetscCall(MatSetSizes(*B, n, n, n, n));
2317   PetscCall(MatSetBlockSizesFromMats(*B, A, A));
2318   (*B)->factortype = ftype;
2319   PetscCall(MatSetType(*B, MATSEQAIJKOKKOS));
2320   PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL));
2321   PetscCheck(!(*B)->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr");
2322 
2323   if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
2324     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJKokkos;
2325     (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos;
2326     PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU]));
2327     PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILU]));
2328     PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILUDT]));
2329   } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
2330     (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJKokkos;
2331     (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJKokkos;
2332     PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]));
2333     PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ICC]));
2334   } else SETERRQ(comm, PETSC_ERR_SUP, "MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]);
2335 
2336   // The factorization can use the ordering provided in MatLUFactorSymbolic(), MatCholeskyFactorSymbolic() etc, though we do it on host
2337   (*B)->canuseordering = PETSC_TRUE;
2338   PetscCall(PetscObjectComposeFunction((PetscObject)*B, "MatFactorGetSolverType_C", MatFactorGetSolverType_SeqAIJKokkos_Kokkos));
2339   PetscFunctionReturn(PETSC_SUCCESS);
2340 }
2341 
2342 PETSC_INTERN PetscErrorCode MatSolverTypeRegister_Kokkos(void)
2343 {
2344   PetscFunctionBegin;
2345   PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_LU, MatGetFactor_SeqAIJKokkos_Kokkos));
2346   PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_CHOLESKY, MatGetFactor_SeqAIJKokkos_Kokkos));
2347   PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_ILU, MatGetFactor_SeqAIJKokkos_Kokkos));
2348   PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_ICC, MatGetFactor_SeqAIJKokkos_Kokkos));
2349   PetscFunctionReturn(PETSC_SUCCESS);
2350 }
2351 
2352 /* Utility to print out a KokkosCsrMatrix for debugging */
2353 PETSC_INTERN PetscErrorCode PrintCsrMatrix(const KokkosCsrMatrix &csrmat)
2354 {
2355   const auto        &iv = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), csrmat.graph.row_map);
2356   const auto        &jv = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), csrmat.graph.entries);
2357   const auto        &av = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), csrmat.values);
2358   const PetscInt    *i  = iv.data();
2359   const PetscInt    *j  = jv.data();
2360   const PetscScalar *a  = av.data();
2361   PetscInt           m = csrmat.numRows(), n = csrmat.numCols(), nnz = csrmat.nnz();
2362 
2363   PetscFunctionBegin;
2364   PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT " x %" PetscInt_FMT " SeqAIJKokkos, with %" PetscInt_FMT " nonzeros\n", m, n, nnz));
2365   for (PetscInt k = 0; k < m; k++) {
2366     PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT ": ", k));
2367     for (PetscInt p = i[k]; p < i[k + 1]; p++) PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT "(%.1f), ", j[p], (double)PetscRealPart(a[p])));
2368     PetscCall(PetscPrintf(PETSC_COMM_SELF, "\n"));
2369   }
2370   PetscFunctionReturn(PETSC_SUCCESS);
2371 }
2372