xref: /petsc/src/mat/impls/aij/seq/kokkos/aijkok.kokkos.cxx (revision 7cd49bdee00a26e142bff6e7df7c3fb9209fa795)
1 #include <petsc_kokkos.hpp>
2 #include <petscvec_kokkos.hpp>
3 #include <petscmat_kokkos.hpp>
4 #include <petscpkg_version.h>
5 #include <petsc/private/petscimpl.h>
6 #include <petsc/private/sfimpl.h>
7 #include <petsc/private/kokkosimpl.hpp>
8 #include <petscsys.h>
9 
10 #include <Kokkos_Core.hpp>
11 #include <KokkosBlas.hpp>
12 #include <KokkosSparse_CrsMatrix.hpp>
13 
14 // To suppress compiler warnings:
15 // /path/include/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp:434:63:
16 // warning: 'cusparseStatus_t cusparseDbsrmm(cusparseHandle_t, cusparseDirection_t, cusparseOperation_t,
17 // cusparseOperation_t, int, int, int, int, const double*, cusparseMatDescr_t, const double*, const int*, const int*,
18 // int, const double*, int, const double*, double*, int)' is deprecated: please use cusparseSpMM instead [-Wdeprecated-declarations]
19 #define DISABLE_CUSPARSE_DEPRECATED
20 #include <KokkosSparse_spmv.hpp>
21 
22 #include <KokkosSparse_spiluk.hpp>
23 #include <KokkosSparse_sptrsv.hpp>
24 #include <KokkosSparse_spgemm.hpp>
25 #include <KokkosSparse_spadd.hpp>
26 #include <KokkosBatched_LU_Decl.hpp>
27 #include <KokkosBatched_InverseLU_Decl.hpp>
28 
29 #include <../src/mat/impls/aij/seq/kokkos/aijkok.hpp>
30 
31 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_GE(3, 7, 0)
32   #include <KokkosSparse_Utils.hpp>
33 using KokkosSparse::sort_crs_matrix;
34 using KokkosSparse::Impl::transpose_matrix;
35 #else
36   #include <KokkosKernels_Sorting.hpp>
37 using KokkosKernels::sort_crs_matrix;
38 using KokkosKernels::Impl::transpose_matrix;
39 #endif
40 
41 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_GE(4, 6, 0)
42 using KokkosSparse::spiluk_symbolic;
43 using KokkosSparse::spiluk_numeric;
44 using KokkosSparse::sptrsv_symbolic;
45 using KokkosSparse::sptrsv_solve;
46 using KokkosSparse::Experimental::SPTRSVAlgorithm;
47 using KokkosSparse::Experimental::SPILUKAlgorithm;
48 #else
49 using KokkosSparse::Experimental::spiluk_symbolic;
50 using KokkosSparse::Experimental::spiluk_numeric;
51 using KokkosSparse::Experimental::sptrsv_symbolic;
52 using KokkosSparse::Experimental::sptrsv_solve;
53 using KokkosSparse::Experimental::SPTRSVAlgorithm;
54 using KokkosSparse::Experimental::SPILUKAlgorithm;
55 #endif
56 
57 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */
58 
59 /* MatAssemblyEnd_SeqAIJKokkos() happens when we finalized nonzeros of the matrix, either after
60    we assembled the matrix on host, or after we directly produced the matrix data on device (ex., through MatMatMult).
61    In the latter case, it is important to set a_dual's sync state correctly.
62  */
63 static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A, MatAssemblyType mode)
64 {
65   Mat_SeqAIJ       *aijseq;
66   Mat_SeqAIJKokkos *aijkok;
67 
68   PetscFunctionBegin;
69   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS);
70   PetscCall(MatAssemblyEnd_SeqAIJ(A, mode));
71 
72   aijseq = static_cast<Mat_SeqAIJ *>(A->data);
73   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
74 
75   /* If aijkok does not exist, we just copy i, j to device.
76      If aijkok already exists, but the device's nonzero pattern does not match with the host's, we assume the latest data is on host.
77      In both cases, we build a new aijkok structure.
78   */
79   if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { /* aijkok might not exist yet or nonzero pattern has changed */
80     if (aijkok && aijkok->host_aij_allocated_by_kokkos) {   /* Avoid accidentally freeing much needed a,i,j on host when deleting aijkok */
81       PetscCall(PetscShmgetAllocateArray(aijkok->nrows() + 1, sizeof(PetscInt), (void **)&aijseq->i));
82       PetscCall(PetscShmgetAllocateArray(aijkok->nnz(), sizeof(PetscInt), (void **)&aijseq->j));
83       PetscCall(PetscShmgetAllocateArray(aijkok->nnz(), sizeof(PetscInt), (void **)&aijseq->a));
84       PetscCall(PetscArraycpy(aijseq->i, aijkok->i_host_data(), aijkok->nrows() + 1));
85       PetscCall(PetscArraycpy(aijseq->j, aijkok->j_host_data(), aijkok->nnz()));
86       PetscCall(PetscArraycpy(aijseq->a, aijkok->a_host_data(), aijkok->nnz()));
87       aijseq->free_a  = PETSC_TRUE;
88       aijseq->free_ij = PETSC_TRUE;
89       /* This arises from MatCreateSeqAIJKokkosWithKokkosCsrMatrix() used in MatMatMult, where
90          we have the CsrMatrix on device first and then copy to host, followed by
91          MatSetMPIAIJWithSplitSeqAIJ() with garray = NULL.
92          One could improve it by not using NULL garray.
93       */
94     }
95     delete aijkok;
96     aijkok   = new Mat_SeqAIJKokkos(A, A->rmap->n, A->cmap->n, aijseq, A->nonzerostate, PETSC_FALSE /* don't copy mat values to device */);
97     A->spptr = aijkok;
98   } else if (A->rmap->n && aijkok->diag_dual.extent(0) == 0) { // MatProduct might directly produce AIJ on device, but not the diag.
99     const PetscInt *adiag;
100     /* the a->diag is created at assmebly here because the rest of the Kokkos AIJ code assumes it always exists. This needs to be fixed since it is now only created when needed! */
101     PetscCall(MatGetDiagonalMarkers_SeqAIJ(A, &adiag, NULL));
102     MatRowMapKokkosViewHost diag_h(aijseq->diag, A->rmap->n);
103     auto                    diag_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), diag_h);
104     aijkok->diag_dual              = MatRowMapKokkosDualView(diag_d, diag_h);
105   }
106   PetscFunctionReturn(PETSC_SUCCESS);
107 }
108 
109 /* Sync CSR data to device if not yet */
110 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A)
111 {
112   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
113 
114   PetscFunctionBegin;
115   PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Can't sync factorized matrix from host to device");
116   PetscCheck(aijkok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
117   if (aijkok->a_dual.need_sync_device()) {
118     PetscCall(KokkosDualViewSyncDevice(aijkok->a_dual, PetscGetKokkosExecutionSpace()));
119     aijkok->transpose_updated = PETSC_FALSE; /* values of the transpose is out-of-date */
120     aijkok->hermitian_updated = PETSC_FALSE;
121   }
122   PetscFunctionReturn(PETSC_SUCCESS);
123 }
124 
125 /* Mark the CSR data on device as modified */
126 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosModifyDevice(Mat A)
127 {
128   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
129 
130   PetscFunctionBegin;
131   PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Not supported for factorized matries");
132   aijkok->a_dual.clear_sync_state();
133   aijkok->a_dual.modify_device();
134   aijkok->transpose_updated = PETSC_FALSE;
135   aijkok->hermitian_updated = PETSC_FALSE;
136   PetscCall(PetscObjectStateIncrease((PetscObject)A));
137   PetscFunctionReturn(PETSC_SUCCESS);
138 }
139 
140 static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A)
141 {
142   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
143   auto              exec   = PetscGetKokkosExecutionSpace();
144 
145   PetscFunctionBegin;
146   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
147   /* We do not expect one needs factors on host  */
148   PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Can't sync factorized matrix from device to host");
149   PetscCheck(aijkok, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing AIJKOK");
150   PetscCall(KokkosDualViewSyncHost(aijkok->a_dual, exec));
151   PetscFunctionReturn(PETSC_SUCCESS);
152 }
153 
154 static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A, PetscScalar *array[])
155 {
156   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
157 
158   PetscFunctionBegin;
159   /* aijkok contains valid pointers only if the host's nonzerostate matches with the device's.
160     Calling MatSeqAIJSetPreallocation() or MatSetValues() on host, where aijseq->{i,j,a} might be
161     reallocated, will lead to stale {i,j,a}_dual in aijkok. In both operations, the hosts's nonzerostate
162     must have been updated. The stale aijkok will be rebuilt during MatAssemblyEnd.
163   */
164   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
165     PetscCall(KokkosDualViewSyncHost(aijkok->a_dual, PetscGetKokkosExecutionSpace()));
166     *array = aijkok->a_dual.view_host().data();
167   } else { /* Happens when calling MatSetValues on a newly created matrix */
168     *array = static_cast<Mat_SeqAIJ *>(A->data)->a;
169   }
170   PetscFunctionReturn(PETSC_SUCCESS);
171 }
172 
173 static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJKokkos(Mat A, PetscScalar *array[])
174 {
175   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
176 
177   PetscFunctionBegin;
178   if (aijkok && A->nonzerostate == aijkok->nonzerostate) aijkok->a_dual.modify_host();
179   PetscFunctionReturn(PETSC_SUCCESS);
180 }
181 
182 static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[])
183 {
184   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
185 
186   PetscFunctionBegin;
187   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
188     PetscCall(KokkosDualViewSyncHost(aijkok->a_dual, PetscGetKokkosExecutionSpace()));
189     *array = aijkok->a_dual.view_host().data();
190   } else {
191     *array = static_cast<Mat_SeqAIJ *>(A->data)->a;
192   }
193   PetscFunctionReturn(PETSC_SUCCESS);
194 }
195 
196 static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[])
197 {
198   PetscFunctionBegin;
199   *array = NULL;
200   PetscFunctionReturn(PETSC_SUCCESS);
201 }
202 
203 static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[])
204 {
205   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
206 
207   PetscFunctionBegin;
208   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
209     *array = aijkok->a_dual.view_host().data();
210   } else { /* Ex. happens with MatZeroEntries on a preallocated but not assembled matrix */
211     *array = static_cast<Mat_SeqAIJ *>(A->data)->a;
212   }
213   PetscFunctionReturn(PETSC_SUCCESS);
214 }
215 
216 static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[])
217 {
218   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
219 
220   PetscFunctionBegin;
221   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
222     aijkok->a_dual.clear_sync_state();
223     aijkok->a_dual.modify_host();
224   }
225   PetscFunctionReturn(PETSC_SUCCESS);
226 }
227 
228 static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJKokkos(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype)
229 {
230   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
231 
232   PetscFunctionBegin;
233   PetscCheck(aijkok != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "aijkok is NULL");
234 
235   if (i) *i = aijkok->i_device_data();
236   if (j) *j = aijkok->j_device_data();
237   if (a) {
238     PetscCall(KokkosDualViewSyncDevice(aijkok->a_dual, PetscGetKokkosExecutionSpace()));
239     *a = aijkok->a_device_data();
240   }
241   if (mtype) *mtype = PETSC_MEMTYPE_KOKKOS;
242   PetscFunctionReturn(PETSC_SUCCESS);
243 }
244 
245 static PetscErrorCode MatGetCurrentMemType_SeqAIJKokkos(PETSC_UNUSED Mat A, PetscMemType *mtype)
246 {
247   PetscFunctionBegin;
248   *mtype = PETSC_MEMTYPE_KOKKOS;
249   PetscFunctionReturn(PETSC_SUCCESS);
250 }
251 
252 /*
253   Generate the sparsity pattern of a MatSeqAIJKokkos matrix's transpose on device.
254 
255   Input Parameter:
256 .  A       - the MATSEQAIJKOKKOS matrix
257 
258   Output Parameters:
259 +  perm_d - the permutation array on device, which connects Ta(i) = Aa(perm(i))
260 -  T_d    - the transpose on device, whose value array is allocated but not initialized
261 */
262 static PetscErrorCode MatSeqAIJKokkosGenerateTransposeStructure(Mat A, MatRowMapKokkosView &perm_d, KokkosCsrMatrix &T_d)
263 {
264   Mat_SeqAIJ             *aseq = static_cast<Mat_SeqAIJ *>(A->data);
265   PetscInt                nz = aseq->nz, m = A->rmap->N, n = A->cmap->n;
266   const PetscInt         *Ai = aseq->i, *Aj = aseq->j;
267   MatRowMapKokkosViewHost Ti_h(NoInit("Ti"), n + 1);
268   MatRowMapType          *Ti = Ti_h.data();
269   MatColIdxKokkosViewHost Tj_h(NoInit("Tj"), nz);
270   MatRowMapKokkosViewHost perm_h(NoInit("permutation"), nz);
271   PetscInt               *Tj   = Tj_h.data();
272   PetscInt               *perm = perm_h.data();
273   PetscInt               *offset;
274 
275   PetscFunctionBegin;
276   // Populate Ti
277   PetscCallCXX(Kokkos::deep_copy(Ti_h, 0));
278   Ti++;
279   for (PetscInt i = 0; i < nz; i++) Ti[Aj[i]]++;
280   Ti--;
281   for (PetscInt i = 0; i < n; i++) Ti[i + 1] += Ti[i];
282 
283   // Populate Tj and the permutation array
284   PetscCall(PetscCalloc1(n, &offset)); // offset in each T row to fill in its column indices
285   for (PetscInt i = 0; i < m; i++) {
286     for (PetscInt j = Ai[i]; j < Ai[i + 1]; j++) { // A's (i,j) is T's (j,i)
287       PetscInt r    = Aj[j];                       // row r of T
288       PetscInt disp = Ti[r] + offset[r];
289 
290       Tj[disp]   = i; // col i of T
291       perm[disp] = j;
292       offset[r]++;
293     }
294   }
295   PetscCall(PetscFree(offset));
296 
297   // Sort each row of T, along with the permutation array
298   for (PetscInt i = 0; i < n; i++) PetscCall(PetscSortIntWithArray(Ti[i + 1] - Ti[i], Tj + Ti[i], perm + Ti[i]));
299 
300   // Output perm and T on device
301   auto Ti_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), Ti_h);
302   auto Tj_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), Tj_h);
303   PetscCallCXX(T_d = KokkosCsrMatrix("csrmatT", n, m, nz, MatScalarKokkosView("Ta", nz), Ti_d, Tj_d));
304   PetscCallCXX(perm_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), perm_h));
305   PetscFunctionReturn(PETSC_SUCCESS);
306 }
307 
308 // Generate the transpose on device and cache it internally
309 // Note: KK transpose_matrix() does not have support symbolic/numeric transpose, so we do it on our own
310 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGenerateTranspose_Private(Mat A, KokkosCsrMatrix *csrmatT)
311 {
312   Mat_SeqAIJ       *aseq = static_cast<Mat_SeqAIJ *>(A->data);
313   Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
314   PetscInt          nz = aseq->nz, m = A->rmap->N, n = A->cmap->n;
315   KokkosCsrMatrix  &T = akok->csrmatT;
316 
317   PetscFunctionBegin;
318   PetscCheck(akok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
319   PetscCall(KokkosDualViewSyncDevice(akok->a_dual, PetscGetKokkosExecutionSpace())); // Sync A's values since we are going to access them on device
320 
321   const auto &Aa = akok->a_dual.view_device();
322 
323   if (A->symmetric == PETSC_BOOL3_TRUE) {
324     *csrmatT = akok->csrmat;
325   } else {
326     // See if we already have a cached transpose and its value is up to date
327     if (T.numRows() == n && T.numCols() == m) {  // this indicates csrmatT had been generated before, otherwise T has 0 rows/cols after construction
328       if (!akok->transpose_updated) {            // if the value is out of date, update the cached version
329         const auto &perm = akok->transpose_perm; // get the permutation array
330         auto       &Ta   = T.values;
331 
332         PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { Ta(i) = Aa(perm(i)); }));
333       }
334     } else { // Generate T of size n x m for the first time
335       MatRowMapKokkosView perm;
336 
337       PetscCall(MatSeqAIJKokkosGenerateTransposeStructure(A, perm, T));
338       akok->transpose_perm = perm; // cache the perm in this matrix for reuse
339       PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { T.values(i) = Aa(perm(i)); }));
340     }
341     akok->transpose_updated = PETSC_TRUE;
342     *csrmatT                = akok->csrmatT;
343   }
344   PetscFunctionReturn(PETSC_SUCCESS);
345 }
346 
347 // Generate the Hermitian on device and cache it internally
348 static PetscErrorCode MatSeqAIJKokkosGenerateHermitian_Private(Mat A, KokkosCsrMatrix *csrmatH)
349 {
350   Mat_SeqAIJ       *aseq = static_cast<Mat_SeqAIJ *>(A->data);
351   Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
352   PetscInt          nz = aseq->nz, m = A->rmap->N, n = A->cmap->n;
353   KokkosCsrMatrix  &T = akok->csrmatH;
354 
355   PetscFunctionBegin;
356   PetscCheck(akok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
357   PetscCall(KokkosDualViewSyncDevice(akok->a_dual, PetscGetKokkosExecutionSpace())); // Sync A's values since we are going to access them on device
358 
359   const auto &Aa = akok->a_dual.view_device();
360 
361   if (A->hermitian == PETSC_BOOL3_TRUE) {
362     *csrmatH = akok->csrmat;
363   } else {
364     // See if we already have a cached Hermitian and its value is up to date
365     if (T.numRows() == n && T.numCols() == m) {  // this indicates csrmatT had been generated before, otherwise T has 0 rows/cols after construction
366       if (!akok->hermitian_updated) {            // if the value is out of date, update the cached version
367         const auto &perm = akok->transpose_perm; // get the permutation array
368         auto       &Ta   = T.values;
369 
370         PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { Ta(i) = PetscConj(Aa(perm(i))); }));
371       }
372     } else { // Generate T of size n x m for the first time
373       MatRowMapKokkosView perm;
374 
375       PetscCall(MatSeqAIJKokkosGenerateTransposeStructure(A, perm, T));
376       akok->transpose_perm = perm; // cache the perm in this matrix for reuse
377       PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { T.values(i) = PetscConj(Aa(perm(i))); }));
378     }
379     akok->hermitian_updated = PETSC_TRUE;
380     *csrmatH                = akok->csrmatH;
381   }
382   PetscFunctionReturn(PETSC_SUCCESS);
383 }
384 
385 /* y = A x */
386 static PetscErrorCode MatMult_SeqAIJKokkos(Mat A, Vec xx, Vec yy)
387 {
388   Mat_SeqAIJKokkos          *aijkok;
389   ConstPetscScalarKokkosView xv;
390   PetscScalarKokkosView      yv;
391 
392   PetscFunctionBegin;
393   PetscCall(PetscLogGpuTimeBegin());
394   PetscCall(MatSeqAIJKokkosSyncDevice(A));
395   PetscCall(VecGetKokkosView(xx, &xv));
396   PetscCall(VecGetKokkosViewWrite(yy, &yv));
397   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
398   PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), "N", 1.0 /*alpha*/, aijkok->csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A x + beta y */
399   PetscCall(VecRestoreKokkosView(xx, &xv));
400   PetscCall(VecRestoreKokkosViewWrite(yy, &yv));
401   /* 2.0*nnz - numRows seems more accurate here but assumes there are no zero-rows. So a little sloppy here. */
402   PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz()));
403   PetscCall(PetscLogGpuTimeEnd());
404   PetscFunctionReturn(PETSC_SUCCESS);
405 }
406 
407 /* y = A^T x */
408 static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy)
409 {
410   Mat_SeqAIJKokkos          *aijkok;
411   const char                *mode;
412   ConstPetscScalarKokkosView xv;
413   PetscScalarKokkosView      yv;
414   KokkosCsrMatrix            csrmat;
415 
416   PetscFunctionBegin;
417   PetscCall(PetscLogGpuTimeBegin());
418   PetscCall(MatSeqAIJKokkosSyncDevice(A));
419   PetscCall(VecGetKokkosView(xx, &xv));
420   PetscCall(VecGetKokkosViewWrite(yy, &yv));
421   if (A->form_explicit_transpose) {
422     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat));
423     mode = "N";
424   } else {
425     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
426     csrmat = aijkok->csrmat;
427     mode   = "T";
428   }
429   PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^T x + beta y */
430   PetscCall(VecRestoreKokkosView(xx, &xv));
431   PetscCall(VecRestoreKokkosViewWrite(yy, &yv));
432   PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
433   PetscCall(PetscLogGpuTimeEnd());
434   PetscFunctionReturn(PETSC_SUCCESS);
435 }
436 
437 /* y = A^H x */
438 static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy)
439 {
440   Mat_SeqAIJKokkos          *aijkok;
441   const char                *mode;
442   ConstPetscScalarKokkosView xv;
443   PetscScalarKokkosView      yv;
444   KokkosCsrMatrix            csrmat;
445 
446   PetscFunctionBegin;
447   PetscCall(PetscLogGpuTimeBegin());
448   PetscCall(MatSeqAIJKokkosSyncDevice(A));
449   PetscCall(VecGetKokkosView(xx, &xv));
450   PetscCall(VecGetKokkosViewWrite(yy, &yv));
451   if (A->form_explicit_transpose) {
452     PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat));
453     mode = "N";
454   } else {
455     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
456     csrmat = aijkok->csrmat;
457     mode   = "C";
458   }
459   PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^H x + beta y */
460   PetscCall(VecRestoreKokkosView(xx, &xv));
461   PetscCall(VecRestoreKokkosViewWrite(yy, &yv));
462   PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
463   PetscCall(PetscLogGpuTimeEnd());
464   PetscFunctionReturn(PETSC_SUCCESS);
465 }
466 
467 /* z = A x + y */
468 static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz)
469 {
470   Mat_SeqAIJKokkos          *aijkok;
471   ConstPetscScalarKokkosView xv;
472   PetscScalarKokkosView      zv;
473 
474   PetscFunctionBegin;
475   PetscCall(PetscLogGpuTimeBegin());
476   PetscCall(MatSeqAIJKokkosSyncDevice(A));
477   if (zz != yy) PetscCall(VecCopy(yy, zz)); // depending on yy's sync flags, zz might get its latest data on host
478   PetscCall(VecGetKokkosView(xx, &xv));
479   PetscCall(VecGetKokkosView(zz, &zv)); // do after VecCopy(yy, zz) to get the latest data on device
480   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
481   PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), "N", 1.0 /*alpha*/, aijkok->csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A x + beta z */
482   PetscCall(VecRestoreKokkosView(xx, &xv));
483   PetscCall(VecRestoreKokkosView(zz, &zv));
484   PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz()));
485   PetscCall(PetscLogGpuTimeEnd());
486   PetscFunctionReturn(PETSC_SUCCESS);
487 }
488 
489 /* z = A^T x + y */
490 static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz)
491 {
492   Mat_SeqAIJKokkos          *aijkok;
493   const char                *mode;
494   ConstPetscScalarKokkosView xv;
495   PetscScalarKokkosView      zv;
496   KokkosCsrMatrix            csrmat;
497 
498   PetscFunctionBegin;
499   PetscCall(PetscLogGpuTimeBegin());
500   PetscCall(MatSeqAIJKokkosSyncDevice(A));
501   if (zz != yy) PetscCall(VecCopy(yy, zz));
502   PetscCall(VecGetKokkosView(xx, &xv));
503   PetscCall(VecGetKokkosView(zz, &zv));
504   if (A->form_explicit_transpose) {
505     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat));
506     mode = "N";
507   } else {
508     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
509     csrmat = aijkok->csrmat;
510     mode   = "T";
511   }
512   PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^T x + beta z */
513   PetscCall(VecRestoreKokkosView(xx, &xv));
514   PetscCall(VecRestoreKokkosView(zz, &zv));
515   PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
516   PetscCall(PetscLogGpuTimeEnd());
517   PetscFunctionReturn(PETSC_SUCCESS);
518 }
519 
520 /* z = A^H x + y */
521 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz)
522 {
523   Mat_SeqAIJKokkos          *aijkok;
524   const char                *mode;
525   ConstPetscScalarKokkosView xv;
526   PetscScalarKokkosView      zv;
527   KokkosCsrMatrix            csrmat;
528 
529   PetscFunctionBegin;
530   PetscCall(PetscLogGpuTimeBegin());
531   PetscCall(MatSeqAIJKokkosSyncDevice(A));
532   if (zz != yy) PetscCall(VecCopy(yy, zz));
533   PetscCall(VecGetKokkosView(xx, &xv));
534   PetscCall(VecGetKokkosView(zz, &zv));
535   if (A->form_explicit_transpose) {
536     PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat));
537     mode = "N";
538   } else {
539     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
540     csrmat = aijkok->csrmat;
541     mode   = "C";
542   }
543   PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^H x + beta z */
544   PetscCall(VecRestoreKokkosView(xx, &xv));
545   PetscCall(VecRestoreKokkosView(zz, &zv));
546   PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
547   PetscCall(PetscLogGpuTimeEnd());
548   PetscFunctionReturn(PETSC_SUCCESS);
549 }
550 
551 static PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A, MatOption op, PetscBool flg)
552 {
553   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
554 
555   PetscFunctionBegin;
556   switch (op) {
557   case MAT_FORM_EXPLICIT_TRANSPOSE:
558     /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
559     if (A->form_explicit_transpose && !flg && aijkok) PetscCall(aijkok->DestroyMatTranspose());
560     A->form_explicit_transpose = flg;
561     break;
562   default:
563     PetscCall(MatSetOption_SeqAIJ(A, op, flg));
564     break;
565   }
566   PetscFunctionReturn(PETSC_SUCCESS);
567 }
568 
569 /* Depending on reuse, either build a new mat, or use the existing mat */
570 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat *newmat)
571 {
572   Mat_SeqAIJ *aseq;
573 
574   PetscFunctionBegin;
575   PetscCall(PetscKokkosInitializeCheck());
576   if (reuse == MAT_INITIAL_MATRIX) { /* Build a brand new mat */
577     PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat));
578     PetscCall(MatSetType(*newmat, mtype));
579   } else if (reuse == MAT_REUSE_MATRIX) {                 /* Reuse the mat created before */
580     PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN)); /* newmat is already a SeqAIJKokkos */
581   } else if (reuse == MAT_INPLACE_MATRIX) {               /* newmat is A */
582     PetscCheck(A == *newmat, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "A != *newmat with MAT_INPLACE_MATRIX");
583     PetscCall(PetscFree(A->defaultvectype));
584     PetscCall(PetscStrallocpy(VECKOKKOS, &A->defaultvectype)); /* Allocate and copy the string */
585     PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATSEQAIJKOKKOS));
586     PetscCall(MatSetOps_SeqAIJKokkos(A));
587     aseq = static_cast<Mat_SeqAIJ *>(A->data);
588     if (A->assembled) { /* Copy i, j (but not values) to device for an assembled matrix if not yet */
589       PetscCheck(!A->spptr, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Expect NULL (Mat_SeqAIJKokkos*)A->spptr");
590       A->spptr = new Mat_SeqAIJKokkos(A, A->rmap->n, A->cmap->n, aseq, A->nonzerostate, PETSC_FALSE);
591     }
592   }
593   PetscFunctionReturn(PETSC_SUCCESS);
594 }
595 
596 /* MatDuplicate always creates a new matrix. MatDuplicate can be called either on an assembled matrix or
597    an unassembled matrix, even though MAT_COPY_VALUES is not allowed for unassembled matrix.
598  */
599 static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A, MatDuplicateOption dupOption, Mat *B)
600 {
601   Mat_SeqAIJ       *bseq;
602   Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr), *bkok;
603   Mat               mat;
604 
605   PetscFunctionBegin;
606   /* Do not copy values on host as A's latest values might be on device. We don't want to do sync blindly */
607   PetscCall(MatDuplicate_SeqAIJ(A, MAT_DO_NOT_COPY_VALUES, B));
608   mat = *B;
609   if (A->assembled) {
610     bseq = static_cast<Mat_SeqAIJ *>(mat->data);
611     bkok = new Mat_SeqAIJKokkos(mat, mat->rmap->n, mat->cmap->n, bseq, mat->nonzerostate, PETSC_FALSE);
612     bkok->a_dual.clear_sync_state(); /* Clear B's sync state as it will be decided below */
613     /* Now copy values to B if needed */
614     if (dupOption == MAT_COPY_VALUES) {
615       if (akok->a_dual.need_sync_device()) {
616         Kokkos::deep_copy(bkok->a_dual.view_host(), akok->a_dual.view_host());
617         bkok->a_dual.modify_host();
618       } else { /* If device has the latest data, we only copy data on device */
619         Kokkos::deep_copy(bkok->a_dual.view_device(), akok->a_dual.view_device());
620         bkok->a_dual.modify_device();
621       }
622     } else { /* MAT_DO_NOT_COPY_VALUES or MAT_SHARE_NONZERO_PATTERN. B's values should be zeroed */
623       /* B's values on host should be already zeroed by MatDuplicate_SeqAIJ() */
624       bkok->a_dual.modify_host();
625     }
626     mat->spptr = bkok;
627   }
628 
629   PetscCall(PetscFree(mat->defaultvectype));
630   PetscCall(PetscStrallocpy(VECKOKKOS, &mat->defaultvectype)); /* Allocate and copy the string */
631   PetscCall(PetscObjectChangeTypeName((PetscObject)mat, MATSEQAIJKOKKOS));
632   PetscCall(MatSetOps_SeqAIJKokkos(mat));
633   PetscFunctionReturn(PETSC_SUCCESS);
634 }
635 
636 static PetscErrorCode MatTranspose_SeqAIJKokkos(Mat A, MatReuse reuse, Mat *B)
637 {
638   Mat               At;
639   KokkosCsrMatrix   internT;
640   Mat_SeqAIJKokkos *atkok, *bkok;
641 
642   PetscFunctionBegin;
643   if (reuse == MAT_REUSE_MATRIX) PetscCall(MatTransposeCheckNonzeroState_Private(A, *B));
644   PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &internT)); /* Generate a transpose internally */
645   if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_INPLACE_MATRIX) {
646     /* Deep copy internT, as we want to isolate the internal transpose */
647     PetscCallCXX(atkok = new Mat_SeqAIJKokkos(KokkosCsrMatrix("csrmat", internT)));
648     PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PetscObjectComm((PetscObject)A), atkok, &At));
649     if (reuse == MAT_INITIAL_MATRIX) *B = At;
650     else PetscCall(MatHeaderReplace(A, &At)); /* Replace A with At inplace */
651   } else {                                    /* MAT_REUSE_MATRIX, just need to copy values to B on device */
652     if ((*B)->assembled) {
653       bkok = static_cast<Mat_SeqAIJKokkos *>((*B)->spptr);
654       PetscCallCXX(Kokkos::deep_copy(bkok->a_dual.view_device(), internT.values));
655       PetscCall(MatSeqAIJKokkosModifyDevice(*B));
656     } else if ((*B)->preallocated) { /* It is ok for B to be only preallocated, as needed in MatTranspose_MPIAIJ */
657       Mat_SeqAIJ             *bseq = static_cast<Mat_SeqAIJ *>((*B)->data);
658       MatScalarKokkosViewHost a_h(bseq->a, internT.nnz()); /* bseq->nz = 0 if unassembled */
659       MatColIdxKokkosViewHost j_h(bseq->j, internT.nnz());
660       PetscCallCXX(Kokkos::deep_copy(a_h, internT.values));
661       PetscCallCXX(Kokkos::deep_copy(j_h, internT.graph.entries));
662     } else SETERRQ(PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "B must be assembled or preallocated");
663   }
664   PetscFunctionReturn(PETSC_SUCCESS);
665 }
666 
667 static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A)
668 {
669   Mat_SeqAIJKokkos *aijkok;
670 
671   PetscFunctionBegin;
672   if (A->factortype == MAT_FACTOR_NONE) {
673     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
674     delete aijkok;
675   } else {
676     delete static_cast<Mat_SeqAIJKokkosTriFactors *>(A->spptr);
677   }
678   A->spptr = NULL;
679   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatFactorGetSolverType_C", NULL));
680   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
681   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
682 #if defined(PETSC_HAVE_HYPRE)
683   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijkokkos_hypre_C", NULL));
684 #endif
685   PetscCall(MatDestroy_SeqAIJ(A));
686   PetscFunctionReturn(PETSC_SUCCESS);
687 }
688 
689 /*MC
690    MATSEQAIJKOKKOS - MATAIJKOKKOS = "(seq)aijkokkos" - A matrix type to be used for sparse matrices with Kokkos
691 
692    A matrix type using Kokkos-Kernels CrsMatrix type for portability across different device types
693 
694    Options Database Key:
695 .  -mat_type aijkokkos - sets the matrix type to `MATSEQAIJKOKKOS` during a call to `MatSetFromOptions()`
696 
697   Level: beginner
698 
699 .seealso: [](ch_matrices), `Mat`, `MatCreateSeqAIJKokkos()`, `MATMPIAIJKOKKOS`
700 M*/
701 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A)
702 {
703   PetscFunctionBegin;
704   PetscCall(PetscKokkosInitializeCheck());
705   PetscCall(MatCreate_SeqAIJ(A));
706   PetscCall(MatConvert_SeqAIJ_SeqAIJKokkos(A, MATSEQAIJKOKKOS, MAT_INPLACE_MATRIX, &A));
707   PetscFunctionReturn(PETSC_SUCCESS);
708 }
709 
710 /* Merge A, B into a matrix C. A is put before B. C's size would be A->rmap->n by (A->cmap->n + B->cmap->n) */
711 PetscErrorCode MatSeqAIJKokkosMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C)
712 {
713   Mat_SeqAIJ         *a, *b;
714   Mat_SeqAIJKokkos   *akok, *bkok, *ckok;
715   MatScalarKokkosView aa, ba, ca;
716   MatRowMapKokkosView ai, bi, ci;
717   MatColIdxKokkosView aj, bj, cj;
718   PetscInt            m, n, nnz, aN;
719 
720   PetscFunctionBegin;
721   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
722   PetscValidHeaderSpecific(B, MAT_CLASSID, 2);
723   PetscAssertPointer(C, 4);
724   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
725   PetscCheckTypeName(B, MATSEQAIJKOKKOS);
726   PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n);
727   PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported");
728 
729   PetscCall(MatSeqAIJKokkosSyncDevice(A));
730   PetscCall(MatSeqAIJKokkosSyncDevice(B));
731   a    = static_cast<Mat_SeqAIJ *>(A->data);
732   b    = static_cast<Mat_SeqAIJ *>(B->data);
733   akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
734   bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
735   aa   = akok->a_dual.view_device();
736   ai   = akok->i_dual.view_device();
737   ba   = bkok->a_dual.view_device();
738   bi   = bkok->i_dual.view_device();
739   m    = A->rmap->n; /* M, N and nnz of C */
740   n    = A->cmap->n + B->cmap->n;
741   nnz  = a->nz + b->nz;
742   aN   = A->cmap->n; /* N of A */
743   if (reuse == MAT_INITIAL_MATRIX) {
744     aj      = akok->j_dual.view_device();
745     bj      = bkok->j_dual.view_device();
746     auto ca = MatScalarKokkosView("a", aa.extent(0) + ba.extent(0));
747     auto ci = MatRowMapKokkosView("i", ai.extent(0));
748     auto cj = MatColIdxKokkosView("j", aj.extent(0) + bj.extent(0));
749 
750     /* Concatenate A and B in parallel using Kokkos hierarchical parallelism */
751     Kokkos::parallel_for(
752       Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
753         PetscInt i       = t.league_rank(); /* row i */
754         PetscInt coffset = ai(i) + bi(i), alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i);
755 
756         Kokkos::single(Kokkos::PerTeam(t), [=]() { /* this side effect only happens once per whole team */
757                                                    ci(i) = coffset;
758                                                    if (i == m - 1) ci(m) = ai(m) + bi(m);
759         });
760 
761         Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) {
762           if (k < alen) {
763             ca(coffset + k) = aa(ai(i) + k);
764             cj(coffset + k) = aj(ai(i) + k);
765           } else {
766             ca(coffset + k) = ba(bi(i) + k - alen);
767             cj(coffset + k) = bj(bi(i) + k - alen) + aN; /* Entries in B get new column indices in C */
768           }
769         });
770       });
771     PetscCallCXX(ckok = new Mat_SeqAIJKokkos(m, n, nnz, ci, cj, ca));
772     PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, ckok, C));
773   } else if (reuse == MAT_REUSE_MATRIX) {
774     PetscValidHeaderSpecific(*C, MAT_CLASSID, 4);
775     PetscCheckTypeName(*C, MATSEQAIJKOKKOS);
776     ckok = static_cast<Mat_SeqAIJKokkos *>((*C)->spptr);
777     ca   = ckok->a_dual.view_device();
778     ci   = ckok->i_dual.view_device();
779 
780     Kokkos::parallel_for(
781       Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
782         PetscInt i    = t.league_rank(); /* row i */
783         PetscInt alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i);
784         Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) {
785           if (k < alen) ca(ci(i) + k) = aa(ai(i) + k);
786           else ca(ci(i) + k) = ba(bi(i) + k - alen);
787         });
788       });
789     PetscCall(MatSeqAIJKokkosModifyDevice(*C));
790   }
791   PetscFunctionReturn(PETSC_SUCCESS);
792 }
793 
794 static PetscErrorCode MatProductCtxDestroy_SeqAIJKokkos(void **pdata)
795 {
796   PetscFunctionBegin;
797   delete *reinterpret_cast<MatProductCtx_SeqAIJKokkos **>(pdata);
798   PetscFunctionReturn(PETSC_SUCCESS);
799 }
800 
801 static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C)
802 {
803   Mat_Product                *product = C->product;
804   Mat                         A, B;
805   bool                        transA, transB; /* use bool, since KK needs this type */
806   Mat_SeqAIJKokkos           *akok, *bkok, *ckok;
807   Mat_SeqAIJ                 *c;
808   MatProductCtx_SeqAIJKokkos *pdata;
809   KokkosCsrMatrix             csrmatA, csrmatB;
810 
811   PetscFunctionBegin;
812   MatCheckProduct(C, 1);
813   PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Product data empty");
814   pdata = static_cast<MatProductCtx_SeqAIJKokkos *>(C->product->data);
815 
816   // See if numeric has already been done in symbolic (e.g., user calls MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C)).
817   // If yes, skip the numeric, but reset the flag so that next time when user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C),
818   // we still do numeric.
819   if (pdata->reusesym) { // numeric reuses results from symbolic
820     pdata->reusesym = PETSC_FALSE;
821     PetscFunctionReturn(PETSC_SUCCESS);
822   }
823 
824   switch (product->type) {
825   case MATPRODUCT_AB:
826     transA = false;
827     transB = false;
828     break;
829   case MATPRODUCT_AtB:
830     transA = true;
831     transB = false;
832     break;
833   case MATPRODUCT_ABt:
834     transA = false;
835     transB = true;
836     break;
837   default:
838     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]);
839   }
840 
841   A = product->A;
842   B = product->B;
843   PetscCall(MatSeqAIJKokkosSyncDevice(A));
844   PetscCall(MatSeqAIJKokkosSyncDevice(B));
845   akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
846   bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
847   ckok = static_cast<Mat_SeqAIJKokkos *>(C->spptr);
848 
849   PetscCheck(ckok, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Device data structure spptr is empty");
850 
851   csrmatA = akok->csrmat;
852   csrmatB = bkok->csrmat;
853 
854   /* TODO: Once KK spgemm implements transpose, we can get rid of the explicit transpose here */
855   if (transA) {
856     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA));
857     transA = false;
858   }
859 
860   if (transB) {
861     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB));
862     transB = false;
863   }
864   PetscCall(PetscLogGpuTimeBegin());
865   PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, csrmatA, transA, csrmatB, transB, ckok->csrmat));
866 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(4, 0, 0)
867   auto spgemmHandle = pdata->kh.get_spgemm_handle();
868   if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(ckok->csrmat)); /* without sort, mat_tests-ex62_14_seqaijkokkos fails */
869 #endif
870 
871   PetscCall(PetscLogGpuTimeEnd());
872   PetscCall(MatSeqAIJKokkosModifyDevice(C));
873   /* shorter version of MatAssemblyEnd_SeqAIJ */
874   c = (Mat_SeqAIJ *)C->data;
875   PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded, %" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz));
876   PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n"));
877   PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax));
878   c->reallocs         = 0;
879   C->info.mallocs     = 0;
880   C->info.nz_unneeded = 0;
881   C->assembled = C->was_assembled = PETSC_TRUE;
882   C->num_ass++;
883   PetscFunctionReturn(PETSC_SUCCESS);
884 }
885 
886 static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C)
887 {
888   Mat_Product                *product = C->product;
889   MatProductType              ptype;
890   Mat                         A, B;
891   bool                        transA, transB;
892   Mat_SeqAIJKokkos           *akok, *bkok, *ckok;
893   MatProductCtx_SeqAIJKokkos *pdata;
894   MPI_Comm                    comm;
895   KokkosCsrMatrix             csrmatA, csrmatB, csrmatC;
896 
897   PetscFunctionBegin;
898   MatCheckProduct(C, 1);
899   PetscCall(PetscObjectGetComm((PetscObject)C, &comm));
900   PetscCheck(!product->data, comm, PETSC_ERR_PLIB, "Product data not empty");
901   A = product->A;
902   B = product->B;
903   PetscCall(MatSeqAIJKokkosSyncDevice(A));
904   PetscCall(MatSeqAIJKokkosSyncDevice(B));
905   akok    = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
906   bkok    = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
907   csrmatA = akok->csrmat;
908   csrmatB = bkok->csrmat;
909 
910   ptype = product->type;
911   // Take advantage of the symmetry if true
912   if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
913     ptype                                          = MATPRODUCT_AB;
914     product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE;
915   }
916   if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
917     ptype                                          = MATPRODUCT_AB;
918     product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE;
919   }
920 
921   switch (ptype) {
922   case MATPRODUCT_AB:
923     transA = false;
924     transB = false;
925     PetscCall(MatSetBlockSizesFromMats(C, A, B));
926     break;
927   case MATPRODUCT_AtB:
928     transA = true;
929     transB = false;
930     if (A->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, A->cmap->bs));
931     if (B->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->cmap->bs));
932     break;
933   case MATPRODUCT_ABt:
934     transA = false;
935     transB = true;
936     if (A->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, A->rmap->bs));
937     if (B->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->rmap->bs));
938     break;
939   default:
940     SETERRQ(comm, PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]);
941   }
942   PetscCallCXX(product->data = pdata = new MatProductCtx_SeqAIJKokkos());
943   pdata->reusesym = product->api_user;
944 
945   /* TODO: add command line options to select spgemm algorithms */
946   auto spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_DEFAULT; /* default alg is TPL if enabled, otherwise KK */
947 
948   /* CUDA-10.2's spgemm has bugs. We prefer the SpGEMMreuse APIs introduced in cuda-11.4 */
949 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
950   #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
951   spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_KK;
952   #endif
953 #endif
954   PetscCallCXX(pdata->kh.create_spgemm_handle(spgemm_alg));
955 
956   PetscCall(PetscLogGpuTimeBegin());
957   /* TODO: Get rid of the explicit transpose once KK-spgemm implements the transpose option */
958   if (transA) {
959     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA));
960     transA = false;
961   }
962 
963   if (transB) {
964     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB));
965     transB = false;
966   }
967 
968   PetscCallCXX(KokkosSparse::spgemm_symbolic(pdata->kh, csrmatA, transA, csrmatB, transB, csrmatC));
969   /* spgemm_symbolic() only populates C's rowmap, but not C's column indices.
970     So we have to do a fake spgemm_numeric() here to get csrmatC.j_d setup, before
971     calling new Mat_SeqAIJKokkos().
972     TODO: Remove the fake spgemm_numeric() after KK fixed this problem.
973   */
974   PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, csrmatA, transA, csrmatB, transB, csrmatC));
975 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(4, 0, 0)
976   /* Query if KK outputs a sorted matrix. If not, we need to sort it */
977   auto spgemmHandle = pdata->kh.get_spgemm_handle();
978   if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(csrmatC)); /* sort_option defaults to -1 in KK!*/
979 #endif
980   PetscCall(PetscLogGpuTimeEnd());
981 
982   PetscCallCXX(ckok = new Mat_SeqAIJKokkos(csrmatC));
983   PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(C, ckok));
984   C->product->destroy = MatProductCtxDestroy_SeqAIJKokkos;
985   PetscFunctionReturn(PETSC_SUCCESS);
986 }
987 
988 /* handles sparse matrix matrix ops */
989 static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat)
990 {
991   Mat_Product *product = mat->product;
992   PetscBool    Biskok = PETSC_FALSE, Ciskok = PETSC_TRUE;
993 
994   PetscFunctionBegin;
995   MatCheckProduct(mat, 1);
996   PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJKOKKOS, &Biskok));
997   if (product->type == MATPRODUCT_ABC) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJKOKKOS, &Ciskok));
998   if (Biskok && Ciskok) {
999     switch (product->type) {
1000     case MATPRODUCT_AB:
1001     case MATPRODUCT_AtB:
1002     case MATPRODUCT_ABt:
1003       mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos;
1004       break;
1005     case MATPRODUCT_PtAP:
1006     case MATPRODUCT_RARt:
1007     case MATPRODUCT_ABC:
1008       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
1009       break;
1010     default:
1011       break;
1012     }
1013   } else { /* fallback for AIJ */
1014     PetscCall(MatProductSetFromOptions_SeqAIJ(mat));
1015   }
1016   PetscFunctionReturn(PETSC_SUCCESS);
1017 }
1018 
1019 static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a)
1020 {
1021   Mat_SeqAIJKokkos *aijkok;
1022 
1023   PetscFunctionBegin;
1024   PetscCall(PetscLogGpuTimeBegin());
1025   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1026   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1027   KokkosBlas::scal(PetscGetKokkosExecutionSpace(), aijkok->a_dual.view_device(), a, aijkok->a_dual.view_device());
1028   PetscCall(MatSeqAIJKokkosModifyDevice(A));
1029   PetscCall(PetscLogGpuFlops(aijkok->a_dual.extent(0)));
1030   PetscCall(PetscLogGpuTimeEnd());
1031   PetscFunctionReturn(PETSC_SUCCESS);
1032 }
1033 
1034 // add a to A's diagonal (if A is square) or main diagonal (if A is rectangular)
1035 static PetscErrorCode MatShift_SeqAIJKokkos(Mat A, PetscScalar a)
1036 {
1037   Mat_SeqAIJ *aijseq = static_cast<Mat_SeqAIJ *>(A->data);
1038 
1039   PetscFunctionBegin;
1040   if (A->assembled && aijseq->diagDense) { // no missing diagonals
1041     PetscInt n = PetscMin(A->rmap->n, A->cmap->n);
1042 
1043     PetscCall(PetscLogGpuTimeBegin());
1044     PetscCall(MatSeqAIJKokkosSyncDevice(A));
1045     const auto  aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1046     const auto &Aa     = aijkok->a_dual.view_device();
1047     const auto &Adiag  = aijkok->diag_dual.view_device();
1048     PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) { Aa(Adiag(i)) += a; }));
1049     PetscCall(MatSeqAIJKokkosModifyDevice(A));
1050     PetscCall(PetscLogGpuFlops(n));
1051     PetscCall(PetscLogGpuTimeEnd());
1052   } else { // need reassembly, very slow!
1053     PetscCall(MatShift_Basic(A, a));
1054   }
1055   PetscFunctionReturn(PETSC_SUCCESS);
1056 }
1057 
1058 static PetscErrorCode MatDiagonalSet_SeqAIJKokkos(Mat Y, Vec D, InsertMode is)
1059 {
1060   Mat_SeqAIJ *aijseq = static_cast<Mat_SeqAIJ *>(Y->data);
1061 
1062   PetscFunctionBegin;
1063   if (Y->assembled && aijseq->diagDense) { // no missing diagonals
1064     ConstPetscScalarKokkosView dv;
1065     PetscInt                   n, nv;
1066 
1067     PetscCall(PetscLogGpuTimeBegin());
1068     PetscCall(MatSeqAIJKokkosSyncDevice(Y));
1069     PetscCall(VecGetKokkosView(D, &dv));
1070     PetscCall(VecGetLocalSize(D, &nv));
1071     n = PetscMin(Y->rmap->n, Y->cmap->n);
1072     PetscCheck(n == nv, PetscObjectComm((PetscObject)Y), PETSC_ERR_ARG_SIZ, "Matrix size and vector size do not match");
1073 
1074     const auto  aijkok = static_cast<Mat_SeqAIJKokkos *>(Y->spptr);
1075     const auto &Aa     = aijkok->a_dual.view_device();
1076     const auto &Adiag  = aijkok->diag_dual.view_device();
1077     PetscCallCXX(Kokkos::parallel_for(
1078       Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) {
1079         if (is == INSERT_VALUES) Aa(Adiag(i)) = dv(i);
1080         else Aa(Adiag(i)) += dv(i);
1081       }));
1082     PetscCall(VecRestoreKokkosView(D, &dv));
1083     PetscCall(MatSeqAIJKokkosModifyDevice(Y));
1084     PetscCall(PetscLogGpuFlops(n));
1085     PetscCall(PetscLogGpuTimeEnd());
1086   } else { // need reassembly, very slow!
1087     PetscCall(MatDiagonalSet_Default(Y, D, is));
1088   }
1089   PetscFunctionReturn(PETSC_SUCCESS);
1090 }
1091 
1092 static PetscErrorCode MatDiagonalScale_SeqAIJKokkos(Mat A, Vec ll, Vec rr)
1093 {
1094   Mat_SeqAIJ                *aijseq = static_cast<Mat_SeqAIJ *>(A->data);
1095   PetscInt                   m = A->rmap->n, n = A->cmap->n, nz = aijseq->nz;
1096   ConstPetscScalarKokkosView lv, rv;
1097 
1098   PetscFunctionBegin;
1099   PetscCall(PetscLogGpuTimeBegin());
1100   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1101   const auto  aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1102   const auto &Aa     = aijkok->a_dual.view_device();
1103   const auto &Ai     = aijkok->i_dual.view_device();
1104   const auto &Aj     = aijkok->j_dual.view_device();
1105   if (ll) {
1106     PetscCall(VecGetLocalSize(ll, &m));
1107     PetscCheck(m == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling vector wrong length");
1108     PetscCall(VecGetKokkosView(ll, &lv));
1109     PetscCallCXX(Kokkos::parallel_for( // for each row
1110       Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
1111         PetscInt i   = t.league_rank(); // row i
1112         PetscInt len = Ai(i + 1) - Ai(i);
1113         // scale entries on the row
1114         Kokkos::parallel_for(Kokkos::TeamThreadRange(t, len), [&](PetscInt j) { Aa(Ai(i) + j) *= lv(i); });
1115       }));
1116     PetscCall(VecRestoreKokkosView(ll, &lv));
1117     PetscCall(PetscLogGpuFlops(nz));
1118   }
1119   if (rr) {
1120     PetscCall(VecGetLocalSize(rr, &n));
1121     PetscCheck(n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Right scaling vector wrong length");
1122     PetscCall(VecGetKokkosView(rr, &rv));
1123     PetscCallCXX(Kokkos::parallel_for( // for each nonzero
1124       Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt k) { Aa(k) *= rv(Aj(k)); }));
1125     PetscCall(VecRestoreKokkosView(rr, &lv));
1126     PetscCall(PetscLogGpuFlops(nz));
1127   }
1128   PetscCall(MatSeqAIJKokkosModifyDevice(A));
1129   PetscCall(PetscLogGpuTimeEnd());
1130   PetscFunctionReturn(PETSC_SUCCESS);
1131 }
1132 
1133 static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A)
1134 {
1135   Mat_SeqAIJKokkos *aijkok;
1136 
1137   PetscFunctionBegin;
1138   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1139   if (aijkok) { /* Only zero the device if data is already there */
1140     KokkosBlas::fill(PetscGetKokkosExecutionSpace(), aijkok->a_dual.view_device(), 0.0);
1141     PetscCall(MatSeqAIJKokkosModifyDevice(A));
1142   } else { /* Might be preallocated but not assembled */
1143     PetscCall(MatZeroEntries_SeqAIJ(A));
1144   }
1145   PetscFunctionReturn(PETSC_SUCCESS);
1146 }
1147 
1148 static PetscErrorCode MatGetDiagonal_SeqAIJKokkos(Mat A, Vec x)
1149 {
1150   Mat_SeqAIJKokkos     *aijkok;
1151   PetscInt              n;
1152   PetscScalarKokkosView xv;
1153 
1154   PetscFunctionBegin;
1155   PetscCall(VecGetLocalSize(x, &n));
1156   PetscCheck(n == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector");
1157   PetscCheck(A->factortype == MAT_FACTOR_NONE, PETSC_COMM_SELF, PETSC_ERR_SUP, "MatGetDiagonal_SeqAIJKokkos not supported on factored matrices");
1158 
1159   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1160   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1161 
1162   const auto &Aa    = aijkok->a_dual.view_device();
1163   const auto &Ai    = aijkok->i_dual.view_device();
1164   const auto &Adiag = aijkok->diag_dual.view_device();
1165 
1166   PetscCall(VecGetKokkosViewWrite(x, &xv));
1167   Kokkos::parallel_for(
1168     Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) {
1169       if (Adiag(i) < Ai(i + 1)) xv(i) = Aa(Adiag(i));
1170       else xv(i) = 0;
1171     });
1172   PetscCall(VecRestoreKokkosViewWrite(x, &xv));
1173   PetscFunctionReturn(PETSC_SUCCESS);
1174 }
1175 
1176 /* Get a Kokkos View from a mat of type MatSeqAIJKokkos */
1177 PetscErrorCode MatSeqAIJGetKokkosView(Mat A, ConstMatScalarKokkosView *kv)
1178 {
1179   Mat_SeqAIJKokkos *aijkok;
1180 
1181   PetscFunctionBegin;
1182   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1183   PetscAssertPointer(kv, 2);
1184   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1185   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1186   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1187   *kv    = aijkok->a_dual.view_device();
1188   PetscFunctionReturn(PETSC_SUCCESS);
1189 }
1190 
1191 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, ConstMatScalarKokkosView *kv)
1192 {
1193   PetscFunctionBegin;
1194   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1195   PetscAssertPointer(kv, 2);
1196   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1197   PetscFunctionReturn(PETSC_SUCCESS);
1198 }
1199 
1200 PetscErrorCode MatSeqAIJGetKokkosView(Mat A, MatScalarKokkosView *kv)
1201 {
1202   Mat_SeqAIJKokkos *aijkok;
1203 
1204   PetscFunctionBegin;
1205   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1206   PetscAssertPointer(kv, 2);
1207   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1208   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1209   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1210   *kv    = aijkok->a_dual.view_device();
1211   PetscFunctionReturn(PETSC_SUCCESS);
1212 }
1213 
1214 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, MatScalarKokkosView *kv)
1215 {
1216   PetscFunctionBegin;
1217   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1218   PetscAssertPointer(kv, 2);
1219   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1220   PetscCall(MatSeqAIJKokkosModifyDevice(A));
1221   PetscFunctionReturn(PETSC_SUCCESS);
1222 }
1223 
1224 PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A, MatScalarKokkosView *kv)
1225 {
1226   Mat_SeqAIJKokkos *aijkok;
1227 
1228   PetscFunctionBegin;
1229   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1230   PetscAssertPointer(kv, 2);
1231   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1232   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1233   *kv    = aijkok->a_dual.view_device();
1234   PetscFunctionReturn(PETSC_SUCCESS);
1235 }
1236 
1237 PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A, MatScalarKokkosView *kv)
1238 {
1239   PetscFunctionBegin;
1240   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1241   PetscAssertPointer(kv, 2);
1242   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1243   PetscCall(MatSeqAIJKokkosModifyDevice(A));
1244   PetscFunctionReturn(PETSC_SUCCESS);
1245 }
1246 
1247 PetscErrorCode MatCreateSeqAIJKokkosWithKokkosViews(MPI_Comm comm, PetscInt m, PetscInt n, Kokkos::View<PetscInt *> &i_d, Kokkos::View<PetscInt *> &j_d, Kokkos::View<PetscScalar *> &a_d, Mat *A)
1248 {
1249   Mat_SeqAIJKokkos *akok;
1250 
1251   PetscFunctionBegin;
1252   PetscCallCXX(akok = new Mat_SeqAIJKokkos(m, n, j_d.extent(0), i_d, j_d, a_d));
1253   PetscCall(MatCreate(comm, A));
1254   PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok));
1255   PetscFunctionReturn(PETSC_SUCCESS);
1256 }
1257 
1258 /* Computes Y += alpha X */
1259 static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y, PetscScalar alpha, Mat X, MatStructure pattern)
1260 {
1261   Mat_SeqAIJ              *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data;
1262   Mat_SeqAIJKokkos        *xkok, *ykok, *zkok;
1263   ConstMatScalarKokkosView Xa;
1264   MatScalarKokkosView      Ya;
1265   auto                     exec = PetscGetKokkosExecutionSpace();
1266 
1267   PetscFunctionBegin;
1268   PetscCheckTypeName(Y, MATSEQAIJKOKKOS);
1269   PetscCheckTypeName(X, MATSEQAIJKOKKOS);
1270   PetscCall(MatSeqAIJKokkosSyncDevice(Y));
1271   PetscCall(MatSeqAIJKokkosSyncDevice(X));
1272   PetscCall(PetscLogGpuTimeBegin());
1273 
1274   if (pattern != SAME_NONZERO_PATTERN && x->nz == y->nz) {
1275     PetscBool e;
1276     PetscCall(PetscArraycmp(x->i, y->i, Y->rmap->n + 1, &e));
1277     if (e) {
1278       PetscCall(PetscArraycmp(x->j, y->j, y->nz, &e));
1279       if (e) pattern = SAME_NONZERO_PATTERN;
1280     }
1281   }
1282 
1283   /* cusparseDcsrgeam2() computes C = alpha A + beta B. If one knew sparsity pattern of C, one can skip
1284     cusparseScsrgeam2_bufferSizeExt() / cusparseXcsrgeam2Nnz(), and directly call cusparseScsrgeam2().
1285     If X is SUBSET_NONZERO_PATTERN of Y, we could take advantage of this cusparse feature. However,
1286     KokkosSparse::spadd(alpha,A,beta,B,C) has symbolic and numeric phases, MatAXPY does not.
1287   */
1288   ykok = static_cast<Mat_SeqAIJKokkos *>(Y->spptr);
1289   xkok = static_cast<Mat_SeqAIJKokkos *>(X->spptr);
1290   Xa   = xkok->a_dual.view_device();
1291   Ya   = ykok->a_dual.view_device();
1292 
1293   if (pattern == SAME_NONZERO_PATTERN) {
1294     KokkosBlas::axpy(exec, alpha, Xa, Ya);
1295     PetscCall(MatSeqAIJKokkosModifyDevice(Y));
1296   } else if (pattern == SUBSET_NONZERO_PATTERN) {
1297     MatRowMapKokkosView Xi = xkok->i_dual.view_device(), Yi = ykok->i_dual.view_device();
1298     MatColIdxKokkosView Xj = xkok->j_dual.view_device(), Yj = ykok->j_dual.view_device();
1299 
1300     Kokkos::parallel_for(
1301       Kokkos::TeamPolicy<>(exec, Y->rmap->n, 1), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
1302         PetscInt i = t.league_rank(); // row i
1303         Kokkos::single(Kokkos::PerTeam(t), [=]() {
1304           // Only one thread works in a team
1305           PetscInt p, q = Yi(i);
1306           for (p = Xi(i); p < Xi(i + 1); p++) {          // For each nonzero on row i of X,
1307             while (Xj(p) != Yj(q) && q < Yi(i + 1)) q++; // find the matching nonzero on row i of Y.
1308             if (Xj(p) == Yj(q)) {                        // Found it
1309               Ya(q) += alpha * Xa(p);
1310               q++;
1311             } else {
1312             // If not found, it indicates the input is wrong (X is not a SUBSET_NONZERO_PATTERN of Y).
1313             // Just insert a NaN at the beginning of row i if it is not empty, to make the result wrong.
1314 #if PETSC_PKG_KOKKOS_VERSION_GE(3, 7, 0)
1315               if (Yi(i) != Yi(i + 1)) Ya(Yi(i)) = Kokkos::ArithTraits<PetscScalar>::nan();
1316 #else
1317               if (Yi(i) != Yi(i + 1)) Ya(Yi(i)) = Kokkos::Experimental::nan("1");
1318 #endif
1319             }
1320           }
1321         });
1322       });
1323     PetscCall(MatSeqAIJKokkosModifyDevice(Y));
1324   } else { // different nonzero patterns
1325     Mat             Z;
1326     KokkosCsrMatrix zcsr;
1327     KernelHandle    kh;
1328     kh.create_spadd_handle(true); // X, Y are sorted
1329     KokkosSparse::spadd_symbolic(&kh, xkok->csrmat, ykok->csrmat, zcsr);
1330     KokkosSparse::spadd_numeric(&kh, alpha, xkok->csrmat, (PetscScalar)1.0, ykok->csrmat, zcsr);
1331     zkok = new Mat_SeqAIJKokkos(zcsr);
1332     PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, zkok, &Z));
1333     PetscCall(MatHeaderReplace(Y, &Z));
1334     kh.destroy_spadd_handle();
1335   }
1336   PetscCall(PetscLogGpuTimeEnd());
1337   PetscCall(PetscLogGpuFlops(xkok->a_dual.extent(0) * 2)); // Because we scaled X and then added it to Y
1338   PetscFunctionReturn(PETSC_SUCCESS);
1339 }
1340 
1341 struct MatCOOStruct_SeqAIJKokkos {
1342   PetscCount           n;
1343   PetscCount           Atot;
1344   PetscInt             nz;
1345   PetscCountKokkosView jmap;
1346   PetscCountKokkosView perm;
1347 
1348   MatCOOStruct_SeqAIJKokkos(const MatCOOStruct_SeqAIJ *coo_h)
1349   {
1350     nz   = coo_h->nz;
1351     n    = coo_h->n;
1352     Atot = coo_h->Atot;
1353     jmap = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), PetscCountKokkosViewHost(coo_h->jmap, nz + 1));
1354     perm = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), PetscCountKokkosViewHost(coo_h->perm, Atot));
1355   }
1356 };
1357 
1358 static PetscErrorCode MatCOOStructDestroy_SeqAIJKokkos(void **data)
1359 {
1360   PetscFunctionBegin;
1361   PetscCallCXX(delete static_cast<MatCOOStruct_SeqAIJKokkos *>(*data));
1362   PetscFunctionReturn(PETSC_SUCCESS);
1363 }
1364 
1365 static PetscErrorCode MatSetPreallocationCOO_SeqAIJKokkos(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
1366 {
1367   Mat_SeqAIJKokkos          *akok;
1368   Mat_SeqAIJ                *aseq;
1369   PetscContainer             container_h;
1370   MatCOOStruct_SeqAIJ       *coo_h;
1371   MatCOOStruct_SeqAIJKokkos *coo_d;
1372 
1373   PetscFunctionBegin;
1374   PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, coo_i, coo_j));
1375   aseq = static_cast<Mat_SeqAIJ *>(mat->data);
1376   akok = static_cast<Mat_SeqAIJKokkos *>(mat->spptr);
1377   delete akok;
1378   mat->spptr = akok = new Mat_SeqAIJKokkos(mat, mat->rmap->n, mat->cmap->n, aseq, mat->nonzerostate + 1, PETSC_FALSE);
1379   PetscCall(MatZeroEntries_SeqAIJKokkos(mat));
1380 
1381   // Copy the COO struct to device
1382   PetscCall(PetscObjectQuery((PetscObject)mat, "__PETSc_MatCOOStruct_Host", (PetscObject *)&container_h));
1383   PetscCall(PetscContainerGetPointer(container_h, (void **)&coo_h));
1384   PetscCallCXX(coo_d = new MatCOOStruct_SeqAIJKokkos(coo_h));
1385 
1386   // Put the COO struct in a container and then attach that to the matrix
1387   PetscCall(PetscObjectContainerCompose((PetscObject)mat, "__PETSc_MatCOOStruct_Device", coo_d, MatCOOStructDestroy_SeqAIJKokkos));
1388   PetscFunctionReturn(PETSC_SUCCESS);
1389 }
1390 
1391 static PetscErrorCode MatSetValuesCOO_SeqAIJKokkos(Mat A, const PetscScalar v[], InsertMode imode)
1392 {
1393   MatScalarKokkosView        Aa;
1394   ConstMatScalarKokkosView   kv;
1395   PetscMemType               memtype;
1396   PetscContainer             container;
1397   MatCOOStruct_SeqAIJKokkos *coo;
1398 
1399   PetscFunctionBegin;
1400   PetscCall(PetscObjectQuery((PetscObject)A, "__PETSc_MatCOOStruct_Device", (PetscObject *)&container));
1401   PetscCall(PetscContainerGetPointer(container, (void **)&coo));
1402 
1403   const auto &n    = coo->n;
1404   const auto &Annz = coo->nz;
1405   const auto &jmap = coo->jmap;
1406   const auto &perm = coo->perm;
1407 
1408   PetscCall(PetscGetMemType(v, &memtype));
1409   if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */
1410     kv = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), ConstMatScalarKokkosViewHost(v, n));
1411   } else {
1412     kv = ConstMatScalarKokkosView(v, n); /* Directly use v[]'s memory */
1413   }
1414 
1415   if (imode == INSERT_VALUES) PetscCall(MatSeqAIJGetKokkosViewWrite(A, &Aa)); /* write matrix values */
1416   else PetscCall(MatSeqAIJGetKokkosView(A, &Aa));                             /* read & write matrix values */
1417 
1418   PetscCall(PetscLogGpuTimeBegin());
1419   Kokkos::parallel_for(
1420     Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, Annz), KOKKOS_LAMBDA(const PetscCount i) {
1421       PetscScalar sum = 0.0;
1422       for (PetscCount k = jmap(i); k < jmap(i + 1); k++) sum += kv(perm(k));
1423       Aa(i) = (imode == INSERT_VALUES ? 0.0 : Aa(i)) + sum;
1424     });
1425   PetscCall(PetscLogGpuTimeEnd());
1426 
1427   if (imode == INSERT_VALUES) PetscCall(MatSeqAIJRestoreKokkosViewWrite(A, &Aa));
1428   else PetscCall(MatSeqAIJRestoreKokkosView(A, &Aa));
1429   PetscFunctionReturn(PETSC_SUCCESS);
1430 }
1431 
1432 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A)
1433 {
1434   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
1435 
1436   PetscFunctionBegin;
1437   A->offloadmask = PETSC_OFFLOAD_KOKKOS; /* We do not really use this flag */
1438   A->boundtocpu  = PETSC_FALSE;
1439 
1440   A->ops->assemblyend               = MatAssemblyEnd_SeqAIJKokkos;
1441   A->ops->destroy                   = MatDestroy_SeqAIJKokkos;
1442   A->ops->duplicate                 = MatDuplicate_SeqAIJKokkos;
1443   A->ops->axpy                      = MatAXPY_SeqAIJKokkos;
1444   A->ops->scale                     = MatScale_SeqAIJKokkos;
1445   A->ops->zeroentries               = MatZeroEntries_SeqAIJKokkos;
1446   A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJKokkos;
1447   A->ops->mult                      = MatMult_SeqAIJKokkos;
1448   A->ops->multadd                   = MatMultAdd_SeqAIJKokkos;
1449   A->ops->multtranspose             = MatMultTranspose_SeqAIJKokkos;
1450   A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJKokkos;
1451   A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJKokkos;
1452   A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos;
1453   A->ops->productnumeric            = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos;
1454   A->ops->transpose                 = MatTranspose_SeqAIJKokkos;
1455   A->ops->setoption                 = MatSetOption_SeqAIJKokkos;
1456   A->ops->getdiagonal               = MatGetDiagonal_SeqAIJKokkos;
1457   A->ops->shift                     = MatShift_SeqAIJKokkos;
1458   A->ops->diagonalset               = MatDiagonalSet_SeqAIJKokkos;
1459   A->ops->diagonalscale             = MatDiagonalScale_SeqAIJKokkos;
1460   A->ops->getcurrentmemtype         = MatGetCurrentMemType_SeqAIJKokkos;
1461   a->ops->getarray                  = MatSeqAIJGetArray_SeqAIJKokkos;
1462   a->ops->restorearray              = MatSeqAIJRestoreArray_SeqAIJKokkos;
1463   a->ops->getarrayread              = MatSeqAIJGetArrayRead_SeqAIJKokkos;
1464   a->ops->restorearrayread          = MatSeqAIJRestoreArrayRead_SeqAIJKokkos;
1465   a->ops->getarraywrite             = MatSeqAIJGetArrayWrite_SeqAIJKokkos;
1466   a->ops->restorearraywrite         = MatSeqAIJRestoreArrayWrite_SeqAIJKokkos;
1467   a->ops->getcsrandmemtype          = MatSeqAIJGetCSRAndMemType_SeqAIJKokkos;
1468 
1469   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJKokkos));
1470   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJKokkos));
1471 #if defined(PETSC_HAVE_HYPRE)
1472   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijkokkos_hypre_C", MatConvert_AIJ_HYPRE));
1473 #endif
1474   PetscFunctionReturn(PETSC_SUCCESS);
1475 }
1476 
1477 /*
1478    Extract the (prescribled) diagonal blocks of the matrix and then invert them
1479 
1480   Input Parameters:
1481 +  A       - the MATSEQAIJKOKKOS matrix
1482 .  bs      - block sizes in 'csr' format, i.e., the i-th block has size bs(i+1) - bs(i)
1483 .  bs2     - square of block sizes in 'csr' format, i.e., the i-th block should be stored at offset bs2(i) in diagVal[]
1484 .  blkMap  - map row ids to block ids, i.e., row i belongs to the block blkMap(i)
1485 -  work    - a pre-allocated work buffer (as big as diagVal) for use by this routine
1486 
1487   Output Parameter:
1488 .  diagVal - the (pre-allocated) buffer to store the inverted blocks (each block is stored in column-major order)
1489 */
1490 PETSC_INTERN PetscErrorCode MatInvertVariableBlockDiagonal_SeqAIJKokkos(Mat A, const PetscIntKokkosView &bs, const PetscIntKokkosView &bs2, const PetscIntKokkosView &blkMap, PetscScalarKokkosView &work, PetscScalarKokkosView &diagVal)
1491 {
1492   Mat_SeqAIJKokkos *akok    = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1493   PetscInt          nblocks = bs.extent(0) - 1;
1494 
1495   PetscFunctionBegin;
1496   PetscCall(MatSeqAIJKokkosSyncDevice(A)); // Since we'll access A's value on device
1497 
1498   // Pull out the diagonal blocks of the matrix and then invert the blocks
1499   auto Aa    = akok->a_dual.view_device();
1500   auto Ai    = akok->i_dual.view_device();
1501   auto Aj    = akok->j_dual.view_device();
1502   auto Adiag = akok->diag_dual.view_device();
1503   // TODO: how to tune the team size?
1504 #if defined(KOKKOS_ENABLE_UNIFIED_MEMORY)
1505   auto ts = Kokkos::AUTO();
1506 #else
1507   auto ts = 16; // improved performance 30% over Kokkos::AUTO() with CUDA, but failed with "Kokkos::abort: Requested Team Size is too large!" on CPUs
1508 #endif
1509   PetscCallCXX(Kokkos::parallel_for(
1510     Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), nblocks, ts), KOKKOS_LAMBDA(const KokkosTeamMemberType &teamMember) {
1511       const PetscInt bid    = teamMember.league_rank();                                                   // block id
1512       const PetscInt rstart = bs(bid);                                                                    // this block starts from this row
1513       const PetscInt m      = bs(bid + 1) - bs(bid);                                                      // size of this block
1514       const auto    &B      = Kokkos::View<PetscScalar **, Kokkos::LayoutLeft>(&diagVal(bs2(bid)), m, m); // column-major order
1515       const auto    &W      = PetscScalarKokkosView(&work(bs2(bid)), m * m);
1516 
1517       Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, m), [=](const PetscInt &r) { // r-th row in B
1518         PetscInt i = rstart + r;                                                            // i-th row in A
1519 
1520         if (Ai(i) <= Adiag(i) && Adiag(i) < Ai(i + 1)) { // if the diagonal exists (common case)
1521           PetscInt first = Adiag(i) - r;                 // we start to check nonzeros from here along this row
1522 
1523           for (PetscInt c = 0; c < m; c++) {                   // walk n steps to see what column indices we will meet
1524             if (first + c < Ai(i) || first + c >= Ai(i + 1)) { // this entry (first+c) is out of range of this row, in other words, its value is zero
1525               B(r, c) = 0.0;
1526             } else if (Aj(first + c) == rstart + c) { // this entry is right on the (rstart+c) column
1527               B(r, c) = Aa(first + c);
1528             } else { // this entry does not show up in the CSR
1529               B(r, c) = 0.0;
1530             }
1531           }
1532         } else { // rare case that the diagonal does not exist
1533           const PetscInt begin = Ai(i);
1534           const PetscInt end   = Ai(i + 1);
1535           for (PetscInt c = 0; c < m; c++) B(r, c) = 0.0;
1536           for (PetscInt j = begin; j < end; j++) { // scan the whole row; could use binary search but this is a rare case so we did not.
1537             if (rstart <= Aj(j) && Aj(j) < rstart + m) B(r, Aj(j) - rstart) = Aa(j);
1538             else if (Aj(j) >= rstart + m) break;
1539           }
1540         }
1541       });
1542 
1543       // LU-decompose B (w/o pivoting) and then invert B
1544       KokkosBatched::TeamLU<KokkosTeamMemberType, KokkosBatched::Algo::LU::Unblocked>::invoke(teamMember, B, 0.0);
1545       KokkosBatched::TeamInverseLU<KokkosTeamMemberType, KokkosBatched::Algo::InverseLU::Unblocked>::invoke(teamMember, B, W);
1546     }));
1547   // PetscLogGpuFlops() is done in the caller PCSetUp_VPBJacobi_Kokkos as we don't want to compute the flops in kernels
1548   PetscFunctionReturn(PETSC_SUCCESS);
1549 }
1550 
1551 PETSC_INTERN PetscErrorCode MatSetSeqAIJKokkosWithCSRMatrix(Mat A, Mat_SeqAIJKokkos *akok)
1552 {
1553   Mat_SeqAIJ *aseq;
1554   PetscInt    i, m, n;
1555   auto        exec = PetscGetKokkosExecutionSpace();
1556 
1557   PetscFunctionBegin;
1558   PetscCheck(!A->spptr, PETSC_COMM_SELF, PETSC_ERR_PLIB, "A->spptr is supposed to be empty");
1559 
1560   m = akok->nrows();
1561   n = akok->ncols();
1562   PetscCall(MatSetSizes(A, m, n, m, n));
1563   PetscCall(MatSetType(A, MATSEQAIJKOKKOS));
1564 
1565   /* Set up data structures of A as a MATSEQAIJ */
1566   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(A, MAT_SKIP_ALLOCATION, NULL));
1567   aseq = (Mat_SeqAIJ *)A->data;
1568 
1569   PetscCall(KokkosDualViewSyncHost(akok->i_dual, exec)); /* We always need sync'ed i, j on host */
1570   PetscCall(KokkosDualViewSyncHost(akok->j_dual, exec));
1571 
1572   aseq->i       = akok->i_host_data();
1573   aseq->j       = akok->j_host_data();
1574   aseq->a       = akok->a_host_data();
1575   aseq->nonew   = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/
1576   aseq->free_a  = PETSC_FALSE;
1577   aseq->free_ij = PETSC_FALSE;
1578   aseq->nz      = akok->nnz();
1579   aseq->maxnz   = aseq->nz;
1580 
1581   PetscCall(PetscMalloc1(m, &aseq->imax));
1582   PetscCall(PetscMalloc1(m, &aseq->ilen));
1583   for (i = 0; i < m; i++) aseq->ilen[i] = aseq->imax[i] = aseq->i[i + 1] - aseq->i[i];
1584 
1585   /* It is critical to set the nonzerostate, as we use it to check if sparsity pattern (hence data) has changed on host in MatAssemblyEnd */
1586   akok->nonzerostate = A->nonzerostate;
1587   A->spptr           = akok; /* Set A->spptr before MatAssembly so that A->spptr won't be allocated again there */
1588   PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY));
1589   PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY));
1590   PetscFunctionReturn(PETSC_SUCCESS);
1591 }
1592 
1593 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGetKokkosCsrMatrix(Mat A, KokkosCsrMatrix *csr)
1594 {
1595   PetscFunctionBegin;
1596   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1597   *csr = static_cast<Mat_SeqAIJKokkos *>(A->spptr)->csrmat;
1598   PetscFunctionReturn(PETSC_SUCCESS);
1599 }
1600 
1601 PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithKokkosCsrMatrix(MPI_Comm comm, KokkosCsrMatrix csr, Mat *A)
1602 {
1603   Mat_SeqAIJKokkos *akok;
1604 
1605   PetscFunctionBegin;
1606   PetscCallCXX(akok = new Mat_SeqAIJKokkos(csr));
1607   PetscCall(MatCreate(comm, A));
1608   PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok));
1609   PetscFunctionReturn(PETSC_SUCCESS);
1610 }
1611 
1612 /* Crete a SEQAIJKOKKOS matrix with a Mat_SeqAIJKokkos data structure
1613 
1614    Note we have names like MatSeqAIJSetPreallocationCSR, so I use capitalized CSR
1615  */
1616 PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithCSRMatrix(MPI_Comm comm, Mat_SeqAIJKokkos *akok, Mat *A)
1617 {
1618   PetscFunctionBegin;
1619   PetscCall(MatCreate(comm, A));
1620   PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok));
1621   PetscFunctionReturn(PETSC_SUCCESS);
1622 }
1623 
1624 /*@C
1625   MatCreateSeqAIJKokkos - Creates a sparse matrix in `MATSEQAIJKOKKOS` (compressed row) format
1626   (the default parallel PETSc format). This matrix will ultimately be handled by
1627   Kokkos for calculations.
1628 
1629   Collective
1630 
1631   Input Parameters:
1632 + comm - MPI communicator, set to `PETSC_COMM_SELF`
1633 . m    - number of rows
1634 . n    - number of columns
1635 . nz   - number of nonzeros per row (same for all rows), ignored if `nnz` is provided
1636 - nnz  - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL`
1637 
1638   Output Parameter:
1639 . A - the matrix
1640 
1641   Level: intermediate
1642 
1643   Notes:
1644   It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
1645   MatXXXXSetPreallocation() paradgm instead of this routine directly.
1646   [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]
1647 
1648   The AIJ format, also called
1649   compressed row storage, is fully compatible with standard Fortran
1650   storage.  That is, the stored row and column indices can begin at
1651   either one (as in Fortran) or zero.
1652 
1653   Specify the preallocated storage with either `nz` or `nnz` (not both).
1654   Set `nz` = `PETSC_DEFAULT` and `nnz` = `NULL` for PETSc to control dynamic memory
1655   allocation.
1656 
1657 .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`
1658 @*/
1659 PetscErrorCode MatCreateSeqAIJKokkos(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A)
1660 {
1661   PetscFunctionBegin;
1662   PetscCall(PetscKokkosInitializeCheck());
1663   PetscCall(MatCreate(comm, A));
1664   PetscCall(MatSetSizes(*A, m, n, m, n));
1665   PetscCall(MatSetType(*A, MATSEQAIJKOKKOS));
1666   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz));
1667   PetscFunctionReturn(PETSC_SUCCESS);
1668 }
1669 
1670 // After matrix numeric factorization, there are still steps to do before triangular solve can be called.
1671 // For example, for transpose solve, we might need to compute the transpose matrices if the solver does not support it (such as KK, while cusparse does).
1672 // In cusparse, one has to call cusparseSpSV_analysis() with updated triangular matrix values before calling cusparseSpSV_solve().
1673 // Simiarily, in KK sptrsv_symbolic() has to be called before sptrsv_solve(). We put these steps in MatSeqAIJKokkos{Transpose}SolveCheck.
1674 static PetscErrorCode MatSeqAIJKokkosSolveCheck(Mat A)
1675 {
1676   Mat_SeqAIJKokkosTriFactors *factors   = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1677   const PetscBool             has_lower = factors->iL_d.extent(0) ? PETSC_TRUE : PETSC_FALSE; // false with Choleksy
1678   const PetscBool             has_upper = factors->iU_d.extent(0) ? PETSC_TRUE : PETSC_FALSE; // true with LU and Choleksy
1679 
1680   PetscFunctionBegin;
1681   if (!factors->sptrsv_symbolic_completed) { // If sptrsv_symbolic was not called yet
1682     if (has_upper) PetscCallCXX(sptrsv_symbolic(&factors->khU, factors->iU_d, factors->jU_d, factors->aU_d));
1683     if (has_lower) PetscCallCXX(sptrsv_symbolic(&factors->khL, factors->iL_d, factors->jL_d, factors->aL_d));
1684     factors->sptrsv_symbolic_completed = PETSC_TRUE;
1685   }
1686   PetscFunctionReturn(PETSC_SUCCESS);
1687 }
1688 
1689 static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A)
1690 {
1691   const PetscInt              n         = A->rmap->n;
1692   Mat_SeqAIJKokkosTriFactors *factors   = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1693   const PetscBool             has_lower = factors->iL_d.extent(0) ? PETSC_TRUE : PETSC_FALSE; // false with Choleksy
1694   const PetscBool             has_upper = factors->iU_d.extent(0) ? PETSC_TRUE : PETSC_FALSE; // true with LU or Choleksy
1695 
1696   PetscFunctionBegin;
1697   if (!factors->transpose_updated) {
1698     if (has_upper) {
1699       if (!factors->iUt_d.extent(0)) {                                 // Allocate Ut on device if not yet
1700         factors->iUt_d = MatRowMapKokkosView("factors->iUt_d", n + 1); // KK requires this view to be initialized to 0 to call transpose_matrix
1701         factors->jUt_d = MatColIdxKokkosView(NoInit("factors->jUt_d"), factors->jU_d.extent(0));
1702         factors->aUt_d = MatScalarKokkosView(NoInit("factors->aUt_d"), factors->aU_d.extent(0));
1703       }
1704 
1705       if (factors->iU_h.extent(0)) { // If U is on host (factorization was done on host), we also compute the transpose on host
1706         if (!factors->U) {
1707           Mat_SeqAIJ *seq;
1708 
1709           PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF, n, n, factors->iU_h.data(), factors->jU_h.data(), factors->aU_h.data(), &factors->U));
1710           PetscCall(MatTranspose(factors->U, MAT_INITIAL_MATRIX, &factors->Ut));
1711 
1712           seq            = static_cast<Mat_SeqAIJ *>(factors->Ut->data);
1713           factors->iUt_h = MatRowMapKokkosViewHost(seq->i, n + 1);
1714           factors->jUt_h = MatColIdxKokkosViewHost(seq->j, seq->nz);
1715           factors->aUt_h = MatScalarKokkosViewHost(seq->a, seq->nz);
1716         } else {
1717           PetscCall(MatTranspose(factors->U, MAT_REUSE_MATRIX, &factors->Ut)); // Matrix Ut' data is aliased with {i, j, a}Ut_h
1718         }
1719         // Copy Ut from host to device
1720         PetscCallCXX(Kokkos::deep_copy(factors->iUt_d, factors->iUt_h));
1721         PetscCallCXX(Kokkos::deep_copy(factors->jUt_d, factors->jUt_h));
1722         PetscCallCXX(Kokkos::deep_copy(factors->aUt_d, factors->aUt_h));
1723       } else { // If U was computed on device, we also compute the transpose there
1724         // TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices. We have to sort the indices, until KK provides finer control options.
1725         PetscCallCXX(transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iU_d,
1726                                                                                                                                                                                                                                factors->jU_d, factors->aU_d,
1727                                                                                                                                                                                                                                factors->iUt_d, factors->jUt_d,
1728                                                                                                                                                                                                                                factors->aUt_d));
1729         PetscCallCXX(sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iUt_d, factors->jUt_d, factors->aUt_d));
1730       }
1731       PetscCallCXX(sptrsv_symbolic(&factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d));
1732     }
1733 
1734     // do the same for L with LU
1735     if (has_lower) {
1736       if (!factors->iLt_d.extent(0)) {                                 // Allocate Lt on device if not yet
1737         factors->iLt_d = MatRowMapKokkosView("factors->iLt_d", n + 1); // KK requires this view to be initialized to 0 to call transpose_matrix
1738         factors->jLt_d = MatColIdxKokkosView(NoInit("factors->jLt_d"), factors->jL_d.extent(0));
1739         factors->aLt_d = MatScalarKokkosView(NoInit("factors->aLt_d"), factors->aL_d.extent(0));
1740       }
1741 
1742       if (factors->iL_h.extent(0)) { // If L is on host, we also compute the transpose on host
1743         if (!factors->L) {
1744           Mat_SeqAIJ *seq;
1745 
1746           PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF, n, n, factors->iL_h.data(), factors->jL_h.data(), factors->aL_h.data(), &factors->L));
1747           PetscCall(MatTranspose(factors->L, MAT_INITIAL_MATRIX, &factors->Lt));
1748 
1749           seq            = static_cast<Mat_SeqAIJ *>(factors->Lt->data);
1750           factors->iLt_h = MatRowMapKokkosViewHost(seq->i, n + 1);
1751           factors->jLt_h = MatColIdxKokkosViewHost(seq->j, seq->nz);
1752           factors->aLt_h = MatScalarKokkosViewHost(seq->a, seq->nz);
1753         } else {
1754           PetscCall(MatTranspose(factors->L, MAT_REUSE_MATRIX, &factors->Lt)); // Matrix Lt' data is aliased with {i, j, a}Lt_h
1755         }
1756         // Copy Lt from host to device
1757         PetscCallCXX(Kokkos::deep_copy(factors->iLt_d, factors->iLt_h));
1758         PetscCallCXX(Kokkos::deep_copy(factors->jLt_d, factors->jLt_h));
1759         PetscCallCXX(Kokkos::deep_copy(factors->aLt_d, factors->aLt_h));
1760       } else { // If L was computed on device, we also compute the transpose there
1761         // TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices. We have to sort the indices, until KK provides finer control options.
1762         PetscCallCXX(transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iL_d,
1763                                                                                                                                                                                                                                factors->jL_d, factors->aL_d,
1764                                                                                                                                                                                                                                factors->iLt_d, factors->jLt_d,
1765                                                                                                                                                                                                                                factors->aLt_d));
1766         PetscCallCXX(sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iLt_d, factors->jLt_d, factors->aLt_d));
1767       }
1768       PetscCallCXX(sptrsv_symbolic(&factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d));
1769     }
1770 
1771     factors->transpose_updated = PETSC_TRUE;
1772   }
1773   PetscFunctionReturn(PETSC_SUCCESS);
1774 }
1775 
1776 // Solve Ax = b, with RAR = U^T D U, where R is the row (and col) permutation matrix on A.
1777 // R is represented by rowperm in factors. If R is identity (i.e, no reordering), then rowperm is empty.
1778 static PetscErrorCode MatSolve_SeqAIJKokkos_Cholesky(Mat A, Vec bb, Vec xx)
1779 {
1780   auto                        exec    = PetscGetKokkosExecutionSpace();
1781   Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1782   PetscInt                    m       = A->rmap->n;
1783   PetscScalarKokkosView       D       = factors->D_d;
1784   PetscScalarKokkosView       X, Y, B; // alias
1785   ConstPetscScalarKokkosView  b;
1786   PetscScalarKokkosView       x;
1787   PetscIntKokkosView         &rowperm  = factors->rowperm;
1788   PetscBool                   identity = rowperm.extent(0) ? PETSC_FALSE : PETSC_TRUE;
1789 
1790   PetscFunctionBegin;
1791   PetscCall(PetscLogGpuTimeBegin());
1792   PetscCall(MatSeqAIJKokkosSolveCheck(A));          // for UX = T
1793   PetscCall(MatSeqAIJKokkosTransposeSolveCheck(A)); // for U^T Y = B
1794   PetscCall(VecGetKokkosView(bb, &b));
1795   PetscCall(VecGetKokkosViewWrite(xx, &x));
1796 
1797   // Solve U^T Y = B
1798   if (identity) { // Reorder b with the row permutation
1799     B = PetscScalarKokkosView(const_cast<PetscScalar *>(b.data()), b.extent(0));
1800     Y = factors->workVector;
1801   } else {
1802     B = factors->workVector;
1803     PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { B(i) = b(rowperm(i)); }));
1804     Y = x;
1805   }
1806   PetscCallCXX(sptrsv_solve(exec, &factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d, B, Y));
1807 
1808   // Solve diag(D) Y' = Y.
1809   // Actually just do Y' = Y*D since D is already inverted in MatCholeskyFactorNumeric_SeqAIJ(). It is basically a vector element-wise multiplication.
1810   PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { Y(i) = Y(i) * D(i); }));
1811 
1812   // Solve UX = Y
1813   if (identity) {
1814     X = x;
1815   } else {
1816     X = factors->workVector; // B is not needed anymore
1817   }
1818   PetscCallCXX(sptrsv_solve(exec, &factors->khU, factors->iU_d, factors->jU_d, factors->aU_d, Y, X));
1819 
1820   // Reorder X with the inverse column (row) permutation
1821   if (!identity) PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { x(rowperm(i)) = X(i); }));
1822 
1823   PetscCall(VecRestoreKokkosView(bb, &b));
1824   PetscCall(VecRestoreKokkosViewWrite(xx, &x));
1825   PetscCall(PetscLogGpuTimeEnd());
1826   PetscFunctionReturn(PETSC_SUCCESS);
1827 }
1828 
1829 // Solve Ax = b, with RAC = LU, where R and C are row and col permutation matrices on A respectively.
1830 // R and C are represented by rowperm and colperm in factors.
1831 // If R or C is identity (i.e, no reordering), then rowperm or colperm is empty.
1832 static PetscErrorCode MatSolve_SeqAIJKokkos_LU(Mat A, Vec bb, Vec xx)
1833 {
1834   auto                        exec    = PetscGetKokkosExecutionSpace();
1835   Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1836   PetscInt                    m       = A->rmap->n;
1837   PetscScalarKokkosView       X, Y, B; // alias
1838   ConstPetscScalarKokkosView  b;
1839   PetscScalarKokkosView       x;
1840   PetscIntKokkosView         &rowperm      = factors->rowperm;
1841   PetscIntKokkosView         &colperm      = factors->colperm;
1842   PetscBool                   row_identity = rowperm.extent(0) ? PETSC_FALSE : PETSC_TRUE;
1843   PetscBool                   col_identity = colperm.extent(0) ? PETSC_FALSE : PETSC_TRUE;
1844 
1845   PetscFunctionBegin;
1846   PetscCall(PetscLogGpuTimeBegin());
1847   PetscCall(MatSeqAIJKokkosSolveCheck(A));
1848   PetscCall(VecGetKokkosView(bb, &b));
1849   PetscCall(VecGetKokkosViewWrite(xx, &x));
1850 
1851   // Solve L Y = B (i.e., L (U C^- x) = R b).  R b indicates applying the row permutation on b.
1852   if (row_identity) {
1853     B = PetscScalarKokkosView(const_cast<PetscScalar *>(b.data()), b.extent(0));
1854     Y = factors->workVector;
1855   } else {
1856     B = factors->workVector;
1857     PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { B(i) = b(rowperm(i)); }));
1858     Y = x;
1859   }
1860   PetscCallCXX(sptrsv_solve(exec, &factors->khL, factors->iL_d, factors->jL_d, factors->aL_d, B, Y));
1861 
1862   // Solve U C^- x = Y
1863   if (col_identity) {
1864     X = x;
1865   } else {
1866     X = factors->workVector;
1867   }
1868   PetscCallCXX(sptrsv_solve(exec, &factors->khU, factors->iU_d, factors->jU_d, factors->aU_d, Y, X));
1869 
1870   // x = C X; Reorder X with the inverse col permutation
1871   if (!col_identity) PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { x(colperm(i)) = X(i); }));
1872 
1873   PetscCall(VecRestoreKokkosView(bb, &b));
1874   PetscCall(VecRestoreKokkosViewWrite(xx, &x));
1875   PetscCall(PetscLogGpuTimeEnd());
1876   PetscFunctionReturn(PETSC_SUCCESS);
1877 }
1878 
1879 // Solve A^T x = b, with RAC = LU, where R and C are row and col permutation matrices on A respectively.
1880 // R and C are represented by rowperm and colperm in factors.
1881 // If R or C is identity (i.e, no reordering), then rowperm or colperm is empty.
1882 // A = R^-1 L U C^-1, so A^T = C^-T U^T L^T R^-T. But since C^- = C^T, R^- = R^T, we have A^T = C U^T L^T R.
1883 static PetscErrorCode MatSolveTranspose_SeqAIJKokkos_LU(Mat A, Vec bb, Vec xx)
1884 {
1885   auto                        exec    = PetscGetKokkosExecutionSpace();
1886   Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1887   PetscInt                    m       = A->rmap->n;
1888   PetscScalarKokkosView       X, Y, B; // alias
1889   ConstPetscScalarKokkosView  b;
1890   PetscScalarKokkosView       x;
1891   PetscIntKokkosView         &rowperm      = factors->rowperm;
1892   PetscIntKokkosView         &colperm      = factors->colperm;
1893   PetscBool                   row_identity = rowperm.extent(0) ? PETSC_FALSE : PETSC_TRUE;
1894   PetscBool                   col_identity = colperm.extent(0) ? PETSC_FALSE : PETSC_TRUE;
1895 
1896   PetscFunctionBegin;
1897   PetscCall(PetscLogGpuTimeBegin());
1898   PetscCall(MatSeqAIJKokkosTransposeSolveCheck(A)); // Update L^T, U^T if needed, and do sptrsv symbolic for L^T, U^T
1899   PetscCall(VecGetKokkosView(bb, &b));
1900   PetscCall(VecGetKokkosViewWrite(xx, &x));
1901 
1902   // Solve U^T Y = B (i.e., U^T (L^T R x) = C^- b).  Note C^- b = C^T b, which means applying the column permutation on b.
1903   if (col_identity) { // Reorder b with the col permutation
1904     B = PetscScalarKokkosView(const_cast<PetscScalar *>(b.data()), b.extent(0));
1905     Y = factors->workVector;
1906   } else {
1907     B = factors->workVector;
1908     PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { B(i) = b(colperm(i)); }));
1909     Y = x;
1910   }
1911   PetscCallCXX(sptrsv_solve(exec, &factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d, B, Y));
1912 
1913   // Solve L^T X = Y
1914   if (row_identity) {
1915     X = x;
1916   } else {
1917     X = factors->workVector;
1918   }
1919   PetscCallCXX(sptrsv_solve(exec, &factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d, Y, X));
1920 
1921   // x = R^- X = R^T X; Reorder X with the inverse row permutation
1922   if (!row_identity) PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { x(rowperm(i)) = X(i); }));
1923 
1924   PetscCall(VecRestoreKokkosView(bb, &b));
1925   PetscCall(VecRestoreKokkosViewWrite(xx, &x));
1926   PetscCall(PetscLogGpuTimeEnd());
1927   PetscFunctionReturn(PETSC_SUCCESS);
1928 }
1929 
1930 static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info)
1931 {
1932   PetscFunctionBegin;
1933   PetscCall(MatSeqAIJKokkosSyncHost(A));
1934   PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info));
1935 
1936   if (!info->solveonhost) { // if solve on host, then we don't need to copy L, U to device
1937     Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr;
1938     Mat_SeqAIJ                 *b       = static_cast<Mat_SeqAIJ *>(B->data);
1939     const PetscInt             *Bi = b->i, *Bj = b->j, *Bdiag = b->diag;
1940     const MatScalar            *Ba = b->a;
1941     PetscInt                    m = B->rmap->n, n = B->cmap->n;
1942 
1943     if (factors->iL_h.extent(0) == 0) { // Allocate memory and copy the L, U structure for the first time
1944       // Allocate memory and copy the structure
1945       factors->iL_h = MatRowMapKokkosViewHost(NoInit("iL_h"), m + 1);
1946       factors->jL_h = MatColIdxKokkosViewHost(NoInit("jL_h"), (Bi[m] - Bi[0]) + m); // + the diagonal entries
1947       factors->aL_h = MatScalarKokkosViewHost(NoInit("aL_h"), (Bi[m] - Bi[0]) + m);
1948       factors->iU_h = MatRowMapKokkosViewHost(NoInit("iU_h"), m + 1);
1949       factors->jU_h = MatColIdxKokkosViewHost(NoInit("jU_h"), (Bdiag[0] - Bdiag[m]));
1950       factors->aU_h = MatScalarKokkosViewHost(NoInit("aU_h"), (Bdiag[0] - Bdiag[m]));
1951 
1952       PetscInt *Li = factors->iL_h.data();
1953       PetscInt *Lj = factors->jL_h.data();
1954       PetscInt *Ui = factors->iU_h.data();
1955       PetscInt *Uj = factors->jU_h.data();
1956 
1957       Li[0] = Ui[0] = 0;
1958       for (PetscInt i = 0; i < m; i++) {
1959         PetscInt llen = Bi[i + 1] - Bi[i];       // exclusive of the diagonal entry
1960         PetscInt ulen = Bdiag[i] - Bdiag[i + 1]; // inclusive of the diagonal entry
1961 
1962         PetscCall(PetscArraycpy(Lj + Li[i], Bj + Bi[i], llen)); // entries of L on the left of the diagonal
1963         Lj[Li[i] + llen] = i;                                   // diagonal entry of L
1964 
1965         Uj[Ui[i]] = i;                                                             // diagonal entry of U
1966         PetscCall(PetscArraycpy(Uj + Ui[i] + 1, Bj + Bdiag[i + 1] + 1, ulen - 1)); // entries of U on  the right of the diagonal
1967 
1968         Li[i + 1] = Li[i] + llen + 1;
1969         Ui[i + 1] = Ui[i] + ulen;
1970       }
1971 
1972       factors->iL_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->iL_h);
1973       factors->jL_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->jL_h);
1974       factors->iU_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->iU_h);
1975       factors->jU_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->jU_h);
1976       factors->aL_d = Kokkos::create_mirror_view(DefaultMemorySpace(), factors->aL_h);
1977       factors->aU_d = Kokkos::create_mirror_view(DefaultMemorySpace(), factors->aU_h);
1978 
1979       // Copy row/col permutation to device
1980       IS        rowperm = ((Mat_SeqAIJ *)B->data)->row;
1981       PetscBool row_identity;
1982       PetscCall(ISIdentity(rowperm, &row_identity));
1983       if (!row_identity) {
1984         const PetscInt *ip;
1985 
1986         PetscCall(ISGetIndices(rowperm, &ip));
1987         factors->rowperm = PetscIntKokkosView(NoInit("rowperm"), m);
1988         PetscCallCXX(Kokkos::deep_copy(factors->rowperm, PetscIntKokkosViewHost(const_cast<PetscInt *>(ip), m)));
1989         PetscCall(ISRestoreIndices(rowperm, &ip));
1990         PetscCall(PetscLogCpuToGpu(m * sizeof(PetscInt)));
1991       }
1992 
1993       IS        colperm = ((Mat_SeqAIJ *)B->data)->col;
1994       PetscBool col_identity;
1995       PetscCall(ISIdentity(colperm, &col_identity));
1996       if (!col_identity) {
1997         const PetscInt *ip;
1998 
1999         PetscCall(ISGetIndices(colperm, &ip));
2000         factors->colperm = PetscIntKokkosView(NoInit("colperm"), n);
2001         PetscCallCXX(Kokkos::deep_copy(factors->colperm, PetscIntKokkosViewHost(const_cast<PetscInt *>(ip), n)));
2002         PetscCall(ISRestoreIndices(colperm, &ip));
2003         PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
2004       }
2005 
2006       /* Create sptrsv handles for L, U and their transpose */
2007 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
2008       auto sptrsv_alg = SPTRSVAlgorithm::SPTRSV_CUSPARSE;
2009 #else
2010       auto sptrsv_alg = SPTRSVAlgorithm::SEQLVLSCHD_TP1;
2011 #endif
2012       factors->khL.create_sptrsv_handle(sptrsv_alg, m, true /* L is lower tri */);
2013       factors->khU.create_sptrsv_handle(sptrsv_alg, m, false /* U is not lower tri */);
2014       factors->khLt.create_sptrsv_handle(sptrsv_alg, m, false /* L^T is not lower tri */);
2015       factors->khUt.create_sptrsv_handle(sptrsv_alg, m, true /* U^T is lower tri */);
2016     }
2017 
2018     // Copy the value
2019     for (PetscInt i = 0; i < m; i++) {
2020       PetscInt        llen = Bi[i + 1] - Bi[i];
2021       PetscInt        ulen = Bdiag[i] - Bdiag[i + 1];
2022       const PetscInt *Li   = factors->iL_h.data();
2023       const PetscInt *Ui   = factors->iU_h.data();
2024 
2025       PetscScalar *La = factors->aL_h.data();
2026       PetscScalar *Ua = factors->aU_h.data();
2027 
2028       PetscCall(PetscArraycpy(La + Li[i], Ba + Bi[i], llen)); // entries of L
2029       La[Li[i] + llen] = 1.0;                                 // diagonal entry
2030 
2031       Ua[Ui[i]] = 1.0 / Ba[Bdiag[i]];                                            // diagonal entry
2032       PetscCall(PetscArraycpy(Ua + Ui[i] + 1, Ba + Bdiag[i + 1] + 1, ulen - 1)); // entries of U
2033     }
2034 
2035     PetscCallCXX(Kokkos::deep_copy(factors->aL_d, factors->aL_h));
2036     PetscCallCXX(Kokkos::deep_copy(factors->aU_d, factors->aU_h));
2037     // Once the factors' values have changed, we need to update their transpose and redo sptrsv symbolic
2038     factors->transpose_updated         = PETSC_FALSE;
2039     factors->sptrsv_symbolic_completed = PETSC_FALSE;
2040 
2041     B->ops->solve          = MatSolve_SeqAIJKokkos_LU;
2042     B->ops->solvetranspose = MatSolveTranspose_SeqAIJKokkos_LU;
2043   }
2044 
2045   B->ops->matsolve          = NULL;
2046   B->ops->matsolvetranspose = NULL;
2047   PetscFunctionReturn(PETSC_SUCCESS);
2048 }
2049 
2050 static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos_ILU0(Mat B, Mat A, const MatFactorInfo *info)
2051 {
2052   Mat_SeqAIJKokkos           *aijkok   = (Mat_SeqAIJKokkos *)A->spptr;
2053   Mat_SeqAIJKokkosTriFactors *factors  = (Mat_SeqAIJKokkosTriFactors *)B->spptr;
2054   PetscInt                    fill_lev = info->levels;
2055 
2056   PetscFunctionBegin;
2057   PetscCall(PetscLogGpuTimeBegin());
2058   PetscCheck(!info->factoronhost, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "MatFactorInfo.factoronhost should be false");
2059   PetscCall(MatSeqAIJKokkosSyncDevice(A));
2060 
2061   auto a_d = aijkok->a_dual.view_device();
2062   auto i_d = aijkok->i_dual.view_device();
2063   auto j_d = aijkok->j_dual.view_device();
2064 
2065   PetscCallCXX(spiluk_numeric(&factors->kh, fill_lev, i_d, j_d, a_d, factors->iL_d, factors->jL_d, factors->aL_d, factors->iU_d, factors->jU_d, factors->aU_d));
2066 
2067   B->assembled              = PETSC_TRUE;
2068   B->preallocated           = PETSC_TRUE;
2069   B->ops->solve             = MatSolve_SeqAIJKokkos_LU;
2070   B->ops->solvetranspose    = MatSolveTranspose_SeqAIJKokkos_LU;
2071   B->ops->matsolve          = NULL;
2072   B->ops->matsolvetranspose = NULL;
2073 
2074   /* Once the factors' value changed, we need to update their transpose and sptrsv handle */
2075   factors->transpose_updated         = PETSC_FALSE;
2076   factors->sptrsv_symbolic_completed = PETSC_FALSE;
2077   /* TODO: log flops, but how to know that? */
2078   PetscCall(PetscLogGpuTimeEnd());
2079   PetscFunctionReturn(PETSC_SUCCESS);
2080 }
2081 
2082 // Use KK's spiluk_symbolic() to do ILU0 symbolic factorization, with no row/col reordering
2083 static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos_ILU0(Mat B, Mat A, IS, IS, const MatFactorInfo *info)
2084 {
2085   Mat_SeqAIJKokkos           *aijkok;
2086   Mat_SeqAIJ                 *b;
2087   Mat_SeqAIJKokkosTriFactors *factors  = (Mat_SeqAIJKokkosTriFactors *)B->spptr;
2088   PetscInt                    fill_lev = info->levels;
2089   PetscInt                    nnzA     = ((Mat_SeqAIJ *)A->data)->nz, nnzL, nnzU;
2090   PetscInt                    n        = A->rmap->n;
2091 
2092   PetscFunctionBegin;
2093   PetscCheck(!info->factoronhost, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "MatFactorInfo's factoronhost should be false as we are doing it on device right now");
2094   PetscCall(MatSeqAIJKokkosSyncDevice(A));
2095 
2096   /* Create a spiluk handle and then do symbolic factorization */
2097   nnzL = nnzU = PetscRealIntMultTruncate(info->fill, nnzA);
2098   factors->kh.create_spiluk_handle(SPILUKAlgorithm::SEQLVLSCHD_TP1, n, nnzL, nnzU);
2099 
2100   auto spiluk_handle = factors->kh.get_spiluk_handle();
2101 
2102   Kokkos::realloc(factors->iL_d, n + 1); /* Free old arrays and realloc */
2103   Kokkos::realloc(factors->jL_d, spiluk_handle->get_nnzL());
2104   Kokkos::realloc(factors->iU_d, n + 1);
2105   Kokkos::realloc(factors->jU_d, spiluk_handle->get_nnzU());
2106 
2107   aijkok   = (Mat_SeqAIJKokkos *)A->spptr;
2108   auto i_d = aijkok->i_dual.view_device();
2109   auto j_d = aijkok->j_dual.view_device();
2110   PetscCallCXX(spiluk_symbolic(&factors->kh, fill_lev, i_d, j_d, factors->iL_d, factors->jL_d, factors->iU_d, factors->jU_d));
2111   /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */
2112 
2113   Kokkos::resize(factors->jL_d, spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */
2114   Kokkos::resize(factors->jU_d, spiluk_handle->get_nnzU());
2115   Kokkos::realloc(factors->aL_d, spiluk_handle->get_nnzL()); /* No need to retain old value */
2116   Kokkos::realloc(factors->aU_d, spiluk_handle->get_nnzU());
2117 
2118   /* TODO: add options to select sptrsv algorithms */
2119   /* Create sptrsv handles for L, U and their transpose */
2120 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
2121   auto sptrsv_alg = SPTRSVAlgorithm::SPTRSV_CUSPARSE;
2122 #else
2123   auto sptrsv_alg = SPTRSVAlgorithm::SEQLVLSCHD_TP1;
2124 #endif
2125 
2126   factors->khL.create_sptrsv_handle(sptrsv_alg, n, true /* L is lower tri */);
2127   factors->khU.create_sptrsv_handle(sptrsv_alg, n, false /* U is not lower tri */);
2128   factors->khLt.create_sptrsv_handle(sptrsv_alg, n, false /* L^T is not lower tri */);
2129   factors->khUt.create_sptrsv_handle(sptrsv_alg, n, true /* U^T is lower tri */);
2130 
2131   /* Fill fields of the factor matrix B */
2132   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(B, MAT_SKIP_ALLOCATION, NULL));
2133   b     = (Mat_SeqAIJ *)B->data;
2134   b->nz = b->maxnz          = spiluk_handle->get_nnzL() + spiluk_handle->get_nnzU();
2135   B->info.fill_ratio_given  = info->fill;
2136   B->info.fill_ratio_needed = nnzA > 0 ? ((PetscReal)b->nz) / ((PetscReal)nnzA) : 1.0;
2137 
2138   B->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJKokkos_ILU0;
2139   PetscFunctionReturn(PETSC_SUCCESS);
2140 }
2141 
2142 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
2143 {
2144   PetscFunctionBegin;
2145   PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
2146   PetscCheck(!B->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr");
2147   PetscCallCXX(B->spptr = new Mat_SeqAIJKokkosTriFactors(B->rmap->n));
2148   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos;
2149   PetscFunctionReturn(PETSC_SUCCESS);
2150 }
2151 
2152 static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
2153 {
2154   PetscBool row_identity = PETSC_FALSE, col_identity = PETSC_FALSE;
2155 
2156   PetscFunctionBegin;
2157   if (!info->factoronhost) {
2158     PetscCall(ISIdentity(isrow, &row_identity));
2159     PetscCall(ISIdentity(iscol, &col_identity));
2160   }
2161 
2162   PetscCheck(!B->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr");
2163   PetscCallCXX(B->spptr = new Mat_SeqAIJKokkosTriFactors(B->rmap->n));
2164 
2165   if (!info->factoronhost && !info->levels && row_identity && col_identity) { // if level 0 and no reordering
2166     PetscCall(MatILUFactorSymbolic_SeqAIJKokkos_ILU0(B, A, isrow, iscol, info));
2167   } else {
2168     PetscCall(MatILUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info)); // otherwise, use PETSc's ILU on host
2169     B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos;
2170   }
2171   PetscFunctionReturn(PETSC_SUCCESS);
2172 }
2173 
2174 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info)
2175 {
2176   PetscFunctionBegin;
2177   PetscCall(MatSeqAIJKokkosSyncHost(A));
2178   PetscCall(MatCholeskyFactorNumeric_SeqAIJ(B, A, info));
2179 
2180   if (!info->solveonhost) { // if solve on host, then we don't need to copy L, U to device
2181     Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr;
2182     Mat_SeqAIJ                 *b       = static_cast<Mat_SeqAIJ *>(B->data);
2183     const PetscInt             *Bi = b->i, *Bj = b->j, *Bdiag = b->diag;
2184     const MatScalar            *Ba = b->a;
2185     PetscInt                    m  = B->rmap->n;
2186 
2187     if (factors->iU_h.extent(0) == 0) { // First time of numeric factorization
2188       // Allocate memory and copy the structure
2189       factors->iU_h = PetscIntKokkosViewHost(const_cast<PetscInt *>(Bi), m + 1); // wrap Bi as iU_h
2190       factors->jU_h = MatColIdxKokkosViewHost(NoInit("jU_h"), Bi[m]);
2191       factors->aU_h = MatScalarKokkosViewHost(NoInit("aU_h"), Bi[m]);
2192       factors->D_h  = MatScalarKokkosViewHost(NoInit("D_h"), m);
2193       factors->aU_d = Kokkos::create_mirror_view(DefaultMemorySpace(), factors->aU_h);
2194       factors->D_d  = Kokkos::create_mirror_view(DefaultMemorySpace(), factors->D_h);
2195 
2196       // Build jU_h from the skewed Aj
2197       PetscInt *Uj = factors->jU_h.data();
2198       for (PetscInt i = 0; i < m; i++) {
2199         PetscInt ulen = Bi[i + 1] - Bi[i];
2200         Uj[Bi[i]]     = i;                                              // diagonal entry
2201         PetscCall(PetscArraycpy(Uj + Bi[i] + 1, Bj + Bi[i], ulen - 1)); // entries of U on the right of the diagonal
2202       }
2203 
2204       // Copy iU, jU to device
2205       PetscCallCXX(factors->iU_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->iU_h));
2206       PetscCallCXX(factors->jU_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->jU_h));
2207 
2208       // Copy row/col permutation to device
2209       IS        rowperm = ((Mat_SeqAIJ *)B->data)->row;
2210       PetscBool row_identity;
2211       PetscCall(ISIdentity(rowperm, &row_identity));
2212       if (!row_identity) {
2213         const PetscInt *ip;
2214 
2215         PetscCall(ISGetIndices(rowperm, &ip));
2216         PetscCallCXX(factors->rowperm = PetscIntKokkosView(NoInit("rowperm"), m));
2217         PetscCallCXX(Kokkos::deep_copy(factors->rowperm, PetscIntKokkosViewHost(const_cast<PetscInt *>(ip), m)));
2218         PetscCall(ISRestoreIndices(rowperm, &ip));
2219         PetscCall(PetscLogCpuToGpu(m * sizeof(PetscInt)));
2220       }
2221 
2222       // Create sptrsv handles for U and U^T
2223 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
2224       auto sptrsv_alg = SPTRSVAlgorithm::SPTRSV_CUSPARSE;
2225 #else
2226       auto sptrsv_alg = SPTRSVAlgorithm::SEQLVLSCHD_TP1;
2227 #endif
2228       factors->khU.create_sptrsv_handle(sptrsv_alg, m, false /* U is not lower tri */);
2229       factors->khUt.create_sptrsv_handle(sptrsv_alg, m, true /* U^T is lower tri */);
2230     }
2231     // These pointers were set MatCholeskyFactorNumeric_SeqAIJ(), so we always need to update them
2232     B->ops->solve          = MatSolve_SeqAIJKokkos_Cholesky;
2233     B->ops->solvetranspose = MatSolve_SeqAIJKokkos_Cholesky;
2234 
2235     // Copy the value
2236     PetscScalar *Ua = factors->aU_h.data();
2237     PetscScalar *D  = factors->D_h.data();
2238     for (PetscInt i = 0; i < m; i++) {
2239       D[i]      = Ba[Bdiag[i]];     // actually Aa[Adiag[i]] is the inverse of the diagonal
2240       Ua[Bi[i]] = (PetscScalar)1.0; // set the unit diagonal for U
2241       for (PetscInt k = 0; k < Bi[i + 1] - Bi[i] - 1; k++) Ua[Bi[i] + 1 + k] = -Ba[Bi[i] + k];
2242     }
2243     PetscCallCXX(Kokkos::deep_copy(factors->aU_d, factors->aU_h));
2244     PetscCallCXX(Kokkos::deep_copy(factors->D_d, factors->D_h));
2245 
2246     factors->sptrsv_symbolic_completed = PETSC_FALSE; // When numeric value changed, we must do these again
2247     factors->transpose_updated         = PETSC_FALSE;
2248   }
2249 
2250   B->ops->matsolve          = NULL;
2251   B->ops->matsolvetranspose = NULL;
2252   PetscFunctionReturn(PETSC_SUCCESS);
2253 }
2254 
2255 static PetscErrorCode MatICCFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS perm, const MatFactorInfo *info)
2256 {
2257   PetscFunctionBegin;
2258   if (info->solveonhost) {
2259     // If solve on host, we have to change the type, as eventually we need to call MatSolve_SeqSBAIJ_1_NaturalOrdering() etc.
2260     PetscCall(MatSetType(B, MATSEQSBAIJ));
2261     PetscCall(MatSeqSBAIJSetPreallocation(B, 1, MAT_SKIP_ALLOCATION, NULL));
2262   }
2263 
2264   PetscCall(MatICCFactorSymbolic_SeqAIJ(B, A, perm, info));
2265 
2266   if (!info->solveonhost) {
2267     // If solve on device, B is still a MATSEQAIJKOKKOS, so we are good to allocate B->spptr
2268     PetscCheck(!B->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr");
2269     PetscCallCXX(B->spptr = new Mat_SeqAIJKokkosTriFactors(B->rmap->n));
2270     B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJKokkos;
2271   }
2272   PetscFunctionReturn(PETSC_SUCCESS);
2273 }
2274 
2275 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS perm, const MatFactorInfo *info)
2276 {
2277   PetscFunctionBegin;
2278   if (info->solveonhost) {
2279     // If solve on host, we have to change the type, as eventually we need to call MatSolve_SeqSBAIJ_1_NaturalOrdering() etc.
2280     PetscCall(MatSetType(B, MATSEQSBAIJ));
2281     PetscCall(MatSeqSBAIJSetPreallocation(B, 1, MAT_SKIP_ALLOCATION, NULL));
2282   }
2283 
2284   PetscCall(MatCholeskyFactorSymbolic_SeqAIJ(B, A, perm, info)); // it sets B's two ISes ((Mat_SeqAIJ*)B->data)->{row, col} to perm
2285 
2286   if (!info->solveonhost) {
2287     // If solve on device, B is still a MATSEQAIJKOKKOS, so we are good to allocate B->spptr
2288     PetscCheck(!B->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr");
2289     PetscCallCXX(B->spptr = new Mat_SeqAIJKokkosTriFactors(B->rmap->n));
2290     B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJKokkos;
2291   }
2292   PetscFunctionReturn(PETSC_SUCCESS);
2293 }
2294 
2295 // The _Kokkos suffix means we will use Kokkos as a solver for the SeqAIJKokkos matrix
2296 static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos_Kokkos(Mat A, MatSolverType *type)
2297 {
2298   PetscFunctionBegin;
2299   *type = MATSOLVERKOKKOS;
2300   PetscFunctionReturn(PETSC_SUCCESS);
2301 }
2302 
2303 /*MC
2304   MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices
2305   on a single GPU of type, `MATSEQAIJKOKKOS`, `MATAIJKOKKOS`.
2306 
2307   Level: beginner
2308 
2309 .seealso: [](ch_matrices), `Mat`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJKokkos()`, `MATAIJKOKKOS`
2310 M*/
2311 PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A, MatFactorType ftype, Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */
2312 {
2313   PetscInt n = A->rmap->n;
2314   MPI_Comm comm;
2315 
2316   PetscFunctionBegin;
2317   PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
2318   PetscCall(MatCreate(comm, B));
2319   PetscCall(MatSetSizes(*B, n, n, n, n));
2320   PetscCall(MatSetBlockSizesFromMats(*B, A, A));
2321   (*B)->factortype = ftype;
2322   PetscCall(MatSetType(*B, MATSEQAIJKOKKOS));
2323   PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL));
2324   PetscCheck(!(*B)->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr");
2325 
2326   if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
2327     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJKokkos;
2328     (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos;
2329     PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU]));
2330     PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILU]));
2331     PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILUDT]));
2332   } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
2333     (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJKokkos;
2334     (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJKokkos;
2335     PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]));
2336     PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ICC]));
2337   } else SETERRQ(comm, PETSC_ERR_SUP, "MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]);
2338 
2339   // The factorization can use the ordering provided in MatLUFactorSymbolic(), MatCholeskyFactorSymbolic() etc, though we do it on host
2340   (*B)->canuseordering = PETSC_TRUE;
2341   PetscCall(PetscObjectComposeFunction((PetscObject)*B, "MatFactorGetSolverType_C", MatFactorGetSolverType_SeqAIJKokkos_Kokkos));
2342   PetscFunctionReturn(PETSC_SUCCESS);
2343 }
2344 
2345 PETSC_INTERN PetscErrorCode MatSolverTypeRegister_Kokkos(void)
2346 {
2347   PetscFunctionBegin;
2348   PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_LU, MatGetFactor_SeqAIJKokkos_Kokkos));
2349   PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_CHOLESKY, MatGetFactor_SeqAIJKokkos_Kokkos));
2350   PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_ILU, MatGetFactor_SeqAIJKokkos_Kokkos));
2351   PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_ICC, MatGetFactor_SeqAIJKokkos_Kokkos));
2352   PetscFunctionReturn(PETSC_SUCCESS);
2353 }
2354 
2355 /* Utility to print out a KokkosCsrMatrix for debugging */
2356 PETSC_INTERN PetscErrorCode PrintCsrMatrix(const KokkosCsrMatrix &csrmat)
2357 {
2358   const auto        &iv = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), csrmat.graph.row_map);
2359   const auto        &jv = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), csrmat.graph.entries);
2360   const auto        &av = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), csrmat.values);
2361   const PetscInt    *i  = iv.data();
2362   const PetscInt    *j  = jv.data();
2363   const PetscScalar *a  = av.data();
2364   PetscInt           m = csrmat.numRows(), n = csrmat.numCols(), nnz = csrmat.nnz();
2365 
2366   PetscFunctionBegin;
2367   PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT " x %" PetscInt_FMT " SeqAIJKokkos, with %" PetscInt_FMT " nonzeros\n", m, n, nnz));
2368   for (PetscInt k = 0; k < m; k++) {
2369     PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT ": ", k));
2370     for (PetscInt p = i[k]; p < i[k + 1]; p++) PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT "(%.1f), ", j[p], (double)PetscRealPart(a[p])));
2371     PetscCall(PetscPrintf(PETSC_COMM_SELF, "\n"));
2372   }
2373   PetscFunctionReturn(PETSC_SUCCESS);
2374 }
2375