xref: /petsc/src/mat/impls/aij/seq/kokkos/aijkok.kokkos.cxx (revision ac84dfd5778759083efa0c46d3820bac8a11500e)
1 #include <petsc_kokkos.hpp>
2 #include <petscvec_kokkos.hpp>
3 #include <petscmat_kokkos.hpp>
4 #include <petscpkg_version.h>
5 #include <petsc/private/petscimpl.h>
6 #include <petsc/private/sfimpl.h>
7 #include <petsc/private/kokkosimpl.hpp>
8 #include <petscsys.h>
9 
10 #include <Kokkos_Core.hpp>
11 #include <KokkosBlas.hpp>
12 #include <KokkosSparse_CrsMatrix.hpp>
13 
14 // To suppress compiler warnings:
15 // /path/include/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp:434:63:
16 // warning: 'cusparseStatus_t cusparseDbsrmm(cusparseHandle_t, cusparseDirection_t, cusparseOperation_t,
17 // cusparseOperation_t, int, int, int, int, const double*, cusparseMatDescr_t, const double*, const int*, const int*,
18 // int, const double*, int, const double*, double*, int)' is deprecated: please use cusparseSpMM instead [-Wdeprecated-declarations]
19 PETSC_PRAGMA_DIAGNOSTIC_IGNORED_BEGIN("-Wdeprecated-declarations")
20 #include <KokkosSparse_spmv.hpp>
21 PETSC_PRAGMA_DIAGNOSTIC_IGNORED_END()
22 
23 #include <KokkosSparse_spiluk.hpp>
24 #include <KokkosSparse_sptrsv.hpp>
25 #include <KokkosSparse_spgemm.hpp>
26 #include <KokkosSparse_spadd.hpp>
27 #include <KokkosBatched_LU_Decl.hpp>
28 #include <KokkosBatched_InverseLU_Decl.hpp>
29 
30 #include <../src/mat/impls/aij/seq/kokkos/aijkok.hpp>
31 
32 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_GE(3, 7, 0)
33   #include <KokkosSparse_Utils.hpp>
34 using KokkosSparse::sort_crs_matrix;
35 using KokkosSparse::Impl::transpose_matrix;
36 #else
37   #include <KokkosKernels_Sorting.hpp>
38 using KokkosKernels::sort_crs_matrix;
39 using KokkosKernels::Impl::transpose_matrix;
40 #endif
41 
42 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_GE(4, 6, 0)
43 using KokkosSparse::spiluk_symbolic;
44 using KokkosSparse::spiluk_numeric;
45 using KokkosSparse::sptrsv_symbolic;
46 using KokkosSparse::sptrsv_solve;
47 using KokkosSparse::Experimental::SPTRSVAlgorithm;
48 using KokkosSparse::Experimental::SPILUKAlgorithm;
49 #else
50 using KokkosSparse::Experimental::spiluk_symbolic;
51 using KokkosSparse::Experimental::spiluk_numeric;
52 using KokkosSparse::Experimental::sptrsv_symbolic;
53 using KokkosSparse::Experimental::sptrsv_solve;
54 using KokkosSparse::Experimental::SPTRSVAlgorithm;
55 using KokkosSparse::Experimental::SPILUKAlgorithm;
56 #endif
57 
58 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */
59 
60 /* MatAssemblyEnd_SeqAIJKokkos() happens when we finalized nonzeros of the matrix, either after
61    we assembled the matrix on host, or after we directly produced the matrix data on device (ex., through MatMatMult).
62    In the latter case, it is important to set a_dual's sync state correctly.
63  */
64 static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A, MatAssemblyType mode)
65 {
66   Mat_SeqAIJ       *aijseq;
67   Mat_SeqAIJKokkos *aijkok;
68 
69   PetscFunctionBegin;
70   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS);
71   PetscCall(MatAssemblyEnd_SeqAIJ(A, mode));
72 
73   aijseq = static_cast<Mat_SeqAIJ *>(A->data);
74   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
75 
76   /* If aijkok does not exist, we just copy i, j to device.
77      If aijkok already exists, but the device's nonzero pattern does not match with the host's, we assume the latest data is on host.
78      In both cases, we build a new aijkok structure.
79   */
80   if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { /* aijkok might not exist yet or nonzero pattern has changed */
81     if (aijkok && aijkok->host_aij_allocated_by_kokkos) {   /* Avoid accidentally freeing much needed a,i,j on host when deleting aijkok */
82       PetscCall(PetscShmgetAllocateArray(aijkok->nrows() + 1, sizeof(PetscInt), (void **)&aijseq->i));
83       PetscCall(PetscShmgetAllocateArray(aijkok->nnz(), sizeof(PetscInt), (void **)&aijseq->j));
84       PetscCall(PetscShmgetAllocateArray(aijkok->nnz(), sizeof(PetscInt), (void **)&aijseq->a));
85       PetscCall(PetscArraycpy(aijseq->i, aijkok->i_host_data(), aijkok->nrows() + 1));
86       PetscCall(PetscArraycpy(aijseq->j, aijkok->j_host_data(), aijkok->nnz()));
87       PetscCall(PetscArraycpy(aijseq->a, aijkok->a_host_data(), aijkok->nnz()));
88       aijseq->free_a  = PETSC_TRUE;
89       aijseq->free_ij = PETSC_TRUE;
90       /* This arises from MatCreateSeqAIJKokkosWithKokkosCsrMatrix() used in MatMatMult, where
91          we have the CsrMatrix on device first and then copy to host, followed by
92          MatSetMPIAIJWithSplitSeqAIJ() with garray = NULL.
93          One could improve it by not using NULL garray.
94       */
95     }
96     delete aijkok;
97     aijkok   = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aijseq, A->nonzerostate, PETSC_FALSE /*don't copy mat values to device*/);
98     A->spptr = aijkok;
99   } else if (A->rmap->n && aijkok->diag_dual.extent(0) == 0) { // MatProduct might directly produce AIJ on device, but not the diag.
100     MatRowMapKokkosViewHost diag_h(aijseq->diag, A->rmap->n);
101     auto                    diag_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), diag_h);
102     aijkok->diag_dual              = MatRowMapKokkosDualView(diag_d, diag_h);
103   }
104   PetscFunctionReturn(PETSC_SUCCESS);
105 }
106 
107 /* Sync CSR data to device if not yet */
108 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A)
109 {
110   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
111 
112   PetscFunctionBegin;
113   PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Can't sync factorized matrix from host to device");
114   PetscCheck(aijkok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
115   if (aijkok->a_dual.need_sync_device()) {
116     PetscCall(KokkosDualViewSyncDevice(aijkok->a_dual, PetscGetKokkosExecutionSpace()));
117     aijkok->transpose_updated = PETSC_FALSE; /* values of the transpose is out-of-date */
118     aijkok->hermitian_updated = PETSC_FALSE;
119   }
120   PetscFunctionReturn(PETSC_SUCCESS);
121 }
122 
123 /* Mark the CSR data on device as modified */
124 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosModifyDevice(Mat A)
125 {
126   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
127 
128   PetscFunctionBegin;
129   PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Not supported for factorized matries");
130   aijkok->a_dual.clear_sync_state();
131   aijkok->a_dual.modify_device();
132   aijkok->transpose_updated = PETSC_FALSE;
133   aijkok->hermitian_updated = PETSC_FALSE;
134   PetscCall(MatSeqAIJInvalidateDiagonal(A));
135   PetscCall(PetscObjectStateIncrease((PetscObject)A));
136   PetscFunctionReturn(PETSC_SUCCESS);
137 }
138 
139 static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A)
140 {
141   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
142   auto              exec   = PetscGetKokkosExecutionSpace();
143 
144   PetscFunctionBegin;
145   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
146   /* We do not expect one needs factors on host  */
147   PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Can't sync factorized matrix from device to host");
148   PetscCheck(aijkok, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing AIJKOK");
149   PetscCall(KokkosDualViewSyncHost(aijkok->a_dual, exec));
150   PetscFunctionReturn(PETSC_SUCCESS);
151 }
152 
153 static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A, PetscScalar *array[])
154 {
155   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
156 
157   PetscFunctionBegin;
158   /* aijkok contains valid pointers only if the host's nonzerostate matches with the device's.
159     Calling MatSeqAIJSetPreallocation() or MatSetValues() on host, where aijseq->{i,j,a} might be
160     reallocated, will lead to stale {i,j,a}_dual in aijkok. In both operations, the hosts's nonzerostate
161     must have been updated. The stale aijkok will be rebuilt during MatAssemblyEnd.
162   */
163   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
164     PetscCall(KokkosDualViewSyncHost(aijkok->a_dual, PetscGetKokkosExecutionSpace()));
165     *array = aijkok->a_dual.view_host().data();
166   } else { /* Happens when calling MatSetValues on a newly created matrix */
167     *array = static_cast<Mat_SeqAIJ *>(A->data)->a;
168   }
169   PetscFunctionReturn(PETSC_SUCCESS);
170 }
171 
172 static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJKokkos(Mat A, PetscScalar *array[])
173 {
174   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
175 
176   PetscFunctionBegin;
177   if (aijkok && A->nonzerostate == aijkok->nonzerostate) aijkok->a_dual.modify_host();
178   PetscFunctionReturn(PETSC_SUCCESS);
179 }
180 
181 static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[])
182 {
183   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
184 
185   PetscFunctionBegin;
186   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
187     PetscCall(KokkosDualViewSyncHost(aijkok->a_dual, PetscGetKokkosExecutionSpace()));
188     *array = aijkok->a_dual.view_host().data();
189   } else {
190     *array = static_cast<Mat_SeqAIJ *>(A->data)->a;
191   }
192   PetscFunctionReturn(PETSC_SUCCESS);
193 }
194 
195 static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[])
196 {
197   PetscFunctionBegin;
198   *array = NULL;
199   PetscFunctionReturn(PETSC_SUCCESS);
200 }
201 
202 static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[])
203 {
204   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
205 
206   PetscFunctionBegin;
207   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
208     *array = aijkok->a_dual.view_host().data();
209   } else { /* Ex. happens with MatZeroEntries on a preallocated but not assembled matrix */
210     *array = static_cast<Mat_SeqAIJ *>(A->data)->a;
211   }
212   PetscFunctionReturn(PETSC_SUCCESS);
213 }
214 
215 static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[])
216 {
217   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
218 
219   PetscFunctionBegin;
220   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
221     aijkok->a_dual.clear_sync_state();
222     aijkok->a_dual.modify_host();
223   }
224   PetscFunctionReturn(PETSC_SUCCESS);
225 }
226 
227 static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJKokkos(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype)
228 {
229   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
230 
231   PetscFunctionBegin;
232   PetscCheck(aijkok != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "aijkok is NULL");
233 
234   if (i) *i = aijkok->i_device_data();
235   if (j) *j = aijkok->j_device_data();
236   if (a) {
237     PetscCall(KokkosDualViewSyncDevice(aijkok->a_dual, PetscGetKokkosExecutionSpace()));
238     *a = aijkok->a_device_data();
239   }
240   if (mtype) *mtype = PETSC_MEMTYPE_KOKKOS;
241   PetscFunctionReturn(PETSC_SUCCESS);
242 }
243 
244 /*
245   Generate the sparsity pattern of a MatSeqAIJKokkos matrix's transpose on device.
246 
247   Input Parameter:
248 .  A       - the MATSEQAIJKOKKOS matrix
249 
250   Output Parameters:
251 +  perm_d - the permutation array on device, which connects Ta(i) = Aa(perm(i))
252 -  T_d    - the transpose on device, whose value array is allocated but not initialized
253 */
254 static PetscErrorCode MatSeqAIJKokkosGenerateTransposeStructure(Mat A, MatRowMapKokkosView &perm_d, KokkosCsrMatrix &T_d)
255 {
256   Mat_SeqAIJ             *aseq = static_cast<Mat_SeqAIJ *>(A->data);
257   PetscInt                nz = aseq->nz, m = A->rmap->N, n = A->cmap->n;
258   const PetscInt         *Ai = aseq->i, *Aj = aseq->j;
259   MatRowMapKokkosViewHost Ti_h(NoInit("Ti"), n + 1);
260   MatRowMapType          *Ti = Ti_h.data();
261   MatColIdxKokkosViewHost Tj_h(NoInit("Tj"), nz);
262   MatRowMapKokkosViewHost perm_h(NoInit("permutation"), nz);
263   PetscInt               *Tj   = Tj_h.data();
264   PetscInt               *perm = perm_h.data();
265   PetscInt               *offset;
266 
267   PetscFunctionBegin;
268   // Populate Ti
269   PetscCallCXX(Kokkos::deep_copy(Ti_h, 0));
270   Ti++;
271   for (PetscInt i = 0; i < nz; i++) Ti[Aj[i]]++;
272   Ti--;
273   for (PetscInt i = 0; i < n; i++) Ti[i + 1] += Ti[i];
274 
275   // Populate Tj and the permutation array
276   PetscCall(PetscCalloc1(n, &offset)); // offset in each T row to fill in its column indices
277   for (PetscInt i = 0; i < m; i++) {
278     for (PetscInt j = Ai[i]; j < Ai[i + 1]; j++) { // A's (i,j) is T's (j,i)
279       PetscInt r    = Aj[j];                       // row r of T
280       PetscInt disp = Ti[r] + offset[r];
281 
282       Tj[disp]   = i; // col i of T
283       perm[disp] = j;
284       offset[r]++;
285     }
286   }
287   PetscCall(PetscFree(offset));
288 
289   // Sort each row of T, along with the permutation array
290   for (PetscInt i = 0; i < n; i++) PetscCall(PetscSortIntWithArray(Ti[i + 1] - Ti[i], Tj + Ti[i], perm + Ti[i]));
291 
292   // Output perm and T on device
293   auto Ti_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), Ti_h);
294   auto Tj_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), Tj_h);
295   PetscCallCXX(T_d = KokkosCsrMatrix("csrmatT", n, m, nz, MatScalarKokkosView("Ta", nz), Ti_d, Tj_d));
296   PetscCallCXX(perm_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), perm_h));
297   PetscFunctionReturn(PETSC_SUCCESS);
298 }
299 
300 // Generate the transpose on device and cache it internally
301 // Note: KK transpose_matrix() does not have support symbolic/numeric transpose, so we do it on our own
302 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGenerateTranspose_Private(Mat A, KokkosCsrMatrix *csrmatT)
303 {
304   Mat_SeqAIJ       *aseq = static_cast<Mat_SeqAIJ *>(A->data);
305   Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
306   PetscInt          nz = aseq->nz, m = A->rmap->N, n = A->cmap->n;
307   KokkosCsrMatrix  &T = akok->csrmatT;
308 
309   PetscFunctionBegin;
310   PetscCheck(akok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
311   PetscCall(KokkosDualViewSyncDevice(akok->a_dual, PetscGetKokkosExecutionSpace())); // Sync A's values since we are going to access them on device
312 
313   const auto &Aa = akok->a_dual.view_device();
314 
315   if (A->symmetric == PETSC_BOOL3_TRUE) {
316     *csrmatT = akok->csrmat;
317   } else {
318     // See if we already have a cached transpose and its value is up to date
319     if (T.numRows() == n && T.numCols() == m) {  // this indicates csrmatT had been generated before, otherwise T has 0 rows/cols after construction
320       if (!akok->transpose_updated) {            // if the value is out of date, update the cached version
321         const auto &perm = akok->transpose_perm; // get the permutation array
322         auto       &Ta   = T.values;
323 
324         PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { Ta(i) = Aa(perm(i)); }));
325       }
326     } else { // Generate T of size n x m for the first time
327       MatRowMapKokkosView perm;
328 
329       PetscCall(MatSeqAIJKokkosGenerateTransposeStructure(A, perm, T));
330       akok->transpose_perm = perm; // cache the perm in this matrix for reuse
331       PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { T.values(i) = Aa(perm(i)); }));
332     }
333     akok->transpose_updated = PETSC_TRUE;
334     *csrmatT                = akok->csrmatT;
335   }
336   PetscFunctionReturn(PETSC_SUCCESS);
337 }
338 
339 // Generate the Hermitian on device and cache it internally
340 static PetscErrorCode MatSeqAIJKokkosGenerateHermitian_Private(Mat A, KokkosCsrMatrix *csrmatH)
341 {
342   Mat_SeqAIJ       *aseq = static_cast<Mat_SeqAIJ *>(A->data);
343   Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
344   PetscInt          nz = aseq->nz, m = A->rmap->N, n = A->cmap->n;
345   KokkosCsrMatrix  &T = akok->csrmatH;
346 
347   PetscFunctionBegin;
348   PetscCheck(akok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
349   PetscCall(KokkosDualViewSyncDevice(akok->a_dual, PetscGetKokkosExecutionSpace())); // Sync A's values since we are going to access them on device
350 
351   const auto &Aa = akok->a_dual.view_device();
352 
353   if (A->hermitian == PETSC_BOOL3_TRUE) {
354     *csrmatH = akok->csrmat;
355   } else {
356     // See if we already have a cached hermitian and its value is up to date
357     if (T.numRows() == n && T.numCols() == m) {  // this indicates csrmatT had been generated before, otherwise T has 0 rows/cols after construction
358       if (!akok->hermitian_updated) {            // if the value is out of date, update the cached version
359         const auto &perm = akok->transpose_perm; // get the permutation array
360         auto       &Ta   = T.values;
361 
362         PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { Ta(i) = PetscConj(Aa(perm(i))); }));
363       }
364     } else { // Generate T of size n x m for the first time
365       MatRowMapKokkosView perm;
366 
367       PetscCall(MatSeqAIJKokkosGenerateTransposeStructure(A, perm, T));
368       akok->transpose_perm = perm; // cache the perm in this matrix for reuse
369       PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { T.values(i) = PetscConj(Aa(perm(i))); }));
370     }
371     akok->hermitian_updated = PETSC_TRUE;
372     *csrmatH                = akok->csrmatH;
373   }
374   PetscFunctionReturn(PETSC_SUCCESS);
375 }
376 
377 /* y = A x */
378 static PetscErrorCode MatMult_SeqAIJKokkos(Mat A, Vec xx, Vec yy)
379 {
380   Mat_SeqAIJKokkos          *aijkok;
381   ConstPetscScalarKokkosView xv;
382   PetscScalarKokkosView      yv;
383 
384   PetscFunctionBegin;
385   PetscCall(PetscLogGpuTimeBegin());
386   PetscCall(MatSeqAIJKokkosSyncDevice(A));
387   PetscCall(VecGetKokkosView(xx, &xv));
388   PetscCall(VecGetKokkosViewWrite(yy, &yv));
389   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
390   PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), "N", 1.0 /*alpha*/, aijkok->csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A x + beta y */
391   PetscCall(VecRestoreKokkosView(xx, &xv));
392   PetscCall(VecRestoreKokkosViewWrite(yy, &yv));
393   /* 2.0*nnz - numRows seems more accurate here but assumes there are no zero-rows. So a little sloppy here. */
394   PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz()));
395   PetscCall(PetscLogGpuTimeEnd());
396   PetscFunctionReturn(PETSC_SUCCESS);
397 }
398 
399 /* y = A^T x */
400 static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy)
401 {
402   Mat_SeqAIJKokkos          *aijkok;
403   const char                *mode;
404   ConstPetscScalarKokkosView xv;
405   PetscScalarKokkosView      yv;
406   KokkosCsrMatrix            csrmat;
407 
408   PetscFunctionBegin;
409   PetscCall(PetscLogGpuTimeBegin());
410   PetscCall(MatSeqAIJKokkosSyncDevice(A));
411   PetscCall(VecGetKokkosView(xx, &xv));
412   PetscCall(VecGetKokkosViewWrite(yy, &yv));
413   if (A->form_explicit_transpose) {
414     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat));
415     mode = "N";
416   } else {
417     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
418     csrmat = aijkok->csrmat;
419     mode   = "T";
420   }
421   PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^T x + beta y */
422   PetscCall(VecRestoreKokkosView(xx, &xv));
423   PetscCall(VecRestoreKokkosViewWrite(yy, &yv));
424   PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
425   PetscCall(PetscLogGpuTimeEnd());
426   PetscFunctionReturn(PETSC_SUCCESS);
427 }
428 
429 /* y = A^H x */
430 static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy)
431 {
432   Mat_SeqAIJKokkos          *aijkok;
433   const char                *mode;
434   ConstPetscScalarKokkosView xv;
435   PetscScalarKokkosView      yv;
436   KokkosCsrMatrix            csrmat;
437 
438   PetscFunctionBegin;
439   PetscCall(PetscLogGpuTimeBegin());
440   PetscCall(MatSeqAIJKokkosSyncDevice(A));
441   PetscCall(VecGetKokkosView(xx, &xv));
442   PetscCall(VecGetKokkosViewWrite(yy, &yv));
443   if (A->form_explicit_transpose) {
444     PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat));
445     mode = "N";
446   } else {
447     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
448     csrmat = aijkok->csrmat;
449     mode   = "C";
450   }
451   PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^H x + beta y */
452   PetscCall(VecRestoreKokkosView(xx, &xv));
453   PetscCall(VecRestoreKokkosViewWrite(yy, &yv));
454   PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
455   PetscCall(PetscLogGpuTimeEnd());
456   PetscFunctionReturn(PETSC_SUCCESS);
457 }
458 
459 /* z = A x + y */
460 static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz)
461 {
462   Mat_SeqAIJKokkos          *aijkok;
463   ConstPetscScalarKokkosView xv;
464   PetscScalarKokkosView      zv;
465 
466   PetscFunctionBegin;
467   PetscCall(PetscLogGpuTimeBegin());
468   PetscCall(MatSeqAIJKokkosSyncDevice(A));
469   if (zz != yy) PetscCall(VecCopy(yy, zz)); // depending on yy's sync flags, zz might get its latest data on host
470   PetscCall(VecGetKokkosView(xx, &xv));
471   PetscCall(VecGetKokkosView(zz, &zv)); // do after VecCopy(yy, zz) to get the latest data on device
472   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
473   PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), "N", 1.0 /*alpha*/, aijkok->csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A x + beta z */
474   PetscCall(VecRestoreKokkosView(xx, &xv));
475   PetscCall(VecRestoreKokkosView(zz, &zv));
476   PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz()));
477   PetscCall(PetscLogGpuTimeEnd());
478   PetscFunctionReturn(PETSC_SUCCESS);
479 }
480 
481 /* z = A^T x + y */
482 static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz)
483 {
484   Mat_SeqAIJKokkos          *aijkok;
485   const char                *mode;
486   ConstPetscScalarKokkosView xv;
487   PetscScalarKokkosView      zv;
488   KokkosCsrMatrix            csrmat;
489 
490   PetscFunctionBegin;
491   PetscCall(PetscLogGpuTimeBegin());
492   PetscCall(MatSeqAIJKokkosSyncDevice(A));
493   if (zz != yy) PetscCall(VecCopy(yy, zz));
494   PetscCall(VecGetKokkosView(xx, &xv));
495   PetscCall(VecGetKokkosView(zz, &zv));
496   if (A->form_explicit_transpose) {
497     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat));
498     mode = "N";
499   } else {
500     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
501     csrmat = aijkok->csrmat;
502     mode   = "T";
503   }
504   PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^T x + beta z */
505   PetscCall(VecRestoreKokkosView(xx, &xv));
506   PetscCall(VecRestoreKokkosView(zz, &zv));
507   PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
508   PetscCall(PetscLogGpuTimeEnd());
509   PetscFunctionReturn(PETSC_SUCCESS);
510 }
511 
512 /* z = A^H x + y */
513 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz)
514 {
515   Mat_SeqAIJKokkos          *aijkok;
516   const char                *mode;
517   ConstPetscScalarKokkosView xv;
518   PetscScalarKokkosView      zv;
519   KokkosCsrMatrix            csrmat;
520 
521   PetscFunctionBegin;
522   PetscCall(PetscLogGpuTimeBegin());
523   PetscCall(MatSeqAIJKokkosSyncDevice(A));
524   if (zz != yy) PetscCall(VecCopy(yy, zz));
525   PetscCall(VecGetKokkosView(xx, &xv));
526   PetscCall(VecGetKokkosView(zz, &zv));
527   if (A->form_explicit_transpose) {
528     PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat));
529     mode = "N";
530   } else {
531     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
532     csrmat = aijkok->csrmat;
533     mode   = "C";
534   }
535   PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^H x + beta z */
536   PetscCall(VecRestoreKokkosView(xx, &xv));
537   PetscCall(VecRestoreKokkosView(zz, &zv));
538   PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
539   PetscCall(PetscLogGpuTimeEnd());
540   PetscFunctionReturn(PETSC_SUCCESS);
541 }
542 
543 static PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A, MatOption op, PetscBool flg)
544 {
545   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
546 
547   PetscFunctionBegin;
548   switch (op) {
549   case MAT_FORM_EXPLICIT_TRANSPOSE:
550     /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
551     if (A->form_explicit_transpose && !flg && aijkok) PetscCall(aijkok->DestroyMatTranspose());
552     A->form_explicit_transpose = flg;
553     break;
554   default:
555     PetscCall(MatSetOption_SeqAIJ(A, op, flg));
556     break;
557   }
558   PetscFunctionReturn(PETSC_SUCCESS);
559 }
560 
561 /* Depending on reuse, either build a new mat, or use the existing mat */
562 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat *newmat)
563 {
564   Mat_SeqAIJ *aseq;
565 
566   PetscFunctionBegin;
567   PetscCall(PetscKokkosInitializeCheck());
568   if (reuse == MAT_INITIAL_MATRIX) { /* Build a brand new mat */
569     PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat));
570     PetscCall(MatSetType(*newmat, mtype));
571   } else if (reuse == MAT_REUSE_MATRIX) {                 /* Reuse the mat created before */
572     PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN)); /* newmat is already a SeqAIJKokkos */
573   } else if (reuse == MAT_INPLACE_MATRIX) {               /* newmat is A */
574     PetscCheck(A == *newmat, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "A != *newmat with MAT_INPLACE_MATRIX");
575     PetscCall(PetscFree(A->defaultvectype));
576     PetscCall(PetscStrallocpy(VECKOKKOS, &A->defaultvectype)); /* Allocate and copy the string */
577     PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATSEQAIJKOKKOS));
578     PetscCall(MatSetOps_SeqAIJKokkos(A));
579     aseq = static_cast<Mat_SeqAIJ *>(A->data);
580     if (A->assembled) { /* Copy i, j (but not values) to device for an assembled matrix if not yet */
581       PetscCheck(!A->spptr, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Expect NULL (Mat_SeqAIJKokkos*)A->spptr");
582       A->spptr = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aseq, A->nonzerostate, PETSC_FALSE);
583     }
584   }
585   PetscFunctionReturn(PETSC_SUCCESS);
586 }
587 
588 /* MatDuplicate always creates a new matrix. MatDuplicate can be called either on an assembled matrix or
589    an unassembled matrix, even though MAT_COPY_VALUES is not allowed for unassembled matrix.
590  */
591 static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A, MatDuplicateOption dupOption, Mat *B)
592 {
593   Mat_SeqAIJ       *bseq;
594   Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr), *bkok;
595   Mat               mat;
596 
597   PetscFunctionBegin;
598   /* Do not copy values on host as A's latest values might be on device. We don't want to do sync blindly */
599   PetscCall(MatDuplicate_SeqAIJ(A, MAT_DO_NOT_COPY_VALUES, B));
600   mat = *B;
601   if (A->assembled) {
602     bseq = static_cast<Mat_SeqAIJ *>(mat->data);
603     bkok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, bseq, mat->nonzerostate, PETSC_FALSE);
604     bkok->a_dual.clear_sync_state(); /* Clear B's sync state as it will be decided below */
605     /* Now copy values to B if needed */
606     if (dupOption == MAT_COPY_VALUES) {
607       if (akok->a_dual.need_sync_device()) {
608         Kokkos::deep_copy(bkok->a_dual.view_host(), akok->a_dual.view_host());
609         bkok->a_dual.modify_host();
610       } else { /* If device has the latest data, we only copy data on device */
611         Kokkos::deep_copy(bkok->a_dual.view_device(), akok->a_dual.view_device());
612         bkok->a_dual.modify_device();
613       }
614     } else { /* MAT_DO_NOT_COPY_VALUES or MAT_SHARE_NONZERO_PATTERN. B's values should be zeroed */
615       /* B's values on host should be already zeroed by MatDuplicate_SeqAIJ() */
616       bkok->a_dual.modify_host();
617     }
618     mat->spptr = bkok;
619   }
620 
621   PetscCall(PetscFree(mat->defaultvectype));
622   PetscCall(PetscStrallocpy(VECKOKKOS, &mat->defaultvectype)); /* Allocate and copy the string */
623   PetscCall(PetscObjectChangeTypeName((PetscObject)mat, MATSEQAIJKOKKOS));
624   PetscCall(MatSetOps_SeqAIJKokkos(mat));
625   PetscFunctionReturn(PETSC_SUCCESS);
626 }
627 
628 static PetscErrorCode MatTranspose_SeqAIJKokkos(Mat A, MatReuse reuse, Mat *B)
629 {
630   Mat               At;
631   KokkosCsrMatrix   internT;
632   Mat_SeqAIJKokkos *atkok, *bkok;
633 
634   PetscFunctionBegin;
635   if (reuse == MAT_REUSE_MATRIX) PetscCall(MatTransposeCheckNonzeroState_Private(A, *B));
636   PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &internT)); /* Generate a transpose internally */
637   if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_INPLACE_MATRIX) {
638     /* Deep copy internT, as we want to isolate the internal transpose */
639     PetscCallCXX(atkok = new Mat_SeqAIJKokkos(KokkosCsrMatrix("csrmat", internT)));
640     PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PetscObjectComm((PetscObject)A), atkok, &At));
641     if (reuse == MAT_INITIAL_MATRIX) *B = At;
642     else PetscCall(MatHeaderReplace(A, &At)); /* Replace A with At inplace */
643   } else {                                    /* MAT_REUSE_MATRIX, just need to copy values to B on device */
644     if ((*B)->assembled) {
645       bkok = static_cast<Mat_SeqAIJKokkos *>((*B)->spptr);
646       PetscCallCXX(Kokkos::deep_copy(bkok->a_dual.view_device(), internT.values));
647       PetscCall(MatSeqAIJKokkosModifyDevice(*B));
648     } else if ((*B)->preallocated) { /* It is ok for B to be only preallocated, as needed in MatTranspose_MPIAIJ */
649       Mat_SeqAIJ             *bseq = static_cast<Mat_SeqAIJ *>((*B)->data);
650       MatScalarKokkosViewHost a_h(bseq->a, internT.nnz()); /* bseq->nz = 0 if unassembled */
651       MatColIdxKokkosViewHost j_h(bseq->j, internT.nnz());
652       PetscCallCXX(Kokkos::deep_copy(a_h, internT.values));
653       PetscCallCXX(Kokkos::deep_copy(j_h, internT.graph.entries));
654     } else SETERRQ(PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "B must be assembled or preallocated");
655   }
656   PetscFunctionReturn(PETSC_SUCCESS);
657 }
658 
659 static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A)
660 {
661   Mat_SeqAIJKokkos *aijkok;
662 
663   PetscFunctionBegin;
664   if (A->factortype == MAT_FACTOR_NONE) {
665     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
666     delete aijkok;
667   } else {
668     delete static_cast<Mat_SeqAIJKokkosTriFactors *>(A->spptr);
669   }
670   A->spptr = NULL;
671   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatFactorGetSolverType_C", NULL));
672   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
673   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
674 #if defined(PETSC_HAVE_HYPRE)
675   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijkokkos_hypre_C", NULL));
676 #endif
677   PetscCall(MatDestroy_SeqAIJ(A));
678   PetscFunctionReturn(PETSC_SUCCESS);
679 }
680 
681 /*MC
682    MATSEQAIJKOKKOS - MATAIJKOKKOS = "(seq)aijkokkos" - A matrix type to be used for sparse matrices with Kokkos
683 
684    A matrix type using Kokkos-Kernels CrsMatrix type for portability across different device types
685 
686    Options Database Key:
687 .  -mat_type aijkokkos - sets the matrix type to `MATSEQAIJKOKKOS` during a call to `MatSetFromOptions()`
688 
689   Level: beginner
690 
691 .seealso: [](ch_matrices), `Mat`, `MatCreateSeqAIJKokkos()`, `MATMPIAIJKOKKOS`
692 M*/
693 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A)
694 {
695   PetscFunctionBegin;
696   PetscCall(PetscKokkosInitializeCheck());
697   PetscCall(MatCreate_SeqAIJ(A));
698   PetscCall(MatConvert_SeqAIJ_SeqAIJKokkos(A, MATSEQAIJKOKKOS, MAT_INPLACE_MATRIX, &A));
699   PetscFunctionReturn(PETSC_SUCCESS);
700 }
701 
702 /* Merge A, B into a matrix C. A is put before B. C's size would be A->rmap->n by (A->cmap->n + B->cmap->n) */
703 PetscErrorCode MatSeqAIJKokkosMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C)
704 {
705   Mat_SeqAIJ         *a, *b;
706   Mat_SeqAIJKokkos   *akok, *bkok, *ckok;
707   MatScalarKokkosView aa, ba, ca;
708   MatRowMapKokkosView ai, bi, ci;
709   MatColIdxKokkosView aj, bj, cj;
710   PetscInt            m, n, nnz, aN;
711 
712   PetscFunctionBegin;
713   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
714   PetscValidHeaderSpecific(B, MAT_CLASSID, 2);
715   PetscAssertPointer(C, 4);
716   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
717   PetscCheckTypeName(B, MATSEQAIJKOKKOS);
718   PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n);
719   PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported");
720 
721   PetscCall(MatSeqAIJKokkosSyncDevice(A));
722   PetscCall(MatSeqAIJKokkosSyncDevice(B));
723   a    = static_cast<Mat_SeqAIJ *>(A->data);
724   b    = static_cast<Mat_SeqAIJ *>(B->data);
725   akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
726   bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
727   aa   = akok->a_dual.view_device();
728   ai   = akok->i_dual.view_device();
729   ba   = bkok->a_dual.view_device();
730   bi   = bkok->i_dual.view_device();
731   m    = A->rmap->n; /* M, N and nnz of C */
732   n    = A->cmap->n + B->cmap->n;
733   nnz  = a->nz + b->nz;
734   aN   = A->cmap->n; /* N of A */
735   if (reuse == MAT_INITIAL_MATRIX) {
736     aj      = akok->j_dual.view_device();
737     bj      = bkok->j_dual.view_device();
738     auto ca = MatScalarKokkosView("a", aa.extent(0) + ba.extent(0));
739     auto ci = MatRowMapKokkosView("i", ai.extent(0));
740     auto cj = MatColIdxKokkosView("j", aj.extent(0) + bj.extent(0));
741 
742     /* Concatenate A and B in parallel using Kokkos hierarchical parallelism */
743     Kokkos::parallel_for(
744       Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
745         PetscInt i       = t.league_rank(); /* row i */
746         PetscInt coffset = ai(i) + bi(i), alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i);
747 
748         Kokkos::single(Kokkos::PerTeam(t), [=]() { /* this side effect only happens once per whole team */
749                                                    ci(i) = coffset;
750                                                    if (i == m - 1) ci(m) = ai(m) + bi(m);
751         });
752 
753         Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) {
754           if (k < alen) {
755             ca(coffset + k) = aa(ai(i) + k);
756             cj(coffset + k) = aj(ai(i) + k);
757           } else {
758             ca(coffset + k) = ba(bi(i) + k - alen);
759             cj(coffset + k) = bj(bi(i) + k - alen) + aN; /* Entries in B get new column indices in C */
760           }
761         });
762       });
763     PetscCallCXX(ckok = new Mat_SeqAIJKokkos(m, n, nnz, ci, cj, ca));
764     PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, ckok, C));
765   } else if (reuse == MAT_REUSE_MATRIX) {
766     PetscValidHeaderSpecific(*C, MAT_CLASSID, 4);
767     PetscCheckTypeName(*C, MATSEQAIJKOKKOS);
768     ckok = static_cast<Mat_SeqAIJKokkos *>((*C)->spptr);
769     ca   = ckok->a_dual.view_device();
770     ci   = ckok->i_dual.view_device();
771 
772     Kokkos::parallel_for(
773       Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
774         PetscInt i    = t.league_rank(); /* row i */
775         PetscInt alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i);
776         Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) {
777           if (k < alen) ca(ci(i) + k) = aa(ai(i) + k);
778           else ca(ci(i) + k) = ba(bi(i) + k - alen);
779         });
780       });
781     PetscCall(MatSeqAIJKokkosModifyDevice(*C));
782   }
783   PetscFunctionReturn(PETSC_SUCCESS);
784 }
785 
786 static PetscErrorCode MatProductDataDestroy_SeqAIJKokkos(void *pdata)
787 {
788   PetscFunctionBegin;
789   delete static_cast<MatProductData_SeqAIJKokkos *>(pdata);
790   PetscFunctionReturn(PETSC_SUCCESS);
791 }
792 
793 static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C)
794 {
795   Mat_Product                 *product = C->product;
796   Mat                          A, B;
797   bool                         transA, transB; /* use bool, since KK needs this type */
798   Mat_SeqAIJKokkos            *akok, *bkok, *ckok;
799   Mat_SeqAIJ                  *c;
800   MatProductData_SeqAIJKokkos *pdata;
801   KokkosCsrMatrix              csrmatA, csrmatB;
802 
803   PetscFunctionBegin;
804   MatCheckProduct(C, 1);
805   PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Product data empty");
806   pdata = static_cast<MatProductData_SeqAIJKokkos *>(C->product->data);
807 
808   // See if numeric has already been done in symbolic (e.g., user calls MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C)).
809   // If yes, skip the numeric, but reset the flag so that next time when user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C),
810   // we still do numeric.
811   if (pdata->reusesym) { // numeric reuses results from symbolic
812     pdata->reusesym = PETSC_FALSE;
813     PetscFunctionReturn(PETSC_SUCCESS);
814   }
815 
816   switch (product->type) {
817   case MATPRODUCT_AB:
818     transA = false;
819     transB = false;
820     break;
821   case MATPRODUCT_AtB:
822     transA = true;
823     transB = false;
824     break;
825   case MATPRODUCT_ABt:
826     transA = false;
827     transB = true;
828     break;
829   default:
830     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]);
831   }
832 
833   A = product->A;
834   B = product->B;
835   PetscCall(MatSeqAIJKokkosSyncDevice(A));
836   PetscCall(MatSeqAIJKokkosSyncDevice(B));
837   akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
838   bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
839   ckok = static_cast<Mat_SeqAIJKokkos *>(C->spptr);
840 
841   PetscCheck(ckok, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Device data structure spptr is empty");
842 
843   csrmatA = akok->csrmat;
844   csrmatB = bkok->csrmat;
845 
846   /* TODO: Once KK spgemm implements transpose, we can get rid of the explicit transpose here */
847   if (transA) {
848     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA));
849     transA = false;
850   }
851 
852   if (transB) {
853     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB));
854     transB = false;
855   }
856   PetscCall(PetscLogGpuTimeBegin());
857   PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, csrmatA, transA, csrmatB, transB, ckok->csrmat));
858 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(4, 0, 0)
859   auto spgemmHandle = pdata->kh.get_spgemm_handle();
860   if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(ckok->csrmat)); /* without sort, mat_tests-ex62_14_seqaijkokkos fails */
861 #endif
862 
863   PetscCall(PetscLogGpuTimeEnd());
864   PetscCall(MatSeqAIJKokkosModifyDevice(C));
865   /* shorter version of MatAssemblyEnd_SeqAIJ */
866   c = (Mat_SeqAIJ *)C->data;
867   PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz));
868   PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n"));
869   PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax));
870   c->reallocs         = 0;
871   C->info.mallocs     = 0;
872   C->info.nz_unneeded = 0;
873   C->assembled = C->was_assembled = PETSC_TRUE;
874   C->num_ass++;
875   PetscFunctionReturn(PETSC_SUCCESS);
876 }
877 
878 static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C)
879 {
880   Mat_Product                 *product = C->product;
881   MatProductType               ptype;
882   Mat                          A, B;
883   bool                         transA, transB;
884   Mat_SeqAIJKokkos            *akok, *bkok, *ckok;
885   MatProductData_SeqAIJKokkos *pdata;
886   MPI_Comm                     comm;
887   KokkosCsrMatrix              csrmatA, csrmatB, csrmatC;
888 
889   PetscFunctionBegin;
890   MatCheckProduct(C, 1);
891   PetscCall(PetscObjectGetComm((PetscObject)C, &comm));
892   PetscCheck(!product->data, comm, PETSC_ERR_PLIB, "Product data not empty");
893   A = product->A;
894   B = product->B;
895   PetscCall(MatSeqAIJKokkosSyncDevice(A));
896   PetscCall(MatSeqAIJKokkosSyncDevice(B));
897   akok    = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
898   bkok    = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
899   csrmatA = akok->csrmat;
900   csrmatB = bkok->csrmat;
901 
902   ptype = product->type;
903   // Take advantage of the symmetry if true
904   if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
905     ptype                                          = MATPRODUCT_AB;
906     product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE;
907   }
908   if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
909     ptype                                          = MATPRODUCT_AB;
910     product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE;
911   }
912 
913   switch (ptype) {
914   case MATPRODUCT_AB:
915     transA = false;
916     transB = false;
917     PetscCall(MatSetBlockSizesFromMats(C, A, B));
918     break;
919   case MATPRODUCT_AtB:
920     transA = true;
921     transB = false;
922     if (A->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, A->cmap->bs));
923     if (B->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->cmap->bs));
924     break;
925   case MATPRODUCT_ABt:
926     transA = false;
927     transB = true;
928     if (A->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, A->rmap->bs));
929     if (B->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->rmap->bs));
930     break;
931   default:
932     SETERRQ(comm, PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]);
933   }
934   PetscCallCXX(product->data = pdata = new MatProductData_SeqAIJKokkos());
935   pdata->reusesym = product->api_user;
936 
937   /* TODO: add command line options to select spgemm algorithms */
938   auto spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_DEFAULT; /* default alg is TPL if enabled, otherwise KK */
939 
940   /* CUDA-10.2's spgemm has bugs. We prefer the SpGEMMreuse APIs introduced in cuda-11.4 */
941 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
942   #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
943   spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_KK;
944   #endif
945 #endif
946   PetscCallCXX(pdata->kh.create_spgemm_handle(spgemm_alg));
947 
948   PetscCall(PetscLogGpuTimeBegin());
949   /* TODO: Get rid of the explicit transpose once KK-spgemm implements the transpose option */
950   if (transA) {
951     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA));
952     transA = false;
953   }
954 
955   if (transB) {
956     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB));
957     transB = false;
958   }
959 
960   PetscCallCXX(KokkosSparse::spgemm_symbolic(pdata->kh, csrmatA, transA, csrmatB, transB, csrmatC));
961   /* spgemm_symbolic() only populates C's rowmap, but not C's column indices.
962     So we have to do a fake spgemm_numeric() here to get csrmatC.j_d setup, before
963     calling new Mat_SeqAIJKokkos().
964     TODO: Remove the fake spgemm_numeric() after KK fixed this problem.
965   */
966   PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, csrmatA, transA, csrmatB, transB, csrmatC));
967 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(4, 0, 0)
968   /* Query if KK outputs a sorted matrix. If not, we need to sort it */
969   auto spgemmHandle = pdata->kh.get_spgemm_handle();
970   if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(csrmatC)); /* sort_option defaults to -1 in KK!*/
971 #endif
972   PetscCall(PetscLogGpuTimeEnd());
973 
974   PetscCallCXX(ckok = new Mat_SeqAIJKokkos(csrmatC));
975   PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(C, ckok));
976   C->product->destroy = MatProductDataDestroy_SeqAIJKokkos;
977   PetscFunctionReturn(PETSC_SUCCESS);
978 }
979 
980 /* handles sparse matrix matrix ops */
981 static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat)
982 {
983   Mat_Product *product = mat->product;
984   PetscBool    Biskok = PETSC_FALSE, Ciskok = PETSC_TRUE;
985 
986   PetscFunctionBegin;
987   MatCheckProduct(mat, 1);
988   PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJKOKKOS, &Biskok));
989   if (product->type == MATPRODUCT_ABC) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJKOKKOS, &Ciskok));
990   if (Biskok && Ciskok) {
991     switch (product->type) {
992     case MATPRODUCT_AB:
993     case MATPRODUCT_AtB:
994     case MATPRODUCT_ABt:
995       mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos;
996       break;
997     case MATPRODUCT_PtAP:
998     case MATPRODUCT_RARt:
999     case MATPRODUCT_ABC:
1000       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
1001       break;
1002     default:
1003       break;
1004     }
1005   } else { /* fallback for AIJ */
1006     PetscCall(MatProductSetFromOptions_SeqAIJ(mat));
1007   }
1008   PetscFunctionReturn(PETSC_SUCCESS);
1009 }
1010 
1011 static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a)
1012 {
1013   Mat_SeqAIJKokkos *aijkok;
1014 
1015   PetscFunctionBegin;
1016   PetscCall(PetscLogGpuTimeBegin());
1017   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1018   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1019   KokkosBlas::scal(PetscGetKokkosExecutionSpace(), aijkok->a_dual.view_device(), a, aijkok->a_dual.view_device());
1020   PetscCall(MatSeqAIJKokkosModifyDevice(A));
1021   PetscCall(PetscLogGpuFlops(aijkok->a_dual.extent(0)));
1022   PetscCall(PetscLogGpuTimeEnd());
1023   PetscFunctionReturn(PETSC_SUCCESS);
1024 }
1025 
1026 // add a to A's diagonal (if A is square) or main diagonal (if A is rectangular)
1027 static PetscErrorCode MatShift_SeqAIJKokkos(Mat A, PetscScalar a)
1028 {
1029   Mat_SeqAIJ *aijseq = static_cast<Mat_SeqAIJ *>(A->data);
1030 
1031   PetscFunctionBegin;
1032   if (A->assembled && aijseq->diagonaldense) { // no missing diagonals
1033     PetscInt n = PetscMin(A->rmap->n, A->cmap->n);
1034 
1035     PetscCall(PetscLogGpuTimeBegin());
1036     PetscCall(MatSeqAIJKokkosSyncDevice(A));
1037     const auto  aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1038     const auto &Aa     = aijkok->a_dual.view_device();
1039     const auto &Adiag  = aijkok->diag_dual.view_device();
1040     PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) { Aa(Adiag(i)) += a; }));
1041     PetscCall(MatSeqAIJKokkosModifyDevice(A));
1042     PetscCall(PetscLogGpuFlops(n));
1043     PetscCall(PetscLogGpuTimeEnd());
1044   } else { // need reassembly, very slow!
1045     PetscCall(MatShift_Basic(A, a));
1046   }
1047   PetscFunctionReturn(PETSC_SUCCESS);
1048 }
1049 
1050 static PetscErrorCode MatDiagonalSet_SeqAIJKokkos(Mat Y, Vec D, InsertMode is)
1051 {
1052   Mat_SeqAIJ *aijseq = static_cast<Mat_SeqAIJ *>(Y->data);
1053 
1054   PetscFunctionBegin;
1055   if (Y->assembled && aijseq->diagonaldense) { // no missing diagonals
1056     ConstPetscScalarKokkosView dv;
1057     PetscInt                   n, nv;
1058 
1059     PetscCall(PetscLogGpuTimeBegin());
1060     PetscCall(MatSeqAIJKokkosSyncDevice(Y));
1061     PetscCall(VecGetKokkosView(D, &dv));
1062     PetscCall(VecGetLocalSize(D, &nv));
1063     n = PetscMin(Y->rmap->n, Y->cmap->n);
1064     PetscCheck(n == nv, PetscObjectComm((PetscObject)Y), PETSC_ERR_ARG_SIZ, "Matrix size and vector size do not match");
1065 
1066     const auto  aijkok = static_cast<Mat_SeqAIJKokkos *>(Y->spptr);
1067     const auto &Aa     = aijkok->a_dual.view_device();
1068     const auto &Adiag  = aijkok->diag_dual.view_device();
1069     PetscCallCXX(Kokkos::parallel_for(
1070       Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) {
1071         if (is == INSERT_VALUES) Aa(Adiag(i)) = dv(i);
1072         else Aa(Adiag(i)) += dv(i);
1073       }));
1074     PetscCall(VecRestoreKokkosView(D, &dv));
1075     PetscCall(MatSeqAIJKokkosModifyDevice(Y));
1076     PetscCall(PetscLogGpuFlops(n));
1077     PetscCall(PetscLogGpuTimeEnd());
1078   } else { // need reassembly, very slow!
1079     PetscCall(MatDiagonalSet_Default(Y, D, is));
1080   }
1081   PetscFunctionReturn(PETSC_SUCCESS);
1082 }
1083 
1084 static PetscErrorCode MatDiagonalScale_SeqAIJKokkos(Mat A, Vec ll, Vec rr)
1085 {
1086   Mat_SeqAIJ                *aijseq = static_cast<Mat_SeqAIJ *>(A->data);
1087   PetscInt                   m = A->rmap->n, n = A->cmap->n, nz = aijseq->nz;
1088   ConstPetscScalarKokkosView lv, rv;
1089 
1090   PetscFunctionBegin;
1091   PetscCall(PetscLogGpuTimeBegin());
1092   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1093   const auto  aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1094   const auto &Aa     = aijkok->a_dual.view_device();
1095   const auto &Ai     = aijkok->i_dual.view_device();
1096   const auto &Aj     = aijkok->j_dual.view_device();
1097   if (ll) {
1098     PetscCall(VecGetLocalSize(ll, &m));
1099     PetscCheck(m == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling vector wrong length");
1100     PetscCall(VecGetKokkosView(ll, &lv));
1101     PetscCallCXX(Kokkos::parallel_for( // for each row
1102       Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
1103         PetscInt i   = t.league_rank(); // row i
1104         PetscInt len = Ai(i + 1) - Ai(i);
1105         // scale entries on the row
1106         Kokkos::parallel_for(Kokkos::TeamThreadRange(t, len), [&](PetscInt j) { Aa(Ai(i) + j) *= lv(i); });
1107       }));
1108     PetscCall(VecRestoreKokkosView(ll, &lv));
1109     PetscCall(PetscLogGpuFlops(nz));
1110   }
1111   if (rr) {
1112     PetscCall(VecGetLocalSize(rr, &n));
1113     PetscCheck(n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Right scaling vector wrong length");
1114     PetscCall(VecGetKokkosView(rr, &rv));
1115     PetscCallCXX(Kokkos::parallel_for( // for each nonzero
1116       Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt k) { Aa(k) *= rv(Aj(k)); }));
1117     PetscCall(VecRestoreKokkosView(rr, &lv));
1118     PetscCall(PetscLogGpuFlops(nz));
1119   }
1120   PetscCall(MatSeqAIJKokkosModifyDevice(A));
1121   PetscCall(PetscLogGpuTimeEnd());
1122   PetscFunctionReturn(PETSC_SUCCESS);
1123 }
1124 
1125 static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A)
1126 {
1127   Mat_SeqAIJKokkos *aijkok;
1128 
1129   PetscFunctionBegin;
1130   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1131   if (aijkok) { /* Only zero the device if data is already there */
1132     KokkosBlas::fill(PetscGetKokkosExecutionSpace(), aijkok->a_dual.view_device(), 0.0);
1133     PetscCall(MatSeqAIJKokkosModifyDevice(A));
1134   } else { /* Might be preallocated but not assembled */
1135     PetscCall(MatZeroEntries_SeqAIJ(A));
1136   }
1137   PetscFunctionReturn(PETSC_SUCCESS);
1138 }
1139 
1140 static PetscErrorCode MatGetDiagonal_SeqAIJKokkos(Mat A, Vec x)
1141 {
1142   Mat_SeqAIJKokkos     *aijkok;
1143   PetscInt              n;
1144   PetscScalarKokkosView xv;
1145 
1146   PetscFunctionBegin;
1147   PetscCall(VecGetLocalSize(x, &n));
1148   PetscCheck(n == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector");
1149   PetscCheck(A->factortype == MAT_FACTOR_NONE, PETSC_COMM_SELF, PETSC_ERR_SUP, "MatGetDiagonal_SeqAIJKokkos not supported on factored matrices");
1150 
1151   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1152   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1153 
1154   const auto &Aa    = aijkok->a_dual.view_device();
1155   const auto &Ai    = aijkok->i_dual.view_device();
1156   const auto &Adiag = aijkok->diag_dual.view_device();
1157 
1158   PetscCall(VecGetKokkosViewWrite(x, &xv));
1159   Kokkos::parallel_for(
1160     Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) {
1161       if (Adiag(i) < Ai(i + 1)) xv(i) = Aa(Adiag(i));
1162       else xv(i) = 0;
1163     });
1164   PetscCall(VecRestoreKokkosViewWrite(x, &xv));
1165   PetscFunctionReturn(PETSC_SUCCESS);
1166 }
1167 
1168 /* Get a Kokkos View from a mat of type MatSeqAIJKokkos */
1169 PetscErrorCode MatSeqAIJGetKokkosView(Mat A, ConstMatScalarKokkosView *kv)
1170 {
1171   Mat_SeqAIJKokkos *aijkok;
1172 
1173   PetscFunctionBegin;
1174   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1175   PetscAssertPointer(kv, 2);
1176   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1177   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1178   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1179   *kv    = aijkok->a_dual.view_device();
1180   PetscFunctionReturn(PETSC_SUCCESS);
1181 }
1182 
1183 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, ConstMatScalarKokkosView *kv)
1184 {
1185   PetscFunctionBegin;
1186   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1187   PetscAssertPointer(kv, 2);
1188   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1189   PetscFunctionReturn(PETSC_SUCCESS);
1190 }
1191 
1192 PetscErrorCode MatSeqAIJGetKokkosView(Mat A, MatScalarKokkosView *kv)
1193 {
1194   Mat_SeqAIJKokkos *aijkok;
1195 
1196   PetscFunctionBegin;
1197   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1198   PetscAssertPointer(kv, 2);
1199   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1200   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1201   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1202   *kv    = aijkok->a_dual.view_device();
1203   PetscFunctionReturn(PETSC_SUCCESS);
1204 }
1205 
1206 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, MatScalarKokkosView *kv)
1207 {
1208   PetscFunctionBegin;
1209   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1210   PetscAssertPointer(kv, 2);
1211   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1212   PetscCall(MatSeqAIJKokkosModifyDevice(A));
1213   PetscFunctionReturn(PETSC_SUCCESS);
1214 }
1215 
1216 PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A, MatScalarKokkosView *kv)
1217 {
1218   Mat_SeqAIJKokkos *aijkok;
1219 
1220   PetscFunctionBegin;
1221   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1222   PetscAssertPointer(kv, 2);
1223   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1224   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1225   *kv    = aijkok->a_dual.view_device();
1226   PetscFunctionReturn(PETSC_SUCCESS);
1227 }
1228 
1229 PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A, MatScalarKokkosView *kv)
1230 {
1231   PetscFunctionBegin;
1232   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1233   PetscAssertPointer(kv, 2);
1234   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1235   PetscCall(MatSeqAIJKokkosModifyDevice(A));
1236   PetscFunctionReturn(PETSC_SUCCESS);
1237 }
1238 
1239 PetscErrorCode MatCreateSeqAIJKokkosWithKokkosViews(MPI_Comm comm, PetscInt m, PetscInt n, Kokkos::View<PetscInt *> &i_d, Kokkos::View<PetscInt *> &j_d, Kokkos::View<PetscScalar *> &a_d, Mat *A)
1240 {
1241   Mat_SeqAIJKokkos *akok;
1242 
1243   PetscFunctionBegin;
1244   PetscCallCXX(akok = new Mat_SeqAIJKokkos(m, n, j_d.extent(0), i_d, j_d, a_d));
1245   PetscCall(MatCreate(comm, A));
1246   PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok));
1247   PetscFunctionReturn(PETSC_SUCCESS);
1248 }
1249 
1250 /* Computes Y += alpha X */
1251 static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y, PetscScalar alpha, Mat X, MatStructure pattern)
1252 {
1253   Mat_SeqAIJ              *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data;
1254   Mat_SeqAIJKokkos        *xkok, *ykok, *zkok;
1255   ConstMatScalarKokkosView Xa;
1256   MatScalarKokkosView      Ya;
1257   auto                     exec = PetscGetKokkosExecutionSpace();
1258 
1259   PetscFunctionBegin;
1260   PetscCheckTypeName(Y, MATSEQAIJKOKKOS);
1261   PetscCheckTypeName(X, MATSEQAIJKOKKOS);
1262   PetscCall(MatSeqAIJKokkosSyncDevice(Y));
1263   PetscCall(MatSeqAIJKokkosSyncDevice(X));
1264   PetscCall(PetscLogGpuTimeBegin());
1265 
1266   if (pattern != SAME_NONZERO_PATTERN && x->nz == y->nz) {
1267     PetscBool e;
1268     PetscCall(PetscArraycmp(x->i, y->i, Y->rmap->n + 1, &e));
1269     if (e) {
1270       PetscCall(PetscArraycmp(x->j, y->j, y->nz, &e));
1271       if (e) pattern = SAME_NONZERO_PATTERN;
1272     }
1273   }
1274 
1275   /* cusparseDcsrgeam2() computes C = alpha A + beta B. If one knew sparsity pattern of C, one can skip
1276     cusparseScsrgeam2_bufferSizeExt() / cusparseXcsrgeam2Nnz(), and directly call cusparseScsrgeam2().
1277     If X is SUBSET_NONZERO_PATTERN of Y, we could take advantage of this cusparse feature. However,
1278     KokkosSparse::spadd(alpha,A,beta,B,C) has symbolic and numeric phases, MatAXPY does not.
1279   */
1280   ykok = static_cast<Mat_SeqAIJKokkos *>(Y->spptr);
1281   xkok = static_cast<Mat_SeqAIJKokkos *>(X->spptr);
1282   Xa   = xkok->a_dual.view_device();
1283   Ya   = ykok->a_dual.view_device();
1284 
1285   if (pattern == SAME_NONZERO_PATTERN) {
1286     KokkosBlas::axpy(exec, alpha, Xa, Ya);
1287     PetscCall(MatSeqAIJKokkosModifyDevice(Y));
1288   } else if (pattern == SUBSET_NONZERO_PATTERN) {
1289     MatRowMapKokkosView Xi = xkok->i_dual.view_device(), Yi = ykok->i_dual.view_device();
1290     MatColIdxKokkosView Xj = xkok->j_dual.view_device(), Yj = ykok->j_dual.view_device();
1291 
1292     Kokkos::parallel_for(
1293       Kokkos::TeamPolicy<>(exec, Y->rmap->n, 1), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
1294         PetscInt i = t.league_rank(); // row i
1295         Kokkos::single(Kokkos::PerTeam(t), [=]() {
1296           // Only one thread works in a team
1297           PetscInt p, q = Yi(i);
1298           for (p = Xi(i); p < Xi(i + 1); p++) {          // For each nonzero on row i of X,
1299             while (Xj(p) != Yj(q) && q < Yi(i + 1)) q++; // find the matching nonzero on row i of Y.
1300             if (Xj(p) == Yj(q)) {                        // Found it
1301               Ya(q) += alpha * Xa(p);
1302               q++;
1303             } else {
1304             // If not found, it indicates the input is wrong (X is not a SUBSET_NONZERO_PATTERN of Y).
1305             // Just insert a NaN at the beginning of row i if it is not empty, to make the result wrong.
1306 #if PETSC_PKG_KOKKOS_VERSION_GE(3, 7, 0)
1307               if (Yi(i) != Yi(i + 1)) Ya(Yi(i)) = Kokkos::ArithTraits<PetscScalar>::nan();
1308 #else
1309               if (Yi(i) != Yi(i + 1)) Ya(Yi(i)) = Kokkos::Experimental::nan("1");
1310 #endif
1311             }
1312           }
1313         });
1314       });
1315     PetscCall(MatSeqAIJKokkosModifyDevice(Y));
1316   } else { // different nonzero patterns
1317     Mat             Z;
1318     KokkosCsrMatrix zcsr;
1319     KernelHandle    kh;
1320     kh.create_spadd_handle(true); // X, Y are sorted
1321     KokkosSparse::spadd_symbolic(&kh, xkok->csrmat, ykok->csrmat, zcsr);
1322     KokkosSparse::spadd_numeric(&kh, alpha, xkok->csrmat, (PetscScalar)1.0, ykok->csrmat, zcsr);
1323     zkok = new Mat_SeqAIJKokkos(zcsr);
1324     PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, zkok, &Z));
1325     PetscCall(MatHeaderReplace(Y, &Z));
1326     kh.destroy_spadd_handle();
1327   }
1328   PetscCall(PetscLogGpuTimeEnd());
1329   PetscCall(PetscLogGpuFlops(xkok->a_dual.extent(0) * 2)); // Because we scaled X and then added it to Y
1330   PetscFunctionReturn(PETSC_SUCCESS);
1331 }
1332 
1333 struct MatCOOStruct_SeqAIJKokkos {
1334   PetscCount           n;
1335   PetscCount           Atot;
1336   PetscInt             nz;
1337   PetscCountKokkosView jmap;
1338   PetscCountKokkosView perm;
1339 
1340   MatCOOStruct_SeqAIJKokkos(const MatCOOStruct_SeqAIJ *coo_h)
1341   {
1342     nz   = coo_h->nz;
1343     n    = coo_h->n;
1344     Atot = coo_h->Atot;
1345     jmap = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), PetscCountKokkosViewHost(coo_h->jmap, nz + 1));
1346     perm = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), PetscCountKokkosViewHost(coo_h->perm, Atot));
1347   }
1348 };
1349 
1350 static PetscErrorCode MatCOOStructDestroy_SeqAIJKokkos(void **data)
1351 {
1352   PetscFunctionBegin;
1353   PetscCallCXX(delete static_cast<MatCOOStruct_SeqAIJKokkos *>(*data));
1354   PetscFunctionReturn(PETSC_SUCCESS);
1355 }
1356 
1357 static PetscErrorCode MatSetPreallocationCOO_SeqAIJKokkos(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
1358 {
1359   Mat_SeqAIJKokkos          *akok;
1360   Mat_SeqAIJ                *aseq;
1361   PetscContainer             container_h;
1362   MatCOOStruct_SeqAIJ       *coo_h;
1363   MatCOOStruct_SeqAIJKokkos *coo_d;
1364 
1365   PetscFunctionBegin;
1366   PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, coo_i, coo_j));
1367   aseq = static_cast<Mat_SeqAIJ *>(mat->data);
1368   akok = static_cast<Mat_SeqAIJKokkos *>(mat->spptr);
1369   delete akok;
1370   mat->spptr = akok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, aseq, mat->nonzerostate + 1, PETSC_FALSE);
1371   PetscCall(MatZeroEntries_SeqAIJKokkos(mat));
1372 
1373   // Copy the COO struct to device
1374   PetscCall(PetscObjectQuery((PetscObject)mat, "__PETSc_MatCOOStruct_Host", (PetscObject *)&container_h));
1375   PetscCall(PetscContainerGetPointer(container_h, (void **)&coo_h));
1376   PetscCallCXX(coo_d = new MatCOOStruct_SeqAIJKokkos(coo_h));
1377 
1378   // Put the COO struct in a container and then attach that to the matrix
1379   PetscCall(PetscObjectContainerCompose((PetscObject)mat, "__PETSc_MatCOOStruct_Device", coo_d, MatCOOStructDestroy_SeqAIJKokkos));
1380   PetscFunctionReturn(PETSC_SUCCESS);
1381 }
1382 
1383 static PetscErrorCode MatSetValuesCOO_SeqAIJKokkos(Mat A, const PetscScalar v[], InsertMode imode)
1384 {
1385   MatScalarKokkosView        Aa;
1386   ConstMatScalarKokkosView   kv;
1387   PetscMemType               memtype;
1388   PetscContainer             container;
1389   MatCOOStruct_SeqAIJKokkos *coo;
1390 
1391   PetscFunctionBegin;
1392   PetscCall(PetscObjectQuery((PetscObject)A, "__PETSc_MatCOOStruct_Device", (PetscObject *)&container));
1393   PetscCall(PetscContainerGetPointer(container, (void **)&coo));
1394 
1395   const auto &n    = coo->n;
1396   const auto &Annz = coo->nz;
1397   const auto &jmap = coo->jmap;
1398   const auto &perm = coo->perm;
1399 
1400   PetscCall(PetscGetMemType(v, &memtype));
1401   if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */
1402     kv = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), ConstMatScalarKokkosViewHost(v, n));
1403   } else {
1404     kv = ConstMatScalarKokkosView(v, n); /* Directly use v[]'s memory */
1405   }
1406 
1407   if (imode == INSERT_VALUES) PetscCall(MatSeqAIJGetKokkosViewWrite(A, &Aa)); /* write matrix values */
1408   else PetscCall(MatSeqAIJGetKokkosView(A, &Aa));                             /* read & write matrix values */
1409 
1410   PetscCall(PetscLogGpuTimeBegin());
1411   Kokkos::parallel_for(
1412     Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, Annz), KOKKOS_LAMBDA(const PetscCount i) {
1413       PetscScalar sum = 0.0;
1414       for (PetscCount k = jmap(i); k < jmap(i + 1); k++) sum += kv(perm(k));
1415       Aa(i) = (imode == INSERT_VALUES ? 0.0 : Aa(i)) + sum;
1416     });
1417   PetscCall(PetscLogGpuTimeEnd());
1418 
1419   if (imode == INSERT_VALUES) PetscCall(MatSeqAIJRestoreKokkosViewWrite(A, &Aa));
1420   else PetscCall(MatSeqAIJRestoreKokkosView(A, &Aa));
1421   PetscFunctionReturn(PETSC_SUCCESS);
1422 }
1423 
1424 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A)
1425 {
1426   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
1427 
1428   PetscFunctionBegin;
1429   A->offloadmask = PETSC_OFFLOAD_KOKKOS; /* We do not really use this flag */
1430   A->boundtocpu  = PETSC_FALSE;
1431 
1432   A->ops->assemblyend               = MatAssemblyEnd_SeqAIJKokkos;
1433   A->ops->destroy                   = MatDestroy_SeqAIJKokkos;
1434   A->ops->duplicate                 = MatDuplicate_SeqAIJKokkos;
1435   A->ops->axpy                      = MatAXPY_SeqAIJKokkos;
1436   A->ops->scale                     = MatScale_SeqAIJKokkos;
1437   A->ops->zeroentries               = MatZeroEntries_SeqAIJKokkos;
1438   A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJKokkos;
1439   A->ops->mult                      = MatMult_SeqAIJKokkos;
1440   A->ops->multadd                   = MatMultAdd_SeqAIJKokkos;
1441   A->ops->multtranspose             = MatMultTranspose_SeqAIJKokkos;
1442   A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJKokkos;
1443   A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJKokkos;
1444   A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos;
1445   A->ops->productnumeric            = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos;
1446   A->ops->transpose                 = MatTranspose_SeqAIJKokkos;
1447   A->ops->setoption                 = MatSetOption_SeqAIJKokkos;
1448   A->ops->getdiagonal               = MatGetDiagonal_SeqAIJKokkos;
1449   A->ops->shift                     = MatShift_SeqAIJKokkos;
1450   A->ops->diagonalset               = MatDiagonalSet_SeqAIJKokkos;
1451   A->ops->diagonalscale             = MatDiagonalScale_SeqAIJKokkos;
1452   a->ops->getarray                  = MatSeqAIJGetArray_SeqAIJKokkos;
1453   a->ops->restorearray              = MatSeqAIJRestoreArray_SeqAIJKokkos;
1454   a->ops->getarrayread              = MatSeqAIJGetArrayRead_SeqAIJKokkos;
1455   a->ops->restorearrayread          = MatSeqAIJRestoreArrayRead_SeqAIJKokkos;
1456   a->ops->getarraywrite             = MatSeqAIJGetArrayWrite_SeqAIJKokkos;
1457   a->ops->restorearraywrite         = MatSeqAIJRestoreArrayWrite_SeqAIJKokkos;
1458   a->ops->getcsrandmemtype          = MatSeqAIJGetCSRAndMemType_SeqAIJKokkos;
1459 
1460   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJKokkos));
1461   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJKokkos));
1462 #if defined(PETSC_HAVE_HYPRE)
1463   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijkokkos_hypre_C", MatConvert_AIJ_HYPRE));
1464 #endif
1465   PetscFunctionReturn(PETSC_SUCCESS);
1466 }
1467 
1468 /*
1469    Extract the (prescribled) diagonal blocks of the matrix and then invert them
1470 
1471   Input Parameters:
1472 +  A       - the MATSEQAIJKOKKOS matrix
1473 .  bs      - block sizes in 'csr' format, i.e., the i-th block has size bs(i+1) - bs(i)
1474 .  bs2     - square of block sizes in 'csr' format, i.e., the i-th block should be stored at offset bs2(i) in diagVal[]
1475 .  blkMap  - map row ids to block ids, i.e., row i belongs to the block blkMap(i)
1476 -  work    - a pre-allocated work buffer (as big as diagVal) for use by this routine
1477 
1478   Output Parameter:
1479 .  diagVal - the (pre-allocated) buffer to store the inverted blocks (each block is stored in column-major order)
1480 */
1481 PETSC_INTERN PetscErrorCode MatInvertVariableBlockDiagonal_SeqAIJKokkos(Mat A, const PetscIntKokkosView &bs, const PetscIntKokkosView &bs2, const PetscIntKokkosView &blkMap, PetscScalarKokkosView &work, PetscScalarKokkosView &diagVal)
1482 {
1483   Mat_SeqAIJKokkos *akok    = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1484   PetscInt          nblocks = bs.extent(0) - 1;
1485 
1486   PetscFunctionBegin;
1487   PetscCall(MatSeqAIJKokkosSyncDevice(A)); // Since we'll access A's value on device
1488 
1489   // Pull out the diagonal blocks of the matrix and then invert the blocks
1490   auto Aa    = akok->a_dual.view_device();
1491   auto Ai    = akok->i_dual.view_device();
1492   auto Aj    = akok->j_dual.view_device();
1493   auto Adiag = akok->diag_dual.view_device();
1494   // TODO: how to tune the team size?
1495 #if defined(KOKKOS_ENABLE_UNIFIED_MEMORY)
1496   auto ts = Kokkos::AUTO();
1497 #else
1498   auto ts = 16; // improved performance 30% over Kokkos::AUTO() with CUDA, but failed with "Kokkos::abort: Requested Team Size is too large!" on CPUs
1499 #endif
1500   PetscCallCXX(Kokkos::parallel_for(
1501     Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), nblocks, ts), KOKKOS_LAMBDA(const KokkosTeamMemberType &teamMember) {
1502       const PetscInt bid    = teamMember.league_rank();                                                   // block id
1503       const PetscInt rstart = bs(bid);                                                                    // this block starts from this row
1504       const PetscInt m      = bs(bid + 1) - bs(bid);                                                      // size of this block
1505       const auto    &B      = Kokkos::View<PetscScalar **, Kokkos::LayoutLeft>(&diagVal(bs2(bid)), m, m); // column-major order
1506       const auto    &W      = PetscScalarKokkosView(&work(bs2(bid)), m * m);
1507 
1508       Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, m), [=](const PetscInt &r) { // r-th row in B
1509         PetscInt i = rstart + r;                                                            // i-th row in A
1510 
1511         if (Ai(i) <= Adiag(i) && Adiag(i) < Ai(i + 1)) { // if the diagonal exists (common case)
1512           PetscInt first = Adiag(i) - r;                 // we start to check nonzeros from here along this row
1513 
1514           for (PetscInt c = 0; c < m; c++) {                   // walk n steps to see what column indices we will meet
1515             if (first + c < Ai(i) || first + c >= Ai(i + 1)) { // this entry (first+c) is out of range of this row, in other words, its value is zero
1516               B(r, c) = 0.0;
1517             } else if (Aj(first + c) == rstart + c) { // this entry is right on the (rstart+c) column
1518               B(r, c) = Aa(first + c);
1519             } else { // this entry does not show up in the CSR
1520               B(r, c) = 0.0;
1521             }
1522           }
1523         } else { // rare case that the diagonal does not exist
1524           const PetscInt begin = Ai(i);
1525           const PetscInt end   = Ai(i + 1);
1526           for (PetscInt c = 0; c < m; c++) B(r, c) = 0.0;
1527           for (PetscInt j = begin; j < end; j++) { // scan the whole row; could use binary search but this is a rare case so we did not.
1528             if (rstart <= Aj(j) && Aj(j) < rstart + m) B(r, Aj(j) - rstart) = Aa(j);
1529             else if (Aj(j) >= rstart + m) break;
1530           }
1531         }
1532       });
1533 
1534       // LU-decompose B (w/o pivoting) and then invert B
1535       KokkosBatched::TeamLU<KokkosTeamMemberType, KokkosBatched::Algo::LU::Unblocked>::invoke(teamMember, B, 0.0);
1536       KokkosBatched::TeamInverseLU<KokkosTeamMemberType, KokkosBatched::Algo::InverseLU::Unblocked>::invoke(teamMember, B, W);
1537     }));
1538   // PetscLogGpuFlops() is done in the caller PCSetUp_VPBJacobi_Kokkos as we don't want to compute the flops in kernels
1539   PetscFunctionReturn(PETSC_SUCCESS);
1540 }
1541 
1542 PETSC_INTERN PetscErrorCode MatSetSeqAIJKokkosWithCSRMatrix(Mat A, Mat_SeqAIJKokkos *akok)
1543 {
1544   Mat_SeqAIJ *aseq;
1545   PetscInt    i, m, n;
1546   auto        exec = PetscGetKokkosExecutionSpace();
1547 
1548   PetscFunctionBegin;
1549   PetscCheck(!A->spptr, PETSC_COMM_SELF, PETSC_ERR_PLIB, "A->spptr is supposed to be empty");
1550 
1551   m = akok->nrows();
1552   n = akok->ncols();
1553   PetscCall(MatSetSizes(A, m, n, m, n));
1554   PetscCall(MatSetType(A, MATSEQAIJKOKKOS));
1555 
1556   /* Set up data structures of A as a MATSEQAIJ */
1557   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(A, MAT_SKIP_ALLOCATION, NULL));
1558   aseq = (Mat_SeqAIJ *)A->data;
1559 
1560   PetscCall(KokkosDualViewSyncHost(akok->i_dual, exec)); /* We always need sync'ed i, j on host */
1561   PetscCall(KokkosDualViewSyncHost(akok->j_dual, exec));
1562 
1563   aseq->i       = akok->i_host_data();
1564   aseq->j       = akok->j_host_data();
1565   aseq->a       = akok->a_host_data();
1566   aseq->nonew   = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/
1567   aseq->free_a  = PETSC_FALSE;
1568   aseq->free_ij = PETSC_FALSE;
1569   aseq->nz      = akok->nnz();
1570   aseq->maxnz   = aseq->nz;
1571 
1572   PetscCall(PetscMalloc1(m, &aseq->imax));
1573   PetscCall(PetscMalloc1(m, &aseq->ilen));
1574   for (i = 0; i < m; i++) aseq->ilen[i] = aseq->imax[i] = aseq->i[i + 1] - aseq->i[i];
1575 
1576   /* It is critical to set the nonzerostate, as we use it to check if sparsity pattern (hence data) has changed on host in MatAssemblyEnd */
1577   akok->nonzerostate = A->nonzerostate;
1578   A->spptr           = akok; /* Set A->spptr before MatAssembly so that A->spptr won't be allocated again there */
1579   PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY));
1580   PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY));
1581   PetscFunctionReturn(PETSC_SUCCESS);
1582 }
1583 
1584 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGetKokkosCsrMatrix(Mat A, KokkosCsrMatrix *csr)
1585 {
1586   PetscFunctionBegin;
1587   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1588   *csr = static_cast<Mat_SeqAIJKokkos *>(A->spptr)->csrmat;
1589   PetscFunctionReturn(PETSC_SUCCESS);
1590 }
1591 
1592 PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithKokkosCsrMatrix(MPI_Comm comm, KokkosCsrMatrix csr, Mat *A)
1593 {
1594   Mat_SeqAIJKokkos *akok;
1595 
1596   PetscFunctionBegin;
1597   PetscCallCXX(akok = new Mat_SeqAIJKokkos(csr));
1598   PetscCall(MatCreate(comm, A));
1599   PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok));
1600   PetscFunctionReturn(PETSC_SUCCESS);
1601 }
1602 
1603 /* Crete a SEQAIJKOKKOS matrix with a Mat_SeqAIJKokkos data structure
1604 
1605    Note we have names like MatSeqAIJSetPreallocationCSR, so I use capitalized CSR
1606  */
1607 PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithCSRMatrix(MPI_Comm comm, Mat_SeqAIJKokkos *akok, Mat *A)
1608 {
1609   PetscFunctionBegin;
1610   PetscCall(MatCreate(comm, A));
1611   PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok));
1612   PetscFunctionReturn(PETSC_SUCCESS);
1613 }
1614 
1615 /*@C
1616   MatCreateSeqAIJKokkos - Creates a sparse matrix in `MATSEQAIJKOKKOS` (compressed row) format
1617   (the default parallel PETSc format). This matrix will ultimately be handled by
1618   Kokkos for calculations.
1619 
1620   Collective
1621 
1622   Input Parameters:
1623 + comm - MPI communicator, set to `PETSC_COMM_SELF`
1624 . m    - number of rows
1625 . n    - number of columns
1626 . nz   - number of nonzeros per row (same for all rows), ignored if `nnz` is provided
1627 - nnz  - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL`
1628 
1629   Output Parameter:
1630 . A - the matrix
1631 
1632   Level: intermediate
1633 
1634   Notes:
1635   It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
1636   MatXXXXSetPreallocation() paradgm instead of this routine directly.
1637   [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]
1638 
1639   The AIJ format, also called
1640   compressed row storage, is fully compatible with standard Fortran
1641   storage.  That is, the stored row and column indices can begin at
1642   either one (as in Fortran) or zero.
1643 
1644   Specify the preallocated storage with either `nz` or `nnz` (not both).
1645   Set `nz` = `PETSC_DEFAULT` and `nnz` = `NULL` for PETSc to control dynamic memory
1646   allocation.
1647 
1648 .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`
1649 @*/
1650 PetscErrorCode MatCreateSeqAIJKokkos(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A)
1651 {
1652   PetscFunctionBegin;
1653   PetscCall(PetscKokkosInitializeCheck());
1654   PetscCall(MatCreate(comm, A));
1655   PetscCall(MatSetSizes(*A, m, n, m, n));
1656   PetscCall(MatSetType(*A, MATSEQAIJKOKKOS));
1657   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz));
1658   PetscFunctionReturn(PETSC_SUCCESS);
1659 }
1660 
1661 // After matrix numeric factorization, there are still steps to do before triangular solve can be called.
1662 // For example, for transpose solve, we might need to compute the transpose matrices if the solver does not support it (such as KK, while cusparse does).
1663 // In cusparse, one has to call cusparseSpSV_analysis() with updated triangular matrix values before calling cusparseSpSV_solve().
1664 // Simiarily, in KK sptrsv_symbolic() has to be called before sptrsv_solve(). We put these steps in MatSeqAIJKokkos{Transpose}SolveCheck.
1665 static PetscErrorCode MatSeqAIJKokkosSolveCheck(Mat A)
1666 {
1667   Mat_SeqAIJKokkosTriFactors *factors   = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1668   const PetscBool             has_lower = factors->iL_d.extent(0) ? PETSC_TRUE : PETSC_FALSE; // false with Choleksy
1669   const PetscBool             has_upper = factors->iU_d.extent(0) ? PETSC_TRUE : PETSC_FALSE; // true with LU and Choleksy
1670 
1671   PetscFunctionBegin;
1672   if (!factors->sptrsv_symbolic_completed) { // If sptrsv_symbolic was not called yet
1673     if (has_upper) PetscCallCXX(sptrsv_symbolic(&factors->khU, factors->iU_d, factors->jU_d, factors->aU_d));
1674     if (has_lower) PetscCallCXX(sptrsv_symbolic(&factors->khL, factors->iL_d, factors->jL_d, factors->aL_d));
1675     factors->sptrsv_symbolic_completed = PETSC_TRUE;
1676   }
1677   PetscFunctionReturn(PETSC_SUCCESS);
1678 }
1679 
1680 static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A)
1681 {
1682   const PetscInt              n         = A->rmap->n;
1683   Mat_SeqAIJKokkosTriFactors *factors   = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1684   const PetscBool             has_lower = factors->iL_d.extent(0) ? PETSC_TRUE : PETSC_FALSE; // false with Choleksy
1685   const PetscBool             has_upper = factors->iU_d.extent(0) ? PETSC_TRUE : PETSC_FALSE; // true with LU or Choleksy
1686 
1687   PetscFunctionBegin;
1688   if (!factors->transpose_updated) {
1689     if (has_upper) {
1690       if (!factors->iUt_d.extent(0)) {                                 // Allocate Ut on device if not yet
1691         factors->iUt_d = MatRowMapKokkosView("factors->iUt_d", n + 1); // KK requires this view to be initialized to 0 to call transpose_matrix
1692         factors->jUt_d = MatColIdxKokkosView(NoInit("factors->jUt_d"), factors->jU_d.extent(0));
1693         factors->aUt_d = MatScalarKokkosView(NoInit("factors->aUt_d"), factors->aU_d.extent(0));
1694       }
1695 
1696       if (factors->iU_h.extent(0)) { // If U is on host (factorization was done on host), we also compute the transpose on host
1697         if (!factors->U) {
1698           Mat_SeqAIJ *seq;
1699 
1700           PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF, n, n, factors->iU_h.data(), factors->jU_h.data(), factors->aU_h.data(), &factors->U));
1701           PetscCall(MatTranspose(factors->U, MAT_INITIAL_MATRIX, &factors->Ut));
1702 
1703           seq            = static_cast<Mat_SeqAIJ *>(factors->Ut->data);
1704           factors->iUt_h = MatRowMapKokkosViewHost(seq->i, n + 1);
1705           factors->jUt_h = MatColIdxKokkosViewHost(seq->j, seq->nz);
1706           factors->aUt_h = MatScalarKokkosViewHost(seq->a, seq->nz);
1707         } else {
1708           PetscCall(MatTranspose(factors->U, MAT_REUSE_MATRIX, &factors->Ut)); // Matrix Ut' data is aliased with {i, j, a}Ut_h
1709         }
1710         // Copy Ut from host to device
1711         PetscCallCXX(Kokkos::deep_copy(factors->iUt_d, factors->iUt_h));
1712         PetscCallCXX(Kokkos::deep_copy(factors->jUt_d, factors->jUt_h));
1713         PetscCallCXX(Kokkos::deep_copy(factors->aUt_d, factors->aUt_h));
1714       } else { // If U was computed on device, we also compute the transpose there
1715         // TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices. We have to sort the indices, until KK provides finer control options.
1716         PetscCallCXX(transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iU_d,
1717                                                                                                                                                                                                                                factors->jU_d, factors->aU_d,
1718                                                                                                                                                                                                                                factors->iUt_d, factors->jUt_d,
1719                                                                                                                                                                                                                                factors->aUt_d));
1720         PetscCallCXX(sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iUt_d, factors->jUt_d, factors->aUt_d));
1721       }
1722       PetscCallCXX(sptrsv_symbolic(&factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d));
1723     }
1724 
1725     // do the same for L with LU
1726     if (has_lower) {
1727       if (!factors->iLt_d.extent(0)) {                                 // Allocate Lt on device if not yet
1728         factors->iLt_d = MatRowMapKokkosView("factors->iLt_d", n + 1); // KK requires this view to be initialized to 0 to call transpose_matrix
1729         factors->jLt_d = MatColIdxKokkosView(NoInit("factors->jLt_d"), factors->jL_d.extent(0));
1730         factors->aLt_d = MatScalarKokkosView(NoInit("factors->aLt_d"), factors->aL_d.extent(0));
1731       }
1732 
1733       if (factors->iL_h.extent(0)) { // If L is on host, we also compute the transpose on host
1734         if (!factors->L) {
1735           Mat_SeqAIJ *seq;
1736 
1737           PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF, n, n, factors->iL_h.data(), factors->jL_h.data(), factors->aL_h.data(), &factors->L));
1738           PetscCall(MatTranspose(factors->L, MAT_INITIAL_MATRIX, &factors->Lt));
1739 
1740           seq            = static_cast<Mat_SeqAIJ *>(factors->Lt->data);
1741           factors->iLt_h = MatRowMapKokkosViewHost(seq->i, n + 1);
1742           factors->jLt_h = MatColIdxKokkosViewHost(seq->j, seq->nz);
1743           factors->aLt_h = MatScalarKokkosViewHost(seq->a, seq->nz);
1744         } else {
1745           PetscCall(MatTranspose(factors->L, MAT_REUSE_MATRIX, &factors->Lt)); // Matrix Lt' data is aliased with {i, j, a}Lt_h
1746         }
1747         // Copy Lt from host to device
1748         PetscCallCXX(Kokkos::deep_copy(factors->iLt_d, factors->iLt_h));
1749         PetscCallCXX(Kokkos::deep_copy(factors->jLt_d, factors->jLt_h));
1750         PetscCallCXX(Kokkos::deep_copy(factors->aLt_d, factors->aLt_h));
1751       } else { // If L was computed on device, we also compute the transpose there
1752         // TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices. We have to sort the indices, until KK provides finer control options.
1753         PetscCallCXX(transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iL_d,
1754                                                                                                                                                                                                                                factors->jL_d, factors->aL_d,
1755                                                                                                                                                                                                                                factors->iLt_d, factors->jLt_d,
1756                                                                                                                                                                                                                                factors->aLt_d));
1757         PetscCallCXX(sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iLt_d, factors->jLt_d, factors->aLt_d));
1758       }
1759       PetscCallCXX(sptrsv_symbolic(&factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d));
1760     }
1761 
1762     factors->transpose_updated = PETSC_TRUE;
1763   }
1764   PetscFunctionReturn(PETSC_SUCCESS);
1765 }
1766 
1767 // Solve Ax = b, with RAR = U^T D U, where R is the row (and col) permutation matrix on A.
1768 // R is represented by rowperm in factors. If R is identity (i.e, no reordering), then rowperm is empty.
1769 static PetscErrorCode MatSolve_SeqAIJKokkos_Cholesky(Mat A, Vec bb, Vec xx)
1770 {
1771   auto                        exec    = PetscGetKokkosExecutionSpace();
1772   Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1773   PetscInt                    m       = A->rmap->n;
1774   PetscScalarKokkosView       D       = factors->D_d;
1775   PetscScalarKokkosView       X, Y, B; // alias
1776   ConstPetscScalarKokkosView  b;
1777   PetscScalarKokkosView       x;
1778   PetscIntKokkosView         &rowperm  = factors->rowperm;
1779   PetscBool                   identity = rowperm.extent(0) ? PETSC_FALSE : PETSC_TRUE;
1780 
1781   PetscFunctionBegin;
1782   PetscCall(PetscLogGpuTimeBegin());
1783   PetscCall(MatSeqAIJKokkosSolveCheck(A));          // for UX = T
1784   PetscCall(MatSeqAIJKokkosTransposeSolveCheck(A)); // for U^T Y = B
1785   PetscCall(VecGetKokkosView(bb, &b));
1786   PetscCall(VecGetKokkosViewWrite(xx, &x));
1787 
1788   // Solve U^T Y = B
1789   if (identity) { // Reorder b with the row permutation
1790     B = PetscScalarKokkosView(const_cast<PetscScalar *>(b.data()), b.extent(0));
1791     Y = factors->workVector;
1792   } else {
1793     B = factors->workVector;
1794     PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { B(i) = b(rowperm(i)); }));
1795     Y = x;
1796   }
1797   PetscCallCXX(sptrsv_solve(exec, &factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d, B, Y));
1798 
1799   // Solve diag(D) Y' = Y.
1800   // Actually just do Y' = Y*D since D is already inverted in MatCholeskyFactorNumeric_SeqAIJ(). It is basically a vector element-wise multiplication.
1801   PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { Y(i) = Y(i) * D(i); }));
1802 
1803   // Solve UX = Y
1804   if (identity) {
1805     X = x;
1806   } else {
1807     X = factors->workVector; // B is not needed anymore
1808   }
1809   PetscCallCXX(sptrsv_solve(exec, &factors->khU, factors->iU_d, factors->jU_d, factors->aU_d, Y, X));
1810 
1811   // Reorder X with the inverse column (row) permutation
1812   if (!identity) {
1813     PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { x(rowperm(i)) = X(i); }));
1814   }
1815 
1816   PetscCall(VecRestoreKokkosView(bb, &b));
1817   PetscCall(VecRestoreKokkosViewWrite(xx, &x));
1818   PetscCall(PetscLogGpuTimeEnd());
1819   PetscFunctionReturn(PETSC_SUCCESS);
1820 }
1821 
1822 // Solve Ax = b, with RAC = LU, where R and C are row and col permutation matrices on A respectively.
1823 // R and C are represented by rowperm and colperm in factors.
1824 // If R or C is identity (i.e, no reordering), then rowperm or colperm is empty.
1825 static PetscErrorCode MatSolve_SeqAIJKokkos_LU(Mat A, Vec bb, Vec xx)
1826 {
1827   auto                        exec    = PetscGetKokkosExecutionSpace();
1828   Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1829   PetscInt                    m       = A->rmap->n;
1830   PetscScalarKokkosView       X, Y, B; // alias
1831   ConstPetscScalarKokkosView  b;
1832   PetscScalarKokkosView       x;
1833   PetscIntKokkosView         &rowperm      = factors->rowperm;
1834   PetscIntKokkosView         &colperm      = factors->colperm;
1835   PetscBool                   row_identity = rowperm.extent(0) ? PETSC_FALSE : PETSC_TRUE;
1836   PetscBool                   col_identity = colperm.extent(0) ? PETSC_FALSE : PETSC_TRUE;
1837 
1838   PetscFunctionBegin;
1839   PetscCall(PetscLogGpuTimeBegin());
1840   PetscCall(MatSeqAIJKokkosSolveCheck(A));
1841   PetscCall(VecGetKokkosView(bb, &b));
1842   PetscCall(VecGetKokkosViewWrite(xx, &x));
1843 
1844   // Solve L Y = B (i.e., L (U C^- x) = R b).  R b indicates applying the row permutation on b.
1845   if (row_identity) {
1846     B = PetscScalarKokkosView(const_cast<PetscScalar *>(b.data()), b.extent(0));
1847     Y = factors->workVector;
1848   } else {
1849     B = factors->workVector;
1850     PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { B(i) = b(rowperm(i)); }));
1851     Y = x;
1852   }
1853   PetscCallCXX(sptrsv_solve(exec, &factors->khL, factors->iL_d, factors->jL_d, factors->aL_d, B, Y));
1854 
1855   // Solve U C^- x = Y
1856   if (col_identity) {
1857     X = x;
1858   } else {
1859     X = factors->workVector;
1860   }
1861   PetscCallCXX(sptrsv_solve(exec, &factors->khU, factors->iU_d, factors->jU_d, factors->aU_d, Y, X));
1862 
1863   // x = C X; Reorder X with the inverse col permutation
1864   if (!col_identity) {
1865     PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { x(colperm(i)) = X(i); }));
1866   }
1867 
1868   PetscCall(VecRestoreKokkosView(bb, &b));
1869   PetscCall(VecRestoreKokkosViewWrite(xx, &x));
1870   PetscCall(PetscLogGpuTimeEnd());
1871   PetscFunctionReturn(PETSC_SUCCESS);
1872 }
1873 
1874 // Solve A^T x = b, with RAC = LU, where R and C are row and col permutation matrices on A respectively.
1875 // R and C are represented by rowperm and colperm in factors.
1876 // If R or C is identity (i.e, no reordering), then rowperm or colperm is empty.
1877 // A = R^-1 L U C^-1, so A^T = C^-T U^T L^T R^-T. But since C^- = C^T, R^- = R^T, we have A^T = C U^T L^T R.
1878 static PetscErrorCode MatSolveTranspose_SeqAIJKokkos_LU(Mat A, Vec bb, Vec xx)
1879 {
1880   auto                        exec    = PetscGetKokkosExecutionSpace();
1881   Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1882   PetscInt                    m       = A->rmap->n;
1883   PetscScalarKokkosView       X, Y, B; // alias
1884   ConstPetscScalarKokkosView  b;
1885   PetscScalarKokkosView       x;
1886   PetscIntKokkosView         &rowperm      = factors->rowperm;
1887   PetscIntKokkosView         &colperm      = factors->colperm;
1888   PetscBool                   row_identity = rowperm.extent(0) ? PETSC_FALSE : PETSC_TRUE;
1889   PetscBool                   col_identity = colperm.extent(0) ? PETSC_FALSE : PETSC_TRUE;
1890 
1891   PetscFunctionBegin;
1892   PetscCall(PetscLogGpuTimeBegin());
1893   PetscCall(MatSeqAIJKokkosTransposeSolveCheck(A)); // Update L^T, U^T if needed, and do sptrsv symbolic for L^T, U^T
1894   PetscCall(VecGetKokkosView(bb, &b));
1895   PetscCall(VecGetKokkosViewWrite(xx, &x));
1896 
1897   // Solve U^T Y = B (i.e., U^T (L^T R x) = C^- b).  Note C^- b = C^T b, which means applying the column permutation on b.
1898   if (col_identity) { // Reorder b with the col permutation
1899     B = PetscScalarKokkosView(const_cast<PetscScalar *>(b.data()), b.extent(0));
1900     Y = factors->workVector;
1901   } else {
1902     B = factors->workVector;
1903     PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { B(i) = b(colperm(i)); }));
1904     Y = x;
1905   }
1906   PetscCallCXX(sptrsv_solve(exec, &factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d, B, Y));
1907 
1908   // Solve L^T X = Y
1909   if (row_identity) {
1910     X = x;
1911   } else {
1912     X = factors->workVector;
1913   }
1914   PetscCallCXX(sptrsv_solve(exec, &factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d, Y, X));
1915 
1916   // x = R^- X = R^T X; Reorder X with the inverse row permutation
1917   if (!row_identity) {
1918     PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { x(rowperm(i)) = X(i); }));
1919   }
1920 
1921   PetscCall(VecRestoreKokkosView(bb, &b));
1922   PetscCall(VecRestoreKokkosViewWrite(xx, &x));
1923   PetscCall(PetscLogGpuTimeEnd());
1924   PetscFunctionReturn(PETSC_SUCCESS);
1925 }
1926 
1927 static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info)
1928 {
1929   PetscFunctionBegin;
1930   PetscCall(MatSeqAIJKokkosSyncHost(A));
1931   PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info));
1932 
1933   if (!info->solveonhost) { // if solve on host, then we don't need to copy L, U to device
1934     Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr;
1935     Mat_SeqAIJ                 *b       = static_cast<Mat_SeqAIJ *>(B->data);
1936     const PetscInt             *Bi = b->i, *Bj = b->j, *Bdiag = b->diag;
1937     const MatScalar            *Ba = b->a;
1938     PetscInt                    m = B->rmap->n, n = B->cmap->n;
1939 
1940     if (factors->iL_h.extent(0) == 0) { // Allocate memory and copy the L, U structure for the first time
1941       // Allocate memory and copy the structure
1942       factors->iL_h = MatRowMapKokkosViewHost(NoInit("iL_h"), m + 1);
1943       factors->jL_h = MatColIdxKokkosViewHost(NoInit("jL_h"), (Bi[m] - Bi[0]) + m); // + the diagonal entries
1944       factors->aL_h = MatScalarKokkosViewHost(NoInit("aL_h"), (Bi[m] - Bi[0]) + m);
1945       factors->iU_h = MatRowMapKokkosViewHost(NoInit("iU_h"), m + 1);
1946       factors->jU_h = MatColIdxKokkosViewHost(NoInit("jU_h"), (Bdiag[0] - Bdiag[m]));
1947       factors->aU_h = MatScalarKokkosViewHost(NoInit("aU_h"), (Bdiag[0] - Bdiag[m]));
1948 
1949       PetscInt *Li = factors->iL_h.data();
1950       PetscInt *Lj = factors->jL_h.data();
1951       PetscInt *Ui = factors->iU_h.data();
1952       PetscInt *Uj = factors->jU_h.data();
1953 
1954       Li[0] = Ui[0] = 0;
1955       for (PetscInt i = 0; i < m; i++) {
1956         PetscInt llen = Bi[i + 1] - Bi[i];       // exclusive of the diagonal entry
1957         PetscInt ulen = Bdiag[i] - Bdiag[i + 1]; // inclusive of the diagonal entry
1958 
1959         PetscArraycpy(Lj + Li[i], Bj + Bi[i], llen); // entries of L on the left of the diagonal
1960         Lj[Li[i] + llen] = i;                        // diagonal entry of L
1961 
1962         Uj[Ui[i]] = i;                                                  // diagonal entry of U
1963         PetscArraycpy(Uj + Ui[i] + 1, Bj + Bdiag[i + 1] + 1, ulen - 1); // entries of U on  the right of the diagonal
1964 
1965         Li[i + 1] = Li[i] + llen + 1;
1966         Ui[i + 1] = Ui[i] + ulen;
1967       }
1968 
1969       factors->iL_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->iL_h);
1970       factors->jL_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->jL_h);
1971       factors->iU_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->iU_h);
1972       factors->jU_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->jU_h);
1973       factors->aL_d = Kokkos::create_mirror_view(DefaultMemorySpace(), factors->aL_h);
1974       factors->aU_d = Kokkos::create_mirror_view(DefaultMemorySpace(), factors->aU_h);
1975 
1976       // Copy row/col permutation to device
1977       IS        rowperm = ((Mat_SeqAIJ *)B->data)->row;
1978       PetscBool row_identity;
1979       PetscCall(ISIdentity(rowperm, &row_identity));
1980       if (!row_identity) {
1981         const PetscInt *ip;
1982 
1983         PetscCall(ISGetIndices(rowperm, &ip));
1984         factors->rowperm = PetscIntKokkosView(NoInit("rowperm"), m);
1985         PetscCallCXX(Kokkos::deep_copy(factors->rowperm, PetscIntKokkosViewHost(const_cast<PetscInt *>(ip), m)));
1986         PetscCall(ISRestoreIndices(rowperm, &ip));
1987         PetscCall(PetscLogCpuToGpu(m * sizeof(PetscInt)));
1988       }
1989 
1990       IS        colperm = ((Mat_SeqAIJ *)B->data)->col;
1991       PetscBool col_identity;
1992       PetscCall(ISIdentity(colperm, &col_identity));
1993       if (!col_identity) {
1994         const PetscInt *ip;
1995 
1996         PetscCall(ISGetIndices(colperm, &ip));
1997         factors->colperm = PetscIntKokkosView(NoInit("colperm"), n);
1998         PetscCallCXX(Kokkos::deep_copy(factors->colperm, PetscIntKokkosViewHost(const_cast<PetscInt *>(ip), n)));
1999         PetscCall(ISRestoreIndices(colperm, &ip));
2000         PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
2001       }
2002 
2003       /* Create sptrsv handles for L, U and their transpose */
2004 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
2005       auto sptrsv_alg = SPTRSVAlgorithm::SPTRSV_CUSPARSE;
2006 #else
2007       auto sptrsv_alg = SPTRSVAlgorithm::SEQLVLSCHD_TP1;
2008 #endif
2009       factors->khL.create_sptrsv_handle(sptrsv_alg, m, true /* L is lower tri */);
2010       factors->khU.create_sptrsv_handle(sptrsv_alg, m, false /* U is not lower tri */);
2011       factors->khLt.create_sptrsv_handle(sptrsv_alg, m, false /* L^T is not lower tri */);
2012       factors->khUt.create_sptrsv_handle(sptrsv_alg, m, true /* U^T is lower tri */);
2013     }
2014 
2015     // Copy the value
2016     for (PetscInt i = 0; i < m; i++) {
2017       PetscInt        llen = Bi[i + 1] - Bi[i];
2018       PetscInt        ulen = Bdiag[i] - Bdiag[i + 1];
2019       const PetscInt *Li   = factors->iL_h.data();
2020       const PetscInt *Ui   = factors->iU_h.data();
2021 
2022       PetscScalar *La = factors->aL_h.data();
2023       PetscScalar *Ua = factors->aU_h.data();
2024 
2025       PetscArraycpy(La + Li[i], Ba + Bi[i], llen); // entries of L
2026       La[Li[i] + llen] = 1.0;                      // diagonal entry
2027 
2028       Ua[Ui[i]] = 1.0 / Ba[Bdiag[i]];                                 // diagonal entry
2029       PetscArraycpy(Ua + Ui[i] + 1, Ba + Bdiag[i + 1] + 1, ulen - 1); // entries of U
2030     }
2031 
2032     PetscCallCXX(Kokkos::deep_copy(factors->aL_d, factors->aL_h));
2033     PetscCallCXX(Kokkos::deep_copy(factors->aU_d, factors->aU_h));
2034     // Once the factors' values have changed, we need to update their transpose and redo sptrsv symbolic
2035     factors->transpose_updated         = PETSC_FALSE;
2036     factors->sptrsv_symbolic_completed = PETSC_FALSE;
2037 
2038     B->ops->solve          = MatSolve_SeqAIJKokkos_LU;
2039     B->ops->solvetranspose = MatSolveTranspose_SeqAIJKokkos_LU;
2040   }
2041 
2042   B->ops->matsolve          = NULL;
2043   B->ops->matsolvetranspose = NULL;
2044   PetscFunctionReturn(PETSC_SUCCESS);
2045 }
2046 
2047 static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos_ILU0(Mat B, Mat A, const MatFactorInfo *info)
2048 {
2049   Mat_SeqAIJKokkos           *aijkok   = (Mat_SeqAIJKokkos *)A->spptr;
2050   Mat_SeqAIJKokkosTriFactors *factors  = (Mat_SeqAIJKokkosTriFactors *)B->spptr;
2051   PetscInt                    fill_lev = info->levels;
2052 
2053   PetscFunctionBegin;
2054   PetscCall(PetscLogGpuTimeBegin());
2055   PetscCheck(!info->factoronhost, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "MatFactorInfo.factoronhost should be false");
2056   PetscCall(MatSeqAIJKokkosSyncDevice(A));
2057 
2058   auto a_d = aijkok->a_dual.view_device();
2059   auto i_d = aijkok->i_dual.view_device();
2060   auto j_d = aijkok->j_dual.view_device();
2061 
2062   PetscCallCXX(spiluk_numeric(&factors->kh, fill_lev, i_d, j_d, a_d, factors->iL_d, factors->jL_d, factors->aL_d, factors->iU_d, factors->jU_d, factors->aU_d));
2063 
2064   B->assembled              = PETSC_TRUE;
2065   B->preallocated           = PETSC_TRUE;
2066   B->ops->solve             = MatSolve_SeqAIJKokkos_LU;
2067   B->ops->solvetranspose    = MatSolveTranspose_SeqAIJKokkos_LU;
2068   B->ops->matsolve          = NULL;
2069   B->ops->matsolvetranspose = NULL;
2070 
2071   /* Once the factors' value changed, we need to update their transpose and sptrsv handle */
2072   factors->transpose_updated         = PETSC_FALSE;
2073   factors->sptrsv_symbolic_completed = PETSC_FALSE;
2074   /* TODO: log flops, but how to know that? */
2075   PetscCall(PetscLogGpuTimeEnd());
2076   PetscFunctionReturn(PETSC_SUCCESS);
2077 }
2078 
2079 // Use KK's spiluk_symbolic() to do ILU0 symbolic factorization, with no row/col reordering
2080 static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos_ILU0(Mat B, Mat A, IS, IS, const MatFactorInfo *info)
2081 {
2082   Mat_SeqAIJKokkos           *aijkok;
2083   Mat_SeqAIJ                 *b;
2084   Mat_SeqAIJKokkosTriFactors *factors  = (Mat_SeqAIJKokkosTriFactors *)B->spptr;
2085   PetscInt                    fill_lev = info->levels;
2086   PetscInt                    nnzA     = ((Mat_SeqAIJ *)A->data)->nz, nnzL, nnzU;
2087   PetscInt                    n        = A->rmap->n;
2088 
2089   PetscFunctionBegin;
2090   PetscCheck(!info->factoronhost, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "MatFactorInfo's factoronhost should be false as we are doing it on device right now");
2091   PetscCall(MatSeqAIJKokkosSyncDevice(A));
2092 
2093   /* Create a spiluk handle and then do symbolic factorization */
2094   nnzL = nnzU = PetscRealIntMultTruncate(info->fill, nnzA);
2095   factors->kh.create_spiluk_handle(SPILUKAlgorithm::SEQLVLSCHD_TP1, n, nnzL, nnzU);
2096 
2097   auto spiluk_handle = factors->kh.get_spiluk_handle();
2098 
2099   Kokkos::realloc(factors->iL_d, n + 1); /* Free old arrays and realloc */
2100   Kokkos::realloc(factors->jL_d, spiluk_handle->get_nnzL());
2101   Kokkos::realloc(factors->iU_d, n + 1);
2102   Kokkos::realloc(factors->jU_d, spiluk_handle->get_nnzU());
2103 
2104   aijkok   = (Mat_SeqAIJKokkos *)A->spptr;
2105   auto i_d = aijkok->i_dual.view_device();
2106   auto j_d = aijkok->j_dual.view_device();
2107   PetscCallCXX(spiluk_symbolic(&factors->kh, fill_lev, i_d, j_d, factors->iL_d, factors->jL_d, factors->iU_d, factors->jU_d));
2108   /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */
2109 
2110   Kokkos::resize(factors->jL_d, spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */
2111   Kokkos::resize(factors->jU_d, spiluk_handle->get_nnzU());
2112   Kokkos::realloc(factors->aL_d, spiluk_handle->get_nnzL()); /* No need to retain old value */
2113   Kokkos::realloc(factors->aU_d, spiluk_handle->get_nnzU());
2114 
2115   /* TODO: add options to select sptrsv algorithms */
2116   /* Create sptrsv handles for L, U and their transpose */
2117 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
2118   auto sptrsv_alg = SPTRSVAlgorithm::SPTRSV_CUSPARSE;
2119 #else
2120   auto sptrsv_alg = SPTRSVAlgorithm::SEQLVLSCHD_TP1;
2121 #endif
2122 
2123   factors->khL.create_sptrsv_handle(sptrsv_alg, n, true /* L is lower tri */);
2124   factors->khU.create_sptrsv_handle(sptrsv_alg, n, false /* U is not lower tri */);
2125   factors->khLt.create_sptrsv_handle(sptrsv_alg, n, false /* L^T is not lower tri */);
2126   factors->khUt.create_sptrsv_handle(sptrsv_alg, n, true /* U^T is lower tri */);
2127 
2128   /* Fill fields of the factor matrix B */
2129   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(B, MAT_SKIP_ALLOCATION, NULL));
2130   b     = (Mat_SeqAIJ *)B->data;
2131   b->nz = b->maxnz          = spiluk_handle->get_nnzL() + spiluk_handle->get_nnzU();
2132   B->info.fill_ratio_given  = info->fill;
2133   B->info.fill_ratio_needed = nnzA > 0 ? ((PetscReal)b->nz) / ((PetscReal)nnzA) : 1.0;
2134 
2135   B->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJKokkos_ILU0;
2136   PetscFunctionReturn(PETSC_SUCCESS);
2137 }
2138 
2139 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
2140 {
2141   PetscFunctionBegin;
2142   PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
2143   PetscCheck(!B->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr");
2144   PetscCallCXX(B->spptr = new Mat_SeqAIJKokkosTriFactors(B->rmap->n));
2145   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos;
2146   PetscFunctionReturn(PETSC_SUCCESS);
2147 }
2148 
2149 static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
2150 {
2151   PetscBool row_identity = PETSC_FALSE, col_identity = PETSC_FALSE;
2152 
2153   PetscFunctionBegin;
2154   if (!info->factoronhost) {
2155     PetscCall(ISIdentity(isrow, &row_identity));
2156     PetscCall(ISIdentity(iscol, &col_identity));
2157   }
2158 
2159   PetscCheck(!B->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr");
2160   PetscCallCXX(B->spptr = new Mat_SeqAIJKokkosTriFactors(B->rmap->n));
2161 
2162   if (!info->factoronhost && !info->levels && row_identity && col_identity) { // if level 0 and no reordering
2163     PetscCall(MatILUFactorSymbolic_SeqAIJKokkos_ILU0(B, A, isrow, iscol, info));
2164   } else {
2165     PetscCall(MatILUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info)); // otherwise, use PETSc's ILU on host
2166     B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos;
2167   }
2168   PetscFunctionReturn(PETSC_SUCCESS);
2169 }
2170 
2171 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info)
2172 {
2173   PetscFunctionBegin;
2174   PetscCall(MatSeqAIJKokkosSyncHost(A));
2175   PetscCall(MatCholeskyFactorNumeric_SeqAIJ(B, A, info));
2176 
2177   if (!info->solveonhost) { // if solve on host, then we don't need to copy L, U to device
2178     Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr;
2179     Mat_SeqAIJ                 *b       = static_cast<Mat_SeqAIJ *>(B->data);
2180     const PetscInt             *Bi = b->i, *Bj = b->j, *Bdiag = b->diag;
2181     const MatScalar            *Ba = b->a;
2182     PetscInt                    m  = B->rmap->n;
2183 
2184     if (factors->iU_h.extent(0) == 0) { // First time of numeric factorization
2185       // Allocate memory and copy the structure
2186       factors->iU_h = PetscIntKokkosViewHost(const_cast<PetscInt *>(Bi), m + 1); // wrap Bi as iU_h
2187       factors->jU_h = MatColIdxKokkosViewHost(NoInit("jU_h"), Bi[m]);
2188       factors->aU_h = MatScalarKokkosViewHost(NoInit("aU_h"), Bi[m]);
2189       factors->D_h  = MatScalarKokkosViewHost(NoInit("D_h"), m);
2190       factors->aU_d = Kokkos::create_mirror_view(DefaultMemorySpace(), factors->aU_h);
2191       factors->D_d  = Kokkos::create_mirror_view(DefaultMemorySpace(), factors->D_h);
2192 
2193       // Build jU_h from the skewed Aj
2194       PetscInt *Uj = factors->jU_h.data();
2195       for (PetscInt i = 0; i < m; i++) {
2196         PetscInt ulen = Bi[i + 1] - Bi[i];
2197         Uj[Bi[i]]     = i;                                              // diagonal entry
2198         PetscCall(PetscArraycpy(Uj + Bi[i] + 1, Bj + Bi[i], ulen - 1)); // entries of U on the right of the diagonal
2199       }
2200 
2201       // Copy iU, jU to device
2202       PetscCallCXX(factors->iU_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->iU_h));
2203       PetscCallCXX(factors->jU_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->jU_h));
2204 
2205       // Copy row/col permutation to device
2206       IS        rowperm = ((Mat_SeqAIJ *)B->data)->row;
2207       PetscBool row_identity;
2208       PetscCall(ISIdentity(rowperm, &row_identity));
2209       if (!row_identity) {
2210         const PetscInt *ip;
2211 
2212         PetscCall(ISGetIndices(rowperm, &ip));
2213         PetscCallCXX(factors->rowperm = PetscIntKokkosView(NoInit("rowperm"), m));
2214         PetscCallCXX(Kokkos::deep_copy(factors->rowperm, PetscIntKokkosViewHost(const_cast<PetscInt *>(ip), m)));
2215         PetscCall(ISRestoreIndices(rowperm, &ip));
2216         PetscCall(PetscLogCpuToGpu(m * sizeof(PetscInt)));
2217       }
2218 
2219       // Create sptrsv handles for U and U^T
2220 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
2221       auto sptrsv_alg = SPTRSVAlgorithm::SPTRSV_CUSPARSE;
2222 #else
2223       auto sptrsv_alg = SPTRSVAlgorithm::SEQLVLSCHD_TP1;
2224 #endif
2225       factors->khU.create_sptrsv_handle(sptrsv_alg, m, false /* U is not lower tri */);
2226       factors->khUt.create_sptrsv_handle(sptrsv_alg, m, true /* U^T is lower tri */);
2227     }
2228     // These pointers were set MatCholeskyFactorNumeric_SeqAIJ(), so we always need to update them
2229     B->ops->solve          = MatSolve_SeqAIJKokkos_Cholesky;
2230     B->ops->solvetranspose = MatSolve_SeqAIJKokkos_Cholesky;
2231 
2232     // Copy the value
2233     PetscScalar *Ua = factors->aU_h.data();
2234     PetscScalar *D  = factors->D_h.data();
2235     for (PetscInt i = 0; i < m; i++) {
2236       D[i]      = Ba[Bdiag[i]];     // actually Aa[Adiag[i]] is the inverse of the diagonal
2237       Ua[Bi[i]] = (PetscScalar)1.0; // set the unit diagonal for U
2238       for (PetscInt k = 0; k < Bi[i + 1] - Bi[i] - 1; k++) Ua[Bi[i] + 1 + k] = -Ba[Bi[i] + k];
2239     }
2240     PetscCallCXX(Kokkos::deep_copy(factors->aU_d, factors->aU_h));
2241     PetscCallCXX(Kokkos::deep_copy(factors->D_d, factors->D_h));
2242 
2243     factors->sptrsv_symbolic_completed = PETSC_FALSE; // When numeric value changed, we must do these again
2244     factors->transpose_updated         = PETSC_FALSE;
2245   }
2246 
2247   B->ops->matsolve          = NULL;
2248   B->ops->matsolvetranspose = NULL;
2249   PetscFunctionReturn(PETSC_SUCCESS);
2250 }
2251 
2252 static PetscErrorCode MatICCFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS perm, const MatFactorInfo *info)
2253 {
2254   PetscFunctionBegin;
2255   if (info->solveonhost) {
2256     // If solve on host, we have to change the type, as eventually we need to call MatSolve_SeqSBAIJ_1_NaturalOrdering() etc.
2257     PetscCall(MatSetType(B, MATSEQSBAIJ));
2258     PetscCall(MatSeqSBAIJSetPreallocation(B, 1, MAT_SKIP_ALLOCATION, NULL));
2259   }
2260 
2261   PetscCall(MatICCFactorSymbolic_SeqAIJ(B, A, perm, info));
2262 
2263   if (!info->solveonhost) {
2264     // If solve on device, B is still a MATSEQAIJKOKKOS, so we are good to allocate B->spptr
2265     PetscCheck(!B->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr");
2266     PetscCallCXX(B->spptr = new Mat_SeqAIJKokkosTriFactors(B->rmap->n));
2267     B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJKokkos;
2268   }
2269   PetscFunctionReturn(PETSC_SUCCESS);
2270 }
2271 
2272 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS perm, const MatFactorInfo *info)
2273 {
2274   PetscFunctionBegin;
2275   if (info->solveonhost) {
2276     // If solve on host, we have to change the type, as eventually we need to call MatSolve_SeqSBAIJ_1_NaturalOrdering() etc.
2277     PetscCall(MatSetType(B, MATSEQSBAIJ));
2278     PetscCall(MatSeqSBAIJSetPreallocation(B, 1, MAT_SKIP_ALLOCATION, NULL));
2279   }
2280 
2281   PetscCall(MatCholeskyFactorSymbolic_SeqAIJ(B, A, perm, info)); // it sets B's two ISes ((Mat_SeqAIJ*)B->data)->{row, col} to perm
2282 
2283   if (!info->solveonhost) {
2284     // If solve on device, B is still a MATSEQAIJKOKKOS, so we are good to allocate B->spptr
2285     PetscCheck(!B->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr");
2286     PetscCallCXX(B->spptr = new Mat_SeqAIJKokkosTriFactors(B->rmap->n));
2287     B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJKokkos;
2288   }
2289   PetscFunctionReturn(PETSC_SUCCESS);
2290 }
2291 
2292 // The _Kokkos suffix means we will use Kokkos as a solver for the SeqAIJKokkos matrix
2293 static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos_Kokkos(Mat A, MatSolverType *type)
2294 {
2295   PetscFunctionBegin;
2296   *type = MATSOLVERKOKKOS;
2297   PetscFunctionReturn(PETSC_SUCCESS);
2298 }
2299 
2300 /*MC
2301   MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices
2302   on a single GPU of type, `MATSEQAIJKOKKOS`, `MATAIJKOKKOS`.
2303 
2304   Level: beginner
2305 
2306 .seealso: [](ch_matrices), `Mat`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJKokkos()`, `MATAIJKOKKOS`, `MatKokkosSetFormat()`, `MatKokkosStorageFormat`, `MatKokkosFormatOperation`
2307 M*/
2308 PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A, MatFactorType ftype, Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */
2309 {
2310   PetscInt n = A->rmap->n;
2311   MPI_Comm comm;
2312 
2313   PetscFunctionBegin;
2314   PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
2315   PetscCall(MatCreate(comm, B));
2316   PetscCall(MatSetSizes(*B, n, n, n, n));
2317   PetscCall(MatSetBlockSizesFromMats(*B, A, A));
2318   (*B)->factortype = ftype;
2319   PetscCall(MatSetType(*B, MATSEQAIJKOKKOS));
2320   PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL));
2321   PetscCheck(!(*B)->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr");
2322 
2323   if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
2324     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJKokkos;
2325     (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos;
2326     PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU]));
2327     PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILU]));
2328     PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILUDT]));
2329   } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
2330     (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJKokkos;
2331     (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJKokkos;
2332     PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]));
2333     PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ICC]));
2334   } else SETERRQ(comm, PETSC_ERR_SUP, "MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]);
2335 
2336   // The factorization can use the ordering provided in MatLUFactorSymbolic(), MatCholeskyFactorSymbolic() etc, though we do it on host
2337   (*B)->canuseordering = PETSC_TRUE;
2338   PetscCall(PetscObjectComposeFunction((PetscObject)*B, "MatFactorGetSolverType_C", MatFactorGetSolverType_SeqAIJKokkos_Kokkos));
2339   PetscFunctionReturn(PETSC_SUCCESS);
2340 }
2341 
2342 PETSC_INTERN PetscErrorCode MatSolverTypeRegister_Kokkos(void)
2343 {
2344   PetscFunctionBegin;
2345   PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_LU, MatGetFactor_SeqAIJKokkos_Kokkos));
2346   PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_CHOLESKY, MatGetFactor_SeqAIJKokkos_Kokkos));
2347   PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_ILU, MatGetFactor_SeqAIJKokkos_Kokkos));
2348   PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_ICC, MatGetFactor_SeqAIJKokkos_Kokkos));
2349   PetscFunctionReturn(PETSC_SUCCESS);
2350 }
2351 
2352 /* Utility to print out a KokkosCsrMatrix for debugging */
2353 PETSC_INTERN PetscErrorCode PrintCsrMatrix(const KokkosCsrMatrix &csrmat)
2354 {
2355   const auto        &iv = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), csrmat.graph.row_map);
2356   const auto        &jv = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), csrmat.graph.entries);
2357   const auto        &av = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), csrmat.values);
2358   const PetscInt    *i  = iv.data();
2359   const PetscInt    *j  = jv.data();
2360   const PetscScalar *a  = av.data();
2361   PetscInt           m = csrmat.numRows(), n = csrmat.numCols(), nnz = csrmat.nnz();
2362 
2363   PetscFunctionBegin;
2364   PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT " x %" PetscInt_FMT " SeqAIJKokkos, with %" PetscInt_FMT " nonzeros\n", m, n, nnz));
2365   for (PetscInt k = 0; k < m; k++) {
2366     PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT ": ", k));
2367     for (PetscInt p = i[k]; p < i[k + 1]; p++) PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT "(%.1f), ", j[p], (double)PetscRealPart(a[p])));
2368     PetscCall(PetscPrintf(PETSC_COMM_SELF, "\n"));
2369   }
2370   PetscFunctionReturn(PETSC_SUCCESS);
2371 }
2372