xref: /petsc/src/mat/impls/aij/seq/kokkos/aijkok.kokkos.cxx (revision e2fbb1ba6e3eace0c4c0036fdbc8a86f123a4fe5)
1 #include <petsc_kokkos.hpp>
2 #include <petscvec_kokkos.hpp>
3 #include <petscpkg_version.h>
4 #include <petsc/private/petscimpl.h>
5 #include <petsc/private/sfimpl.h>
6 #include <petscsystypes.h>
7 #include <petscerror.h>
8 
9 #include <Kokkos_Core.hpp>
10 #include <KokkosBlas.hpp>
11 #include <KokkosSparse_CrsMatrix.hpp>
12 #include <KokkosSparse_spmv.hpp>
13 #include <KokkosSparse_spiluk.hpp>
14 #include <KokkosSparse_sptrsv.hpp>
15 #include <KokkosSparse_spgemm.hpp>
16 #include <KokkosSparse_spadd.hpp>
17 #include <KokkosBatched_LU_Decl.hpp>
18 #include <KokkosBatched_InverseLU_Decl.hpp>
19 
20 #include <../src/mat/impls/aij/seq/kokkos/aijkok.hpp>
21 
22 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_GE(3, 7, 0)
23   #include <KokkosSparse_Utils.hpp>
24 using KokkosSparse::sort_crs_matrix;
25 using KokkosSparse::Impl::transpose_matrix;
26 #else
27   #include <KokkosKernels_Sorting.hpp>
28 using KokkosKernels::sort_crs_matrix;
29 using KokkosKernels::Impl::transpose_matrix;
30 #endif
31 
32 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */
33 
34 /* MatAssemblyEnd_SeqAIJKokkos() happens when we finalized nonzeros of the matrix, either after
35    we assembled the matrix on host, or after we directly produced the matrix data on device (ex., through MatMatMult).
36    In the latter case, it is important to set a_dual's sync state correctly.
37  */
38 static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A, MatAssemblyType mode)
39 {
40   Mat_SeqAIJ       *aijseq;
41   Mat_SeqAIJKokkos *aijkok;
42 
43   PetscFunctionBegin;
44   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS);
45   PetscCall(MatAssemblyEnd_SeqAIJ(A, mode));
46 
47   aijseq = static_cast<Mat_SeqAIJ *>(A->data);
48   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
49 
50   /* If aijkok does not exist, we just copy i, j to device.
51      If aijkok already exists, but the device's nonzero pattern does not match with the host's, we assume the latest data is on host.
52      In both cases, we build a new aijkok structure.
53   */
54   if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { /* aijkok might not exist yet or nonzero pattern has changed */
55     delete aijkok;
56     aijkok   = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aijseq, A->nonzerostate, PETSC_FALSE /*don't copy mat values to device*/);
57     A->spptr = aijkok;
58   } else if (A->rmap->n && aijkok->diag_dual.extent(0) == 0) { // MatProduct might directly produce AIJ on device, but not the diag.
59     MatRowMapKokkosViewHost diag_h(aijseq->diag, A->rmap->n);
60     auto                    diag_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), diag_h);
61     aijkok->diag_dual              = MatRowMapKokkosDualView(diag_d, diag_h);
62   }
63   PetscFunctionReturn(PETSC_SUCCESS);
64 }
65 
66 /* Sync CSR data to device if not yet */
67 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A)
68 {
69   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
70 
71   PetscFunctionBegin;
72   PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Can't sync factorized matrix from host to device");
73   PetscCheck(aijkok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
74   if (aijkok->a_dual.need_sync_device()) {
75     aijkok->a_dual.sync_device();
76     aijkok->transpose_updated = PETSC_FALSE; /* values of the transpose is out-of-date */
77     aijkok->hermitian_updated = PETSC_FALSE;
78   }
79   PetscFunctionReturn(PETSC_SUCCESS);
80 }
81 
82 /* Mark the CSR data on device as modified */
83 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosModifyDevice(Mat A)
84 {
85   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
86 
87   PetscFunctionBegin;
88   PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Not supported for factorized matries");
89   aijkok->a_dual.clear_sync_state();
90   aijkok->a_dual.modify_device();
91   aijkok->transpose_updated = PETSC_FALSE;
92   aijkok->hermitian_updated = PETSC_FALSE;
93   PetscCall(MatSeqAIJInvalidateDiagonal(A));
94   PetscCall(PetscObjectStateIncrease((PetscObject)A));
95   PetscFunctionReturn(PETSC_SUCCESS);
96 }
97 
98 static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A)
99 {
100   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
101   auto             &exec   = PetscGetKokkosExecutionSpace();
102 
103   PetscFunctionBegin;
104   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
105   /* We do not expect one needs factors on host  */
106   PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Can't sync factorized matrix from device to host");
107   PetscCheck(aijkok, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing AIJKOK");
108   PetscCallCXX(aijkok->a_dual.sync_host(exec));
109   PetscCallCXX(exec.fence());
110   PetscFunctionReturn(PETSC_SUCCESS);
111 }
112 
113 static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A, PetscScalar *array[])
114 {
115   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
116 
117   PetscFunctionBegin;
118   /* aijkok contains valid pointers only if the host's nonzerostate matches with the device's.
119     Calling MatSeqAIJSetPreallocation() or MatSetValues() on host, where aijseq->{i,j,a} might be
120     reallocated, will lead to stale {i,j,a}_dual in aijkok. In both operations, the hosts's nonzerostate
121     must have been updated. The stale aijkok will be rebuilt during MatAssemblyEnd.
122   */
123   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
124     auto &exec = PetscGetKokkosExecutionSpace();
125     PetscCallCXX(aijkok->a_dual.sync_host(exec));
126     PetscCallCXX(exec.fence());
127     *array = aijkok->a_dual.view_host().data();
128   } else { /* Happens when calling MatSetValues on a newly created matrix */
129     *array = static_cast<Mat_SeqAIJ *>(A->data)->a;
130   }
131   PetscFunctionReturn(PETSC_SUCCESS);
132 }
133 
134 static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJKokkos(Mat A, PetscScalar *array[])
135 {
136   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
137 
138   PetscFunctionBegin;
139   if (aijkok && A->nonzerostate == aijkok->nonzerostate) aijkok->a_dual.modify_host();
140   PetscFunctionReturn(PETSC_SUCCESS);
141 }
142 
143 static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[])
144 {
145   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
146 
147   PetscFunctionBegin;
148   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
149     auto &exec = PetscGetKokkosExecutionSpace();
150     PetscCallCXX(aijkok->a_dual.sync_host(exec));
151     PetscCallCXX(exec.fence());
152     *array = aijkok->a_dual.view_host().data();
153   } else {
154     *array = static_cast<Mat_SeqAIJ *>(A->data)->a;
155   }
156   PetscFunctionReturn(PETSC_SUCCESS);
157 }
158 
159 static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[])
160 {
161   PetscFunctionBegin;
162   *array = NULL;
163   PetscFunctionReturn(PETSC_SUCCESS);
164 }
165 
166 static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[])
167 {
168   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
169 
170   PetscFunctionBegin;
171   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
172     *array = aijkok->a_dual.view_host().data();
173   } else { /* Ex. happens with MatZeroEntries on a preallocated but not assembled matrix */
174     *array = static_cast<Mat_SeqAIJ *>(A->data)->a;
175   }
176   PetscFunctionReturn(PETSC_SUCCESS);
177 }
178 
179 static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[])
180 {
181   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
182 
183   PetscFunctionBegin;
184   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
185     aijkok->a_dual.clear_sync_state();
186     aijkok->a_dual.modify_host();
187   }
188   PetscFunctionReturn(PETSC_SUCCESS);
189 }
190 
191 static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJKokkos(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype)
192 {
193   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
194 
195   PetscFunctionBegin;
196   PetscCheck(aijkok != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "aijkok is NULL");
197 
198   if (i) *i = aijkok->i_device_data();
199   if (j) *j = aijkok->j_device_data();
200   if (a) {
201     aijkok->a_dual.sync_device();
202     *a = aijkok->a_device_data();
203   }
204   if (mtype) *mtype = PETSC_MEMTYPE_KOKKOS;
205   PetscFunctionReturn(PETSC_SUCCESS);
206 }
207 
208 /*
209   Generate the sparsity pattern of a MatSeqAIJKokkos matrix's transpose on device.
210 
211   Input Parameter:
212 .  A       - the MATSEQAIJKOKKOS matrix
213 
214   Output Parameters:
215 +  perm_d - the permutation array on device, which connects Ta(i) = Aa(perm(i))
216 -  T_d    - the transpose on device, whose value array is allocated but not initialized
217 */
218 static PetscErrorCode MatSeqAIJKokkosGenerateTransposeStructure(Mat A, MatRowMapKokkosView &perm_d, KokkosCsrMatrix &T_d)
219 {
220   Mat_SeqAIJ             *aseq = static_cast<Mat_SeqAIJ *>(A->data);
221   PetscInt                nz = aseq->nz, m = A->rmap->N, n = A->cmap->n;
222   const PetscInt         *Ai = aseq->i, *Aj = aseq->j;
223   MatRowMapKokkosViewHost Ti_h(NoInit("Ti"), n + 1);
224   MatRowMapType          *Ti = Ti_h.data();
225   MatColIdxKokkosViewHost Tj_h(NoInit("Tj"), nz);
226   MatRowMapKokkosViewHost perm_h(NoInit("permutation"), nz);
227   PetscInt               *Tj   = Tj_h.data();
228   PetscInt               *perm = perm_h.data();
229   PetscInt               *offset;
230 
231   PetscFunctionBegin;
232   // Populate Ti
233   PetscCallCXX(Kokkos::deep_copy(Ti_h, 0));
234   Ti++;
235   for (PetscInt i = 0; i < nz; i++) Ti[Aj[i]]++;
236   Ti--;
237   for (PetscInt i = 0; i < n; i++) Ti[i + 1] += Ti[i];
238 
239   // Populate Tj and the permutation array
240   PetscCall(PetscCalloc1(n, &offset)); // offset in each T row to fill in its column indices
241   for (PetscInt i = 0; i < m; i++) {
242     for (PetscInt j = Ai[i]; j < Ai[i + 1]; j++) { // A's (i,j) is T's (j,i)
243       PetscInt r    = Aj[j];                       // row r of T
244       PetscInt disp = Ti[r] + offset[r];
245 
246       Tj[disp]   = i; // col i of T
247       perm[disp] = j;
248       offset[r]++;
249     }
250   }
251   PetscCall(PetscFree(offset));
252 
253   // Sort each row of T, along with the permutation array
254   for (PetscInt i = 0; i < n; i++) PetscCall(PetscSortIntWithArray(Ti[i + 1] - Ti[i], Tj + Ti[i], perm + Ti[i]));
255 
256   // Output perm and T on device
257   auto Ti_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), Ti_h);
258   auto Tj_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), Tj_h);
259   PetscCallCXX(T_d = KokkosCsrMatrix("csrmatT", n, m, nz, MatScalarKokkosView("Ta", nz), Ti_d, Tj_d));
260   PetscCallCXX(perm_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), perm_h));
261   PetscFunctionReturn(PETSC_SUCCESS);
262 }
263 
264 // Generate the transpose on device and cache it internally
265 // Note: KK transpose_matrix() does not have support symbolic/numeric transpose, so we do it on our own
266 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGenerateTranspose_Private(Mat A, KokkosCsrMatrix *csrmatT)
267 {
268   Mat_SeqAIJ       *aseq = static_cast<Mat_SeqAIJ *>(A->data);
269   Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
270   PetscInt          nz = aseq->nz, m = A->rmap->N, n = A->cmap->n;
271   KokkosCsrMatrix  &T = akok->csrmatT;
272 
273   PetscFunctionBegin;
274   PetscCheck(akok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
275   PetscCallCXX(akok->a_dual.sync_device()); // Sync A's values since we are going to access them on device
276 
277   const auto &Aa = akok->a_dual.view_device();
278 
279   if (A->symmetric == PETSC_BOOL3_TRUE) {
280     *csrmatT = akok->csrmat;
281   } else {
282     // See if we already have a cached transpose and its value is up to date
283     if (T.numRows() == n && T.numCols() == m) {  // this indicates csrmatT had been generated before, otherwise T has 0 rows/cols after construction
284       if (!akok->transpose_updated) {            // if the value is out of date, update the cached version
285         const auto &perm = akok->transpose_perm; // get the permutation array
286         auto       &Ta   = T.values;
287 
288         PetscCallCXX(Kokkos::parallel_for(nz, KOKKOS_LAMBDA(const PetscInt i) { Ta(i) = Aa(perm(i)); }));
289       }
290     } else { // Generate T of size n x m for the first time
291       MatRowMapKokkosView perm;
292 
293       PetscCall(MatSeqAIJKokkosGenerateTransposeStructure(A, perm, T));
294       akok->transpose_perm = perm; // cache the perm in this matrix for reuse
295       PetscCallCXX(Kokkos::parallel_for(nz, KOKKOS_LAMBDA(const PetscInt i) { T.values(i) = Aa(perm(i)); }));
296     }
297     akok->transpose_updated = PETSC_TRUE;
298     *csrmatT                = akok->csrmatT;
299   }
300   PetscFunctionReturn(PETSC_SUCCESS);
301 }
302 
303 // Generate the Hermitian on device and cache it internally
304 static PetscErrorCode MatSeqAIJKokkosGenerateHermitian_Private(Mat A, KokkosCsrMatrix *csrmatH)
305 {
306   Mat_SeqAIJ       *aseq = static_cast<Mat_SeqAIJ *>(A->data);
307   Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
308   PetscInt          nz = aseq->nz, m = A->rmap->N, n = A->cmap->n;
309   KokkosCsrMatrix  &T = akok->csrmatH;
310 
311   PetscFunctionBegin;
312   PetscCheck(akok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
313   PetscCallCXX(akok->a_dual.sync_device()); // Sync A's values since we are going to access them on device
314 
315   const auto &Aa = akok->a_dual.view_device();
316 
317   if (A->hermitian == PETSC_BOOL3_TRUE) {
318     *csrmatH = akok->csrmat;
319   } else {
320     // See if we already have a cached hermitian and its value is up to date
321     if (T.numRows() == n && T.numCols() == m) {  // this indicates csrmatT had been generated before, otherwise T has 0 rows/cols after construction
322       if (!akok->hermitian_updated) {            // if the value is out of date, update the cached version
323         const auto &perm = akok->transpose_perm; // get the permutation array
324         auto       &Ta   = T.values;
325 
326         PetscCallCXX(Kokkos::parallel_for(nz, KOKKOS_LAMBDA(const PetscInt i) { Ta(i) = PetscConj(Aa(perm(i))); }));
327       }
328     } else { // Generate T of size n x m for the first time
329       MatRowMapKokkosView perm;
330 
331       PetscCall(MatSeqAIJKokkosGenerateTransposeStructure(A, perm, T));
332       akok->transpose_perm = perm; // cache the perm in this matrix for reuse
333       PetscCallCXX(Kokkos::parallel_for(nz, KOKKOS_LAMBDA(const PetscInt i) { T.values(i) = PetscConj(Aa(perm(i))); }));
334     }
335     akok->hermitian_updated = PETSC_TRUE;
336     *csrmatH                = akok->csrmatH;
337   }
338   PetscFunctionReturn(PETSC_SUCCESS);
339 }
340 
341 /* y = A x */
342 static PetscErrorCode MatMult_SeqAIJKokkos(Mat A, Vec xx, Vec yy)
343 {
344   Mat_SeqAIJKokkos          *aijkok;
345   ConstPetscScalarKokkosView xv;
346   PetscScalarKokkosView      yv;
347 
348   PetscFunctionBegin;
349   PetscCall(PetscLogGpuTimeBegin());
350   PetscCall(MatSeqAIJKokkosSyncDevice(A));
351   PetscCall(VecGetKokkosView(xx, &xv));
352   PetscCall(VecGetKokkosViewWrite(yy, &yv));
353   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
354   PetscCallCXX(KokkosSparse::spmv("N", 1.0 /*alpha*/, aijkok->csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A x + beta y */
355   PetscCall(VecRestoreKokkosView(xx, &xv));
356   PetscCall(VecRestoreKokkosViewWrite(yy, &yv));
357   /* 2.0*nnz - numRows seems more accurate here but assumes there are no zero-rows. So a little sloppy here. */
358   PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz()));
359   PetscCall(PetscLogGpuTimeEnd());
360   PetscFunctionReturn(PETSC_SUCCESS);
361 }
362 
363 /* y = A^T x */
364 static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy)
365 {
366   Mat_SeqAIJKokkos          *aijkok;
367   const char                *mode;
368   ConstPetscScalarKokkosView xv;
369   PetscScalarKokkosView      yv;
370   KokkosCsrMatrix            csrmat;
371 
372   PetscFunctionBegin;
373   PetscCall(PetscLogGpuTimeBegin());
374   PetscCall(MatSeqAIJKokkosSyncDevice(A));
375   PetscCall(VecGetKokkosView(xx, &xv));
376   PetscCall(VecGetKokkosViewWrite(yy, &yv));
377   if (A->form_explicit_transpose) {
378     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat));
379     mode = "N";
380   } else {
381     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
382     csrmat = aijkok->csrmat;
383     mode   = "T";
384   }
385   PetscCallCXX(KokkosSparse::spmv(mode, 1.0 /*alpha*/, csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^T x + beta y */
386   PetscCall(VecRestoreKokkosView(xx, &xv));
387   PetscCall(VecRestoreKokkosViewWrite(yy, &yv));
388   PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
389   PetscCall(PetscLogGpuTimeEnd());
390   PetscFunctionReturn(PETSC_SUCCESS);
391 }
392 
393 /* y = A^H x */
394 static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy)
395 {
396   Mat_SeqAIJKokkos          *aijkok;
397   const char                *mode;
398   ConstPetscScalarKokkosView xv;
399   PetscScalarKokkosView      yv;
400   KokkosCsrMatrix            csrmat;
401 
402   PetscFunctionBegin;
403   PetscCall(PetscLogGpuTimeBegin());
404   PetscCall(MatSeqAIJKokkosSyncDevice(A));
405   PetscCall(VecGetKokkosView(xx, &xv));
406   PetscCall(VecGetKokkosViewWrite(yy, &yv));
407   if (A->form_explicit_transpose) {
408     PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat));
409     mode = "N";
410   } else {
411     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
412     csrmat = aijkok->csrmat;
413     mode   = "C";
414   }
415   PetscCallCXX(KokkosSparse::spmv(mode, 1.0 /*alpha*/, csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^H x + beta y */
416   PetscCall(VecRestoreKokkosView(xx, &xv));
417   PetscCall(VecRestoreKokkosViewWrite(yy, &yv));
418   PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
419   PetscCall(PetscLogGpuTimeEnd());
420   PetscFunctionReturn(PETSC_SUCCESS);
421 }
422 
423 /* z = A x + y */
424 static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz)
425 {
426   Mat_SeqAIJKokkos          *aijkok;
427   ConstPetscScalarKokkosView xv, yv;
428   PetscScalarKokkosView      zv;
429 
430   PetscFunctionBegin;
431   PetscCall(PetscLogGpuTimeBegin());
432   PetscCall(MatSeqAIJKokkosSyncDevice(A));
433   PetscCall(VecGetKokkosView(xx, &xv));
434   PetscCall(VecGetKokkosView(yy, &yv));
435   PetscCall(VecGetKokkosViewWrite(zz, &zv));
436   if (zz != yy) Kokkos::deep_copy(zv, yv);
437   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
438   PetscCallCXX(KokkosSparse::spmv("N", 1.0 /*alpha*/, aijkok->csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A x + beta z */
439   PetscCall(VecRestoreKokkosView(xx, &xv));
440   PetscCall(VecRestoreKokkosView(yy, &yv));
441   PetscCall(VecRestoreKokkosViewWrite(zz, &zv));
442   PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz()));
443   PetscCall(PetscLogGpuTimeEnd());
444   PetscFunctionReturn(PETSC_SUCCESS);
445 }
446 
447 /* z = A^T x + y */
448 static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz)
449 {
450   Mat_SeqAIJKokkos          *aijkok;
451   const char                *mode;
452   ConstPetscScalarKokkosView xv, yv;
453   PetscScalarKokkosView      zv;
454   KokkosCsrMatrix            csrmat;
455 
456   PetscFunctionBegin;
457   PetscCall(PetscLogGpuTimeBegin());
458   PetscCall(MatSeqAIJKokkosSyncDevice(A));
459   PetscCall(VecGetKokkosView(xx, &xv));
460   PetscCall(VecGetKokkosView(yy, &yv));
461   PetscCall(VecGetKokkosViewWrite(zz, &zv));
462   if (zz != yy) Kokkos::deep_copy(zv, yv);
463   if (A->form_explicit_transpose) {
464     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat));
465     mode = "N";
466   } else {
467     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
468     csrmat = aijkok->csrmat;
469     mode   = "T";
470   }
471   PetscCallCXX(KokkosSparse::spmv(mode, 1.0 /*alpha*/, csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^T x + beta z */
472   PetscCall(VecRestoreKokkosView(xx, &xv));
473   PetscCall(VecRestoreKokkosView(yy, &yv));
474   PetscCall(VecRestoreKokkosViewWrite(zz, &zv));
475   PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
476   PetscCall(PetscLogGpuTimeEnd());
477   PetscFunctionReturn(PETSC_SUCCESS);
478 }
479 
480 /* z = A^H x + y */
481 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz)
482 {
483   Mat_SeqAIJKokkos          *aijkok;
484   const char                *mode;
485   ConstPetscScalarKokkosView xv, yv;
486   PetscScalarKokkosView      zv;
487   KokkosCsrMatrix            csrmat;
488 
489   PetscFunctionBegin;
490   PetscCall(PetscLogGpuTimeBegin());
491   PetscCall(MatSeqAIJKokkosSyncDevice(A));
492   PetscCall(VecGetKokkosView(xx, &xv));
493   PetscCall(VecGetKokkosView(yy, &yv));
494   PetscCall(VecGetKokkosViewWrite(zz, &zv));
495   if (zz != yy) Kokkos::deep_copy(zv, yv);
496   if (A->form_explicit_transpose) {
497     PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat));
498     mode = "N";
499   } else {
500     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
501     csrmat = aijkok->csrmat;
502     mode   = "C";
503   }
504   PetscCallCXX(KokkosSparse::spmv(mode, 1.0 /*alpha*/, csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^H x + beta z */
505   PetscCall(VecRestoreKokkosView(xx, &xv));
506   PetscCall(VecRestoreKokkosView(yy, &yv));
507   PetscCall(VecRestoreKokkosViewWrite(zz, &zv));
508   PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
509   PetscCall(PetscLogGpuTimeEnd());
510   PetscFunctionReturn(PETSC_SUCCESS);
511 }
512 
513 static PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A, MatOption op, PetscBool flg)
514 {
515   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
516 
517   PetscFunctionBegin;
518   switch (op) {
519   case MAT_FORM_EXPLICIT_TRANSPOSE:
520     /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
521     if (A->form_explicit_transpose && !flg && aijkok) PetscCall(aijkok->DestroyMatTranspose());
522     A->form_explicit_transpose = flg;
523     break;
524   default:
525     PetscCall(MatSetOption_SeqAIJ(A, op, flg));
526     break;
527   }
528   PetscFunctionReturn(PETSC_SUCCESS);
529 }
530 
531 /* Depending on reuse, either build a new mat, or use the existing mat */
532 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat *newmat)
533 {
534   Mat_SeqAIJ *aseq;
535 
536   PetscFunctionBegin;
537   PetscCall(PetscKokkosInitializeCheck());
538   if (reuse == MAT_INITIAL_MATRIX) {                      /* Build a brand new mat */
539     PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat));  /* the returned newmat is a SeqAIJKokkos */
540   } else if (reuse == MAT_REUSE_MATRIX) {                 /* Reuse the mat created before */
541     PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN)); /* newmat is already a SeqAIJKokkos */
542   } else if (reuse == MAT_INPLACE_MATRIX) {               /* newmat is A */
543     PetscCheck(A == *newmat, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "A != *newmat with MAT_INPLACE_MATRIX");
544     PetscCall(PetscFree(A->defaultvectype));
545     PetscCall(PetscStrallocpy(VECKOKKOS, &A->defaultvectype)); /* Allocate and copy the string */
546     PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATSEQAIJKOKKOS));
547     PetscCall(MatSetOps_SeqAIJKokkos(A));
548     aseq = static_cast<Mat_SeqAIJ *>(A->data);
549     if (A->assembled) { /* Copy i, j (but not values) to device for an assembled matrix if not yet */
550       PetscCheck(!A->spptr, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Expect NULL (Mat_SeqAIJKokkos*)A->spptr");
551       A->spptr = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aseq, A->nonzerostate, PETSC_FALSE);
552     }
553   }
554   PetscFunctionReturn(PETSC_SUCCESS);
555 }
556 
557 /* MatDuplicate always creates a new matrix. MatDuplicate can be called either on an assembled matrix or
558    an unassembled matrix, even though MAT_COPY_VALUES is not allowed for unassembled matrix.
559  */
560 static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A, MatDuplicateOption dupOption, Mat *B)
561 {
562   Mat_SeqAIJ       *bseq;
563   Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr), *bkok;
564   Mat               mat;
565 
566   PetscFunctionBegin;
567   /* Do not copy values on host as A's latest values might be on device. We don't want to do sync blindly */
568   PetscCall(MatDuplicate_SeqAIJ(A, MAT_DO_NOT_COPY_VALUES, B));
569   mat = *B;
570   if (A->assembled) {
571     bseq = static_cast<Mat_SeqAIJ *>(mat->data);
572     bkok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, bseq, mat->nonzerostate, PETSC_FALSE);
573     bkok->a_dual.clear_sync_state(); /* Clear B's sync state as it will be decided below */
574     /* Now copy values to B if needed */
575     if (dupOption == MAT_COPY_VALUES) {
576       if (akok->a_dual.need_sync_device()) {
577         Kokkos::deep_copy(bkok->a_dual.view_host(), akok->a_dual.view_host());
578         bkok->a_dual.modify_host();
579       } else { /* If device has the latest data, we only copy data on device */
580         Kokkos::deep_copy(bkok->a_dual.view_device(), akok->a_dual.view_device());
581         bkok->a_dual.modify_device();
582       }
583     } else { /* MAT_DO_NOT_COPY_VALUES or MAT_SHARE_NONZERO_PATTERN. B's values should be zeroed */
584       /* B's values on host should be already zeroed by MatDuplicate_SeqAIJ() */
585       bkok->a_dual.modify_host();
586     }
587     mat->spptr = bkok;
588   }
589 
590   PetscCall(PetscFree(mat->defaultvectype));
591   PetscCall(PetscStrallocpy(VECKOKKOS, &mat->defaultvectype)); /* Allocate and copy the string */
592   PetscCall(PetscObjectChangeTypeName((PetscObject)mat, MATSEQAIJKOKKOS));
593   PetscCall(MatSetOps_SeqAIJKokkos(mat));
594   PetscFunctionReturn(PETSC_SUCCESS);
595 }
596 
597 static PetscErrorCode MatTranspose_SeqAIJKokkos(Mat A, MatReuse reuse, Mat *B)
598 {
599   Mat               At;
600   KokkosCsrMatrix   internT;
601   Mat_SeqAIJKokkos *atkok, *bkok;
602 
603   PetscFunctionBegin;
604   if (reuse == MAT_REUSE_MATRIX) PetscCall(MatTransposeCheckNonzeroState_Private(A, *B));
605   PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &internT)); /* Generate a transpose internally */
606   if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_INPLACE_MATRIX) {
607     /* Deep copy internT, as we want to isolate the internal transpose */
608     PetscCallCXX(atkok = new Mat_SeqAIJKokkos(KokkosCsrMatrix("csrmat", internT)));
609     PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PetscObjectComm((PetscObject)A), atkok, &At));
610     if (reuse == MAT_INITIAL_MATRIX) *B = At;
611     else PetscCall(MatHeaderReplace(A, &At)); /* Replace A with At inplace */
612   } else {                                    /* MAT_REUSE_MATRIX, just need to copy values to B on device */
613     if ((*B)->assembled) {
614       bkok = static_cast<Mat_SeqAIJKokkos *>((*B)->spptr);
615       PetscCallCXX(Kokkos::deep_copy(bkok->a_dual.view_device(), internT.values));
616       PetscCall(MatSeqAIJKokkosModifyDevice(*B));
617     } else if ((*B)->preallocated) { /* It is ok for B to be only preallocated, as needed in MatTranspose_MPIAIJ */
618       Mat_SeqAIJ             *bseq = static_cast<Mat_SeqAIJ *>((*B)->data);
619       MatScalarKokkosViewHost a_h(bseq->a, internT.nnz()); /* bseq->nz = 0 if unassembled */
620       MatColIdxKokkosViewHost j_h(bseq->j, internT.nnz());
621       PetscCallCXX(Kokkos::deep_copy(a_h, internT.values));
622       PetscCallCXX(Kokkos::deep_copy(j_h, internT.graph.entries));
623     } else SETERRQ(PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "B must be assembled or preallocated");
624   }
625   PetscFunctionReturn(PETSC_SUCCESS);
626 }
627 
628 static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A)
629 {
630   Mat_SeqAIJKokkos *aijkok;
631 
632   PetscFunctionBegin;
633   if (A->factortype == MAT_FACTOR_NONE) {
634     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
635     delete aijkok;
636   } else {
637     delete static_cast<Mat_SeqAIJKokkosTriFactors *>(A->spptr);
638   }
639   A->spptr = NULL;
640   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatFactorGetSolverType_C", NULL));
641   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
642   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
643   PetscCall(MatDestroy_SeqAIJ(A));
644   PetscFunctionReturn(PETSC_SUCCESS);
645 }
646 
647 /*MC
648    MATSEQAIJKOKKOS - MATAIJKOKKOS = "(seq)aijkokkos" - A matrix type to be used for sparse matrices with Kokkos
649 
650    A matrix type using Kokkos-Kernels CrsMatrix type for portability across different device types
651 
652    Options Database Key:
653 .  -mat_type aijkokkos - sets the matrix type to `MATSEQAIJKOKKOS` during a call to `MatSetFromOptions()`
654 
655   Level: beginner
656 
657 .seealso: [](ch_matrices), `Mat`, `MatCreateSeqAIJKokkos()`, `MATMPIAIJKOKKOS`
658 M*/
659 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A)
660 {
661   PetscFunctionBegin;
662   PetscCall(PetscKokkosInitializeCheck());
663   PetscCall(MatCreate_SeqAIJ(A));
664   PetscCall(MatConvert_SeqAIJ_SeqAIJKokkos(A, MATSEQAIJKOKKOS, MAT_INPLACE_MATRIX, &A));
665   PetscFunctionReturn(PETSC_SUCCESS);
666 }
667 
668 /* Merge A, B into a matrix C. A is put before B. C's size would be A->rmap->n by (A->cmap->n + B->cmap->n) */
669 PetscErrorCode MatSeqAIJKokkosMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C)
670 {
671   Mat_SeqAIJ         *a, *b;
672   Mat_SeqAIJKokkos   *akok, *bkok, *ckok;
673   MatScalarKokkosView aa, ba, ca;
674   MatRowMapKokkosView ai, bi, ci;
675   MatColIdxKokkosView aj, bj, cj;
676   PetscInt            m, n, nnz, aN;
677 
678   PetscFunctionBegin;
679   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
680   PetscValidHeaderSpecific(B, MAT_CLASSID, 2);
681   PetscAssertPointer(C, 4);
682   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
683   PetscCheckTypeName(B, MATSEQAIJKOKKOS);
684   PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n);
685   PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported");
686 
687   PetscCall(MatSeqAIJKokkosSyncDevice(A));
688   PetscCall(MatSeqAIJKokkosSyncDevice(B));
689   a    = static_cast<Mat_SeqAIJ *>(A->data);
690   b    = static_cast<Mat_SeqAIJ *>(B->data);
691   akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
692   bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
693   aa   = akok->a_dual.view_device();
694   ai   = akok->i_dual.view_device();
695   ba   = bkok->a_dual.view_device();
696   bi   = bkok->i_dual.view_device();
697   m    = A->rmap->n; /* M, N and nnz of C */
698   n    = A->cmap->n + B->cmap->n;
699   nnz  = a->nz + b->nz;
700   aN   = A->cmap->n; /* N of A */
701   if (reuse == MAT_INITIAL_MATRIX) {
702     aj           = akok->j_dual.view_device();
703     bj           = bkok->j_dual.view_device();
704     auto ca_dual = MatScalarKokkosDualView("a", aa.extent(0) + ba.extent(0));
705     auto ci_dual = MatRowMapKokkosDualView("i", ai.extent(0));
706     auto cj_dual = MatColIdxKokkosDualView("j", aj.extent(0) + bj.extent(0));
707     ca           = ca_dual.view_device();
708     ci           = ci_dual.view_device();
709     cj           = cj_dual.view_device();
710 
711     /* Concatenate A and B in parallel using Kokkos hierarchical parallelism */
712     Kokkos::parallel_for(
713       Kokkos::TeamPolicy<>(m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
714         PetscInt i       = t.league_rank(); /* row i */
715         PetscInt coffset = ai(i) + bi(i), alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i);
716 
717         Kokkos::single(Kokkos::PerTeam(t), [=]() { /* this side effect only happens once per whole team */
718                                                    ci(i) = coffset;
719                                                    if (i == m - 1) ci(m) = ai(m) + bi(m);
720         });
721 
722         Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) {
723           if (k < alen) {
724             ca(coffset + k) = aa(ai(i) + k);
725             cj(coffset + k) = aj(ai(i) + k);
726           } else {
727             ca(coffset + k) = ba(bi(i) + k - alen);
728             cj(coffset + k) = bj(bi(i) + k - alen) + aN; /* Entries in B get new column indices in C */
729           }
730         });
731       });
732     ca_dual.modify_device();
733     ci_dual.modify_device();
734     cj_dual.modify_device();
735     PetscCallCXX(ckok = new Mat_SeqAIJKokkos(m, n, nnz, ci_dual, cj_dual, ca_dual));
736     PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, ckok, C));
737   } else if (reuse == MAT_REUSE_MATRIX) {
738     PetscValidHeaderSpecific(*C, MAT_CLASSID, 4);
739     PetscCheckTypeName(*C, MATSEQAIJKOKKOS);
740     ckok = static_cast<Mat_SeqAIJKokkos *>((*C)->spptr);
741     ca   = ckok->a_dual.view_device();
742     ci   = ckok->i_dual.view_device();
743 
744     Kokkos::parallel_for(
745       Kokkos::TeamPolicy<>(m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
746         PetscInt i    = t.league_rank(); /* row i */
747         PetscInt alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i);
748         Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) {
749           if (k < alen) ca(ci(i) + k) = aa(ai(i) + k);
750           else ca(ci(i) + k) = ba(bi(i) + k - alen);
751         });
752       });
753     PetscCall(MatSeqAIJKokkosModifyDevice(*C));
754   }
755   PetscFunctionReturn(PETSC_SUCCESS);
756 }
757 
758 static PetscErrorCode MatProductDataDestroy_SeqAIJKokkos(void *pdata)
759 {
760   PetscFunctionBegin;
761   delete static_cast<MatProductData_SeqAIJKokkos *>(pdata);
762   PetscFunctionReturn(PETSC_SUCCESS);
763 }
764 
765 static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C)
766 {
767   Mat_Product                 *product = C->product;
768   Mat                          A, B;
769   bool                         transA, transB; /* use bool, since KK needs this type */
770   Mat_SeqAIJKokkos            *akok, *bkok, *ckok;
771   Mat_SeqAIJ                  *c;
772   MatProductData_SeqAIJKokkos *pdata;
773   KokkosCsrMatrix              csrmatA, csrmatB;
774 
775   PetscFunctionBegin;
776   MatCheckProduct(C, 1);
777   PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Product data empty");
778   pdata = static_cast<MatProductData_SeqAIJKokkos *>(C->product->data);
779 
780   // See if numeric has already been done in symbolic (e.g., user calls MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C)).
781   // If yes, skip the numeric, but reset the flag so that next time when user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C),
782   // we still do numeric.
783   if (pdata->reusesym) { // numeric reuses results from symbolic
784     pdata->reusesym = PETSC_FALSE;
785     PetscFunctionReturn(PETSC_SUCCESS);
786   }
787 
788   switch (product->type) {
789   case MATPRODUCT_AB:
790     transA = false;
791     transB = false;
792     break;
793   case MATPRODUCT_AtB:
794     transA = true;
795     transB = false;
796     break;
797   case MATPRODUCT_ABt:
798     transA = false;
799     transB = true;
800     break;
801   default:
802     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]);
803   }
804 
805   A = product->A;
806   B = product->B;
807   PetscCall(MatSeqAIJKokkosSyncDevice(A));
808   PetscCall(MatSeqAIJKokkosSyncDevice(B));
809   akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
810   bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
811   ckok = static_cast<Mat_SeqAIJKokkos *>(C->spptr);
812 
813   PetscCheck(ckok, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Device data structure spptr is empty");
814 
815   csrmatA = akok->csrmat;
816   csrmatB = bkok->csrmat;
817 
818   /* TODO: Once KK spgemm implements transpose, we can get rid of the explicit transpose here */
819   if (transA) {
820     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA));
821     transA = false;
822   }
823 
824   if (transB) {
825     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB));
826     transB = false;
827   }
828   PetscCall(PetscLogGpuTimeBegin());
829   PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, csrmatA, transA, csrmatB, transB, ckok->csrmat));
830 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(4, 0, 0)
831   auto spgemmHandle = pdata->kh.get_spgemm_handle();
832   if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(ckok->csrmat)); /* without sort, mat_tests-ex62_14_seqaijkokkos fails */
833 #endif
834 
835   PetscCall(PetscLogGpuTimeEnd());
836   PetscCall(MatSeqAIJKokkosModifyDevice(C));
837   /* shorter version of MatAssemblyEnd_SeqAIJ */
838   c = (Mat_SeqAIJ *)C->data;
839   PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz));
840   PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n"));
841   PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax));
842   c->reallocs         = 0;
843   C->info.mallocs     = 0;
844   C->info.nz_unneeded = 0;
845   C->assembled = C->was_assembled = PETSC_TRUE;
846   C->num_ass++;
847   PetscFunctionReturn(PETSC_SUCCESS);
848 }
849 
850 static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C)
851 {
852   Mat_Product                 *product = C->product;
853   MatProductType               ptype;
854   Mat                          A, B;
855   bool                         transA, transB;
856   Mat_SeqAIJKokkos            *akok, *bkok, *ckok;
857   MatProductData_SeqAIJKokkos *pdata;
858   MPI_Comm                     comm;
859   KokkosCsrMatrix              csrmatA, csrmatB, csrmatC;
860 
861   PetscFunctionBegin;
862   MatCheckProduct(C, 1);
863   PetscCall(PetscObjectGetComm((PetscObject)C, &comm));
864   PetscCheck(!product->data, comm, PETSC_ERR_PLIB, "Product data not empty");
865   A = product->A;
866   B = product->B;
867   PetscCall(MatSeqAIJKokkosSyncDevice(A));
868   PetscCall(MatSeqAIJKokkosSyncDevice(B));
869   akok    = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
870   bkok    = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
871   csrmatA = akok->csrmat;
872   csrmatB = bkok->csrmat;
873 
874   ptype = product->type;
875   // Take advantage of the symmetry if true
876   if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
877     ptype                                          = MATPRODUCT_AB;
878     product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE;
879   }
880   if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
881     ptype                                          = MATPRODUCT_AB;
882     product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE;
883   }
884 
885   switch (ptype) {
886   case MATPRODUCT_AB:
887     transA = false;
888     transB = false;
889     break;
890   case MATPRODUCT_AtB:
891     transA = true;
892     transB = false;
893     break;
894   case MATPRODUCT_ABt:
895     transA = false;
896     transB = true;
897     break;
898   default:
899     SETERRQ(comm, PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]);
900   }
901   PetscCallCXX(product->data = pdata = new MatProductData_SeqAIJKokkos());
902   pdata->reusesym = product->api_user;
903 
904   /* TODO: add command line options to select spgemm algorithms */
905   auto spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_DEFAULT; /* default alg is TPL if enabled, otherwise KK */
906 
907   /* CUDA-10.2's spgemm has bugs. We prefer the SpGEMMreuse APIs introduced in cuda-11.4 */
908 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
909   #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
910   spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_KK;
911   #endif
912 #endif
913   PetscCallCXX(pdata->kh.create_spgemm_handle(spgemm_alg));
914 
915   PetscCall(PetscLogGpuTimeBegin());
916   /* TODO: Get rid of the explicit transpose once KK-spgemm implements the transpose option */
917   if (transA) {
918     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA));
919     transA = false;
920   }
921 
922   if (transB) {
923     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB));
924     transB = false;
925   }
926 
927   PetscCallCXX(KokkosSparse::spgemm_symbolic(pdata->kh, csrmatA, transA, csrmatB, transB, csrmatC));
928   /* spgemm_symbolic() only populates C's rowmap, but not C's column indices.
929     So we have to do a fake spgemm_numeric() here to get csrmatC.j_d setup, before
930     calling new Mat_SeqAIJKokkos().
931     TODO: Remove the fake spgemm_numeric() after KK fixed this problem.
932   */
933   PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, csrmatA, transA, csrmatB, transB, csrmatC));
934 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(4, 0, 0)
935   /* Query if KK outputs a sorted matrix. If not, we need to sort it */
936   auto spgemmHandle = pdata->kh.get_spgemm_handle();
937   if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(csrmatC)); /* sort_option defaults to -1 in KK!*/
938 #endif
939   PetscCall(PetscLogGpuTimeEnd());
940 
941   PetscCallCXX(ckok = new Mat_SeqAIJKokkos(csrmatC));
942   PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(C, ckok));
943   C->product->destroy = MatProductDataDestroy_SeqAIJKokkos;
944   PetscFunctionReturn(PETSC_SUCCESS);
945 }
946 
947 /* handles sparse matrix matrix ops */
948 static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat)
949 {
950   Mat_Product *product = mat->product;
951   PetscBool    Biskok = PETSC_FALSE, Ciskok = PETSC_TRUE;
952 
953   PetscFunctionBegin;
954   MatCheckProduct(mat, 1);
955   PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJKOKKOS, &Biskok));
956   if (product->type == MATPRODUCT_ABC) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJKOKKOS, &Ciskok));
957   if (Biskok && Ciskok) {
958     switch (product->type) {
959     case MATPRODUCT_AB:
960     case MATPRODUCT_AtB:
961     case MATPRODUCT_ABt:
962       mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos;
963       break;
964     case MATPRODUCT_PtAP:
965     case MATPRODUCT_RARt:
966     case MATPRODUCT_ABC:
967       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
968       break;
969     default:
970       break;
971     }
972   } else { /* fallback for AIJ */
973     PetscCall(MatProductSetFromOptions_SeqAIJ(mat));
974   }
975   PetscFunctionReturn(PETSC_SUCCESS);
976 }
977 
978 static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a)
979 {
980   Mat_SeqAIJKokkos *aijkok;
981 
982   PetscFunctionBegin;
983   PetscCall(PetscLogGpuTimeBegin());
984   PetscCall(MatSeqAIJKokkosSyncDevice(A));
985   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
986   KokkosBlas::scal(aijkok->a_dual.view_device(), a, aijkok->a_dual.view_device());
987   PetscCall(MatSeqAIJKokkosModifyDevice(A));
988   PetscCall(PetscLogGpuFlops(aijkok->a_dual.extent(0)));
989   PetscCall(PetscLogGpuTimeEnd());
990   PetscFunctionReturn(PETSC_SUCCESS);
991 }
992 
993 // add a to A's diagonal (if A is square) or main diagonal (if A is rectangular)
994 static PetscErrorCode MatShift_SeqAIJKokkos(Mat A, PetscScalar a)
995 {
996   Mat_SeqAIJ *aijseq = static_cast<Mat_SeqAIJ *>(A->data);
997 
998   PetscFunctionBegin;
999   if (A->assembled && aijseq->diagonaldense) { // no missing diagonals
1000     PetscInt n = PetscMin(A->rmap->n, A->cmap->n);
1001 
1002     PetscCall(PetscLogGpuTimeBegin());
1003     PetscCall(MatSeqAIJKokkosSyncDevice(A));
1004     const auto  aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1005     const auto &Aa     = aijkok->a_dual.view_device();
1006     const auto &Adiag  = aijkok->diag_dual.view_device();
1007     PetscCallCXX(Kokkos::parallel_for(n, KOKKOS_LAMBDA(const PetscInt i) { Aa(Adiag(i)) += a; }));
1008     PetscCall(MatSeqAIJKokkosModifyDevice(A));
1009     PetscCall(PetscLogGpuFlops(n));
1010     PetscCall(PetscLogGpuTimeEnd());
1011   } else { // need reassembly, very slow!
1012     PetscCall(MatShift_Basic(A, a));
1013   }
1014   PetscFunctionReturn(PETSC_SUCCESS);
1015 }
1016 
1017 static PetscErrorCode MatDiagonalSet_SeqAIJKokkos(Mat Y, Vec D, InsertMode is)
1018 {
1019   Mat_SeqAIJ *aijseq = static_cast<Mat_SeqAIJ *>(Y->data);
1020 
1021   PetscFunctionBegin;
1022   if (Y->assembled && aijseq->diagonaldense) { // no missing diagonals
1023     ConstPetscScalarKokkosView dv;
1024     PetscInt                   n, nv;
1025 
1026     PetscCall(PetscLogGpuTimeBegin());
1027     PetscCall(MatSeqAIJKokkosSyncDevice(Y));
1028     PetscCall(VecGetKokkosView(D, &dv));
1029     PetscCall(VecGetLocalSize(D, &nv));
1030     n = PetscMin(Y->rmap->n, Y->cmap->n);
1031     PetscCheck(n == nv, PetscObjectComm((PetscObject)Y), PETSC_ERR_ARG_SIZ, "Matrix size and vector size do not match");
1032 
1033     const auto  aijkok = static_cast<Mat_SeqAIJKokkos *>(Y->spptr);
1034     const auto &Aa     = aijkok->a_dual.view_device();
1035     const auto &Adiag  = aijkok->diag_dual.view_device();
1036     PetscCallCXX(Kokkos::parallel_for(
1037       n, KOKKOS_LAMBDA(const PetscInt i) {
1038         if (is == INSERT_VALUES) Aa(Adiag(i)) = dv(i);
1039         else Aa(Adiag(i)) += dv(i);
1040       }));
1041     PetscCall(VecRestoreKokkosView(D, &dv));
1042     PetscCall(MatSeqAIJKokkosModifyDevice(Y));
1043     PetscCall(PetscLogGpuFlops(n));
1044     PetscCall(PetscLogGpuTimeEnd());
1045   } else { // need reassembly, very slow!
1046     PetscCall(MatDiagonalSet_Default(Y, D, is));
1047   }
1048   PetscFunctionReturn(PETSC_SUCCESS);
1049 }
1050 
1051 static PetscErrorCode MatDiagonalScale_SeqAIJKokkos(Mat A, Vec ll, Vec rr)
1052 {
1053   Mat_SeqAIJ                *aijseq = static_cast<Mat_SeqAIJ *>(A->data);
1054   PetscInt                   m = A->rmap->n, n = A->cmap->n, nz = aijseq->nz;
1055   ConstPetscScalarKokkosView lv, rv;
1056 
1057   PetscFunctionBegin;
1058   PetscCall(PetscLogGpuTimeBegin());
1059   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1060   const auto  aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1061   const auto &Aa     = aijkok->a_dual.view_device();
1062   const auto &Ai     = aijkok->i_dual.view_device();
1063   const auto &Aj     = aijkok->j_dual.view_device();
1064   if (ll) {
1065     PetscCall(VecGetLocalSize(ll, &m));
1066     PetscCheck(m == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling vector wrong length");
1067     PetscCall(VecGetKokkosView(ll, &lv));
1068     PetscCallCXX(Kokkos::parallel_for( // for each row
1069       Kokkos::TeamPolicy<>(m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
1070         PetscInt i   = t.league_rank(); // row i
1071         PetscInt len = Ai(i + 1) - Ai(i);
1072         // scale entries on the row
1073         Kokkos::parallel_for(Kokkos::TeamThreadRange(t, len), [&](PetscInt j) { Aa(Ai(i) + j) *= lv(i); });
1074       }));
1075     PetscCall(VecRestoreKokkosView(ll, &lv));
1076     PetscCall(PetscLogGpuFlops(nz));
1077   }
1078   if (rr) {
1079     PetscCall(VecGetLocalSize(rr, &n));
1080     PetscCheck(n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Right scaling vector wrong length");
1081     PetscCall(VecGetKokkosView(rr, &rv));
1082     PetscCallCXX(Kokkos::parallel_for( // for each nonzero
1083       nz, KOKKOS_LAMBDA(const PetscInt k) { Aa(k) *= rv(Aj(k)); }));
1084     PetscCall(VecRestoreKokkosView(rr, &lv));
1085     PetscCall(PetscLogGpuFlops(nz));
1086   }
1087   PetscCall(MatSeqAIJKokkosModifyDevice(A));
1088   PetscCall(PetscLogGpuTimeEnd());
1089   PetscFunctionReturn(PETSC_SUCCESS);
1090 }
1091 
1092 static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A)
1093 {
1094   Mat_SeqAIJKokkos *aijkok;
1095 
1096   PetscFunctionBegin;
1097   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1098   if (aijkok) { /* Only zero the device if data is already there */
1099     KokkosBlas::fill(aijkok->a_dual.view_device(), 0.0);
1100     PetscCall(MatSeqAIJKokkosModifyDevice(A));
1101   } else { /* Might be preallocated but not assembled */
1102     PetscCall(MatZeroEntries_SeqAIJ(A));
1103   }
1104   PetscFunctionReturn(PETSC_SUCCESS);
1105 }
1106 
1107 static PetscErrorCode MatGetDiagonal_SeqAIJKokkos(Mat A, Vec x)
1108 {
1109   Mat_SeqAIJKokkos     *aijkok;
1110   PetscInt              n;
1111   PetscScalarKokkosView xv;
1112 
1113   PetscFunctionBegin;
1114   PetscCall(VecGetLocalSize(x, &n));
1115   PetscCheck(n == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector");
1116   PetscCheck(A->factortype == MAT_FACTOR_NONE, PETSC_COMM_SELF, PETSC_ERR_SUP, "MatGetDiagonal_SeqAIJKokkos not supported on factored matrices");
1117 
1118   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1119   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1120 
1121   const auto &Aa    = aijkok->a_dual.view_device();
1122   const auto &Ai    = aijkok->i_dual.view_device();
1123   const auto &Adiag = aijkok->diag_dual.view_device();
1124 
1125   PetscCall(VecGetKokkosViewWrite(x, &xv));
1126   Kokkos::parallel_for(
1127     n, KOKKOS_LAMBDA(const PetscInt i) {
1128       if (Adiag(i) < Ai(i + 1)) xv(i) = Aa(Adiag(i));
1129       else xv(i) = 0;
1130     });
1131   PetscCall(VecRestoreKokkosViewWrite(x, &xv));
1132   PetscFunctionReturn(PETSC_SUCCESS);
1133 }
1134 
1135 /* Get a Kokkos View from a mat of type MatSeqAIJKokkos */
1136 PetscErrorCode MatSeqAIJGetKokkosView(Mat A, ConstMatScalarKokkosView *kv)
1137 {
1138   Mat_SeqAIJKokkos *aijkok;
1139 
1140   PetscFunctionBegin;
1141   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1142   PetscAssertPointer(kv, 2);
1143   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1144   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1145   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1146   *kv    = aijkok->a_dual.view_device();
1147   PetscFunctionReturn(PETSC_SUCCESS);
1148 }
1149 
1150 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, ConstMatScalarKokkosView *kv)
1151 {
1152   PetscFunctionBegin;
1153   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1154   PetscAssertPointer(kv, 2);
1155   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1156   PetscFunctionReturn(PETSC_SUCCESS);
1157 }
1158 
1159 PetscErrorCode MatSeqAIJGetKokkosView(Mat A, MatScalarKokkosView *kv)
1160 {
1161   Mat_SeqAIJKokkos *aijkok;
1162 
1163   PetscFunctionBegin;
1164   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1165   PetscAssertPointer(kv, 2);
1166   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1167   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1168   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1169   *kv    = aijkok->a_dual.view_device();
1170   PetscFunctionReturn(PETSC_SUCCESS);
1171 }
1172 
1173 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, MatScalarKokkosView *kv)
1174 {
1175   PetscFunctionBegin;
1176   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1177   PetscAssertPointer(kv, 2);
1178   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1179   PetscCall(MatSeqAIJKokkosModifyDevice(A));
1180   PetscFunctionReturn(PETSC_SUCCESS);
1181 }
1182 
1183 PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A, MatScalarKokkosView *kv)
1184 {
1185   Mat_SeqAIJKokkos *aijkok;
1186 
1187   PetscFunctionBegin;
1188   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1189   PetscAssertPointer(kv, 2);
1190   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1191   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1192   *kv    = aijkok->a_dual.view_device();
1193   PetscFunctionReturn(PETSC_SUCCESS);
1194 }
1195 
1196 PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A, MatScalarKokkosView *kv)
1197 {
1198   PetscFunctionBegin;
1199   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1200   PetscAssertPointer(kv, 2);
1201   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1202   PetscCall(MatSeqAIJKokkosModifyDevice(A));
1203   PetscFunctionReturn(PETSC_SUCCESS);
1204 }
1205 
1206 /* Computes Y += alpha X */
1207 static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y, PetscScalar alpha, Mat X, MatStructure pattern)
1208 {
1209   Mat_SeqAIJ              *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data;
1210   Mat_SeqAIJKokkos        *xkok, *ykok, *zkok;
1211   ConstMatScalarKokkosView Xa;
1212   MatScalarKokkosView      Ya;
1213 
1214   PetscFunctionBegin;
1215   PetscCheckTypeName(Y, MATSEQAIJKOKKOS);
1216   PetscCheckTypeName(X, MATSEQAIJKOKKOS);
1217   PetscCall(MatSeqAIJKokkosSyncDevice(Y));
1218   PetscCall(MatSeqAIJKokkosSyncDevice(X));
1219   PetscCall(PetscLogGpuTimeBegin());
1220 
1221   if (pattern != SAME_NONZERO_PATTERN && x->nz == y->nz) {
1222     PetscBool e;
1223     PetscCall(PetscArraycmp(x->i, y->i, Y->rmap->n + 1, &e));
1224     if (e) {
1225       PetscCall(PetscArraycmp(x->j, y->j, y->nz, &e));
1226       if (e) pattern = SAME_NONZERO_PATTERN;
1227     }
1228   }
1229 
1230   /* cusparseDcsrgeam2() computes C = alpha A + beta B. If one knew sparsity pattern of C, one can skip
1231     cusparseScsrgeam2_bufferSizeExt() / cusparseXcsrgeam2Nnz(), and directly call cusparseScsrgeam2().
1232     If X is SUBSET_NONZERO_PATTERN of Y, we could take advantage of this cusparse feature. However,
1233     KokkosSparse::spadd(alpha,A,beta,B,C) has symbolic and numeric phases, MatAXPY does not.
1234   */
1235   ykok = static_cast<Mat_SeqAIJKokkos *>(Y->spptr);
1236   xkok = static_cast<Mat_SeqAIJKokkos *>(X->spptr);
1237   Xa   = xkok->a_dual.view_device();
1238   Ya   = ykok->a_dual.view_device();
1239 
1240   if (pattern == SAME_NONZERO_PATTERN) {
1241     KokkosBlas::axpy(alpha, Xa, Ya);
1242     PetscCall(MatSeqAIJKokkosModifyDevice(Y));
1243   } else if (pattern == SUBSET_NONZERO_PATTERN) {
1244     MatRowMapKokkosView Xi = xkok->i_dual.view_device(), Yi = ykok->i_dual.view_device();
1245     MatColIdxKokkosView Xj = xkok->j_dual.view_device(), Yj = ykok->j_dual.view_device();
1246 
1247     Kokkos::parallel_for(
1248       Kokkos::TeamPolicy<>(Y->rmap->n, 1), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
1249         PetscInt i = t.league_rank(); // row i
1250         Kokkos::single(Kokkos::PerTeam(t), [=]() {
1251           // Only one thread works in a team
1252           PetscInt p, q = Yi(i);
1253           for (p = Xi(i); p < Xi(i + 1); p++) {          // For each nonzero on row i of X,
1254             while (Xj(p) != Yj(q) && q < Yi(i + 1)) q++; // find the matching nonzero on row i of Y.
1255             if (Xj(p) == Yj(q)) {                        // Found it
1256               Ya(q) += alpha * Xa(p);
1257               q++;
1258             } else {
1259             // If not found, it indicates the input is wrong (X is not a SUBSET_NONZERO_PATTERN of Y).
1260             // Just insert a NaN at the beginning of row i if it is not empty, to make the result wrong.
1261 #if PETSC_PKG_KOKKOS_VERSION_GE(3, 7, 0)
1262               if (Yi(i) != Yi(i + 1)) Ya(Yi(i)) = Kokkos::ArithTraits<PetscScalar>::nan();
1263 #else
1264               if (Yi(i) != Yi(i + 1)) Ya(Yi(i)) = Kokkos::Experimental::nan("1");
1265 #endif
1266             }
1267           }
1268         });
1269       });
1270     PetscCall(MatSeqAIJKokkosModifyDevice(Y));
1271   } else { // different nonzero patterns
1272     Mat             Z;
1273     KokkosCsrMatrix zcsr;
1274     KernelHandle    kh;
1275     kh.create_spadd_handle(true); // X, Y are sorted
1276     KokkosSparse::spadd_symbolic(&kh, xkok->csrmat, ykok->csrmat, zcsr);
1277     KokkosSparse::spadd_numeric(&kh, alpha, xkok->csrmat, (PetscScalar)1.0, ykok->csrmat, zcsr);
1278     zkok = new Mat_SeqAIJKokkos(zcsr);
1279     PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, zkok, &Z));
1280     PetscCall(MatHeaderReplace(Y, &Z));
1281     kh.destroy_spadd_handle();
1282   }
1283   PetscCall(PetscLogGpuTimeEnd());
1284   PetscCall(PetscLogGpuFlops(xkok->a_dual.extent(0) * 2)); // Because we scaled X and then added it to Y
1285   PetscFunctionReturn(PETSC_SUCCESS);
1286 }
1287 
1288 struct MatCOOStruct_SeqAIJKokkos {
1289   PetscCount           n;
1290   PetscCount           Atot;
1291   PetscInt             nz;
1292   PetscCountKokkosView jmap;
1293   PetscCountKokkosView perm;
1294 
1295   MatCOOStruct_SeqAIJKokkos(const MatCOOStruct_SeqAIJ *coo_h)
1296   {
1297     nz   = coo_h->nz;
1298     n    = coo_h->n;
1299     Atot = coo_h->Atot;
1300     jmap = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), PetscCountKokkosViewHost(coo_h->jmap, nz + 1));
1301     perm = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), PetscCountKokkosViewHost(coo_h->perm, Atot));
1302   }
1303 };
1304 
1305 static PetscErrorCode MatCOOStructDestroy_SeqAIJKokkos(void *data)
1306 {
1307   PetscFunctionBegin;
1308   PetscCallCXX(delete static_cast<MatCOOStruct_SeqAIJKokkos *>(data));
1309   PetscFunctionReturn(PETSC_SUCCESS);
1310 }
1311 
1312 static PetscErrorCode MatSetPreallocationCOO_SeqAIJKokkos(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
1313 {
1314   Mat_SeqAIJKokkos          *akok;
1315   Mat_SeqAIJ                *aseq;
1316   PetscContainer             container_h, container_d;
1317   MatCOOStruct_SeqAIJ       *coo_h;
1318   MatCOOStruct_SeqAIJKokkos *coo_d;
1319 
1320   PetscFunctionBegin;
1321   PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, coo_i, coo_j));
1322   aseq = static_cast<Mat_SeqAIJ *>(mat->data);
1323   akok = static_cast<Mat_SeqAIJKokkos *>(mat->spptr);
1324   delete akok;
1325   mat->spptr = akok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, aseq, mat->nonzerostate + 1, PETSC_FALSE);
1326   PetscCall(MatZeroEntries_SeqAIJKokkos(mat));
1327 
1328   // Copy the COO struct to device
1329   PetscCall(PetscObjectQuery((PetscObject)mat, "__PETSc_MatCOOStruct_Host", (PetscObject *)&container_h));
1330   PetscCall(PetscContainerGetPointer(container_h, (void **)&coo_h));
1331   PetscCallCXX(coo_d = new MatCOOStruct_SeqAIJKokkos(coo_h));
1332 
1333   // Put the COO struct in a container and then attach that to the matrix
1334   PetscCall(PetscContainerCreate(PETSC_COMM_SELF, &container_d));
1335   PetscCall(PetscContainerSetPointer(container_d, coo_d));
1336   PetscCall(PetscContainerSetUserDestroy(container_d, MatCOOStructDestroy_SeqAIJKokkos));
1337   PetscCall(PetscObjectCompose((PetscObject)mat, "__PETSc_MatCOOStruct_Device", (PetscObject)container_d));
1338   PetscCall(PetscContainerDestroy(&container_d));
1339   PetscFunctionReturn(PETSC_SUCCESS);
1340 }
1341 
1342 static PetscErrorCode MatSetValuesCOO_SeqAIJKokkos(Mat A, const PetscScalar v[], InsertMode imode)
1343 {
1344   MatScalarKokkosView        Aa;
1345   ConstMatScalarKokkosView   kv;
1346   PetscMemType               memtype;
1347   PetscContainer             container;
1348   MatCOOStruct_SeqAIJKokkos *coo;
1349 
1350   PetscFunctionBegin;
1351   PetscCall(PetscObjectQuery((PetscObject)A, "__PETSc_MatCOOStruct_Device", (PetscObject *)&container));
1352   PetscCall(PetscContainerGetPointer(container, (void **)&coo));
1353 
1354   const auto &n    = coo->n;
1355   const auto &Annz = coo->nz;
1356   const auto &jmap = coo->jmap;
1357   const auto &perm = coo->perm;
1358 
1359   PetscCall(PetscGetMemType(v, &memtype));
1360   if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */
1361     kv = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), ConstMatScalarKokkosViewHost(v, n));
1362   } else {
1363     kv = ConstMatScalarKokkosView(v, n); /* Directly use v[]'s memory */
1364   }
1365 
1366   if (imode == INSERT_VALUES) PetscCall(MatSeqAIJGetKokkosViewWrite(A, &Aa)); /* write matrix values */
1367   else PetscCall(MatSeqAIJGetKokkosView(A, &Aa));                             /* read & write matrix values */
1368 
1369   PetscCall(PetscLogGpuTimeBegin());
1370   Kokkos::parallel_for(
1371     Annz, KOKKOS_LAMBDA(const PetscCount i) {
1372       PetscScalar sum = 0.0;
1373       for (PetscCount k = jmap(i); k < jmap(i + 1); k++) sum += kv(perm(k));
1374       Aa(i) = (imode == INSERT_VALUES ? 0.0 : Aa(i)) + sum;
1375     });
1376   PetscCall(PetscLogGpuTimeEnd());
1377 
1378   if (imode == INSERT_VALUES) PetscCall(MatSeqAIJRestoreKokkosViewWrite(A, &Aa));
1379   else PetscCall(MatSeqAIJRestoreKokkosView(A, &Aa));
1380   PetscFunctionReturn(PETSC_SUCCESS);
1381 }
1382 
1383 static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info)
1384 {
1385   PetscFunctionBegin;
1386   PetscCall(MatSeqAIJKokkosSyncHost(A));
1387   PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info));
1388   B->offloadmask = PETSC_OFFLOAD_CPU;
1389   PetscFunctionReturn(PETSC_SUCCESS);
1390 }
1391 
1392 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A)
1393 {
1394   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
1395 
1396   PetscFunctionBegin;
1397   A->offloadmask = PETSC_OFFLOAD_KOKKOS; /* We do not really use this flag */
1398   A->boundtocpu  = PETSC_FALSE;
1399 
1400   A->ops->assemblyend               = MatAssemblyEnd_SeqAIJKokkos;
1401   A->ops->destroy                   = MatDestroy_SeqAIJKokkos;
1402   A->ops->duplicate                 = MatDuplicate_SeqAIJKokkos;
1403   A->ops->axpy                      = MatAXPY_SeqAIJKokkos;
1404   A->ops->scale                     = MatScale_SeqAIJKokkos;
1405   A->ops->zeroentries               = MatZeroEntries_SeqAIJKokkos;
1406   A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJKokkos;
1407   A->ops->mult                      = MatMult_SeqAIJKokkos;
1408   A->ops->multadd                   = MatMultAdd_SeqAIJKokkos;
1409   A->ops->multtranspose             = MatMultTranspose_SeqAIJKokkos;
1410   A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJKokkos;
1411   A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJKokkos;
1412   A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos;
1413   A->ops->productnumeric            = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos;
1414   A->ops->transpose                 = MatTranspose_SeqAIJKokkos;
1415   A->ops->setoption                 = MatSetOption_SeqAIJKokkos;
1416   A->ops->getdiagonal               = MatGetDiagonal_SeqAIJKokkos;
1417   A->ops->shift                     = MatShift_SeqAIJKokkos;
1418   A->ops->diagonalset               = MatDiagonalSet_SeqAIJKokkos;
1419   A->ops->diagonalscale             = MatDiagonalScale_SeqAIJKokkos;
1420   a->ops->getarray                  = MatSeqAIJGetArray_SeqAIJKokkos;
1421   a->ops->restorearray              = MatSeqAIJRestoreArray_SeqAIJKokkos;
1422   a->ops->getarrayread              = MatSeqAIJGetArrayRead_SeqAIJKokkos;
1423   a->ops->restorearrayread          = MatSeqAIJRestoreArrayRead_SeqAIJKokkos;
1424   a->ops->getarraywrite             = MatSeqAIJGetArrayWrite_SeqAIJKokkos;
1425   a->ops->restorearraywrite         = MatSeqAIJRestoreArrayWrite_SeqAIJKokkos;
1426   a->ops->getcsrandmemtype          = MatSeqAIJGetCSRAndMemType_SeqAIJKokkos;
1427 
1428   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJKokkos));
1429   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJKokkos));
1430   PetscFunctionReturn(PETSC_SUCCESS);
1431 }
1432 
1433 /*
1434    Extract the (prescribled) diagonal blocks of the matrix and then invert them
1435 
1436   Input Parameters:
1437 +  A       - the MATSEQAIJKOKKOS matrix
1438 .  bs      - block sizes in 'csr' format, i.e., the i-th block has size bs(i+1) - bs(i)
1439 .  bs2     - square of block sizes in 'csr' format, i.e., the i-th block should be stored at offset bs2(i) in diagVal[]
1440 .  blkMap  - map row ids to block ids, i.e., row i belongs to the block blkMap(i)
1441 -  work    - a pre-allocated work buffer (as big as diagVal) for use by this routine
1442 
1443   Output Parameter:
1444 .  diagVal - the (pre-allocated) buffer to store the inverted blocks (each block is stored in column-major order)
1445 */
1446 PETSC_INTERN PetscErrorCode MatInvertVariableBlockDiagonal_SeqAIJKokkos(Mat A, const PetscIntKokkosView &bs, const PetscIntKokkosView &bs2, const PetscIntKokkosView &blkMap, PetscScalarKokkosView &work, PetscScalarKokkosView &diagVal)
1447 {
1448   Mat_SeqAIJKokkos *akok    = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1449   PetscInt          nblocks = bs.extent(0) - 1;
1450 
1451   PetscFunctionBegin;
1452   PetscCall(MatSeqAIJKokkosSyncDevice(A)); // Since we'll access A's value on device
1453 
1454   // Pull out the diagonal blocks of the matrix and then invert the blocks
1455   auto Aa    = akok->a_dual.view_device();
1456   auto Ai    = akok->i_dual.view_device();
1457   auto Aj    = akok->j_dual.view_device();
1458   auto Adiag = akok->diag_dual.view_device();
1459   // TODO: how to tune the team size?
1460 #if defined(KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_HOST)
1461   auto ts = Kokkos::AUTO();
1462 #else
1463   auto ts = 16; // improved performance 30% over Kokkos::AUTO() with CUDA, but failed with "Kokkos::abort: Requested Team Size is too large!" on CPUs
1464 #endif
1465   PetscCallCXX(Kokkos::parallel_for(
1466     Kokkos::TeamPolicy<>(nblocks, ts), KOKKOS_LAMBDA(const KokkosTeamMemberType &teamMember) {
1467       const PetscInt bid    = teamMember.league_rank();                                                   // block id
1468       const PetscInt rstart = bs(bid);                                                                    // this block starts from this row
1469       const PetscInt m      = bs(bid + 1) - bs(bid);                                                      // size of this block
1470       const auto    &B      = Kokkos::View<PetscScalar **, Kokkos::LayoutLeft>(&diagVal(bs2(bid)), m, m); // column-major order
1471       const auto    &W      = PetscScalarKokkosView(&work(bs2(bid)), m * m);
1472 
1473       Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, m), [=](const PetscInt &r) { // r-th row in B
1474         PetscInt i = rstart + r;                                                            // i-th row in A
1475 
1476         if (Ai(i) <= Adiag(i) && Adiag(i) < Ai(i + 1)) { // if the diagonal exists (common case)
1477           PetscInt first = Adiag(i) - r;                 // we start to check nonzeros from here along this row
1478 
1479           for (PetscInt c = 0; c < m; c++) {                   // walk n steps to see what column indices we will meet
1480             if (first + c < Ai(i) || first + c >= Ai(i + 1)) { // this entry (first+c) is out of range of this row, in other words, its value is zero
1481               B(r, c) = 0.0;
1482             } else if (Aj(first + c) == rstart + c) { // this entry is right on the (rstart+c) column
1483               B(r, c) = Aa(first + c);
1484             } else { // this entry does not show up in the CSR
1485               B(r, c) = 0.0;
1486             }
1487           }
1488         } else { // rare case that the diagonal does not exist
1489           const PetscInt begin = Ai(i);
1490           const PetscInt end   = Ai(i + 1);
1491           for (PetscInt c = 0; c < m; c++) B(r, c) = 0.0;
1492           for (PetscInt j = begin; j < end; j++) { // scan the whole row; could use binary search but this is a rare case so we did not.
1493             if (rstart <= Aj(j) && Aj(j) < rstart + m) B(r, Aj(j) - rstart) = Aa(j);
1494             else if (Aj(j) >= rstart + m) break;
1495           }
1496         }
1497       });
1498 
1499       // LU-decompose B (w/o pivoting) and then invert B
1500       KokkosBatched::TeamLU<KokkosTeamMemberType, KokkosBatched::Algo::LU::Unblocked>::invoke(teamMember, B, 0.0);
1501       KokkosBatched::TeamInverseLU<KokkosTeamMemberType, KokkosBatched::Algo::InverseLU::Unblocked>::invoke(teamMember, B, W);
1502     }));
1503   // PetscLogGpuFlops() is done in the caller PCSetUp_VPBJacobi_Kokkos as we don't want to compute the flops in kernels
1504   PetscFunctionReturn(PETSC_SUCCESS);
1505 }
1506 
1507 PETSC_INTERN PetscErrorCode MatSetSeqAIJKokkosWithCSRMatrix(Mat A, Mat_SeqAIJKokkos *akok)
1508 {
1509   Mat_SeqAIJ *aseq;
1510   PetscInt    i, m, n;
1511   auto       &exec = PetscGetKokkosExecutionSpace();
1512 
1513   PetscFunctionBegin;
1514   PetscCheck(!A->spptr, PETSC_COMM_SELF, PETSC_ERR_PLIB, "A->spptr is supposed to be empty");
1515 
1516   m = akok->nrows();
1517   n = akok->ncols();
1518   PetscCall(MatSetSizes(A, m, n, m, n));
1519   PetscCall(MatSetType(A, MATSEQAIJKOKKOS));
1520 
1521   /* Set up data structures of A as a MATSEQAIJ */
1522   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(A, MAT_SKIP_ALLOCATION, NULL));
1523   aseq = (Mat_SeqAIJ *)(A)->data;
1524 
1525   PetscCallCXX(akok->i_dual.sync_host(exec)); /* We always need sync'ed i, j on host */
1526   PetscCallCXX(akok->j_dual.sync_host(exec));
1527   PetscCallCXX(exec.fence());
1528 
1529   aseq->i            = akok->i_host_data();
1530   aseq->j            = akok->j_host_data();
1531   aseq->a            = akok->a_host_data();
1532   aseq->nonew        = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/
1533   aseq->singlemalloc = PETSC_FALSE;
1534   aseq->free_a       = PETSC_FALSE;
1535   aseq->free_ij      = PETSC_FALSE;
1536   aseq->nz           = akok->nnz();
1537   aseq->maxnz        = aseq->nz;
1538 
1539   PetscCall(PetscMalloc1(m, &aseq->imax));
1540   PetscCall(PetscMalloc1(m, &aseq->ilen));
1541   for (i = 0; i < m; i++) aseq->ilen[i] = aseq->imax[i] = aseq->i[i + 1] - aseq->i[i];
1542 
1543   /* It is critical to set the nonzerostate, as we use it to check if sparsity pattern (hence data) has changed on host in MatAssemblyEnd */
1544   akok->nonzerostate = A->nonzerostate;
1545   A->spptr           = akok; /* Set A->spptr before MatAssembly so that A->spptr won't be allocated again there */
1546   PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY));
1547   PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY));
1548   PetscFunctionReturn(PETSC_SUCCESS);
1549 }
1550 
1551 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGetKokkosCsrMatrix(Mat A, KokkosCsrMatrix *csr)
1552 {
1553   PetscFunctionBegin;
1554   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1555   *csr = static_cast<Mat_SeqAIJKokkos *>(A->spptr)->csrmat;
1556   PetscFunctionReturn(PETSC_SUCCESS);
1557 }
1558 
1559 PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithKokkosCsrMatrix(MPI_Comm comm, KokkosCsrMatrix csr, Mat *A)
1560 {
1561   Mat_SeqAIJKokkos *akok;
1562 
1563   PetscFunctionBegin;
1564   PetscCallCXX(akok = new Mat_SeqAIJKokkos(csr));
1565   PetscCall(MatCreate(comm, A));
1566   PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok));
1567   PetscFunctionReturn(PETSC_SUCCESS);
1568 }
1569 
1570 /* Crete a SEQAIJKOKKOS matrix with a Mat_SeqAIJKokkos data structure
1571 
1572    Note we have names like MatSeqAIJSetPreallocationCSR, so I use capitalized CSR
1573  */
1574 PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithCSRMatrix(MPI_Comm comm, Mat_SeqAIJKokkos *akok, Mat *A)
1575 {
1576   PetscFunctionBegin;
1577   PetscCall(MatCreate(comm, A));
1578   PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok));
1579   PetscFunctionReturn(PETSC_SUCCESS);
1580 }
1581 
1582 /*@C
1583   MatCreateSeqAIJKokkos - Creates a sparse matrix in `MATSEQAIJKOKKOS` (compressed row) format
1584   (the default parallel PETSc format). This matrix will ultimately be handled by
1585   Kokkos for calculations.
1586 
1587   Collective
1588 
1589   Input Parameters:
1590 + comm - MPI communicator, set to `PETSC_COMM_SELF`
1591 . m    - number of rows
1592 . n    - number of columns
1593 . nz   - number of nonzeros per row (same for all rows), ignored if `nnz` is provided
1594 - nnz  - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL`
1595 
1596   Output Parameter:
1597 . A - the matrix
1598 
1599   Level: intermediate
1600 
1601   Notes:
1602   It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
1603   MatXXXXSetPreallocation() paradgm instead of this routine directly.
1604   [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]
1605 
1606   The AIJ format, also called
1607   compressed row storage, is fully compatible with standard Fortran
1608   storage.  That is, the stored row and column indices can begin at
1609   either one (as in Fortran) or zero.
1610 
1611   Specify the preallocated storage with either `nz` or `nnz` (not both).
1612   Set `nz` = `PETSC_DEFAULT` and `nnz` = `NULL` for PETSc to control dynamic memory
1613   allocation.
1614 
1615 .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`
1616 @*/
1617 PetscErrorCode MatCreateSeqAIJKokkos(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A)
1618 {
1619   PetscFunctionBegin;
1620   PetscCall(PetscKokkosInitializeCheck());
1621   PetscCall(MatCreate(comm, A));
1622   PetscCall(MatSetSizes(*A, m, n, m, n));
1623   PetscCall(MatSetType(*A, MATSEQAIJKOKKOS));
1624   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz));
1625   PetscFunctionReturn(PETSC_SUCCESS);
1626 }
1627 
1628 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
1629 {
1630   PetscFunctionBegin;
1631   PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
1632   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos;
1633   PetscFunctionReturn(PETSC_SUCCESS);
1634 }
1635 
1636 static PetscErrorCode MatSeqAIJKokkosSymbolicSolveCheck(Mat A)
1637 {
1638   Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1639 
1640   PetscFunctionBegin;
1641   if (!factors->sptrsv_symbolic_completed) {
1642     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khU, factors->iU_d, factors->jU_d, factors->aU_d);
1643     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khL, factors->iL_d, factors->jL_d, factors->aL_d);
1644     factors->sptrsv_symbolic_completed = PETSC_TRUE;
1645   }
1646   PetscFunctionReturn(PETSC_SUCCESS);
1647 }
1648 
1649 /* Check if we need to update factors etc for transpose solve */
1650 static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A)
1651 {
1652   Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1653   MatColIdxType               n       = A->rmap->n;
1654 
1655   PetscFunctionBegin;
1656   if (!factors->transpose_updated) { /* TODO: KK needs to provide functions to do numeric transpose only */
1657     /* Update L^T and do sptrsv symbolic */
1658     factors->iLt_d = MatRowMapKokkosView("factors->iLt_d", n + 1); // KK requires 0
1659     factors->jLt_d = MatColIdxKokkosView(NoInit("factors->jLt_d"), factors->jL_d.extent(0));
1660     factors->aLt_d = MatScalarKokkosView(NoInit("factors->aLt_d"), factors->aL_d.extent(0));
1661 
1662     transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iL_d, factors->jL_d, factors->aL_d,
1663                                                                                                                                                                                                               factors->iLt_d, factors->jLt_d, factors->aLt_d);
1664 
1665     /* TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices.
1666       We have to sort the indices, until KK provides finer control options.
1667     */
1668     sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iLt_d, factors->jLt_d, factors->aLt_d);
1669 
1670     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d);
1671 
1672     /* Update U^T and do sptrsv symbolic */
1673     factors->iUt_d = MatRowMapKokkosView("factors->iUt_d", n + 1); // KK requires 0
1674     factors->jUt_d = MatColIdxKokkosView(NoInit("factors->jUt_d"), factors->jU_d.extent(0));
1675     factors->aUt_d = MatScalarKokkosView(NoInit("factors->aUt_d"), factors->aU_d.extent(0));
1676 
1677     transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iU_d, factors->jU_d, factors->aU_d,
1678                                                                                                                                                                                                               factors->iUt_d, factors->jUt_d, factors->aUt_d);
1679 
1680     /* Sort indices. See comments above */
1681     sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iUt_d, factors->jUt_d, factors->aUt_d);
1682 
1683     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d);
1684     factors->transpose_updated = PETSC_TRUE;
1685   }
1686   PetscFunctionReturn(PETSC_SUCCESS);
1687 }
1688 
1689 /* Solve Ax = b, with A = LU */
1690 static PetscErrorCode MatSolve_SeqAIJKokkos(Mat A, Vec b, Vec x)
1691 {
1692   ConstPetscScalarKokkosView  bv;
1693   PetscScalarKokkosView       xv;
1694   Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1695 
1696   PetscFunctionBegin;
1697   PetscCall(PetscLogGpuTimeBegin());
1698   PetscCall(MatSeqAIJKokkosSymbolicSolveCheck(A));
1699   PetscCall(VecGetKokkosView(b, &bv));
1700   PetscCall(VecGetKokkosViewWrite(x, &xv));
1701   /* Solve L tmpv = b */
1702   PetscCallCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khL, factors->iL_d, factors->jL_d, factors->aL_d, bv, factors->workVector));
1703   /* Solve Ux = tmpv */
1704   PetscCallCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khU, factors->iU_d, factors->jU_d, factors->aU_d, factors->workVector, xv));
1705   PetscCall(VecRestoreKokkosView(b, &bv));
1706   PetscCall(VecRestoreKokkosViewWrite(x, &xv));
1707   PetscCall(PetscLogGpuTimeEnd());
1708   PetscFunctionReturn(PETSC_SUCCESS);
1709 }
1710 
1711 /* Solve A^T x = b, where A^T = U^T L^T */
1712 static PetscErrorCode MatSolveTranspose_SeqAIJKokkos(Mat A, Vec b, Vec x)
1713 {
1714   ConstPetscScalarKokkosView  bv;
1715   PetscScalarKokkosView       xv;
1716   Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1717 
1718   PetscFunctionBegin;
1719   PetscCall(PetscLogGpuTimeBegin());
1720   PetscCall(MatSeqAIJKokkosTransposeSolveCheck(A));
1721   PetscCall(VecGetKokkosView(b, &bv));
1722   PetscCall(VecGetKokkosViewWrite(x, &xv));
1723   /* Solve U^T tmpv = b */
1724   KokkosSparse::Experimental::sptrsv_solve(&factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d, bv, factors->workVector);
1725 
1726   /* Solve L^T x = tmpv */
1727   KokkosSparse::Experimental::sptrsv_solve(&factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d, factors->workVector, xv);
1728   PetscCall(VecRestoreKokkosView(b, &bv));
1729   PetscCall(VecRestoreKokkosViewWrite(x, &xv));
1730   PetscCall(PetscLogGpuTimeEnd());
1731   PetscFunctionReturn(PETSC_SUCCESS);
1732 }
1733 
1734 static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info)
1735 {
1736   Mat_SeqAIJKokkos           *aijkok   = (Mat_SeqAIJKokkos *)A->spptr;
1737   Mat_SeqAIJKokkosTriFactors *factors  = (Mat_SeqAIJKokkosTriFactors *)B->spptr;
1738   PetscInt                    fill_lev = info->levels;
1739 
1740   PetscFunctionBegin;
1741   PetscCall(PetscLogGpuTimeBegin());
1742   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1743 
1744   auto a_d = aijkok->a_dual.view_device();
1745   auto i_d = aijkok->i_dual.view_device();
1746   auto j_d = aijkok->j_dual.view_device();
1747 
1748   KokkosSparse::Experimental::spiluk_numeric(&factors->kh, fill_lev, i_d, j_d, a_d, factors->iL_d, factors->jL_d, factors->aL_d, factors->iU_d, factors->jU_d, factors->aU_d);
1749 
1750   B->assembled              = PETSC_TRUE;
1751   B->preallocated           = PETSC_TRUE;
1752   B->ops->solve             = MatSolve_SeqAIJKokkos;
1753   B->ops->solvetranspose    = MatSolveTranspose_SeqAIJKokkos;
1754   B->ops->matsolve          = NULL;
1755   B->ops->matsolvetranspose = NULL;
1756   B->offloadmask            = PETSC_OFFLOAD_GPU;
1757 
1758   /* Once the factors' value changed, we need to update their transpose and sptrsv handle */
1759   factors->transpose_updated         = PETSC_FALSE;
1760   factors->sptrsv_symbolic_completed = PETSC_FALSE;
1761   /* TODO: log flops, but how to know that? */
1762   PetscCall(PetscLogGpuTimeEnd());
1763   PetscFunctionReturn(PETSC_SUCCESS);
1764 }
1765 
1766 static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
1767 {
1768   Mat_SeqAIJKokkos           *aijkok;
1769   Mat_SeqAIJ                 *b;
1770   Mat_SeqAIJKokkosTriFactors *factors  = (Mat_SeqAIJKokkosTriFactors *)B->spptr;
1771   PetscInt                    fill_lev = info->levels;
1772   PetscInt                    nnzA     = ((Mat_SeqAIJ *)A->data)->nz, nnzL, nnzU;
1773   PetscInt                    n        = A->rmap->n;
1774 
1775   PetscFunctionBegin;
1776   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1777   /* Rebuild factors */
1778   if (factors) {
1779     factors->Destroy();
1780   } /* Destroy the old if it exists */
1781   else {
1782     B->spptr = factors = new Mat_SeqAIJKokkosTriFactors(n);
1783   }
1784 
1785   /* Create a spiluk handle and then do symbolic factorization */
1786   nnzL = nnzU = PetscRealIntMultTruncate(info->fill, nnzA);
1787   factors->kh.create_spiluk_handle(KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1, n, nnzL, nnzU);
1788 
1789   auto spiluk_handle = factors->kh.get_spiluk_handle();
1790 
1791   Kokkos::realloc(factors->iL_d, n + 1); /* Free old arrays and realloc */
1792   Kokkos::realloc(factors->jL_d, spiluk_handle->get_nnzL());
1793   Kokkos::realloc(factors->iU_d, n + 1);
1794   Kokkos::realloc(factors->jU_d, spiluk_handle->get_nnzU());
1795 
1796   aijkok   = (Mat_SeqAIJKokkos *)A->spptr;
1797   auto i_d = aijkok->i_dual.view_device();
1798   auto j_d = aijkok->j_dual.view_device();
1799   KokkosSparse::Experimental::spiluk_symbolic(&factors->kh, fill_lev, i_d, j_d, factors->iL_d, factors->jL_d, factors->iU_d, factors->jU_d);
1800   /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */
1801 
1802   Kokkos::resize(factors->jL_d, spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */
1803   Kokkos::resize(factors->jU_d, spiluk_handle->get_nnzU());
1804   Kokkos::realloc(factors->aL_d, spiluk_handle->get_nnzL()); /* No need to retain old value */
1805   Kokkos::realloc(factors->aU_d, spiluk_handle->get_nnzU());
1806 
1807   /* TODO: add options to select sptrsv algorithms */
1808   /* Create sptrsv handles for L, U and their transpose */
1809 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
1810   auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SPTRSV_CUSPARSE;
1811 #else
1812   auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1;
1813 #endif
1814 
1815   factors->khL.create_sptrsv_handle(sptrsv_alg, n, true /* L is lower tri */);
1816   factors->khU.create_sptrsv_handle(sptrsv_alg, n, false /* U is not lower tri */);
1817   factors->khLt.create_sptrsv_handle(sptrsv_alg, n, false /* L^T is not lower tri */);
1818   factors->khUt.create_sptrsv_handle(sptrsv_alg, n, true /* U^T is lower tri */);
1819 
1820   /* Fill fields of the factor matrix B */
1821   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(B, MAT_SKIP_ALLOCATION, NULL));
1822   b     = (Mat_SeqAIJ *)B->data;
1823   b->nz = b->maxnz          = spiluk_handle->get_nnzL() + spiluk_handle->get_nnzU();
1824   B->info.fill_ratio_given  = info->fill;
1825   B->info.fill_ratio_needed = nnzA > 0 ? ((PetscReal)b->nz) / ((PetscReal)nnzA) : 1.0;
1826 
1827   B->offloadmask          = PETSC_OFFLOAD_GPU;
1828   B->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJKokkos;
1829   PetscFunctionReturn(PETSC_SUCCESS);
1830 }
1831 
1832 static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos(Mat A, MatSolverType *type)
1833 {
1834   PetscFunctionBegin;
1835   *type = MATSOLVERKOKKOS;
1836   PetscFunctionReturn(PETSC_SUCCESS);
1837 }
1838 
1839 /*MC
1840   MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices
1841   on a single GPU of type, `MATSEQAIJKOKKOS`, `MATAIJKOKKOS`.
1842 
1843   Level: beginner
1844 
1845 .seealso: [](ch_matrices), `Mat`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJKokkos()`, `MATAIJKOKKOS`, `MatKokkosSetFormat()`, `MatKokkosStorageFormat`, `MatKokkosFormatOperation`
1846 M*/
1847 PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A, MatFactorType ftype, Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */
1848 {
1849   PetscInt n = A->rmap->n;
1850 
1851   PetscFunctionBegin;
1852   PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B));
1853   PetscCall(MatSetSizes(*B, n, n, n, n));
1854   (*B)->factortype = ftype;
1855   PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU]));
1856   PetscCall(MatSetType(*B, MATSEQAIJKOKKOS));
1857 
1858   if (ftype == MAT_FACTOR_LU) {
1859     PetscCall(MatSetBlockSizesFromMats(*B, A, A));
1860     (*B)->canuseordering        = PETSC_TRUE;
1861     (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKokkos;
1862   } else if (ftype == MAT_FACTOR_ILU) {
1863     PetscCall(MatSetBlockSizesFromMats(*B, A, A));
1864     (*B)->canuseordering         = PETSC_FALSE;
1865     (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos;
1866   } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]);
1867 
1868   PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL));
1869   PetscCall(PetscObjectComposeFunction((PetscObject)*B, "MatFactorGetSolverType_C", MatFactorGetSolverType_SeqAIJKokkos));
1870   PetscFunctionReturn(PETSC_SUCCESS);
1871 }
1872 
1873 PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_KOKKOS(void)
1874 {
1875   PetscFunctionBegin;
1876   PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_LU, MatGetFactor_SeqAIJKokkos_Kokkos));
1877   PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_ILU, MatGetFactor_SeqAIJKokkos_Kokkos));
1878   PetscFunctionReturn(PETSC_SUCCESS);
1879 }
1880 
1881 /* Utility to print out a KokkosCsrMatrix for debugging */
1882 PETSC_INTERN PetscErrorCode PrintCsrMatrix(const KokkosCsrMatrix &csrmat)
1883 {
1884   const auto        &iv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), csrmat.graph.row_map);
1885   const auto        &jv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), csrmat.graph.entries);
1886   const auto        &av = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), csrmat.values);
1887   const PetscInt    *i  = iv.data();
1888   const PetscInt    *j  = jv.data();
1889   const PetscScalar *a  = av.data();
1890   PetscInt           m = csrmat.numRows(), n = csrmat.numCols(), nnz = csrmat.nnz();
1891 
1892   PetscFunctionBegin;
1893   PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT " x %" PetscInt_FMT " SeqAIJKokkos, with %" PetscInt_FMT " nonzeros\n", m, n, nnz));
1894   for (PetscInt k = 0; k < m; k++) {
1895     PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT ": ", k));
1896     for (PetscInt p = i[k]; p < i[k + 1]; p++) PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT "(%.1f), ", j[p], (double)PetscRealPart(a[p])));
1897     PetscCall(PetscPrintf(PETSC_COMM_SELF, "\n"));
1898   }
1899   PetscFunctionReturn(PETSC_SUCCESS);
1900 }
1901