xref: /petsc/src/mat/impls/aij/seq/kokkos/aijkok.kokkos.cxx (revision f13dfd9ea68e0ddeee984e65c377a1819eab8a8a)
1 #include <petsc_kokkos.hpp>
2 #include <petscvec_kokkos.hpp>
3 #include <petscpkg_version.h>
4 #include <petsc/private/petscimpl.h>
5 #include <petsc/private/sfimpl.h>
6 #include <petscsystypes.h>
7 #include <petscerror.h>
8 
9 #include <Kokkos_Core.hpp>
10 #include <KokkosBlas.hpp>
11 #include <KokkosSparse_CrsMatrix.hpp>
12 #include <KokkosSparse_spmv.hpp>
13 #include <KokkosSparse_spiluk.hpp>
14 #include <KokkosSparse_sptrsv.hpp>
15 #include <KokkosSparse_spgemm.hpp>
16 #include <KokkosSparse_spadd.hpp>
17 #include <KokkosBatched_LU_Decl.hpp>
18 #include <KokkosBatched_InverseLU_Decl.hpp>
19 
20 #include <../src/mat/impls/aij/seq/kokkos/aijkok.hpp>
21 
22 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_GE(3, 7, 0)
23   #include <KokkosSparse_Utils.hpp>
24 using KokkosSparse::sort_crs_matrix;
25 using KokkosSparse::Impl::transpose_matrix;
26 #else
27   #include <KokkosKernels_Sorting.hpp>
28 using KokkosKernels::sort_crs_matrix;
29 using KokkosKernels::Impl::transpose_matrix;
30 #endif
31 
32 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */
33 
34 /* MatAssemblyEnd_SeqAIJKokkos() happens when we finalized nonzeros of the matrix, either after
35    we assembled the matrix on host, or after we directly produced the matrix data on device (ex., through MatMatMult).
36    In the latter case, it is important to set a_dual's sync state correctly.
37  */
38 static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A, MatAssemblyType mode)
39 {
40   Mat_SeqAIJ       *aijseq;
41   Mat_SeqAIJKokkos *aijkok;
42 
43   PetscFunctionBegin;
44   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS);
45   PetscCall(MatAssemblyEnd_SeqAIJ(A, mode));
46 
47   aijseq = static_cast<Mat_SeqAIJ *>(A->data);
48   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
49 
50   /* If aijkok does not exist, we just copy i, j to device.
51      If aijkok already exists, but the device's nonzero pattern does not match with the host's, we assume the latest data is on host.
52      In both cases, we build a new aijkok structure.
53   */
54   if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { /* aijkok might not exist yet or nonzero pattern has changed */
55     delete aijkok;
56     aijkok   = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aijseq, A->nonzerostate, PETSC_FALSE /*don't copy mat values to device*/);
57     A->spptr = aijkok;
58   } else if (A->rmap->n && aijkok->diag_dual.extent(0) == 0) { // MatProduct might directly produce AIJ on device, but not the diag.
59     MatRowMapKokkosViewHost diag_h(aijseq->diag, A->rmap->n);
60     auto                    diag_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), diag_h);
61     aijkok->diag_dual              = MatRowMapKokkosDualView(diag_d, diag_h);
62   }
63   PetscFunctionReturn(PETSC_SUCCESS);
64 }
65 
66 /* Sync CSR data to device if not yet */
67 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A)
68 {
69   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
70 
71   PetscFunctionBegin;
72   PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Can't sync factorized matrix from host to device");
73   PetscCheck(aijkok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
74   if (aijkok->a_dual.need_sync_device()) {
75     aijkok->a_dual.sync_device();
76     aijkok->transpose_updated = PETSC_FALSE; /* values of the transpose is out-of-date */
77     aijkok->hermitian_updated = PETSC_FALSE;
78   }
79   PetscFunctionReturn(PETSC_SUCCESS);
80 }
81 
82 /* Mark the CSR data on device as modified */
83 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosModifyDevice(Mat A)
84 {
85   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
86 
87   PetscFunctionBegin;
88   PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Not supported for factorized matries");
89   aijkok->a_dual.clear_sync_state();
90   aijkok->a_dual.modify_device();
91   aijkok->transpose_updated = PETSC_FALSE;
92   aijkok->hermitian_updated = PETSC_FALSE;
93   PetscCall(MatSeqAIJInvalidateDiagonal(A));
94   PetscCall(PetscObjectStateIncrease((PetscObject)A));
95   PetscFunctionReturn(PETSC_SUCCESS);
96 }
97 
98 static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A)
99 {
100   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
101   auto             &exec   = PetscGetKokkosExecutionSpace();
102 
103   PetscFunctionBegin;
104   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
105   /* We do not expect one needs factors on host  */
106   PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Can't sync factorized matrix from device to host");
107   PetscCheck(aijkok, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing AIJKOK");
108   PetscCallCXX(aijkok->a_dual.sync_host(exec));
109   PetscCallCXX(exec.fence());
110   PetscFunctionReturn(PETSC_SUCCESS);
111 }
112 
113 static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A, PetscScalar *array[])
114 {
115   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
116 
117   PetscFunctionBegin;
118   /* aijkok contains valid pointers only if the host's nonzerostate matches with the device's.
119     Calling MatSeqAIJSetPreallocation() or MatSetValues() on host, where aijseq->{i,j,a} might be
120     reallocated, will lead to stale {i,j,a}_dual in aijkok. In both operations, the hosts's nonzerostate
121     must have been updated. The stale aijkok will be rebuilt during MatAssemblyEnd.
122   */
123   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
124     auto &exec = PetscGetKokkosExecutionSpace();
125     PetscCallCXX(aijkok->a_dual.sync_host(exec));
126     PetscCallCXX(exec.fence());
127     *array = aijkok->a_dual.view_host().data();
128   } else { /* Happens when calling MatSetValues on a newly created matrix */
129     *array = static_cast<Mat_SeqAIJ *>(A->data)->a;
130   }
131   PetscFunctionReturn(PETSC_SUCCESS);
132 }
133 
134 static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJKokkos(Mat A, PetscScalar *array[])
135 {
136   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
137 
138   PetscFunctionBegin;
139   if (aijkok && A->nonzerostate == aijkok->nonzerostate) aijkok->a_dual.modify_host();
140   PetscFunctionReturn(PETSC_SUCCESS);
141 }
142 
143 static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[])
144 {
145   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
146 
147   PetscFunctionBegin;
148   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
149     auto &exec = PetscGetKokkosExecutionSpace();
150     PetscCallCXX(aijkok->a_dual.sync_host(exec));
151     PetscCallCXX(exec.fence());
152     *array = aijkok->a_dual.view_host().data();
153   } else {
154     *array = static_cast<Mat_SeqAIJ *>(A->data)->a;
155   }
156   PetscFunctionReturn(PETSC_SUCCESS);
157 }
158 
159 static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[])
160 {
161   PetscFunctionBegin;
162   *array = NULL;
163   PetscFunctionReturn(PETSC_SUCCESS);
164 }
165 
166 static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[])
167 {
168   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
169 
170   PetscFunctionBegin;
171   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
172     *array = aijkok->a_dual.view_host().data();
173   } else { /* Ex. happens with MatZeroEntries on a preallocated but not assembled matrix */
174     *array = static_cast<Mat_SeqAIJ *>(A->data)->a;
175   }
176   PetscFunctionReturn(PETSC_SUCCESS);
177 }
178 
179 static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[])
180 {
181   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
182 
183   PetscFunctionBegin;
184   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
185     aijkok->a_dual.clear_sync_state();
186     aijkok->a_dual.modify_host();
187   }
188   PetscFunctionReturn(PETSC_SUCCESS);
189 }
190 
191 static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJKokkos(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype)
192 {
193   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
194 
195   PetscFunctionBegin;
196   PetscCheck(aijkok != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "aijkok is NULL");
197 
198   if (i) *i = aijkok->i_device_data();
199   if (j) *j = aijkok->j_device_data();
200   if (a) {
201     aijkok->a_dual.sync_device();
202     *a = aijkok->a_device_data();
203   }
204   if (mtype) *mtype = PETSC_MEMTYPE_KOKKOS;
205   PetscFunctionReturn(PETSC_SUCCESS);
206 }
207 
208 /*
209   Generate the sparsity pattern of a MatSeqAIJKokkos matrix's transpose on device.
210 
211   Input Parameter:
212 .  A       - the MATSEQAIJKOKKOS matrix
213 
214   Output Parameters:
215 +  perm_d - the permutation array on device, which connects Ta(i) = Aa(perm(i))
216 -  T_d    - the transpose on device, whose value array is allocated but not initialized
217 */
218 static PetscErrorCode MatSeqAIJKokkosGenerateTransposeStructure(Mat A, MatRowMapKokkosView &perm_d, KokkosCsrMatrix &T_d)
219 {
220   Mat_SeqAIJ             *aseq = static_cast<Mat_SeqAIJ *>(A->data);
221   PetscInt                nz = aseq->nz, m = A->rmap->N, n = A->cmap->n;
222   const PetscInt         *Ai = aseq->i, *Aj = aseq->j;
223   MatRowMapKokkosViewHost Ti_h(NoInit("Ti"), n + 1);
224   MatRowMapType          *Ti = Ti_h.data();
225   MatColIdxKokkosViewHost Tj_h(NoInit("Tj"), nz);
226   MatRowMapKokkosViewHost perm_h(NoInit("permutation"), nz);
227   PetscInt               *Tj   = Tj_h.data();
228   PetscInt               *perm = perm_h.data();
229   PetscInt               *offset;
230 
231   PetscFunctionBegin;
232   // Populate Ti
233   PetscCallCXX(Kokkos::deep_copy(Ti_h, 0));
234   Ti++;
235   for (PetscInt i = 0; i < nz; i++) Ti[Aj[i]]++;
236   Ti--;
237   for (PetscInt i = 0; i < n; i++) Ti[i + 1] += Ti[i];
238 
239   // Populate Tj and the permutation array
240   PetscCall(PetscCalloc1(n, &offset)); // offset in each T row to fill in its column indices
241   for (PetscInt i = 0; i < m; i++) {
242     for (PetscInt j = Ai[i]; j < Ai[i + 1]; j++) { // A's (i,j) is T's (j,i)
243       PetscInt r    = Aj[j];                       // row r of T
244       PetscInt disp = Ti[r] + offset[r];
245 
246       Tj[disp]   = i; // col i of T
247       perm[disp] = j;
248       offset[r]++;
249     }
250   }
251   PetscCall(PetscFree(offset));
252 
253   // Sort each row of T, along with the permutation array
254   for (PetscInt i = 0; i < n; i++) PetscCall(PetscSortIntWithArray(Ti[i + 1] - Ti[i], Tj + Ti[i], perm + Ti[i]));
255 
256   // Output perm and T on device
257   auto Ti_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), Ti_h);
258   auto Tj_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), Tj_h);
259   PetscCallCXX(T_d = KokkosCsrMatrix("csrmatT", n, m, nz, MatScalarKokkosView("Ta", nz), Ti_d, Tj_d));
260   PetscCallCXX(perm_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), perm_h));
261   PetscFunctionReturn(PETSC_SUCCESS);
262 }
263 
264 // Generate the transpose on device and cache it internally
265 // Note: KK transpose_matrix() does not have support symbolic/numeric transpose, so we do it on our own
266 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGenerateTranspose_Private(Mat A, KokkosCsrMatrix *csrmatT)
267 {
268   Mat_SeqAIJ       *aseq = static_cast<Mat_SeqAIJ *>(A->data);
269   Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
270   PetscInt          nz = aseq->nz, m = A->rmap->N, n = A->cmap->n;
271   KokkosCsrMatrix  &T = akok->csrmatT;
272 
273   PetscFunctionBegin;
274   PetscCheck(akok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
275   PetscCallCXX(akok->a_dual.sync_device()); // Sync A's values since we are going to access them on device
276 
277   const auto &Aa = akok->a_dual.view_device();
278 
279   if (A->symmetric == PETSC_BOOL3_TRUE) {
280     *csrmatT = akok->csrmat;
281   } else {
282     // See if we already have a cached transpose and its value is up to date
283     if (T.numRows() == n && T.numCols() == m) {  // this indicates csrmatT had been generated before, otherwise T has 0 rows/cols after construction
284       if (!akok->transpose_updated) {            // if the value is out of date, update the cached version
285         const auto &perm = akok->transpose_perm; // get the permutation array
286         auto       &Ta   = T.values;
287 
288         PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { Ta(i) = Aa(perm(i)); }));
289       }
290     } else { // Generate T of size n x m for the first time
291       MatRowMapKokkosView perm;
292 
293       PetscCall(MatSeqAIJKokkosGenerateTransposeStructure(A, perm, T));
294       akok->transpose_perm = perm; // cache the perm in this matrix for reuse
295       PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { T.values(i) = Aa(perm(i)); }));
296     }
297     akok->transpose_updated = PETSC_TRUE;
298     *csrmatT                = akok->csrmatT;
299   }
300   PetscFunctionReturn(PETSC_SUCCESS);
301 }
302 
303 // Generate the Hermitian on device and cache it internally
304 static PetscErrorCode MatSeqAIJKokkosGenerateHermitian_Private(Mat A, KokkosCsrMatrix *csrmatH)
305 {
306   Mat_SeqAIJ       *aseq = static_cast<Mat_SeqAIJ *>(A->data);
307   Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
308   PetscInt          nz = aseq->nz, m = A->rmap->N, n = A->cmap->n;
309   KokkosCsrMatrix  &T = akok->csrmatH;
310 
311   PetscFunctionBegin;
312   PetscCheck(akok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
313   PetscCallCXX(akok->a_dual.sync_device()); // Sync A's values since we are going to access them on device
314 
315   const auto &Aa = akok->a_dual.view_device();
316 
317   if (A->hermitian == PETSC_BOOL3_TRUE) {
318     *csrmatH = akok->csrmat;
319   } else {
320     // See if we already have a cached hermitian and its value is up to date
321     if (T.numRows() == n && T.numCols() == m) {  // this indicates csrmatT had been generated before, otherwise T has 0 rows/cols after construction
322       if (!akok->hermitian_updated) {            // if the value is out of date, update the cached version
323         const auto &perm = akok->transpose_perm; // get the permutation array
324         auto       &Ta   = T.values;
325 
326         PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { Ta(i) = PetscConj(Aa(perm(i))); }));
327       }
328     } else { // Generate T of size n x m for the first time
329       MatRowMapKokkosView perm;
330 
331       PetscCall(MatSeqAIJKokkosGenerateTransposeStructure(A, perm, T));
332       akok->transpose_perm = perm; // cache the perm in this matrix for reuse
333       PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { T.values(i) = PetscConj(Aa(perm(i))); }));
334     }
335     akok->hermitian_updated = PETSC_TRUE;
336     *csrmatH                = akok->csrmatH;
337   }
338   PetscFunctionReturn(PETSC_SUCCESS);
339 }
340 
341 /* y = A x */
342 static PetscErrorCode MatMult_SeqAIJKokkos(Mat A, Vec xx, Vec yy)
343 {
344   Mat_SeqAIJKokkos          *aijkok;
345   ConstPetscScalarKokkosView xv;
346   PetscScalarKokkosView      yv;
347 
348   PetscFunctionBegin;
349   PetscCall(PetscLogGpuTimeBegin());
350   PetscCall(MatSeqAIJKokkosSyncDevice(A));
351   PetscCall(VecGetKokkosView(xx, &xv));
352   PetscCall(VecGetKokkosViewWrite(yy, &yv));
353   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
354   PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), "N", 1.0 /*alpha*/, aijkok->csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A x + beta y */
355   PetscCall(VecRestoreKokkosView(xx, &xv));
356   PetscCall(VecRestoreKokkosViewWrite(yy, &yv));
357   /* 2.0*nnz - numRows seems more accurate here but assumes there are no zero-rows. So a little sloppy here. */
358   PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz()));
359   PetscCall(PetscLogGpuTimeEnd());
360   PetscFunctionReturn(PETSC_SUCCESS);
361 }
362 
363 /* y = A^T x */
364 static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy)
365 {
366   Mat_SeqAIJKokkos          *aijkok;
367   const char                *mode;
368   ConstPetscScalarKokkosView xv;
369   PetscScalarKokkosView      yv;
370   KokkosCsrMatrix            csrmat;
371 
372   PetscFunctionBegin;
373   PetscCall(PetscLogGpuTimeBegin());
374   PetscCall(MatSeqAIJKokkosSyncDevice(A));
375   PetscCall(VecGetKokkosView(xx, &xv));
376   PetscCall(VecGetKokkosViewWrite(yy, &yv));
377   if (A->form_explicit_transpose) {
378     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat));
379     mode = "N";
380   } else {
381     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
382     csrmat = aijkok->csrmat;
383     mode   = "T";
384   }
385   PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^T x + beta y */
386   PetscCall(VecRestoreKokkosView(xx, &xv));
387   PetscCall(VecRestoreKokkosViewWrite(yy, &yv));
388   PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
389   PetscCall(PetscLogGpuTimeEnd());
390   PetscFunctionReturn(PETSC_SUCCESS);
391 }
392 
393 /* y = A^H x */
394 static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy)
395 {
396   Mat_SeqAIJKokkos          *aijkok;
397   const char                *mode;
398   ConstPetscScalarKokkosView xv;
399   PetscScalarKokkosView      yv;
400   KokkosCsrMatrix            csrmat;
401 
402   PetscFunctionBegin;
403   PetscCall(PetscLogGpuTimeBegin());
404   PetscCall(MatSeqAIJKokkosSyncDevice(A));
405   PetscCall(VecGetKokkosView(xx, &xv));
406   PetscCall(VecGetKokkosViewWrite(yy, &yv));
407   if (A->form_explicit_transpose) {
408     PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat));
409     mode = "N";
410   } else {
411     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
412     csrmat = aijkok->csrmat;
413     mode   = "C";
414   }
415   PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^H x + beta y */
416   PetscCall(VecRestoreKokkosView(xx, &xv));
417   PetscCall(VecRestoreKokkosViewWrite(yy, &yv));
418   PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
419   PetscCall(PetscLogGpuTimeEnd());
420   PetscFunctionReturn(PETSC_SUCCESS);
421 }
422 
423 /* z = A x + y */
424 static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz)
425 {
426   Mat_SeqAIJKokkos          *aijkok;
427   ConstPetscScalarKokkosView xv, yv;
428   PetscScalarKokkosView      zv;
429 
430   PetscFunctionBegin;
431   PetscCall(PetscLogGpuTimeBegin());
432   PetscCall(MatSeqAIJKokkosSyncDevice(A));
433   PetscCall(VecGetKokkosView(xx, &xv));
434   PetscCall(VecGetKokkosView(yy, &yv));
435   PetscCall(VecGetKokkosViewWrite(zz, &zv));
436   if (zz != yy) Kokkos::deep_copy(zv, yv);
437   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
438   PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), "N", 1.0 /*alpha*/, aijkok->csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A x + beta z */
439   PetscCall(VecRestoreKokkosView(xx, &xv));
440   PetscCall(VecRestoreKokkosView(yy, &yv));
441   PetscCall(VecRestoreKokkosViewWrite(zz, &zv));
442   PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz()));
443   PetscCall(PetscLogGpuTimeEnd());
444   PetscFunctionReturn(PETSC_SUCCESS);
445 }
446 
447 /* z = A^T x + y */
448 static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz)
449 {
450   Mat_SeqAIJKokkos          *aijkok;
451   const char                *mode;
452   ConstPetscScalarKokkosView xv, yv;
453   PetscScalarKokkosView      zv;
454   KokkosCsrMatrix            csrmat;
455 
456   PetscFunctionBegin;
457   PetscCall(PetscLogGpuTimeBegin());
458   PetscCall(MatSeqAIJKokkosSyncDevice(A));
459   PetscCall(VecGetKokkosView(xx, &xv));
460   PetscCall(VecGetKokkosView(yy, &yv));
461   PetscCall(VecGetKokkosViewWrite(zz, &zv));
462   if (zz != yy) Kokkos::deep_copy(zv, yv);
463   if (A->form_explicit_transpose) {
464     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat));
465     mode = "N";
466   } else {
467     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
468     csrmat = aijkok->csrmat;
469     mode   = "T";
470   }
471   PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^T x + beta z */
472   PetscCall(VecRestoreKokkosView(xx, &xv));
473   PetscCall(VecRestoreKokkosView(yy, &yv));
474   PetscCall(VecRestoreKokkosViewWrite(zz, &zv));
475   PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
476   PetscCall(PetscLogGpuTimeEnd());
477   PetscFunctionReturn(PETSC_SUCCESS);
478 }
479 
480 /* z = A^H x + y */
481 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz)
482 {
483   Mat_SeqAIJKokkos          *aijkok;
484   const char                *mode;
485   ConstPetscScalarKokkosView xv, yv;
486   PetscScalarKokkosView      zv;
487   KokkosCsrMatrix            csrmat;
488 
489   PetscFunctionBegin;
490   PetscCall(PetscLogGpuTimeBegin());
491   PetscCall(MatSeqAIJKokkosSyncDevice(A));
492   PetscCall(VecGetKokkosView(xx, &xv));
493   PetscCall(VecGetKokkosView(yy, &yv));
494   PetscCall(VecGetKokkosViewWrite(zz, &zv));
495   if (zz != yy) Kokkos::deep_copy(zv, yv);
496   if (A->form_explicit_transpose) {
497     PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat));
498     mode = "N";
499   } else {
500     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
501     csrmat = aijkok->csrmat;
502     mode   = "C";
503   }
504   PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^H x + beta z */
505   PetscCall(VecRestoreKokkosView(xx, &xv));
506   PetscCall(VecRestoreKokkosView(yy, &yv));
507   PetscCall(VecRestoreKokkosViewWrite(zz, &zv));
508   PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
509   PetscCall(PetscLogGpuTimeEnd());
510   PetscFunctionReturn(PETSC_SUCCESS);
511 }
512 
513 static PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A, MatOption op, PetscBool flg)
514 {
515   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
516 
517   PetscFunctionBegin;
518   switch (op) {
519   case MAT_FORM_EXPLICIT_TRANSPOSE:
520     /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
521     if (A->form_explicit_transpose && !flg && aijkok) PetscCall(aijkok->DestroyMatTranspose());
522     A->form_explicit_transpose = flg;
523     break;
524   default:
525     PetscCall(MatSetOption_SeqAIJ(A, op, flg));
526     break;
527   }
528   PetscFunctionReturn(PETSC_SUCCESS);
529 }
530 
531 /* Depending on reuse, either build a new mat, or use the existing mat */
532 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat *newmat)
533 {
534   Mat_SeqAIJ *aseq;
535 
536   PetscFunctionBegin;
537   PetscCall(PetscKokkosInitializeCheck());
538   if (reuse == MAT_INITIAL_MATRIX) {                      /* Build a brand new mat */
539     PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat));  /* the returned newmat is a SeqAIJKokkos */
540   } else if (reuse == MAT_REUSE_MATRIX) {                 /* Reuse the mat created before */
541     PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN)); /* newmat is already a SeqAIJKokkos */
542   } else if (reuse == MAT_INPLACE_MATRIX) {               /* newmat is A */
543     PetscCheck(A == *newmat, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "A != *newmat with MAT_INPLACE_MATRIX");
544     PetscCall(PetscFree(A->defaultvectype));
545     PetscCall(PetscStrallocpy(VECKOKKOS, &A->defaultvectype)); /* Allocate and copy the string */
546     PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATSEQAIJKOKKOS));
547     PetscCall(MatSetOps_SeqAIJKokkos(A));
548     aseq = static_cast<Mat_SeqAIJ *>(A->data);
549     if (A->assembled) { /* Copy i, j (but not values) to device for an assembled matrix if not yet */
550       PetscCheck(!A->spptr, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Expect NULL (Mat_SeqAIJKokkos*)A->spptr");
551       A->spptr = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aseq, A->nonzerostate, PETSC_FALSE);
552     }
553   }
554   PetscFunctionReturn(PETSC_SUCCESS);
555 }
556 
557 /* MatDuplicate always creates a new matrix. MatDuplicate can be called either on an assembled matrix or
558    an unassembled matrix, even though MAT_COPY_VALUES is not allowed for unassembled matrix.
559  */
560 static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A, MatDuplicateOption dupOption, Mat *B)
561 {
562   Mat_SeqAIJ       *bseq;
563   Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr), *bkok;
564   Mat               mat;
565 
566   PetscFunctionBegin;
567   /* Do not copy values on host as A's latest values might be on device. We don't want to do sync blindly */
568   PetscCall(MatDuplicate_SeqAIJ(A, MAT_DO_NOT_COPY_VALUES, B));
569   mat = *B;
570   if (A->assembled) {
571     bseq = static_cast<Mat_SeqAIJ *>(mat->data);
572     bkok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, bseq, mat->nonzerostate, PETSC_FALSE);
573     bkok->a_dual.clear_sync_state(); /* Clear B's sync state as it will be decided below */
574     /* Now copy values to B if needed */
575     if (dupOption == MAT_COPY_VALUES) {
576       if (akok->a_dual.need_sync_device()) {
577         Kokkos::deep_copy(bkok->a_dual.view_host(), akok->a_dual.view_host());
578         bkok->a_dual.modify_host();
579       } else { /* If device has the latest data, we only copy data on device */
580         Kokkos::deep_copy(bkok->a_dual.view_device(), akok->a_dual.view_device());
581         bkok->a_dual.modify_device();
582       }
583     } else { /* MAT_DO_NOT_COPY_VALUES or MAT_SHARE_NONZERO_PATTERN. B's values should be zeroed */
584       /* B's values on host should be already zeroed by MatDuplicate_SeqAIJ() */
585       bkok->a_dual.modify_host();
586     }
587     mat->spptr = bkok;
588   }
589 
590   PetscCall(PetscFree(mat->defaultvectype));
591   PetscCall(PetscStrallocpy(VECKOKKOS, &mat->defaultvectype)); /* Allocate and copy the string */
592   PetscCall(PetscObjectChangeTypeName((PetscObject)mat, MATSEQAIJKOKKOS));
593   PetscCall(MatSetOps_SeqAIJKokkos(mat));
594   PetscFunctionReturn(PETSC_SUCCESS);
595 }
596 
597 static PetscErrorCode MatTranspose_SeqAIJKokkos(Mat A, MatReuse reuse, Mat *B)
598 {
599   Mat               At;
600   KokkosCsrMatrix   internT;
601   Mat_SeqAIJKokkos *atkok, *bkok;
602 
603   PetscFunctionBegin;
604   if (reuse == MAT_REUSE_MATRIX) PetscCall(MatTransposeCheckNonzeroState_Private(A, *B));
605   PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &internT)); /* Generate a transpose internally */
606   if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_INPLACE_MATRIX) {
607     /* Deep copy internT, as we want to isolate the internal transpose */
608     PetscCallCXX(atkok = new Mat_SeqAIJKokkos(KokkosCsrMatrix("csrmat", internT)));
609     PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PetscObjectComm((PetscObject)A), atkok, &At));
610     if (reuse == MAT_INITIAL_MATRIX) *B = At;
611     else PetscCall(MatHeaderReplace(A, &At)); /* Replace A with At inplace */
612   } else {                                    /* MAT_REUSE_MATRIX, just need to copy values to B on device */
613     if ((*B)->assembled) {
614       bkok = static_cast<Mat_SeqAIJKokkos *>((*B)->spptr);
615       PetscCallCXX(Kokkos::deep_copy(bkok->a_dual.view_device(), internT.values));
616       PetscCall(MatSeqAIJKokkosModifyDevice(*B));
617     } else if ((*B)->preallocated) { /* It is ok for B to be only preallocated, as needed in MatTranspose_MPIAIJ */
618       Mat_SeqAIJ             *bseq = static_cast<Mat_SeqAIJ *>((*B)->data);
619       MatScalarKokkosViewHost a_h(bseq->a, internT.nnz()); /* bseq->nz = 0 if unassembled */
620       MatColIdxKokkosViewHost j_h(bseq->j, internT.nnz());
621       PetscCallCXX(Kokkos::deep_copy(a_h, internT.values));
622       PetscCallCXX(Kokkos::deep_copy(j_h, internT.graph.entries));
623     } else SETERRQ(PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "B must be assembled or preallocated");
624   }
625   PetscFunctionReturn(PETSC_SUCCESS);
626 }
627 
628 static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A)
629 {
630   Mat_SeqAIJKokkos *aijkok;
631 
632   PetscFunctionBegin;
633   if (A->factortype == MAT_FACTOR_NONE) {
634     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
635     delete aijkok;
636   } else {
637     delete static_cast<Mat_SeqAIJKokkosTriFactors *>(A->spptr);
638   }
639   A->spptr = NULL;
640   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatFactorGetSolverType_C", NULL));
641   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
642   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
643   PetscCall(MatDestroy_SeqAIJ(A));
644   PetscFunctionReturn(PETSC_SUCCESS);
645 }
646 
647 /*MC
648    MATSEQAIJKOKKOS - MATAIJKOKKOS = "(seq)aijkokkos" - A matrix type to be used for sparse matrices with Kokkos
649 
650    A matrix type using Kokkos-Kernels CrsMatrix type for portability across different device types
651 
652    Options Database Key:
653 .  -mat_type aijkokkos - sets the matrix type to `MATSEQAIJKOKKOS` during a call to `MatSetFromOptions()`
654 
655   Level: beginner
656 
657 .seealso: [](ch_matrices), `Mat`, `MatCreateSeqAIJKokkos()`, `MATMPIAIJKOKKOS`
658 M*/
659 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A)
660 {
661   PetscFunctionBegin;
662   PetscCall(PetscKokkosInitializeCheck());
663   PetscCall(MatCreate_SeqAIJ(A));
664   PetscCall(MatConvert_SeqAIJ_SeqAIJKokkos(A, MATSEQAIJKOKKOS, MAT_INPLACE_MATRIX, &A));
665   PetscFunctionReturn(PETSC_SUCCESS);
666 }
667 
668 /* Merge A, B into a matrix C. A is put before B. C's size would be A->rmap->n by (A->cmap->n + B->cmap->n) */
669 PetscErrorCode MatSeqAIJKokkosMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C)
670 {
671   Mat_SeqAIJ         *a, *b;
672   Mat_SeqAIJKokkos   *akok, *bkok, *ckok;
673   MatScalarKokkosView aa, ba, ca;
674   MatRowMapKokkosView ai, bi, ci;
675   MatColIdxKokkosView aj, bj, cj;
676   PetscInt            m, n, nnz, aN;
677 
678   PetscFunctionBegin;
679   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
680   PetscValidHeaderSpecific(B, MAT_CLASSID, 2);
681   PetscAssertPointer(C, 4);
682   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
683   PetscCheckTypeName(B, MATSEQAIJKOKKOS);
684   PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n);
685   PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported");
686 
687   PetscCall(MatSeqAIJKokkosSyncDevice(A));
688   PetscCall(MatSeqAIJKokkosSyncDevice(B));
689   a    = static_cast<Mat_SeqAIJ *>(A->data);
690   b    = static_cast<Mat_SeqAIJ *>(B->data);
691   akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
692   bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
693   aa   = akok->a_dual.view_device();
694   ai   = akok->i_dual.view_device();
695   ba   = bkok->a_dual.view_device();
696   bi   = bkok->i_dual.view_device();
697   m    = A->rmap->n; /* M, N and nnz of C */
698   n    = A->cmap->n + B->cmap->n;
699   nnz  = a->nz + b->nz;
700   aN   = A->cmap->n; /* N of A */
701   if (reuse == MAT_INITIAL_MATRIX) {
702     aj           = akok->j_dual.view_device();
703     bj           = bkok->j_dual.view_device();
704     auto ca_dual = MatScalarKokkosDualView("a", aa.extent(0) + ba.extent(0));
705     auto ci_dual = MatRowMapKokkosDualView("i", ai.extent(0));
706     auto cj_dual = MatColIdxKokkosDualView("j", aj.extent(0) + bj.extent(0));
707     ca           = ca_dual.view_device();
708     ci           = ci_dual.view_device();
709     cj           = cj_dual.view_device();
710 
711     /* Concatenate A and B in parallel using Kokkos hierarchical parallelism */
712     Kokkos::parallel_for(
713       Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
714         PetscInt i       = t.league_rank(); /* row i */
715         PetscInt coffset = ai(i) + bi(i), alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i);
716 
717         Kokkos::single(Kokkos::PerTeam(t), [=]() { /* this side effect only happens once per whole team */
718                                                    ci(i) = coffset;
719                                                    if (i == m - 1) ci(m) = ai(m) + bi(m);
720         });
721 
722         Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) {
723           if (k < alen) {
724             ca(coffset + k) = aa(ai(i) + k);
725             cj(coffset + k) = aj(ai(i) + k);
726           } else {
727             ca(coffset + k) = ba(bi(i) + k - alen);
728             cj(coffset + k) = bj(bi(i) + k - alen) + aN; /* Entries in B get new column indices in C */
729           }
730         });
731       });
732     ca_dual.modify_device();
733     ci_dual.modify_device();
734     cj_dual.modify_device();
735     PetscCallCXX(ckok = new Mat_SeqAIJKokkos(m, n, nnz, ci_dual, cj_dual, ca_dual));
736     PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, ckok, C));
737   } else if (reuse == MAT_REUSE_MATRIX) {
738     PetscValidHeaderSpecific(*C, MAT_CLASSID, 4);
739     PetscCheckTypeName(*C, MATSEQAIJKOKKOS);
740     ckok = static_cast<Mat_SeqAIJKokkos *>((*C)->spptr);
741     ca   = ckok->a_dual.view_device();
742     ci   = ckok->i_dual.view_device();
743 
744     Kokkos::parallel_for(
745       Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
746         PetscInt i    = t.league_rank(); /* row i */
747         PetscInt alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i);
748         Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) {
749           if (k < alen) ca(ci(i) + k) = aa(ai(i) + k);
750           else ca(ci(i) + k) = ba(bi(i) + k - alen);
751         });
752       });
753     PetscCall(MatSeqAIJKokkosModifyDevice(*C));
754   }
755   PetscFunctionReturn(PETSC_SUCCESS);
756 }
757 
758 static PetscErrorCode MatProductDataDestroy_SeqAIJKokkos(void *pdata)
759 {
760   PetscFunctionBegin;
761   delete static_cast<MatProductData_SeqAIJKokkos *>(pdata);
762   PetscFunctionReturn(PETSC_SUCCESS);
763 }
764 
765 static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C)
766 {
767   Mat_Product                 *product = C->product;
768   Mat                          A, B;
769   bool                         transA, transB; /* use bool, since KK needs this type */
770   Mat_SeqAIJKokkos            *akok, *bkok, *ckok;
771   Mat_SeqAIJ                  *c;
772   MatProductData_SeqAIJKokkos *pdata;
773   KokkosCsrMatrix              csrmatA, csrmatB;
774 
775   PetscFunctionBegin;
776   MatCheckProduct(C, 1);
777   PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Product data empty");
778   pdata = static_cast<MatProductData_SeqAIJKokkos *>(C->product->data);
779 
780   // See if numeric has already been done in symbolic (e.g., user calls MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C)).
781   // If yes, skip the numeric, but reset the flag so that next time when user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C),
782   // we still do numeric.
783   if (pdata->reusesym) { // numeric reuses results from symbolic
784     pdata->reusesym = PETSC_FALSE;
785     PetscFunctionReturn(PETSC_SUCCESS);
786   }
787 
788   switch (product->type) {
789   case MATPRODUCT_AB:
790     transA = false;
791     transB = false;
792     break;
793   case MATPRODUCT_AtB:
794     transA = true;
795     transB = false;
796     break;
797   case MATPRODUCT_ABt:
798     transA = false;
799     transB = true;
800     break;
801   default:
802     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]);
803   }
804 
805   A = product->A;
806   B = product->B;
807   PetscCall(MatSeqAIJKokkosSyncDevice(A));
808   PetscCall(MatSeqAIJKokkosSyncDevice(B));
809   akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
810   bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
811   ckok = static_cast<Mat_SeqAIJKokkos *>(C->spptr);
812 
813   PetscCheck(ckok, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Device data structure spptr is empty");
814 
815   csrmatA = akok->csrmat;
816   csrmatB = bkok->csrmat;
817 
818   /* TODO: Once KK spgemm implements transpose, we can get rid of the explicit transpose here */
819   if (transA) {
820     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA));
821     transA = false;
822   }
823 
824   if (transB) {
825     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB));
826     transB = false;
827   }
828   PetscCall(PetscLogGpuTimeBegin());
829   PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, csrmatA, transA, csrmatB, transB, ckok->csrmat));
830 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(4, 0, 0)
831   auto spgemmHandle = pdata->kh.get_spgemm_handle();
832   if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(ckok->csrmat)); /* without sort, mat_tests-ex62_14_seqaijkokkos fails */
833 #endif
834 
835   PetscCall(PetscLogGpuTimeEnd());
836   PetscCall(MatSeqAIJKokkosModifyDevice(C));
837   /* shorter version of MatAssemblyEnd_SeqAIJ */
838   c = (Mat_SeqAIJ *)C->data;
839   PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz));
840   PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n"));
841   PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax));
842   c->reallocs         = 0;
843   C->info.mallocs     = 0;
844   C->info.nz_unneeded = 0;
845   C->assembled = C->was_assembled = PETSC_TRUE;
846   C->num_ass++;
847   PetscFunctionReturn(PETSC_SUCCESS);
848 }
849 
850 static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C)
851 {
852   Mat_Product                 *product = C->product;
853   MatProductType               ptype;
854   Mat                          A, B;
855   bool                         transA, transB;
856   Mat_SeqAIJKokkos            *akok, *bkok, *ckok;
857   MatProductData_SeqAIJKokkos *pdata;
858   MPI_Comm                     comm;
859   KokkosCsrMatrix              csrmatA, csrmatB, csrmatC;
860 
861   PetscFunctionBegin;
862   MatCheckProduct(C, 1);
863   PetscCall(PetscObjectGetComm((PetscObject)C, &comm));
864   PetscCheck(!product->data, comm, PETSC_ERR_PLIB, "Product data not empty");
865   A = product->A;
866   B = product->B;
867   PetscCall(MatSeqAIJKokkosSyncDevice(A));
868   PetscCall(MatSeqAIJKokkosSyncDevice(B));
869   akok    = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
870   bkok    = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
871   csrmatA = akok->csrmat;
872   csrmatB = bkok->csrmat;
873 
874   ptype = product->type;
875   // Take advantage of the symmetry if true
876   if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
877     ptype                                          = MATPRODUCT_AB;
878     product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE;
879   }
880   if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
881     ptype                                          = MATPRODUCT_AB;
882     product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE;
883   }
884 
885   switch (ptype) {
886   case MATPRODUCT_AB:
887     transA = false;
888     transB = false;
889     break;
890   case MATPRODUCT_AtB:
891     transA = true;
892     transB = false;
893     break;
894   case MATPRODUCT_ABt:
895     transA = false;
896     transB = true;
897     break;
898   default:
899     SETERRQ(comm, PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]);
900   }
901   PetscCallCXX(product->data = pdata = new MatProductData_SeqAIJKokkos());
902   pdata->reusesym = product->api_user;
903 
904   /* TODO: add command line options to select spgemm algorithms */
905   auto spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_DEFAULT; /* default alg is TPL if enabled, otherwise KK */
906 
907   /* CUDA-10.2's spgemm has bugs. We prefer the SpGEMMreuse APIs introduced in cuda-11.4 */
908 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
909   #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
910   spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_KK;
911   #endif
912 #endif
913   PetscCallCXX(pdata->kh.create_spgemm_handle(spgemm_alg));
914 
915   PetscCall(PetscLogGpuTimeBegin());
916   /* TODO: Get rid of the explicit transpose once KK-spgemm implements the transpose option */
917   if (transA) {
918     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA));
919     transA = false;
920   }
921 
922   if (transB) {
923     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB));
924     transB = false;
925   }
926 
927   PetscCallCXX(KokkosSparse::spgemm_symbolic(pdata->kh, csrmatA, transA, csrmatB, transB, csrmatC));
928   /* spgemm_symbolic() only populates C's rowmap, but not C's column indices.
929     So we have to do a fake spgemm_numeric() here to get csrmatC.j_d setup, before
930     calling new Mat_SeqAIJKokkos().
931     TODO: Remove the fake spgemm_numeric() after KK fixed this problem.
932   */
933   PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, csrmatA, transA, csrmatB, transB, csrmatC));
934 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(4, 0, 0)
935   /* Query if KK outputs a sorted matrix. If not, we need to sort it */
936   auto spgemmHandle = pdata->kh.get_spgemm_handle();
937   if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(csrmatC)); /* sort_option defaults to -1 in KK!*/
938 #endif
939   PetscCall(PetscLogGpuTimeEnd());
940 
941   PetscCallCXX(ckok = new Mat_SeqAIJKokkos(csrmatC));
942   PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(C, ckok));
943   C->product->destroy = MatProductDataDestroy_SeqAIJKokkos;
944   PetscFunctionReturn(PETSC_SUCCESS);
945 }
946 
947 /* handles sparse matrix matrix ops */
948 static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat)
949 {
950   Mat_Product *product = mat->product;
951   PetscBool    Biskok = PETSC_FALSE, Ciskok = PETSC_TRUE;
952 
953   PetscFunctionBegin;
954   MatCheckProduct(mat, 1);
955   PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJKOKKOS, &Biskok));
956   if (product->type == MATPRODUCT_ABC) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJKOKKOS, &Ciskok));
957   if (Biskok && Ciskok) {
958     switch (product->type) {
959     case MATPRODUCT_AB:
960     case MATPRODUCT_AtB:
961     case MATPRODUCT_ABt:
962       mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos;
963       break;
964     case MATPRODUCT_PtAP:
965     case MATPRODUCT_RARt:
966     case MATPRODUCT_ABC:
967       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
968       break;
969     default:
970       break;
971     }
972   } else { /* fallback for AIJ */
973     PetscCall(MatProductSetFromOptions_SeqAIJ(mat));
974   }
975   PetscFunctionReturn(PETSC_SUCCESS);
976 }
977 
978 static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a)
979 {
980   Mat_SeqAIJKokkos *aijkok;
981 
982   PetscFunctionBegin;
983   PetscCall(PetscLogGpuTimeBegin());
984   PetscCall(MatSeqAIJKokkosSyncDevice(A));
985   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
986   KokkosBlas::scal(PetscGetKokkosExecutionSpace(), aijkok->a_dual.view_device(), a, aijkok->a_dual.view_device());
987   PetscCall(MatSeqAIJKokkosModifyDevice(A));
988   PetscCall(PetscLogGpuFlops(aijkok->a_dual.extent(0)));
989   PetscCall(PetscLogGpuTimeEnd());
990   PetscFunctionReturn(PETSC_SUCCESS);
991 }
992 
993 // add a to A's diagonal (if A is square) or main diagonal (if A is rectangular)
994 static PetscErrorCode MatShift_SeqAIJKokkos(Mat A, PetscScalar a)
995 {
996   Mat_SeqAIJ *aijseq = static_cast<Mat_SeqAIJ *>(A->data);
997 
998   PetscFunctionBegin;
999   if (A->assembled && aijseq->diagonaldense) { // no missing diagonals
1000     PetscInt n = PetscMin(A->rmap->n, A->cmap->n);
1001 
1002     PetscCall(PetscLogGpuTimeBegin());
1003     PetscCall(MatSeqAIJKokkosSyncDevice(A));
1004     const auto  aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1005     const auto &Aa     = aijkok->a_dual.view_device();
1006     const auto &Adiag  = aijkok->diag_dual.view_device();
1007     PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) { Aa(Adiag(i)) += a; }));
1008     PetscCall(MatSeqAIJKokkosModifyDevice(A));
1009     PetscCall(PetscLogGpuFlops(n));
1010     PetscCall(PetscLogGpuTimeEnd());
1011   } else { // need reassembly, very slow!
1012     PetscCall(MatShift_Basic(A, a));
1013   }
1014   PetscFunctionReturn(PETSC_SUCCESS);
1015 }
1016 
1017 static PetscErrorCode MatDiagonalSet_SeqAIJKokkos(Mat Y, Vec D, InsertMode is)
1018 {
1019   Mat_SeqAIJ *aijseq = static_cast<Mat_SeqAIJ *>(Y->data);
1020 
1021   PetscFunctionBegin;
1022   if (Y->assembled && aijseq->diagonaldense) { // no missing diagonals
1023     ConstPetscScalarKokkosView dv;
1024     PetscInt                   n, nv;
1025 
1026     PetscCall(PetscLogGpuTimeBegin());
1027     PetscCall(MatSeqAIJKokkosSyncDevice(Y));
1028     PetscCall(VecGetKokkosView(D, &dv));
1029     PetscCall(VecGetLocalSize(D, &nv));
1030     n = PetscMin(Y->rmap->n, Y->cmap->n);
1031     PetscCheck(n == nv, PetscObjectComm((PetscObject)Y), PETSC_ERR_ARG_SIZ, "Matrix size and vector size do not match");
1032 
1033     const auto  aijkok = static_cast<Mat_SeqAIJKokkos *>(Y->spptr);
1034     const auto &Aa     = aijkok->a_dual.view_device();
1035     const auto &Adiag  = aijkok->diag_dual.view_device();
1036     PetscCallCXX(Kokkos::parallel_for(
1037       Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) {
1038         if (is == INSERT_VALUES) Aa(Adiag(i)) = dv(i);
1039         else Aa(Adiag(i)) += dv(i);
1040       }));
1041     PetscCall(VecRestoreKokkosView(D, &dv));
1042     PetscCall(MatSeqAIJKokkosModifyDevice(Y));
1043     PetscCall(PetscLogGpuFlops(n));
1044     PetscCall(PetscLogGpuTimeEnd());
1045   } else { // need reassembly, very slow!
1046     PetscCall(MatDiagonalSet_Default(Y, D, is));
1047   }
1048   PetscFunctionReturn(PETSC_SUCCESS);
1049 }
1050 
1051 static PetscErrorCode MatDiagonalScale_SeqAIJKokkos(Mat A, Vec ll, Vec rr)
1052 {
1053   Mat_SeqAIJ                *aijseq = static_cast<Mat_SeqAIJ *>(A->data);
1054   PetscInt                   m = A->rmap->n, n = A->cmap->n, nz = aijseq->nz;
1055   ConstPetscScalarKokkosView lv, rv;
1056 
1057   PetscFunctionBegin;
1058   PetscCall(PetscLogGpuTimeBegin());
1059   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1060   const auto  aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1061   const auto &Aa     = aijkok->a_dual.view_device();
1062   const auto &Ai     = aijkok->i_dual.view_device();
1063   const auto &Aj     = aijkok->j_dual.view_device();
1064   if (ll) {
1065     PetscCall(VecGetLocalSize(ll, &m));
1066     PetscCheck(m == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling vector wrong length");
1067     PetscCall(VecGetKokkosView(ll, &lv));
1068     PetscCallCXX(Kokkos::parallel_for( // for each row
1069       Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
1070         PetscInt i   = t.league_rank(); // row i
1071         PetscInt len = Ai(i + 1) - Ai(i);
1072         // scale entries on the row
1073         Kokkos::parallel_for(Kokkos::TeamThreadRange(t, len), [&](PetscInt j) { Aa(Ai(i) + j) *= lv(i); });
1074       }));
1075     PetscCall(VecRestoreKokkosView(ll, &lv));
1076     PetscCall(PetscLogGpuFlops(nz));
1077   }
1078   if (rr) {
1079     PetscCall(VecGetLocalSize(rr, &n));
1080     PetscCheck(n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Right scaling vector wrong length");
1081     PetscCall(VecGetKokkosView(rr, &rv));
1082     PetscCallCXX(Kokkos::parallel_for( // for each nonzero
1083       Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt k) { Aa(k) *= rv(Aj(k)); }));
1084     PetscCall(VecRestoreKokkosView(rr, &lv));
1085     PetscCall(PetscLogGpuFlops(nz));
1086   }
1087   PetscCall(MatSeqAIJKokkosModifyDevice(A));
1088   PetscCall(PetscLogGpuTimeEnd());
1089   PetscFunctionReturn(PETSC_SUCCESS);
1090 }
1091 
1092 static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A)
1093 {
1094   Mat_SeqAIJKokkos *aijkok;
1095 
1096   PetscFunctionBegin;
1097   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1098   if (aijkok) { /* Only zero the device if data is already there */
1099     KokkosBlas::fill(PetscGetKokkosExecutionSpace(), aijkok->a_dual.view_device(), 0.0);
1100     PetscCall(MatSeqAIJKokkosModifyDevice(A));
1101   } else { /* Might be preallocated but not assembled */
1102     PetscCall(MatZeroEntries_SeqAIJ(A));
1103   }
1104   PetscFunctionReturn(PETSC_SUCCESS);
1105 }
1106 
1107 static PetscErrorCode MatGetDiagonal_SeqAIJKokkos(Mat A, Vec x)
1108 {
1109   Mat_SeqAIJKokkos     *aijkok;
1110   PetscInt              n;
1111   PetscScalarKokkosView xv;
1112 
1113   PetscFunctionBegin;
1114   PetscCall(VecGetLocalSize(x, &n));
1115   PetscCheck(n == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector");
1116   PetscCheck(A->factortype == MAT_FACTOR_NONE, PETSC_COMM_SELF, PETSC_ERR_SUP, "MatGetDiagonal_SeqAIJKokkos not supported on factored matrices");
1117 
1118   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1119   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1120 
1121   const auto &Aa    = aijkok->a_dual.view_device();
1122   const auto &Ai    = aijkok->i_dual.view_device();
1123   const auto &Adiag = aijkok->diag_dual.view_device();
1124 
1125   PetscCall(VecGetKokkosViewWrite(x, &xv));
1126   Kokkos::parallel_for(
1127     Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) {
1128       if (Adiag(i) < Ai(i + 1)) xv(i) = Aa(Adiag(i));
1129       else xv(i) = 0;
1130     });
1131   PetscCall(VecRestoreKokkosViewWrite(x, &xv));
1132   PetscFunctionReturn(PETSC_SUCCESS);
1133 }
1134 
1135 /* Get a Kokkos View from a mat of type MatSeqAIJKokkos */
1136 PetscErrorCode MatSeqAIJGetKokkosView(Mat A, ConstMatScalarKokkosView *kv)
1137 {
1138   Mat_SeqAIJKokkos *aijkok;
1139 
1140   PetscFunctionBegin;
1141   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1142   PetscAssertPointer(kv, 2);
1143   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1144   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1145   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1146   *kv    = aijkok->a_dual.view_device();
1147   PetscFunctionReturn(PETSC_SUCCESS);
1148 }
1149 
1150 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, ConstMatScalarKokkosView *kv)
1151 {
1152   PetscFunctionBegin;
1153   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1154   PetscAssertPointer(kv, 2);
1155   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1156   PetscFunctionReturn(PETSC_SUCCESS);
1157 }
1158 
1159 PetscErrorCode MatSeqAIJGetKokkosView(Mat A, MatScalarKokkosView *kv)
1160 {
1161   Mat_SeqAIJKokkos *aijkok;
1162 
1163   PetscFunctionBegin;
1164   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1165   PetscAssertPointer(kv, 2);
1166   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1167   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1168   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1169   *kv    = aijkok->a_dual.view_device();
1170   PetscFunctionReturn(PETSC_SUCCESS);
1171 }
1172 
1173 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, MatScalarKokkosView *kv)
1174 {
1175   PetscFunctionBegin;
1176   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1177   PetscAssertPointer(kv, 2);
1178   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1179   PetscCall(MatSeqAIJKokkosModifyDevice(A));
1180   PetscFunctionReturn(PETSC_SUCCESS);
1181 }
1182 
1183 PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A, MatScalarKokkosView *kv)
1184 {
1185   Mat_SeqAIJKokkos *aijkok;
1186 
1187   PetscFunctionBegin;
1188   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1189   PetscAssertPointer(kv, 2);
1190   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1191   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1192   *kv    = aijkok->a_dual.view_device();
1193   PetscFunctionReturn(PETSC_SUCCESS);
1194 }
1195 
1196 PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A, MatScalarKokkosView *kv)
1197 {
1198   PetscFunctionBegin;
1199   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1200   PetscAssertPointer(kv, 2);
1201   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1202   PetscCall(MatSeqAIJKokkosModifyDevice(A));
1203   PetscFunctionReturn(PETSC_SUCCESS);
1204 }
1205 
1206 /* Computes Y += alpha X */
1207 static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y, PetscScalar alpha, Mat X, MatStructure pattern)
1208 {
1209   Mat_SeqAIJ              *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data;
1210   Mat_SeqAIJKokkos        *xkok, *ykok, *zkok;
1211   ConstMatScalarKokkosView Xa;
1212   MatScalarKokkosView      Ya;
1213   auto                    &exec = PetscGetKokkosExecutionSpace();
1214 
1215   PetscFunctionBegin;
1216   PetscCheckTypeName(Y, MATSEQAIJKOKKOS);
1217   PetscCheckTypeName(X, MATSEQAIJKOKKOS);
1218   PetscCall(MatSeqAIJKokkosSyncDevice(Y));
1219   PetscCall(MatSeqAIJKokkosSyncDevice(X));
1220   PetscCall(PetscLogGpuTimeBegin());
1221 
1222   if (pattern != SAME_NONZERO_PATTERN && x->nz == y->nz) {
1223     PetscBool e;
1224     PetscCall(PetscArraycmp(x->i, y->i, Y->rmap->n + 1, &e));
1225     if (e) {
1226       PetscCall(PetscArraycmp(x->j, y->j, y->nz, &e));
1227       if (e) pattern = SAME_NONZERO_PATTERN;
1228     }
1229   }
1230 
1231   /* cusparseDcsrgeam2() computes C = alpha A + beta B. If one knew sparsity pattern of C, one can skip
1232     cusparseScsrgeam2_bufferSizeExt() / cusparseXcsrgeam2Nnz(), and directly call cusparseScsrgeam2().
1233     If X is SUBSET_NONZERO_PATTERN of Y, we could take advantage of this cusparse feature. However,
1234     KokkosSparse::spadd(alpha,A,beta,B,C) has symbolic and numeric phases, MatAXPY does not.
1235   */
1236   ykok = static_cast<Mat_SeqAIJKokkos *>(Y->spptr);
1237   xkok = static_cast<Mat_SeqAIJKokkos *>(X->spptr);
1238   Xa   = xkok->a_dual.view_device();
1239   Ya   = ykok->a_dual.view_device();
1240 
1241   if (pattern == SAME_NONZERO_PATTERN) {
1242     KokkosBlas::axpy(exec, alpha, Xa, Ya);
1243     PetscCall(MatSeqAIJKokkosModifyDevice(Y));
1244   } else if (pattern == SUBSET_NONZERO_PATTERN) {
1245     MatRowMapKokkosView Xi = xkok->i_dual.view_device(), Yi = ykok->i_dual.view_device();
1246     MatColIdxKokkosView Xj = xkok->j_dual.view_device(), Yj = ykok->j_dual.view_device();
1247 
1248     Kokkos::parallel_for(
1249       Kokkos::TeamPolicy<>(exec, Y->rmap->n, 1), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
1250         PetscInt i = t.league_rank(); // row i
1251         Kokkos::single(Kokkos::PerTeam(t), [=]() {
1252           // Only one thread works in a team
1253           PetscInt p, q = Yi(i);
1254           for (p = Xi(i); p < Xi(i + 1); p++) {          // For each nonzero on row i of X,
1255             while (Xj(p) != Yj(q) && q < Yi(i + 1)) q++; // find the matching nonzero on row i of Y.
1256             if (Xj(p) == Yj(q)) {                        // Found it
1257               Ya(q) += alpha * Xa(p);
1258               q++;
1259             } else {
1260             // If not found, it indicates the input is wrong (X is not a SUBSET_NONZERO_PATTERN of Y).
1261             // Just insert a NaN at the beginning of row i if it is not empty, to make the result wrong.
1262 #if PETSC_PKG_KOKKOS_VERSION_GE(3, 7, 0)
1263               if (Yi(i) != Yi(i + 1)) Ya(Yi(i)) = Kokkos::ArithTraits<PetscScalar>::nan();
1264 #else
1265               if (Yi(i) != Yi(i + 1)) Ya(Yi(i)) = Kokkos::Experimental::nan("1");
1266 #endif
1267             }
1268           }
1269         });
1270       });
1271     PetscCall(MatSeqAIJKokkosModifyDevice(Y));
1272   } else { // different nonzero patterns
1273     Mat             Z;
1274     KokkosCsrMatrix zcsr;
1275     KernelHandle    kh;
1276     kh.create_spadd_handle(true); // X, Y are sorted
1277     KokkosSparse::spadd_symbolic(&kh, xkok->csrmat, ykok->csrmat, zcsr);
1278     KokkosSparse::spadd_numeric(&kh, alpha, xkok->csrmat, (PetscScalar)1.0, ykok->csrmat, zcsr);
1279     zkok = new Mat_SeqAIJKokkos(zcsr);
1280     PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, zkok, &Z));
1281     PetscCall(MatHeaderReplace(Y, &Z));
1282     kh.destroy_spadd_handle();
1283   }
1284   PetscCall(PetscLogGpuTimeEnd());
1285   PetscCall(PetscLogGpuFlops(xkok->a_dual.extent(0) * 2)); // Because we scaled X and then added it to Y
1286   PetscFunctionReturn(PETSC_SUCCESS);
1287 }
1288 
1289 struct MatCOOStruct_SeqAIJKokkos {
1290   PetscCount           n;
1291   PetscCount           Atot;
1292   PetscInt             nz;
1293   PetscCountKokkosView jmap;
1294   PetscCountKokkosView perm;
1295 
1296   MatCOOStruct_SeqAIJKokkos(const MatCOOStruct_SeqAIJ *coo_h)
1297   {
1298     nz   = coo_h->nz;
1299     n    = coo_h->n;
1300     Atot = coo_h->Atot;
1301     jmap = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), PetscCountKokkosViewHost(coo_h->jmap, nz + 1));
1302     perm = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), PetscCountKokkosViewHost(coo_h->perm, Atot));
1303   }
1304 };
1305 
1306 static PetscErrorCode MatCOOStructDestroy_SeqAIJKokkos(void *data)
1307 {
1308   PetscFunctionBegin;
1309   PetscCallCXX(delete static_cast<MatCOOStruct_SeqAIJKokkos *>(data));
1310   PetscFunctionReturn(PETSC_SUCCESS);
1311 }
1312 
1313 static PetscErrorCode MatSetPreallocationCOO_SeqAIJKokkos(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
1314 {
1315   Mat_SeqAIJKokkos          *akok;
1316   Mat_SeqAIJ                *aseq;
1317   PetscContainer             container_h, container_d;
1318   MatCOOStruct_SeqAIJ       *coo_h;
1319   MatCOOStruct_SeqAIJKokkos *coo_d;
1320 
1321   PetscFunctionBegin;
1322   PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, coo_i, coo_j));
1323   aseq = static_cast<Mat_SeqAIJ *>(mat->data);
1324   akok = static_cast<Mat_SeqAIJKokkos *>(mat->spptr);
1325   delete akok;
1326   mat->spptr = akok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, aseq, mat->nonzerostate + 1, PETSC_FALSE);
1327   PetscCall(MatZeroEntries_SeqAIJKokkos(mat));
1328 
1329   // Copy the COO struct to device
1330   PetscCall(PetscObjectQuery((PetscObject)mat, "__PETSc_MatCOOStruct_Host", (PetscObject *)&container_h));
1331   PetscCall(PetscContainerGetPointer(container_h, (void **)&coo_h));
1332   PetscCallCXX(coo_d = new MatCOOStruct_SeqAIJKokkos(coo_h));
1333 
1334   // Put the COO struct in a container and then attach that to the matrix
1335   PetscCall(PetscContainerCreate(PETSC_COMM_SELF, &container_d));
1336   PetscCall(PetscContainerSetPointer(container_d, coo_d));
1337   PetscCall(PetscContainerSetUserDestroy(container_d, MatCOOStructDestroy_SeqAIJKokkos));
1338   PetscCall(PetscObjectCompose((PetscObject)mat, "__PETSc_MatCOOStruct_Device", (PetscObject)container_d));
1339   PetscCall(PetscContainerDestroy(&container_d));
1340   PetscFunctionReturn(PETSC_SUCCESS);
1341 }
1342 
1343 static PetscErrorCode MatSetValuesCOO_SeqAIJKokkos(Mat A, const PetscScalar v[], InsertMode imode)
1344 {
1345   MatScalarKokkosView        Aa;
1346   ConstMatScalarKokkosView   kv;
1347   PetscMemType               memtype;
1348   PetscContainer             container;
1349   MatCOOStruct_SeqAIJKokkos *coo;
1350 
1351   PetscFunctionBegin;
1352   PetscCall(PetscObjectQuery((PetscObject)A, "__PETSc_MatCOOStruct_Device", (PetscObject *)&container));
1353   PetscCall(PetscContainerGetPointer(container, (void **)&coo));
1354 
1355   const auto &n    = coo->n;
1356   const auto &Annz = coo->nz;
1357   const auto &jmap = coo->jmap;
1358   const auto &perm = coo->perm;
1359 
1360   PetscCall(PetscGetMemType(v, &memtype));
1361   if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */
1362     kv = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), ConstMatScalarKokkosViewHost(v, n));
1363   } else {
1364     kv = ConstMatScalarKokkosView(v, n); /* Directly use v[]'s memory */
1365   }
1366 
1367   if (imode == INSERT_VALUES) PetscCall(MatSeqAIJGetKokkosViewWrite(A, &Aa)); /* write matrix values */
1368   else PetscCall(MatSeqAIJGetKokkosView(A, &Aa));                             /* read & write matrix values */
1369 
1370   PetscCall(PetscLogGpuTimeBegin());
1371   Kokkos::parallel_for(
1372     Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, Annz), KOKKOS_LAMBDA(const PetscCount i) {
1373       PetscScalar sum = 0.0;
1374       for (PetscCount k = jmap(i); k < jmap(i + 1); k++) sum += kv(perm(k));
1375       Aa(i) = (imode == INSERT_VALUES ? 0.0 : Aa(i)) + sum;
1376     });
1377   PetscCall(PetscLogGpuTimeEnd());
1378 
1379   if (imode == INSERT_VALUES) PetscCall(MatSeqAIJRestoreKokkosViewWrite(A, &Aa));
1380   else PetscCall(MatSeqAIJRestoreKokkosView(A, &Aa));
1381   PetscFunctionReturn(PETSC_SUCCESS);
1382 }
1383 
1384 static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info)
1385 {
1386   PetscFunctionBegin;
1387   PetscCall(MatSeqAIJKokkosSyncHost(A));
1388   PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info));
1389   B->offloadmask = PETSC_OFFLOAD_CPU;
1390   PetscFunctionReturn(PETSC_SUCCESS);
1391 }
1392 
1393 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A)
1394 {
1395   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
1396 
1397   PetscFunctionBegin;
1398   A->offloadmask = PETSC_OFFLOAD_KOKKOS; /* We do not really use this flag */
1399   A->boundtocpu  = PETSC_FALSE;
1400 
1401   A->ops->assemblyend               = MatAssemblyEnd_SeqAIJKokkos;
1402   A->ops->destroy                   = MatDestroy_SeqAIJKokkos;
1403   A->ops->duplicate                 = MatDuplicate_SeqAIJKokkos;
1404   A->ops->axpy                      = MatAXPY_SeqAIJKokkos;
1405   A->ops->scale                     = MatScale_SeqAIJKokkos;
1406   A->ops->zeroentries               = MatZeroEntries_SeqAIJKokkos;
1407   A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJKokkos;
1408   A->ops->mult                      = MatMult_SeqAIJKokkos;
1409   A->ops->multadd                   = MatMultAdd_SeqAIJKokkos;
1410   A->ops->multtranspose             = MatMultTranspose_SeqAIJKokkos;
1411   A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJKokkos;
1412   A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJKokkos;
1413   A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos;
1414   A->ops->productnumeric            = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos;
1415   A->ops->transpose                 = MatTranspose_SeqAIJKokkos;
1416   A->ops->setoption                 = MatSetOption_SeqAIJKokkos;
1417   A->ops->getdiagonal               = MatGetDiagonal_SeqAIJKokkos;
1418   A->ops->shift                     = MatShift_SeqAIJKokkos;
1419   A->ops->diagonalset               = MatDiagonalSet_SeqAIJKokkos;
1420   A->ops->diagonalscale             = MatDiagonalScale_SeqAIJKokkos;
1421   a->ops->getarray                  = MatSeqAIJGetArray_SeqAIJKokkos;
1422   a->ops->restorearray              = MatSeqAIJRestoreArray_SeqAIJKokkos;
1423   a->ops->getarrayread              = MatSeqAIJGetArrayRead_SeqAIJKokkos;
1424   a->ops->restorearrayread          = MatSeqAIJRestoreArrayRead_SeqAIJKokkos;
1425   a->ops->getarraywrite             = MatSeqAIJGetArrayWrite_SeqAIJKokkos;
1426   a->ops->restorearraywrite         = MatSeqAIJRestoreArrayWrite_SeqAIJKokkos;
1427   a->ops->getcsrandmemtype          = MatSeqAIJGetCSRAndMemType_SeqAIJKokkos;
1428 
1429   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJKokkos));
1430   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJKokkos));
1431   PetscFunctionReturn(PETSC_SUCCESS);
1432 }
1433 
1434 /*
1435    Extract the (prescribled) diagonal blocks of the matrix and then invert them
1436 
1437   Input Parameters:
1438 +  A       - the MATSEQAIJKOKKOS matrix
1439 .  bs      - block sizes in 'csr' format, i.e., the i-th block has size bs(i+1) - bs(i)
1440 .  bs2     - square of block sizes in 'csr' format, i.e., the i-th block should be stored at offset bs2(i) in diagVal[]
1441 .  blkMap  - map row ids to block ids, i.e., row i belongs to the block blkMap(i)
1442 -  work    - a pre-allocated work buffer (as big as diagVal) for use by this routine
1443 
1444   Output Parameter:
1445 .  diagVal - the (pre-allocated) buffer to store the inverted blocks (each block is stored in column-major order)
1446 */
1447 PETSC_INTERN PetscErrorCode MatInvertVariableBlockDiagonal_SeqAIJKokkos(Mat A, const PetscIntKokkosView &bs, const PetscIntKokkosView &bs2, const PetscIntKokkosView &blkMap, PetscScalarKokkosView &work, PetscScalarKokkosView &diagVal)
1448 {
1449   Mat_SeqAIJKokkos *akok    = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1450   PetscInt          nblocks = bs.extent(0) - 1;
1451 
1452   PetscFunctionBegin;
1453   PetscCall(MatSeqAIJKokkosSyncDevice(A)); // Since we'll access A's value on device
1454 
1455   // Pull out the diagonal blocks of the matrix and then invert the blocks
1456   auto Aa    = akok->a_dual.view_device();
1457   auto Ai    = akok->i_dual.view_device();
1458   auto Aj    = akok->j_dual.view_device();
1459   auto Adiag = akok->diag_dual.view_device();
1460   // TODO: how to tune the team size?
1461 #if defined(KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_HOST)
1462   auto ts = Kokkos::AUTO();
1463 #else
1464   auto ts = 16; // improved performance 30% over Kokkos::AUTO() with CUDA, but failed with "Kokkos::abort: Requested Team Size is too large!" on CPUs
1465 #endif
1466   PetscCallCXX(Kokkos::parallel_for(
1467     Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), nblocks, ts), KOKKOS_LAMBDA(const KokkosTeamMemberType &teamMember) {
1468       const PetscInt bid    = teamMember.league_rank();                                                   // block id
1469       const PetscInt rstart = bs(bid);                                                                    // this block starts from this row
1470       const PetscInt m      = bs(bid + 1) - bs(bid);                                                      // size of this block
1471       const auto    &B      = Kokkos::View<PetscScalar **, Kokkos::LayoutLeft>(&diagVal(bs2(bid)), m, m); // column-major order
1472       const auto    &W      = PetscScalarKokkosView(&work(bs2(bid)), m * m);
1473 
1474       Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, m), [=](const PetscInt &r) { // r-th row in B
1475         PetscInt i = rstart + r;                                                            // i-th row in A
1476 
1477         if (Ai(i) <= Adiag(i) && Adiag(i) < Ai(i + 1)) { // if the diagonal exists (common case)
1478           PetscInt first = Adiag(i) - r;                 // we start to check nonzeros from here along this row
1479 
1480           for (PetscInt c = 0; c < m; c++) {                   // walk n steps to see what column indices we will meet
1481             if (first + c < Ai(i) || first + c >= Ai(i + 1)) { // this entry (first+c) is out of range of this row, in other words, its value is zero
1482               B(r, c) = 0.0;
1483             } else if (Aj(first + c) == rstart + c) { // this entry is right on the (rstart+c) column
1484               B(r, c) = Aa(first + c);
1485             } else { // this entry does not show up in the CSR
1486               B(r, c) = 0.0;
1487             }
1488           }
1489         } else { // rare case that the diagonal does not exist
1490           const PetscInt begin = Ai(i);
1491           const PetscInt end   = Ai(i + 1);
1492           for (PetscInt c = 0; c < m; c++) B(r, c) = 0.0;
1493           for (PetscInt j = begin; j < end; j++) { // scan the whole row; could use binary search but this is a rare case so we did not.
1494             if (rstart <= Aj(j) && Aj(j) < rstart + m) B(r, Aj(j) - rstart) = Aa(j);
1495             else if (Aj(j) >= rstart + m) break;
1496           }
1497         }
1498       });
1499 
1500       // LU-decompose B (w/o pivoting) and then invert B
1501       KokkosBatched::TeamLU<KokkosTeamMemberType, KokkosBatched::Algo::LU::Unblocked>::invoke(teamMember, B, 0.0);
1502       KokkosBatched::TeamInverseLU<KokkosTeamMemberType, KokkosBatched::Algo::InverseLU::Unblocked>::invoke(teamMember, B, W);
1503     }));
1504   // PetscLogGpuFlops() is done in the caller PCSetUp_VPBJacobi_Kokkos as we don't want to compute the flops in kernels
1505   PetscFunctionReturn(PETSC_SUCCESS);
1506 }
1507 
1508 PETSC_INTERN PetscErrorCode MatSetSeqAIJKokkosWithCSRMatrix(Mat A, Mat_SeqAIJKokkos *akok)
1509 {
1510   Mat_SeqAIJ *aseq;
1511   PetscInt    i, m, n;
1512   auto       &exec = PetscGetKokkosExecutionSpace();
1513 
1514   PetscFunctionBegin;
1515   PetscCheck(!A->spptr, PETSC_COMM_SELF, PETSC_ERR_PLIB, "A->spptr is supposed to be empty");
1516 
1517   m = akok->nrows();
1518   n = akok->ncols();
1519   PetscCall(MatSetSizes(A, m, n, m, n));
1520   PetscCall(MatSetType(A, MATSEQAIJKOKKOS));
1521 
1522   /* Set up data structures of A as a MATSEQAIJ */
1523   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(A, MAT_SKIP_ALLOCATION, NULL));
1524   aseq = (Mat_SeqAIJ *)(A)->data;
1525 
1526   PetscCallCXX(akok->i_dual.sync_host(exec)); /* We always need sync'ed i, j on host */
1527   PetscCallCXX(akok->j_dual.sync_host(exec));
1528   PetscCallCXX(exec.fence());
1529 
1530   aseq->i            = akok->i_host_data();
1531   aseq->j            = akok->j_host_data();
1532   aseq->a            = akok->a_host_data();
1533   aseq->nonew        = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/
1534   aseq->singlemalloc = PETSC_FALSE;
1535   aseq->free_a       = PETSC_FALSE;
1536   aseq->free_ij      = PETSC_FALSE;
1537   aseq->nz           = akok->nnz();
1538   aseq->maxnz        = aseq->nz;
1539 
1540   PetscCall(PetscMalloc1(m, &aseq->imax));
1541   PetscCall(PetscMalloc1(m, &aseq->ilen));
1542   for (i = 0; i < m; i++) aseq->ilen[i] = aseq->imax[i] = aseq->i[i + 1] - aseq->i[i];
1543 
1544   /* It is critical to set the nonzerostate, as we use it to check if sparsity pattern (hence data) has changed on host in MatAssemblyEnd */
1545   akok->nonzerostate = A->nonzerostate;
1546   A->spptr           = akok; /* Set A->spptr before MatAssembly so that A->spptr won't be allocated again there */
1547   PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY));
1548   PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY));
1549   PetscFunctionReturn(PETSC_SUCCESS);
1550 }
1551 
1552 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGetKokkosCsrMatrix(Mat A, KokkosCsrMatrix *csr)
1553 {
1554   PetscFunctionBegin;
1555   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1556   *csr = static_cast<Mat_SeqAIJKokkos *>(A->spptr)->csrmat;
1557   PetscFunctionReturn(PETSC_SUCCESS);
1558 }
1559 
1560 PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithKokkosCsrMatrix(MPI_Comm comm, KokkosCsrMatrix csr, Mat *A)
1561 {
1562   Mat_SeqAIJKokkos *akok;
1563 
1564   PetscFunctionBegin;
1565   PetscCallCXX(akok = new Mat_SeqAIJKokkos(csr));
1566   PetscCall(MatCreate(comm, A));
1567   PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok));
1568   PetscFunctionReturn(PETSC_SUCCESS);
1569 }
1570 
1571 /* Crete a SEQAIJKOKKOS matrix with a Mat_SeqAIJKokkos data structure
1572 
1573    Note we have names like MatSeqAIJSetPreallocationCSR, so I use capitalized CSR
1574  */
1575 PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithCSRMatrix(MPI_Comm comm, Mat_SeqAIJKokkos *akok, Mat *A)
1576 {
1577   PetscFunctionBegin;
1578   PetscCall(MatCreate(comm, A));
1579   PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok));
1580   PetscFunctionReturn(PETSC_SUCCESS);
1581 }
1582 
1583 /*@C
1584   MatCreateSeqAIJKokkos - Creates a sparse matrix in `MATSEQAIJKOKKOS` (compressed row) format
1585   (the default parallel PETSc format). This matrix will ultimately be handled by
1586   Kokkos for calculations.
1587 
1588   Collective
1589 
1590   Input Parameters:
1591 + comm - MPI communicator, set to `PETSC_COMM_SELF`
1592 . m    - number of rows
1593 . n    - number of columns
1594 . nz   - number of nonzeros per row (same for all rows), ignored if `nnz` is provided
1595 - nnz  - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL`
1596 
1597   Output Parameter:
1598 . A - the matrix
1599 
1600   Level: intermediate
1601 
1602   Notes:
1603   It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
1604   MatXXXXSetPreallocation() paradgm instead of this routine directly.
1605   [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]
1606 
1607   The AIJ format, also called
1608   compressed row storage, is fully compatible with standard Fortran
1609   storage.  That is, the stored row and column indices can begin at
1610   either one (as in Fortran) or zero.
1611 
1612   Specify the preallocated storage with either `nz` or `nnz` (not both).
1613   Set `nz` = `PETSC_DEFAULT` and `nnz` = `NULL` for PETSc to control dynamic memory
1614   allocation.
1615 
1616 .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`
1617 @*/
1618 PetscErrorCode MatCreateSeqAIJKokkos(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A)
1619 {
1620   PetscFunctionBegin;
1621   PetscCall(PetscKokkosInitializeCheck());
1622   PetscCall(MatCreate(comm, A));
1623   PetscCall(MatSetSizes(*A, m, n, m, n));
1624   PetscCall(MatSetType(*A, MATSEQAIJKOKKOS));
1625   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz));
1626   PetscFunctionReturn(PETSC_SUCCESS);
1627 }
1628 
1629 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
1630 {
1631   PetscFunctionBegin;
1632   PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
1633   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos;
1634   PetscFunctionReturn(PETSC_SUCCESS);
1635 }
1636 
1637 static PetscErrorCode MatSeqAIJKokkosSymbolicSolveCheck(Mat A)
1638 {
1639   Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1640 
1641   PetscFunctionBegin;
1642   if (!factors->sptrsv_symbolic_completed) {
1643     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khU, factors->iU_d, factors->jU_d, factors->aU_d);
1644     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khL, factors->iL_d, factors->jL_d, factors->aL_d);
1645     factors->sptrsv_symbolic_completed = PETSC_TRUE;
1646   }
1647   PetscFunctionReturn(PETSC_SUCCESS);
1648 }
1649 
1650 /* Check if we need to update factors etc for transpose solve */
1651 static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A)
1652 {
1653   Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1654   MatColIdxType               n       = A->rmap->n;
1655 
1656   PetscFunctionBegin;
1657   if (!factors->transpose_updated) { /* TODO: KK needs to provide functions to do numeric transpose only */
1658     /* Update L^T and do sptrsv symbolic */
1659     factors->iLt_d = MatRowMapKokkosView("factors->iLt_d", n + 1); // KK requires 0
1660     factors->jLt_d = MatColIdxKokkosView(NoInit("factors->jLt_d"), factors->jL_d.extent(0));
1661     factors->aLt_d = MatScalarKokkosView(NoInit("factors->aLt_d"), factors->aL_d.extent(0));
1662 
1663     transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iL_d, factors->jL_d, factors->aL_d,
1664                                                                                                                                                                                                               factors->iLt_d, factors->jLt_d, factors->aLt_d);
1665 
1666     /* TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices.
1667       We have to sort the indices, until KK provides finer control options.
1668     */
1669     sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iLt_d, factors->jLt_d, factors->aLt_d);
1670 
1671     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d);
1672 
1673     /* Update U^T and do sptrsv symbolic */
1674     factors->iUt_d = MatRowMapKokkosView("factors->iUt_d", n + 1); // KK requires 0
1675     factors->jUt_d = MatColIdxKokkosView(NoInit("factors->jUt_d"), factors->jU_d.extent(0));
1676     factors->aUt_d = MatScalarKokkosView(NoInit("factors->aUt_d"), factors->aU_d.extent(0));
1677 
1678     transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iU_d, factors->jU_d, factors->aU_d,
1679                                                                                                                                                                                                               factors->iUt_d, factors->jUt_d, factors->aUt_d);
1680 
1681     /* Sort indices. See comments above */
1682     sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iUt_d, factors->jUt_d, factors->aUt_d);
1683 
1684     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d);
1685     factors->transpose_updated = PETSC_TRUE;
1686   }
1687   PetscFunctionReturn(PETSC_SUCCESS);
1688 }
1689 
1690 /* Solve Ax = b, with A = LU */
1691 static PetscErrorCode MatSolve_SeqAIJKokkos(Mat A, Vec b, Vec x)
1692 {
1693   ConstPetscScalarKokkosView  bv;
1694   PetscScalarKokkosView       xv;
1695   Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1696 
1697   PetscFunctionBegin;
1698   PetscCall(PetscLogGpuTimeBegin());
1699   PetscCall(MatSeqAIJKokkosSymbolicSolveCheck(A));
1700   PetscCall(VecGetKokkosView(b, &bv));
1701   PetscCall(VecGetKokkosViewWrite(x, &xv));
1702   /* Solve L tmpv = b */
1703   PetscCallCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khL, factors->iL_d, factors->jL_d, factors->aL_d, bv, factors->workVector));
1704   /* Solve Ux = tmpv */
1705   PetscCallCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khU, factors->iU_d, factors->jU_d, factors->aU_d, factors->workVector, xv));
1706   PetscCall(VecRestoreKokkosView(b, &bv));
1707   PetscCall(VecRestoreKokkosViewWrite(x, &xv));
1708   PetscCall(PetscLogGpuTimeEnd());
1709   PetscFunctionReturn(PETSC_SUCCESS);
1710 }
1711 
1712 /* Solve A^T x = b, where A^T = U^T L^T */
1713 static PetscErrorCode MatSolveTranspose_SeqAIJKokkos(Mat A, Vec b, Vec x)
1714 {
1715   ConstPetscScalarKokkosView  bv;
1716   PetscScalarKokkosView       xv;
1717   Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1718 
1719   PetscFunctionBegin;
1720   PetscCall(PetscLogGpuTimeBegin());
1721   PetscCall(MatSeqAIJKokkosTransposeSolveCheck(A));
1722   PetscCall(VecGetKokkosView(b, &bv));
1723   PetscCall(VecGetKokkosViewWrite(x, &xv));
1724   /* Solve U^T tmpv = b */
1725   KokkosSparse::Experimental::sptrsv_solve(&factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d, bv, factors->workVector);
1726 
1727   /* Solve L^T x = tmpv */
1728   KokkosSparse::Experimental::sptrsv_solve(&factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d, factors->workVector, xv);
1729   PetscCall(VecRestoreKokkosView(b, &bv));
1730   PetscCall(VecRestoreKokkosViewWrite(x, &xv));
1731   PetscCall(PetscLogGpuTimeEnd());
1732   PetscFunctionReturn(PETSC_SUCCESS);
1733 }
1734 
1735 static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info)
1736 {
1737   Mat_SeqAIJKokkos           *aijkok   = (Mat_SeqAIJKokkos *)A->spptr;
1738   Mat_SeqAIJKokkosTriFactors *factors  = (Mat_SeqAIJKokkosTriFactors *)B->spptr;
1739   PetscInt                    fill_lev = info->levels;
1740 
1741   PetscFunctionBegin;
1742   PetscCall(PetscLogGpuTimeBegin());
1743   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1744 
1745   auto a_d = aijkok->a_dual.view_device();
1746   auto i_d = aijkok->i_dual.view_device();
1747   auto j_d = aijkok->j_dual.view_device();
1748 
1749   KokkosSparse::Experimental::spiluk_numeric(&factors->kh, fill_lev, i_d, j_d, a_d, factors->iL_d, factors->jL_d, factors->aL_d, factors->iU_d, factors->jU_d, factors->aU_d);
1750 
1751   B->assembled              = PETSC_TRUE;
1752   B->preallocated           = PETSC_TRUE;
1753   B->ops->solve             = MatSolve_SeqAIJKokkos;
1754   B->ops->solvetranspose    = MatSolveTranspose_SeqAIJKokkos;
1755   B->ops->matsolve          = NULL;
1756   B->ops->matsolvetranspose = NULL;
1757   B->offloadmask            = PETSC_OFFLOAD_GPU;
1758 
1759   /* Once the factors' value changed, we need to update their transpose and sptrsv handle */
1760   factors->transpose_updated         = PETSC_FALSE;
1761   factors->sptrsv_symbolic_completed = PETSC_FALSE;
1762   /* TODO: log flops, but how to know that? */
1763   PetscCall(PetscLogGpuTimeEnd());
1764   PetscFunctionReturn(PETSC_SUCCESS);
1765 }
1766 
1767 static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
1768 {
1769   Mat_SeqAIJKokkos           *aijkok;
1770   Mat_SeqAIJ                 *b;
1771   Mat_SeqAIJKokkosTriFactors *factors  = (Mat_SeqAIJKokkosTriFactors *)B->spptr;
1772   PetscInt                    fill_lev = info->levels;
1773   PetscInt                    nnzA     = ((Mat_SeqAIJ *)A->data)->nz, nnzL, nnzU;
1774   PetscInt                    n        = A->rmap->n;
1775 
1776   PetscFunctionBegin;
1777   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1778   /* Rebuild factors */
1779   if (factors) {
1780     factors->Destroy();
1781   } /* Destroy the old if it exists */
1782   else {
1783     B->spptr = factors = new Mat_SeqAIJKokkosTriFactors(n);
1784   }
1785 
1786   /* Create a spiluk handle and then do symbolic factorization */
1787   nnzL = nnzU = PetscRealIntMultTruncate(info->fill, nnzA);
1788   factors->kh.create_spiluk_handle(KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1, n, nnzL, nnzU);
1789 
1790   auto spiluk_handle = factors->kh.get_spiluk_handle();
1791 
1792   Kokkos::realloc(factors->iL_d, n + 1); /* Free old arrays and realloc */
1793   Kokkos::realloc(factors->jL_d, spiluk_handle->get_nnzL());
1794   Kokkos::realloc(factors->iU_d, n + 1);
1795   Kokkos::realloc(factors->jU_d, spiluk_handle->get_nnzU());
1796 
1797   aijkok   = (Mat_SeqAIJKokkos *)A->spptr;
1798   auto i_d = aijkok->i_dual.view_device();
1799   auto j_d = aijkok->j_dual.view_device();
1800   KokkosSparse::Experimental::spiluk_symbolic(&factors->kh, fill_lev, i_d, j_d, factors->iL_d, factors->jL_d, factors->iU_d, factors->jU_d);
1801   /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */
1802 
1803   Kokkos::resize(factors->jL_d, spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */
1804   Kokkos::resize(factors->jU_d, spiluk_handle->get_nnzU());
1805   Kokkos::realloc(factors->aL_d, spiluk_handle->get_nnzL()); /* No need to retain old value */
1806   Kokkos::realloc(factors->aU_d, spiluk_handle->get_nnzU());
1807 
1808   /* TODO: add options to select sptrsv algorithms */
1809   /* Create sptrsv handles for L, U and their transpose */
1810 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
1811   auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SPTRSV_CUSPARSE;
1812 #else
1813   auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1;
1814 #endif
1815 
1816   factors->khL.create_sptrsv_handle(sptrsv_alg, n, true /* L is lower tri */);
1817   factors->khU.create_sptrsv_handle(sptrsv_alg, n, false /* U is not lower tri */);
1818   factors->khLt.create_sptrsv_handle(sptrsv_alg, n, false /* L^T is not lower tri */);
1819   factors->khUt.create_sptrsv_handle(sptrsv_alg, n, true /* U^T is lower tri */);
1820 
1821   /* Fill fields of the factor matrix B */
1822   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(B, MAT_SKIP_ALLOCATION, NULL));
1823   b     = (Mat_SeqAIJ *)B->data;
1824   b->nz = b->maxnz          = spiluk_handle->get_nnzL() + spiluk_handle->get_nnzU();
1825   B->info.fill_ratio_given  = info->fill;
1826   B->info.fill_ratio_needed = nnzA > 0 ? ((PetscReal)b->nz) / ((PetscReal)nnzA) : 1.0;
1827 
1828   B->offloadmask          = PETSC_OFFLOAD_GPU;
1829   B->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJKokkos;
1830   PetscFunctionReturn(PETSC_SUCCESS);
1831 }
1832 
1833 static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos(Mat A, MatSolverType *type)
1834 {
1835   PetscFunctionBegin;
1836   *type = MATSOLVERKOKKOS;
1837   PetscFunctionReturn(PETSC_SUCCESS);
1838 }
1839 
1840 /*MC
1841   MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices
1842   on a single GPU of type, `MATSEQAIJKOKKOS`, `MATAIJKOKKOS`.
1843 
1844   Level: beginner
1845 
1846 .seealso: [](ch_matrices), `Mat`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJKokkos()`, `MATAIJKOKKOS`, `MatKokkosSetFormat()`, `MatKokkosStorageFormat`, `MatKokkosFormatOperation`
1847 M*/
1848 PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A, MatFactorType ftype, Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */
1849 {
1850   PetscInt n = A->rmap->n;
1851 
1852   PetscFunctionBegin;
1853   PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B));
1854   PetscCall(MatSetSizes(*B, n, n, n, n));
1855   (*B)->factortype = ftype;
1856   PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU]));
1857   PetscCall(MatSetType(*B, MATSEQAIJKOKKOS));
1858 
1859   if (ftype == MAT_FACTOR_LU) {
1860     PetscCall(MatSetBlockSizesFromMats(*B, A, A));
1861     (*B)->canuseordering        = PETSC_TRUE;
1862     (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKokkos;
1863   } else if (ftype == MAT_FACTOR_ILU) {
1864     PetscCall(MatSetBlockSizesFromMats(*B, A, A));
1865     (*B)->canuseordering         = PETSC_FALSE;
1866     (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos;
1867   } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]);
1868 
1869   PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL));
1870   PetscCall(PetscObjectComposeFunction((PetscObject)*B, "MatFactorGetSolverType_C", MatFactorGetSolverType_SeqAIJKokkos));
1871   PetscFunctionReturn(PETSC_SUCCESS);
1872 }
1873 
1874 PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_KOKKOS(void)
1875 {
1876   PetscFunctionBegin;
1877   PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_LU, MatGetFactor_SeqAIJKokkos_Kokkos));
1878   PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_ILU, MatGetFactor_SeqAIJKokkos_Kokkos));
1879   PetscFunctionReturn(PETSC_SUCCESS);
1880 }
1881 
1882 /* Utility to print out a KokkosCsrMatrix for debugging */
1883 PETSC_INTERN PetscErrorCode PrintCsrMatrix(const KokkosCsrMatrix &csrmat)
1884 {
1885   const auto        &iv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), csrmat.graph.row_map);
1886   const auto        &jv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), csrmat.graph.entries);
1887   const auto        &av = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), csrmat.values);
1888   const PetscInt    *i  = iv.data();
1889   const PetscInt    *j  = jv.data();
1890   const PetscScalar *a  = av.data();
1891   PetscInt           m = csrmat.numRows(), n = csrmat.numCols(), nnz = csrmat.nnz();
1892 
1893   PetscFunctionBegin;
1894   PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT " x %" PetscInt_FMT " SeqAIJKokkos, with %" PetscInt_FMT " nonzeros\n", m, n, nnz));
1895   for (PetscInt k = 0; k < m; k++) {
1896     PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT ": ", k));
1897     for (PetscInt p = i[k]; p < i[k + 1]; p++) PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT "(%.1f), ", j[p], (double)PetscRealPart(a[p])));
1898     PetscCall(PetscPrintf(PETSC_COMM_SELF, "\n"));
1899   }
1900   PetscFunctionReturn(PETSC_SUCCESS);
1901 }
1902