xref: /petsc/src/mat/impls/aij/seq/kokkos/aijkok.kokkos.cxx (revision e91c04dfc8a52dee1965211bb1cc8e5bf775178f)
1 #include <petsc_kokkos.hpp>
2 #include <petscvec_kokkos.hpp>
3 #include <petscpkg_version.h>
4 #include <petsc/private/petscimpl.h>
5 #include <petsc/private/sfimpl.h>
6 #include <petscsystypes.h>
7 #include <petscerror.h>
8 
9 #include <Kokkos_Core.hpp>
10 #include <KokkosBlas.hpp>
11 #include <KokkosSparse_CrsMatrix.hpp>
12 #include <KokkosSparse_spmv.hpp>
13 #include <KokkosSparse_spiluk.hpp>
14 #include <KokkosSparse_sptrsv.hpp>
15 #include <KokkosSparse_spgemm.hpp>
16 #include <KokkosSparse_spadd.hpp>
17 #include <KokkosBatched_LU_Decl.hpp>
18 #include <KokkosBatched_InverseLU_Decl.hpp>
19 
20 #include <../src/mat/impls/aij/seq/kokkos/aijkok.hpp>
21 
22 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_GE(3, 7, 0)
23   #include <KokkosSparse_Utils.hpp>
24 using KokkosSparse::sort_crs_matrix;
25 using KokkosSparse::Impl::transpose_matrix;
26 #else
27   #include <KokkosKernels_Sorting.hpp>
28 using KokkosKernels::sort_crs_matrix;
29 using KokkosKernels::Impl::transpose_matrix;
30 #endif
31 
32 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */
33 
34 /* MatAssemblyEnd_SeqAIJKokkos() happens when we finalized nonzeros of the matrix, either after
35    we assembled the matrix on host, or after we directly produced the matrix data on device (ex., through MatMatMult).
36    In the latter case, it is important to set a_dual's sync state correctly.
37  */
38 static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A, MatAssemblyType mode)
39 {
40   Mat_SeqAIJ       *aijseq;
41   Mat_SeqAIJKokkos *aijkok;
42 
43   PetscFunctionBegin;
44   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS);
45   PetscCall(MatAssemblyEnd_SeqAIJ(A, mode));
46 
47   aijseq = static_cast<Mat_SeqAIJ *>(A->data);
48   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
49 
50   /* If aijkok does not exist, we just copy i, j to device.
51      If aijkok already exists, but the device's nonzero pattern does not match with the host's, we assume the latest data is on host.
52      In both cases, we build a new aijkok structure.
53   */
54   if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { /* aijkok might not exist yet or nonzero pattern has changed */
55     delete aijkok;
56     aijkok   = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aijseq, A->nonzerostate, PETSC_FALSE /*don't copy mat values to device*/);
57     A->spptr = aijkok;
58   } else if (A->rmap->n && aijkok->diag_dual.extent(0) == 0) { // MatProduct might directly produce AIJ on device, but not the diag.
59     MatRowMapKokkosViewHost diag_h(aijseq->diag, A->rmap->n);
60     auto                    diag_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), diag_h);
61     aijkok->diag_dual              = MatRowMapKokkosDualView(diag_d, diag_h);
62   }
63   PetscFunctionReturn(PETSC_SUCCESS);
64 }
65 
66 /* Sync CSR data to device if not yet */
67 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A)
68 {
69   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
70 
71   PetscFunctionBegin;
72   PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Can't sync factorized matrix from host to device");
73   PetscCheck(aijkok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
74   if (aijkok->a_dual.need_sync_device()) {
75     aijkok->a_dual.sync_device();
76     aijkok->transpose_updated = PETSC_FALSE; /* values of the transpose is out-of-date */
77     aijkok->hermitian_updated = PETSC_FALSE;
78   }
79   PetscFunctionReturn(PETSC_SUCCESS);
80 }
81 
82 /* Mark the CSR data on device as modified */
83 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosModifyDevice(Mat A)
84 {
85   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
86 
87   PetscFunctionBegin;
88   PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Not supported for factorized matries");
89   aijkok->a_dual.clear_sync_state();
90   aijkok->a_dual.modify_device();
91   aijkok->transpose_updated = PETSC_FALSE;
92   aijkok->hermitian_updated = PETSC_FALSE;
93   PetscCall(MatSeqAIJInvalidateDiagonal(A));
94   PetscCall(PetscObjectStateIncrease((PetscObject)A));
95   PetscFunctionReturn(PETSC_SUCCESS);
96 }
97 
98 static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A)
99 {
100   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
101   auto             &exec   = PetscGetKokkosExecutionSpace();
102 
103   PetscFunctionBegin;
104   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
105   /* We do not expect one needs factors on host  */
106   PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Can't sync factorized matrix from device to host");
107   PetscCheck(aijkok, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing AIJKOK");
108   PetscCallCXX(aijkok->a_dual.sync_host(exec));
109   PetscCallCXX(exec.fence());
110   PetscFunctionReturn(PETSC_SUCCESS);
111 }
112 
113 static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A, PetscScalar *array[])
114 {
115   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
116 
117   PetscFunctionBegin;
118   /* aijkok contains valid pointers only if the host's nonzerostate matches with the device's.
119     Calling MatSeqAIJSetPreallocation() or MatSetValues() on host, where aijseq->{i,j,a} might be
120     reallocated, will lead to stale {i,j,a}_dual in aijkok. In both operations, the hosts's nonzerostate
121     must have been updated. The stale aijkok will be rebuilt during MatAssemblyEnd.
122   */
123   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
124     auto &exec = PetscGetKokkosExecutionSpace();
125     PetscCallCXX(aijkok->a_dual.sync_host(exec));
126     PetscCallCXX(exec.fence());
127     *array = aijkok->a_dual.view_host().data();
128   } else { /* Happens when calling MatSetValues on a newly created matrix */
129     *array = static_cast<Mat_SeqAIJ *>(A->data)->a;
130   }
131   PetscFunctionReturn(PETSC_SUCCESS);
132 }
133 
134 static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJKokkos(Mat A, PetscScalar *array[])
135 {
136   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
137 
138   PetscFunctionBegin;
139   if (aijkok && A->nonzerostate == aijkok->nonzerostate) aijkok->a_dual.modify_host();
140   PetscFunctionReturn(PETSC_SUCCESS);
141 }
142 
143 static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[])
144 {
145   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
146 
147   PetscFunctionBegin;
148   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
149     auto &exec = PetscGetKokkosExecutionSpace();
150     PetscCallCXX(aijkok->a_dual.sync_host(exec));
151     PetscCallCXX(exec.fence());
152     *array = aijkok->a_dual.view_host().data();
153   } else {
154     *array = static_cast<Mat_SeqAIJ *>(A->data)->a;
155   }
156   PetscFunctionReturn(PETSC_SUCCESS);
157 }
158 
159 static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[])
160 {
161   PetscFunctionBegin;
162   *array = NULL;
163   PetscFunctionReturn(PETSC_SUCCESS);
164 }
165 
166 static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[])
167 {
168   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
169 
170   PetscFunctionBegin;
171   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
172     *array = aijkok->a_dual.view_host().data();
173   } else { /* Ex. happens with MatZeroEntries on a preallocated but not assembled matrix */
174     *array = static_cast<Mat_SeqAIJ *>(A->data)->a;
175   }
176   PetscFunctionReturn(PETSC_SUCCESS);
177 }
178 
179 static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[])
180 {
181   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
182 
183   PetscFunctionBegin;
184   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
185     aijkok->a_dual.clear_sync_state();
186     aijkok->a_dual.modify_host();
187   }
188   PetscFunctionReturn(PETSC_SUCCESS);
189 }
190 
191 static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJKokkos(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype)
192 {
193   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
194 
195   PetscFunctionBegin;
196   PetscCheck(aijkok != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "aijkok is NULL");
197 
198   if (i) *i = aijkok->i_device_data();
199   if (j) *j = aijkok->j_device_data();
200   if (a) {
201     aijkok->a_dual.sync_device();
202     *a = aijkok->a_device_data();
203   }
204   if (mtype) *mtype = PETSC_MEMTYPE_KOKKOS;
205   PetscFunctionReturn(PETSC_SUCCESS);
206 }
207 
208 /*
209   Generate the sparsity pattern of a MatSeqAIJKokkos matrix's transpose on device.
210 
211   Input Parameter:
212 .  A       - the MATSEQAIJKOKKOS matrix
213 
214   Output Parameters:
215 +  perm_d - the permutation array on device, which connects Ta(i) = Aa(perm(i))
216 -  T_d    - the transpose on device, whose value array is allocated but not initialized
217 */
218 static PetscErrorCode MatSeqAIJKokkosGenerateTransposeStructure(Mat A, MatRowMapKokkosView &perm_d, KokkosCsrMatrix &T_d)
219 {
220   Mat_SeqAIJ             *aseq = static_cast<Mat_SeqAIJ *>(A->data);
221   PetscInt                nz = aseq->nz, m = A->rmap->N, n = A->cmap->n;
222   const PetscInt         *Ai = aseq->i, *Aj = aseq->j;
223   MatRowMapKokkosViewHost Ti_h(NoInit("Ti"), n + 1);
224   MatRowMapType          *Ti = Ti_h.data();
225   MatColIdxKokkosViewHost Tj_h(NoInit("Tj"), nz);
226   MatRowMapKokkosViewHost perm_h(NoInit("permutation"), nz);
227   PetscInt               *Tj   = Tj_h.data();
228   PetscInt               *perm = perm_h.data();
229   PetscInt               *offset;
230 
231   PetscFunctionBegin;
232   // Populate Ti
233   PetscCallCXX(Kokkos::deep_copy(Ti_h, 0));
234   Ti++;
235   for (PetscInt i = 0; i < nz; i++) Ti[Aj[i]]++;
236   Ti--;
237   for (PetscInt i = 0; i < n; i++) Ti[i + 1] += Ti[i];
238 
239   // Populate Tj and the permutation array
240   PetscCall(PetscCalloc1(n, &offset)); // offset in each T row to fill in its column indices
241   for (PetscInt i = 0; i < m; i++) {
242     for (PetscInt j = Ai[i]; j < Ai[i + 1]; j++) { // A's (i,j) is T's (j,i)
243       PetscInt r    = Aj[j];                       // row r of T
244       PetscInt disp = Ti[r] + offset[r];
245 
246       Tj[disp]   = i; // col i of T
247       perm[disp] = j;
248       offset[r]++;
249     }
250   }
251   PetscCall(PetscFree(offset));
252 
253   // Sort each row of T, along with the permutation array
254   for (PetscInt i = 0; i < n; i++) PetscCall(PetscSortIntWithArray(Ti[i + 1] - Ti[i], Tj + Ti[i], perm + Ti[i]));
255 
256   // Output perm and T on device
257   auto Ti_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), Ti_h);
258   auto Tj_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), Tj_h);
259   PetscCallCXX(T_d = KokkosCsrMatrix("csrmatT", n, m, nz, MatScalarKokkosView("Ta", nz), Ti_d, Tj_d));
260   PetscCallCXX(perm_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), perm_h));
261   PetscFunctionReturn(PETSC_SUCCESS);
262 }
263 
264 // Generate the transpose on device and cache it internally
265 // Note: KK transpose_matrix() does not have support symbolic/numeric transpose, so we do it on our own
266 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGenerateTranspose_Private(Mat A, KokkosCsrMatrix *csrmatT)
267 {
268   Mat_SeqAIJ       *aseq = static_cast<Mat_SeqAIJ *>(A->data);
269   Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
270   PetscInt          nz = aseq->nz, m = A->rmap->N, n = A->cmap->n;
271   KokkosCsrMatrix  &T = akok->csrmatT;
272 
273   PetscFunctionBegin;
274   PetscCheck(akok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
275   PetscCallCXX(akok->a_dual.sync_device()); // Sync A's values since we are going to access them on device
276 
277   const auto &Aa = akok->a_dual.view_device();
278 
279   if (A->symmetric == PETSC_BOOL3_TRUE) {
280     *csrmatT = akok->csrmat;
281   } else {
282     // See if we already have a cached transpose and its value is up to date
283     if (T.numRows() == n && T.numCols() == m) {  // this indicates csrmatT had been generated before, otherwise T has 0 rows/cols after construction
284       if (!akok->transpose_updated) {            // if the value is out of date, update the cached version
285         const auto &perm = akok->transpose_perm; // get the permutation array
286         auto       &Ta   = T.values;
287 
288         PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { Ta(i) = Aa(perm(i)); }));
289       }
290     } else { // Generate T of size n x m for the first time
291       MatRowMapKokkosView perm;
292 
293       PetscCall(MatSeqAIJKokkosGenerateTransposeStructure(A, perm, T));
294       akok->transpose_perm = perm; // cache the perm in this matrix for reuse
295       PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { T.values(i) = Aa(perm(i)); }));
296     }
297     akok->transpose_updated = PETSC_TRUE;
298     *csrmatT                = akok->csrmatT;
299   }
300   PetscFunctionReturn(PETSC_SUCCESS);
301 }
302 
303 // Generate the Hermitian on device and cache it internally
304 static PetscErrorCode MatSeqAIJKokkosGenerateHermitian_Private(Mat A, KokkosCsrMatrix *csrmatH)
305 {
306   Mat_SeqAIJ       *aseq = static_cast<Mat_SeqAIJ *>(A->data);
307   Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
308   PetscInt          nz = aseq->nz, m = A->rmap->N, n = A->cmap->n;
309   KokkosCsrMatrix  &T = akok->csrmatH;
310 
311   PetscFunctionBegin;
312   PetscCheck(akok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
313   PetscCallCXX(akok->a_dual.sync_device()); // Sync A's values since we are going to access them on device
314 
315   const auto &Aa = akok->a_dual.view_device();
316 
317   if (A->hermitian == PETSC_BOOL3_TRUE) {
318     *csrmatH = akok->csrmat;
319   } else {
320     // See if we already have a cached hermitian and its value is up to date
321     if (T.numRows() == n && T.numCols() == m) {  // this indicates csrmatT had been generated before, otherwise T has 0 rows/cols after construction
322       if (!akok->hermitian_updated) {            // if the value is out of date, update the cached version
323         const auto &perm = akok->transpose_perm; // get the permutation array
324         auto       &Ta   = T.values;
325 
326         PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { Ta(i) = PetscConj(Aa(perm(i))); }));
327       }
328     } else { // Generate T of size n x m for the first time
329       MatRowMapKokkosView perm;
330 
331       PetscCall(MatSeqAIJKokkosGenerateTransposeStructure(A, perm, T));
332       akok->transpose_perm = perm; // cache the perm in this matrix for reuse
333       PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { T.values(i) = PetscConj(Aa(perm(i))); }));
334     }
335     akok->hermitian_updated = PETSC_TRUE;
336     *csrmatH                = akok->csrmatH;
337   }
338   PetscFunctionReturn(PETSC_SUCCESS);
339 }
340 
341 /* y = A x */
342 static PetscErrorCode MatMult_SeqAIJKokkos(Mat A, Vec xx, Vec yy)
343 {
344   Mat_SeqAIJKokkos          *aijkok;
345   ConstPetscScalarKokkosView xv;
346   PetscScalarKokkosView      yv;
347 
348   PetscFunctionBegin;
349   PetscCall(PetscLogGpuTimeBegin());
350   PetscCall(MatSeqAIJKokkosSyncDevice(A));
351   PetscCall(VecGetKokkosView(xx, &xv));
352   PetscCall(VecGetKokkosViewWrite(yy, &yv));
353   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
354   PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), "N", 1.0 /*alpha*/, aijkok->csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A x + beta y */
355   PetscCall(VecRestoreKokkosView(xx, &xv));
356   PetscCall(VecRestoreKokkosViewWrite(yy, &yv));
357   /* 2.0*nnz - numRows seems more accurate here but assumes there are no zero-rows. So a little sloppy here. */
358   PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz()));
359   PetscCall(PetscLogGpuTimeEnd());
360   PetscFunctionReturn(PETSC_SUCCESS);
361 }
362 
363 /* y = A^T x */
364 static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy)
365 {
366   Mat_SeqAIJKokkos          *aijkok;
367   const char                *mode;
368   ConstPetscScalarKokkosView xv;
369   PetscScalarKokkosView      yv;
370   KokkosCsrMatrix            csrmat;
371 
372   PetscFunctionBegin;
373   PetscCall(PetscLogGpuTimeBegin());
374   PetscCall(MatSeqAIJKokkosSyncDevice(A));
375   PetscCall(VecGetKokkosView(xx, &xv));
376   PetscCall(VecGetKokkosViewWrite(yy, &yv));
377   if (A->form_explicit_transpose) {
378     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat));
379     mode = "N";
380   } else {
381     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
382     csrmat = aijkok->csrmat;
383     mode   = "T";
384   }
385   PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^T x + beta y */
386   PetscCall(VecRestoreKokkosView(xx, &xv));
387   PetscCall(VecRestoreKokkosViewWrite(yy, &yv));
388   PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
389   PetscCall(PetscLogGpuTimeEnd());
390   PetscFunctionReturn(PETSC_SUCCESS);
391 }
392 
393 /* y = A^H x */
394 static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy)
395 {
396   Mat_SeqAIJKokkos          *aijkok;
397   const char                *mode;
398   ConstPetscScalarKokkosView xv;
399   PetscScalarKokkosView      yv;
400   KokkosCsrMatrix            csrmat;
401 
402   PetscFunctionBegin;
403   PetscCall(PetscLogGpuTimeBegin());
404   PetscCall(MatSeqAIJKokkosSyncDevice(A));
405   PetscCall(VecGetKokkosView(xx, &xv));
406   PetscCall(VecGetKokkosViewWrite(yy, &yv));
407   if (A->form_explicit_transpose) {
408     PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat));
409     mode = "N";
410   } else {
411     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
412     csrmat = aijkok->csrmat;
413     mode   = "C";
414   }
415   PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^H x + beta y */
416   PetscCall(VecRestoreKokkosView(xx, &xv));
417   PetscCall(VecRestoreKokkosViewWrite(yy, &yv));
418   PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
419   PetscCall(PetscLogGpuTimeEnd());
420   PetscFunctionReturn(PETSC_SUCCESS);
421 }
422 
423 /* z = A x + y */
424 static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz)
425 {
426   Mat_SeqAIJKokkos          *aijkok;
427   ConstPetscScalarKokkosView xv;
428   PetscScalarKokkosView      zv;
429 
430   PetscFunctionBegin;
431   PetscCall(PetscLogGpuTimeBegin());
432   PetscCall(MatSeqAIJKokkosSyncDevice(A));
433   if (zz != yy) PetscCall(VecCopy(yy, zz)); // depending on yy's sync flags, zz might get its latest data on host
434   PetscCall(VecGetKokkosView(xx, &xv));
435   PetscCall(VecGetKokkosView(zz, &zv)); // do after VecCopy(yy, zz) to get the latest data on device
436   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
437   PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), "N", 1.0 /*alpha*/, aijkok->csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A x + beta z */
438   PetscCall(VecRestoreKokkosView(xx, &xv));
439   PetscCall(VecRestoreKokkosView(zz, &zv));
440   PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz()));
441   PetscCall(PetscLogGpuTimeEnd());
442   PetscFunctionReturn(PETSC_SUCCESS);
443 }
444 
445 /* z = A^T x + y */
446 static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz)
447 {
448   Mat_SeqAIJKokkos          *aijkok;
449   const char                *mode;
450   ConstPetscScalarKokkosView xv;
451   PetscScalarKokkosView      zv;
452   KokkosCsrMatrix            csrmat;
453 
454   PetscFunctionBegin;
455   PetscCall(PetscLogGpuTimeBegin());
456   PetscCall(MatSeqAIJKokkosSyncDevice(A));
457   if (zz != yy) PetscCall(VecCopy(yy, zz));
458   PetscCall(VecGetKokkosView(xx, &xv));
459   PetscCall(VecGetKokkosView(zz, &zv));
460   if (A->form_explicit_transpose) {
461     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat));
462     mode = "N";
463   } else {
464     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
465     csrmat = aijkok->csrmat;
466     mode   = "T";
467   }
468   PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^T x + beta z */
469   PetscCall(VecRestoreKokkosView(xx, &xv));
470   PetscCall(VecRestoreKokkosView(zz, &zv));
471   PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
472   PetscCall(PetscLogGpuTimeEnd());
473   PetscFunctionReturn(PETSC_SUCCESS);
474 }
475 
476 /* z = A^H x + y */
477 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz)
478 {
479   Mat_SeqAIJKokkos          *aijkok;
480   const char                *mode;
481   ConstPetscScalarKokkosView xv;
482   PetscScalarKokkosView      zv;
483   KokkosCsrMatrix            csrmat;
484 
485   PetscFunctionBegin;
486   PetscCall(PetscLogGpuTimeBegin());
487   PetscCall(MatSeqAIJKokkosSyncDevice(A));
488   if (zz != yy) PetscCall(VecCopy(yy, zz));
489   PetscCall(VecGetKokkosView(xx, &xv));
490   PetscCall(VecGetKokkosView(zz, &zv));
491   if (A->form_explicit_transpose) {
492     PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat));
493     mode = "N";
494   } else {
495     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
496     csrmat = aijkok->csrmat;
497     mode   = "C";
498   }
499   PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^H x + beta z */
500   PetscCall(VecRestoreKokkosView(xx, &xv));
501   PetscCall(VecRestoreKokkosView(zz, &zv));
502   PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
503   PetscCall(PetscLogGpuTimeEnd());
504   PetscFunctionReturn(PETSC_SUCCESS);
505 }
506 
507 static PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A, MatOption op, PetscBool flg)
508 {
509   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
510 
511   PetscFunctionBegin;
512   switch (op) {
513   case MAT_FORM_EXPLICIT_TRANSPOSE:
514     /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
515     if (A->form_explicit_transpose && !flg && aijkok) PetscCall(aijkok->DestroyMatTranspose());
516     A->form_explicit_transpose = flg;
517     break;
518   default:
519     PetscCall(MatSetOption_SeqAIJ(A, op, flg));
520     break;
521   }
522   PetscFunctionReturn(PETSC_SUCCESS);
523 }
524 
525 /* Depending on reuse, either build a new mat, or use the existing mat */
526 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat *newmat)
527 {
528   Mat_SeqAIJ *aseq;
529 
530   PetscFunctionBegin;
531   PetscCall(PetscKokkosInitializeCheck());
532   if (reuse == MAT_INITIAL_MATRIX) {                      /* Build a brand new mat */
533     PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat));  /* the returned newmat is a SeqAIJKokkos */
534   } else if (reuse == MAT_REUSE_MATRIX) {                 /* Reuse the mat created before */
535     PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN)); /* newmat is already a SeqAIJKokkos */
536   } else if (reuse == MAT_INPLACE_MATRIX) {               /* newmat is A */
537     PetscCheck(A == *newmat, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "A != *newmat with MAT_INPLACE_MATRIX");
538     PetscCall(PetscFree(A->defaultvectype));
539     PetscCall(PetscStrallocpy(VECKOKKOS, &A->defaultvectype)); /* Allocate and copy the string */
540     PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATSEQAIJKOKKOS));
541     PetscCall(MatSetOps_SeqAIJKokkos(A));
542     aseq = static_cast<Mat_SeqAIJ *>(A->data);
543     if (A->assembled) { /* Copy i, j (but not values) to device for an assembled matrix if not yet */
544       PetscCheck(!A->spptr, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Expect NULL (Mat_SeqAIJKokkos*)A->spptr");
545       A->spptr = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aseq, A->nonzerostate, PETSC_FALSE);
546     }
547   }
548   PetscFunctionReturn(PETSC_SUCCESS);
549 }
550 
551 /* MatDuplicate always creates a new matrix. MatDuplicate can be called either on an assembled matrix or
552    an unassembled matrix, even though MAT_COPY_VALUES is not allowed for unassembled matrix.
553  */
554 static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A, MatDuplicateOption dupOption, Mat *B)
555 {
556   Mat_SeqAIJ       *bseq;
557   Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr), *bkok;
558   Mat               mat;
559 
560   PetscFunctionBegin;
561   /* Do not copy values on host as A's latest values might be on device. We don't want to do sync blindly */
562   PetscCall(MatDuplicate_SeqAIJ(A, MAT_DO_NOT_COPY_VALUES, B));
563   mat = *B;
564   if (A->assembled) {
565     bseq = static_cast<Mat_SeqAIJ *>(mat->data);
566     bkok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, bseq, mat->nonzerostate, PETSC_FALSE);
567     bkok->a_dual.clear_sync_state(); /* Clear B's sync state as it will be decided below */
568     /* Now copy values to B if needed */
569     if (dupOption == MAT_COPY_VALUES) {
570       if (akok->a_dual.need_sync_device()) {
571         Kokkos::deep_copy(bkok->a_dual.view_host(), akok->a_dual.view_host());
572         bkok->a_dual.modify_host();
573       } else { /* If device has the latest data, we only copy data on device */
574         Kokkos::deep_copy(bkok->a_dual.view_device(), akok->a_dual.view_device());
575         bkok->a_dual.modify_device();
576       }
577     } else { /* MAT_DO_NOT_COPY_VALUES or MAT_SHARE_NONZERO_PATTERN. B's values should be zeroed */
578       /* B's values on host should be already zeroed by MatDuplicate_SeqAIJ() */
579       bkok->a_dual.modify_host();
580     }
581     mat->spptr = bkok;
582   }
583 
584   PetscCall(PetscFree(mat->defaultvectype));
585   PetscCall(PetscStrallocpy(VECKOKKOS, &mat->defaultvectype)); /* Allocate and copy the string */
586   PetscCall(PetscObjectChangeTypeName((PetscObject)mat, MATSEQAIJKOKKOS));
587   PetscCall(MatSetOps_SeqAIJKokkos(mat));
588   PetscFunctionReturn(PETSC_SUCCESS);
589 }
590 
591 static PetscErrorCode MatTranspose_SeqAIJKokkos(Mat A, MatReuse reuse, Mat *B)
592 {
593   Mat               At;
594   KokkosCsrMatrix   internT;
595   Mat_SeqAIJKokkos *atkok, *bkok;
596 
597   PetscFunctionBegin;
598   if (reuse == MAT_REUSE_MATRIX) PetscCall(MatTransposeCheckNonzeroState_Private(A, *B));
599   PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &internT)); /* Generate a transpose internally */
600   if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_INPLACE_MATRIX) {
601     /* Deep copy internT, as we want to isolate the internal transpose */
602     PetscCallCXX(atkok = new Mat_SeqAIJKokkos(KokkosCsrMatrix("csrmat", internT)));
603     PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PetscObjectComm((PetscObject)A), atkok, &At));
604     if (reuse == MAT_INITIAL_MATRIX) *B = At;
605     else PetscCall(MatHeaderReplace(A, &At)); /* Replace A with At inplace */
606   } else {                                    /* MAT_REUSE_MATRIX, just need to copy values to B on device */
607     if ((*B)->assembled) {
608       bkok = static_cast<Mat_SeqAIJKokkos *>((*B)->spptr);
609       PetscCallCXX(Kokkos::deep_copy(bkok->a_dual.view_device(), internT.values));
610       PetscCall(MatSeqAIJKokkosModifyDevice(*B));
611     } else if ((*B)->preallocated) { /* It is ok for B to be only preallocated, as needed in MatTranspose_MPIAIJ */
612       Mat_SeqAIJ             *bseq = static_cast<Mat_SeqAIJ *>((*B)->data);
613       MatScalarKokkosViewHost a_h(bseq->a, internT.nnz()); /* bseq->nz = 0 if unassembled */
614       MatColIdxKokkosViewHost j_h(bseq->j, internT.nnz());
615       PetscCallCXX(Kokkos::deep_copy(a_h, internT.values));
616       PetscCallCXX(Kokkos::deep_copy(j_h, internT.graph.entries));
617     } else SETERRQ(PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "B must be assembled or preallocated");
618   }
619   PetscFunctionReturn(PETSC_SUCCESS);
620 }
621 
622 static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A)
623 {
624   Mat_SeqAIJKokkos *aijkok;
625 
626   PetscFunctionBegin;
627   if (A->factortype == MAT_FACTOR_NONE) {
628     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
629     delete aijkok;
630   } else {
631     delete static_cast<Mat_SeqAIJKokkosTriFactors *>(A->spptr);
632   }
633   A->spptr = NULL;
634   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatFactorGetSolverType_C", NULL));
635   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
636   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
637 #if defined(PETSC_HAVE_HYPRE)
638   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijkokkos_hypre_C", NULL));
639 #endif
640   PetscCall(MatDestroy_SeqAIJ(A));
641   PetscFunctionReturn(PETSC_SUCCESS);
642 }
643 
644 /*MC
645    MATSEQAIJKOKKOS - MATAIJKOKKOS = "(seq)aijkokkos" - A matrix type to be used for sparse matrices with Kokkos
646 
647    A matrix type using Kokkos-Kernels CrsMatrix type for portability across different device types
648 
649    Options Database Key:
650 .  -mat_type aijkokkos - sets the matrix type to `MATSEQAIJKOKKOS` during a call to `MatSetFromOptions()`
651 
652   Level: beginner
653 
654 .seealso: [](ch_matrices), `Mat`, `MatCreateSeqAIJKokkos()`, `MATMPIAIJKOKKOS`
655 M*/
656 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A)
657 {
658   PetscFunctionBegin;
659   PetscCall(PetscKokkosInitializeCheck());
660   PetscCall(MatCreate_SeqAIJ(A));
661   PetscCall(MatConvert_SeqAIJ_SeqAIJKokkos(A, MATSEQAIJKOKKOS, MAT_INPLACE_MATRIX, &A));
662   PetscFunctionReturn(PETSC_SUCCESS);
663 }
664 
665 /* Merge A, B into a matrix C. A is put before B. C's size would be A->rmap->n by (A->cmap->n + B->cmap->n) */
666 PetscErrorCode MatSeqAIJKokkosMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C)
667 {
668   Mat_SeqAIJ         *a, *b;
669   Mat_SeqAIJKokkos   *akok, *bkok, *ckok;
670   MatScalarKokkosView aa, ba, ca;
671   MatRowMapKokkosView ai, bi, ci;
672   MatColIdxKokkosView aj, bj, cj;
673   PetscInt            m, n, nnz, aN;
674 
675   PetscFunctionBegin;
676   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
677   PetscValidHeaderSpecific(B, MAT_CLASSID, 2);
678   PetscAssertPointer(C, 4);
679   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
680   PetscCheckTypeName(B, MATSEQAIJKOKKOS);
681   PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n);
682   PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported");
683 
684   PetscCall(MatSeqAIJKokkosSyncDevice(A));
685   PetscCall(MatSeqAIJKokkosSyncDevice(B));
686   a    = static_cast<Mat_SeqAIJ *>(A->data);
687   b    = static_cast<Mat_SeqAIJ *>(B->data);
688   akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
689   bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
690   aa   = akok->a_dual.view_device();
691   ai   = akok->i_dual.view_device();
692   ba   = bkok->a_dual.view_device();
693   bi   = bkok->i_dual.view_device();
694   m    = A->rmap->n; /* M, N and nnz of C */
695   n    = A->cmap->n + B->cmap->n;
696   nnz  = a->nz + b->nz;
697   aN   = A->cmap->n; /* N of A */
698   if (reuse == MAT_INITIAL_MATRIX) {
699     aj           = akok->j_dual.view_device();
700     bj           = bkok->j_dual.view_device();
701     auto ca_dual = MatScalarKokkosDualView("a", aa.extent(0) + ba.extent(0));
702     auto ci_dual = MatRowMapKokkosDualView("i", ai.extent(0));
703     auto cj_dual = MatColIdxKokkosDualView("j", aj.extent(0) + bj.extent(0));
704     ca           = ca_dual.view_device();
705     ci           = ci_dual.view_device();
706     cj           = cj_dual.view_device();
707 
708     /* Concatenate A and B in parallel using Kokkos hierarchical parallelism */
709     Kokkos::parallel_for(
710       Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
711         PetscInt i       = t.league_rank(); /* row i */
712         PetscInt coffset = ai(i) + bi(i), alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i);
713 
714         Kokkos::single(Kokkos::PerTeam(t), [=]() { /* this side effect only happens once per whole team */
715                                                    ci(i) = coffset;
716                                                    if (i == m - 1) ci(m) = ai(m) + bi(m);
717         });
718 
719         Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) {
720           if (k < alen) {
721             ca(coffset + k) = aa(ai(i) + k);
722             cj(coffset + k) = aj(ai(i) + k);
723           } else {
724             ca(coffset + k) = ba(bi(i) + k - alen);
725             cj(coffset + k) = bj(bi(i) + k - alen) + aN; /* Entries in B get new column indices in C */
726           }
727         });
728       });
729     ca_dual.modify_device();
730     ci_dual.modify_device();
731     cj_dual.modify_device();
732     PetscCallCXX(ckok = new Mat_SeqAIJKokkos(m, n, nnz, ci_dual, cj_dual, ca_dual));
733     PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, ckok, C));
734   } else if (reuse == MAT_REUSE_MATRIX) {
735     PetscValidHeaderSpecific(*C, MAT_CLASSID, 4);
736     PetscCheckTypeName(*C, MATSEQAIJKOKKOS);
737     ckok = static_cast<Mat_SeqAIJKokkos *>((*C)->spptr);
738     ca   = ckok->a_dual.view_device();
739     ci   = ckok->i_dual.view_device();
740 
741     Kokkos::parallel_for(
742       Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
743         PetscInt i    = t.league_rank(); /* row i */
744         PetscInt alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i);
745         Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) {
746           if (k < alen) ca(ci(i) + k) = aa(ai(i) + k);
747           else ca(ci(i) + k) = ba(bi(i) + k - alen);
748         });
749       });
750     PetscCall(MatSeqAIJKokkosModifyDevice(*C));
751   }
752   PetscFunctionReturn(PETSC_SUCCESS);
753 }
754 
755 static PetscErrorCode MatProductDataDestroy_SeqAIJKokkos(void *pdata)
756 {
757   PetscFunctionBegin;
758   delete static_cast<MatProductData_SeqAIJKokkos *>(pdata);
759   PetscFunctionReturn(PETSC_SUCCESS);
760 }
761 
762 static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C)
763 {
764   Mat_Product                 *product = C->product;
765   Mat                          A, B;
766   bool                         transA, transB; /* use bool, since KK needs this type */
767   Mat_SeqAIJKokkos            *akok, *bkok, *ckok;
768   Mat_SeqAIJ                  *c;
769   MatProductData_SeqAIJKokkos *pdata;
770   KokkosCsrMatrix              csrmatA, csrmatB;
771 
772   PetscFunctionBegin;
773   MatCheckProduct(C, 1);
774   PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Product data empty");
775   pdata = static_cast<MatProductData_SeqAIJKokkos *>(C->product->data);
776 
777   // See if numeric has already been done in symbolic (e.g., user calls MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C)).
778   // If yes, skip the numeric, but reset the flag so that next time when user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C),
779   // we still do numeric.
780   if (pdata->reusesym) { // numeric reuses results from symbolic
781     pdata->reusesym = PETSC_FALSE;
782     PetscFunctionReturn(PETSC_SUCCESS);
783   }
784 
785   switch (product->type) {
786   case MATPRODUCT_AB:
787     transA = false;
788     transB = false;
789     break;
790   case MATPRODUCT_AtB:
791     transA = true;
792     transB = false;
793     break;
794   case MATPRODUCT_ABt:
795     transA = false;
796     transB = true;
797     break;
798   default:
799     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]);
800   }
801 
802   A = product->A;
803   B = product->B;
804   PetscCall(MatSeqAIJKokkosSyncDevice(A));
805   PetscCall(MatSeqAIJKokkosSyncDevice(B));
806   akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
807   bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
808   ckok = static_cast<Mat_SeqAIJKokkos *>(C->spptr);
809 
810   PetscCheck(ckok, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Device data structure spptr is empty");
811 
812   csrmatA = akok->csrmat;
813   csrmatB = bkok->csrmat;
814 
815   /* TODO: Once KK spgemm implements transpose, we can get rid of the explicit transpose here */
816   if (transA) {
817     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA));
818     transA = false;
819   }
820 
821   if (transB) {
822     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB));
823     transB = false;
824   }
825   PetscCall(PetscLogGpuTimeBegin());
826   PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, csrmatA, transA, csrmatB, transB, ckok->csrmat));
827 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(4, 0, 0)
828   auto spgemmHandle = pdata->kh.get_spgemm_handle();
829   if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(ckok->csrmat)); /* without sort, mat_tests-ex62_14_seqaijkokkos fails */
830 #endif
831 
832   PetscCall(PetscLogGpuTimeEnd());
833   PetscCall(MatSeqAIJKokkosModifyDevice(C));
834   /* shorter version of MatAssemblyEnd_SeqAIJ */
835   c = (Mat_SeqAIJ *)C->data;
836   PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz));
837   PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n"));
838   PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax));
839   c->reallocs         = 0;
840   C->info.mallocs     = 0;
841   C->info.nz_unneeded = 0;
842   C->assembled = C->was_assembled = PETSC_TRUE;
843   C->num_ass++;
844   PetscFunctionReturn(PETSC_SUCCESS);
845 }
846 
847 static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C)
848 {
849   Mat_Product                 *product = C->product;
850   MatProductType               ptype;
851   Mat                          A, B;
852   bool                         transA, transB;
853   Mat_SeqAIJKokkos            *akok, *bkok, *ckok;
854   MatProductData_SeqAIJKokkos *pdata;
855   MPI_Comm                     comm;
856   KokkosCsrMatrix              csrmatA, csrmatB, csrmatC;
857 
858   PetscFunctionBegin;
859   MatCheckProduct(C, 1);
860   PetscCall(PetscObjectGetComm((PetscObject)C, &comm));
861   PetscCheck(!product->data, comm, PETSC_ERR_PLIB, "Product data not empty");
862   A = product->A;
863   B = product->B;
864   PetscCall(MatSeqAIJKokkosSyncDevice(A));
865   PetscCall(MatSeqAIJKokkosSyncDevice(B));
866   akok    = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
867   bkok    = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
868   csrmatA = akok->csrmat;
869   csrmatB = bkok->csrmat;
870 
871   ptype = product->type;
872   // Take advantage of the symmetry if true
873   if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
874     ptype                                          = MATPRODUCT_AB;
875     product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE;
876   }
877   if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
878     ptype                                          = MATPRODUCT_AB;
879     product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE;
880   }
881 
882   switch (ptype) {
883   case MATPRODUCT_AB:
884     transA = false;
885     transB = false;
886     break;
887   case MATPRODUCT_AtB:
888     transA = true;
889     transB = false;
890     break;
891   case MATPRODUCT_ABt:
892     transA = false;
893     transB = true;
894     break;
895   default:
896     SETERRQ(comm, PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]);
897   }
898   PetscCallCXX(product->data = pdata = new MatProductData_SeqAIJKokkos());
899   pdata->reusesym = product->api_user;
900 
901   /* TODO: add command line options to select spgemm algorithms */
902   auto spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_DEFAULT; /* default alg is TPL if enabled, otherwise KK */
903 
904   /* CUDA-10.2's spgemm has bugs. We prefer the SpGEMMreuse APIs introduced in cuda-11.4 */
905 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
906   #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
907   spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_KK;
908   #endif
909 #endif
910   PetscCallCXX(pdata->kh.create_spgemm_handle(spgemm_alg));
911 
912   PetscCall(PetscLogGpuTimeBegin());
913   /* TODO: Get rid of the explicit transpose once KK-spgemm implements the transpose option */
914   if (transA) {
915     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA));
916     transA = false;
917   }
918 
919   if (transB) {
920     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB));
921     transB = false;
922   }
923 
924   PetscCallCXX(KokkosSparse::spgemm_symbolic(pdata->kh, csrmatA, transA, csrmatB, transB, csrmatC));
925   /* spgemm_symbolic() only populates C's rowmap, but not C's column indices.
926     So we have to do a fake spgemm_numeric() here to get csrmatC.j_d setup, before
927     calling new Mat_SeqAIJKokkos().
928     TODO: Remove the fake spgemm_numeric() after KK fixed this problem.
929   */
930   PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, csrmatA, transA, csrmatB, transB, csrmatC));
931 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(4, 0, 0)
932   /* Query if KK outputs a sorted matrix. If not, we need to sort it */
933   auto spgemmHandle = pdata->kh.get_spgemm_handle();
934   if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(csrmatC)); /* sort_option defaults to -1 in KK!*/
935 #endif
936   PetscCall(PetscLogGpuTimeEnd());
937 
938   PetscCallCXX(ckok = new Mat_SeqAIJKokkos(csrmatC));
939   PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(C, ckok));
940   C->product->destroy = MatProductDataDestroy_SeqAIJKokkos;
941   PetscFunctionReturn(PETSC_SUCCESS);
942 }
943 
944 /* handles sparse matrix matrix ops */
945 static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat)
946 {
947   Mat_Product *product = mat->product;
948   PetscBool    Biskok = PETSC_FALSE, Ciskok = PETSC_TRUE;
949 
950   PetscFunctionBegin;
951   MatCheckProduct(mat, 1);
952   PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJKOKKOS, &Biskok));
953   if (product->type == MATPRODUCT_ABC) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJKOKKOS, &Ciskok));
954   if (Biskok && Ciskok) {
955     switch (product->type) {
956     case MATPRODUCT_AB:
957     case MATPRODUCT_AtB:
958     case MATPRODUCT_ABt:
959       mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos;
960       break;
961     case MATPRODUCT_PtAP:
962     case MATPRODUCT_RARt:
963     case MATPRODUCT_ABC:
964       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
965       break;
966     default:
967       break;
968     }
969   } else { /* fallback for AIJ */
970     PetscCall(MatProductSetFromOptions_SeqAIJ(mat));
971   }
972   PetscFunctionReturn(PETSC_SUCCESS);
973 }
974 
975 static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a)
976 {
977   Mat_SeqAIJKokkos *aijkok;
978 
979   PetscFunctionBegin;
980   PetscCall(PetscLogGpuTimeBegin());
981   PetscCall(MatSeqAIJKokkosSyncDevice(A));
982   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
983   KokkosBlas::scal(PetscGetKokkosExecutionSpace(), aijkok->a_dual.view_device(), a, aijkok->a_dual.view_device());
984   PetscCall(MatSeqAIJKokkosModifyDevice(A));
985   PetscCall(PetscLogGpuFlops(aijkok->a_dual.extent(0)));
986   PetscCall(PetscLogGpuTimeEnd());
987   PetscFunctionReturn(PETSC_SUCCESS);
988 }
989 
990 // add a to A's diagonal (if A is square) or main diagonal (if A is rectangular)
991 static PetscErrorCode MatShift_SeqAIJKokkos(Mat A, PetscScalar a)
992 {
993   Mat_SeqAIJ *aijseq = static_cast<Mat_SeqAIJ *>(A->data);
994 
995   PetscFunctionBegin;
996   if (A->assembled && aijseq->diagonaldense) { // no missing diagonals
997     PetscInt n = PetscMin(A->rmap->n, A->cmap->n);
998 
999     PetscCall(PetscLogGpuTimeBegin());
1000     PetscCall(MatSeqAIJKokkosSyncDevice(A));
1001     const auto  aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1002     const auto &Aa     = aijkok->a_dual.view_device();
1003     const auto &Adiag  = aijkok->diag_dual.view_device();
1004     PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) { Aa(Adiag(i)) += a; }));
1005     PetscCall(MatSeqAIJKokkosModifyDevice(A));
1006     PetscCall(PetscLogGpuFlops(n));
1007     PetscCall(PetscLogGpuTimeEnd());
1008   } else { // need reassembly, very slow!
1009     PetscCall(MatShift_Basic(A, a));
1010   }
1011   PetscFunctionReturn(PETSC_SUCCESS);
1012 }
1013 
1014 static PetscErrorCode MatDiagonalSet_SeqAIJKokkos(Mat Y, Vec D, InsertMode is)
1015 {
1016   Mat_SeqAIJ *aijseq = static_cast<Mat_SeqAIJ *>(Y->data);
1017 
1018   PetscFunctionBegin;
1019   if (Y->assembled && aijseq->diagonaldense) { // no missing diagonals
1020     ConstPetscScalarKokkosView dv;
1021     PetscInt                   n, nv;
1022 
1023     PetscCall(PetscLogGpuTimeBegin());
1024     PetscCall(MatSeqAIJKokkosSyncDevice(Y));
1025     PetscCall(VecGetKokkosView(D, &dv));
1026     PetscCall(VecGetLocalSize(D, &nv));
1027     n = PetscMin(Y->rmap->n, Y->cmap->n);
1028     PetscCheck(n == nv, PetscObjectComm((PetscObject)Y), PETSC_ERR_ARG_SIZ, "Matrix size and vector size do not match");
1029 
1030     const auto  aijkok = static_cast<Mat_SeqAIJKokkos *>(Y->spptr);
1031     const auto &Aa     = aijkok->a_dual.view_device();
1032     const auto &Adiag  = aijkok->diag_dual.view_device();
1033     PetscCallCXX(Kokkos::parallel_for(
1034       Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) {
1035         if (is == INSERT_VALUES) Aa(Adiag(i)) = dv(i);
1036         else Aa(Adiag(i)) += dv(i);
1037       }));
1038     PetscCall(VecRestoreKokkosView(D, &dv));
1039     PetscCall(MatSeqAIJKokkosModifyDevice(Y));
1040     PetscCall(PetscLogGpuFlops(n));
1041     PetscCall(PetscLogGpuTimeEnd());
1042   } else { // need reassembly, very slow!
1043     PetscCall(MatDiagonalSet_Default(Y, D, is));
1044   }
1045   PetscFunctionReturn(PETSC_SUCCESS);
1046 }
1047 
1048 static PetscErrorCode MatDiagonalScale_SeqAIJKokkos(Mat A, Vec ll, Vec rr)
1049 {
1050   Mat_SeqAIJ                *aijseq = static_cast<Mat_SeqAIJ *>(A->data);
1051   PetscInt                   m = A->rmap->n, n = A->cmap->n, nz = aijseq->nz;
1052   ConstPetscScalarKokkosView lv, rv;
1053 
1054   PetscFunctionBegin;
1055   PetscCall(PetscLogGpuTimeBegin());
1056   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1057   const auto  aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1058   const auto &Aa     = aijkok->a_dual.view_device();
1059   const auto &Ai     = aijkok->i_dual.view_device();
1060   const auto &Aj     = aijkok->j_dual.view_device();
1061   if (ll) {
1062     PetscCall(VecGetLocalSize(ll, &m));
1063     PetscCheck(m == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling vector wrong length");
1064     PetscCall(VecGetKokkosView(ll, &lv));
1065     PetscCallCXX(Kokkos::parallel_for( // for each row
1066       Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
1067         PetscInt i   = t.league_rank(); // row i
1068         PetscInt len = Ai(i + 1) - Ai(i);
1069         // scale entries on the row
1070         Kokkos::parallel_for(Kokkos::TeamThreadRange(t, len), [&](PetscInt j) { Aa(Ai(i) + j) *= lv(i); });
1071       }));
1072     PetscCall(VecRestoreKokkosView(ll, &lv));
1073     PetscCall(PetscLogGpuFlops(nz));
1074   }
1075   if (rr) {
1076     PetscCall(VecGetLocalSize(rr, &n));
1077     PetscCheck(n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Right scaling vector wrong length");
1078     PetscCall(VecGetKokkosView(rr, &rv));
1079     PetscCallCXX(Kokkos::parallel_for( // for each nonzero
1080       Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt k) { Aa(k) *= rv(Aj(k)); }));
1081     PetscCall(VecRestoreKokkosView(rr, &lv));
1082     PetscCall(PetscLogGpuFlops(nz));
1083   }
1084   PetscCall(MatSeqAIJKokkosModifyDevice(A));
1085   PetscCall(PetscLogGpuTimeEnd());
1086   PetscFunctionReturn(PETSC_SUCCESS);
1087 }
1088 
1089 static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A)
1090 {
1091   Mat_SeqAIJKokkos *aijkok;
1092 
1093   PetscFunctionBegin;
1094   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1095   if (aijkok) { /* Only zero the device if data is already there */
1096     KokkosBlas::fill(PetscGetKokkosExecutionSpace(), aijkok->a_dual.view_device(), 0.0);
1097     PetscCall(MatSeqAIJKokkosModifyDevice(A));
1098   } else { /* Might be preallocated but not assembled */
1099     PetscCall(MatZeroEntries_SeqAIJ(A));
1100   }
1101   PetscFunctionReturn(PETSC_SUCCESS);
1102 }
1103 
1104 static PetscErrorCode MatGetDiagonal_SeqAIJKokkos(Mat A, Vec x)
1105 {
1106   Mat_SeqAIJKokkos     *aijkok;
1107   PetscInt              n;
1108   PetscScalarKokkosView xv;
1109 
1110   PetscFunctionBegin;
1111   PetscCall(VecGetLocalSize(x, &n));
1112   PetscCheck(n == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector");
1113   PetscCheck(A->factortype == MAT_FACTOR_NONE, PETSC_COMM_SELF, PETSC_ERR_SUP, "MatGetDiagonal_SeqAIJKokkos not supported on factored matrices");
1114 
1115   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1116   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1117 
1118   const auto &Aa    = aijkok->a_dual.view_device();
1119   const auto &Ai    = aijkok->i_dual.view_device();
1120   const auto &Adiag = aijkok->diag_dual.view_device();
1121 
1122   PetscCall(VecGetKokkosViewWrite(x, &xv));
1123   Kokkos::parallel_for(
1124     Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) {
1125       if (Adiag(i) < Ai(i + 1)) xv(i) = Aa(Adiag(i));
1126       else xv(i) = 0;
1127     });
1128   PetscCall(VecRestoreKokkosViewWrite(x, &xv));
1129   PetscFunctionReturn(PETSC_SUCCESS);
1130 }
1131 
1132 /* Get a Kokkos View from a mat of type MatSeqAIJKokkos */
1133 PetscErrorCode MatSeqAIJGetKokkosView(Mat A, ConstMatScalarKokkosView *kv)
1134 {
1135   Mat_SeqAIJKokkos *aijkok;
1136 
1137   PetscFunctionBegin;
1138   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1139   PetscAssertPointer(kv, 2);
1140   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1141   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1142   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1143   *kv    = aijkok->a_dual.view_device();
1144   PetscFunctionReturn(PETSC_SUCCESS);
1145 }
1146 
1147 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, ConstMatScalarKokkosView *kv)
1148 {
1149   PetscFunctionBegin;
1150   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1151   PetscAssertPointer(kv, 2);
1152   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1153   PetscFunctionReturn(PETSC_SUCCESS);
1154 }
1155 
1156 PetscErrorCode MatSeqAIJGetKokkosView(Mat A, MatScalarKokkosView *kv)
1157 {
1158   Mat_SeqAIJKokkos *aijkok;
1159 
1160   PetscFunctionBegin;
1161   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1162   PetscAssertPointer(kv, 2);
1163   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1164   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1165   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1166   *kv    = aijkok->a_dual.view_device();
1167   PetscFunctionReturn(PETSC_SUCCESS);
1168 }
1169 
1170 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, MatScalarKokkosView *kv)
1171 {
1172   PetscFunctionBegin;
1173   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1174   PetscAssertPointer(kv, 2);
1175   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1176   PetscCall(MatSeqAIJKokkosModifyDevice(A));
1177   PetscFunctionReturn(PETSC_SUCCESS);
1178 }
1179 
1180 PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A, MatScalarKokkosView *kv)
1181 {
1182   Mat_SeqAIJKokkos *aijkok;
1183 
1184   PetscFunctionBegin;
1185   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1186   PetscAssertPointer(kv, 2);
1187   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1188   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1189   *kv    = aijkok->a_dual.view_device();
1190   PetscFunctionReturn(PETSC_SUCCESS);
1191 }
1192 
1193 PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A, MatScalarKokkosView *kv)
1194 {
1195   PetscFunctionBegin;
1196   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1197   PetscAssertPointer(kv, 2);
1198   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1199   PetscCall(MatSeqAIJKokkosModifyDevice(A));
1200   PetscFunctionReturn(PETSC_SUCCESS);
1201 }
1202 
1203 /* Computes Y += alpha X */
1204 static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y, PetscScalar alpha, Mat X, MatStructure pattern)
1205 {
1206   Mat_SeqAIJ              *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data;
1207   Mat_SeqAIJKokkos        *xkok, *ykok, *zkok;
1208   ConstMatScalarKokkosView Xa;
1209   MatScalarKokkosView      Ya;
1210   auto                    &exec = PetscGetKokkosExecutionSpace();
1211 
1212   PetscFunctionBegin;
1213   PetscCheckTypeName(Y, MATSEQAIJKOKKOS);
1214   PetscCheckTypeName(X, MATSEQAIJKOKKOS);
1215   PetscCall(MatSeqAIJKokkosSyncDevice(Y));
1216   PetscCall(MatSeqAIJKokkosSyncDevice(X));
1217   PetscCall(PetscLogGpuTimeBegin());
1218 
1219   if (pattern != SAME_NONZERO_PATTERN && x->nz == y->nz) {
1220     PetscBool e;
1221     PetscCall(PetscArraycmp(x->i, y->i, Y->rmap->n + 1, &e));
1222     if (e) {
1223       PetscCall(PetscArraycmp(x->j, y->j, y->nz, &e));
1224       if (e) pattern = SAME_NONZERO_PATTERN;
1225     }
1226   }
1227 
1228   /* cusparseDcsrgeam2() computes C = alpha A + beta B. If one knew sparsity pattern of C, one can skip
1229     cusparseScsrgeam2_bufferSizeExt() / cusparseXcsrgeam2Nnz(), and directly call cusparseScsrgeam2().
1230     If X is SUBSET_NONZERO_PATTERN of Y, we could take advantage of this cusparse feature. However,
1231     KokkosSparse::spadd(alpha,A,beta,B,C) has symbolic and numeric phases, MatAXPY does not.
1232   */
1233   ykok = static_cast<Mat_SeqAIJKokkos *>(Y->spptr);
1234   xkok = static_cast<Mat_SeqAIJKokkos *>(X->spptr);
1235   Xa   = xkok->a_dual.view_device();
1236   Ya   = ykok->a_dual.view_device();
1237 
1238   if (pattern == SAME_NONZERO_PATTERN) {
1239     KokkosBlas::axpy(exec, alpha, Xa, Ya);
1240     PetscCall(MatSeqAIJKokkosModifyDevice(Y));
1241   } else if (pattern == SUBSET_NONZERO_PATTERN) {
1242     MatRowMapKokkosView Xi = xkok->i_dual.view_device(), Yi = ykok->i_dual.view_device();
1243     MatColIdxKokkosView Xj = xkok->j_dual.view_device(), Yj = ykok->j_dual.view_device();
1244 
1245     Kokkos::parallel_for(
1246       Kokkos::TeamPolicy<>(exec, Y->rmap->n, 1), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
1247         PetscInt i = t.league_rank(); // row i
1248         Kokkos::single(Kokkos::PerTeam(t), [=]() {
1249           // Only one thread works in a team
1250           PetscInt p, q = Yi(i);
1251           for (p = Xi(i); p < Xi(i + 1); p++) {          // For each nonzero on row i of X,
1252             while (Xj(p) != Yj(q) && q < Yi(i + 1)) q++; // find the matching nonzero on row i of Y.
1253             if (Xj(p) == Yj(q)) {                        // Found it
1254               Ya(q) += alpha * Xa(p);
1255               q++;
1256             } else {
1257             // If not found, it indicates the input is wrong (X is not a SUBSET_NONZERO_PATTERN of Y).
1258             // Just insert a NaN at the beginning of row i if it is not empty, to make the result wrong.
1259 #if PETSC_PKG_KOKKOS_VERSION_GE(3, 7, 0)
1260               if (Yi(i) != Yi(i + 1)) Ya(Yi(i)) = Kokkos::ArithTraits<PetscScalar>::nan();
1261 #else
1262               if (Yi(i) != Yi(i + 1)) Ya(Yi(i)) = Kokkos::Experimental::nan("1");
1263 #endif
1264             }
1265           }
1266         });
1267       });
1268     PetscCall(MatSeqAIJKokkosModifyDevice(Y));
1269   } else { // different nonzero patterns
1270     Mat             Z;
1271     KokkosCsrMatrix zcsr;
1272     KernelHandle    kh;
1273     kh.create_spadd_handle(true); // X, Y are sorted
1274     KokkosSparse::spadd_symbolic(&kh, xkok->csrmat, ykok->csrmat, zcsr);
1275     KokkosSparse::spadd_numeric(&kh, alpha, xkok->csrmat, (PetscScalar)1.0, ykok->csrmat, zcsr);
1276     zkok = new Mat_SeqAIJKokkos(zcsr);
1277     PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, zkok, &Z));
1278     PetscCall(MatHeaderReplace(Y, &Z));
1279     kh.destroy_spadd_handle();
1280   }
1281   PetscCall(PetscLogGpuTimeEnd());
1282   PetscCall(PetscLogGpuFlops(xkok->a_dual.extent(0) * 2)); // Because we scaled X and then added it to Y
1283   PetscFunctionReturn(PETSC_SUCCESS);
1284 }
1285 
1286 struct MatCOOStruct_SeqAIJKokkos {
1287   PetscCount           n;
1288   PetscCount           Atot;
1289   PetscInt             nz;
1290   PetscCountKokkosView jmap;
1291   PetscCountKokkosView perm;
1292 
1293   MatCOOStruct_SeqAIJKokkos(const MatCOOStruct_SeqAIJ *coo_h)
1294   {
1295     nz   = coo_h->nz;
1296     n    = coo_h->n;
1297     Atot = coo_h->Atot;
1298     jmap = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), PetscCountKokkosViewHost(coo_h->jmap, nz + 1));
1299     perm = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), PetscCountKokkosViewHost(coo_h->perm, Atot));
1300   }
1301 };
1302 
1303 static PetscErrorCode MatCOOStructDestroy_SeqAIJKokkos(void **data)
1304 {
1305   PetscFunctionBegin;
1306   PetscCallCXX(delete static_cast<MatCOOStruct_SeqAIJKokkos *>(*data));
1307   PetscFunctionReturn(PETSC_SUCCESS);
1308 }
1309 
1310 static PetscErrorCode MatSetPreallocationCOO_SeqAIJKokkos(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
1311 {
1312   Mat_SeqAIJKokkos          *akok;
1313   Mat_SeqAIJ                *aseq;
1314   PetscContainer             container_h;
1315   MatCOOStruct_SeqAIJ       *coo_h;
1316   MatCOOStruct_SeqAIJKokkos *coo_d;
1317 
1318   PetscFunctionBegin;
1319   PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, coo_i, coo_j));
1320   aseq = static_cast<Mat_SeqAIJ *>(mat->data);
1321   akok = static_cast<Mat_SeqAIJKokkos *>(mat->spptr);
1322   delete akok;
1323   mat->spptr = akok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, aseq, mat->nonzerostate + 1, PETSC_FALSE);
1324   PetscCall(MatZeroEntries_SeqAIJKokkos(mat));
1325 
1326   // Copy the COO struct to device
1327   PetscCall(PetscObjectQuery((PetscObject)mat, "__PETSc_MatCOOStruct_Host", (PetscObject *)&container_h));
1328   PetscCall(PetscContainerGetPointer(container_h, (void **)&coo_h));
1329   PetscCallCXX(coo_d = new MatCOOStruct_SeqAIJKokkos(coo_h));
1330 
1331   // Put the COO struct in a container and then attach that to the matrix
1332   PetscCall(PetscObjectContainerCompose((PetscObject)mat, "__PETSc_MatCOOStruct_Device", coo_d, MatCOOStructDestroy_SeqAIJKokkos));
1333   PetscFunctionReturn(PETSC_SUCCESS);
1334 }
1335 
1336 static PetscErrorCode MatSetValuesCOO_SeqAIJKokkos(Mat A, const PetscScalar v[], InsertMode imode)
1337 {
1338   MatScalarKokkosView        Aa;
1339   ConstMatScalarKokkosView   kv;
1340   PetscMemType               memtype;
1341   PetscContainer             container;
1342   MatCOOStruct_SeqAIJKokkos *coo;
1343 
1344   PetscFunctionBegin;
1345   PetscCall(PetscObjectQuery((PetscObject)A, "__PETSc_MatCOOStruct_Device", (PetscObject *)&container));
1346   PetscCall(PetscContainerGetPointer(container, (void **)&coo));
1347 
1348   const auto &n    = coo->n;
1349   const auto &Annz = coo->nz;
1350   const auto &jmap = coo->jmap;
1351   const auto &perm = coo->perm;
1352 
1353   PetscCall(PetscGetMemType(v, &memtype));
1354   if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */
1355     kv = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), ConstMatScalarKokkosViewHost(v, n));
1356   } else {
1357     kv = ConstMatScalarKokkosView(v, n); /* Directly use v[]'s memory */
1358   }
1359 
1360   if (imode == INSERT_VALUES) PetscCall(MatSeqAIJGetKokkosViewWrite(A, &Aa)); /* write matrix values */
1361   else PetscCall(MatSeqAIJGetKokkosView(A, &Aa));                             /* read & write matrix values */
1362 
1363   PetscCall(PetscLogGpuTimeBegin());
1364   Kokkos::parallel_for(
1365     Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, Annz), KOKKOS_LAMBDA(const PetscCount i) {
1366       PetscScalar sum = 0.0;
1367       for (PetscCount k = jmap(i); k < jmap(i + 1); k++) sum += kv(perm(k));
1368       Aa(i) = (imode == INSERT_VALUES ? 0.0 : Aa(i)) + sum;
1369     });
1370   PetscCall(PetscLogGpuTimeEnd());
1371 
1372   if (imode == INSERT_VALUES) PetscCall(MatSeqAIJRestoreKokkosViewWrite(A, &Aa));
1373   else PetscCall(MatSeqAIJRestoreKokkosView(A, &Aa));
1374   PetscFunctionReturn(PETSC_SUCCESS);
1375 }
1376 
1377 static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info)
1378 {
1379   PetscFunctionBegin;
1380   PetscCall(MatSeqAIJKokkosSyncHost(A));
1381   PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info));
1382   B->offloadmask = PETSC_OFFLOAD_CPU;
1383   PetscFunctionReturn(PETSC_SUCCESS);
1384 }
1385 
1386 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A)
1387 {
1388   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
1389 
1390   PetscFunctionBegin;
1391   A->offloadmask = PETSC_OFFLOAD_KOKKOS; /* We do not really use this flag */
1392   A->boundtocpu  = PETSC_FALSE;
1393 
1394   A->ops->assemblyend               = MatAssemblyEnd_SeqAIJKokkos;
1395   A->ops->destroy                   = MatDestroy_SeqAIJKokkos;
1396   A->ops->duplicate                 = MatDuplicate_SeqAIJKokkos;
1397   A->ops->axpy                      = MatAXPY_SeqAIJKokkos;
1398   A->ops->scale                     = MatScale_SeqAIJKokkos;
1399   A->ops->zeroentries               = MatZeroEntries_SeqAIJKokkos;
1400   A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJKokkos;
1401   A->ops->mult                      = MatMult_SeqAIJKokkos;
1402   A->ops->multadd                   = MatMultAdd_SeqAIJKokkos;
1403   A->ops->multtranspose             = MatMultTranspose_SeqAIJKokkos;
1404   A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJKokkos;
1405   A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJKokkos;
1406   A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos;
1407   A->ops->productnumeric            = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos;
1408   A->ops->transpose                 = MatTranspose_SeqAIJKokkos;
1409   A->ops->setoption                 = MatSetOption_SeqAIJKokkos;
1410   A->ops->getdiagonal               = MatGetDiagonal_SeqAIJKokkos;
1411   A->ops->shift                     = MatShift_SeqAIJKokkos;
1412   A->ops->diagonalset               = MatDiagonalSet_SeqAIJKokkos;
1413   A->ops->diagonalscale             = MatDiagonalScale_SeqAIJKokkos;
1414   a->ops->getarray                  = MatSeqAIJGetArray_SeqAIJKokkos;
1415   a->ops->restorearray              = MatSeqAIJRestoreArray_SeqAIJKokkos;
1416   a->ops->getarrayread              = MatSeqAIJGetArrayRead_SeqAIJKokkos;
1417   a->ops->restorearrayread          = MatSeqAIJRestoreArrayRead_SeqAIJKokkos;
1418   a->ops->getarraywrite             = MatSeqAIJGetArrayWrite_SeqAIJKokkos;
1419   a->ops->restorearraywrite         = MatSeqAIJRestoreArrayWrite_SeqAIJKokkos;
1420   a->ops->getcsrandmemtype          = MatSeqAIJGetCSRAndMemType_SeqAIJKokkos;
1421 
1422   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJKokkos));
1423   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJKokkos));
1424 #if defined(PETSC_HAVE_HYPRE)
1425   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijkokkos_hypre_C", MatConvert_AIJ_HYPRE));
1426 #endif
1427   PetscFunctionReturn(PETSC_SUCCESS);
1428 }
1429 
1430 /*
1431    Extract the (prescribled) diagonal blocks of the matrix and then invert them
1432 
1433   Input Parameters:
1434 +  A       - the MATSEQAIJKOKKOS matrix
1435 .  bs      - block sizes in 'csr' format, i.e., the i-th block has size bs(i+1) - bs(i)
1436 .  bs2     - square of block sizes in 'csr' format, i.e., the i-th block should be stored at offset bs2(i) in diagVal[]
1437 .  blkMap  - map row ids to block ids, i.e., row i belongs to the block blkMap(i)
1438 -  work    - a pre-allocated work buffer (as big as diagVal) for use by this routine
1439 
1440   Output Parameter:
1441 .  diagVal - the (pre-allocated) buffer to store the inverted blocks (each block is stored in column-major order)
1442 */
1443 PETSC_INTERN PetscErrorCode MatInvertVariableBlockDiagonal_SeqAIJKokkos(Mat A, const PetscIntKokkosView &bs, const PetscIntKokkosView &bs2, const PetscIntKokkosView &blkMap, PetscScalarKokkosView &work, PetscScalarKokkosView &diagVal)
1444 {
1445   Mat_SeqAIJKokkos *akok    = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1446   PetscInt          nblocks = bs.extent(0) - 1;
1447 
1448   PetscFunctionBegin;
1449   PetscCall(MatSeqAIJKokkosSyncDevice(A)); // Since we'll access A's value on device
1450 
1451   // Pull out the diagonal blocks of the matrix and then invert the blocks
1452   auto Aa    = akok->a_dual.view_device();
1453   auto Ai    = akok->i_dual.view_device();
1454   auto Aj    = akok->j_dual.view_device();
1455   auto Adiag = akok->diag_dual.view_device();
1456   // TODO: how to tune the team size?
1457 #if defined(KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_HOST)
1458   auto ts = Kokkos::AUTO();
1459 #else
1460   auto ts = 16; // improved performance 30% over Kokkos::AUTO() with CUDA, but failed with "Kokkos::abort: Requested Team Size is too large!" on CPUs
1461 #endif
1462   PetscCallCXX(Kokkos::parallel_for(
1463     Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), nblocks, ts), KOKKOS_LAMBDA(const KokkosTeamMemberType &teamMember) {
1464       const PetscInt bid    = teamMember.league_rank();                                                   // block id
1465       const PetscInt rstart = bs(bid);                                                                    // this block starts from this row
1466       const PetscInt m      = bs(bid + 1) - bs(bid);                                                      // size of this block
1467       const auto    &B      = Kokkos::View<PetscScalar **, Kokkos::LayoutLeft>(&diagVal(bs2(bid)), m, m); // column-major order
1468       const auto    &W      = PetscScalarKokkosView(&work(bs2(bid)), m * m);
1469 
1470       Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, m), [=](const PetscInt &r) { // r-th row in B
1471         PetscInt i = rstart + r;                                                            // i-th row in A
1472 
1473         if (Ai(i) <= Adiag(i) && Adiag(i) < Ai(i + 1)) { // if the diagonal exists (common case)
1474           PetscInt first = Adiag(i) - r;                 // we start to check nonzeros from here along this row
1475 
1476           for (PetscInt c = 0; c < m; c++) {                   // walk n steps to see what column indices we will meet
1477             if (first + c < Ai(i) || first + c >= Ai(i + 1)) { // this entry (first+c) is out of range of this row, in other words, its value is zero
1478               B(r, c) = 0.0;
1479             } else if (Aj(first + c) == rstart + c) { // this entry is right on the (rstart+c) column
1480               B(r, c) = Aa(first + c);
1481             } else { // this entry does not show up in the CSR
1482               B(r, c) = 0.0;
1483             }
1484           }
1485         } else { // rare case that the diagonal does not exist
1486           const PetscInt begin = Ai(i);
1487           const PetscInt end   = Ai(i + 1);
1488           for (PetscInt c = 0; c < m; c++) B(r, c) = 0.0;
1489           for (PetscInt j = begin; j < end; j++) { // scan the whole row; could use binary search but this is a rare case so we did not.
1490             if (rstart <= Aj(j) && Aj(j) < rstart + m) B(r, Aj(j) - rstart) = Aa(j);
1491             else if (Aj(j) >= rstart + m) break;
1492           }
1493         }
1494       });
1495 
1496       // LU-decompose B (w/o pivoting) and then invert B
1497       KokkosBatched::TeamLU<KokkosTeamMemberType, KokkosBatched::Algo::LU::Unblocked>::invoke(teamMember, B, 0.0);
1498       KokkosBatched::TeamInverseLU<KokkosTeamMemberType, KokkosBatched::Algo::InverseLU::Unblocked>::invoke(teamMember, B, W);
1499     }));
1500   // PetscLogGpuFlops() is done in the caller PCSetUp_VPBJacobi_Kokkos as we don't want to compute the flops in kernels
1501   PetscFunctionReturn(PETSC_SUCCESS);
1502 }
1503 
1504 PETSC_INTERN PetscErrorCode MatSetSeqAIJKokkosWithCSRMatrix(Mat A, Mat_SeqAIJKokkos *akok)
1505 {
1506   Mat_SeqAIJ *aseq;
1507   PetscInt    i, m, n;
1508   auto       &exec = PetscGetKokkosExecutionSpace();
1509 
1510   PetscFunctionBegin;
1511   PetscCheck(!A->spptr, PETSC_COMM_SELF, PETSC_ERR_PLIB, "A->spptr is supposed to be empty");
1512 
1513   m = akok->nrows();
1514   n = akok->ncols();
1515   PetscCall(MatSetSizes(A, m, n, m, n));
1516   PetscCall(MatSetType(A, MATSEQAIJKOKKOS));
1517 
1518   /* Set up data structures of A as a MATSEQAIJ */
1519   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(A, MAT_SKIP_ALLOCATION, NULL));
1520   aseq = (Mat_SeqAIJ *)A->data;
1521 
1522   PetscCallCXX(akok->i_dual.sync_host(exec)); /* We always need sync'ed i, j on host */
1523   PetscCallCXX(akok->j_dual.sync_host(exec));
1524   PetscCallCXX(exec.fence());
1525 
1526   aseq->i       = akok->i_host_data();
1527   aseq->j       = akok->j_host_data();
1528   aseq->a       = akok->a_host_data();
1529   aseq->nonew   = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/
1530   aseq->free_a  = PETSC_FALSE;
1531   aseq->free_ij = PETSC_FALSE;
1532   aseq->nz      = akok->nnz();
1533   aseq->maxnz   = aseq->nz;
1534 
1535   PetscCall(PetscMalloc1(m, &aseq->imax));
1536   PetscCall(PetscMalloc1(m, &aseq->ilen));
1537   for (i = 0; i < m; i++) aseq->ilen[i] = aseq->imax[i] = aseq->i[i + 1] - aseq->i[i];
1538 
1539   /* It is critical to set the nonzerostate, as we use it to check if sparsity pattern (hence data) has changed on host in MatAssemblyEnd */
1540   akok->nonzerostate = A->nonzerostate;
1541   A->spptr           = akok; /* Set A->spptr before MatAssembly so that A->spptr won't be allocated again there */
1542   PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY));
1543   PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY));
1544   PetscFunctionReturn(PETSC_SUCCESS);
1545 }
1546 
1547 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGetKokkosCsrMatrix(Mat A, KokkosCsrMatrix *csr)
1548 {
1549   PetscFunctionBegin;
1550   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1551   *csr = static_cast<Mat_SeqAIJKokkos *>(A->spptr)->csrmat;
1552   PetscFunctionReturn(PETSC_SUCCESS);
1553 }
1554 
1555 PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithKokkosCsrMatrix(MPI_Comm comm, KokkosCsrMatrix csr, Mat *A)
1556 {
1557   Mat_SeqAIJKokkos *akok;
1558 
1559   PetscFunctionBegin;
1560   PetscCallCXX(akok = new Mat_SeqAIJKokkos(csr));
1561   PetscCall(MatCreate(comm, A));
1562   PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok));
1563   PetscFunctionReturn(PETSC_SUCCESS);
1564 }
1565 
1566 /* Crete a SEQAIJKOKKOS matrix with a Mat_SeqAIJKokkos data structure
1567 
1568    Note we have names like MatSeqAIJSetPreallocationCSR, so I use capitalized CSR
1569  */
1570 PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithCSRMatrix(MPI_Comm comm, Mat_SeqAIJKokkos *akok, Mat *A)
1571 {
1572   PetscFunctionBegin;
1573   PetscCall(MatCreate(comm, A));
1574   PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok));
1575   PetscFunctionReturn(PETSC_SUCCESS);
1576 }
1577 
1578 /*@C
1579   MatCreateSeqAIJKokkos - Creates a sparse matrix in `MATSEQAIJKOKKOS` (compressed row) format
1580   (the default parallel PETSc format). This matrix will ultimately be handled by
1581   Kokkos for calculations.
1582 
1583   Collective
1584 
1585   Input Parameters:
1586 + comm - MPI communicator, set to `PETSC_COMM_SELF`
1587 . m    - number of rows
1588 . n    - number of columns
1589 . nz   - number of nonzeros per row (same for all rows), ignored if `nnz` is provided
1590 - nnz  - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL`
1591 
1592   Output Parameter:
1593 . A - the matrix
1594 
1595   Level: intermediate
1596 
1597   Notes:
1598   It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
1599   MatXXXXSetPreallocation() paradgm instead of this routine directly.
1600   [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]
1601 
1602   The AIJ format, also called
1603   compressed row storage, is fully compatible with standard Fortran
1604   storage.  That is, the stored row and column indices can begin at
1605   either one (as in Fortran) or zero.
1606 
1607   Specify the preallocated storage with either `nz` or `nnz` (not both).
1608   Set `nz` = `PETSC_DEFAULT` and `nnz` = `NULL` for PETSc to control dynamic memory
1609   allocation.
1610 
1611 .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`
1612 @*/
1613 PetscErrorCode MatCreateSeqAIJKokkos(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A)
1614 {
1615   PetscFunctionBegin;
1616   PetscCall(PetscKokkosInitializeCheck());
1617   PetscCall(MatCreate(comm, A));
1618   PetscCall(MatSetSizes(*A, m, n, m, n));
1619   PetscCall(MatSetType(*A, MATSEQAIJKOKKOS));
1620   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz));
1621   PetscFunctionReturn(PETSC_SUCCESS);
1622 }
1623 
1624 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
1625 {
1626   PetscFunctionBegin;
1627   PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
1628   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos;
1629   PetscFunctionReturn(PETSC_SUCCESS);
1630 }
1631 
1632 static PetscErrorCode MatSeqAIJKokkosSymbolicSolveCheck(Mat A)
1633 {
1634   Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1635 
1636   PetscFunctionBegin;
1637   if (!factors->sptrsv_symbolic_completed) {
1638     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khU, factors->iU_d, factors->jU_d, factors->aU_d);
1639     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khL, factors->iL_d, factors->jL_d, factors->aL_d);
1640     factors->sptrsv_symbolic_completed = PETSC_TRUE;
1641   }
1642   PetscFunctionReturn(PETSC_SUCCESS);
1643 }
1644 
1645 /* Check if we need to update factors etc for transpose solve */
1646 static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A)
1647 {
1648   Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1649   MatColIdxType               n       = A->rmap->n;
1650 
1651   PetscFunctionBegin;
1652   if (!factors->transpose_updated) { /* TODO: KK needs to provide functions to do numeric transpose only */
1653     /* Update L^T and do sptrsv symbolic */
1654     factors->iLt_d = MatRowMapKokkosView("factors->iLt_d", n + 1); // KK requires 0
1655     factors->jLt_d = MatColIdxKokkosView(NoInit("factors->jLt_d"), factors->jL_d.extent(0));
1656     factors->aLt_d = MatScalarKokkosView(NoInit("factors->aLt_d"), factors->aL_d.extent(0));
1657 
1658     transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iL_d, factors->jL_d, factors->aL_d,
1659                                                                                                                                                                                                               factors->iLt_d, factors->jLt_d, factors->aLt_d);
1660 
1661     /* TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices.
1662       We have to sort the indices, until KK provides finer control options.
1663     */
1664     sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iLt_d, factors->jLt_d, factors->aLt_d);
1665 
1666     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d);
1667 
1668     /* Update U^T and do sptrsv symbolic */
1669     factors->iUt_d = MatRowMapKokkosView("factors->iUt_d", n + 1); // KK requires 0
1670     factors->jUt_d = MatColIdxKokkosView(NoInit("factors->jUt_d"), factors->jU_d.extent(0));
1671     factors->aUt_d = MatScalarKokkosView(NoInit("factors->aUt_d"), factors->aU_d.extent(0));
1672 
1673     transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iU_d, factors->jU_d, factors->aU_d,
1674                                                                                                                                                                                                               factors->iUt_d, factors->jUt_d, factors->aUt_d);
1675 
1676     /* Sort indices. See comments above */
1677     sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iUt_d, factors->jUt_d, factors->aUt_d);
1678 
1679     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d);
1680     factors->transpose_updated = PETSC_TRUE;
1681   }
1682   PetscFunctionReturn(PETSC_SUCCESS);
1683 }
1684 
1685 /* Solve Ax = b, with A = LU */
1686 static PetscErrorCode MatSolve_SeqAIJKokkos(Mat A, Vec b, Vec x)
1687 {
1688   ConstPetscScalarKokkosView  bv;
1689   PetscScalarKokkosView       xv;
1690   Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1691 
1692   PetscFunctionBegin;
1693   PetscCall(PetscLogGpuTimeBegin());
1694   PetscCall(MatSeqAIJKokkosSymbolicSolveCheck(A));
1695   PetscCall(VecGetKokkosView(b, &bv));
1696   PetscCall(VecGetKokkosViewWrite(x, &xv));
1697   /* Solve L tmpv = b */
1698   PetscCallCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khL, factors->iL_d, factors->jL_d, factors->aL_d, bv, factors->workVector));
1699   /* Solve Ux = tmpv */
1700   PetscCallCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khU, factors->iU_d, factors->jU_d, factors->aU_d, factors->workVector, xv));
1701   PetscCall(VecRestoreKokkosView(b, &bv));
1702   PetscCall(VecRestoreKokkosViewWrite(x, &xv));
1703   PetscCall(PetscLogGpuTimeEnd());
1704   PetscFunctionReturn(PETSC_SUCCESS);
1705 }
1706 
1707 /* Solve A^T x = b, where A^T = U^T L^T */
1708 static PetscErrorCode MatSolveTranspose_SeqAIJKokkos(Mat A, Vec b, Vec x)
1709 {
1710   ConstPetscScalarKokkosView  bv;
1711   PetscScalarKokkosView       xv;
1712   Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1713 
1714   PetscFunctionBegin;
1715   PetscCall(PetscLogGpuTimeBegin());
1716   PetscCall(MatSeqAIJKokkosTransposeSolveCheck(A));
1717   PetscCall(VecGetKokkosView(b, &bv));
1718   PetscCall(VecGetKokkosViewWrite(x, &xv));
1719   /* Solve U^T tmpv = b */
1720   KokkosSparse::Experimental::sptrsv_solve(&factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d, bv, factors->workVector);
1721 
1722   /* Solve L^T x = tmpv */
1723   KokkosSparse::Experimental::sptrsv_solve(&factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d, factors->workVector, xv);
1724   PetscCall(VecRestoreKokkosView(b, &bv));
1725   PetscCall(VecRestoreKokkosViewWrite(x, &xv));
1726   PetscCall(PetscLogGpuTimeEnd());
1727   PetscFunctionReturn(PETSC_SUCCESS);
1728 }
1729 
1730 static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info)
1731 {
1732   Mat_SeqAIJKokkos           *aijkok   = (Mat_SeqAIJKokkos *)A->spptr;
1733   Mat_SeqAIJKokkosTriFactors *factors  = (Mat_SeqAIJKokkosTriFactors *)B->spptr;
1734   PetscInt                    fill_lev = info->levels;
1735 
1736   PetscFunctionBegin;
1737   PetscCall(PetscLogGpuTimeBegin());
1738   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1739 
1740   auto a_d = aijkok->a_dual.view_device();
1741   auto i_d = aijkok->i_dual.view_device();
1742   auto j_d = aijkok->j_dual.view_device();
1743 
1744   KokkosSparse::Experimental::spiluk_numeric(&factors->kh, fill_lev, i_d, j_d, a_d, factors->iL_d, factors->jL_d, factors->aL_d, factors->iU_d, factors->jU_d, factors->aU_d);
1745 
1746   B->assembled              = PETSC_TRUE;
1747   B->preallocated           = PETSC_TRUE;
1748   B->ops->solve             = MatSolve_SeqAIJKokkos;
1749   B->ops->solvetranspose    = MatSolveTranspose_SeqAIJKokkos;
1750   B->ops->matsolve          = NULL;
1751   B->ops->matsolvetranspose = NULL;
1752   B->offloadmask            = PETSC_OFFLOAD_GPU;
1753 
1754   /* Once the factors' value changed, we need to update their transpose and sptrsv handle */
1755   factors->transpose_updated         = PETSC_FALSE;
1756   factors->sptrsv_symbolic_completed = PETSC_FALSE;
1757   /* TODO: log flops, but how to know that? */
1758   PetscCall(PetscLogGpuTimeEnd());
1759   PetscFunctionReturn(PETSC_SUCCESS);
1760 }
1761 
1762 static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
1763 {
1764   Mat_SeqAIJKokkos           *aijkok;
1765   Mat_SeqAIJ                 *b;
1766   Mat_SeqAIJKokkosTriFactors *factors  = (Mat_SeqAIJKokkosTriFactors *)B->spptr;
1767   PetscInt                    fill_lev = info->levels;
1768   PetscInt                    nnzA     = ((Mat_SeqAIJ *)A->data)->nz, nnzL, nnzU;
1769   PetscInt                    n        = A->rmap->n;
1770 
1771   PetscFunctionBegin;
1772   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1773   /* Rebuild factors */
1774   if (factors) {
1775     factors->Destroy();
1776   } /* Destroy the old if it exists */
1777   else {
1778     B->spptr = factors = new Mat_SeqAIJKokkosTriFactors(n);
1779   }
1780 
1781   /* Create a spiluk handle and then do symbolic factorization */
1782   nnzL = nnzU = PetscRealIntMultTruncate(info->fill, nnzA);
1783   factors->kh.create_spiluk_handle(KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1, n, nnzL, nnzU);
1784 
1785   auto spiluk_handle = factors->kh.get_spiluk_handle();
1786 
1787   Kokkos::realloc(factors->iL_d, n + 1); /* Free old arrays and realloc */
1788   Kokkos::realloc(factors->jL_d, spiluk_handle->get_nnzL());
1789   Kokkos::realloc(factors->iU_d, n + 1);
1790   Kokkos::realloc(factors->jU_d, spiluk_handle->get_nnzU());
1791 
1792   aijkok   = (Mat_SeqAIJKokkos *)A->spptr;
1793   auto i_d = aijkok->i_dual.view_device();
1794   auto j_d = aijkok->j_dual.view_device();
1795   KokkosSparse::Experimental::spiluk_symbolic(&factors->kh, fill_lev, i_d, j_d, factors->iL_d, factors->jL_d, factors->iU_d, factors->jU_d);
1796   /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */
1797 
1798   Kokkos::resize(factors->jL_d, spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */
1799   Kokkos::resize(factors->jU_d, spiluk_handle->get_nnzU());
1800   Kokkos::realloc(factors->aL_d, spiluk_handle->get_nnzL()); /* No need to retain old value */
1801   Kokkos::realloc(factors->aU_d, spiluk_handle->get_nnzU());
1802 
1803   /* TODO: add options to select sptrsv algorithms */
1804   /* Create sptrsv handles for L, U and their transpose */
1805 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
1806   auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SPTRSV_CUSPARSE;
1807 #else
1808   auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1;
1809 #endif
1810 
1811   factors->khL.create_sptrsv_handle(sptrsv_alg, n, true /* L is lower tri */);
1812   factors->khU.create_sptrsv_handle(sptrsv_alg, n, false /* U is not lower tri */);
1813   factors->khLt.create_sptrsv_handle(sptrsv_alg, n, false /* L^T is not lower tri */);
1814   factors->khUt.create_sptrsv_handle(sptrsv_alg, n, true /* U^T is lower tri */);
1815 
1816   /* Fill fields of the factor matrix B */
1817   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(B, MAT_SKIP_ALLOCATION, NULL));
1818   b     = (Mat_SeqAIJ *)B->data;
1819   b->nz = b->maxnz          = spiluk_handle->get_nnzL() + spiluk_handle->get_nnzU();
1820   B->info.fill_ratio_given  = info->fill;
1821   B->info.fill_ratio_needed = nnzA > 0 ? ((PetscReal)b->nz) / ((PetscReal)nnzA) : 1.0;
1822 
1823   B->offloadmask          = PETSC_OFFLOAD_GPU;
1824   B->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJKokkos;
1825   PetscFunctionReturn(PETSC_SUCCESS);
1826 }
1827 
1828 static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos(Mat A, MatSolverType *type)
1829 {
1830   PetscFunctionBegin;
1831   *type = MATSOLVERKOKKOS;
1832   PetscFunctionReturn(PETSC_SUCCESS);
1833 }
1834 
1835 /*MC
1836   MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices
1837   on a single GPU of type, `MATSEQAIJKOKKOS`, `MATAIJKOKKOS`.
1838 
1839   Level: beginner
1840 
1841 .seealso: [](ch_matrices), `Mat`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJKokkos()`, `MATAIJKOKKOS`, `MatKokkosSetFormat()`, `MatKokkosStorageFormat`, `MatKokkosFormatOperation`
1842 M*/
1843 PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A, MatFactorType ftype, Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */
1844 {
1845   PetscInt n = A->rmap->n;
1846 
1847   PetscFunctionBegin;
1848   PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B));
1849   PetscCall(MatSetSizes(*B, n, n, n, n));
1850   (*B)->factortype = ftype;
1851   PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU]));
1852   PetscCall(MatSetType(*B, MATSEQAIJKOKKOS));
1853 
1854   if (ftype == MAT_FACTOR_LU) {
1855     PetscCall(MatSetBlockSizesFromMats(*B, A, A));
1856     (*B)->canuseordering        = PETSC_TRUE;
1857     (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKokkos;
1858   } else if (ftype == MAT_FACTOR_ILU) {
1859     PetscCall(MatSetBlockSizesFromMats(*B, A, A));
1860     (*B)->canuseordering         = PETSC_FALSE;
1861     (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos;
1862   } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]);
1863 
1864   PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL));
1865   PetscCall(PetscObjectComposeFunction((PetscObject)*B, "MatFactorGetSolverType_C", MatFactorGetSolverType_SeqAIJKokkos));
1866   PetscFunctionReturn(PETSC_SUCCESS);
1867 }
1868 
1869 PETSC_INTERN PetscErrorCode MatSolverTypeRegister_KOKKOS(void)
1870 {
1871   PetscFunctionBegin;
1872   PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_LU, MatGetFactor_SeqAIJKokkos_Kokkos));
1873   PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_ILU, MatGetFactor_SeqAIJKokkos_Kokkos));
1874   PetscFunctionReturn(PETSC_SUCCESS);
1875 }
1876 
1877 /* Utility to print out a KokkosCsrMatrix for debugging */
1878 PETSC_INTERN PetscErrorCode PrintCsrMatrix(const KokkosCsrMatrix &csrmat)
1879 {
1880   const auto        &iv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), csrmat.graph.row_map);
1881   const auto        &jv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), csrmat.graph.entries);
1882   const auto        &av = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), csrmat.values);
1883   const PetscInt    *i  = iv.data();
1884   const PetscInt    *j  = jv.data();
1885   const PetscScalar *a  = av.data();
1886   PetscInt           m = csrmat.numRows(), n = csrmat.numCols(), nnz = csrmat.nnz();
1887 
1888   PetscFunctionBegin;
1889   PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT " x %" PetscInt_FMT " SeqAIJKokkos, with %" PetscInt_FMT " nonzeros\n", m, n, nnz));
1890   for (PetscInt k = 0; k < m; k++) {
1891     PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT ": ", k));
1892     for (PetscInt p = i[k]; p < i[k + 1]; p++) PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT "(%.1f), ", j[p], (double)PetscRealPart(a[p])));
1893     PetscCall(PetscPrintf(PETSC_COMM_SELF, "\n"));
1894   }
1895   PetscFunctionReturn(PETSC_SUCCESS);
1896 }
1897