xref: /petsc/src/mat/impls/aij/seq/kokkos/aijkok.kokkos.cxx (revision a29dfd43bb0c77e2653d3bfa2c953f902720a6d2)
1 #include <petsc_kokkos.hpp>
2 #include <petscvec_kokkos.hpp>
3 #include <petscpkg_version.h>
4 #include <petsc/private/petscimpl.h>
5 #include <petsc/private/sfimpl.h>
6 #include <petscsystypes.h>
7 #include <petscerror.h>
8 
9 #include <Kokkos_Core.hpp>
10 #include <KokkosBlas.hpp>
11 #include <KokkosSparse_CrsMatrix.hpp>
12 #include <KokkosSparse_spmv.hpp>
13 #include <KokkosSparse_spiluk.hpp>
14 #include <KokkosSparse_sptrsv.hpp>
15 #include <KokkosSparse_spgemm.hpp>
16 #include <KokkosSparse_spadd.hpp>
17 #include <KokkosBatched_LU_Decl.hpp>
18 #include <KokkosBatched_InverseLU_Decl.hpp>
19 
20 #include <../src/mat/impls/aij/seq/kokkos/aijkok.hpp>
21 
22 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_GE(3, 7, 0)
23   #include <KokkosSparse_Utils.hpp>
24 using KokkosSparse::sort_crs_matrix;
25 using KokkosSparse::Impl::transpose_matrix;
26 #else
27   #include <KokkosKernels_Sorting.hpp>
28 using KokkosKernels::sort_crs_matrix;
29 using KokkosKernels::Impl::transpose_matrix;
30 #endif
31 
32 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */
33 
34 /* MatAssemblyEnd_SeqAIJKokkos() happens when we finalized nonzeros of the matrix, either after
35    we assembled the matrix on host, or after we directly produced the matrix data on device (ex., through MatMatMult).
36    In the latter case, it is important to set a_dual's sync state correctly.
37  */
38 static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A, MatAssemblyType mode)
39 {
40   Mat_SeqAIJ       *aijseq;
41   Mat_SeqAIJKokkos *aijkok;
42 
43   PetscFunctionBegin;
44   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS);
45   PetscCall(MatAssemblyEnd_SeqAIJ(A, mode));
46 
47   aijseq = static_cast<Mat_SeqAIJ *>(A->data);
48   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
49 
50   /* If aijkok does not exist, we just copy i, j to device.
51      If aijkok already exists, but the device's nonzero pattern does not match with the host's, we assume the latest data is on host.
52      In both cases, we build a new aijkok structure.
53   */
54   if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { /* aijkok might not exist yet or nonzero pattern has changed */
55     delete aijkok;
56     aijkok   = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aijseq, A->nonzerostate, PETSC_FALSE /*don't copy mat values to device*/);
57     A->spptr = aijkok;
58   } else if (A->rmap->n && aijkok->diag_dual.extent(0) == 0) { // MatProduct might directly produce AIJ on device, but not the diag.
59     MatRowMapKokkosViewHost diag_h(aijseq->diag, A->rmap->n);
60     auto                    diag_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), diag_h);
61     aijkok->diag_dual              = MatRowMapKokkosDualView(diag_d, diag_h);
62   }
63   PetscFunctionReturn(PETSC_SUCCESS);
64 }
65 
66 /* Sync CSR data to device if not yet */
67 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A)
68 {
69   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
70 
71   PetscFunctionBegin;
72   PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Can't sync factorized matrix from host to device");
73   PetscCheck(aijkok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
74   if (aijkok->a_dual.need_sync_device()) {
75     aijkok->a_dual.sync_device();
76     aijkok->transpose_updated = PETSC_FALSE; /* values of the transpose is out-of-date */
77     aijkok->hermitian_updated = PETSC_FALSE;
78   }
79   PetscFunctionReturn(PETSC_SUCCESS);
80 }
81 
82 /* Mark the CSR data on device as modified */
83 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosModifyDevice(Mat A)
84 {
85   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
86 
87   PetscFunctionBegin;
88   PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Not supported for factorized matries");
89   aijkok->a_dual.clear_sync_state();
90   aijkok->a_dual.modify_device();
91   aijkok->transpose_updated = PETSC_FALSE;
92   aijkok->hermitian_updated = PETSC_FALSE;
93   PetscCall(MatSeqAIJInvalidateDiagonal(A));
94   PetscCall(PetscObjectStateIncrease((PetscObject)A));
95   PetscFunctionReturn(PETSC_SUCCESS);
96 }
97 
98 static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A)
99 {
100   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
101   auto             &exec   = PetscGetKokkosExecutionSpace();
102 
103   PetscFunctionBegin;
104   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
105   /* We do not expect one needs factors on host  */
106   PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Can't sync factorized matrix from device to host");
107   PetscCheck(aijkok, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing AIJKOK");
108   PetscCallCXX(aijkok->a_dual.sync_host(exec));
109   PetscCallCXX(exec.fence());
110   PetscFunctionReturn(PETSC_SUCCESS);
111 }
112 
113 static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A, PetscScalar *array[])
114 {
115   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
116 
117   PetscFunctionBegin;
118   /* aijkok contains valid pointers only if the host's nonzerostate matches with the device's.
119     Calling MatSeqAIJSetPreallocation() or MatSetValues() on host, where aijseq->{i,j,a} might be
120     reallocated, will lead to stale {i,j,a}_dual in aijkok. In both operations, the hosts's nonzerostate
121     must have been updated. The stale aijkok will be rebuilt during MatAssemblyEnd.
122   */
123   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
124     auto &exec = PetscGetKokkosExecutionSpace();
125     PetscCallCXX(aijkok->a_dual.sync_host(exec));
126     PetscCallCXX(exec.fence());
127     *array = aijkok->a_dual.view_host().data();
128   } else { /* Happens when calling MatSetValues on a newly created matrix */
129     *array = static_cast<Mat_SeqAIJ *>(A->data)->a;
130   }
131   PetscFunctionReturn(PETSC_SUCCESS);
132 }
133 
134 static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJKokkos(Mat A, PetscScalar *array[])
135 {
136   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
137 
138   PetscFunctionBegin;
139   if (aijkok && A->nonzerostate == aijkok->nonzerostate) aijkok->a_dual.modify_host();
140   PetscFunctionReturn(PETSC_SUCCESS);
141 }
142 
143 static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[])
144 {
145   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
146 
147   PetscFunctionBegin;
148   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
149     auto &exec = PetscGetKokkosExecutionSpace();
150     PetscCallCXX(aijkok->a_dual.sync_host(exec));
151     PetscCallCXX(exec.fence());
152     *array = aijkok->a_dual.view_host().data();
153   } else {
154     *array = static_cast<Mat_SeqAIJ *>(A->data)->a;
155   }
156   PetscFunctionReturn(PETSC_SUCCESS);
157 }
158 
159 static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[])
160 {
161   PetscFunctionBegin;
162   *array = NULL;
163   PetscFunctionReturn(PETSC_SUCCESS);
164 }
165 
166 static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[])
167 {
168   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
169 
170   PetscFunctionBegin;
171   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
172     *array = aijkok->a_dual.view_host().data();
173   } else { /* Ex. happens with MatZeroEntries on a preallocated but not assembled matrix */
174     *array = static_cast<Mat_SeqAIJ *>(A->data)->a;
175   }
176   PetscFunctionReturn(PETSC_SUCCESS);
177 }
178 
179 static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[])
180 {
181   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
182 
183   PetscFunctionBegin;
184   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
185     aijkok->a_dual.clear_sync_state();
186     aijkok->a_dual.modify_host();
187   }
188   PetscFunctionReturn(PETSC_SUCCESS);
189 }
190 
191 static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJKokkos(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype)
192 {
193   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
194 
195   PetscFunctionBegin;
196   PetscCheck(aijkok != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "aijkok is NULL");
197 
198   if (i) *i = aijkok->i_device_data();
199   if (j) *j = aijkok->j_device_data();
200   if (a) {
201     aijkok->a_dual.sync_device();
202     *a = aijkok->a_device_data();
203   }
204   if (mtype) *mtype = PETSC_MEMTYPE_KOKKOS;
205   PetscFunctionReturn(PETSC_SUCCESS);
206 }
207 
208 /*
209   Generate the sparsity pattern of a MatSeqAIJKokkos matrix's transpose on device.
210 
211   Input Parameter:
212 .  A       - the MATSEQAIJKOKKOS matrix
213 
214   Output Parameters:
215 +  perm_d - the permutation array on device, which connects Ta(i) = Aa(perm(i))
216 -  T_d    - the transpose on device, whose value array is allocated but not initialized
217 */
218 static PetscErrorCode MatSeqAIJKokkosGenerateTransposeStructure(Mat A, MatRowMapKokkosView &perm_d, KokkosCsrMatrix &T_d)
219 {
220   Mat_SeqAIJ             *aseq = static_cast<Mat_SeqAIJ *>(A->data);
221   PetscInt                nz = aseq->nz, m = A->rmap->N, n = A->cmap->n;
222   const PetscInt         *Ai = aseq->i, *Aj = aseq->j;
223   MatRowMapKokkosViewHost Ti_h(NoInit("Ti"), n + 1);
224   MatRowMapType          *Ti = Ti_h.data();
225   MatColIdxKokkosViewHost Tj_h(NoInit("Tj"), nz);
226   MatRowMapKokkosViewHost perm_h(NoInit("permutation"), nz);
227   PetscInt               *Tj   = Tj_h.data();
228   PetscInt               *perm = perm_h.data();
229   PetscInt               *offset;
230 
231   PetscFunctionBegin;
232   // Populate Ti
233   PetscCallCXX(Kokkos::deep_copy(Ti_h, 0));
234   Ti++;
235   for (PetscInt i = 0; i < nz; i++) Ti[Aj[i]]++;
236   Ti--;
237   for (PetscInt i = 0; i < n; i++) Ti[i + 1] += Ti[i];
238 
239   // Populate Tj and the permutation array
240   PetscCall(PetscCalloc1(n, &offset)); // offset in each T row to fill in its column indices
241   for (PetscInt i = 0; i < m; i++) {
242     for (PetscInt j = Ai[i]; j < Ai[i + 1]; j++) { // A's (i,j) is T's (j,i)
243       PetscInt r    = Aj[j];                       // row r of T
244       PetscInt disp = Ti[r] + offset[r];
245 
246       Tj[disp]   = i; // col i of T
247       perm[disp] = j;
248       offset[r]++;
249     }
250   }
251   PetscCall(PetscFree(offset));
252 
253   // Sort each row of T, along with the permutation array
254   for (PetscInt i = 0; i < n; i++) PetscCall(PetscSortIntWithArray(Ti[i + 1] - Ti[i], Tj + Ti[i], perm + Ti[i]));
255 
256   // Output perm and T on device
257   auto Ti_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), Ti_h);
258   auto Tj_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), Tj_h);
259   PetscCallCXX(T_d = KokkosCsrMatrix("csrmatT", n, m, nz, MatScalarKokkosView("Ta", nz), Ti_d, Tj_d));
260   PetscCallCXX(perm_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), perm_h));
261   PetscFunctionReturn(PETSC_SUCCESS);
262 }
263 
264 // Generate the transpose on device and cache it internally
265 // Note: KK transpose_matrix() does not have support symbolic/numeric transpose, so we do it on our own
266 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGenerateTranspose_Private(Mat A, KokkosCsrMatrix *csrmatT)
267 {
268   Mat_SeqAIJ       *aseq = static_cast<Mat_SeqAIJ *>(A->data);
269   Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
270   PetscInt          nz = aseq->nz, m = A->rmap->N, n = A->cmap->n;
271   KokkosCsrMatrix  &T = akok->csrmatT;
272 
273   PetscFunctionBegin;
274   PetscCheck(akok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
275   PetscCallCXX(akok->a_dual.sync_device()); // Sync A's values since we are going to access them on device
276 
277   const auto &Aa = akok->a_dual.view_device();
278 
279   if (A->symmetric == PETSC_BOOL3_TRUE) {
280     *csrmatT = akok->csrmat;
281   } else {
282     // See if we already have a cached transpose and its value is up to date
283     if (T.numRows() == n && T.numCols() == m) {  // this indicates csrmatT had been generated before, otherwise T has 0 rows/cols after construction
284       if (!akok->transpose_updated) {            // if the value is out of date, update the cached version
285         const auto &perm = akok->transpose_perm; // get the permutation array
286         auto       &Ta   = T.values;
287 
288         PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { Ta(i) = Aa(perm(i)); }));
289       }
290     } else { // Generate T of size n x m for the first time
291       MatRowMapKokkosView perm;
292 
293       PetscCall(MatSeqAIJKokkosGenerateTransposeStructure(A, perm, T));
294       akok->transpose_perm = perm; // cache the perm in this matrix for reuse
295       PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { T.values(i) = Aa(perm(i)); }));
296     }
297     akok->transpose_updated = PETSC_TRUE;
298     *csrmatT                = akok->csrmatT;
299   }
300   PetscFunctionReturn(PETSC_SUCCESS);
301 }
302 
303 // Generate the Hermitian on device and cache it internally
304 static PetscErrorCode MatSeqAIJKokkosGenerateHermitian_Private(Mat A, KokkosCsrMatrix *csrmatH)
305 {
306   Mat_SeqAIJ       *aseq = static_cast<Mat_SeqAIJ *>(A->data);
307   Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
308   PetscInt          nz = aseq->nz, m = A->rmap->N, n = A->cmap->n;
309   KokkosCsrMatrix  &T = akok->csrmatH;
310 
311   PetscFunctionBegin;
312   PetscCheck(akok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
313   PetscCallCXX(akok->a_dual.sync_device()); // Sync A's values since we are going to access them on device
314 
315   const auto &Aa = akok->a_dual.view_device();
316 
317   if (A->hermitian == PETSC_BOOL3_TRUE) {
318     *csrmatH = akok->csrmat;
319   } else {
320     // See if we already have a cached hermitian and its value is up to date
321     if (T.numRows() == n && T.numCols() == m) {  // this indicates csrmatT had been generated before, otherwise T has 0 rows/cols after construction
322       if (!akok->hermitian_updated) {            // if the value is out of date, update the cached version
323         const auto &perm = akok->transpose_perm; // get the permutation array
324         auto       &Ta   = T.values;
325 
326         PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { Ta(i) = PetscConj(Aa(perm(i))); }));
327       }
328     } else { // Generate T of size n x m for the first time
329       MatRowMapKokkosView perm;
330 
331       PetscCall(MatSeqAIJKokkosGenerateTransposeStructure(A, perm, T));
332       akok->transpose_perm = perm; // cache the perm in this matrix for reuse
333       PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { T.values(i) = PetscConj(Aa(perm(i))); }));
334     }
335     akok->hermitian_updated = PETSC_TRUE;
336     *csrmatH                = akok->csrmatH;
337   }
338   PetscFunctionReturn(PETSC_SUCCESS);
339 }
340 
341 /* y = A x */
342 static PetscErrorCode MatMult_SeqAIJKokkos(Mat A, Vec xx, Vec yy)
343 {
344   Mat_SeqAIJKokkos          *aijkok;
345   ConstPetscScalarKokkosView xv;
346   PetscScalarKokkosView      yv;
347 
348   PetscFunctionBegin;
349   PetscCall(PetscLogGpuTimeBegin());
350   PetscCall(MatSeqAIJKokkosSyncDevice(A));
351   PetscCall(VecGetKokkosView(xx, &xv));
352   PetscCall(VecGetKokkosViewWrite(yy, &yv));
353   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
354   PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), "N", 1.0 /*alpha*/, aijkok->csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A x + beta y */
355   PetscCall(VecRestoreKokkosView(xx, &xv));
356   PetscCall(VecRestoreKokkosViewWrite(yy, &yv));
357   /* 2.0*nnz - numRows seems more accurate here but assumes there are no zero-rows. So a little sloppy here. */
358   PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz()));
359   PetscCall(PetscLogGpuTimeEnd());
360   PetscFunctionReturn(PETSC_SUCCESS);
361 }
362 
363 /* y = A^T x */
364 static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy)
365 {
366   Mat_SeqAIJKokkos          *aijkok;
367   const char                *mode;
368   ConstPetscScalarKokkosView xv;
369   PetscScalarKokkosView      yv;
370   KokkosCsrMatrix            csrmat;
371 
372   PetscFunctionBegin;
373   PetscCall(PetscLogGpuTimeBegin());
374   PetscCall(MatSeqAIJKokkosSyncDevice(A));
375   PetscCall(VecGetKokkosView(xx, &xv));
376   PetscCall(VecGetKokkosViewWrite(yy, &yv));
377   if (A->form_explicit_transpose) {
378     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat));
379     mode = "N";
380   } else {
381     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
382     csrmat = aijkok->csrmat;
383     mode   = "T";
384   }
385   PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^T x + beta y */
386   PetscCall(VecRestoreKokkosView(xx, &xv));
387   PetscCall(VecRestoreKokkosViewWrite(yy, &yv));
388   PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
389   PetscCall(PetscLogGpuTimeEnd());
390   PetscFunctionReturn(PETSC_SUCCESS);
391 }
392 
393 /* y = A^H x */
394 static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy)
395 {
396   Mat_SeqAIJKokkos          *aijkok;
397   const char                *mode;
398   ConstPetscScalarKokkosView xv;
399   PetscScalarKokkosView      yv;
400   KokkosCsrMatrix            csrmat;
401 
402   PetscFunctionBegin;
403   PetscCall(PetscLogGpuTimeBegin());
404   PetscCall(MatSeqAIJKokkosSyncDevice(A));
405   PetscCall(VecGetKokkosView(xx, &xv));
406   PetscCall(VecGetKokkosViewWrite(yy, &yv));
407   if (A->form_explicit_transpose) {
408     PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat));
409     mode = "N";
410   } else {
411     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
412     csrmat = aijkok->csrmat;
413     mode   = "C";
414   }
415   PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^H x + beta y */
416   PetscCall(VecRestoreKokkosView(xx, &xv));
417   PetscCall(VecRestoreKokkosViewWrite(yy, &yv));
418   PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
419   PetscCall(PetscLogGpuTimeEnd());
420   PetscFunctionReturn(PETSC_SUCCESS);
421 }
422 
423 /* z = A x + y */
424 static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz)
425 {
426   Mat_SeqAIJKokkos          *aijkok;
427   ConstPetscScalarKokkosView xv;
428   PetscScalarKokkosView      zv;
429 
430   PetscFunctionBegin;
431   PetscCall(PetscLogGpuTimeBegin());
432   PetscCall(MatSeqAIJKokkosSyncDevice(A));
433   if (zz != yy) PetscCall(VecCopy(yy, zz)); // depending on yy's sync flags, zz might get its latest data on host
434   PetscCall(VecGetKokkosView(xx, &xv));
435   PetscCall(VecGetKokkosView(zz, &zv)); // do after VecCopy(yy, zz) to get the latest data on device
436   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
437   PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), "N", 1.0 /*alpha*/, aijkok->csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A x + beta z */
438   PetscCall(VecRestoreKokkosView(xx, &xv));
439   PetscCall(VecRestoreKokkosView(zz, &zv));
440   PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz()));
441   PetscCall(PetscLogGpuTimeEnd());
442   PetscFunctionReturn(PETSC_SUCCESS);
443 }
444 
445 /* z = A^T x + y */
446 static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz)
447 {
448   Mat_SeqAIJKokkos          *aijkok;
449   const char                *mode;
450   ConstPetscScalarKokkosView xv;
451   PetscScalarKokkosView      zv;
452   KokkosCsrMatrix            csrmat;
453 
454   PetscFunctionBegin;
455   PetscCall(PetscLogGpuTimeBegin());
456   PetscCall(MatSeqAIJKokkosSyncDevice(A));
457   if (zz != yy) PetscCall(VecCopy(yy, zz));
458   PetscCall(VecGetKokkosView(xx, &xv));
459   PetscCall(VecGetKokkosView(zz, &zv));
460   if (A->form_explicit_transpose) {
461     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat));
462     mode = "N";
463   } else {
464     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
465     csrmat = aijkok->csrmat;
466     mode   = "T";
467   }
468   PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^T x + beta z */
469   PetscCall(VecRestoreKokkosView(xx, &xv));
470   PetscCall(VecRestoreKokkosView(zz, &zv));
471   PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
472   PetscCall(PetscLogGpuTimeEnd());
473   PetscFunctionReturn(PETSC_SUCCESS);
474 }
475 
476 /* z = A^H x + y */
477 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz)
478 {
479   Mat_SeqAIJKokkos          *aijkok;
480   const char                *mode;
481   ConstPetscScalarKokkosView xv;
482   PetscScalarKokkosView      zv;
483   KokkosCsrMatrix            csrmat;
484 
485   PetscFunctionBegin;
486   PetscCall(PetscLogGpuTimeBegin());
487   PetscCall(MatSeqAIJKokkosSyncDevice(A));
488   if (zz != yy) PetscCall(VecCopy(yy, zz));
489   PetscCall(VecGetKokkosView(xx, &xv));
490   PetscCall(VecGetKokkosView(zz, &zv));
491   if (A->form_explicit_transpose) {
492     PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat));
493     mode = "N";
494   } else {
495     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
496     csrmat = aijkok->csrmat;
497     mode   = "C";
498   }
499   PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^H x + beta z */
500   PetscCall(VecRestoreKokkosView(xx, &xv));
501   PetscCall(VecRestoreKokkosView(zz, &zv));
502   PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
503   PetscCall(PetscLogGpuTimeEnd());
504   PetscFunctionReturn(PETSC_SUCCESS);
505 }
506 
507 static PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A, MatOption op, PetscBool flg)
508 {
509   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
510 
511   PetscFunctionBegin;
512   switch (op) {
513   case MAT_FORM_EXPLICIT_TRANSPOSE:
514     /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
515     if (A->form_explicit_transpose && !flg && aijkok) PetscCall(aijkok->DestroyMatTranspose());
516     A->form_explicit_transpose = flg;
517     break;
518   default:
519     PetscCall(MatSetOption_SeqAIJ(A, op, flg));
520     break;
521   }
522   PetscFunctionReturn(PETSC_SUCCESS);
523 }
524 
525 /* Depending on reuse, either build a new mat, or use the existing mat */
526 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat *newmat)
527 {
528   Mat_SeqAIJ *aseq;
529 
530   PetscFunctionBegin;
531   PetscCall(PetscKokkosInitializeCheck());
532   if (reuse == MAT_INITIAL_MATRIX) {                      /* Build a brand new mat */
533     PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat));  /* the returned newmat is a SeqAIJKokkos */
534   } else if (reuse == MAT_REUSE_MATRIX) {                 /* Reuse the mat created before */
535     PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN)); /* newmat is already a SeqAIJKokkos */
536   } else if (reuse == MAT_INPLACE_MATRIX) {               /* newmat is A */
537     PetscCheck(A == *newmat, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "A != *newmat with MAT_INPLACE_MATRIX");
538     PetscCall(PetscFree(A->defaultvectype));
539     PetscCall(PetscStrallocpy(VECKOKKOS, &A->defaultvectype)); /* Allocate and copy the string */
540     PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATSEQAIJKOKKOS));
541     PetscCall(MatSetOps_SeqAIJKokkos(A));
542     aseq = static_cast<Mat_SeqAIJ *>(A->data);
543     if (A->assembled) { /* Copy i, j (but not values) to device for an assembled matrix if not yet */
544       PetscCheck(!A->spptr, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Expect NULL (Mat_SeqAIJKokkos*)A->spptr");
545       A->spptr = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aseq, A->nonzerostate, PETSC_FALSE);
546     }
547   }
548   PetscFunctionReturn(PETSC_SUCCESS);
549 }
550 
551 /* MatDuplicate always creates a new matrix. MatDuplicate can be called either on an assembled matrix or
552    an unassembled matrix, even though MAT_COPY_VALUES is not allowed for unassembled matrix.
553  */
554 static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A, MatDuplicateOption dupOption, Mat *B)
555 {
556   Mat_SeqAIJ       *bseq;
557   Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr), *bkok;
558   Mat               mat;
559 
560   PetscFunctionBegin;
561   /* Do not copy values on host as A's latest values might be on device. We don't want to do sync blindly */
562   PetscCall(MatDuplicate_SeqAIJ(A, MAT_DO_NOT_COPY_VALUES, B));
563   mat = *B;
564   if (A->assembled) {
565     bseq = static_cast<Mat_SeqAIJ *>(mat->data);
566     bkok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, bseq, mat->nonzerostate, PETSC_FALSE);
567     bkok->a_dual.clear_sync_state(); /* Clear B's sync state as it will be decided below */
568     /* Now copy values to B if needed */
569     if (dupOption == MAT_COPY_VALUES) {
570       if (akok->a_dual.need_sync_device()) {
571         Kokkos::deep_copy(bkok->a_dual.view_host(), akok->a_dual.view_host());
572         bkok->a_dual.modify_host();
573       } else { /* If device has the latest data, we only copy data on device */
574         Kokkos::deep_copy(bkok->a_dual.view_device(), akok->a_dual.view_device());
575         bkok->a_dual.modify_device();
576       }
577     } else { /* MAT_DO_NOT_COPY_VALUES or MAT_SHARE_NONZERO_PATTERN. B's values should be zeroed */
578       /* B's values on host should be already zeroed by MatDuplicate_SeqAIJ() */
579       bkok->a_dual.modify_host();
580     }
581     mat->spptr = bkok;
582   }
583 
584   PetscCall(PetscFree(mat->defaultvectype));
585   PetscCall(PetscStrallocpy(VECKOKKOS, &mat->defaultvectype)); /* Allocate and copy the string */
586   PetscCall(PetscObjectChangeTypeName((PetscObject)mat, MATSEQAIJKOKKOS));
587   PetscCall(MatSetOps_SeqAIJKokkos(mat));
588   PetscFunctionReturn(PETSC_SUCCESS);
589 }
590 
591 static PetscErrorCode MatTranspose_SeqAIJKokkos(Mat A, MatReuse reuse, Mat *B)
592 {
593   Mat               At;
594   KokkosCsrMatrix   internT;
595   Mat_SeqAIJKokkos *atkok, *bkok;
596 
597   PetscFunctionBegin;
598   if (reuse == MAT_REUSE_MATRIX) PetscCall(MatTransposeCheckNonzeroState_Private(A, *B));
599   PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &internT)); /* Generate a transpose internally */
600   if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_INPLACE_MATRIX) {
601     /* Deep copy internT, as we want to isolate the internal transpose */
602     PetscCallCXX(atkok = new Mat_SeqAIJKokkos(KokkosCsrMatrix("csrmat", internT)));
603     PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PetscObjectComm((PetscObject)A), atkok, &At));
604     if (reuse == MAT_INITIAL_MATRIX) *B = At;
605     else PetscCall(MatHeaderReplace(A, &At)); /* Replace A with At inplace */
606   } else {                                    /* MAT_REUSE_MATRIX, just need to copy values to B on device */
607     if ((*B)->assembled) {
608       bkok = static_cast<Mat_SeqAIJKokkos *>((*B)->spptr);
609       PetscCallCXX(Kokkos::deep_copy(bkok->a_dual.view_device(), internT.values));
610       PetscCall(MatSeqAIJKokkosModifyDevice(*B));
611     } else if ((*B)->preallocated) { /* It is ok for B to be only preallocated, as needed in MatTranspose_MPIAIJ */
612       Mat_SeqAIJ             *bseq = static_cast<Mat_SeqAIJ *>((*B)->data);
613       MatScalarKokkosViewHost a_h(bseq->a, internT.nnz()); /* bseq->nz = 0 if unassembled */
614       MatColIdxKokkosViewHost j_h(bseq->j, internT.nnz());
615       PetscCallCXX(Kokkos::deep_copy(a_h, internT.values));
616       PetscCallCXX(Kokkos::deep_copy(j_h, internT.graph.entries));
617     } else SETERRQ(PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "B must be assembled or preallocated");
618   }
619   PetscFunctionReturn(PETSC_SUCCESS);
620 }
621 
622 static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A)
623 {
624   Mat_SeqAIJKokkos *aijkok;
625 
626   PetscFunctionBegin;
627   if (A->factortype == MAT_FACTOR_NONE) {
628     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
629     delete aijkok;
630   } else {
631     delete static_cast<Mat_SeqAIJKokkosTriFactors *>(A->spptr);
632   }
633   A->spptr = NULL;
634   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatFactorGetSolverType_C", NULL));
635   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
636   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
637   PetscCall(MatDestroy_SeqAIJ(A));
638   PetscFunctionReturn(PETSC_SUCCESS);
639 }
640 
641 /*MC
642    MATSEQAIJKOKKOS - MATAIJKOKKOS = "(seq)aijkokkos" - A matrix type to be used for sparse matrices with Kokkos
643 
644    A matrix type using Kokkos-Kernels CrsMatrix type for portability across different device types
645 
646    Options Database Key:
647 .  -mat_type aijkokkos - sets the matrix type to `MATSEQAIJKOKKOS` during a call to `MatSetFromOptions()`
648 
649   Level: beginner
650 
651 .seealso: [](ch_matrices), `Mat`, `MatCreateSeqAIJKokkos()`, `MATMPIAIJKOKKOS`
652 M*/
653 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A)
654 {
655   PetscFunctionBegin;
656   PetscCall(PetscKokkosInitializeCheck());
657   PetscCall(MatCreate_SeqAIJ(A));
658   PetscCall(MatConvert_SeqAIJ_SeqAIJKokkos(A, MATSEQAIJKOKKOS, MAT_INPLACE_MATRIX, &A));
659   PetscFunctionReturn(PETSC_SUCCESS);
660 }
661 
662 /* Merge A, B into a matrix C. A is put before B. C's size would be A->rmap->n by (A->cmap->n + B->cmap->n) */
663 PetscErrorCode MatSeqAIJKokkosMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C)
664 {
665   Mat_SeqAIJ         *a, *b;
666   Mat_SeqAIJKokkos   *akok, *bkok, *ckok;
667   MatScalarKokkosView aa, ba, ca;
668   MatRowMapKokkosView ai, bi, ci;
669   MatColIdxKokkosView aj, bj, cj;
670   PetscInt            m, n, nnz, aN;
671 
672   PetscFunctionBegin;
673   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
674   PetscValidHeaderSpecific(B, MAT_CLASSID, 2);
675   PetscAssertPointer(C, 4);
676   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
677   PetscCheckTypeName(B, MATSEQAIJKOKKOS);
678   PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n);
679   PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported");
680 
681   PetscCall(MatSeqAIJKokkosSyncDevice(A));
682   PetscCall(MatSeqAIJKokkosSyncDevice(B));
683   a    = static_cast<Mat_SeqAIJ *>(A->data);
684   b    = static_cast<Mat_SeqAIJ *>(B->data);
685   akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
686   bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
687   aa   = akok->a_dual.view_device();
688   ai   = akok->i_dual.view_device();
689   ba   = bkok->a_dual.view_device();
690   bi   = bkok->i_dual.view_device();
691   m    = A->rmap->n; /* M, N and nnz of C */
692   n    = A->cmap->n + B->cmap->n;
693   nnz  = a->nz + b->nz;
694   aN   = A->cmap->n; /* N of A */
695   if (reuse == MAT_INITIAL_MATRIX) {
696     aj           = akok->j_dual.view_device();
697     bj           = bkok->j_dual.view_device();
698     auto ca_dual = MatScalarKokkosDualView("a", aa.extent(0) + ba.extent(0));
699     auto ci_dual = MatRowMapKokkosDualView("i", ai.extent(0));
700     auto cj_dual = MatColIdxKokkosDualView("j", aj.extent(0) + bj.extent(0));
701     ca           = ca_dual.view_device();
702     ci           = ci_dual.view_device();
703     cj           = cj_dual.view_device();
704 
705     /* Concatenate A and B in parallel using Kokkos hierarchical parallelism */
706     Kokkos::parallel_for(
707       Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
708         PetscInt i       = t.league_rank(); /* row i */
709         PetscInt coffset = ai(i) + bi(i), alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i);
710 
711         Kokkos::single(Kokkos::PerTeam(t), [=]() { /* this side effect only happens once per whole team */
712                                                    ci(i) = coffset;
713                                                    if (i == m - 1) ci(m) = ai(m) + bi(m);
714         });
715 
716         Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) {
717           if (k < alen) {
718             ca(coffset + k) = aa(ai(i) + k);
719             cj(coffset + k) = aj(ai(i) + k);
720           } else {
721             ca(coffset + k) = ba(bi(i) + k - alen);
722             cj(coffset + k) = bj(bi(i) + k - alen) + aN; /* Entries in B get new column indices in C */
723           }
724         });
725       });
726     ca_dual.modify_device();
727     ci_dual.modify_device();
728     cj_dual.modify_device();
729     PetscCallCXX(ckok = new Mat_SeqAIJKokkos(m, n, nnz, ci_dual, cj_dual, ca_dual));
730     PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, ckok, C));
731   } else if (reuse == MAT_REUSE_MATRIX) {
732     PetscValidHeaderSpecific(*C, MAT_CLASSID, 4);
733     PetscCheckTypeName(*C, MATSEQAIJKOKKOS);
734     ckok = static_cast<Mat_SeqAIJKokkos *>((*C)->spptr);
735     ca   = ckok->a_dual.view_device();
736     ci   = ckok->i_dual.view_device();
737 
738     Kokkos::parallel_for(
739       Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
740         PetscInt i    = t.league_rank(); /* row i */
741         PetscInt alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i);
742         Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) {
743           if (k < alen) ca(ci(i) + k) = aa(ai(i) + k);
744           else ca(ci(i) + k) = ba(bi(i) + k - alen);
745         });
746       });
747     PetscCall(MatSeqAIJKokkosModifyDevice(*C));
748   }
749   PetscFunctionReturn(PETSC_SUCCESS);
750 }
751 
752 static PetscErrorCode MatProductDataDestroy_SeqAIJKokkos(void *pdata)
753 {
754   PetscFunctionBegin;
755   delete static_cast<MatProductData_SeqAIJKokkos *>(pdata);
756   PetscFunctionReturn(PETSC_SUCCESS);
757 }
758 
759 static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C)
760 {
761   Mat_Product                 *product = C->product;
762   Mat                          A, B;
763   bool                         transA, transB; /* use bool, since KK needs this type */
764   Mat_SeqAIJKokkos            *akok, *bkok, *ckok;
765   Mat_SeqAIJ                  *c;
766   MatProductData_SeqAIJKokkos *pdata;
767   KokkosCsrMatrix              csrmatA, csrmatB;
768 
769   PetscFunctionBegin;
770   MatCheckProduct(C, 1);
771   PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Product data empty");
772   pdata = static_cast<MatProductData_SeqAIJKokkos *>(C->product->data);
773 
774   // See if numeric has already been done in symbolic (e.g., user calls MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C)).
775   // If yes, skip the numeric, but reset the flag so that next time when user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C),
776   // we still do numeric.
777   if (pdata->reusesym) { // numeric reuses results from symbolic
778     pdata->reusesym = PETSC_FALSE;
779     PetscFunctionReturn(PETSC_SUCCESS);
780   }
781 
782   switch (product->type) {
783   case MATPRODUCT_AB:
784     transA = false;
785     transB = false;
786     break;
787   case MATPRODUCT_AtB:
788     transA = true;
789     transB = false;
790     break;
791   case MATPRODUCT_ABt:
792     transA = false;
793     transB = true;
794     break;
795   default:
796     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]);
797   }
798 
799   A = product->A;
800   B = product->B;
801   PetscCall(MatSeqAIJKokkosSyncDevice(A));
802   PetscCall(MatSeqAIJKokkosSyncDevice(B));
803   akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
804   bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
805   ckok = static_cast<Mat_SeqAIJKokkos *>(C->spptr);
806 
807   PetscCheck(ckok, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Device data structure spptr is empty");
808 
809   csrmatA = akok->csrmat;
810   csrmatB = bkok->csrmat;
811 
812   /* TODO: Once KK spgemm implements transpose, we can get rid of the explicit transpose here */
813   if (transA) {
814     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA));
815     transA = false;
816   }
817 
818   if (transB) {
819     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB));
820     transB = false;
821   }
822   PetscCall(PetscLogGpuTimeBegin());
823   PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, csrmatA, transA, csrmatB, transB, ckok->csrmat));
824 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(4, 0, 0)
825   auto spgemmHandle = pdata->kh.get_spgemm_handle();
826   if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(ckok->csrmat)); /* without sort, mat_tests-ex62_14_seqaijkokkos fails */
827 #endif
828 
829   PetscCall(PetscLogGpuTimeEnd());
830   PetscCall(MatSeqAIJKokkosModifyDevice(C));
831   /* shorter version of MatAssemblyEnd_SeqAIJ */
832   c = (Mat_SeqAIJ *)C->data;
833   PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz));
834   PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n"));
835   PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax));
836   c->reallocs         = 0;
837   C->info.mallocs     = 0;
838   C->info.nz_unneeded = 0;
839   C->assembled = C->was_assembled = PETSC_TRUE;
840   C->num_ass++;
841   PetscFunctionReturn(PETSC_SUCCESS);
842 }
843 
844 static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C)
845 {
846   Mat_Product                 *product = C->product;
847   MatProductType               ptype;
848   Mat                          A, B;
849   bool                         transA, transB;
850   Mat_SeqAIJKokkos            *akok, *bkok, *ckok;
851   MatProductData_SeqAIJKokkos *pdata;
852   MPI_Comm                     comm;
853   KokkosCsrMatrix              csrmatA, csrmatB, csrmatC;
854 
855   PetscFunctionBegin;
856   MatCheckProduct(C, 1);
857   PetscCall(PetscObjectGetComm((PetscObject)C, &comm));
858   PetscCheck(!product->data, comm, PETSC_ERR_PLIB, "Product data not empty");
859   A = product->A;
860   B = product->B;
861   PetscCall(MatSeqAIJKokkosSyncDevice(A));
862   PetscCall(MatSeqAIJKokkosSyncDevice(B));
863   akok    = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
864   bkok    = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
865   csrmatA = akok->csrmat;
866   csrmatB = bkok->csrmat;
867 
868   ptype = product->type;
869   // Take advantage of the symmetry if true
870   if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
871     ptype                                          = MATPRODUCT_AB;
872     product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE;
873   }
874   if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
875     ptype                                          = MATPRODUCT_AB;
876     product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE;
877   }
878 
879   switch (ptype) {
880   case MATPRODUCT_AB:
881     transA = false;
882     transB = false;
883     break;
884   case MATPRODUCT_AtB:
885     transA = true;
886     transB = false;
887     break;
888   case MATPRODUCT_ABt:
889     transA = false;
890     transB = true;
891     break;
892   default:
893     SETERRQ(comm, PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]);
894   }
895   PetscCallCXX(product->data = pdata = new MatProductData_SeqAIJKokkos());
896   pdata->reusesym = product->api_user;
897 
898   /* TODO: add command line options to select spgemm algorithms */
899   auto spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_DEFAULT; /* default alg is TPL if enabled, otherwise KK */
900 
901   /* CUDA-10.2's spgemm has bugs. We prefer the SpGEMMreuse APIs introduced in cuda-11.4 */
902 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
903   #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
904   spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_KK;
905   #endif
906 #endif
907   PetscCallCXX(pdata->kh.create_spgemm_handle(spgemm_alg));
908 
909   PetscCall(PetscLogGpuTimeBegin());
910   /* TODO: Get rid of the explicit transpose once KK-spgemm implements the transpose option */
911   if (transA) {
912     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA));
913     transA = false;
914   }
915 
916   if (transB) {
917     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB));
918     transB = false;
919   }
920 
921   PetscCallCXX(KokkosSparse::spgemm_symbolic(pdata->kh, csrmatA, transA, csrmatB, transB, csrmatC));
922   /* spgemm_symbolic() only populates C's rowmap, but not C's column indices.
923     So we have to do a fake spgemm_numeric() here to get csrmatC.j_d setup, before
924     calling new Mat_SeqAIJKokkos().
925     TODO: Remove the fake spgemm_numeric() after KK fixed this problem.
926   */
927   PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, csrmatA, transA, csrmatB, transB, csrmatC));
928 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(4, 0, 0)
929   /* Query if KK outputs a sorted matrix. If not, we need to sort it */
930   auto spgemmHandle = pdata->kh.get_spgemm_handle();
931   if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(csrmatC)); /* sort_option defaults to -1 in KK!*/
932 #endif
933   PetscCall(PetscLogGpuTimeEnd());
934 
935   PetscCallCXX(ckok = new Mat_SeqAIJKokkos(csrmatC));
936   PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(C, ckok));
937   C->product->destroy = MatProductDataDestroy_SeqAIJKokkos;
938   PetscFunctionReturn(PETSC_SUCCESS);
939 }
940 
941 /* handles sparse matrix matrix ops */
942 static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat)
943 {
944   Mat_Product *product = mat->product;
945   PetscBool    Biskok = PETSC_FALSE, Ciskok = PETSC_TRUE;
946 
947   PetscFunctionBegin;
948   MatCheckProduct(mat, 1);
949   PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJKOKKOS, &Biskok));
950   if (product->type == MATPRODUCT_ABC) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJKOKKOS, &Ciskok));
951   if (Biskok && Ciskok) {
952     switch (product->type) {
953     case MATPRODUCT_AB:
954     case MATPRODUCT_AtB:
955     case MATPRODUCT_ABt:
956       mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos;
957       break;
958     case MATPRODUCT_PtAP:
959     case MATPRODUCT_RARt:
960     case MATPRODUCT_ABC:
961       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
962       break;
963     default:
964       break;
965     }
966   } else { /* fallback for AIJ */
967     PetscCall(MatProductSetFromOptions_SeqAIJ(mat));
968   }
969   PetscFunctionReturn(PETSC_SUCCESS);
970 }
971 
972 static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a)
973 {
974   Mat_SeqAIJKokkos *aijkok;
975 
976   PetscFunctionBegin;
977   PetscCall(PetscLogGpuTimeBegin());
978   PetscCall(MatSeqAIJKokkosSyncDevice(A));
979   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
980   KokkosBlas::scal(PetscGetKokkosExecutionSpace(), aijkok->a_dual.view_device(), a, aijkok->a_dual.view_device());
981   PetscCall(MatSeqAIJKokkosModifyDevice(A));
982   PetscCall(PetscLogGpuFlops(aijkok->a_dual.extent(0)));
983   PetscCall(PetscLogGpuTimeEnd());
984   PetscFunctionReturn(PETSC_SUCCESS);
985 }
986 
987 // add a to A's diagonal (if A is square) or main diagonal (if A is rectangular)
988 static PetscErrorCode MatShift_SeqAIJKokkos(Mat A, PetscScalar a)
989 {
990   Mat_SeqAIJ *aijseq = static_cast<Mat_SeqAIJ *>(A->data);
991 
992   PetscFunctionBegin;
993   if (A->assembled && aijseq->diagonaldense) { // no missing diagonals
994     PetscInt n = PetscMin(A->rmap->n, A->cmap->n);
995 
996     PetscCall(PetscLogGpuTimeBegin());
997     PetscCall(MatSeqAIJKokkosSyncDevice(A));
998     const auto  aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
999     const auto &Aa     = aijkok->a_dual.view_device();
1000     const auto &Adiag  = aijkok->diag_dual.view_device();
1001     PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) { Aa(Adiag(i)) += a; }));
1002     PetscCall(MatSeqAIJKokkosModifyDevice(A));
1003     PetscCall(PetscLogGpuFlops(n));
1004     PetscCall(PetscLogGpuTimeEnd());
1005   } else { // need reassembly, very slow!
1006     PetscCall(MatShift_Basic(A, a));
1007   }
1008   PetscFunctionReturn(PETSC_SUCCESS);
1009 }
1010 
1011 static PetscErrorCode MatDiagonalSet_SeqAIJKokkos(Mat Y, Vec D, InsertMode is)
1012 {
1013   Mat_SeqAIJ *aijseq = static_cast<Mat_SeqAIJ *>(Y->data);
1014 
1015   PetscFunctionBegin;
1016   if (Y->assembled && aijseq->diagonaldense) { // no missing diagonals
1017     ConstPetscScalarKokkosView dv;
1018     PetscInt                   n, nv;
1019 
1020     PetscCall(PetscLogGpuTimeBegin());
1021     PetscCall(MatSeqAIJKokkosSyncDevice(Y));
1022     PetscCall(VecGetKokkosView(D, &dv));
1023     PetscCall(VecGetLocalSize(D, &nv));
1024     n = PetscMin(Y->rmap->n, Y->cmap->n);
1025     PetscCheck(n == nv, PetscObjectComm((PetscObject)Y), PETSC_ERR_ARG_SIZ, "Matrix size and vector size do not match");
1026 
1027     const auto  aijkok = static_cast<Mat_SeqAIJKokkos *>(Y->spptr);
1028     const auto &Aa     = aijkok->a_dual.view_device();
1029     const auto &Adiag  = aijkok->diag_dual.view_device();
1030     PetscCallCXX(Kokkos::parallel_for(
1031       Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) {
1032         if (is == INSERT_VALUES) Aa(Adiag(i)) = dv(i);
1033         else Aa(Adiag(i)) += dv(i);
1034       }));
1035     PetscCall(VecRestoreKokkosView(D, &dv));
1036     PetscCall(MatSeqAIJKokkosModifyDevice(Y));
1037     PetscCall(PetscLogGpuFlops(n));
1038     PetscCall(PetscLogGpuTimeEnd());
1039   } else { // need reassembly, very slow!
1040     PetscCall(MatDiagonalSet_Default(Y, D, is));
1041   }
1042   PetscFunctionReturn(PETSC_SUCCESS);
1043 }
1044 
1045 static PetscErrorCode MatDiagonalScale_SeqAIJKokkos(Mat A, Vec ll, Vec rr)
1046 {
1047   Mat_SeqAIJ                *aijseq = static_cast<Mat_SeqAIJ *>(A->data);
1048   PetscInt                   m = A->rmap->n, n = A->cmap->n, nz = aijseq->nz;
1049   ConstPetscScalarKokkosView lv, rv;
1050 
1051   PetscFunctionBegin;
1052   PetscCall(PetscLogGpuTimeBegin());
1053   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1054   const auto  aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1055   const auto &Aa     = aijkok->a_dual.view_device();
1056   const auto &Ai     = aijkok->i_dual.view_device();
1057   const auto &Aj     = aijkok->j_dual.view_device();
1058   if (ll) {
1059     PetscCall(VecGetLocalSize(ll, &m));
1060     PetscCheck(m == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling vector wrong length");
1061     PetscCall(VecGetKokkosView(ll, &lv));
1062     PetscCallCXX(Kokkos::parallel_for( // for each row
1063       Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
1064         PetscInt i   = t.league_rank(); // row i
1065         PetscInt len = Ai(i + 1) - Ai(i);
1066         // scale entries on the row
1067         Kokkos::parallel_for(Kokkos::TeamThreadRange(t, len), [&](PetscInt j) { Aa(Ai(i) + j) *= lv(i); });
1068       }));
1069     PetscCall(VecRestoreKokkosView(ll, &lv));
1070     PetscCall(PetscLogGpuFlops(nz));
1071   }
1072   if (rr) {
1073     PetscCall(VecGetLocalSize(rr, &n));
1074     PetscCheck(n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Right scaling vector wrong length");
1075     PetscCall(VecGetKokkosView(rr, &rv));
1076     PetscCallCXX(Kokkos::parallel_for( // for each nonzero
1077       Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt k) { Aa(k) *= rv(Aj(k)); }));
1078     PetscCall(VecRestoreKokkosView(rr, &lv));
1079     PetscCall(PetscLogGpuFlops(nz));
1080   }
1081   PetscCall(MatSeqAIJKokkosModifyDevice(A));
1082   PetscCall(PetscLogGpuTimeEnd());
1083   PetscFunctionReturn(PETSC_SUCCESS);
1084 }
1085 
1086 static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A)
1087 {
1088   Mat_SeqAIJKokkos *aijkok;
1089 
1090   PetscFunctionBegin;
1091   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1092   if (aijkok) { /* Only zero the device if data is already there */
1093     KokkosBlas::fill(PetscGetKokkosExecutionSpace(), aijkok->a_dual.view_device(), 0.0);
1094     PetscCall(MatSeqAIJKokkosModifyDevice(A));
1095   } else { /* Might be preallocated but not assembled */
1096     PetscCall(MatZeroEntries_SeqAIJ(A));
1097   }
1098   PetscFunctionReturn(PETSC_SUCCESS);
1099 }
1100 
1101 static PetscErrorCode MatGetDiagonal_SeqAIJKokkos(Mat A, Vec x)
1102 {
1103   Mat_SeqAIJKokkos     *aijkok;
1104   PetscInt              n;
1105   PetscScalarKokkosView xv;
1106 
1107   PetscFunctionBegin;
1108   PetscCall(VecGetLocalSize(x, &n));
1109   PetscCheck(n == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector");
1110   PetscCheck(A->factortype == MAT_FACTOR_NONE, PETSC_COMM_SELF, PETSC_ERR_SUP, "MatGetDiagonal_SeqAIJKokkos not supported on factored matrices");
1111 
1112   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1113   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1114 
1115   const auto &Aa    = aijkok->a_dual.view_device();
1116   const auto &Ai    = aijkok->i_dual.view_device();
1117   const auto &Adiag = aijkok->diag_dual.view_device();
1118 
1119   PetscCall(VecGetKokkosViewWrite(x, &xv));
1120   Kokkos::parallel_for(
1121     Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) {
1122       if (Adiag(i) < Ai(i + 1)) xv(i) = Aa(Adiag(i));
1123       else xv(i) = 0;
1124     });
1125   PetscCall(VecRestoreKokkosViewWrite(x, &xv));
1126   PetscFunctionReturn(PETSC_SUCCESS);
1127 }
1128 
1129 /* Get a Kokkos View from a mat of type MatSeqAIJKokkos */
1130 PetscErrorCode MatSeqAIJGetKokkosView(Mat A, ConstMatScalarKokkosView *kv)
1131 {
1132   Mat_SeqAIJKokkos *aijkok;
1133 
1134   PetscFunctionBegin;
1135   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1136   PetscAssertPointer(kv, 2);
1137   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1138   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1139   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1140   *kv    = aijkok->a_dual.view_device();
1141   PetscFunctionReturn(PETSC_SUCCESS);
1142 }
1143 
1144 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, ConstMatScalarKokkosView *kv)
1145 {
1146   PetscFunctionBegin;
1147   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1148   PetscAssertPointer(kv, 2);
1149   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1150   PetscFunctionReturn(PETSC_SUCCESS);
1151 }
1152 
1153 PetscErrorCode MatSeqAIJGetKokkosView(Mat A, MatScalarKokkosView *kv)
1154 {
1155   Mat_SeqAIJKokkos *aijkok;
1156 
1157   PetscFunctionBegin;
1158   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1159   PetscAssertPointer(kv, 2);
1160   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1161   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1162   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1163   *kv    = aijkok->a_dual.view_device();
1164   PetscFunctionReturn(PETSC_SUCCESS);
1165 }
1166 
1167 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, MatScalarKokkosView *kv)
1168 {
1169   PetscFunctionBegin;
1170   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1171   PetscAssertPointer(kv, 2);
1172   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1173   PetscCall(MatSeqAIJKokkosModifyDevice(A));
1174   PetscFunctionReturn(PETSC_SUCCESS);
1175 }
1176 
1177 PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A, MatScalarKokkosView *kv)
1178 {
1179   Mat_SeqAIJKokkos *aijkok;
1180 
1181   PetscFunctionBegin;
1182   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1183   PetscAssertPointer(kv, 2);
1184   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1185   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1186   *kv    = aijkok->a_dual.view_device();
1187   PetscFunctionReturn(PETSC_SUCCESS);
1188 }
1189 
1190 PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A, MatScalarKokkosView *kv)
1191 {
1192   PetscFunctionBegin;
1193   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1194   PetscAssertPointer(kv, 2);
1195   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1196   PetscCall(MatSeqAIJKokkosModifyDevice(A));
1197   PetscFunctionReturn(PETSC_SUCCESS);
1198 }
1199 
1200 /* Computes Y += alpha X */
1201 static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y, PetscScalar alpha, Mat X, MatStructure pattern)
1202 {
1203   Mat_SeqAIJ              *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data;
1204   Mat_SeqAIJKokkos        *xkok, *ykok, *zkok;
1205   ConstMatScalarKokkosView Xa;
1206   MatScalarKokkosView      Ya;
1207   auto                    &exec = PetscGetKokkosExecutionSpace();
1208 
1209   PetscFunctionBegin;
1210   PetscCheckTypeName(Y, MATSEQAIJKOKKOS);
1211   PetscCheckTypeName(X, MATSEQAIJKOKKOS);
1212   PetscCall(MatSeqAIJKokkosSyncDevice(Y));
1213   PetscCall(MatSeqAIJKokkosSyncDevice(X));
1214   PetscCall(PetscLogGpuTimeBegin());
1215 
1216   if (pattern != SAME_NONZERO_PATTERN && x->nz == y->nz) {
1217     PetscBool e;
1218     PetscCall(PetscArraycmp(x->i, y->i, Y->rmap->n + 1, &e));
1219     if (e) {
1220       PetscCall(PetscArraycmp(x->j, y->j, y->nz, &e));
1221       if (e) pattern = SAME_NONZERO_PATTERN;
1222     }
1223   }
1224 
1225   /* cusparseDcsrgeam2() computes C = alpha A + beta B. If one knew sparsity pattern of C, one can skip
1226     cusparseScsrgeam2_bufferSizeExt() / cusparseXcsrgeam2Nnz(), and directly call cusparseScsrgeam2().
1227     If X is SUBSET_NONZERO_PATTERN of Y, we could take advantage of this cusparse feature. However,
1228     KokkosSparse::spadd(alpha,A,beta,B,C) has symbolic and numeric phases, MatAXPY does not.
1229   */
1230   ykok = static_cast<Mat_SeqAIJKokkos *>(Y->spptr);
1231   xkok = static_cast<Mat_SeqAIJKokkos *>(X->spptr);
1232   Xa   = xkok->a_dual.view_device();
1233   Ya   = ykok->a_dual.view_device();
1234 
1235   if (pattern == SAME_NONZERO_PATTERN) {
1236     KokkosBlas::axpy(exec, alpha, Xa, Ya);
1237     PetscCall(MatSeqAIJKokkosModifyDevice(Y));
1238   } else if (pattern == SUBSET_NONZERO_PATTERN) {
1239     MatRowMapKokkosView Xi = xkok->i_dual.view_device(), Yi = ykok->i_dual.view_device();
1240     MatColIdxKokkosView Xj = xkok->j_dual.view_device(), Yj = ykok->j_dual.view_device();
1241 
1242     Kokkos::parallel_for(
1243       Kokkos::TeamPolicy<>(exec, Y->rmap->n, 1), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
1244         PetscInt i = t.league_rank(); // row i
1245         Kokkos::single(Kokkos::PerTeam(t), [=]() {
1246           // Only one thread works in a team
1247           PetscInt p, q = Yi(i);
1248           for (p = Xi(i); p < Xi(i + 1); p++) {          // For each nonzero on row i of X,
1249             while (Xj(p) != Yj(q) && q < Yi(i + 1)) q++; // find the matching nonzero on row i of Y.
1250             if (Xj(p) == Yj(q)) {                        // Found it
1251               Ya(q) += alpha * Xa(p);
1252               q++;
1253             } else {
1254             // If not found, it indicates the input is wrong (X is not a SUBSET_NONZERO_PATTERN of Y).
1255             // Just insert a NaN at the beginning of row i if it is not empty, to make the result wrong.
1256 #if PETSC_PKG_KOKKOS_VERSION_GE(3, 7, 0)
1257               if (Yi(i) != Yi(i + 1)) Ya(Yi(i)) = Kokkos::ArithTraits<PetscScalar>::nan();
1258 #else
1259               if (Yi(i) != Yi(i + 1)) Ya(Yi(i)) = Kokkos::Experimental::nan("1");
1260 #endif
1261             }
1262           }
1263         });
1264       });
1265     PetscCall(MatSeqAIJKokkosModifyDevice(Y));
1266   } else { // different nonzero patterns
1267     Mat             Z;
1268     KokkosCsrMatrix zcsr;
1269     KernelHandle    kh;
1270     kh.create_spadd_handle(true); // X, Y are sorted
1271     KokkosSparse::spadd_symbolic(&kh, xkok->csrmat, ykok->csrmat, zcsr);
1272     KokkosSparse::spadd_numeric(&kh, alpha, xkok->csrmat, (PetscScalar)1.0, ykok->csrmat, zcsr);
1273     zkok = new Mat_SeqAIJKokkos(zcsr);
1274     PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, zkok, &Z));
1275     PetscCall(MatHeaderReplace(Y, &Z));
1276     kh.destroy_spadd_handle();
1277   }
1278   PetscCall(PetscLogGpuTimeEnd());
1279   PetscCall(PetscLogGpuFlops(xkok->a_dual.extent(0) * 2)); // Because we scaled X and then added it to Y
1280   PetscFunctionReturn(PETSC_SUCCESS);
1281 }
1282 
1283 struct MatCOOStruct_SeqAIJKokkos {
1284   PetscCount           n;
1285   PetscCount           Atot;
1286   PetscInt             nz;
1287   PetscCountKokkosView jmap;
1288   PetscCountKokkosView perm;
1289 
1290   MatCOOStruct_SeqAIJKokkos(const MatCOOStruct_SeqAIJ *coo_h)
1291   {
1292     nz   = coo_h->nz;
1293     n    = coo_h->n;
1294     Atot = coo_h->Atot;
1295     jmap = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), PetscCountKokkosViewHost(coo_h->jmap, nz + 1));
1296     perm = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), PetscCountKokkosViewHost(coo_h->perm, Atot));
1297   }
1298 };
1299 
1300 static PetscErrorCode MatCOOStructDestroy_SeqAIJKokkos(void *data)
1301 {
1302   PetscFunctionBegin;
1303   PetscCallCXX(delete static_cast<MatCOOStruct_SeqAIJKokkos *>(data));
1304   PetscFunctionReturn(PETSC_SUCCESS);
1305 }
1306 
1307 static PetscErrorCode MatSetPreallocationCOO_SeqAIJKokkos(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
1308 {
1309   Mat_SeqAIJKokkos          *akok;
1310   Mat_SeqAIJ                *aseq;
1311   PetscContainer             container_h, container_d;
1312   MatCOOStruct_SeqAIJ       *coo_h;
1313   MatCOOStruct_SeqAIJKokkos *coo_d;
1314 
1315   PetscFunctionBegin;
1316   PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, coo_i, coo_j));
1317   aseq = static_cast<Mat_SeqAIJ *>(mat->data);
1318   akok = static_cast<Mat_SeqAIJKokkos *>(mat->spptr);
1319   delete akok;
1320   mat->spptr = akok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, aseq, mat->nonzerostate + 1, PETSC_FALSE);
1321   PetscCall(MatZeroEntries_SeqAIJKokkos(mat));
1322 
1323   // Copy the COO struct to device
1324   PetscCall(PetscObjectQuery((PetscObject)mat, "__PETSc_MatCOOStruct_Host", (PetscObject *)&container_h));
1325   PetscCall(PetscContainerGetPointer(container_h, (void **)&coo_h));
1326   PetscCallCXX(coo_d = new MatCOOStruct_SeqAIJKokkos(coo_h));
1327 
1328   // Put the COO struct in a container and then attach that to the matrix
1329   PetscCall(PetscContainerCreate(PETSC_COMM_SELF, &container_d));
1330   PetscCall(PetscContainerSetPointer(container_d, coo_d));
1331   PetscCall(PetscContainerSetUserDestroy(container_d, MatCOOStructDestroy_SeqAIJKokkos));
1332   PetscCall(PetscObjectCompose((PetscObject)mat, "__PETSc_MatCOOStruct_Device", (PetscObject)container_d));
1333   PetscCall(PetscContainerDestroy(&container_d));
1334   PetscFunctionReturn(PETSC_SUCCESS);
1335 }
1336 
1337 static PetscErrorCode MatSetValuesCOO_SeqAIJKokkos(Mat A, const PetscScalar v[], InsertMode imode)
1338 {
1339   MatScalarKokkosView        Aa;
1340   ConstMatScalarKokkosView   kv;
1341   PetscMemType               memtype;
1342   PetscContainer             container;
1343   MatCOOStruct_SeqAIJKokkos *coo;
1344 
1345   PetscFunctionBegin;
1346   PetscCall(PetscObjectQuery((PetscObject)A, "__PETSc_MatCOOStruct_Device", (PetscObject *)&container));
1347   PetscCall(PetscContainerGetPointer(container, (void **)&coo));
1348 
1349   const auto &n    = coo->n;
1350   const auto &Annz = coo->nz;
1351   const auto &jmap = coo->jmap;
1352   const auto &perm = coo->perm;
1353 
1354   PetscCall(PetscGetMemType(v, &memtype));
1355   if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */
1356     kv = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), ConstMatScalarKokkosViewHost(v, n));
1357   } else {
1358     kv = ConstMatScalarKokkosView(v, n); /* Directly use v[]'s memory */
1359   }
1360 
1361   if (imode == INSERT_VALUES) PetscCall(MatSeqAIJGetKokkosViewWrite(A, &Aa)); /* write matrix values */
1362   else PetscCall(MatSeqAIJGetKokkosView(A, &Aa));                             /* read & write matrix values */
1363 
1364   PetscCall(PetscLogGpuTimeBegin());
1365   Kokkos::parallel_for(
1366     Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, Annz), KOKKOS_LAMBDA(const PetscCount i) {
1367       PetscScalar sum = 0.0;
1368       for (PetscCount k = jmap(i); k < jmap(i + 1); k++) sum += kv(perm(k));
1369       Aa(i) = (imode == INSERT_VALUES ? 0.0 : Aa(i)) + sum;
1370     });
1371   PetscCall(PetscLogGpuTimeEnd());
1372 
1373   if (imode == INSERT_VALUES) PetscCall(MatSeqAIJRestoreKokkosViewWrite(A, &Aa));
1374   else PetscCall(MatSeqAIJRestoreKokkosView(A, &Aa));
1375   PetscFunctionReturn(PETSC_SUCCESS);
1376 }
1377 
1378 static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info)
1379 {
1380   PetscFunctionBegin;
1381   PetscCall(MatSeqAIJKokkosSyncHost(A));
1382   PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info));
1383   B->offloadmask = PETSC_OFFLOAD_CPU;
1384   PetscFunctionReturn(PETSC_SUCCESS);
1385 }
1386 
1387 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A)
1388 {
1389   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
1390 
1391   PetscFunctionBegin;
1392   A->offloadmask = PETSC_OFFLOAD_KOKKOS; /* We do not really use this flag */
1393   A->boundtocpu  = PETSC_FALSE;
1394 
1395   A->ops->assemblyend               = MatAssemblyEnd_SeqAIJKokkos;
1396   A->ops->destroy                   = MatDestroy_SeqAIJKokkos;
1397   A->ops->duplicate                 = MatDuplicate_SeqAIJKokkos;
1398   A->ops->axpy                      = MatAXPY_SeqAIJKokkos;
1399   A->ops->scale                     = MatScale_SeqAIJKokkos;
1400   A->ops->zeroentries               = MatZeroEntries_SeqAIJKokkos;
1401   A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJKokkos;
1402   A->ops->mult                      = MatMult_SeqAIJKokkos;
1403   A->ops->multadd                   = MatMultAdd_SeqAIJKokkos;
1404   A->ops->multtranspose             = MatMultTranspose_SeqAIJKokkos;
1405   A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJKokkos;
1406   A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJKokkos;
1407   A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos;
1408   A->ops->productnumeric            = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos;
1409   A->ops->transpose                 = MatTranspose_SeqAIJKokkos;
1410   A->ops->setoption                 = MatSetOption_SeqAIJKokkos;
1411   A->ops->getdiagonal               = MatGetDiagonal_SeqAIJKokkos;
1412   A->ops->shift                     = MatShift_SeqAIJKokkos;
1413   A->ops->diagonalset               = MatDiagonalSet_SeqAIJKokkos;
1414   A->ops->diagonalscale             = MatDiagonalScale_SeqAIJKokkos;
1415   a->ops->getarray                  = MatSeqAIJGetArray_SeqAIJKokkos;
1416   a->ops->restorearray              = MatSeqAIJRestoreArray_SeqAIJKokkos;
1417   a->ops->getarrayread              = MatSeqAIJGetArrayRead_SeqAIJKokkos;
1418   a->ops->restorearrayread          = MatSeqAIJRestoreArrayRead_SeqAIJKokkos;
1419   a->ops->getarraywrite             = MatSeqAIJGetArrayWrite_SeqAIJKokkos;
1420   a->ops->restorearraywrite         = MatSeqAIJRestoreArrayWrite_SeqAIJKokkos;
1421   a->ops->getcsrandmemtype          = MatSeqAIJGetCSRAndMemType_SeqAIJKokkos;
1422 
1423   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJKokkos));
1424   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJKokkos));
1425   PetscFunctionReturn(PETSC_SUCCESS);
1426 }
1427 
1428 /*
1429    Extract the (prescribled) diagonal blocks of the matrix and then invert them
1430 
1431   Input Parameters:
1432 +  A       - the MATSEQAIJKOKKOS matrix
1433 .  bs      - block sizes in 'csr' format, i.e., the i-th block has size bs(i+1) - bs(i)
1434 .  bs2     - square of block sizes in 'csr' format, i.e., the i-th block should be stored at offset bs2(i) in diagVal[]
1435 .  blkMap  - map row ids to block ids, i.e., row i belongs to the block blkMap(i)
1436 -  work    - a pre-allocated work buffer (as big as diagVal) for use by this routine
1437 
1438   Output Parameter:
1439 .  diagVal - the (pre-allocated) buffer to store the inverted blocks (each block is stored in column-major order)
1440 */
1441 PETSC_INTERN PetscErrorCode MatInvertVariableBlockDiagonal_SeqAIJKokkos(Mat A, const PetscIntKokkosView &bs, const PetscIntKokkosView &bs2, const PetscIntKokkosView &blkMap, PetscScalarKokkosView &work, PetscScalarKokkosView &diagVal)
1442 {
1443   Mat_SeqAIJKokkos *akok    = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1444   PetscInt          nblocks = bs.extent(0) - 1;
1445 
1446   PetscFunctionBegin;
1447   PetscCall(MatSeqAIJKokkosSyncDevice(A)); // Since we'll access A's value on device
1448 
1449   // Pull out the diagonal blocks of the matrix and then invert the blocks
1450   auto Aa    = akok->a_dual.view_device();
1451   auto Ai    = akok->i_dual.view_device();
1452   auto Aj    = akok->j_dual.view_device();
1453   auto Adiag = akok->diag_dual.view_device();
1454   // TODO: how to tune the team size?
1455 #if defined(KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_HOST)
1456   auto ts = Kokkos::AUTO();
1457 #else
1458   auto ts = 16; // improved performance 30% over Kokkos::AUTO() with CUDA, but failed with "Kokkos::abort: Requested Team Size is too large!" on CPUs
1459 #endif
1460   PetscCallCXX(Kokkos::parallel_for(
1461     Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), nblocks, ts), KOKKOS_LAMBDA(const KokkosTeamMemberType &teamMember) {
1462       const PetscInt bid    = teamMember.league_rank();                                                   // block id
1463       const PetscInt rstart = bs(bid);                                                                    // this block starts from this row
1464       const PetscInt m      = bs(bid + 1) - bs(bid);                                                      // size of this block
1465       const auto    &B      = Kokkos::View<PetscScalar **, Kokkos::LayoutLeft>(&diagVal(bs2(bid)), m, m); // column-major order
1466       const auto    &W      = PetscScalarKokkosView(&work(bs2(bid)), m * m);
1467 
1468       Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, m), [=](const PetscInt &r) { // r-th row in B
1469         PetscInt i = rstart + r;                                                            // i-th row in A
1470 
1471         if (Ai(i) <= Adiag(i) && Adiag(i) < Ai(i + 1)) { // if the diagonal exists (common case)
1472           PetscInt first = Adiag(i) - r;                 // we start to check nonzeros from here along this row
1473 
1474           for (PetscInt c = 0; c < m; c++) {                   // walk n steps to see what column indices we will meet
1475             if (first + c < Ai(i) || first + c >= Ai(i + 1)) { // this entry (first+c) is out of range of this row, in other words, its value is zero
1476               B(r, c) = 0.0;
1477             } else if (Aj(first + c) == rstart + c) { // this entry is right on the (rstart+c) column
1478               B(r, c) = Aa(first + c);
1479             } else { // this entry does not show up in the CSR
1480               B(r, c) = 0.0;
1481             }
1482           }
1483         } else { // rare case that the diagonal does not exist
1484           const PetscInt begin = Ai(i);
1485           const PetscInt end   = Ai(i + 1);
1486           for (PetscInt c = 0; c < m; c++) B(r, c) = 0.0;
1487           for (PetscInt j = begin; j < end; j++) { // scan the whole row; could use binary search but this is a rare case so we did not.
1488             if (rstart <= Aj(j) && Aj(j) < rstart + m) B(r, Aj(j) - rstart) = Aa(j);
1489             else if (Aj(j) >= rstart + m) break;
1490           }
1491         }
1492       });
1493 
1494       // LU-decompose B (w/o pivoting) and then invert B
1495       KokkosBatched::TeamLU<KokkosTeamMemberType, KokkosBatched::Algo::LU::Unblocked>::invoke(teamMember, B, 0.0);
1496       KokkosBatched::TeamInverseLU<KokkosTeamMemberType, KokkosBatched::Algo::InverseLU::Unblocked>::invoke(teamMember, B, W);
1497     }));
1498   // PetscLogGpuFlops() is done in the caller PCSetUp_VPBJacobi_Kokkos as we don't want to compute the flops in kernels
1499   PetscFunctionReturn(PETSC_SUCCESS);
1500 }
1501 
1502 PETSC_INTERN PetscErrorCode MatSetSeqAIJKokkosWithCSRMatrix(Mat A, Mat_SeqAIJKokkos *akok)
1503 {
1504   Mat_SeqAIJ *aseq;
1505   PetscInt    i, m, n;
1506   auto       &exec = PetscGetKokkosExecutionSpace();
1507 
1508   PetscFunctionBegin;
1509   PetscCheck(!A->spptr, PETSC_COMM_SELF, PETSC_ERR_PLIB, "A->spptr is supposed to be empty");
1510 
1511   m = akok->nrows();
1512   n = akok->ncols();
1513   PetscCall(MatSetSizes(A, m, n, m, n));
1514   PetscCall(MatSetType(A, MATSEQAIJKOKKOS));
1515 
1516   /* Set up data structures of A as a MATSEQAIJ */
1517   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(A, MAT_SKIP_ALLOCATION, NULL));
1518   aseq = (Mat_SeqAIJ *)(A)->data;
1519 
1520   PetscCallCXX(akok->i_dual.sync_host(exec)); /* We always need sync'ed i, j on host */
1521   PetscCallCXX(akok->j_dual.sync_host(exec));
1522   PetscCallCXX(exec.fence());
1523 
1524   aseq->i       = akok->i_host_data();
1525   aseq->j       = akok->j_host_data();
1526   aseq->a       = akok->a_host_data();
1527   aseq->nonew   = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/
1528   aseq->free_a  = PETSC_FALSE;
1529   aseq->free_ij = PETSC_FALSE;
1530   aseq->nz      = akok->nnz();
1531   aseq->maxnz   = aseq->nz;
1532 
1533   PetscCall(PetscMalloc1(m, &aseq->imax));
1534   PetscCall(PetscMalloc1(m, &aseq->ilen));
1535   for (i = 0; i < m; i++) aseq->ilen[i] = aseq->imax[i] = aseq->i[i + 1] - aseq->i[i];
1536 
1537   /* It is critical to set the nonzerostate, as we use it to check if sparsity pattern (hence data) has changed on host in MatAssemblyEnd */
1538   akok->nonzerostate = A->nonzerostate;
1539   A->spptr           = akok; /* Set A->spptr before MatAssembly so that A->spptr won't be allocated again there */
1540   PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY));
1541   PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY));
1542   PetscFunctionReturn(PETSC_SUCCESS);
1543 }
1544 
1545 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGetKokkosCsrMatrix(Mat A, KokkosCsrMatrix *csr)
1546 {
1547   PetscFunctionBegin;
1548   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1549   *csr = static_cast<Mat_SeqAIJKokkos *>(A->spptr)->csrmat;
1550   PetscFunctionReturn(PETSC_SUCCESS);
1551 }
1552 
1553 PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithKokkosCsrMatrix(MPI_Comm comm, KokkosCsrMatrix csr, Mat *A)
1554 {
1555   Mat_SeqAIJKokkos *akok;
1556 
1557   PetscFunctionBegin;
1558   PetscCallCXX(akok = new Mat_SeqAIJKokkos(csr));
1559   PetscCall(MatCreate(comm, A));
1560   PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok));
1561   PetscFunctionReturn(PETSC_SUCCESS);
1562 }
1563 
1564 /* Crete a SEQAIJKOKKOS matrix with a Mat_SeqAIJKokkos data structure
1565 
1566    Note we have names like MatSeqAIJSetPreallocationCSR, so I use capitalized CSR
1567  */
1568 PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithCSRMatrix(MPI_Comm comm, Mat_SeqAIJKokkos *akok, Mat *A)
1569 {
1570   PetscFunctionBegin;
1571   PetscCall(MatCreate(comm, A));
1572   PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok));
1573   PetscFunctionReturn(PETSC_SUCCESS);
1574 }
1575 
1576 /*@C
1577   MatCreateSeqAIJKokkos - Creates a sparse matrix in `MATSEQAIJKOKKOS` (compressed row) format
1578   (the default parallel PETSc format). This matrix will ultimately be handled by
1579   Kokkos for calculations.
1580 
1581   Collective
1582 
1583   Input Parameters:
1584 + comm - MPI communicator, set to `PETSC_COMM_SELF`
1585 . m    - number of rows
1586 . n    - number of columns
1587 . nz   - number of nonzeros per row (same for all rows), ignored if `nnz` is provided
1588 - nnz  - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL`
1589 
1590   Output Parameter:
1591 . A - the matrix
1592 
1593   Level: intermediate
1594 
1595   Notes:
1596   It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
1597   MatXXXXSetPreallocation() paradgm instead of this routine directly.
1598   [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]
1599 
1600   The AIJ format, also called
1601   compressed row storage, is fully compatible with standard Fortran
1602   storage.  That is, the stored row and column indices can begin at
1603   either one (as in Fortran) or zero.
1604 
1605   Specify the preallocated storage with either `nz` or `nnz` (not both).
1606   Set `nz` = `PETSC_DEFAULT` and `nnz` = `NULL` for PETSc to control dynamic memory
1607   allocation.
1608 
1609 .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`
1610 @*/
1611 PetscErrorCode MatCreateSeqAIJKokkos(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A)
1612 {
1613   PetscFunctionBegin;
1614   PetscCall(PetscKokkosInitializeCheck());
1615   PetscCall(MatCreate(comm, A));
1616   PetscCall(MatSetSizes(*A, m, n, m, n));
1617   PetscCall(MatSetType(*A, MATSEQAIJKOKKOS));
1618   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz));
1619   PetscFunctionReturn(PETSC_SUCCESS);
1620 }
1621 
1622 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
1623 {
1624   PetscFunctionBegin;
1625   PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
1626   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos;
1627   PetscFunctionReturn(PETSC_SUCCESS);
1628 }
1629 
1630 static PetscErrorCode MatSeqAIJKokkosSymbolicSolveCheck(Mat A)
1631 {
1632   Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1633 
1634   PetscFunctionBegin;
1635   if (!factors->sptrsv_symbolic_completed) {
1636     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khU, factors->iU_d, factors->jU_d, factors->aU_d);
1637     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khL, factors->iL_d, factors->jL_d, factors->aL_d);
1638     factors->sptrsv_symbolic_completed = PETSC_TRUE;
1639   }
1640   PetscFunctionReturn(PETSC_SUCCESS);
1641 }
1642 
1643 /* Check if we need to update factors etc for transpose solve */
1644 static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A)
1645 {
1646   Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1647   MatColIdxType               n       = A->rmap->n;
1648 
1649   PetscFunctionBegin;
1650   if (!factors->transpose_updated) { /* TODO: KK needs to provide functions to do numeric transpose only */
1651     /* Update L^T and do sptrsv symbolic */
1652     factors->iLt_d = MatRowMapKokkosView("factors->iLt_d", n + 1); // KK requires 0
1653     factors->jLt_d = MatColIdxKokkosView(NoInit("factors->jLt_d"), factors->jL_d.extent(0));
1654     factors->aLt_d = MatScalarKokkosView(NoInit("factors->aLt_d"), factors->aL_d.extent(0));
1655 
1656     transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iL_d, factors->jL_d, factors->aL_d,
1657                                                                                                                                                                                                               factors->iLt_d, factors->jLt_d, factors->aLt_d);
1658 
1659     /* TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices.
1660       We have to sort the indices, until KK provides finer control options.
1661     */
1662     sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iLt_d, factors->jLt_d, factors->aLt_d);
1663 
1664     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d);
1665 
1666     /* Update U^T and do sptrsv symbolic */
1667     factors->iUt_d = MatRowMapKokkosView("factors->iUt_d", n + 1); // KK requires 0
1668     factors->jUt_d = MatColIdxKokkosView(NoInit("factors->jUt_d"), factors->jU_d.extent(0));
1669     factors->aUt_d = MatScalarKokkosView(NoInit("factors->aUt_d"), factors->aU_d.extent(0));
1670 
1671     transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iU_d, factors->jU_d, factors->aU_d,
1672                                                                                                                                                                                                               factors->iUt_d, factors->jUt_d, factors->aUt_d);
1673 
1674     /* Sort indices. See comments above */
1675     sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iUt_d, factors->jUt_d, factors->aUt_d);
1676 
1677     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d);
1678     factors->transpose_updated = PETSC_TRUE;
1679   }
1680   PetscFunctionReturn(PETSC_SUCCESS);
1681 }
1682 
1683 /* Solve Ax = b, with A = LU */
1684 static PetscErrorCode MatSolve_SeqAIJKokkos(Mat A, Vec b, Vec x)
1685 {
1686   ConstPetscScalarKokkosView  bv;
1687   PetscScalarKokkosView       xv;
1688   Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1689 
1690   PetscFunctionBegin;
1691   PetscCall(PetscLogGpuTimeBegin());
1692   PetscCall(MatSeqAIJKokkosSymbolicSolveCheck(A));
1693   PetscCall(VecGetKokkosView(b, &bv));
1694   PetscCall(VecGetKokkosViewWrite(x, &xv));
1695   /* Solve L tmpv = b */
1696   PetscCallCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khL, factors->iL_d, factors->jL_d, factors->aL_d, bv, factors->workVector));
1697   /* Solve Ux = tmpv */
1698   PetscCallCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khU, factors->iU_d, factors->jU_d, factors->aU_d, factors->workVector, xv));
1699   PetscCall(VecRestoreKokkosView(b, &bv));
1700   PetscCall(VecRestoreKokkosViewWrite(x, &xv));
1701   PetscCall(PetscLogGpuTimeEnd());
1702   PetscFunctionReturn(PETSC_SUCCESS);
1703 }
1704 
1705 /* Solve A^T x = b, where A^T = U^T L^T */
1706 static PetscErrorCode MatSolveTranspose_SeqAIJKokkos(Mat A, Vec b, Vec x)
1707 {
1708   ConstPetscScalarKokkosView  bv;
1709   PetscScalarKokkosView       xv;
1710   Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1711 
1712   PetscFunctionBegin;
1713   PetscCall(PetscLogGpuTimeBegin());
1714   PetscCall(MatSeqAIJKokkosTransposeSolveCheck(A));
1715   PetscCall(VecGetKokkosView(b, &bv));
1716   PetscCall(VecGetKokkosViewWrite(x, &xv));
1717   /* Solve U^T tmpv = b */
1718   KokkosSparse::Experimental::sptrsv_solve(&factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d, bv, factors->workVector);
1719 
1720   /* Solve L^T x = tmpv */
1721   KokkosSparse::Experimental::sptrsv_solve(&factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d, factors->workVector, xv);
1722   PetscCall(VecRestoreKokkosView(b, &bv));
1723   PetscCall(VecRestoreKokkosViewWrite(x, &xv));
1724   PetscCall(PetscLogGpuTimeEnd());
1725   PetscFunctionReturn(PETSC_SUCCESS);
1726 }
1727 
1728 static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info)
1729 {
1730   Mat_SeqAIJKokkos           *aijkok   = (Mat_SeqAIJKokkos *)A->spptr;
1731   Mat_SeqAIJKokkosTriFactors *factors  = (Mat_SeqAIJKokkosTriFactors *)B->spptr;
1732   PetscInt                    fill_lev = info->levels;
1733 
1734   PetscFunctionBegin;
1735   PetscCall(PetscLogGpuTimeBegin());
1736   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1737 
1738   auto a_d = aijkok->a_dual.view_device();
1739   auto i_d = aijkok->i_dual.view_device();
1740   auto j_d = aijkok->j_dual.view_device();
1741 
1742   KokkosSparse::Experimental::spiluk_numeric(&factors->kh, fill_lev, i_d, j_d, a_d, factors->iL_d, factors->jL_d, factors->aL_d, factors->iU_d, factors->jU_d, factors->aU_d);
1743 
1744   B->assembled              = PETSC_TRUE;
1745   B->preallocated           = PETSC_TRUE;
1746   B->ops->solve             = MatSolve_SeqAIJKokkos;
1747   B->ops->solvetranspose    = MatSolveTranspose_SeqAIJKokkos;
1748   B->ops->matsolve          = NULL;
1749   B->ops->matsolvetranspose = NULL;
1750   B->offloadmask            = PETSC_OFFLOAD_GPU;
1751 
1752   /* Once the factors' value changed, we need to update their transpose and sptrsv handle */
1753   factors->transpose_updated         = PETSC_FALSE;
1754   factors->sptrsv_symbolic_completed = PETSC_FALSE;
1755   /* TODO: log flops, but how to know that? */
1756   PetscCall(PetscLogGpuTimeEnd());
1757   PetscFunctionReturn(PETSC_SUCCESS);
1758 }
1759 
1760 static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
1761 {
1762   Mat_SeqAIJKokkos           *aijkok;
1763   Mat_SeqAIJ                 *b;
1764   Mat_SeqAIJKokkosTriFactors *factors  = (Mat_SeqAIJKokkosTriFactors *)B->spptr;
1765   PetscInt                    fill_lev = info->levels;
1766   PetscInt                    nnzA     = ((Mat_SeqAIJ *)A->data)->nz, nnzL, nnzU;
1767   PetscInt                    n        = A->rmap->n;
1768 
1769   PetscFunctionBegin;
1770   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1771   /* Rebuild factors */
1772   if (factors) {
1773     factors->Destroy();
1774   } /* Destroy the old if it exists */
1775   else {
1776     B->spptr = factors = new Mat_SeqAIJKokkosTriFactors(n);
1777   }
1778 
1779   /* Create a spiluk handle and then do symbolic factorization */
1780   nnzL = nnzU = PetscRealIntMultTruncate(info->fill, nnzA);
1781   factors->kh.create_spiluk_handle(KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1, n, nnzL, nnzU);
1782 
1783   auto spiluk_handle = factors->kh.get_spiluk_handle();
1784 
1785   Kokkos::realloc(factors->iL_d, n + 1); /* Free old arrays and realloc */
1786   Kokkos::realloc(factors->jL_d, spiluk_handle->get_nnzL());
1787   Kokkos::realloc(factors->iU_d, n + 1);
1788   Kokkos::realloc(factors->jU_d, spiluk_handle->get_nnzU());
1789 
1790   aijkok   = (Mat_SeqAIJKokkos *)A->spptr;
1791   auto i_d = aijkok->i_dual.view_device();
1792   auto j_d = aijkok->j_dual.view_device();
1793   KokkosSparse::Experimental::spiluk_symbolic(&factors->kh, fill_lev, i_d, j_d, factors->iL_d, factors->jL_d, factors->iU_d, factors->jU_d);
1794   /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */
1795 
1796   Kokkos::resize(factors->jL_d, spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */
1797   Kokkos::resize(factors->jU_d, spiluk_handle->get_nnzU());
1798   Kokkos::realloc(factors->aL_d, spiluk_handle->get_nnzL()); /* No need to retain old value */
1799   Kokkos::realloc(factors->aU_d, spiluk_handle->get_nnzU());
1800 
1801   /* TODO: add options to select sptrsv algorithms */
1802   /* Create sptrsv handles for L, U and their transpose */
1803 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
1804   auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SPTRSV_CUSPARSE;
1805 #else
1806   auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1;
1807 #endif
1808 
1809   factors->khL.create_sptrsv_handle(sptrsv_alg, n, true /* L is lower tri */);
1810   factors->khU.create_sptrsv_handle(sptrsv_alg, n, false /* U is not lower tri */);
1811   factors->khLt.create_sptrsv_handle(sptrsv_alg, n, false /* L^T is not lower tri */);
1812   factors->khUt.create_sptrsv_handle(sptrsv_alg, n, true /* U^T is lower tri */);
1813 
1814   /* Fill fields of the factor matrix B */
1815   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(B, MAT_SKIP_ALLOCATION, NULL));
1816   b     = (Mat_SeqAIJ *)B->data;
1817   b->nz = b->maxnz          = spiluk_handle->get_nnzL() + spiluk_handle->get_nnzU();
1818   B->info.fill_ratio_given  = info->fill;
1819   B->info.fill_ratio_needed = nnzA > 0 ? ((PetscReal)b->nz) / ((PetscReal)nnzA) : 1.0;
1820 
1821   B->offloadmask          = PETSC_OFFLOAD_GPU;
1822   B->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJKokkos;
1823   PetscFunctionReturn(PETSC_SUCCESS);
1824 }
1825 
1826 static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos(Mat A, MatSolverType *type)
1827 {
1828   PetscFunctionBegin;
1829   *type = MATSOLVERKOKKOS;
1830   PetscFunctionReturn(PETSC_SUCCESS);
1831 }
1832 
1833 /*MC
1834   MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices
1835   on a single GPU of type, `MATSEQAIJKOKKOS`, `MATAIJKOKKOS`.
1836 
1837   Level: beginner
1838 
1839 .seealso: [](ch_matrices), `Mat`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJKokkos()`, `MATAIJKOKKOS`, `MatKokkosSetFormat()`, `MatKokkosStorageFormat`, `MatKokkosFormatOperation`
1840 M*/
1841 PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A, MatFactorType ftype, Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */
1842 {
1843   PetscInt n = A->rmap->n;
1844 
1845   PetscFunctionBegin;
1846   PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B));
1847   PetscCall(MatSetSizes(*B, n, n, n, n));
1848   (*B)->factortype = ftype;
1849   PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU]));
1850   PetscCall(MatSetType(*B, MATSEQAIJKOKKOS));
1851 
1852   if (ftype == MAT_FACTOR_LU) {
1853     PetscCall(MatSetBlockSizesFromMats(*B, A, A));
1854     (*B)->canuseordering        = PETSC_TRUE;
1855     (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKokkos;
1856   } else if (ftype == MAT_FACTOR_ILU) {
1857     PetscCall(MatSetBlockSizesFromMats(*B, A, A));
1858     (*B)->canuseordering         = PETSC_FALSE;
1859     (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos;
1860   } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]);
1861 
1862   PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL));
1863   PetscCall(PetscObjectComposeFunction((PetscObject)*B, "MatFactorGetSolverType_C", MatFactorGetSolverType_SeqAIJKokkos));
1864   PetscFunctionReturn(PETSC_SUCCESS);
1865 }
1866 
1867 PETSC_INTERN PetscErrorCode MatSolverTypeRegister_KOKKOS(void)
1868 {
1869   PetscFunctionBegin;
1870   PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_LU, MatGetFactor_SeqAIJKokkos_Kokkos));
1871   PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_ILU, MatGetFactor_SeqAIJKokkos_Kokkos));
1872   PetscFunctionReturn(PETSC_SUCCESS);
1873 }
1874 
1875 /* Utility to print out a KokkosCsrMatrix for debugging */
1876 PETSC_INTERN PetscErrorCode PrintCsrMatrix(const KokkosCsrMatrix &csrmat)
1877 {
1878   const auto        &iv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), csrmat.graph.row_map);
1879   const auto        &jv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), csrmat.graph.entries);
1880   const auto        &av = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), csrmat.values);
1881   const PetscInt    *i  = iv.data();
1882   const PetscInt    *j  = jv.data();
1883   const PetscScalar *a  = av.data();
1884   PetscInt           m = csrmat.numRows(), n = csrmat.numCols(), nnz = csrmat.nnz();
1885 
1886   PetscFunctionBegin;
1887   PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT " x %" PetscInt_FMT " SeqAIJKokkos, with %" PetscInt_FMT " nonzeros\n", m, n, nnz));
1888   for (PetscInt k = 0; k < m; k++) {
1889     PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT ": ", k));
1890     for (PetscInt p = i[k]; p < i[k + 1]; p++) PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT "(%.1f), ", j[p], (double)PetscRealPart(a[p])));
1891     PetscCall(PetscPrintf(PETSC_COMM_SELF, "\n"));
1892   }
1893   PetscFunctionReturn(PETSC_SUCCESS);
1894 }
1895