xref: /petsc/src/mat/impls/aij/seq/kokkos/aijkok.kokkos.cxx (revision 6e415bd26dfd0d33b8d7233bdee8a51de4844a26)
1 #include <petsc_kokkos.hpp>
2 #include <petscvec_kokkos.hpp>
3 #include <petscpkg_version.h>
4 #include <petsc/private/petscimpl.h>
5 #include <petsc/private/sfimpl.h>
6 #include <petscsystypes.h>
7 #include <petscerror.h>
8 
9 #include <Kokkos_Core.hpp>
10 #include <KokkosBlas.hpp>
11 #include <KokkosSparse_CrsMatrix.hpp>
12 #include <KokkosSparse_spmv.hpp>
13 #include <KokkosSparse_spiluk.hpp>
14 #include <KokkosSparse_sptrsv.hpp>
15 #include <KokkosSparse_spgemm.hpp>
16 #include <KokkosSparse_spadd.hpp>
17 #include <KokkosBatched_LU_Decl.hpp>
18 #include <KokkosBatched_InverseLU_Decl.hpp>
19 
20 #include <../src/mat/impls/aij/seq/kokkos/aijkok.hpp>
21 
22 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_GE(3, 7, 0)
23   #include <KokkosSparse_Utils.hpp>
24 using KokkosSparse::sort_crs_matrix;
25 using KokkosSparse::Impl::transpose_matrix;
26 #else
27   #include <KokkosKernels_Sorting.hpp>
28 using KokkosKernels::sort_crs_matrix;
29 using KokkosKernels::Impl::transpose_matrix;
30 #endif
31 
32 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */
33 
34 /* MatAssemblyEnd_SeqAIJKokkos() happens when we finalized nonzeros of the matrix, either after
35    we assembled the matrix on host, or after we directly produced the matrix data on device (ex., through MatMatMult).
36    In the latter case, it is important to set a_dual's sync state correctly.
37  */
38 static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A, MatAssemblyType mode)
39 {
40   Mat_SeqAIJ       *aijseq;
41   Mat_SeqAIJKokkos *aijkok;
42 
43   PetscFunctionBegin;
44   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS);
45   PetscCall(MatAssemblyEnd_SeqAIJ(A, mode));
46 
47   aijseq = static_cast<Mat_SeqAIJ *>(A->data);
48   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
49 
50   /* If aijkok does not exist, we just copy i, j to device.
51      If aijkok already exists, but the device's nonzero pattern does not match with the host's, we assume the latest data is on host.
52      In both cases, we build a new aijkok structure.
53   */
54   if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { /* aijkok might not exist yet or nonzero pattern has changed */
55     delete aijkok;
56     aijkok   = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aijseq, A->nonzerostate, PETSC_FALSE /*don't copy mat values to device*/);
57     A->spptr = aijkok;
58   } else if (A->rmap->n && aijkok->diag_dual.extent(0) == 0) { // MatProduct might directly produce AIJ on device, but not the diag.
59     MatRowMapKokkosViewHost diag_h(aijseq->diag, A->rmap->n);
60     auto                    diag_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), diag_h);
61     aijkok->diag_dual              = MatRowMapKokkosDualView(diag_d, diag_h);
62   }
63   PetscFunctionReturn(PETSC_SUCCESS);
64 }
65 
66 /* Sync CSR data to device if not yet */
67 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A)
68 {
69   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
70 
71   PetscFunctionBegin;
72   PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Can't sync factorized matrix from host to device");
73   PetscCheck(aijkok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
74   if (aijkok->a_dual.need_sync_device()) {
75     aijkok->a_dual.sync_device();
76     aijkok->transpose_updated = PETSC_FALSE; /* values of the transpose is out-of-date */
77     aijkok->hermitian_updated = PETSC_FALSE;
78   }
79   PetscFunctionReturn(PETSC_SUCCESS);
80 }
81 
82 /* Mark the CSR data on device as modified */
83 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosModifyDevice(Mat A)
84 {
85   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
86 
87   PetscFunctionBegin;
88   PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Not supported for factorized matries");
89   aijkok->a_dual.clear_sync_state();
90   aijkok->a_dual.modify_device();
91   aijkok->transpose_updated = PETSC_FALSE;
92   aijkok->hermitian_updated = PETSC_FALSE;
93   PetscCall(MatSeqAIJInvalidateDiagonal(A));
94   PetscCall(PetscObjectStateIncrease((PetscObject)A));
95   PetscFunctionReturn(PETSC_SUCCESS);
96 }
97 
98 static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A)
99 {
100   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
101   auto             &exec   = PetscGetKokkosExecutionSpace();
102 
103   PetscFunctionBegin;
104   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
105   /* We do not expect one needs factors on host  */
106   PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Can't sync factorized matrix from device to host");
107   PetscCheck(aijkok, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing AIJKOK");
108   PetscCallCXX(aijkok->a_dual.sync_host(exec));
109   PetscCallCXX(exec.fence());
110   PetscFunctionReturn(PETSC_SUCCESS);
111 }
112 
113 static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A, PetscScalar *array[])
114 {
115   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
116 
117   PetscFunctionBegin;
118   /* aijkok contains valid pointers only if the host's nonzerostate matches with the device's.
119     Calling MatSeqAIJSetPreallocation() or MatSetValues() on host, where aijseq->{i,j,a} might be
120     reallocated, will lead to stale {i,j,a}_dual in aijkok. In both operations, the hosts's nonzerostate
121     must have been updated. The stale aijkok will be rebuilt during MatAssemblyEnd.
122   */
123   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
124     auto &exec = PetscGetKokkosExecutionSpace();
125     PetscCallCXX(aijkok->a_dual.sync_host(exec));
126     PetscCallCXX(exec.fence());
127     *array = aijkok->a_dual.view_host().data();
128   } else { /* Happens when calling MatSetValues on a newly created matrix */
129     *array = static_cast<Mat_SeqAIJ *>(A->data)->a;
130   }
131   PetscFunctionReturn(PETSC_SUCCESS);
132 }
133 
134 static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJKokkos(Mat A, PetscScalar *array[])
135 {
136   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
137 
138   PetscFunctionBegin;
139   if (aijkok && A->nonzerostate == aijkok->nonzerostate) aijkok->a_dual.modify_host();
140   PetscFunctionReturn(PETSC_SUCCESS);
141 }
142 
143 static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[])
144 {
145   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
146 
147   PetscFunctionBegin;
148   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
149     auto &exec = PetscGetKokkosExecutionSpace();
150     PetscCallCXX(aijkok->a_dual.sync_host(exec));
151     PetscCallCXX(exec.fence());
152     *array = aijkok->a_dual.view_host().data();
153   } else {
154     *array = static_cast<Mat_SeqAIJ *>(A->data)->a;
155   }
156   PetscFunctionReturn(PETSC_SUCCESS);
157 }
158 
159 static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[])
160 {
161   PetscFunctionBegin;
162   *array = NULL;
163   PetscFunctionReturn(PETSC_SUCCESS);
164 }
165 
166 static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[])
167 {
168   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
169 
170   PetscFunctionBegin;
171   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
172     *array = aijkok->a_dual.view_host().data();
173   } else { /* Ex. happens with MatZeroEntries on a preallocated but not assembled matrix */
174     *array = static_cast<Mat_SeqAIJ *>(A->data)->a;
175   }
176   PetscFunctionReturn(PETSC_SUCCESS);
177 }
178 
179 static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[])
180 {
181   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
182 
183   PetscFunctionBegin;
184   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
185     aijkok->a_dual.clear_sync_state();
186     aijkok->a_dual.modify_host();
187   }
188   PetscFunctionReturn(PETSC_SUCCESS);
189 }
190 
191 static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJKokkos(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype)
192 {
193   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
194 
195   PetscFunctionBegin;
196   PetscCheck(aijkok != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "aijkok is NULL");
197 
198   if (i) *i = aijkok->i_device_data();
199   if (j) *j = aijkok->j_device_data();
200   if (a) {
201     aijkok->a_dual.sync_device();
202     *a = aijkok->a_device_data();
203   }
204   if (mtype) *mtype = PETSC_MEMTYPE_KOKKOS;
205   PetscFunctionReturn(PETSC_SUCCESS);
206 }
207 
208 /*
209   Generate the sparsity pattern of a MatSeqAIJKokkos matrix's transpose on device.
210 
211   Input Parameter:
212 .  A       - the MATSEQAIJKOKKOS matrix
213 
214   Output Parameters:
215 +  perm_d - the permutation array on device, which connects Ta(i) = Aa(perm(i))
216 -  T_d    - the transpose on device, whose value array is allocated but not initialized
217 */
218 static PetscErrorCode MatSeqAIJKokkosGenerateTransposeStructure(Mat A, MatRowMapKokkosView &perm_d, KokkosCsrMatrix &T_d)
219 {
220   Mat_SeqAIJ             *aseq = static_cast<Mat_SeqAIJ *>(A->data);
221   PetscInt                nz = aseq->nz, m = A->rmap->N, n = A->cmap->n;
222   const PetscInt         *Ai = aseq->i, *Aj = aseq->j;
223   MatRowMapKokkosViewHost Ti_h(NoInit("Ti"), n + 1);
224   MatRowMapType          *Ti = Ti_h.data();
225   MatColIdxKokkosViewHost Tj_h(NoInit("Tj"), nz);
226   MatRowMapKokkosViewHost perm_h(NoInit("permutation"), nz);
227   PetscInt               *Tj   = Tj_h.data();
228   PetscInt               *perm = perm_h.data();
229   PetscInt               *offset;
230 
231   PetscFunctionBegin;
232   // Populate Ti
233   PetscCallCXX(Kokkos::deep_copy(Ti_h, 0));
234   Ti++;
235   for (PetscInt i = 0; i < nz; i++) Ti[Aj[i]]++;
236   Ti--;
237   for (PetscInt i = 0; i < n; i++) Ti[i + 1] += Ti[i];
238 
239   // Populate Tj and the permutation array
240   PetscCall(PetscCalloc1(n, &offset)); // offset in each T row to fill in its column indices
241   for (PetscInt i = 0; i < m; i++) {
242     for (PetscInt j = Ai[i]; j < Ai[i + 1]; j++) { // A's (i,j) is T's (j,i)
243       PetscInt r    = Aj[j];                       // row r of T
244       PetscInt disp = Ti[r] + offset[r];
245 
246       Tj[disp]   = i; // col i of T
247       perm[disp] = j;
248       offset[r]++;
249     }
250   }
251   PetscCall(PetscFree(offset));
252 
253   // Sort each row of T, along with the permutation array
254   for (PetscInt i = 0; i < n; i++) PetscCall(PetscSortIntWithArray(Ti[i + 1] - Ti[i], Tj + Ti[i], perm + Ti[i]));
255 
256   // Output perm and T on device
257   auto Ti_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), Ti_h);
258   auto Tj_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), Tj_h);
259   PetscCallCXX(T_d = KokkosCsrMatrix("csrmatT", n, m, nz, MatScalarKokkosView("Ta", nz), Ti_d, Tj_d));
260   PetscCallCXX(perm_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), perm_h));
261   PetscFunctionReturn(PETSC_SUCCESS);
262 }
263 
264 // Generate the transpose on device and cache it internally
265 // Note: KK transpose_matrix() does not have support symbolic/numeric transpose, so we do it on our own
266 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGenerateTranspose_Private(Mat A, KokkosCsrMatrix *csrmatT)
267 {
268   Mat_SeqAIJ       *aseq = static_cast<Mat_SeqAIJ *>(A->data);
269   Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
270   PetscInt          nz = aseq->nz, m = A->rmap->N, n = A->cmap->n;
271   KokkosCsrMatrix  &T = akok->csrmatT;
272 
273   PetscFunctionBegin;
274   PetscCheck(akok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
275   PetscCallCXX(akok->a_dual.sync_device()); // Sync A's values since we are going to access them on device
276 
277   const auto &Aa = akok->a_dual.view_device();
278 
279   if (A->symmetric == PETSC_BOOL3_TRUE) {
280     *csrmatT = akok->csrmat;
281   } else {
282     // See if we already have a cached transpose and its value is up to date
283     if (T.numRows() == n && T.numCols() == m) {  // this indicates csrmatT had been generated before, otherwise T has 0 rows/cols after construction
284       if (!akok->transpose_updated) {            // if the value is out of date, update the cached version
285         const auto &perm = akok->transpose_perm; // get the permutation array
286         auto       &Ta   = T.values;
287 
288         PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { Ta(i) = Aa(perm(i)); }));
289       }
290     } else { // Generate T of size n x m for the first time
291       MatRowMapKokkosView perm;
292 
293       PetscCall(MatSeqAIJKokkosGenerateTransposeStructure(A, perm, T));
294       akok->transpose_perm = perm; // cache the perm in this matrix for reuse
295       PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { T.values(i) = Aa(perm(i)); }));
296     }
297     akok->transpose_updated = PETSC_TRUE;
298     *csrmatT                = akok->csrmatT;
299   }
300   PetscFunctionReturn(PETSC_SUCCESS);
301 }
302 
303 // Generate the Hermitian on device and cache it internally
304 static PetscErrorCode MatSeqAIJKokkosGenerateHermitian_Private(Mat A, KokkosCsrMatrix *csrmatH)
305 {
306   Mat_SeqAIJ       *aseq = static_cast<Mat_SeqAIJ *>(A->data);
307   Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
308   PetscInt          nz = aseq->nz, m = A->rmap->N, n = A->cmap->n;
309   KokkosCsrMatrix  &T = akok->csrmatH;
310 
311   PetscFunctionBegin;
312   PetscCheck(akok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
313   PetscCallCXX(akok->a_dual.sync_device()); // Sync A's values since we are going to access them on device
314 
315   const auto &Aa = akok->a_dual.view_device();
316 
317   if (A->hermitian == PETSC_BOOL3_TRUE) {
318     *csrmatH = akok->csrmat;
319   } else {
320     // See if we already have a cached hermitian and its value is up to date
321     if (T.numRows() == n && T.numCols() == m) {  // this indicates csrmatT had been generated before, otherwise T has 0 rows/cols after construction
322       if (!akok->hermitian_updated) {            // if the value is out of date, update the cached version
323         const auto &perm = akok->transpose_perm; // get the permutation array
324         auto       &Ta   = T.values;
325 
326         PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { Ta(i) = PetscConj(Aa(perm(i))); }));
327       }
328     } else { // Generate T of size n x m for the first time
329       MatRowMapKokkosView perm;
330 
331       PetscCall(MatSeqAIJKokkosGenerateTransposeStructure(A, perm, T));
332       akok->transpose_perm = perm; // cache the perm in this matrix for reuse
333       PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { T.values(i) = PetscConj(Aa(perm(i))); }));
334     }
335     akok->hermitian_updated = PETSC_TRUE;
336     *csrmatH                = akok->csrmatH;
337   }
338   PetscFunctionReturn(PETSC_SUCCESS);
339 }
340 
341 /* y = A x */
342 static PetscErrorCode MatMult_SeqAIJKokkos(Mat A, Vec xx, Vec yy)
343 {
344   Mat_SeqAIJKokkos          *aijkok;
345   ConstPetscScalarKokkosView xv;
346   PetscScalarKokkosView      yv;
347 
348   PetscFunctionBegin;
349   PetscCall(PetscLogGpuTimeBegin());
350   PetscCall(MatSeqAIJKokkosSyncDevice(A));
351   PetscCall(VecGetKokkosView(xx, &xv));
352   PetscCall(VecGetKokkosViewWrite(yy, &yv));
353   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
354   PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), "N", 1.0 /*alpha*/, aijkok->csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A x + beta y */
355   PetscCall(VecRestoreKokkosView(xx, &xv));
356   PetscCall(VecRestoreKokkosViewWrite(yy, &yv));
357   /* 2.0*nnz - numRows seems more accurate here but assumes there are no zero-rows. So a little sloppy here. */
358   PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz()));
359   PetscCall(PetscLogGpuTimeEnd());
360   PetscFunctionReturn(PETSC_SUCCESS);
361 }
362 
363 /* y = A^T x */
364 static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy)
365 {
366   Mat_SeqAIJKokkos          *aijkok;
367   const char                *mode;
368   ConstPetscScalarKokkosView xv;
369   PetscScalarKokkosView      yv;
370   KokkosCsrMatrix            csrmat;
371 
372   PetscFunctionBegin;
373   PetscCall(PetscLogGpuTimeBegin());
374   PetscCall(MatSeqAIJKokkosSyncDevice(A));
375   PetscCall(VecGetKokkosView(xx, &xv));
376   PetscCall(VecGetKokkosViewWrite(yy, &yv));
377   if (A->form_explicit_transpose) {
378     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat));
379     mode = "N";
380   } else {
381     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
382     csrmat = aijkok->csrmat;
383     mode   = "T";
384   }
385   PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^T x + beta y */
386   PetscCall(VecRestoreKokkosView(xx, &xv));
387   PetscCall(VecRestoreKokkosViewWrite(yy, &yv));
388   PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
389   PetscCall(PetscLogGpuTimeEnd());
390   PetscFunctionReturn(PETSC_SUCCESS);
391 }
392 
393 /* y = A^H x */
394 static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy)
395 {
396   Mat_SeqAIJKokkos          *aijkok;
397   const char                *mode;
398   ConstPetscScalarKokkosView xv;
399   PetscScalarKokkosView      yv;
400   KokkosCsrMatrix            csrmat;
401 
402   PetscFunctionBegin;
403   PetscCall(PetscLogGpuTimeBegin());
404   PetscCall(MatSeqAIJKokkosSyncDevice(A));
405   PetscCall(VecGetKokkosView(xx, &xv));
406   PetscCall(VecGetKokkosViewWrite(yy, &yv));
407   if (A->form_explicit_transpose) {
408     PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat));
409     mode = "N";
410   } else {
411     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
412     csrmat = aijkok->csrmat;
413     mode   = "C";
414   }
415   PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^H x + beta y */
416   PetscCall(VecRestoreKokkosView(xx, &xv));
417   PetscCall(VecRestoreKokkosViewWrite(yy, &yv));
418   PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
419   PetscCall(PetscLogGpuTimeEnd());
420   PetscFunctionReturn(PETSC_SUCCESS);
421 }
422 
423 /* z = A x + y */
424 static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz)
425 {
426   Mat_SeqAIJKokkos          *aijkok;
427   ConstPetscScalarKokkosView xv;
428   PetscScalarKokkosView      zv;
429 
430   PetscFunctionBegin;
431   PetscCall(PetscLogGpuTimeBegin());
432   PetscCall(MatSeqAIJKokkosSyncDevice(A));
433   if (zz != yy) PetscCall(VecCopy(yy, zz)); // depending on yy's sync flags, zz might get its latest data on host
434   PetscCall(VecGetKokkosView(xx, &xv));
435   PetscCall(VecGetKokkosView(zz, &zv)); // do after VecCopy(yy, zz) to get the latest data on device
436   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
437   PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), "N", 1.0 /*alpha*/, aijkok->csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A x + beta z */
438   PetscCall(VecRestoreKokkosView(xx, &xv));
439   PetscCall(VecRestoreKokkosView(zz, &zv));
440   PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz()));
441   PetscCall(PetscLogGpuTimeEnd());
442   PetscFunctionReturn(PETSC_SUCCESS);
443 }
444 
445 /* z = A^T x + y */
446 static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz)
447 {
448   Mat_SeqAIJKokkos          *aijkok;
449   const char                *mode;
450   ConstPetscScalarKokkosView xv;
451   PetscScalarKokkosView      zv;
452   KokkosCsrMatrix            csrmat;
453 
454   PetscFunctionBegin;
455   PetscCall(PetscLogGpuTimeBegin());
456   PetscCall(MatSeqAIJKokkosSyncDevice(A));
457   if (zz != yy) PetscCall(VecCopy(yy, zz));
458   PetscCall(VecGetKokkosView(xx, &xv));
459   PetscCall(VecGetKokkosView(zz, &zv));
460   if (A->form_explicit_transpose) {
461     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat));
462     mode = "N";
463   } else {
464     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
465     csrmat = aijkok->csrmat;
466     mode   = "T";
467   }
468   PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^T x + beta z */
469   PetscCall(VecRestoreKokkosView(xx, &xv));
470   PetscCall(VecRestoreKokkosView(zz, &zv));
471   PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
472   PetscCall(PetscLogGpuTimeEnd());
473   PetscFunctionReturn(PETSC_SUCCESS);
474 }
475 
476 /* z = A^H x + y */
477 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz)
478 {
479   Mat_SeqAIJKokkos          *aijkok;
480   const char                *mode;
481   ConstPetscScalarKokkosView xv;
482   PetscScalarKokkosView      zv;
483   KokkosCsrMatrix            csrmat;
484 
485   PetscFunctionBegin;
486   PetscCall(PetscLogGpuTimeBegin());
487   PetscCall(MatSeqAIJKokkosSyncDevice(A));
488   if (zz != yy) PetscCall(VecCopy(yy, zz));
489   PetscCall(VecGetKokkosView(xx, &xv));
490   PetscCall(VecGetKokkosView(zz, &zv));
491   if (A->form_explicit_transpose) {
492     PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat));
493     mode = "N";
494   } else {
495     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
496     csrmat = aijkok->csrmat;
497     mode   = "C";
498   }
499   PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^H x + beta z */
500   PetscCall(VecRestoreKokkosView(xx, &xv));
501   PetscCall(VecRestoreKokkosView(zz, &zv));
502   PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
503   PetscCall(PetscLogGpuTimeEnd());
504   PetscFunctionReturn(PETSC_SUCCESS);
505 }
506 
507 static PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A, MatOption op, PetscBool flg)
508 {
509   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
510 
511   PetscFunctionBegin;
512   switch (op) {
513   case MAT_FORM_EXPLICIT_TRANSPOSE:
514     /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
515     if (A->form_explicit_transpose && !flg && aijkok) PetscCall(aijkok->DestroyMatTranspose());
516     A->form_explicit_transpose = flg;
517     break;
518   default:
519     PetscCall(MatSetOption_SeqAIJ(A, op, flg));
520     break;
521   }
522   PetscFunctionReturn(PETSC_SUCCESS);
523 }
524 
525 /* Depending on reuse, either build a new mat, or use the existing mat */
526 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat *newmat)
527 {
528   Mat_SeqAIJ *aseq;
529 
530   PetscFunctionBegin;
531   PetscCall(PetscKokkosInitializeCheck());
532   if (reuse == MAT_INITIAL_MATRIX) {                      /* Build a brand new mat */
533     PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat));  /* the returned newmat is a SeqAIJKokkos */
534   } else if (reuse == MAT_REUSE_MATRIX) {                 /* Reuse the mat created before */
535     PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN)); /* newmat is already a SeqAIJKokkos */
536   } else if (reuse == MAT_INPLACE_MATRIX) {               /* newmat is A */
537     PetscCheck(A == *newmat, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "A != *newmat with MAT_INPLACE_MATRIX");
538     PetscCall(PetscFree(A->defaultvectype));
539     PetscCall(PetscStrallocpy(VECKOKKOS, &A->defaultvectype)); /* Allocate and copy the string */
540     PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATSEQAIJKOKKOS));
541     PetscCall(MatSetOps_SeqAIJKokkos(A));
542     aseq = static_cast<Mat_SeqAIJ *>(A->data);
543     if (A->assembled) { /* Copy i, j (but not values) to device for an assembled matrix if not yet */
544       PetscCheck(!A->spptr, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Expect NULL (Mat_SeqAIJKokkos*)A->spptr");
545       A->spptr = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aseq, A->nonzerostate, PETSC_FALSE);
546     }
547   }
548   PetscFunctionReturn(PETSC_SUCCESS);
549 }
550 
551 /* MatDuplicate always creates a new matrix. MatDuplicate can be called either on an assembled matrix or
552    an unassembled matrix, even though MAT_COPY_VALUES is not allowed for unassembled matrix.
553  */
554 static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A, MatDuplicateOption dupOption, Mat *B)
555 {
556   Mat_SeqAIJ       *bseq;
557   Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr), *bkok;
558   Mat               mat;
559 
560   PetscFunctionBegin;
561   /* Do not copy values on host as A's latest values might be on device. We don't want to do sync blindly */
562   PetscCall(MatDuplicate_SeqAIJ(A, MAT_DO_NOT_COPY_VALUES, B));
563   mat = *B;
564   if (A->assembled) {
565     bseq = static_cast<Mat_SeqAIJ *>(mat->data);
566     bkok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, bseq, mat->nonzerostate, PETSC_FALSE);
567     bkok->a_dual.clear_sync_state(); /* Clear B's sync state as it will be decided below */
568     /* Now copy values to B if needed */
569     if (dupOption == MAT_COPY_VALUES) {
570       if (akok->a_dual.need_sync_device()) {
571         Kokkos::deep_copy(bkok->a_dual.view_host(), akok->a_dual.view_host());
572         bkok->a_dual.modify_host();
573       } else { /* If device has the latest data, we only copy data on device */
574         Kokkos::deep_copy(bkok->a_dual.view_device(), akok->a_dual.view_device());
575         bkok->a_dual.modify_device();
576       }
577     } else { /* MAT_DO_NOT_COPY_VALUES or MAT_SHARE_NONZERO_PATTERN. B's values should be zeroed */
578       /* B's values on host should be already zeroed by MatDuplicate_SeqAIJ() */
579       bkok->a_dual.modify_host();
580     }
581     mat->spptr = bkok;
582   }
583 
584   PetscCall(PetscFree(mat->defaultvectype));
585   PetscCall(PetscStrallocpy(VECKOKKOS, &mat->defaultvectype)); /* Allocate and copy the string */
586   PetscCall(PetscObjectChangeTypeName((PetscObject)mat, MATSEQAIJKOKKOS));
587   PetscCall(MatSetOps_SeqAIJKokkos(mat));
588   PetscFunctionReturn(PETSC_SUCCESS);
589 }
590 
591 static PetscErrorCode MatTranspose_SeqAIJKokkos(Mat A, MatReuse reuse, Mat *B)
592 {
593   Mat               At;
594   KokkosCsrMatrix   internT;
595   Mat_SeqAIJKokkos *atkok, *bkok;
596 
597   PetscFunctionBegin;
598   if (reuse == MAT_REUSE_MATRIX) PetscCall(MatTransposeCheckNonzeroState_Private(A, *B));
599   PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &internT)); /* Generate a transpose internally */
600   if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_INPLACE_MATRIX) {
601     /* Deep copy internT, as we want to isolate the internal transpose */
602     PetscCallCXX(atkok = new Mat_SeqAIJKokkos(KokkosCsrMatrix("csrmat", internT)));
603     PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PetscObjectComm((PetscObject)A), atkok, &At));
604     if (reuse == MAT_INITIAL_MATRIX) *B = At;
605     else PetscCall(MatHeaderReplace(A, &At)); /* Replace A with At inplace */
606   } else {                                    /* MAT_REUSE_MATRIX, just need to copy values to B on device */
607     if ((*B)->assembled) {
608       bkok = static_cast<Mat_SeqAIJKokkos *>((*B)->spptr);
609       PetscCallCXX(Kokkos::deep_copy(bkok->a_dual.view_device(), internT.values));
610       PetscCall(MatSeqAIJKokkosModifyDevice(*B));
611     } else if ((*B)->preallocated) { /* It is ok for B to be only preallocated, as needed in MatTranspose_MPIAIJ */
612       Mat_SeqAIJ             *bseq = static_cast<Mat_SeqAIJ *>((*B)->data);
613       MatScalarKokkosViewHost a_h(bseq->a, internT.nnz()); /* bseq->nz = 0 if unassembled */
614       MatColIdxKokkosViewHost j_h(bseq->j, internT.nnz());
615       PetscCallCXX(Kokkos::deep_copy(a_h, internT.values));
616       PetscCallCXX(Kokkos::deep_copy(j_h, internT.graph.entries));
617     } else SETERRQ(PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "B must be assembled or preallocated");
618   }
619   PetscFunctionReturn(PETSC_SUCCESS);
620 }
621 
622 static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A)
623 {
624   Mat_SeqAIJKokkos *aijkok;
625 
626   PetscFunctionBegin;
627   if (A->factortype == MAT_FACTOR_NONE) {
628     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
629     delete aijkok;
630   } else {
631     delete static_cast<Mat_SeqAIJKokkosTriFactors *>(A->spptr);
632   }
633   A->spptr = NULL;
634   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatFactorGetSolverType_C", NULL));
635   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
636   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
637   PetscCall(MatDestroy_SeqAIJ(A));
638   PetscFunctionReturn(PETSC_SUCCESS);
639 }
640 
641 /*MC
642    MATSEQAIJKOKKOS - MATAIJKOKKOS = "(seq)aijkokkos" - A matrix type to be used for sparse matrices with Kokkos
643 
644    A matrix type using Kokkos-Kernels CrsMatrix type for portability across different device types
645 
646    Options Database Key:
647 .  -mat_type aijkokkos - sets the matrix type to `MATSEQAIJKOKKOS` during a call to `MatSetFromOptions()`
648 
649   Level: beginner
650 
651 .seealso: [](ch_matrices), `Mat`, `MatCreateSeqAIJKokkos()`, `MATMPIAIJKOKKOS`
652 M*/
653 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A)
654 {
655   PetscFunctionBegin;
656   PetscCall(PetscKokkosInitializeCheck());
657   PetscCall(MatCreate_SeqAIJ(A));
658   PetscCall(MatConvert_SeqAIJ_SeqAIJKokkos(A, MATSEQAIJKOKKOS, MAT_INPLACE_MATRIX, &A));
659   PetscFunctionReturn(PETSC_SUCCESS);
660 }
661 
662 /* Merge A, B into a matrix C. A is put before B. C's size would be A->rmap->n by (A->cmap->n + B->cmap->n) */
663 PetscErrorCode MatSeqAIJKokkosMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C)
664 {
665   Mat_SeqAIJ         *a, *b;
666   Mat_SeqAIJKokkos   *akok, *bkok, *ckok;
667   MatScalarKokkosView aa, ba, ca;
668   MatRowMapKokkosView ai, bi, ci;
669   MatColIdxKokkosView aj, bj, cj;
670   PetscInt            m, n, nnz, aN;
671 
672   PetscFunctionBegin;
673   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
674   PetscValidHeaderSpecific(B, MAT_CLASSID, 2);
675   PetscAssertPointer(C, 4);
676   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
677   PetscCheckTypeName(B, MATSEQAIJKOKKOS);
678   PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n);
679   PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported");
680 
681   PetscCall(MatSeqAIJKokkosSyncDevice(A));
682   PetscCall(MatSeqAIJKokkosSyncDevice(B));
683   a    = static_cast<Mat_SeqAIJ *>(A->data);
684   b    = static_cast<Mat_SeqAIJ *>(B->data);
685   akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
686   bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
687   aa   = akok->a_dual.view_device();
688   ai   = akok->i_dual.view_device();
689   ba   = bkok->a_dual.view_device();
690   bi   = bkok->i_dual.view_device();
691   m    = A->rmap->n; /* M, N and nnz of C */
692   n    = A->cmap->n + B->cmap->n;
693   nnz  = a->nz + b->nz;
694   aN   = A->cmap->n; /* N of A */
695   if (reuse == MAT_INITIAL_MATRIX) {
696     aj           = akok->j_dual.view_device();
697     bj           = bkok->j_dual.view_device();
698     auto ca_dual = MatScalarKokkosDualView("a", aa.extent(0) + ba.extent(0));
699     auto ci_dual = MatRowMapKokkosDualView("i", ai.extent(0));
700     auto cj_dual = MatColIdxKokkosDualView("j", aj.extent(0) + bj.extent(0));
701     ca           = ca_dual.view_device();
702     ci           = ci_dual.view_device();
703     cj           = cj_dual.view_device();
704 
705     /* Concatenate A and B in parallel using Kokkos hierarchical parallelism */
706     Kokkos::parallel_for(
707       Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
708         PetscInt i       = t.league_rank(); /* row i */
709         PetscInt coffset = ai(i) + bi(i), alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i);
710 
711         Kokkos::single(Kokkos::PerTeam(t), [=]() { /* this side effect only happens once per whole team */
712                                                    ci(i) = coffset;
713                                                    if (i == m - 1) ci(m) = ai(m) + bi(m);
714         });
715 
716         Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) {
717           if (k < alen) {
718             ca(coffset + k) = aa(ai(i) + k);
719             cj(coffset + k) = aj(ai(i) + k);
720           } else {
721             ca(coffset + k) = ba(bi(i) + k - alen);
722             cj(coffset + k) = bj(bi(i) + k - alen) + aN; /* Entries in B get new column indices in C */
723           }
724         });
725       });
726     ca_dual.modify_device();
727     ci_dual.modify_device();
728     cj_dual.modify_device();
729     PetscCallCXX(ckok = new Mat_SeqAIJKokkos(m, n, nnz, ci_dual, cj_dual, ca_dual));
730     PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, ckok, C));
731   } else if (reuse == MAT_REUSE_MATRIX) {
732     PetscValidHeaderSpecific(*C, MAT_CLASSID, 4);
733     PetscCheckTypeName(*C, MATSEQAIJKOKKOS);
734     ckok = static_cast<Mat_SeqAIJKokkos *>((*C)->spptr);
735     ca   = ckok->a_dual.view_device();
736     ci   = ckok->i_dual.view_device();
737 
738     Kokkos::parallel_for(
739       Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
740         PetscInt i    = t.league_rank(); /* row i */
741         PetscInt alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i);
742         Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) {
743           if (k < alen) ca(ci(i) + k) = aa(ai(i) + k);
744           else ca(ci(i) + k) = ba(bi(i) + k - alen);
745         });
746       });
747     PetscCall(MatSeqAIJKokkosModifyDevice(*C));
748   }
749   PetscFunctionReturn(PETSC_SUCCESS);
750 }
751 
752 static PetscErrorCode MatProductDataDestroy_SeqAIJKokkos(void *pdata)
753 {
754   PetscFunctionBegin;
755   delete static_cast<MatProductData_SeqAIJKokkos *>(pdata);
756   PetscFunctionReturn(PETSC_SUCCESS);
757 }
758 
759 static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C)
760 {
761   Mat_Product                 *product = C->product;
762   Mat                          A, B;
763   bool                         transA, transB; /* use bool, since KK needs this type */
764   Mat_SeqAIJKokkos            *akok, *bkok, *ckok;
765   Mat_SeqAIJ                  *c;
766   MatProductData_SeqAIJKokkos *pdata;
767   KokkosCsrMatrix              csrmatA, csrmatB;
768 
769   PetscFunctionBegin;
770   MatCheckProduct(C, 1);
771   PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Product data empty");
772   pdata = static_cast<MatProductData_SeqAIJKokkos *>(C->product->data);
773 
774   // See if numeric has already been done in symbolic (e.g., user calls MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C)).
775   // If yes, skip the numeric, but reset the flag so that next time when user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C),
776   // we still do numeric.
777   if (pdata->reusesym) { // numeric reuses results from symbolic
778     pdata->reusesym = PETSC_FALSE;
779     PetscFunctionReturn(PETSC_SUCCESS);
780   }
781 
782   switch (product->type) {
783   case MATPRODUCT_AB:
784     transA = false;
785     transB = false;
786     break;
787   case MATPRODUCT_AtB:
788     transA = true;
789     transB = false;
790     break;
791   case MATPRODUCT_ABt:
792     transA = false;
793     transB = true;
794     break;
795   default:
796     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]);
797   }
798 
799   A = product->A;
800   B = product->B;
801   PetscCall(MatSeqAIJKokkosSyncDevice(A));
802   PetscCall(MatSeqAIJKokkosSyncDevice(B));
803   akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
804   bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
805   ckok = static_cast<Mat_SeqAIJKokkos *>(C->spptr);
806 
807   PetscCheck(ckok, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Device data structure spptr is empty");
808 
809   csrmatA = akok->csrmat;
810   csrmatB = bkok->csrmat;
811 
812   /* TODO: Once KK spgemm implements transpose, we can get rid of the explicit transpose here */
813   if (transA) {
814     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA));
815     transA = false;
816   }
817 
818   if (transB) {
819     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB));
820     transB = false;
821   }
822   PetscCall(PetscLogGpuTimeBegin());
823   PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, csrmatA, transA, csrmatB, transB, ckok->csrmat));
824 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(4, 0, 0)
825   auto spgemmHandle = pdata->kh.get_spgemm_handle();
826   if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(ckok->csrmat)); /* without sort, mat_tests-ex62_14_seqaijkokkos fails */
827 #endif
828 
829   PetscCall(PetscLogGpuTimeEnd());
830   PetscCall(MatSeqAIJKokkosModifyDevice(C));
831   /* shorter version of MatAssemblyEnd_SeqAIJ */
832   c = (Mat_SeqAIJ *)C->data;
833   PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz));
834   PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n"));
835   PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax));
836   c->reallocs         = 0;
837   C->info.mallocs     = 0;
838   C->info.nz_unneeded = 0;
839   C->assembled = C->was_assembled = PETSC_TRUE;
840   C->num_ass++;
841   PetscFunctionReturn(PETSC_SUCCESS);
842 }
843 
844 static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C)
845 {
846   Mat_Product                 *product = C->product;
847   MatProductType               ptype;
848   Mat                          A, B;
849   bool                         transA, transB;
850   Mat_SeqAIJKokkos            *akok, *bkok, *ckok;
851   MatProductData_SeqAIJKokkos *pdata;
852   MPI_Comm                     comm;
853   KokkosCsrMatrix              csrmatA, csrmatB, csrmatC;
854 
855   PetscFunctionBegin;
856   MatCheckProduct(C, 1);
857   PetscCall(PetscObjectGetComm((PetscObject)C, &comm));
858   PetscCheck(!product->data, comm, PETSC_ERR_PLIB, "Product data not empty");
859   A = product->A;
860   B = product->B;
861   PetscCall(MatSeqAIJKokkosSyncDevice(A));
862   PetscCall(MatSeqAIJKokkosSyncDevice(B));
863   akok    = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
864   bkok    = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
865   csrmatA = akok->csrmat;
866   csrmatB = bkok->csrmat;
867 
868   ptype = product->type;
869   // Take advantage of the symmetry if true
870   if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
871     ptype                                          = MATPRODUCT_AB;
872     product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE;
873   }
874   if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
875     ptype                                          = MATPRODUCT_AB;
876     product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE;
877   }
878 
879   switch (ptype) {
880   case MATPRODUCT_AB:
881     transA = false;
882     transB = false;
883     break;
884   case MATPRODUCT_AtB:
885     transA = true;
886     transB = false;
887     break;
888   case MATPRODUCT_ABt:
889     transA = false;
890     transB = true;
891     break;
892   default:
893     SETERRQ(comm, PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]);
894   }
895   PetscCallCXX(product->data = pdata = new MatProductData_SeqAIJKokkos());
896   pdata->reusesym = product->api_user;
897 
898   /* TODO: add command line options to select spgemm algorithms */
899   auto spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_DEFAULT; /* default alg is TPL if enabled, otherwise KK */
900 
901   /* CUDA-10.2's spgemm has bugs. We prefer the SpGEMMreuse APIs introduced in cuda-11.4 */
902 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
903   #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
904   spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_KK;
905   #endif
906 #endif
907   PetscCallCXX(pdata->kh.create_spgemm_handle(spgemm_alg));
908 
909   PetscCall(PetscLogGpuTimeBegin());
910   /* TODO: Get rid of the explicit transpose once KK-spgemm implements the transpose option */
911   if (transA) {
912     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA));
913     transA = false;
914   }
915 
916   if (transB) {
917     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB));
918     transB = false;
919   }
920 
921   PetscCallCXX(KokkosSparse::spgemm_symbolic(pdata->kh, csrmatA, transA, csrmatB, transB, csrmatC));
922   /* spgemm_symbolic() only populates C's rowmap, but not C's column indices.
923     So we have to do a fake spgemm_numeric() here to get csrmatC.j_d setup, before
924     calling new Mat_SeqAIJKokkos().
925     TODO: Remove the fake spgemm_numeric() after KK fixed this problem.
926   */
927   PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, csrmatA, transA, csrmatB, transB, csrmatC));
928 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(4, 0, 0)
929   /* Query if KK outputs a sorted matrix. If not, we need to sort it */
930   auto spgemmHandle = pdata->kh.get_spgemm_handle();
931   if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(csrmatC)); /* sort_option defaults to -1 in KK!*/
932 #endif
933   PetscCall(PetscLogGpuTimeEnd());
934 
935   PetscCallCXX(ckok = new Mat_SeqAIJKokkos(csrmatC));
936   PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(C, ckok));
937   C->product->destroy = MatProductDataDestroy_SeqAIJKokkos;
938   PetscFunctionReturn(PETSC_SUCCESS);
939 }
940 
941 /* handles sparse matrix matrix ops */
942 static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat)
943 {
944   Mat_Product *product = mat->product;
945   PetscBool    Biskok = PETSC_FALSE, Ciskok = PETSC_TRUE;
946 
947   PetscFunctionBegin;
948   MatCheckProduct(mat, 1);
949   PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJKOKKOS, &Biskok));
950   if (product->type == MATPRODUCT_ABC) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJKOKKOS, &Ciskok));
951   if (Biskok && Ciskok) {
952     switch (product->type) {
953     case MATPRODUCT_AB:
954     case MATPRODUCT_AtB:
955     case MATPRODUCT_ABt:
956       mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos;
957       break;
958     case MATPRODUCT_PtAP:
959     case MATPRODUCT_RARt:
960     case MATPRODUCT_ABC:
961       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
962       break;
963     default:
964       break;
965     }
966   } else { /* fallback for AIJ */
967     PetscCall(MatProductSetFromOptions_SeqAIJ(mat));
968   }
969   PetscFunctionReturn(PETSC_SUCCESS);
970 }
971 
972 static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a)
973 {
974   Mat_SeqAIJKokkos *aijkok;
975 
976   PetscFunctionBegin;
977   PetscCall(PetscLogGpuTimeBegin());
978   PetscCall(MatSeqAIJKokkosSyncDevice(A));
979   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
980   KokkosBlas::scal(PetscGetKokkosExecutionSpace(), aijkok->a_dual.view_device(), a, aijkok->a_dual.view_device());
981   PetscCall(MatSeqAIJKokkosModifyDevice(A));
982   PetscCall(PetscLogGpuFlops(aijkok->a_dual.extent(0)));
983   PetscCall(PetscLogGpuTimeEnd());
984   PetscFunctionReturn(PETSC_SUCCESS);
985 }
986 
987 // add a to A's diagonal (if A is square) or main diagonal (if A is rectangular)
988 static PetscErrorCode MatShift_SeqAIJKokkos(Mat A, PetscScalar a)
989 {
990   Mat_SeqAIJ *aijseq = static_cast<Mat_SeqAIJ *>(A->data);
991 
992   PetscFunctionBegin;
993   if (A->assembled && aijseq->diagonaldense) { // no missing diagonals
994     PetscInt n = PetscMin(A->rmap->n, A->cmap->n);
995 
996     PetscCall(PetscLogGpuTimeBegin());
997     PetscCall(MatSeqAIJKokkosSyncDevice(A));
998     const auto  aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
999     const auto &Aa     = aijkok->a_dual.view_device();
1000     const auto &Adiag  = aijkok->diag_dual.view_device();
1001     PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) { Aa(Adiag(i)) += a; }));
1002     PetscCall(MatSeqAIJKokkosModifyDevice(A));
1003     PetscCall(PetscLogGpuFlops(n));
1004     PetscCall(PetscLogGpuTimeEnd());
1005   } else { // need reassembly, very slow!
1006     PetscCall(MatShift_Basic(A, a));
1007   }
1008   PetscFunctionReturn(PETSC_SUCCESS);
1009 }
1010 
1011 static PetscErrorCode MatDiagonalSet_SeqAIJKokkos(Mat Y, Vec D, InsertMode is)
1012 {
1013   Mat_SeqAIJ *aijseq = static_cast<Mat_SeqAIJ *>(Y->data);
1014 
1015   PetscFunctionBegin;
1016   if (Y->assembled && aijseq->diagonaldense) { // no missing diagonals
1017     ConstPetscScalarKokkosView dv;
1018     PetscInt                   n, nv;
1019 
1020     PetscCall(PetscLogGpuTimeBegin());
1021     PetscCall(MatSeqAIJKokkosSyncDevice(Y));
1022     PetscCall(VecGetKokkosView(D, &dv));
1023     PetscCall(VecGetLocalSize(D, &nv));
1024     n = PetscMin(Y->rmap->n, Y->cmap->n);
1025     PetscCheck(n == nv, PetscObjectComm((PetscObject)Y), PETSC_ERR_ARG_SIZ, "Matrix size and vector size do not match");
1026 
1027     const auto  aijkok = static_cast<Mat_SeqAIJKokkos *>(Y->spptr);
1028     const auto &Aa     = aijkok->a_dual.view_device();
1029     const auto &Adiag  = aijkok->diag_dual.view_device();
1030     PetscCallCXX(Kokkos::parallel_for(
1031       Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) {
1032         if (is == INSERT_VALUES) Aa(Adiag(i)) = dv(i);
1033         else Aa(Adiag(i)) += dv(i);
1034       }));
1035     PetscCall(VecRestoreKokkosView(D, &dv));
1036     PetscCall(MatSeqAIJKokkosModifyDevice(Y));
1037     PetscCall(PetscLogGpuFlops(n));
1038     PetscCall(PetscLogGpuTimeEnd());
1039   } else { // need reassembly, very slow!
1040     PetscCall(MatDiagonalSet_Default(Y, D, is));
1041   }
1042   PetscFunctionReturn(PETSC_SUCCESS);
1043 }
1044 
1045 static PetscErrorCode MatDiagonalScale_SeqAIJKokkos(Mat A, Vec ll, Vec rr)
1046 {
1047   Mat_SeqAIJ                *aijseq = static_cast<Mat_SeqAIJ *>(A->data);
1048   PetscInt                   m = A->rmap->n, n = A->cmap->n, nz = aijseq->nz;
1049   ConstPetscScalarKokkosView lv, rv;
1050 
1051   PetscFunctionBegin;
1052   PetscCall(PetscLogGpuTimeBegin());
1053   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1054   const auto  aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1055   const auto &Aa     = aijkok->a_dual.view_device();
1056   const auto &Ai     = aijkok->i_dual.view_device();
1057   const auto &Aj     = aijkok->j_dual.view_device();
1058   if (ll) {
1059     PetscCall(VecGetLocalSize(ll, &m));
1060     PetscCheck(m == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling vector wrong length");
1061     PetscCall(VecGetKokkosView(ll, &lv));
1062     PetscCallCXX(Kokkos::parallel_for( // for each row
1063       Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
1064         PetscInt i   = t.league_rank(); // row i
1065         PetscInt len = Ai(i + 1) - Ai(i);
1066         // scale entries on the row
1067         Kokkos::parallel_for(Kokkos::TeamThreadRange(t, len), [&](PetscInt j) { Aa(Ai(i) + j) *= lv(i); });
1068       }));
1069     PetscCall(VecRestoreKokkosView(ll, &lv));
1070     PetscCall(PetscLogGpuFlops(nz));
1071   }
1072   if (rr) {
1073     PetscCall(VecGetLocalSize(rr, &n));
1074     PetscCheck(n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Right scaling vector wrong length");
1075     PetscCall(VecGetKokkosView(rr, &rv));
1076     PetscCallCXX(Kokkos::parallel_for( // for each nonzero
1077       Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt k) { Aa(k) *= rv(Aj(k)); }));
1078     PetscCall(VecRestoreKokkosView(rr, &lv));
1079     PetscCall(PetscLogGpuFlops(nz));
1080   }
1081   PetscCall(MatSeqAIJKokkosModifyDevice(A));
1082   PetscCall(PetscLogGpuTimeEnd());
1083   PetscFunctionReturn(PETSC_SUCCESS);
1084 }
1085 
1086 static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A)
1087 {
1088   Mat_SeqAIJKokkos *aijkok;
1089 
1090   PetscFunctionBegin;
1091   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1092   if (aijkok) { /* Only zero the device if data is already there */
1093     KokkosBlas::fill(PetscGetKokkosExecutionSpace(), aijkok->a_dual.view_device(), 0.0);
1094     PetscCall(MatSeqAIJKokkosModifyDevice(A));
1095   } else { /* Might be preallocated but not assembled */
1096     PetscCall(MatZeroEntries_SeqAIJ(A));
1097   }
1098   PetscFunctionReturn(PETSC_SUCCESS);
1099 }
1100 
1101 static PetscErrorCode MatGetDiagonal_SeqAIJKokkos(Mat A, Vec x)
1102 {
1103   Mat_SeqAIJKokkos     *aijkok;
1104   PetscInt              n;
1105   PetscScalarKokkosView xv;
1106 
1107   PetscFunctionBegin;
1108   PetscCall(VecGetLocalSize(x, &n));
1109   PetscCheck(n == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector");
1110   PetscCheck(A->factortype == MAT_FACTOR_NONE, PETSC_COMM_SELF, PETSC_ERR_SUP, "MatGetDiagonal_SeqAIJKokkos not supported on factored matrices");
1111 
1112   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1113   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1114 
1115   const auto &Aa    = aijkok->a_dual.view_device();
1116   const auto &Ai    = aijkok->i_dual.view_device();
1117   const auto &Adiag = aijkok->diag_dual.view_device();
1118 
1119   PetscCall(VecGetKokkosViewWrite(x, &xv));
1120   Kokkos::parallel_for(
1121     Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) {
1122       if (Adiag(i) < Ai(i + 1)) xv(i) = Aa(Adiag(i));
1123       else xv(i) = 0;
1124     });
1125   PetscCall(VecRestoreKokkosViewWrite(x, &xv));
1126   PetscFunctionReturn(PETSC_SUCCESS);
1127 }
1128 
1129 /* Get a Kokkos View from a mat of type MatSeqAIJKokkos */
1130 PetscErrorCode MatSeqAIJGetKokkosView(Mat A, ConstMatScalarKokkosView *kv)
1131 {
1132   Mat_SeqAIJKokkos *aijkok;
1133 
1134   PetscFunctionBegin;
1135   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1136   PetscAssertPointer(kv, 2);
1137   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1138   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1139   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1140   *kv    = aijkok->a_dual.view_device();
1141   PetscFunctionReturn(PETSC_SUCCESS);
1142 }
1143 
1144 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, ConstMatScalarKokkosView *kv)
1145 {
1146   PetscFunctionBegin;
1147   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1148   PetscAssertPointer(kv, 2);
1149   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1150   PetscFunctionReturn(PETSC_SUCCESS);
1151 }
1152 
1153 PetscErrorCode MatSeqAIJGetKokkosView(Mat A, MatScalarKokkosView *kv)
1154 {
1155   Mat_SeqAIJKokkos *aijkok;
1156 
1157   PetscFunctionBegin;
1158   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1159   PetscAssertPointer(kv, 2);
1160   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1161   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1162   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1163   *kv    = aijkok->a_dual.view_device();
1164   PetscFunctionReturn(PETSC_SUCCESS);
1165 }
1166 
1167 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, MatScalarKokkosView *kv)
1168 {
1169   PetscFunctionBegin;
1170   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1171   PetscAssertPointer(kv, 2);
1172   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1173   PetscCall(MatSeqAIJKokkosModifyDevice(A));
1174   PetscFunctionReturn(PETSC_SUCCESS);
1175 }
1176 
1177 PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A, MatScalarKokkosView *kv)
1178 {
1179   Mat_SeqAIJKokkos *aijkok;
1180 
1181   PetscFunctionBegin;
1182   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1183   PetscAssertPointer(kv, 2);
1184   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1185   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1186   *kv    = aijkok->a_dual.view_device();
1187   PetscFunctionReturn(PETSC_SUCCESS);
1188 }
1189 
1190 PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A, MatScalarKokkosView *kv)
1191 {
1192   PetscFunctionBegin;
1193   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1194   PetscAssertPointer(kv, 2);
1195   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1196   PetscCall(MatSeqAIJKokkosModifyDevice(A));
1197   PetscFunctionReturn(PETSC_SUCCESS);
1198 }
1199 
1200 /* Computes Y += alpha X */
1201 static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y, PetscScalar alpha, Mat X, MatStructure pattern)
1202 {
1203   Mat_SeqAIJ              *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data;
1204   Mat_SeqAIJKokkos        *xkok, *ykok, *zkok;
1205   ConstMatScalarKokkosView Xa;
1206   MatScalarKokkosView      Ya;
1207   auto                    &exec = PetscGetKokkosExecutionSpace();
1208 
1209   PetscFunctionBegin;
1210   PetscCheckTypeName(Y, MATSEQAIJKOKKOS);
1211   PetscCheckTypeName(X, MATSEQAIJKOKKOS);
1212   PetscCall(MatSeqAIJKokkosSyncDevice(Y));
1213   PetscCall(MatSeqAIJKokkosSyncDevice(X));
1214   PetscCall(PetscLogGpuTimeBegin());
1215 
1216   if (pattern != SAME_NONZERO_PATTERN && x->nz == y->nz) {
1217     PetscBool e;
1218     PetscCall(PetscArraycmp(x->i, y->i, Y->rmap->n + 1, &e));
1219     if (e) {
1220       PetscCall(PetscArraycmp(x->j, y->j, y->nz, &e));
1221       if (e) pattern = SAME_NONZERO_PATTERN;
1222     }
1223   }
1224 
1225   /* cusparseDcsrgeam2() computes C = alpha A + beta B. If one knew sparsity pattern of C, one can skip
1226     cusparseScsrgeam2_bufferSizeExt() / cusparseXcsrgeam2Nnz(), and directly call cusparseScsrgeam2().
1227     If X is SUBSET_NONZERO_PATTERN of Y, we could take advantage of this cusparse feature. However,
1228     KokkosSparse::spadd(alpha,A,beta,B,C) has symbolic and numeric phases, MatAXPY does not.
1229   */
1230   ykok = static_cast<Mat_SeqAIJKokkos *>(Y->spptr);
1231   xkok = static_cast<Mat_SeqAIJKokkos *>(X->spptr);
1232   Xa   = xkok->a_dual.view_device();
1233   Ya   = ykok->a_dual.view_device();
1234 
1235   if (pattern == SAME_NONZERO_PATTERN) {
1236     KokkosBlas::axpy(exec, alpha, Xa, Ya);
1237     PetscCall(MatSeqAIJKokkosModifyDevice(Y));
1238   } else if (pattern == SUBSET_NONZERO_PATTERN) {
1239     MatRowMapKokkosView Xi = xkok->i_dual.view_device(), Yi = ykok->i_dual.view_device();
1240     MatColIdxKokkosView Xj = xkok->j_dual.view_device(), Yj = ykok->j_dual.view_device();
1241 
1242     Kokkos::parallel_for(
1243       Kokkos::TeamPolicy<>(exec, Y->rmap->n, 1), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
1244         PetscInt i = t.league_rank(); // row i
1245         Kokkos::single(Kokkos::PerTeam(t), [=]() {
1246           // Only one thread works in a team
1247           PetscInt p, q = Yi(i);
1248           for (p = Xi(i); p < Xi(i + 1); p++) {          // For each nonzero on row i of X,
1249             while (Xj(p) != Yj(q) && q < Yi(i + 1)) q++; // find the matching nonzero on row i of Y.
1250             if (Xj(p) == Yj(q)) {                        // Found it
1251               Ya(q) += alpha * Xa(p);
1252               q++;
1253             } else {
1254             // If not found, it indicates the input is wrong (X is not a SUBSET_NONZERO_PATTERN of Y).
1255             // Just insert a NaN at the beginning of row i if it is not empty, to make the result wrong.
1256 #if PETSC_PKG_KOKKOS_VERSION_GE(3, 7, 0)
1257               if (Yi(i) != Yi(i + 1)) Ya(Yi(i)) = Kokkos::ArithTraits<PetscScalar>::nan();
1258 #else
1259               if (Yi(i) != Yi(i + 1)) Ya(Yi(i)) = Kokkos::Experimental::nan("1");
1260 #endif
1261             }
1262           }
1263         });
1264       });
1265     PetscCall(MatSeqAIJKokkosModifyDevice(Y));
1266   } else { // different nonzero patterns
1267     Mat             Z;
1268     KokkosCsrMatrix zcsr;
1269     KernelHandle    kh;
1270     kh.create_spadd_handle(true); // X, Y are sorted
1271     KokkosSparse::spadd_symbolic(&kh, xkok->csrmat, ykok->csrmat, zcsr);
1272     KokkosSparse::spadd_numeric(&kh, alpha, xkok->csrmat, (PetscScalar)1.0, ykok->csrmat, zcsr);
1273     zkok = new Mat_SeqAIJKokkos(zcsr);
1274     PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, zkok, &Z));
1275     PetscCall(MatHeaderReplace(Y, &Z));
1276     kh.destroy_spadd_handle();
1277   }
1278   PetscCall(PetscLogGpuTimeEnd());
1279   PetscCall(PetscLogGpuFlops(xkok->a_dual.extent(0) * 2)); // Because we scaled X and then added it to Y
1280   PetscFunctionReturn(PETSC_SUCCESS);
1281 }
1282 
1283 struct MatCOOStruct_SeqAIJKokkos {
1284   PetscCount           n;
1285   PetscCount           Atot;
1286   PetscInt             nz;
1287   PetscCountKokkosView jmap;
1288   PetscCountKokkosView perm;
1289 
1290   MatCOOStruct_SeqAIJKokkos(const MatCOOStruct_SeqAIJ *coo_h)
1291   {
1292     nz   = coo_h->nz;
1293     n    = coo_h->n;
1294     Atot = coo_h->Atot;
1295     jmap = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), PetscCountKokkosViewHost(coo_h->jmap, nz + 1));
1296     perm = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), PetscCountKokkosViewHost(coo_h->perm, Atot));
1297   }
1298 };
1299 
1300 static PetscErrorCode MatCOOStructDestroy_SeqAIJKokkos(void *data)
1301 {
1302   PetscFunctionBegin;
1303   PetscCallCXX(delete static_cast<MatCOOStruct_SeqAIJKokkos *>(data));
1304   PetscFunctionReturn(PETSC_SUCCESS);
1305 }
1306 
1307 static PetscErrorCode MatSetPreallocationCOO_SeqAIJKokkos(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
1308 {
1309   Mat_SeqAIJKokkos          *akok;
1310   Mat_SeqAIJ                *aseq;
1311   PetscContainer             container_h;
1312   MatCOOStruct_SeqAIJ       *coo_h;
1313   MatCOOStruct_SeqAIJKokkos *coo_d;
1314 
1315   PetscFunctionBegin;
1316   PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, coo_i, coo_j));
1317   aseq = static_cast<Mat_SeqAIJ *>(mat->data);
1318   akok = static_cast<Mat_SeqAIJKokkos *>(mat->spptr);
1319   delete akok;
1320   mat->spptr = akok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, aseq, mat->nonzerostate + 1, PETSC_FALSE);
1321   PetscCall(MatZeroEntries_SeqAIJKokkos(mat));
1322 
1323   // Copy the COO struct to device
1324   PetscCall(PetscObjectQuery((PetscObject)mat, "__PETSc_MatCOOStruct_Host", (PetscObject *)&container_h));
1325   PetscCall(PetscContainerGetPointer(container_h, (void **)&coo_h));
1326   PetscCallCXX(coo_d = new MatCOOStruct_SeqAIJKokkos(coo_h));
1327 
1328   // Put the COO struct in a container and then attach that to the matrix
1329   PetscCall(PetscObjectContainerCompose((PetscObject)mat, "__PETSc_MatCOOStruct_Device", coo_d, MatCOOStructDestroy_SeqAIJKokkos));
1330   PetscFunctionReturn(PETSC_SUCCESS);
1331 }
1332 
1333 static PetscErrorCode MatSetValuesCOO_SeqAIJKokkos(Mat A, const PetscScalar v[], InsertMode imode)
1334 {
1335   MatScalarKokkosView        Aa;
1336   ConstMatScalarKokkosView   kv;
1337   PetscMemType               memtype;
1338   PetscContainer             container;
1339   MatCOOStruct_SeqAIJKokkos *coo;
1340 
1341   PetscFunctionBegin;
1342   PetscCall(PetscObjectQuery((PetscObject)A, "__PETSc_MatCOOStruct_Device", (PetscObject *)&container));
1343   PetscCall(PetscContainerGetPointer(container, (void **)&coo));
1344 
1345   const auto &n    = coo->n;
1346   const auto &Annz = coo->nz;
1347   const auto &jmap = coo->jmap;
1348   const auto &perm = coo->perm;
1349 
1350   PetscCall(PetscGetMemType(v, &memtype));
1351   if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */
1352     kv = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), ConstMatScalarKokkosViewHost(v, n));
1353   } else {
1354     kv = ConstMatScalarKokkosView(v, n); /* Directly use v[]'s memory */
1355   }
1356 
1357   if (imode == INSERT_VALUES) PetscCall(MatSeqAIJGetKokkosViewWrite(A, &Aa)); /* write matrix values */
1358   else PetscCall(MatSeqAIJGetKokkosView(A, &Aa));                             /* read & write matrix values */
1359 
1360   PetscCall(PetscLogGpuTimeBegin());
1361   Kokkos::parallel_for(
1362     Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, Annz), KOKKOS_LAMBDA(const PetscCount i) {
1363       PetscScalar sum = 0.0;
1364       for (PetscCount k = jmap(i); k < jmap(i + 1); k++) sum += kv(perm(k));
1365       Aa(i) = (imode == INSERT_VALUES ? 0.0 : Aa(i)) + sum;
1366     });
1367   PetscCall(PetscLogGpuTimeEnd());
1368 
1369   if (imode == INSERT_VALUES) PetscCall(MatSeqAIJRestoreKokkosViewWrite(A, &Aa));
1370   else PetscCall(MatSeqAIJRestoreKokkosView(A, &Aa));
1371   PetscFunctionReturn(PETSC_SUCCESS);
1372 }
1373 
1374 static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info)
1375 {
1376   PetscFunctionBegin;
1377   PetscCall(MatSeqAIJKokkosSyncHost(A));
1378   PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info));
1379   B->offloadmask = PETSC_OFFLOAD_CPU;
1380   PetscFunctionReturn(PETSC_SUCCESS);
1381 }
1382 
1383 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A)
1384 {
1385   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
1386 
1387   PetscFunctionBegin;
1388   A->offloadmask = PETSC_OFFLOAD_KOKKOS; /* We do not really use this flag */
1389   A->boundtocpu  = PETSC_FALSE;
1390 
1391   A->ops->assemblyend               = MatAssemblyEnd_SeqAIJKokkos;
1392   A->ops->destroy                   = MatDestroy_SeqAIJKokkos;
1393   A->ops->duplicate                 = MatDuplicate_SeqAIJKokkos;
1394   A->ops->axpy                      = MatAXPY_SeqAIJKokkos;
1395   A->ops->scale                     = MatScale_SeqAIJKokkos;
1396   A->ops->zeroentries               = MatZeroEntries_SeqAIJKokkos;
1397   A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJKokkos;
1398   A->ops->mult                      = MatMult_SeqAIJKokkos;
1399   A->ops->multadd                   = MatMultAdd_SeqAIJKokkos;
1400   A->ops->multtranspose             = MatMultTranspose_SeqAIJKokkos;
1401   A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJKokkos;
1402   A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJKokkos;
1403   A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos;
1404   A->ops->productnumeric            = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos;
1405   A->ops->transpose                 = MatTranspose_SeqAIJKokkos;
1406   A->ops->setoption                 = MatSetOption_SeqAIJKokkos;
1407   A->ops->getdiagonal               = MatGetDiagonal_SeqAIJKokkos;
1408   A->ops->shift                     = MatShift_SeqAIJKokkos;
1409   A->ops->diagonalset               = MatDiagonalSet_SeqAIJKokkos;
1410   A->ops->diagonalscale             = MatDiagonalScale_SeqAIJKokkos;
1411   a->ops->getarray                  = MatSeqAIJGetArray_SeqAIJKokkos;
1412   a->ops->restorearray              = MatSeqAIJRestoreArray_SeqAIJKokkos;
1413   a->ops->getarrayread              = MatSeqAIJGetArrayRead_SeqAIJKokkos;
1414   a->ops->restorearrayread          = MatSeqAIJRestoreArrayRead_SeqAIJKokkos;
1415   a->ops->getarraywrite             = MatSeqAIJGetArrayWrite_SeqAIJKokkos;
1416   a->ops->restorearraywrite         = MatSeqAIJRestoreArrayWrite_SeqAIJKokkos;
1417   a->ops->getcsrandmemtype          = MatSeqAIJGetCSRAndMemType_SeqAIJKokkos;
1418 
1419   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJKokkos));
1420   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJKokkos));
1421   PetscFunctionReturn(PETSC_SUCCESS);
1422 }
1423 
1424 /*
1425    Extract the (prescribled) diagonal blocks of the matrix and then invert them
1426 
1427   Input Parameters:
1428 +  A       - the MATSEQAIJKOKKOS matrix
1429 .  bs      - block sizes in 'csr' format, i.e., the i-th block has size bs(i+1) - bs(i)
1430 .  bs2     - square of block sizes in 'csr' format, i.e., the i-th block should be stored at offset bs2(i) in diagVal[]
1431 .  blkMap  - map row ids to block ids, i.e., row i belongs to the block blkMap(i)
1432 -  work    - a pre-allocated work buffer (as big as diagVal) for use by this routine
1433 
1434   Output Parameter:
1435 .  diagVal - the (pre-allocated) buffer to store the inverted blocks (each block is stored in column-major order)
1436 */
1437 PETSC_INTERN PetscErrorCode MatInvertVariableBlockDiagonal_SeqAIJKokkos(Mat A, const PetscIntKokkosView &bs, const PetscIntKokkosView &bs2, const PetscIntKokkosView &blkMap, PetscScalarKokkosView &work, PetscScalarKokkosView &diagVal)
1438 {
1439   Mat_SeqAIJKokkos *akok    = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1440   PetscInt          nblocks = bs.extent(0) - 1;
1441 
1442   PetscFunctionBegin;
1443   PetscCall(MatSeqAIJKokkosSyncDevice(A)); // Since we'll access A's value on device
1444 
1445   // Pull out the diagonal blocks of the matrix and then invert the blocks
1446   auto Aa    = akok->a_dual.view_device();
1447   auto Ai    = akok->i_dual.view_device();
1448   auto Aj    = akok->j_dual.view_device();
1449   auto Adiag = akok->diag_dual.view_device();
1450   // TODO: how to tune the team size?
1451 #if defined(KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_HOST)
1452   auto ts = Kokkos::AUTO();
1453 #else
1454   auto ts = 16; // improved performance 30% over Kokkos::AUTO() with CUDA, but failed with "Kokkos::abort: Requested Team Size is too large!" on CPUs
1455 #endif
1456   PetscCallCXX(Kokkos::parallel_for(
1457     Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), nblocks, ts), KOKKOS_LAMBDA(const KokkosTeamMemberType &teamMember) {
1458       const PetscInt bid    = teamMember.league_rank();                                                   // block id
1459       const PetscInt rstart = bs(bid);                                                                    // this block starts from this row
1460       const PetscInt m      = bs(bid + 1) - bs(bid);                                                      // size of this block
1461       const auto    &B      = Kokkos::View<PetscScalar **, Kokkos::LayoutLeft>(&diagVal(bs2(bid)), m, m); // column-major order
1462       const auto    &W      = PetscScalarKokkosView(&work(bs2(bid)), m * m);
1463 
1464       Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, m), [=](const PetscInt &r) { // r-th row in B
1465         PetscInt i = rstart + r;                                                            // i-th row in A
1466 
1467         if (Ai(i) <= Adiag(i) && Adiag(i) < Ai(i + 1)) { // if the diagonal exists (common case)
1468           PetscInt first = Adiag(i) - r;                 // we start to check nonzeros from here along this row
1469 
1470           for (PetscInt c = 0; c < m; c++) {                   // walk n steps to see what column indices we will meet
1471             if (first + c < Ai(i) || first + c >= Ai(i + 1)) { // this entry (first+c) is out of range of this row, in other words, its value is zero
1472               B(r, c) = 0.0;
1473             } else if (Aj(first + c) == rstart + c) { // this entry is right on the (rstart+c) column
1474               B(r, c) = Aa(first + c);
1475             } else { // this entry does not show up in the CSR
1476               B(r, c) = 0.0;
1477             }
1478           }
1479         } else { // rare case that the diagonal does not exist
1480           const PetscInt begin = Ai(i);
1481           const PetscInt end   = Ai(i + 1);
1482           for (PetscInt c = 0; c < m; c++) B(r, c) = 0.0;
1483           for (PetscInt j = begin; j < end; j++) { // scan the whole row; could use binary search but this is a rare case so we did not.
1484             if (rstart <= Aj(j) && Aj(j) < rstart + m) B(r, Aj(j) - rstart) = Aa(j);
1485             else if (Aj(j) >= rstart + m) break;
1486           }
1487         }
1488       });
1489 
1490       // LU-decompose B (w/o pivoting) and then invert B
1491       KokkosBatched::TeamLU<KokkosTeamMemberType, KokkosBatched::Algo::LU::Unblocked>::invoke(teamMember, B, 0.0);
1492       KokkosBatched::TeamInverseLU<KokkosTeamMemberType, KokkosBatched::Algo::InverseLU::Unblocked>::invoke(teamMember, B, W);
1493     }));
1494   // PetscLogGpuFlops() is done in the caller PCSetUp_VPBJacobi_Kokkos as we don't want to compute the flops in kernels
1495   PetscFunctionReturn(PETSC_SUCCESS);
1496 }
1497 
1498 PETSC_INTERN PetscErrorCode MatSetSeqAIJKokkosWithCSRMatrix(Mat A, Mat_SeqAIJKokkos *akok)
1499 {
1500   Mat_SeqAIJ *aseq;
1501   PetscInt    i, m, n;
1502   auto       &exec = PetscGetKokkosExecutionSpace();
1503 
1504   PetscFunctionBegin;
1505   PetscCheck(!A->spptr, PETSC_COMM_SELF, PETSC_ERR_PLIB, "A->spptr is supposed to be empty");
1506 
1507   m = akok->nrows();
1508   n = akok->ncols();
1509   PetscCall(MatSetSizes(A, m, n, m, n));
1510   PetscCall(MatSetType(A, MATSEQAIJKOKKOS));
1511 
1512   /* Set up data structures of A as a MATSEQAIJ */
1513   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(A, MAT_SKIP_ALLOCATION, NULL));
1514   aseq = (Mat_SeqAIJ *)A->data;
1515 
1516   PetscCallCXX(akok->i_dual.sync_host(exec)); /* We always need sync'ed i, j on host */
1517   PetscCallCXX(akok->j_dual.sync_host(exec));
1518   PetscCallCXX(exec.fence());
1519 
1520   aseq->i       = akok->i_host_data();
1521   aseq->j       = akok->j_host_data();
1522   aseq->a       = akok->a_host_data();
1523   aseq->nonew   = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/
1524   aseq->free_a  = PETSC_FALSE;
1525   aseq->free_ij = PETSC_FALSE;
1526   aseq->nz      = akok->nnz();
1527   aseq->maxnz   = aseq->nz;
1528 
1529   PetscCall(PetscMalloc1(m, &aseq->imax));
1530   PetscCall(PetscMalloc1(m, &aseq->ilen));
1531   for (i = 0; i < m; i++) aseq->ilen[i] = aseq->imax[i] = aseq->i[i + 1] - aseq->i[i];
1532 
1533   /* It is critical to set the nonzerostate, as we use it to check if sparsity pattern (hence data) has changed on host in MatAssemblyEnd */
1534   akok->nonzerostate = A->nonzerostate;
1535   A->spptr           = akok; /* Set A->spptr before MatAssembly so that A->spptr won't be allocated again there */
1536   PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY));
1537   PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY));
1538   PetscFunctionReturn(PETSC_SUCCESS);
1539 }
1540 
1541 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGetKokkosCsrMatrix(Mat A, KokkosCsrMatrix *csr)
1542 {
1543   PetscFunctionBegin;
1544   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1545   *csr = static_cast<Mat_SeqAIJKokkos *>(A->spptr)->csrmat;
1546   PetscFunctionReturn(PETSC_SUCCESS);
1547 }
1548 
1549 PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithKokkosCsrMatrix(MPI_Comm comm, KokkosCsrMatrix csr, Mat *A)
1550 {
1551   Mat_SeqAIJKokkos *akok;
1552 
1553   PetscFunctionBegin;
1554   PetscCallCXX(akok = new Mat_SeqAIJKokkos(csr));
1555   PetscCall(MatCreate(comm, A));
1556   PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok));
1557   PetscFunctionReturn(PETSC_SUCCESS);
1558 }
1559 
1560 /* Crete a SEQAIJKOKKOS matrix with a Mat_SeqAIJKokkos data structure
1561 
1562    Note we have names like MatSeqAIJSetPreallocationCSR, so I use capitalized CSR
1563  */
1564 PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithCSRMatrix(MPI_Comm comm, Mat_SeqAIJKokkos *akok, Mat *A)
1565 {
1566   PetscFunctionBegin;
1567   PetscCall(MatCreate(comm, A));
1568   PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok));
1569   PetscFunctionReturn(PETSC_SUCCESS);
1570 }
1571 
1572 /*@C
1573   MatCreateSeqAIJKokkos - Creates a sparse matrix in `MATSEQAIJKOKKOS` (compressed row) format
1574   (the default parallel PETSc format). This matrix will ultimately be handled by
1575   Kokkos for calculations.
1576 
1577   Collective
1578 
1579   Input Parameters:
1580 + comm - MPI communicator, set to `PETSC_COMM_SELF`
1581 . m    - number of rows
1582 . n    - number of columns
1583 . nz   - number of nonzeros per row (same for all rows), ignored if `nnz` is provided
1584 - nnz  - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL`
1585 
1586   Output Parameter:
1587 . A - the matrix
1588 
1589   Level: intermediate
1590 
1591   Notes:
1592   It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
1593   MatXXXXSetPreallocation() paradgm instead of this routine directly.
1594   [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]
1595 
1596   The AIJ format, also called
1597   compressed row storage, is fully compatible with standard Fortran
1598   storage.  That is, the stored row and column indices can begin at
1599   either one (as in Fortran) or zero.
1600 
1601   Specify the preallocated storage with either `nz` or `nnz` (not both).
1602   Set `nz` = `PETSC_DEFAULT` and `nnz` = `NULL` for PETSc to control dynamic memory
1603   allocation.
1604 
1605 .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`
1606 @*/
1607 PetscErrorCode MatCreateSeqAIJKokkos(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A)
1608 {
1609   PetscFunctionBegin;
1610   PetscCall(PetscKokkosInitializeCheck());
1611   PetscCall(MatCreate(comm, A));
1612   PetscCall(MatSetSizes(*A, m, n, m, n));
1613   PetscCall(MatSetType(*A, MATSEQAIJKOKKOS));
1614   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz));
1615   PetscFunctionReturn(PETSC_SUCCESS);
1616 }
1617 
1618 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
1619 {
1620   PetscFunctionBegin;
1621   PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
1622   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos;
1623   PetscFunctionReturn(PETSC_SUCCESS);
1624 }
1625 
1626 static PetscErrorCode MatSeqAIJKokkosSymbolicSolveCheck(Mat A)
1627 {
1628   Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1629 
1630   PetscFunctionBegin;
1631   if (!factors->sptrsv_symbolic_completed) {
1632     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khU, factors->iU_d, factors->jU_d, factors->aU_d);
1633     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khL, factors->iL_d, factors->jL_d, factors->aL_d);
1634     factors->sptrsv_symbolic_completed = PETSC_TRUE;
1635   }
1636   PetscFunctionReturn(PETSC_SUCCESS);
1637 }
1638 
1639 /* Check if we need to update factors etc for transpose solve */
1640 static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A)
1641 {
1642   Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1643   MatColIdxType               n       = A->rmap->n;
1644 
1645   PetscFunctionBegin;
1646   if (!factors->transpose_updated) { /* TODO: KK needs to provide functions to do numeric transpose only */
1647     /* Update L^T and do sptrsv symbolic */
1648     factors->iLt_d = MatRowMapKokkosView("factors->iLt_d", n + 1); // KK requires 0
1649     factors->jLt_d = MatColIdxKokkosView(NoInit("factors->jLt_d"), factors->jL_d.extent(0));
1650     factors->aLt_d = MatScalarKokkosView(NoInit("factors->aLt_d"), factors->aL_d.extent(0));
1651 
1652     transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iL_d, factors->jL_d, factors->aL_d,
1653                                                                                                                                                                                                               factors->iLt_d, factors->jLt_d, factors->aLt_d);
1654 
1655     /* TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices.
1656       We have to sort the indices, until KK provides finer control options.
1657     */
1658     sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iLt_d, factors->jLt_d, factors->aLt_d);
1659 
1660     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d);
1661 
1662     /* Update U^T and do sptrsv symbolic */
1663     factors->iUt_d = MatRowMapKokkosView("factors->iUt_d", n + 1); // KK requires 0
1664     factors->jUt_d = MatColIdxKokkosView(NoInit("factors->jUt_d"), factors->jU_d.extent(0));
1665     factors->aUt_d = MatScalarKokkosView(NoInit("factors->aUt_d"), factors->aU_d.extent(0));
1666 
1667     transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iU_d, factors->jU_d, factors->aU_d,
1668                                                                                                                                                                                                               factors->iUt_d, factors->jUt_d, factors->aUt_d);
1669 
1670     /* Sort indices. See comments above */
1671     sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iUt_d, factors->jUt_d, factors->aUt_d);
1672 
1673     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d);
1674     factors->transpose_updated = PETSC_TRUE;
1675   }
1676   PetscFunctionReturn(PETSC_SUCCESS);
1677 }
1678 
1679 /* Solve Ax = b, with A = LU */
1680 static PetscErrorCode MatSolve_SeqAIJKokkos(Mat A, Vec b, Vec x)
1681 {
1682   ConstPetscScalarKokkosView  bv;
1683   PetscScalarKokkosView       xv;
1684   Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1685 
1686   PetscFunctionBegin;
1687   PetscCall(PetscLogGpuTimeBegin());
1688   PetscCall(MatSeqAIJKokkosSymbolicSolveCheck(A));
1689   PetscCall(VecGetKokkosView(b, &bv));
1690   PetscCall(VecGetKokkosViewWrite(x, &xv));
1691   /* Solve L tmpv = b */
1692   PetscCallCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khL, factors->iL_d, factors->jL_d, factors->aL_d, bv, factors->workVector));
1693   /* Solve Ux = tmpv */
1694   PetscCallCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khU, factors->iU_d, factors->jU_d, factors->aU_d, factors->workVector, xv));
1695   PetscCall(VecRestoreKokkosView(b, &bv));
1696   PetscCall(VecRestoreKokkosViewWrite(x, &xv));
1697   PetscCall(PetscLogGpuTimeEnd());
1698   PetscFunctionReturn(PETSC_SUCCESS);
1699 }
1700 
1701 /* Solve A^T x = b, where A^T = U^T L^T */
1702 static PetscErrorCode MatSolveTranspose_SeqAIJKokkos(Mat A, Vec b, Vec x)
1703 {
1704   ConstPetscScalarKokkosView  bv;
1705   PetscScalarKokkosView       xv;
1706   Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1707 
1708   PetscFunctionBegin;
1709   PetscCall(PetscLogGpuTimeBegin());
1710   PetscCall(MatSeqAIJKokkosTransposeSolveCheck(A));
1711   PetscCall(VecGetKokkosView(b, &bv));
1712   PetscCall(VecGetKokkosViewWrite(x, &xv));
1713   /* Solve U^T tmpv = b */
1714   KokkosSparse::Experimental::sptrsv_solve(&factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d, bv, factors->workVector);
1715 
1716   /* Solve L^T x = tmpv */
1717   KokkosSparse::Experimental::sptrsv_solve(&factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d, factors->workVector, xv);
1718   PetscCall(VecRestoreKokkosView(b, &bv));
1719   PetscCall(VecRestoreKokkosViewWrite(x, &xv));
1720   PetscCall(PetscLogGpuTimeEnd());
1721   PetscFunctionReturn(PETSC_SUCCESS);
1722 }
1723 
1724 static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info)
1725 {
1726   Mat_SeqAIJKokkos           *aijkok   = (Mat_SeqAIJKokkos *)A->spptr;
1727   Mat_SeqAIJKokkosTriFactors *factors  = (Mat_SeqAIJKokkosTriFactors *)B->spptr;
1728   PetscInt                    fill_lev = info->levels;
1729 
1730   PetscFunctionBegin;
1731   PetscCall(PetscLogGpuTimeBegin());
1732   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1733 
1734   auto a_d = aijkok->a_dual.view_device();
1735   auto i_d = aijkok->i_dual.view_device();
1736   auto j_d = aijkok->j_dual.view_device();
1737 
1738   KokkosSparse::Experimental::spiluk_numeric(&factors->kh, fill_lev, i_d, j_d, a_d, factors->iL_d, factors->jL_d, factors->aL_d, factors->iU_d, factors->jU_d, factors->aU_d);
1739 
1740   B->assembled              = PETSC_TRUE;
1741   B->preallocated           = PETSC_TRUE;
1742   B->ops->solve             = MatSolve_SeqAIJKokkos;
1743   B->ops->solvetranspose    = MatSolveTranspose_SeqAIJKokkos;
1744   B->ops->matsolve          = NULL;
1745   B->ops->matsolvetranspose = NULL;
1746   B->offloadmask            = PETSC_OFFLOAD_GPU;
1747 
1748   /* Once the factors' value changed, we need to update their transpose and sptrsv handle */
1749   factors->transpose_updated         = PETSC_FALSE;
1750   factors->sptrsv_symbolic_completed = PETSC_FALSE;
1751   /* TODO: log flops, but how to know that? */
1752   PetscCall(PetscLogGpuTimeEnd());
1753   PetscFunctionReturn(PETSC_SUCCESS);
1754 }
1755 
1756 static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
1757 {
1758   Mat_SeqAIJKokkos           *aijkok;
1759   Mat_SeqAIJ                 *b;
1760   Mat_SeqAIJKokkosTriFactors *factors  = (Mat_SeqAIJKokkosTriFactors *)B->spptr;
1761   PetscInt                    fill_lev = info->levels;
1762   PetscInt                    nnzA     = ((Mat_SeqAIJ *)A->data)->nz, nnzL, nnzU;
1763   PetscInt                    n        = A->rmap->n;
1764 
1765   PetscFunctionBegin;
1766   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1767   /* Rebuild factors */
1768   if (factors) {
1769     factors->Destroy();
1770   } /* Destroy the old if it exists */
1771   else {
1772     B->spptr = factors = new Mat_SeqAIJKokkosTriFactors(n);
1773   }
1774 
1775   /* Create a spiluk handle and then do symbolic factorization */
1776   nnzL = nnzU = PetscRealIntMultTruncate(info->fill, nnzA);
1777   factors->kh.create_spiluk_handle(KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1, n, nnzL, nnzU);
1778 
1779   auto spiluk_handle = factors->kh.get_spiluk_handle();
1780 
1781   Kokkos::realloc(factors->iL_d, n + 1); /* Free old arrays and realloc */
1782   Kokkos::realloc(factors->jL_d, spiluk_handle->get_nnzL());
1783   Kokkos::realloc(factors->iU_d, n + 1);
1784   Kokkos::realloc(factors->jU_d, spiluk_handle->get_nnzU());
1785 
1786   aijkok   = (Mat_SeqAIJKokkos *)A->spptr;
1787   auto i_d = aijkok->i_dual.view_device();
1788   auto j_d = aijkok->j_dual.view_device();
1789   KokkosSparse::Experimental::spiluk_symbolic(&factors->kh, fill_lev, i_d, j_d, factors->iL_d, factors->jL_d, factors->iU_d, factors->jU_d);
1790   /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */
1791 
1792   Kokkos::resize(factors->jL_d, spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */
1793   Kokkos::resize(factors->jU_d, spiluk_handle->get_nnzU());
1794   Kokkos::realloc(factors->aL_d, spiluk_handle->get_nnzL()); /* No need to retain old value */
1795   Kokkos::realloc(factors->aU_d, spiluk_handle->get_nnzU());
1796 
1797   /* TODO: add options to select sptrsv algorithms */
1798   /* Create sptrsv handles for L, U and their transpose */
1799 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
1800   auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SPTRSV_CUSPARSE;
1801 #else
1802   auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1;
1803 #endif
1804 
1805   factors->khL.create_sptrsv_handle(sptrsv_alg, n, true /* L is lower tri */);
1806   factors->khU.create_sptrsv_handle(sptrsv_alg, n, false /* U is not lower tri */);
1807   factors->khLt.create_sptrsv_handle(sptrsv_alg, n, false /* L^T is not lower tri */);
1808   factors->khUt.create_sptrsv_handle(sptrsv_alg, n, true /* U^T is lower tri */);
1809 
1810   /* Fill fields of the factor matrix B */
1811   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(B, MAT_SKIP_ALLOCATION, NULL));
1812   b     = (Mat_SeqAIJ *)B->data;
1813   b->nz = b->maxnz          = spiluk_handle->get_nnzL() + spiluk_handle->get_nnzU();
1814   B->info.fill_ratio_given  = info->fill;
1815   B->info.fill_ratio_needed = nnzA > 0 ? ((PetscReal)b->nz) / ((PetscReal)nnzA) : 1.0;
1816 
1817   B->offloadmask          = PETSC_OFFLOAD_GPU;
1818   B->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJKokkos;
1819   PetscFunctionReturn(PETSC_SUCCESS);
1820 }
1821 
1822 static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos(Mat A, MatSolverType *type)
1823 {
1824   PetscFunctionBegin;
1825   *type = MATSOLVERKOKKOS;
1826   PetscFunctionReturn(PETSC_SUCCESS);
1827 }
1828 
1829 /*MC
1830   MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices
1831   on a single GPU of type, `MATSEQAIJKOKKOS`, `MATAIJKOKKOS`.
1832 
1833   Level: beginner
1834 
1835 .seealso: [](ch_matrices), `Mat`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJKokkos()`, `MATAIJKOKKOS`, `MatKokkosSetFormat()`, `MatKokkosStorageFormat`, `MatKokkosFormatOperation`
1836 M*/
1837 PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A, MatFactorType ftype, Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */
1838 {
1839   PetscInt n = A->rmap->n;
1840 
1841   PetscFunctionBegin;
1842   PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B));
1843   PetscCall(MatSetSizes(*B, n, n, n, n));
1844   (*B)->factortype = ftype;
1845   PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU]));
1846   PetscCall(MatSetType(*B, MATSEQAIJKOKKOS));
1847 
1848   if (ftype == MAT_FACTOR_LU) {
1849     PetscCall(MatSetBlockSizesFromMats(*B, A, A));
1850     (*B)->canuseordering        = PETSC_TRUE;
1851     (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKokkos;
1852   } else if (ftype == MAT_FACTOR_ILU) {
1853     PetscCall(MatSetBlockSizesFromMats(*B, A, A));
1854     (*B)->canuseordering         = PETSC_FALSE;
1855     (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos;
1856   } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]);
1857 
1858   PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL));
1859   PetscCall(PetscObjectComposeFunction((PetscObject)*B, "MatFactorGetSolverType_C", MatFactorGetSolverType_SeqAIJKokkos));
1860   PetscFunctionReturn(PETSC_SUCCESS);
1861 }
1862 
1863 PETSC_INTERN PetscErrorCode MatSolverTypeRegister_KOKKOS(void)
1864 {
1865   PetscFunctionBegin;
1866   PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_LU, MatGetFactor_SeqAIJKokkos_Kokkos));
1867   PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_ILU, MatGetFactor_SeqAIJKokkos_Kokkos));
1868   PetscFunctionReturn(PETSC_SUCCESS);
1869 }
1870 
1871 /* Utility to print out a KokkosCsrMatrix for debugging */
1872 PETSC_INTERN PetscErrorCode PrintCsrMatrix(const KokkosCsrMatrix &csrmat)
1873 {
1874   const auto        &iv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), csrmat.graph.row_map);
1875   const auto        &jv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), csrmat.graph.entries);
1876   const auto        &av = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), csrmat.values);
1877   const PetscInt    *i  = iv.data();
1878   const PetscInt    *j  = jv.data();
1879   const PetscScalar *a  = av.data();
1880   PetscInt           m = csrmat.numRows(), n = csrmat.numCols(), nnz = csrmat.nnz();
1881 
1882   PetscFunctionBegin;
1883   PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT " x %" PetscInt_FMT " SeqAIJKokkos, with %" PetscInt_FMT " nonzeros\n", m, n, nnz));
1884   for (PetscInt k = 0; k < m; k++) {
1885     PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT ": ", k));
1886     for (PetscInt p = i[k]; p < i[k + 1]; p++) PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT "(%.1f), ", j[p], (double)PetscRealPart(a[p])));
1887     PetscCall(PetscPrintf(PETSC_COMM_SELF, "\n"));
1888   }
1889   PetscFunctionReturn(PETSC_SUCCESS);
1890 }
1891