xref: /petsc/src/mat/impls/aij/seq/kokkos/aijkok.kokkos.cxx (revision 095fb05fba1caf9b11dcec8f3d0f42e5f5a166fb)
1 #include <petscvec_kokkos.hpp>
2 #include <petscpkg_version.h>
3 #include <petsc/private/petscimpl.h>
4 #include <petsc/private/sfimpl.h>
5 #include <petscsystypes.h>
6 #include <petscerror.h>
7 
8 #include <Kokkos_Core.hpp>
9 #include <KokkosBlas.hpp>
10 #include <KokkosSparse_CrsMatrix.hpp>
11 #include <KokkosSparse_spmv.hpp>
12 #include <KokkosSparse_spiluk.hpp>
13 #include <KokkosSparse_sptrsv.hpp>
14 #include <KokkosSparse_spgemm.hpp>
15 #include <KokkosSparse_spadd.hpp>
16 
17 #include <../src/mat/impls/aij/seq/kokkos/aijkok.hpp>
18 
19 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_GE(3, 6, 99)
20   #include <KokkosSparse_Utils.hpp>
21 using KokkosSparse::sort_crs_matrix;
22 using KokkosSparse::Impl::transpose_matrix;
23 #else
24   #include <KokkosKernels_Sorting.hpp>
25 using KokkosKernels::sort_crs_matrix;
26 using KokkosKernels::Impl::transpose_matrix;
27 #endif
28 
29 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */
30 
31 /* MatAssemblyEnd_SeqAIJKokkos() happens when we finalized nonzeros of the matrix, either after
32    we assembled the matrix on host, or after we directly produced the matrix data on device (ex., through MatMatMult).
33    In the latter case, it is important to set a_dual's sync state correctly.
34  */
35 static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A, MatAssemblyType mode)
36 {
37   Mat_SeqAIJ       *aijseq;
38   Mat_SeqAIJKokkos *aijkok;
39 
40   PetscFunctionBegin;
41   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS);
42   PetscCall(MatAssemblyEnd_SeqAIJ(A, mode));
43 
44   aijseq = static_cast<Mat_SeqAIJ *>(A->data);
45   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
46 
47   /* If aijkok does not exist, we just copy i, j to device.
48      If aijkok already exists, but the device's nonzero pattern does not match with the host's, we assume the latest data is on host.
49      In both cases, we build a new aijkok structure.
50   */
51   if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { /* aijkok might not exist yet or nonzero pattern has changed */
52     delete aijkok;
53     aijkok   = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aijseq->nz, aijseq->i, aijseq->j, aijseq->a, A->nonzerostate, PETSC_FALSE /*don't copy mat values to device*/);
54     A->spptr = aijkok;
55   }
56 
57   if (aijkok->device_mat_d.data()) {
58     A->offloadmask = PETSC_OFFLOAD_GPU; // in GPU mode, no going back. MatSetValues checks this
59   }
60   PetscFunctionReturn(PETSC_SUCCESS);
61 }
62 
63 /* Sync CSR data to device if not yet */
64 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A)
65 {
66   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
67 
68   PetscFunctionBegin;
69   PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Cann't sync factorized matrix from host to device");
70   PetscCheck(aijkok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
71   if (aijkok->a_dual.need_sync_device()) {
72     aijkok->a_dual.sync_device();
73     aijkok->transpose_updated = PETSC_FALSE; /* values of the transpose is out-of-date */
74     aijkok->hermitian_updated = PETSC_FALSE;
75   }
76   PetscFunctionReturn(PETSC_SUCCESS);
77 }
78 
79 /* Mark the CSR data on device as modified */
80 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosModifyDevice(Mat A)
81 {
82   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
83 
84   PetscFunctionBegin;
85   PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Not supported for factorized matries");
86   aijkok->a_dual.clear_sync_state();
87   aijkok->a_dual.modify_device();
88   aijkok->transpose_updated = PETSC_FALSE;
89   aijkok->hermitian_updated = PETSC_FALSE;
90   PetscCall(MatSeqAIJInvalidateDiagonal(A));
91   PetscCall(PetscObjectStateIncrease((PetscObject)A));
92   PetscFunctionReturn(PETSC_SUCCESS);
93 }
94 
95 static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A)
96 {
97   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
98 
99   PetscFunctionBegin;
100   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
101   /* We do not expect one needs factors on host  */
102   PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Cann't sync factorized matrix from device to host");
103   PetscCheck(aijkok, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing AIJKOK");
104   aijkok->a_dual.sync_host();
105   PetscFunctionReturn(PETSC_SUCCESS);
106 }
107 
108 static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A, PetscScalar *array[])
109 {
110   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
111 
112   PetscFunctionBegin;
113   /* aijkok contains valid pointers only if the host's nonzerostate matches with the device's.
114     Calling MatSeqAIJSetPreallocation() or MatSetValues() on host, where aijseq->{i,j,a} might be
115     reallocated, will lead to stale {i,j,a}_dual in aijkok. In both operations, the hosts's nonzerostate
116     must have been updated. The stale aijkok will be rebuilt during MatAssemblyEnd.
117   */
118   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
119     aijkok->a_dual.sync_host();
120     *array = aijkok->a_dual.view_host().data();
121   } else { /* Happens when calling MatSetValues on a newly created matrix */
122     *array = static_cast<Mat_SeqAIJ *>(A->data)->a;
123   }
124   PetscFunctionReturn(PETSC_SUCCESS);
125 }
126 
127 static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJKokkos(Mat A, PetscScalar *array[])
128 {
129   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
130 
131   PetscFunctionBegin;
132   if (aijkok && A->nonzerostate == aijkok->nonzerostate) aijkok->a_dual.modify_host();
133   PetscFunctionReturn(PETSC_SUCCESS);
134 }
135 
136 static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[])
137 {
138   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
139 
140   PetscFunctionBegin;
141   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
142     aijkok->a_dual.sync_host();
143     *array = aijkok->a_dual.view_host().data();
144   } else {
145     *array = static_cast<Mat_SeqAIJ *>(A->data)->a;
146   }
147   PetscFunctionReturn(PETSC_SUCCESS);
148 }
149 
150 static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[])
151 {
152   PetscFunctionBegin;
153   *array = NULL;
154   PetscFunctionReturn(PETSC_SUCCESS);
155 }
156 
157 static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[])
158 {
159   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
160 
161   PetscFunctionBegin;
162   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
163     *array = aijkok->a_dual.view_host().data();
164   } else { /* Ex. happens with MatZeroEntries on a preallocated but not assembled matrix */
165     *array = static_cast<Mat_SeqAIJ *>(A->data)->a;
166   }
167   PetscFunctionReturn(PETSC_SUCCESS);
168 }
169 
170 static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[])
171 {
172   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
173 
174   PetscFunctionBegin;
175   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
176     aijkok->a_dual.clear_sync_state();
177     aijkok->a_dual.modify_host();
178   }
179   PetscFunctionReturn(PETSC_SUCCESS);
180 }
181 
182 static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJKokkos(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype)
183 {
184   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
185 
186   PetscFunctionBegin;
187   PetscCheck(aijkok != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "aijkok is NULL");
188 
189   if (i) *i = aijkok->i_device_data();
190   if (j) *j = aijkok->j_device_data();
191   if (a) {
192     aijkok->a_dual.sync_device();
193     *a = aijkok->a_device_data();
194   }
195   if (mtype) *mtype = PETSC_MEMTYPE_KOKKOS;
196   PetscFunctionReturn(PETSC_SUCCESS);
197 }
198 
199 // MatSeqAIJKokkosSetDeviceMat takes a PetscSplitCSRDataStructure with device data and copies it to the device. Note, "deep_copy" here is really a shallow copy
200 PetscErrorCode MatSeqAIJKokkosSetDeviceMat(Mat A, PetscSplitCSRDataStructure h_mat)
201 {
202   Mat_SeqAIJKokkos                            *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
203   Kokkos::View<SplitCSRMat, Kokkos::HostSpace> h_mat_k(h_mat);
204 
205   PetscFunctionBegin;
206   PetscCheck(aijkok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
207   aijkok->device_mat_d = create_mirror(DefaultMemorySpace(), h_mat_k);
208   Kokkos::deep_copy(aijkok->device_mat_d, h_mat_k);
209   PetscFunctionReturn(PETSC_SUCCESS);
210 }
211 
212 // MatSeqAIJKokkosGetDeviceMat gets the device if it is here, otherwise it creates a place for it and returns NULL
213 PetscErrorCode MatSeqAIJKokkosGetDeviceMat(Mat A, PetscSplitCSRDataStructure *d_mat)
214 {
215   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
216 
217   PetscFunctionBegin;
218   if (aijkok && aijkok->device_mat_d.data()) {
219     *d_mat = aijkok->device_mat_d.data();
220   } else {
221     PetscCall(MatSeqAIJKokkosSyncDevice(A)); // create aijkok (we are making d_mat now so make a place for it)
222     *d_mat = NULL;
223   }
224   PetscFunctionReturn(PETSC_SUCCESS);
225 }
226 
227 /* Generate the transpose on device and cache it internally */
228 static PetscErrorCode MatSeqAIJKokkosGenerateTranspose_Private(Mat A, KokkosCsrMatrix **csrmatT)
229 {
230   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
231 
232   PetscFunctionBegin;
233   PetscCheck(aijkok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
234   if (!aijkok->csrmatT.nnz() || !aijkok->transpose_updated) { /* Generate At for the first time OR just update its values */
235     /* FIXME: KK does not separate symbolic/numeric transpose. We could have a permutation array to help value-only update */
236     PetscCallCXX(aijkok->a_dual.sync_device());
237     PetscCallCXX(aijkok->csrmatT = transpose_matrix(aijkok->csrmat));
238     PetscCallCXX(sort_crs_matrix(aijkok->csrmatT));
239     aijkok->transpose_updated = PETSC_TRUE;
240   }
241   *csrmatT = &aijkok->csrmatT;
242   PetscFunctionReturn(PETSC_SUCCESS);
243 }
244 
245 /* Generate the Hermitian on device and cache it internally */
246 static PetscErrorCode MatSeqAIJKokkosGenerateHermitian_Private(Mat A, KokkosCsrMatrix **csrmatH)
247 {
248   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
249 
250   PetscFunctionBegin;
251   PetscCall(PetscLogGpuTimeBegin());
252   PetscCheck(aijkok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
253   if (!aijkok->csrmatH.nnz() || !aijkok->hermitian_updated) { /* Generate Ah for the first time OR just update its values */
254     PetscCallCXX(aijkok->a_dual.sync_device());
255     PetscCallCXX(aijkok->csrmatH = transpose_matrix(aijkok->csrmat));
256     PetscCallCXX(sort_crs_matrix(aijkok->csrmatH));
257 #if defined(PETSC_USE_COMPLEX)
258     const auto &a = aijkok->csrmatH.values;
259     Kokkos::parallel_for(
260       a.extent(0), KOKKOS_LAMBDA(MatRowMapType i) { a(i) = PetscConj(a(i)); });
261 #endif
262     aijkok->hermitian_updated = PETSC_TRUE;
263   }
264   *csrmatH = &aijkok->csrmatH;
265   PetscCall(PetscLogGpuTimeEnd());
266   PetscFunctionReturn(PETSC_SUCCESS);
267 }
268 
269 /* y = A x */
270 static PetscErrorCode MatMult_SeqAIJKokkos(Mat A, Vec xx, Vec yy)
271 {
272   Mat_SeqAIJKokkos          *aijkok;
273   ConstPetscScalarKokkosView xv;
274   PetscScalarKokkosView      yv;
275 
276   PetscFunctionBegin;
277   PetscCall(PetscLogGpuTimeBegin());
278   PetscCall(MatSeqAIJKokkosSyncDevice(A));
279   PetscCall(VecGetKokkosView(xx, &xv));
280   PetscCall(VecGetKokkosViewWrite(yy, &yv));
281   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
282   PetscCallCXX(KokkosSparse::spmv("N", 1.0 /*alpha*/, aijkok->csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A x + beta y */
283   PetscCall(VecRestoreKokkosView(xx, &xv));
284   PetscCall(VecRestoreKokkosViewWrite(yy, &yv));
285   /* 2.0*nnz - numRows seems more accurate here but assumes there are no zero-rows. So a little sloppy here. */
286   PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz()));
287   PetscCall(PetscLogGpuTimeEnd());
288   PetscFunctionReturn(PETSC_SUCCESS);
289 }
290 
291 /* y = A^T x */
292 static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy)
293 {
294   Mat_SeqAIJKokkos          *aijkok;
295   const char                *mode;
296   ConstPetscScalarKokkosView xv;
297   PetscScalarKokkosView      yv;
298   KokkosCsrMatrix           *csrmat;
299 
300   PetscFunctionBegin;
301   PetscCall(PetscLogGpuTimeBegin());
302   PetscCall(MatSeqAIJKokkosSyncDevice(A));
303   PetscCall(VecGetKokkosView(xx, &xv));
304   PetscCall(VecGetKokkosViewWrite(yy, &yv));
305   if (A->form_explicit_transpose) {
306     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat));
307     mode = "N";
308   } else {
309     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
310     csrmat = &aijkok->csrmat;
311     mode   = "T";
312   }
313   PetscCallCXX(KokkosSparse::spmv(mode, 1.0 /*alpha*/, *csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^T x + beta y */
314   PetscCall(VecRestoreKokkosView(xx, &xv));
315   PetscCall(VecRestoreKokkosViewWrite(yy, &yv));
316   PetscCall(PetscLogGpuFlops(2.0 * csrmat->nnz()));
317   PetscCall(PetscLogGpuTimeEnd());
318   PetscFunctionReturn(PETSC_SUCCESS);
319 }
320 
321 /* y = A^H x */
322 static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy)
323 {
324   Mat_SeqAIJKokkos          *aijkok;
325   const char                *mode;
326   ConstPetscScalarKokkosView xv;
327   PetscScalarKokkosView      yv;
328   KokkosCsrMatrix           *csrmat;
329 
330   PetscFunctionBegin;
331   PetscCall(PetscLogGpuTimeBegin());
332   PetscCall(MatSeqAIJKokkosSyncDevice(A));
333   PetscCall(VecGetKokkosView(xx, &xv));
334   PetscCall(VecGetKokkosViewWrite(yy, &yv));
335   if (A->form_explicit_transpose) {
336     PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat));
337     mode = "N";
338   } else {
339     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
340     csrmat = &aijkok->csrmat;
341     mode   = "C";
342   }
343   PetscCallCXX(KokkosSparse::spmv(mode, 1.0 /*alpha*/, *csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^H x + beta y */
344   PetscCall(VecRestoreKokkosView(xx, &xv));
345   PetscCall(VecRestoreKokkosViewWrite(yy, &yv));
346   PetscCall(PetscLogGpuFlops(2.0 * csrmat->nnz()));
347   PetscCall(PetscLogGpuTimeEnd());
348   PetscFunctionReturn(PETSC_SUCCESS);
349 }
350 
351 /* z = A x + y */
352 static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz)
353 {
354   Mat_SeqAIJKokkos          *aijkok;
355   ConstPetscScalarKokkosView xv, yv;
356   PetscScalarKokkosView      zv;
357 
358   PetscFunctionBegin;
359   PetscCall(PetscLogGpuTimeBegin());
360   PetscCall(MatSeqAIJKokkosSyncDevice(A));
361   PetscCall(VecGetKokkosView(xx, &xv));
362   PetscCall(VecGetKokkosView(yy, &yv));
363   PetscCall(VecGetKokkosViewWrite(zz, &zv));
364   if (zz != yy) Kokkos::deep_copy(zv, yv);
365   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
366   PetscCallCXX(KokkosSparse::spmv("N", 1.0 /*alpha*/, aijkok->csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A x + beta z */
367   PetscCall(VecRestoreKokkosView(xx, &xv));
368   PetscCall(VecRestoreKokkosView(yy, &yv));
369   PetscCall(VecRestoreKokkosViewWrite(zz, &zv));
370   PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz()));
371   PetscCall(PetscLogGpuTimeEnd());
372   PetscFunctionReturn(PETSC_SUCCESS);
373 }
374 
375 /* z = A^T x + y */
376 static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz)
377 {
378   Mat_SeqAIJKokkos          *aijkok;
379   const char                *mode;
380   ConstPetscScalarKokkosView xv, yv;
381   PetscScalarKokkosView      zv;
382   KokkosCsrMatrix           *csrmat;
383 
384   PetscFunctionBegin;
385   PetscCall(PetscLogGpuTimeBegin());
386   PetscCall(MatSeqAIJKokkosSyncDevice(A));
387   PetscCall(VecGetKokkosView(xx, &xv));
388   PetscCall(VecGetKokkosView(yy, &yv));
389   PetscCall(VecGetKokkosViewWrite(zz, &zv));
390   if (zz != yy) Kokkos::deep_copy(zv, yv);
391   if (A->form_explicit_transpose) {
392     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat));
393     mode = "N";
394   } else {
395     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
396     csrmat = &aijkok->csrmat;
397     mode   = "T";
398   }
399   PetscCallCXX(KokkosSparse::spmv(mode, 1.0 /*alpha*/, *csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^T x + beta z */
400   PetscCall(VecRestoreKokkosView(xx, &xv));
401   PetscCall(VecRestoreKokkosView(yy, &yv));
402   PetscCall(VecRestoreKokkosViewWrite(zz, &zv));
403   PetscCall(PetscLogGpuFlops(2.0 * csrmat->nnz()));
404   PetscCall(PetscLogGpuTimeEnd());
405   PetscFunctionReturn(PETSC_SUCCESS);
406 }
407 
408 /* z = A^H x + y */
409 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz)
410 {
411   Mat_SeqAIJKokkos          *aijkok;
412   const char                *mode;
413   ConstPetscScalarKokkosView xv, yv;
414   PetscScalarKokkosView      zv;
415   KokkosCsrMatrix           *csrmat;
416 
417   PetscFunctionBegin;
418   PetscCall(PetscLogGpuTimeBegin());
419   PetscCall(MatSeqAIJKokkosSyncDevice(A));
420   PetscCall(VecGetKokkosView(xx, &xv));
421   PetscCall(VecGetKokkosView(yy, &yv));
422   PetscCall(VecGetKokkosViewWrite(zz, &zv));
423   if (zz != yy) Kokkos::deep_copy(zv, yv);
424   if (A->form_explicit_transpose) {
425     PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat));
426     mode = "N";
427   } else {
428     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
429     csrmat = &aijkok->csrmat;
430     mode   = "C";
431   }
432   PetscCallCXX(KokkosSparse::spmv(mode, 1.0 /*alpha*/, *csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^H x + beta z */
433   PetscCall(VecRestoreKokkosView(xx, &xv));
434   PetscCall(VecRestoreKokkosView(yy, &yv));
435   PetscCall(VecRestoreKokkosViewWrite(zz, &zv));
436   PetscCall(PetscLogGpuFlops(2.0 * csrmat->nnz()));
437   PetscCall(PetscLogGpuTimeEnd());
438   PetscFunctionReturn(PETSC_SUCCESS);
439 }
440 
441 PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A, MatOption op, PetscBool flg)
442 {
443   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
444 
445   PetscFunctionBegin;
446   switch (op) {
447   case MAT_FORM_EXPLICIT_TRANSPOSE:
448     /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
449     if (A->form_explicit_transpose && !flg && aijkok) PetscCall(aijkok->DestroyMatTranspose());
450     A->form_explicit_transpose = flg;
451     break;
452   default:
453     PetscCall(MatSetOption_SeqAIJ(A, op, flg));
454     break;
455   }
456   PetscFunctionReturn(PETSC_SUCCESS);
457 }
458 
459 /* Depending on reuse, either build a new mat, or use the existing mat */
460 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat *newmat)
461 {
462   Mat_SeqAIJ *aseq;
463 
464   PetscFunctionBegin;
465   PetscCall(PetscKokkosInitializeCheck());
466   if (reuse == MAT_INITIAL_MATRIX) {                      /* Build a brand new mat */
467     PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat));  /* the returned newmat is a SeqAIJKokkos */
468   } else if (reuse == MAT_REUSE_MATRIX) {                 /* Reuse the mat created before */
469     PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN)); /* newmat is already a SeqAIJKokkos */
470   } else if (reuse == MAT_INPLACE_MATRIX) {               /* newmat is A */
471     PetscCheck(A == *newmat, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "A != *newmat with MAT_INPLACE_MATRIX");
472     PetscCall(PetscFree(A->defaultvectype));
473     PetscCall(PetscStrallocpy(VECKOKKOS, &A->defaultvectype)); /* Allocate and copy the string */
474     PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATSEQAIJKOKKOS));
475     PetscCall(MatSetOps_SeqAIJKokkos(A));
476     aseq = static_cast<Mat_SeqAIJ *>(A->data);
477     if (A->assembled) { /* Copy i, j (but not values) to device for an assembled matrix if not yet */
478       PetscCheck(!A->spptr, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Expect NULL (Mat_SeqAIJKokkos*)A->spptr");
479       A->spptr = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aseq->nz, aseq->i, aseq->j, aseq->a, A->nonzerostate, PETSC_FALSE);
480     }
481   }
482   PetscFunctionReturn(PETSC_SUCCESS);
483 }
484 
485 /* MatDuplicate always creates a new matrix. MatDuplicate can be called either on an assembled matrix or
486    an unassembled matrix, even though MAT_COPY_VALUES is not allowed for unassembled matrix.
487  */
488 static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A, MatDuplicateOption dupOption, Mat *B)
489 {
490   Mat_SeqAIJ       *bseq;
491   Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr), *bkok;
492   Mat               mat;
493 
494   PetscFunctionBegin;
495   /* Do not copy values on host as A's latest values might be on device. We don't want to do sync blindly */
496   PetscCall(MatDuplicate_SeqAIJ(A, MAT_DO_NOT_COPY_VALUES, B));
497   mat = *B;
498   if (A->assembled) {
499     bseq = static_cast<Mat_SeqAIJ *>(mat->data);
500     bkok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, bseq->nz, bseq->i, bseq->j, bseq->a, mat->nonzerostate, PETSC_FALSE);
501     bkok->a_dual.clear_sync_state(); /* Clear B's sync state as it will be decided below */
502     /* Now copy values to B if needed */
503     if (dupOption == MAT_COPY_VALUES) {
504       if (akok->a_dual.need_sync_device()) {
505         Kokkos::deep_copy(bkok->a_dual.view_host(), akok->a_dual.view_host());
506         bkok->a_dual.modify_host();
507       } else { /* If device has the latest data, we only copy data on device */
508         Kokkos::deep_copy(bkok->a_dual.view_device(), akok->a_dual.view_device());
509         bkok->a_dual.modify_device();
510       }
511     } else { /* MAT_DO_NOT_COPY_VALUES or MAT_SHARE_NONZERO_PATTERN. B's values should be zeroed */
512       /* B's values on host should be already zeroed by MatDuplicate_SeqAIJ() */
513       bkok->a_dual.modify_host();
514     }
515     mat->spptr = bkok;
516   }
517 
518   PetscCall(PetscFree(mat->defaultvectype));
519   PetscCall(PetscStrallocpy(VECKOKKOS, &mat->defaultvectype)); /* Allocate and copy the string */
520   PetscCall(PetscObjectChangeTypeName((PetscObject)mat, MATSEQAIJKOKKOS));
521   PetscCall(MatSetOps_SeqAIJKokkos(mat));
522   PetscFunctionReturn(PETSC_SUCCESS);
523 }
524 
525 static PetscErrorCode MatTranspose_SeqAIJKokkos(Mat A, MatReuse reuse, Mat *B)
526 {
527   Mat               At;
528   KokkosCsrMatrix  *internT;
529   Mat_SeqAIJKokkos *atkok, *bkok;
530 
531   PetscFunctionBegin;
532   if (reuse == MAT_REUSE_MATRIX) PetscCall(MatTransposeCheckNonzeroState_Private(A, *B));
533   PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &internT)); /* Generate a transpose internally */
534   if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_INPLACE_MATRIX) {
535     /* Deep copy internT, as we want to isolate the internal transpose */
536     PetscCallCXX(atkok = new Mat_SeqAIJKokkos(KokkosCsrMatrix("csrmat", *internT)));
537     PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PetscObjectComm((PetscObject)A), atkok, &At));
538     if (reuse == MAT_INITIAL_MATRIX) *B = At;
539     else PetscCall(MatHeaderReplace(A, &At)); /* Replace A with At inplace */
540   } else {                                    /* MAT_REUSE_MATRIX, just need to copy values to B on device */
541     if ((*B)->assembled) {
542       bkok = static_cast<Mat_SeqAIJKokkos *>((*B)->spptr);
543       PetscCallCXX(Kokkos::deep_copy(bkok->a_dual.view_device(), internT->values));
544       PetscCall(MatSeqAIJKokkosModifyDevice(*B));
545     } else if ((*B)->preallocated) { /* It is ok for B to be only preallocated, as needed in MatTranspose_MPIAIJ */
546       Mat_SeqAIJ             *bseq = static_cast<Mat_SeqAIJ *>((*B)->data);
547       MatScalarKokkosViewHost a_h(bseq->a, internT->nnz()); /* bseq->nz = 0 if unassembled */
548       MatColIdxKokkosViewHost j_h(bseq->j, internT->nnz());
549       PetscCallCXX(Kokkos::deep_copy(a_h, internT->values));
550       PetscCallCXX(Kokkos::deep_copy(j_h, internT->graph.entries));
551     } else SETERRQ(PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "B must be assembled or preallocated");
552   }
553   PetscFunctionReturn(PETSC_SUCCESS);
554 }
555 
556 static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A)
557 {
558   Mat_SeqAIJKokkos *aijkok;
559 
560   PetscFunctionBegin;
561   if (A->factortype == MAT_FACTOR_NONE) {
562     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
563     delete aijkok;
564   } else {
565     delete static_cast<Mat_SeqAIJKokkosTriFactors *>(A->spptr);
566   }
567   A->spptr = NULL;
568   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatFactorGetSolverType_C", NULL));
569   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
570   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
571   PetscCall(MatDestroy_SeqAIJ(A));
572   PetscFunctionReturn(PETSC_SUCCESS);
573 }
574 
575 /*MC
576    MATSEQAIJKOKKOS - MATAIJKOKKOS = "(seq)aijkokkos" - A matrix type to be used for sparse matrices with Kokkos
577 
578    A matrix type type using Kokkos-Kernels CrsMatrix type for portability across different device types
579 
580    Options Database Key:
581 .  -mat_type aijkokkos - sets the matrix type to `MATSEQAIJKOKKOS` during a call to `MatSetFromOptions()`
582 
583   Level: beginner
584 
585 .seealso: [](chapter_matrices), `Mat`, `MatCreateSeqAIJKokkos()`, `MATMPIAIJKOKKOS`
586 M*/
587 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A)
588 {
589   PetscFunctionBegin;
590   PetscCall(PetscKokkosInitializeCheck());
591   PetscCall(MatCreate_SeqAIJ(A));
592   PetscCall(MatConvert_SeqAIJ_SeqAIJKokkos(A, MATSEQAIJKOKKOS, MAT_INPLACE_MATRIX, &A));
593   PetscFunctionReturn(PETSC_SUCCESS);
594 }
595 
596 /* Merge A, B into a matrix C. A is put before B. C's size would be A->rmap->n by (A->cmap->n + B->cmap->n) */
597 PetscErrorCode MatSeqAIJKokkosMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C)
598 {
599   Mat_SeqAIJ         *a, *b;
600   Mat_SeqAIJKokkos   *akok, *bkok, *ckok;
601   MatScalarKokkosView aa, ba, ca;
602   MatRowMapKokkosView ai, bi, ci;
603   MatColIdxKokkosView aj, bj, cj;
604   PetscInt            m, n, nnz, aN;
605 
606   PetscFunctionBegin;
607   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
608   PetscValidHeaderSpecific(B, MAT_CLASSID, 2);
609   PetscValidPointer(C, 4);
610   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
611   PetscCheckTypeName(B, MATSEQAIJKOKKOS);
612   PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n);
613   PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported");
614 
615   PetscCall(MatSeqAIJKokkosSyncDevice(A));
616   PetscCall(MatSeqAIJKokkosSyncDevice(B));
617   a    = static_cast<Mat_SeqAIJ *>(A->data);
618   b    = static_cast<Mat_SeqAIJ *>(B->data);
619   akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
620   bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
621   aa   = akok->a_dual.view_device();
622   ai   = akok->i_dual.view_device();
623   ba   = bkok->a_dual.view_device();
624   bi   = bkok->i_dual.view_device();
625   m    = A->rmap->n; /* M, N and nnz of C */
626   n    = A->cmap->n + B->cmap->n;
627   nnz  = a->nz + b->nz;
628   aN   = A->cmap->n; /* N of A */
629   if (reuse == MAT_INITIAL_MATRIX) {
630     aj           = akok->j_dual.view_device();
631     bj           = bkok->j_dual.view_device();
632     auto ca_dual = MatScalarKokkosDualView("a", aa.extent(0) + ba.extent(0));
633     auto ci_dual = MatRowMapKokkosDualView("i", ai.extent(0));
634     auto cj_dual = MatColIdxKokkosDualView("j", aj.extent(0) + bj.extent(0));
635     ca           = ca_dual.view_device();
636     ci           = ci_dual.view_device();
637     cj           = cj_dual.view_device();
638 
639     /* Concatenate A and B in parallel using Kokkos hierarchical parallelism */
640     Kokkos::parallel_for(
641       Kokkos::TeamPolicy<>(m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
642         PetscInt i       = t.league_rank(); /* row i */
643         PetscInt coffset = ai(i) + bi(i), alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i);
644 
645         Kokkos::single(Kokkos::PerTeam(t), [=]() { /* this side effect only happens once per whole team */
646                                                    ci(i) = coffset;
647                                                    if (i == m - 1) ci(m) = ai(m) + bi(m);
648         });
649 
650         Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) {
651           if (k < alen) {
652             ca(coffset + k) = aa(ai(i) + k);
653             cj(coffset + k) = aj(ai(i) + k);
654           } else {
655             ca(coffset + k) = ba(bi(i) + k - alen);
656             cj(coffset + k) = bj(bi(i) + k - alen) + aN; /* Entries in B get new column indices in C */
657           }
658         });
659       });
660     ca_dual.modify_device();
661     ci_dual.modify_device();
662     cj_dual.modify_device();
663     PetscCallCXX(ckok = new Mat_SeqAIJKokkos(m, n, nnz, ci_dual, cj_dual, ca_dual));
664     PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, ckok, C));
665   } else if (reuse == MAT_REUSE_MATRIX) {
666     PetscValidHeaderSpecific(*C, MAT_CLASSID, 4);
667     PetscCheckTypeName(*C, MATSEQAIJKOKKOS);
668     ckok = static_cast<Mat_SeqAIJKokkos *>((*C)->spptr);
669     ca   = ckok->a_dual.view_device();
670     ci   = ckok->i_dual.view_device();
671 
672     Kokkos::parallel_for(
673       Kokkos::TeamPolicy<>(m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
674         PetscInt i    = t.league_rank(); /* row i */
675         PetscInt alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i);
676         Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) {
677           if (k < alen) ca(ci(i) + k) = aa(ai(i) + k);
678           else ca(ci(i) + k) = ba(bi(i) + k - alen);
679         });
680       });
681     PetscCall(MatSeqAIJKokkosModifyDevice(*C));
682   }
683   PetscFunctionReturn(PETSC_SUCCESS);
684 }
685 
686 static PetscErrorCode MatProductDataDestroy_SeqAIJKokkos(void *pdata)
687 {
688   PetscFunctionBegin;
689   delete static_cast<MatProductData_SeqAIJKokkos *>(pdata);
690   PetscFunctionReturn(PETSC_SUCCESS);
691 }
692 
693 static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C)
694 {
695   Mat_Product                 *product = C->product;
696   Mat                          A, B;
697   bool                         transA, transB; /* use bool, since KK needs this type */
698   Mat_SeqAIJKokkos            *akok, *bkok, *ckok;
699   Mat_SeqAIJ                  *c;
700   MatProductData_SeqAIJKokkos *pdata;
701   KokkosCsrMatrix             *csrmatA, *csrmatB;
702 
703   PetscFunctionBegin;
704   MatCheckProduct(C, 1);
705   PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Product data empty");
706   pdata = static_cast<MatProductData_SeqAIJKokkos *>(C->product->data);
707 
708   if (pdata->reusesym) {           /* We reached here through e.g., MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C), where symbolic/numeric are combined */
709     pdata->reusesym = PETSC_FALSE; /* So that next time when user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C), we still do numeric  */
710     PetscFunctionReturn(PETSC_SUCCESS);
711   }
712 
713   switch (product->type) {
714   case MATPRODUCT_AB:
715     transA = false;
716     transB = false;
717     break;
718   case MATPRODUCT_AtB:
719     transA = true;
720     transB = false;
721     break;
722   case MATPRODUCT_ABt:
723     transA = false;
724     transB = true;
725     break;
726   default:
727     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]);
728   }
729 
730   A = product->A;
731   B = product->B;
732   PetscCall(MatSeqAIJKokkosSyncDevice(A));
733   PetscCall(MatSeqAIJKokkosSyncDevice(B));
734   akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
735   bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
736   ckok = static_cast<Mat_SeqAIJKokkos *>(C->spptr);
737 
738   PetscCheck(ckok, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Device data structure spptr is empty");
739 
740   csrmatA = &akok->csrmat;
741   csrmatB = &bkok->csrmat;
742 
743   /* TODO: Once KK spgemm implements transpose, we can get rid of the explicit transpose here */
744   if (transA) {
745     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA));
746     transA = false;
747   }
748 
749   if (transB) {
750     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB));
751     transB = false;
752   }
753   PetscCall(PetscLogGpuTimeBegin());
754   PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, *csrmatA, transA, *csrmatB, transB, ckok->csrmat));
755 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(3, 7, 99)
756   auto spgemmHandle = pdata->kh.get_spgemm_handle();
757   if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(ckok->csrmat)); /* without sort, mat_tests-ex62_14_seqaijkokkos fails */
758 #endif
759 
760   PetscCall(PetscLogGpuTimeEnd());
761   PetscCall(MatSeqAIJKokkosModifyDevice(C));
762   /* shorter version of MatAssemblyEnd_SeqAIJ */
763   c = (Mat_SeqAIJ *)C->data;
764   PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz));
765   PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n"));
766   PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax));
767   c->reallocs         = 0;
768   C->info.mallocs     = 0;
769   C->info.nz_unneeded = 0;
770   C->assembled = C->was_assembled = PETSC_TRUE;
771   C->num_ass++;
772   PetscFunctionReturn(PETSC_SUCCESS);
773 }
774 
775 static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C)
776 {
777   Mat_Product                 *product = C->product;
778   MatProductType               ptype;
779   Mat                          A, B;
780   bool                         transA, transB;
781   Mat_SeqAIJKokkos            *akok, *bkok, *ckok;
782   MatProductData_SeqAIJKokkos *pdata;
783   MPI_Comm                     comm;
784   KokkosCsrMatrix             *csrmatA, *csrmatB, csrmatC;
785 
786   PetscFunctionBegin;
787   MatCheckProduct(C, 1);
788   PetscCall(PetscObjectGetComm((PetscObject)C, &comm));
789   PetscCheck(!product->data, comm, PETSC_ERR_PLIB, "Product data not empty");
790   A = product->A;
791   B = product->B;
792   PetscCall(MatSeqAIJKokkosSyncDevice(A));
793   PetscCall(MatSeqAIJKokkosSyncDevice(B));
794   akok    = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
795   bkok    = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
796   csrmatA = &akok->csrmat;
797   csrmatB = &bkok->csrmat;
798 
799   ptype = product->type;
800   switch (ptype) {
801   case MATPRODUCT_AB:
802     transA = false;
803     transB = false;
804     break;
805   case MATPRODUCT_AtB:
806     transA = true;
807     transB = false;
808     break;
809   case MATPRODUCT_ABt:
810     transA = false;
811     transB = true;
812     break;
813   default:
814     SETERRQ(comm, PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]);
815   }
816 
817   product->data = pdata = new MatProductData_SeqAIJKokkos();
818   pdata->kh.set_team_work_size(16);
819   pdata->kh.set_dynamic_scheduling(true);
820   pdata->reusesym = product->api_user;
821 
822   /* TODO: add command line options to select spgemm algorithms */
823   auto spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_DEFAULT; /* default alg is TPL if enabled, otherwise KK */
824 
825   /* CUDA-10.2's spgemm has bugs. We prefer the SpGEMMreuse APIs introduced in cuda-11.4 */
826 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
827   #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
828   spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_KK;
829   #endif
830 #endif
831 
832   pdata->kh.create_spgemm_handle(spgemm_alg);
833 
834   PetscCall(PetscLogGpuTimeBegin());
835   /* TODO: Get rid of the explicit transpose once KK-spgemm implements the transpose option */
836   if (transA) {
837     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA));
838     transA = false;
839   }
840 
841   if (transB) {
842     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB));
843     transB = false;
844   }
845 
846   PetscCallCXX(KokkosSparse::spgemm_symbolic(pdata->kh, *csrmatA, transA, *csrmatB, transB, csrmatC));
847 
848   /* spgemm_symbolic() only populates C's rowmap, but not C's column indices.
849     So we have to do a fake spgemm_numeric() here to get csrmatC.j_d setup, before
850     calling new Mat_SeqAIJKokkos().
851     TODO: Remove the fake spgemm_numeric() after KK fixed this problem.
852   */
853   PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, *csrmatA, transA, *csrmatB, transB, csrmatC));
854 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(3, 7, 99)
855   /* Query if KK outputs a sorted matrix. If not, we need to sort it */
856   auto spgemmHandle = pdata->kh.get_spgemm_handle();
857   if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(csrmatC)); /* sort_option defaults to -1 in KK!*/
858 #endif
859   PetscCall(PetscLogGpuTimeEnd());
860 
861   PetscCallCXX(ckok = new Mat_SeqAIJKokkos(csrmatC));
862   PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(C, ckok));
863   C->product->destroy = MatProductDataDestroy_SeqAIJKokkos;
864   PetscFunctionReturn(PETSC_SUCCESS);
865 }
866 
867 /* handles sparse matrix matrix ops */
868 static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat)
869 {
870   Mat_Product *product = mat->product;
871   PetscBool    Biskok = PETSC_FALSE, Ciskok = PETSC_TRUE;
872 
873   PetscFunctionBegin;
874   MatCheckProduct(mat, 1);
875   PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJKOKKOS, &Biskok));
876   if (product->type == MATPRODUCT_ABC) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJKOKKOS, &Ciskok));
877   if (Biskok && Ciskok) {
878     switch (product->type) {
879     case MATPRODUCT_AB:
880     case MATPRODUCT_AtB:
881     case MATPRODUCT_ABt:
882       mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos;
883       break;
884     case MATPRODUCT_PtAP:
885     case MATPRODUCT_RARt:
886     case MATPRODUCT_ABC:
887       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
888       break;
889     default:
890       break;
891     }
892   } else { /* fallback for AIJ */
893     PetscCall(MatProductSetFromOptions_SeqAIJ(mat));
894   }
895   PetscFunctionReturn(PETSC_SUCCESS);
896 }
897 
898 static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a)
899 {
900   Mat_SeqAIJKokkos *aijkok;
901 
902   PetscFunctionBegin;
903   PetscCall(PetscLogGpuTimeBegin());
904   PetscCall(MatSeqAIJKokkosSyncDevice(A));
905   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
906   KokkosBlas::scal(aijkok->a_dual.view_device(), a, aijkok->a_dual.view_device());
907   PetscCall(MatSeqAIJKokkosModifyDevice(A));
908   PetscCall(PetscLogGpuFlops(aijkok->a_dual.extent(0)));
909   PetscCall(PetscLogGpuTimeEnd());
910   PetscFunctionReturn(PETSC_SUCCESS);
911 }
912 
913 static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A)
914 {
915   Mat_SeqAIJKokkos *aijkok;
916 
917   PetscFunctionBegin;
918   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
919   if (aijkok) { /* Only zero the device if data is already there */
920     KokkosBlas::fill(aijkok->a_dual.view_device(), 0.0);
921     PetscCall(MatSeqAIJKokkosModifyDevice(A));
922   } else { /* Might be preallocated but not assembled */
923     PetscCall(MatZeroEntries_SeqAIJ(A));
924   }
925   PetscFunctionReturn(PETSC_SUCCESS);
926 }
927 
928 static PetscErrorCode MatGetDiagonal_SeqAIJKokkos(Mat A, Vec x)
929 {
930   Mat_SeqAIJ           *aijseq;
931   Mat_SeqAIJKokkos     *aijkok;
932   PetscInt              n;
933   PetscScalarKokkosView xv;
934 
935   PetscFunctionBegin;
936   PetscCall(VecGetLocalSize(x, &n));
937   PetscCheck(n == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector");
938   PetscCheck(A->factortype == MAT_FACTOR_NONE, PETSC_COMM_SELF, PETSC_ERR_SUP, "MatGetDiagonal_SeqAIJKokkos not supported on factored matrices");
939 
940   PetscCall(MatSeqAIJKokkosSyncDevice(A));
941   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
942 
943   if (A->rmap->n && aijkok->diag_dual.extent(0) == 0) { /* Set the diagonal pointer if not already */
944     PetscCall(MatMarkDiagonal_SeqAIJ(A));
945     aijseq = static_cast<Mat_SeqAIJ *>(A->data);
946     aijkok->SetDiagonal(aijseq->diag);
947   }
948 
949   const auto &Aa    = aijkok->a_dual.view_device();
950   const auto &Ai    = aijkok->i_dual.view_device();
951   const auto &Adiag = aijkok->diag_dual.view_device();
952 
953   PetscCall(VecGetKokkosViewWrite(x, &xv));
954   Kokkos::parallel_for(
955     n, KOKKOS_LAMBDA(const PetscInt i) {
956       if (Adiag(i) < Ai(i + 1)) xv(i) = Aa(Adiag(i));
957       else xv(i) = 0;
958     });
959   PetscCall(VecRestoreKokkosViewWrite(x, &xv));
960   PetscFunctionReturn(PETSC_SUCCESS);
961 }
962 
963 /* Get a Kokkos View from a mat of type MatSeqAIJKokkos */
964 PetscErrorCode MatSeqAIJGetKokkosView(Mat A, ConstMatScalarKokkosView *kv)
965 {
966   Mat_SeqAIJKokkos *aijkok;
967 
968   PetscFunctionBegin;
969   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
970   PetscValidPointer(kv, 2);
971   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
972   PetscCall(MatSeqAIJKokkosSyncDevice(A));
973   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
974   *kv    = aijkok->a_dual.view_device();
975   PetscFunctionReturn(PETSC_SUCCESS);
976 }
977 
978 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, ConstMatScalarKokkosView *kv)
979 {
980   PetscFunctionBegin;
981   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
982   PetscValidPointer(kv, 2);
983   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
984   PetscFunctionReturn(PETSC_SUCCESS);
985 }
986 
987 PetscErrorCode MatSeqAIJGetKokkosView(Mat A, MatScalarKokkosView *kv)
988 {
989   Mat_SeqAIJKokkos *aijkok;
990 
991   PetscFunctionBegin;
992   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
993   PetscValidPointer(kv, 2);
994   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
995   PetscCall(MatSeqAIJKokkosSyncDevice(A));
996   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
997   *kv    = aijkok->a_dual.view_device();
998   PetscFunctionReturn(PETSC_SUCCESS);
999 }
1000 
1001 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, MatScalarKokkosView *kv)
1002 {
1003   PetscFunctionBegin;
1004   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1005   PetscValidPointer(kv, 2);
1006   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1007   PetscCall(MatSeqAIJKokkosModifyDevice(A));
1008   PetscFunctionReturn(PETSC_SUCCESS);
1009 }
1010 
1011 PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A, MatScalarKokkosView *kv)
1012 {
1013   Mat_SeqAIJKokkos *aijkok;
1014 
1015   PetscFunctionBegin;
1016   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1017   PetscValidPointer(kv, 2);
1018   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1019   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1020   *kv    = aijkok->a_dual.view_device();
1021   PetscFunctionReturn(PETSC_SUCCESS);
1022 }
1023 
1024 PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A, MatScalarKokkosView *kv)
1025 {
1026   PetscFunctionBegin;
1027   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1028   PetscValidPointer(kv, 2);
1029   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1030   PetscCall(MatSeqAIJKokkosModifyDevice(A));
1031   PetscFunctionReturn(PETSC_SUCCESS);
1032 }
1033 
1034 /* Computes Y += alpha X */
1035 static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y, PetscScalar alpha, Mat X, MatStructure pattern)
1036 {
1037   Mat_SeqAIJ              *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data;
1038   Mat_SeqAIJKokkos        *xkok, *ykok, *zkok;
1039   ConstMatScalarKokkosView Xa;
1040   MatScalarKokkosView      Ya;
1041 
1042   PetscFunctionBegin;
1043   PetscCheckTypeName(Y, MATSEQAIJKOKKOS);
1044   PetscCheckTypeName(X, MATSEQAIJKOKKOS);
1045   PetscCall(MatSeqAIJKokkosSyncDevice(Y));
1046   PetscCall(MatSeqAIJKokkosSyncDevice(X));
1047   PetscCall(PetscLogGpuTimeBegin());
1048 
1049   if (pattern != SAME_NONZERO_PATTERN && x->nz == y->nz) {
1050     /* We could compare on device, but have to get the comparison result on host. So compare on host instead. */
1051     PetscBool e;
1052     PetscCall(PetscArraycmp(x->i, y->i, Y->rmap->n + 1, &e));
1053     if (e) {
1054       PetscCall(PetscArraycmp(x->j, y->j, y->nz, &e));
1055       if (e) pattern = SAME_NONZERO_PATTERN;
1056     }
1057   }
1058 
1059   /* cusparseDcsrgeam2() computes C = alpha A + beta B. If one knew sparsity pattern of C, one can skip
1060     cusparseScsrgeam2_bufferSizeExt() / cusparseXcsrgeam2Nnz(), and directly call cusparseScsrgeam2().
1061     If X is SUBSET_NONZERO_PATTERN of Y, we could take advantage of this cusparse feature. However,
1062     KokkosSparse::spadd(alpha,A,beta,B,C) has symbolic and numeric phases, MatAXPY does not.
1063   */
1064   ykok = static_cast<Mat_SeqAIJKokkos *>(Y->spptr);
1065   xkok = static_cast<Mat_SeqAIJKokkos *>(X->spptr);
1066   Xa   = xkok->a_dual.view_device();
1067   Ya   = ykok->a_dual.view_device();
1068 
1069   if (pattern == SAME_NONZERO_PATTERN) {
1070     KokkosBlas::axpy(alpha, Xa, Ya);
1071     PetscCall(MatSeqAIJKokkosModifyDevice(Y));
1072   } else if (pattern == SUBSET_NONZERO_PATTERN) {
1073     MatRowMapKokkosView Xi = xkok->i_dual.view_device(), Yi = ykok->i_dual.view_device();
1074     MatColIdxKokkosView Xj = xkok->j_dual.view_device(), Yj = ykok->j_dual.view_device();
1075 
1076     Kokkos::parallel_for(
1077       Kokkos::TeamPolicy<>(Y->rmap->n, 1), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
1078         PetscInt i = t.league_rank();              /* row i */
1079         Kokkos::single(Kokkos::PerTeam(t), [=]() { /* Only one thread works in a team */
1080                                                    PetscInt p, q = Yi(i);
1081                                                    for (p = Xi(i); p < Xi(i + 1); p++) {          /* For each nonzero on row i of X */
1082                                                      while (Xj(p) != Yj(q) && q < Yi(i + 1)) q++; /* find the matching nonzero on row i of Y */
1083                                                      if (Xj(p) == Yj(q)) {                        /* Found it */
1084                                                        Ya(q) += alpha * Xa(p);
1085                                                        q++;
1086                                                      } else {
1087                                                        /* If not found, it indicates the input is wrong (X is not a SUBSET_NONZERO_PATTERN of Y).
1088                Just insert a NaN at the beginning of row i if it is not empty, to make the result wrong.
1089             */
1090                                                        if (Yi(i) != Yi(i + 1))
1091                                                          Ya(Yi(i)) =
1092 #if PETSC_PKG_KOKKOS_VERSION_GE(3, 6, 99)
1093                                                            Kokkos::ArithTraits<PetscScalar>::nan();
1094 #else
1095               Kokkos::Experimental::nan("1");
1096 #endif
1097                                                      }
1098                                                    }
1099         });
1100       });
1101     PetscCall(MatSeqAIJKokkosModifyDevice(Y));
1102   } else { /* different nonzero patterns */
1103     Mat             Z;
1104     KokkosCsrMatrix zcsr;
1105     KernelHandle    kh;
1106     kh.create_spadd_handle(false);
1107     KokkosSparse::spadd_symbolic(&kh, xkok->csrmat, ykok->csrmat, zcsr);
1108     KokkosSparse::spadd_numeric(&kh, alpha, xkok->csrmat, (PetscScalar)1.0, ykok->csrmat, zcsr);
1109     zkok = new Mat_SeqAIJKokkos(zcsr);
1110     PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, zkok, &Z));
1111     PetscCall(MatHeaderReplace(Y, &Z));
1112     kh.destroy_spadd_handle();
1113   }
1114   PetscCall(PetscLogGpuTimeEnd());
1115   PetscCall(PetscLogGpuFlops(xkok->a_dual.extent(0) * 2)); /* Because we scaled X and then added it to Y */
1116   PetscFunctionReturn(PETSC_SUCCESS);
1117 }
1118 
1119 static PetscErrorCode MatSetPreallocationCOO_SeqAIJKokkos(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
1120 {
1121   Mat_SeqAIJKokkos *akok;
1122   Mat_SeqAIJ       *aseq;
1123 
1124   PetscFunctionBegin;
1125   PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, coo_i, coo_j));
1126   aseq = static_cast<Mat_SeqAIJ *>(mat->data);
1127   akok = static_cast<Mat_SeqAIJKokkos *>(mat->spptr);
1128   delete akok;
1129   mat->spptr = akok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, aseq->nz, aseq->i, aseq->j, aseq->a, mat->nonzerostate + 1, PETSC_FALSE);
1130   PetscCall(MatZeroEntries_SeqAIJKokkos(mat));
1131   akok->SetUpCOO(aseq);
1132   PetscFunctionReturn(PETSC_SUCCESS);
1133 }
1134 
1135 static PetscErrorCode MatSetValuesCOO_SeqAIJKokkos(Mat A, const PetscScalar v[], InsertMode imode)
1136 {
1137   Mat_SeqAIJ                 *aseq = static_cast<Mat_SeqAIJ *>(A->data);
1138   Mat_SeqAIJKokkos           *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1139   PetscCount                  Annz = aseq->nz;
1140   const PetscCountKokkosView &jmap = akok->jmap_d;
1141   const PetscCountKokkosView &perm = akok->perm_d;
1142   MatScalarKokkosView         Aa;
1143   ConstMatScalarKokkosView    kv;
1144   PetscMemType                memtype;
1145 
1146   PetscFunctionBegin;
1147   PetscCall(PetscGetMemType(v, &memtype));
1148   if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */
1149     kv = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), ConstMatScalarKokkosViewHost(v, aseq->coo_n));
1150   } else {
1151     kv = ConstMatScalarKokkosView(v, aseq->coo_n); /* Directly use v[]'s memory */
1152   }
1153 
1154   if (imode == INSERT_VALUES) PetscCall(MatSeqAIJGetKokkosViewWrite(A, &Aa)); /* write matrix values */
1155   else PetscCall(MatSeqAIJGetKokkosView(A, &Aa));                             /* read & write matrix values */
1156 
1157   Kokkos::parallel_for(
1158     Annz, KOKKOS_LAMBDA(const PetscCount i) {
1159       PetscScalar sum = 0.0;
1160       for (PetscCount k = jmap(i); k < jmap(i + 1); k++) sum += kv(perm(k));
1161       Aa(i) = (imode == INSERT_VALUES ? 0.0 : Aa(i)) + sum;
1162     });
1163 
1164   if (imode == INSERT_VALUES) PetscCall(MatSeqAIJRestoreKokkosViewWrite(A, &Aa));
1165   else PetscCall(MatSeqAIJRestoreKokkosView(A, &Aa));
1166   PetscFunctionReturn(PETSC_SUCCESS);
1167 }
1168 
1169 PETSC_INTERN PetscErrorCode MatSeqAIJMoveDiagonalValuesFront_SeqAIJKokkos(Mat A, const PetscInt *diag)
1170 {
1171   Mat_SeqAIJKokkos          *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1172   MatScalarKokkosView        Aa;
1173   const MatRowMapKokkosView &Ai = akok->i_dual.view_device();
1174   PetscInt                   m  = A->rmap->n;
1175   ConstMatRowMapKokkosView   Adiag(diag, m); /* diag is a device pointer */
1176 
1177   PetscFunctionBegin;
1178   PetscCall(MatSeqAIJGetKokkosViewWrite(A, &Aa));
1179   Kokkos::parallel_for(
1180     m, KOKKOS_LAMBDA(const PetscInt i) {
1181       PetscScalar tmp;
1182       if (Adiag(i) >= Ai(i) && Adiag(i) < Ai(i + 1)) { /* The diagonal element exists */
1183         tmp          = Aa(Ai(i));
1184         Aa(Ai(i))    = Aa(Adiag(i));
1185         Aa(Adiag(i)) = tmp;
1186       }
1187     });
1188   PetscCall(MatSeqAIJRestoreKokkosViewWrite(A, &Aa));
1189   PetscFunctionReturn(PETSC_SUCCESS);
1190 }
1191 
1192 static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info)
1193 {
1194   PetscFunctionBegin;
1195   PetscCall(MatSeqAIJKokkosSyncHost(A));
1196   PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info));
1197   B->offloadmask = PETSC_OFFLOAD_CPU;
1198   PetscFunctionReturn(PETSC_SUCCESS);
1199 }
1200 
1201 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A)
1202 {
1203   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
1204 
1205   PetscFunctionBegin;
1206   A->offloadmask = PETSC_OFFLOAD_KOKKOS; /* We do not really use this flag */
1207   A->boundtocpu  = PETSC_FALSE;
1208 
1209   A->ops->assemblyend               = MatAssemblyEnd_SeqAIJKokkos;
1210   A->ops->destroy                   = MatDestroy_SeqAIJKokkos;
1211   A->ops->duplicate                 = MatDuplicate_SeqAIJKokkos;
1212   A->ops->axpy                      = MatAXPY_SeqAIJKokkos;
1213   A->ops->scale                     = MatScale_SeqAIJKokkos;
1214   A->ops->zeroentries               = MatZeroEntries_SeqAIJKokkos;
1215   A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJKokkos;
1216   A->ops->mult                      = MatMult_SeqAIJKokkos;
1217   A->ops->multadd                   = MatMultAdd_SeqAIJKokkos;
1218   A->ops->multtranspose             = MatMultTranspose_SeqAIJKokkos;
1219   A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJKokkos;
1220   A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJKokkos;
1221   A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos;
1222   A->ops->productnumeric            = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos;
1223   A->ops->transpose                 = MatTranspose_SeqAIJKokkos;
1224   A->ops->setoption                 = MatSetOption_SeqAIJKokkos;
1225   A->ops->getdiagonal               = MatGetDiagonal_SeqAIJKokkos;
1226   a->ops->getarray                  = MatSeqAIJGetArray_SeqAIJKokkos;
1227   a->ops->restorearray              = MatSeqAIJRestoreArray_SeqAIJKokkos;
1228   a->ops->getarrayread              = MatSeqAIJGetArrayRead_SeqAIJKokkos;
1229   a->ops->restorearrayread          = MatSeqAIJRestoreArrayRead_SeqAIJKokkos;
1230   a->ops->getarraywrite             = MatSeqAIJGetArrayWrite_SeqAIJKokkos;
1231   a->ops->restorearraywrite         = MatSeqAIJRestoreArrayWrite_SeqAIJKokkos;
1232   a->ops->getcsrandmemtype          = MatSeqAIJGetCSRAndMemType_SeqAIJKokkos;
1233 
1234   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJKokkos));
1235   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJKokkos));
1236   PetscFunctionReturn(PETSC_SUCCESS);
1237 }
1238 
1239 PETSC_INTERN PetscErrorCode MatSetSeqAIJKokkosWithCSRMatrix(Mat A, Mat_SeqAIJKokkos *akok)
1240 {
1241   Mat_SeqAIJ *aseq;
1242   PetscInt    i, m, n;
1243 
1244   PetscFunctionBegin;
1245   PetscCheck(!A->spptr, PETSC_COMM_SELF, PETSC_ERR_PLIB, "A->spptr is supposed to be empty");
1246 
1247   m = akok->nrows();
1248   n = akok->ncols();
1249   PetscCall(MatSetSizes(A, m, n, m, n));
1250   PetscCall(MatSetType(A, MATSEQAIJKOKKOS));
1251 
1252   /* Set up data structures of A as a MATSEQAIJ */
1253   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(A, MAT_SKIP_ALLOCATION, NULL));
1254   aseq = (Mat_SeqAIJ *)(A)->data;
1255 
1256   akok->i_dual.sync_host(); /* We always need sync'ed i, j on host */
1257   akok->j_dual.sync_host();
1258 
1259   aseq->i            = akok->i_host_data();
1260   aseq->j            = akok->j_host_data();
1261   aseq->a            = akok->a_host_data();
1262   aseq->nonew        = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/
1263   aseq->singlemalloc = PETSC_FALSE;
1264   aseq->free_a       = PETSC_FALSE;
1265   aseq->free_ij      = PETSC_FALSE;
1266   aseq->nz           = akok->nnz();
1267   aseq->maxnz        = aseq->nz;
1268 
1269   PetscCall(PetscMalloc1(m, &aseq->imax));
1270   PetscCall(PetscMalloc1(m, &aseq->ilen));
1271   for (i = 0; i < m; i++) aseq->ilen[i] = aseq->imax[i] = aseq->i[i + 1] - aseq->i[i];
1272 
1273   /* It is critical to set the nonzerostate, as we use it to check if sparsity pattern (hence data) has changed on host in MatAssemblyEnd */
1274   akok->nonzerostate = A->nonzerostate;
1275   A->spptr           = akok; /* Set A->spptr before MatAssembly so that A->spptr won't be allocated again there */
1276   PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY));
1277   PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY));
1278   PetscFunctionReturn(PETSC_SUCCESS);
1279 }
1280 
1281 /* Crete a SEQAIJKOKKOS matrix with a Mat_SeqAIJKokkos data structure
1282 
1283    Note we have names like MatSeqAIJSetPreallocationCSR, so I use capitalized CSR
1284  */
1285 PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithCSRMatrix(MPI_Comm comm, Mat_SeqAIJKokkos *akok, Mat *A)
1286 {
1287   PetscFunctionBegin;
1288   PetscCall(MatCreate(comm, A));
1289   PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok));
1290   PetscFunctionReturn(PETSC_SUCCESS);
1291 }
1292 
1293 /*@C
1294    MatCreateSeqAIJKokkos - Creates a sparse matrix in `MATSEQAIJKOKKOS` (compressed row) format
1295    (the default parallel PETSc format). This matrix will ultimately be handled by
1296    Kokkos for calculations. For good matrix
1297    assembly performance the user should preallocate the matrix storage by setting
1298    the parameter nz (or the array nnz).  By setting these parameters accurately,
1299    performance during matrix assembly can be increased by more than a factor of 50.
1300 
1301    Collective
1302 
1303    Input Parameters:
1304 +  comm - MPI communicator, set to `PETSC_COMM_SELF`
1305 .  m - number of rows
1306 .  n - number of columns
1307 .  nz - number of nonzeros per row (same for all rows)
1308 -  nnz - array containing the number of nonzeros in the various rows
1309          (possibly different for each row) or `NULL`
1310 
1311    Output Parameter:
1312 .  A - the matrix
1313 
1314    Level: intermediate
1315 
1316    Notes:
1317    It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
1318    MatXXXXSetPreallocation() paradgm instead of this routine directly.
1319    [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]
1320 
1321    If `nnz` is given then `nz` is ignored
1322 
1323    The AIJ format, also called
1324    compressed row storage, is fully compatible with standard Fortran
1325    storage.  That is, the stored row and column indices can begin at
1326    either one (as in Fortran) or zero.  See the users' manual for details.
1327 
1328    Specify the preallocated storage with either `nz` or `nnz` (not both).
1329    Set `nz` = `PETSC_DEFAULT` and `nnz` = `NULL` for PETSc to control dynamic memory
1330    allocation.
1331 
1332    By default, this format uses inodes (identical nodes) when possible, to
1333    improve numerical efficiency of matrix-vector products and solves. We
1334    search for consecutive rows with the same nonzero structure, thereby
1335    reusing matrix information to achieve increased efficiency.
1336 
1337 .seealso: [](chapter_matrices), `Mat`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MatCreateAIJ()`
1338 @*/
1339 PetscErrorCode MatCreateSeqAIJKokkos(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A)
1340 {
1341   PetscFunctionBegin;
1342   PetscCall(PetscKokkosInitializeCheck());
1343   PetscCall(MatCreate(comm, A));
1344   PetscCall(MatSetSizes(*A, m, n, m, n));
1345   PetscCall(MatSetType(*A, MATSEQAIJKOKKOS));
1346   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz));
1347   PetscFunctionReturn(PETSC_SUCCESS);
1348 }
1349 
1350 typedef Kokkos::TeamPolicy<>::member_type team_member;
1351 //
1352 // This factorization exploits block diagonal matrices with "Nf" (not used).
1353 // Use -pc_factor_mat_ordering_type rcm to order decouple blocks of size N/Nf for this optimization
1354 //
1355 static PetscErrorCode MatLUFactorNumeric_SeqAIJKOKKOSDEVICE(Mat B, Mat A, const MatFactorInfo *info)
1356 {
1357   Mat_SeqAIJ       *b      = (Mat_SeqAIJ *)B->data;
1358   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr), *baijkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
1359   IS                isrow = b->row, isicol = b->icol;
1360   const PetscInt   *r_h, *ic_h;
1361   const PetscInt n = A->rmap->n, *ai_d = aijkok->i_dual.view_device().data(), *aj_d = aijkok->j_dual.view_device().data(), *bi_d = baijkok->i_dual.view_device().data(), *bj_d = baijkok->j_dual.view_device().data(), *bdiag_d = baijkok->diag_d.data();
1362   const PetscScalar *aa_d = aijkok->a_dual.view_device().data();
1363   PetscScalar       *ba_d = baijkok->a_dual.view_device().data();
1364   PetscBool          row_identity, col_identity;
1365   PetscInt           nc, Nf = 1, nVec = 32; // should be a parameter, Nf is batch size - not used
1366 
1367   PetscFunctionBegin;
1368   PetscCheck(A->rmap->n == n, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "square matrices only supported %" PetscInt_FMT " %" PetscInt_FMT, A->rmap->n, n);
1369   PetscCall(MatIsStructurallySymmetric(A, &row_identity));
1370   PetscCheck(row_identity, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "structurally symmetric matrices only supported");
1371   PetscCall(ISGetIndices(isrow, &r_h));
1372   PetscCall(ISGetIndices(isicol, &ic_h));
1373   PetscCall(ISGetSize(isicol, &nc));
1374   PetscCall(PetscLogGpuTimeBegin());
1375   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1376   {
1377 #define KOKKOS_SHARED_LEVEL 1
1378     using scr_mem_t    = Kokkos::DefaultExecutionSpace::scratch_memory_space;
1379     using sizet_scr_t  = Kokkos::View<size_t, scr_mem_t>;
1380     using scalar_scr_t = Kokkos::View<PetscScalar, scr_mem_t>;
1381     const Kokkos::View<const PetscInt *, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged>> h_r_k(r_h, n);
1382     Kokkos::View<PetscInt *, Kokkos::LayoutLeft>                                                                         d_r_k("r", n);
1383     const Kokkos::View<const PetscInt *, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged>> h_ic_k(ic_h, nc);
1384     Kokkos::View<PetscInt *, Kokkos::LayoutLeft>                                                                         d_ic_k("ic", nc);
1385     size_t                                                                                                               flops_h = 0.0;
1386     Kokkos::View<size_t, Kokkos::HostSpace>                                                                              h_flops_k(&flops_h);
1387     Kokkos::View<size_t>                                                                                                 d_flops_k("flops");
1388     const int                                                                                                            conc = Kokkos::DefaultExecutionSpace().concurrency(), team_size = conc > 1 ? 16 : 1; // 8*32 = 256
1389     const int                                                                                                            nloc = n / Nf, Ni = (conc > 8) ? 1 /* some intelligent number of SMs -- but need league_barrier */ : 1;
1390     Kokkos::deep_copy(d_flops_k, h_flops_k);
1391     Kokkos::deep_copy(d_r_k, h_r_k);
1392     Kokkos::deep_copy(d_ic_k, h_ic_k);
1393     // Fill A --> fact
1394     Kokkos::parallel_for(
1395       Kokkos::TeamPolicy<>(Nf * Ni, team_size, nVec), KOKKOS_LAMBDA(const team_member team) {
1396         const PetscInt  field = team.league_rank() / Ni, field_block = team.league_rank() % Ni; // use grid.x/y in CUDA
1397         const PetscInt  nloc_i = (nloc / Ni + !!(nloc % Ni)), start_i = field * nloc + field_block * nloc_i, end_i = (start_i + nloc_i) > (field + 1) * nloc ? (field + 1) * nloc : (start_i + nloc_i);
1398         const PetscInt *ic = d_ic_k.data(), *r = d_r_k.data();
1399         // zero rows of B
1400         Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=](const int &rowb) {
1401           PetscInt     nzbL = bi_d[rowb + 1] - bi_d[rowb], nzbU = bdiag_d[rowb] - bdiag_d[rowb + 1]; // with diag
1402           PetscScalar *baL = ba_d + bi_d[rowb];
1403           PetscScalar *baU = ba_d + bdiag_d[rowb + 1] + 1;
1404           /* zero (unfactored row) */
1405           for (int j = 0; j < nzbL; j++) baL[j] = 0;
1406           for (int j = 0; j < nzbU; j++) baU[j] = 0;
1407         });
1408         // copy A into B
1409         Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=](const int &rowb) {
1410           PetscInt           rowa = r[rowb], nza = ai_d[rowa + 1] - ai_d[rowa];
1411           const PetscScalar *av    = aa_d + ai_d[rowa];
1412           const PetscInt    *ajtmp = aj_d + ai_d[rowa];
1413           /* load in initial (unfactored row) */
1414           for (int j = 0; j < nza; j++) {
1415             PetscInt    colb = ic[ajtmp[j]];
1416             PetscScalar vala = av[j];
1417             if (colb == rowb) {
1418               *(ba_d + bdiag_d[rowb]) = vala;
1419             } else {
1420               const PetscInt *pbj = bj_d + ((colb > rowb) ? bdiag_d[rowb + 1] + 1 : bi_d[rowb]);
1421               PetscScalar    *pba = ba_d + ((colb > rowb) ? bdiag_d[rowb + 1] + 1 : bi_d[rowb]);
1422               PetscInt        nz = (colb > rowb) ? bdiag_d[rowb] - (bdiag_d[rowb + 1] + 1) : bi_d[rowb + 1] - bi_d[rowb], set = 0;
1423               for (int j = 0; j < nz; j++) {
1424                 if (pbj[j] == colb) {
1425                   pba[j] = vala;
1426                   set++;
1427                   break;
1428                 }
1429               }
1430 #if !defined(PETSC_HAVE_SYCL)
1431               if (set != 1) printf("\t\t\t ERROR DID NOT SET ?????\n");
1432 #endif
1433             }
1434           }
1435         });
1436       });
1437     Kokkos::fence();
1438 
1439     Kokkos::parallel_for(
1440       Kokkos::TeamPolicy<>(Nf * Ni, team_size, nVec).set_scratch_size(KOKKOS_SHARED_LEVEL, Kokkos::PerThread(sizet_scr_t::shmem_size() + scalar_scr_t::shmem_size()), Kokkos::PerTeam(sizet_scr_t::shmem_size())), KOKKOS_LAMBDA(const team_member team) {
1441         sizet_scr_t    colkIdx(team.thread_scratch(KOKKOS_SHARED_LEVEL));
1442         scalar_scr_t   L_ki(team.thread_scratch(KOKKOS_SHARED_LEVEL));
1443         sizet_scr_t    flops(team.team_scratch(KOKKOS_SHARED_LEVEL));
1444         const PetscInt field = team.league_rank() / Ni, field_block_idx = team.league_rank() % Ni; // use grid.x/y in CUDA
1445         const PetscInt start = field * nloc, end = start + nloc;
1446         Kokkos::single(Kokkos::PerTeam(team), [=]() { flops() = 0; });
1447         // A22 panel update for each row A(1,:) and col A(:,1)
1448         for (int ii = start; ii < end - 1; ii++) {
1449           const PetscInt    *bjUi = bj_d + bdiag_d[ii + 1] + 1, nzUi = bdiag_d[ii] - (bdiag_d[ii + 1] + 1); // vector, and vector size, of column indices of U(i,(i+1):end)
1450           const PetscScalar *baUi    = ba_d + bdiag_d[ii + 1] + 1;                                          // vector of data  U(i,i+1:end)
1451           const PetscInt     nUi_its = nzUi / Ni + !!(nzUi % Ni);
1452           const PetscScalar  Bii     = *(ba_d + bdiag_d[ii]); // diagonal in its special place
1453           Kokkos::parallel_for(Kokkos::TeamThreadRange(team, nUi_its), [=](const int j) {
1454             PetscInt kIdx = j * Ni + field_block_idx;
1455             if (kIdx >= nzUi) /* void */
1456               ;
1457             else {
1458               const PetscInt  myk = bjUi[kIdx];                // assume symmetric structure, need a transposed meta-data here in general
1459               const PetscInt *pjL = bj_d + bi_d[myk];          // look for L(myk,ii) in start of row
1460               const PetscInt  nzL = bi_d[myk + 1] - bi_d[myk]; // size of L_k(:)
1461               size_t          st_idx;
1462               // find and do L(k,i) = A(:k,i) / A(i,i)
1463               Kokkos::single(Kokkos::PerThread(team), [&]() { colkIdx() = PETSC_MAX_INT; });
1464               // get column, there has got to be a better way
1465               Kokkos::parallel_reduce(
1466                 Kokkos::ThreadVectorRange(team, nzL),
1467                 [&](const int &j, size_t &idx) {
1468                   if (pjL[j] == ii) {
1469                     PetscScalar *pLki = ba_d + bi_d[myk] + j;
1470                     idx               = j;           // output
1471                     *pLki             = *pLki / Bii; // column scaling:  L(k,i) = A(:k,i) / A(i,i)
1472                   }
1473                 },
1474                 st_idx);
1475               Kokkos::single(Kokkos::PerThread(team), [=]() {
1476                 colkIdx() = st_idx;
1477                 L_ki()    = *(ba_d + bi_d[myk] + st_idx);
1478               });
1479 #if defined(PETSC_USE_DEBUG) && !defined(PETSC_HAVE_SYCL)
1480               if (colkIdx() == PETSC_MAX_INT) printf("\t\t\t\t\t\t\tERROR: failed to find L_ki(%d,%d)\n", (int)myk, ii); // uses a register
1481 #endif
1482               // active row k, do  A_kj -= Lki * U_ij; j \in U(i,:) j != i
1483               // U(i+1,:end)
1484               Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, nzUi), [=](const int &uiIdx) { // index into i (U)
1485                 PetscScalar Uij = baUi[uiIdx];
1486                 PetscInt    col = bjUi[uiIdx];
1487                 if (col == myk) {
1488                   // A_kk = A_kk - L_ki * U_ij(k)
1489                   PetscScalar *Akkv = (ba_d + bdiag_d[myk]); // diagonal in its special place
1490                   *Akkv             = *Akkv - L_ki() * Uij;  // UiK
1491                 } else {
1492                   PetscScalar    *start, *end, *pAkjv = NULL;
1493                   PetscInt        high, low;
1494                   const PetscInt *startj;
1495                   if (col < myk) { // L
1496                     PetscScalar *pLki = ba_d + bi_d[myk] + colkIdx();
1497                     PetscInt     idx  = (pLki + 1) - (ba_d + bi_d[myk]); // index into row
1498                     start             = pLki + 1;                        // start at pLki+1, A22(myk,1)
1499                     startj            = bj_d + bi_d[myk] + idx;
1500                     end               = ba_d + bi_d[myk + 1];
1501                   } else {
1502                     PetscInt idx = bdiag_d[myk + 1] + 1;
1503                     start        = ba_d + idx;
1504                     startj       = bj_d + idx;
1505                     end          = ba_d + bdiag_d[myk];
1506                   }
1507                   // search for 'col', use bisection search - TODO
1508                   low  = 0;
1509                   high = (PetscInt)(end - start);
1510                   while (high - low > 5) {
1511                     int t = (low + high) / 2;
1512                     if (startj[t] > col) high = t;
1513                     else low = t;
1514                   }
1515                   for (pAkjv = start + low; pAkjv < start + high; pAkjv++) {
1516                     if (startj[pAkjv - start] == col) break;
1517                   }
1518 #if defined(PETSC_USE_DEBUG) && !defined(PETSC_HAVE_SYCL)
1519                   if (pAkjv == start + high) printf("\t\t\t\t\t\t\t\t\t\t\tERROR: *** failed to find Akj(%d,%d)\n", (int)myk, (int)col); // uses a register
1520 #endif
1521                   *pAkjv = *pAkjv - L_ki() * Uij; // A_kj = A_kj - L_ki * U_ij
1522                 }
1523               });
1524             }
1525           });
1526           team.team_barrier(); // this needs to be a league barrier to use more that one SM per block
1527           if (field_block_idx == 0) Kokkos::single(Kokkos::PerTeam(team), [&]() { Kokkos::atomic_add(flops.data(), (size_t)(2 * (nzUi * nzUi) + 2)); });
1528         } /* endof for (i=0; i<n; i++) { */
1529         Kokkos::single(Kokkos::PerTeam(team), [=]() {
1530           Kokkos::atomic_add(&d_flops_k(), flops());
1531           flops() = 0;
1532         });
1533       });
1534     Kokkos::fence();
1535     Kokkos::deep_copy(h_flops_k, d_flops_k);
1536     PetscCall(PetscLogGpuFlops((PetscLogDouble)h_flops_k()));
1537     Kokkos::parallel_for(
1538       Kokkos::TeamPolicy<>(Nf * Ni, 1, 256), KOKKOS_LAMBDA(const team_member team) {
1539         const PetscInt lg_rank = team.league_rank(), field = lg_rank / Ni;                            //, field_offset = lg_rank%Ni;
1540         const PetscInt start = field * nloc, end = start + nloc, n_its = (nloc / Ni + !!(nloc % Ni)); // 1/Ni iters
1541         /* Invert diagonal for simpler triangular solves */
1542         Kokkos::parallel_for(Kokkos::TeamVectorRange(team, n_its), [=](int outer_index) {
1543           int i = start + outer_index * Ni + lg_rank % Ni;
1544           if (i < end) {
1545             PetscScalar *pv = ba_d + bdiag_d[i];
1546             *pv             = 1.0 / (*pv);
1547           }
1548         });
1549       });
1550   }
1551   PetscCall(PetscLogGpuTimeEnd());
1552   PetscCall(ISRestoreIndices(isicol, &ic_h));
1553   PetscCall(ISRestoreIndices(isrow, &r_h));
1554 
1555   PetscCall(ISIdentity(isrow, &row_identity));
1556   PetscCall(ISIdentity(isicol, &col_identity));
1557   if (b->inode.size) {
1558     B->ops->solve = MatSolve_SeqAIJ_Inode;
1559   } else if (row_identity && col_identity) {
1560     B->ops->solve = MatSolve_SeqAIJ_NaturalOrdering;
1561   } else {
1562     B->ops->solve = MatSolve_SeqAIJ; // at least this needs to be in Kokkos
1563   }
1564   B->offloadmask = PETSC_OFFLOAD_GPU;
1565   PetscCall(MatSeqAIJKokkosSyncHost(B));          // solve on CPU
1566   B->ops->solveadd          = MatSolveAdd_SeqAIJ; // and this
1567   B->ops->solvetranspose    = MatSolveTranspose_SeqAIJ;
1568   B->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ;
1569   B->ops->matsolve          = MatMatSolve_SeqAIJ;
1570   B->assembled              = PETSC_TRUE;
1571   B->preallocated           = PETSC_TRUE;
1572 
1573   PetscFunctionReturn(PETSC_SUCCESS);
1574 }
1575 
1576 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
1577 {
1578   PetscFunctionBegin;
1579   PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
1580   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos;
1581   PetscFunctionReturn(PETSC_SUCCESS);
1582 }
1583 
1584 static PetscErrorCode MatSeqAIJKokkosSymbolicSolveCheck(Mat A)
1585 {
1586   Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1587 
1588   PetscFunctionBegin;
1589   if (!factors->sptrsv_symbolic_completed) {
1590     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khU, factors->iU_d, factors->jU_d, factors->aU_d);
1591     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khL, factors->iL_d, factors->jL_d, factors->aL_d);
1592     factors->sptrsv_symbolic_completed = PETSC_TRUE;
1593   }
1594   PetscFunctionReturn(PETSC_SUCCESS);
1595 }
1596 
1597 /* Check if we need to update factors etc for transpose solve */
1598 static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A)
1599 {
1600   Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1601   MatColIdxType               n       = A->rmap->n;
1602 
1603   PetscFunctionBegin;
1604   if (!factors->transpose_updated) { /* TODO: KK needs to provide functions to do numeric transpose only */
1605     /* Update L^T and do sptrsv symbolic */
1606     factors->iLt_d = MatRowMapKokkosView("factors->iLt_d", n + 1);
1607     Kokkos::deep_copy(factors->iLt_d, 0); /* KK requires 0 */
1608     factors->jLt_d = MatColIdxKokkosView("factors->jLt_d", factors->jL_d.extent(0));
1609     factors->aLt_d = MatScalarKokkosView("factors->aLt_d", factors->aL_d.extent(0));
1610 
1611     transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iL_d, factors->jL_d, factors->aL_d,
1612                                                                                                                                                                                                               factors->iLt_d, factors->jLt_d, factors->aLt_d);
1613 
1614     /* TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices.
1615       We have to sort the indices, until KK provides finer control options.
1616     */
1617     sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iLt_d, factors->jLt_d, factors->aLt_d);
1618 
1619     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d);
1620 
1621     /* Update U^T and do sptrsv symbolic */
1622     factors->iUt_d = MatRowMapKokkosView("factors->iUt_d", n + 1);
1623     Kokkos::deep_copy(factors->iUt_d, 0); /* KK requires 0 */
1624     factors->jUt_d = MatColIdxKokkosView("factors->jUt_d", factors->jU_d.extent(0));
1625     factors->aUt_d = MatScalarKokkosView("factors->aUt_d", factors->aU_d.extent(0));
1626 
1627     transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iU_d, factors->jU_d, factors->aU_d,
1628                                                                                                                                                                                                               factors->iUt_d, factors->jUt_d, factors->aUt_d);
1629 
1630     /* Sort indices. See comments above */
1631     sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iUt_d, factors->jUt_d, factors->aUt_d);
1632 
1633     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d);
1634     factors->transpose_updated = PETSC_TRUE;
1635   }
1636   PetscFunctionReturn(PETSC_SUCCESS);
1637 }
1638 
1639 /* Solve Ax = b, with A = LU */
1640 static PetscErrorCode MatSolve_SeqAIJKokkos(Mat A, Vec b, Vec x)
1641 {
1642   ConstPetscScalarKokkosView  bv;
1643   PetscScalarKokkosView       xv;
1644   Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1645 
1646   PetscFunctionBegin;
1647   PetscCall(PetscLogGpuTimeBegin());
1648   PetscCall(MatSeqAIJKokkosSymbolicSolveCheck(A));
1649   PetscCall(VecGetKokkosView(b, &bv));
1650   PetscCall(VecGetKokkosViewWrite(x, &xv));
1651   /* Solve L tmpv = b */
1652   PetscCallCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khL, factors->iL_d, factors->jL_d, factors->aL_d, bv, factors->workVector));
1653   /* Solve Ux = tmpv */
1654   PetscCallCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khU, factors->iU_d, factors->jU_d, factors->aU_d, factors->workVector, xv));
1655   PetscCall(VecRestoreKokkosView(b, &bv));
1656   PetscCall(VecRestoreKokkosViewWrite(x, &xv));
1657   PetscCall(PetscLogGpuTimeEnd());
1658   PetscFunctionReturn(PETSC_SUCCESS);
1659 }
1660 
1661 /* Solve A^T x = b, where A^T = U^T L^T */
1662 static PetscErrorCode MatSolveTranspose_SeqAIJKokkos(Mat A, Vec b, Vec x)
1663 {
1664   ConstPetscScalarKokkosView  bv;
1665   PetscScalarKokkosView       xv;
1666   Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1667 
1668   PetscFunctionBegin;
1669   PetscCall(PetscLogGpuTimeBegin());
1670   PetscCall(MatSeqAIJKokkosTransposeSolveCheck(A));
1671   PetscCall(VecGetKokkosView(b, &bv));
1672   PetscCall(VecGetKokkosViewWrite(x, &xv));
1673   /* Solve U^T tmpv = b */
1674   KokkosSparse::Experimental::sptrsv_solve(&factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d, bv, factors->workVector);
1675 
1676   /* Solve L^T x = tmpv */
1677   KokkosSparse::Experimental::sptrsv_solve(&factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d, factors->workVector, xv);
1678   PetscCall(VecRestoreKokkosView(b, &bv));
1679   PetscCall(VecRestoreKokkosViewWrite(x, &xv));
1680   PetscCall(PetscLogGpuTimeEnd());
1681   PetscFunctionReturn(PETSC_SUCCESS);
1682 }
1683 
1684 static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info)
1685 {
1686   Mat_SeqAIJKokkos           *aijkok   = (Mat_SeqAIJKokkos *)A->spptr;
1687   Mat_SeqAIJKokkosTriFactors *factors  = (Mat_SeqAIJKokkosTriFactors *)B->spptr;
1688   PetscInt                    fill_lev = info->levels;
1689 
1690   PetscFunctionBegin;
1691   PetscCall(PetscLogGpuTimeBegin());
1692   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1693 
1694   auto a_d = aijkok->a_dual.view_device();
1695   auto i_d = aijkok->i_dual.view_device();
1696   auto j_d = aijkok->j_dual.view_device();
1697 
1698   KokkosSparse::Experimental::spiluk_numeric(&factors->kh, fill_lev, i_d, j_d, a_d, factors->iL_d, factors->jL_d, factors->aL_d, factors->iU_d, factors->jU_d, factors->aU_d);
1699 
1700   B->assembled              = PETSC_TRUE;
1701   B->preallocated           = PETSC_TRUE;
1702   B->ops->solve             = MatSolve_SeqAIJKokkos;
1703   B->ops->solvetranspose    = MatSolveTranspose_SeqAIJKokkos;
1704   B->ops->matsolve          = NULL;
1705   B->ops->matsolvetranspose = NULL;
1706   B->offloadmask            = PETSC_OFFLOAD_GPU;
1707 
1708   /* Once the factors' value changed, we need to update their transpose and sptrsv handle */
1709   factors->transpose_updated         = PETSC_FALSE;
1710   factors->sptrsv_symbolic_completed = PETSC_FALSE;
1711   /* TODO: log flops, but how to know that? */
1712   PetscCall(PetscLogGpuTimeEnd());
1713   PetscFunctionReturn(PETSC_SUCCESS);
1714 }
1715 
1716 static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
1717 {
1718   Mat_SeqAIJKokkos           *aijkok;
1719   Mat_SeqAIJ                 *b;
1720   Mat_SeqAIJKokkosTriFactors *factors  = (Mat_SeqAIJKokkosTriFactors *)B->spptr;
1721   PetscInt                    fill_lev = info->levels;
1722   PetscInt                    nnzA     = ((Mat_SeqAIJ *)A->data)->nz, nnzL, nnzU;
1723   PetscInt                    n        = A->rmap->n;
1724 
1725   PetscFunctionBegin;
1726   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1727   /* Rebuild factors */
1728   if (factors) {
1729     factors->Destroy();
1730   } /* Destroy the old if it exists */
1731   else {
1732     B->spptr = factors = new Mat_SeqAIJKokkosTriFactors(n);
1733   }
1734 
1735   /* Create a spiluk handle and then do symbolic factorization */
1736   nnzL = nnzU = PetscRealIntMultTruncate(info->fill, nnzA);
1737   factors->kh.create_spiluk_handle(KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1, n, nnzL, nnzU);
1738 
1739   auto spiluk_handle = factors->kh.get_spiluk_handle();
1740 
1741   Kokkos::realloc(factors->iL_d, n + 1); /* Free old arrays and realloc */
1742   Kokkos::realloc(factors->jL_d, spiluk_handle->get_nnzL());
1743   Kokkos::realloc(factors->iU_d, n + 1);
1744   Kokkos::realloc(factors->jU_d, spiluk_handle->get_nnzU());
1745 
1746   aijkok   = (Mat_SeqAIJKokkos *)A->spptr;
1747   auto i_d = aijkok->i_dual.view_device();
1748   auto j_d = aijkok->j_dual.view_device();
1749   KokkosSparse::Experimental::spiluk_symbolic(&factors->kh, fill_lev, i_d, j_d, factors->iL_d, factors->jL_d, factors->iU_d, factors->jU_d);
1750   /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */
1751 
1752   Kokkos::resize(factors->jL_d, spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */
1753   Kokkos::resize(factors->jU_d, spiluk_handle->get_nnzU());
1754   Kokkos::realloc(factors->aL_d, spiluk_handle->get_nnzL()); /* No need to retain old value */
1755   Kokkos::realloc(factors->aU_d, spiluk_handle->get_nnzU());
1756 
1757   /* TODO: add options to select sptrsv algorithms */
1758   /* Create sptrsv handles for L, U and their transpose */
1759 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
1760   auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SPTRSV_CUSPARSE;
1761 #else
1762   auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1;
1763 #endif
1764 
1765   factors->khL.create_sptrsv_handle(sptrsv_alg, n, true /* L is lower tri */);
1766   factors->khU.create_sptrsv_handle(sptrsv_alg, n, false /* U is not lower tri */);
1767   factors->khLt.create_sptrsv_handle(sptrsv_alg, n, false /* L^T is not lower tri */);
1768   factors->khUt.create_sptrsv_handle(sptrsv_alg, n, true /* U^T is lower tri */);
1769 
1770   /* Fill fields of the factor matrix B */
1771   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(B, MAT_SKIP_ALLOCATION, NULL));
1772   b     = (Mat_SeqAIJ *)B->data;
1773   b->nz = b->maxnz          = spiluk_handle->get_nnzL() + spiluk_handle->get_nnzU();
1774   B->info.fill_ratio_given  = info->fill;
1775   B->info.fill_ratio_needed = ((PetscReal)b->nz) / ((PetscReal)nnzA);
1776 
1777   B->offloadmask          = PETSC_OFFLOAD_GPU;
1778   B->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJKokkos;
1779   PetscFunctionReturn(PETSC_SUCCESS);
1780 }
1781 
1782 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
1783 {
1784   Mat_SeqAIJ    *b     = (Mat_SeqAIJ *)B->data;
1785   const PetscInt nrows = A->rmap->n;
1786 
1787   PetscFunctionBegin;
1788   PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
1789   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKOKKOSDEVICE;
1790   // move B data into Kokkos
1791   PetscCall(MatSeqAIJKokkosSyncDevice(B)); // create aijkok
1792   PetscCall(MatSeqAIJKokkosSyncDevice(A)); // create aijkok
1793   {
1794     Mat_SeqAIJKokkos *baijkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
1795     if (!baijkok->diag_d.extent(0)) {
1796       const Kokkos::View<PetscInt *, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged>> h_diag(b->diag, nrows + 1);
1797       baijkok->diag_d = Kokkos::View<PetscInt *>(Kokkos::create_mirror(DefaultMemorySpace(), h_diag));
1798       Kokkos::deep_copy(baijkok->diag_d, h_diag);
1799     }
1800   }
1801   PetscFunctionReturn(PETSC_SUCCESS);
1802 }
1803 
1804 static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos(Mat A, MatSolverType *type)
1805 {
1806   PetscFunctionBegin;
1807   *type = MATSOLVERKOKKOS;
1808   PetscFunctionReturn(PETSC_SUCCESS);
1809 }
1810 
1811 static PetscErrorCode MatFactorGetSolverType_seqaij_kokkos_device(Mat A, MatSolverType *type)
1812 {
1813   PetscFunctionBegin;
1814   *type = MATSOLVERKOKKOSDEVICE;
1815   PetscFunctionReturn(PETSC_SUCCESS);
1816 }
1817 
1818 /*MC
1819   MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices
1820   on a single GPU of type, `MATSEQAIJKOKKOS`, `MATAIJKOKKOS`.
1821 
1822   Level: beginner
1823 
1824 .seealso: [](chapter_matrices), `Mat`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJKokkos()`, `MATAIJKOKKOS`, `MatKokkosSetFormat()`, `MatKokkosStorageFormat`, `MatKokkosFormatOperation`
1825 M*/
1826 PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A, MatFactorType ftype, Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */
1827 {
1828   PetscInt n = A->rmap->n;
1829 
1830   PetscFunctionBegin;
1831   PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B));
1832   PetscCall(MatSetSizes(*B, n, n, n, n));
1833   (*B)->factortype = ftype;
1834   PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU]));
1835   PetscCall(MatSetType(*B, MATSEQAIJKOKKOS));
1836 
1837   if (ftype == MAT_FACTOR_LU) {
1838     PetscCall(MatSetBlockSizesFromMats(*B, A, A));
1839     (*B)->canuseordering        = PETSC_TRUE;
1840     (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKokkos;
1841   } else if (ftype == MAT_FACTOR_ILU) {
1842     PetscCall(MatSetBlockSizesFromMats(*B, A, A));
1843     (*B)->canuseordering         = PETSC_FALSE;
1844     (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos;
1845   } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]);
1846 
1847   PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL));
1848   PetscCall(PetscObjectComposeFunction((PetscObject)(*B), "MatFactorGetSolverType_C", MatFactorGetSolverType_SeqAIJKokkos));
1849   PetscFunctionReturn(PETSC_SUCCESS);
1850 }
1851 
1852 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijkokkos_kokkos_device(Mat A, MatFactorType ftype, Mat *B)
1853 {
1854   PetscInt n = A->rmap->n;
1855 
1856   PetscFunctionBegin;
1857   PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B));
1858   PetscCall(MatSetSizes(*B, n, n, n, n));
1859   (*B)->factortype     = ftype;
1860   (*B)->canuseordering = PETSC_TRUE;
1861   PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU]));
1862   PetscCall(MatSetType(*B, MATSEQAIJKOKKOS));
1863 
1864   if (ftype == MAT_FACTOR_LU) {
1865     PetscCall(MatSetBlockSizesFromMats(*B, A, A));
1866     (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE;
1867   } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Factor type not supported for KOKKOS Matrix Types");
1868 
1869   PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL));
1870   PetscCall(PetscObjectComposeFunction((PetscObject)(*B), "MatFactorGetSolverType_C", MatFactorGetSolverType_seqaij_kokkos_device));
1871   PetscFunctionReturn(PETSC_SUCCESS);
1872 }
1873 
1874 PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_KOKKOS(void)
1875 {
1876   PetscFunctionBegin;
1877   PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_LU, MatGetFactor_SeqAIJKokkos_Kokkos));
1878   PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_ILU, MatGetFactor_SeqAIJKokkos_Kokkos));
1879   PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOSDEVICE, MATSEQAIJKOKKOS, MAT_FACTOR_LU, MatGetFactor_seqaijkokkos_kokkos_device));
1880   PetscFunctionReturn(PETSC_SUCCESS);
1881 }
1882 
1883 /* Utility to print out a KokkosCsrMatrix for debugging */
1884 PETSC_INTERN PetscErrorCode PrintCsrMatrix(const KokkosCsrMatrix &csrmat)
1885 {
1886   const auto        &iv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), csrmat.graph.row_map);
1887   const auto        &jv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), csrmat.graph.entries);
1888   const auto        &av = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), csrmat.values);
1889   const PetscInt    *i  = iv.data();
1890   const PetscInt    *j  = jv.data();
1891   const PetscScalar *a  = av.data();
1892   PetscInt           m = csrmat.numRows(), n = csrmat.numCols(), nnz = csrmat.nnz();
1893 
1894   PetscFunctionBegin;
1895   PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT " x %" PetscInt_FMT " SeqAIJKokkos, with %" PetscInt_FMT " nonzeros\n", m, n, nnz));
1896   for (PetscInt k = 0; k < m; k++) {
1897     PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT ": ", k));
1898     for (PetscInt p = i[k]; p < i[k + 1]; p++) PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT "(%.1f), ", j[p], (double)PetscRealPart(a[p])));
1899     PetscCall(PetscPrintf(PETSC_COMM_SELF, "\n"));
1900   }
1901   PetscFunctionReturn(PETSC_SUCCESS);
1902 }
1903