xref: /petsc/src/mat/impls/aij/seq/kokkos/aijkok.kokkos.cxx (revision 43cdf1ebbcbfa05bee08e48007ef1bae3f20f4e9)
1 #include <petscvec_kokkos.hpp>
2 #include <petscpkg_version.h>
3 #include <petsc/private/petscimpl.h>
4 #include <petsc/private/sfimpl.h>
5 #include <petscsystypes.h>
6 #include <petscerror.h>
7 
8 #include <Kokkos_Core.hpp>
9 #include <KokkosBlas.hpp>
10 #include <KokkosSparse_CrsMatrix.hpp>
11 #include <KokkosSparse_spmv.hpp>
12 #include <KokkosSparse_spiluk.hpp>
13 #include <KokkosSparse_sptrsv.hpp>
14 #include <KokkosSparse_spgemm.hpp>
15 #include <KokkosSparse_spadd.hpp>
16 
17 #include <../src/mat/impls/aij/seq/kokkos/aijkok.hpp>
18 
19 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_GE(3, 6, 99)
20   #include <KokkosSparse_Utils.hpp>
21 using KokkosSparse::sort_crs_matrix;
22 using KokkosSparse::Impl::transpose_matrix;
23 #else
24   #include <KokkosKernels_Sorting.hpp>
25 using KokkosKernels::sort_crs_matrix;
26 using KokkosKernels::Impl::transpose_matrix;
27 #endif
28 
29 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */
30 
31 /* MatAssemblyEnd_SeqAIJKokkos() happens when we finalized nonzeros of the matrix, either after
32    we assembled the matrix on host, or after we directly produced the matrix data on device (ex., through MatMatMult).
33    In the latter case, it is important to set a_dual's sync state correctly.
34  */
35 static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A, MatAssemblyType mode)
36 {
37   Mat_SeqAIJ       *aijseq;
38   Mat_SeqAIJKokkos *aijkok;
39 
40   PetscFunctionBegin;
41   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0);
42   PetscCall(MatAssemblyEnd_SeqAIJ(A, mode));
43 
44   aijseq = static_cast<Mat_SeqAIJ *>(A->data);
45   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
46 
47   /* If aijkok does not exist, we just copy i, j to device.
48      If aijkok already exists, but the device's nonzero pattern does not match with the host's, we assume the latest data is on host.
49      In both cases, we build a new aijkok structure.
50   */
51   if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { /* aijkok might not exist yet or nonzero pattern has changed */
52     delete aijkok;
53     aijkok   = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aijseq->nz, aijseq->i, aijseq->j, aijseq->a, A->nonzerostate, PETSC_FALSE /*don't copy mat values to device*/);
54     A->spptr = aijkok;
55   }
56 
57   if (aijkok->device_mat_d.data()) {
58     A->offloadmask = PETSC_OFFLOAD_GPU; // in GPU mode, no going back. MatSetValues checks this
59   }
60   PetscFunctionReturn(0);
61 }
62 
63 /* Sync CSR data to device if not yet */
64 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A)
65 {
66   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
67 
68   PetscFunctionBegin;
69   PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Cann't sync factorized matrix from host to device");
70   PetscCheck(aijkok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
71   if (aijkok->a_dual.need_sync_device()) {
72     aijkok->a_dual.sync_device();
73     aijkok->transpose_updated = PETSC_FALSE; /* values of the transpose is out-of-date */
74     aijkok->hermitian_updated = PETSC_FALSE;
75   }
76   PetscFunctionReturn(0);
77 }
78 
79 /* Mark the CSR data on device as modified */
80 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosModifyDevice(Mat A)
81 {
82   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
83 
84   PetscFunctionBegin;
85   PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Not supported for factorized matries");
86   aijkok->a_dual.clear_sync_state();
87   aijkok->a_dual.modify_device();
88   aijkok->transpose_updated = PETSC_FALSE;
89   aijkok->hermitian_updated = PETSC_FALSE;
90   PetscCall(MatSeqAIJInvalidateDiagonal(A));
91   PetscCall(PetscObjectStateIncrease((PetscObject)A));
92   PetscFunctionReturn(0);
93 }
94 
95 static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A)
96 {
97   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
98 
99   PetscFunctionBegin;
100   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
101   /* We do not expect one needs factors on host  */
102   PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Cann't sync factorized matrix from device to host");
103   PetscCheck(aijkok, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing AIJKOK");
104   aijkok->a_dual.sync_host();
105   PetscFunctionReturn(0);
106 }
107 
108 static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A, PetscScalar *array[])
109 {
110   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
111 
112   PetscFunctionBegin;
113   /* aijkok contains valid pointers only if the host's nonzerostate matches with the device's.
114     Calling MatSeqAIJSetPreallocation() or MatSetValues() on host, where aijseq->{i,j,a} might be
115     reallocated, will lead to stale {i,j,a}_dual in aijkok. In both operations, the hosts's nonzerostate
116     must have been updated. The stale aijkok will be rebuilt during MatAssemblyEnd.
117   */
118   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
119     aijkok->a_dual.sync_host();
120     *array = aijkok->a_dual.view_host().data();
121   } else { /* Happens when calling MatSetValues on a newly created matrix */
122     *array = static_cast<Mat_SeqAIJ *>(A->data)->a;
123   }
124   PetscFunctionReturn(0);
125 }
126 
127 static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJKokkos(Mat A, PetscScalar *array[])
128 {
129   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
130 
131   PetscFunctionBegin;
132   if (aijkok && A->nonzerostate == aijkok->nonzerostate) aijkok->a_dual.modify_host();
133   PetscFunctionReturn(0);
134 }
135 
136 static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[])
137 {
138   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
139 
140   PetscFunctionBegin;
141   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
142     aijkok->a_dual.sync_host();
143     *array = aijkok->a_dual.view_host().data();
144   } else {
145     *array = static_cast<Mat_SeqAIJ *>(A->data)->a;
146   }
147   PetscFunctionReturn(0);
148 }
149 
150 static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[])
151 {
152   PetscFunctionBegin;
153   *array = NULL;
154   PetscFunctionReturn(0);
155 }
156 
157 static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[])
158 {
159   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
160 
161   PetscFunctionBegin;
162   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
163     *array = aijkok->a_dual.view_host().data();
164   } else { /* Ex. happens with MatZeroEntries on a preallocated but not assembled matrix */
165     *array = static_cast<Mat_SeqAIJ *>(A->data)->a;
166   }
167   PetscFunctionReturn(0);
168 }
169 
170 static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[])
171 {
172   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
173 
174   PetscFunctionBegin;
175   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
176     aijkok->a_dual.clear_sync_state();
177     aijkok->a_dual.modify_host();
178   }
179   PetscFunctionReturn(0);
180 }
181 
182 static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJKokkos(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype)
183 {
184   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
185 
186   PetscFunctionBegin;
187   PetscCheck(aijkok != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "aijkok is NULL");
188 
189   if (i) *i = aijkok->i_device_data();
190   if (j) *j = aijkok->j_device_data();
191   if (a) {
192     aijkok->a_dual.sync_device();
193     *a = aijkok->a_device_data();
194   }
195   if (mtype) *mtype = PETSC_MEMTYPE_KOKKOS;
196   PetscFunctionReturn(0);
197 }
198 
199 // MatSeqAIJKokkosSetDeviceMat takes a PetscSplitCSRDataStructure with device data and copies it to the device. Note, "deep_copy" here is really a shallow copy
200 PetscErrorCode MatSeqAIJKokkosSetDeviceMat(Mat A, PetscSplitCSRDataStructure h_mat)
201 {
202   Mat_SeqAIJKokkos                            *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
203   Kokkos::View<SplitCSRMat, Kokkos::HostSpace> h_mat_k(h_mat);
204 
205   PetscFunctionBegin;
206   PetscCheck(aijkok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
207   aijkok->device_mat_d = create_mirror(DefaultMemorySpace(), h_mat_k);
208   Kokkos::deep_copy(aijkok->device_mat_d, h_mat_k);
209   PetscFunctionReturn(0);
210 }
211 
212 // MatSeqAIJKokkosGetDeviceMat gets the device if it is here, otherwise it creates a place for it and returns NULL
213 PetscErrorCode MatSeqAIJKokkosGetDeviceMat(Mat A, PetscSplitCSRDataStructure *d_mat)
214 {
215   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
216 
217   PetscFunctionBegin;
218   if (aijkok && aijkok->device_mat_d.data()) {
219     *d_mat = aijkok->device_mat_d.data();
220   } else {
221     PetscCall(MatSeqAIJKokkosSyncDevice(A)); // create aijkok (we are making d_mat now so make a place for it)
222     *d_mat = NULL;
223   }
224   PetscFunctionReturn(0);
225 }
226 
227 /* Generate the transpose on device and cache it internally */
228 static PetscErrorCode MatSeqAIJKokkosGenerateTranspose_Private(Mat A, KokkosCsrMatrix **csrmatT)
229 {
230   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
231 
232   PetscFunctionBegin;
233   PetscCheck(aijkok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
234   if (!aijkok->csrmatT.nnz() || !aijkok->transpose_updated) { /* Generate At for the first time OR just update its values */
235     /* FIXME: KK does not separate symbolic/numeric transpose. We could have a permutation array to help value-only update */
236     PetscCallCXX(aijkok->a_dual.sync_device());
237     PetscCallCXX(aijkok->csrmatT = transpose_matrix(aijkok->csrmat));
238     PetscCallCXX(sort_crs_matrix(aijkok->csrmatT));
239     aijkok->transpose_updated = PETSC_TRUE;
240   }
241   *csrmatT = &aijkok->csrmatT;
242   PetscFunctionReturn(0);
243 }
244 
245 /* Generate the Hermitian on device and cache it internally */
246 static PetscErrorCode MatSeqAIJKokkosGenerateHermitian_Private(Mat A, KokkosCsrMatrix **csrmatH)
247 {
248   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
249 
250   PetscFunctionBegin;
251   PetscCall(PetscLogGpuTimeBegin());
252   PetscCheck(aijkok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
253   if (!aijkok->csrmatH.nnz() || !aijkok->hermitian_updated) { /* Generate Ah for the first time OR just update its values */
254     PetscCallCXX(aijkok->a_dual.sync_device());
255     PetscCallCXX(aijkok->csrmatH = transpose_matrix(aijkok->csrmat));
256     PetscCallCXX(sort_crs_matrix(aijkok->csrmatH));
257 #if defined(PETSC_USE_COMPLEX)
258     const auto &a = aijkok->csrmatH.values;
259     Kokkos::parallel_for(
260       a.extent(0), KOKKOS_LAMBDA(MatRowMapType i) { a(i) = PetscConj(a(i)); });
261 #endif
262     aijkok->hermitian_updated = PETSC_TRUE;
263   }
264   *csrmatH = &aijkok->csrmatH;
265   PetscCall(PetscLogGpuTimeEnd());
266   PetscFunctionReturn(0);
267 }
268 
269 /* y = A x */
270 static PetscErrorCode MatMult_SeqAIJKokkos(Mat A, Vec xx, Vec yy)
271 {
272   Mat_SeqAIJKokkos          *aijkok;
273   ConstPetscScalarKokkosView xv;
274   PetscScalarKokkosView      yv;
275 
276   PetscFunctionBegin;
277   PetscCall(PetscLogGpuTimeBegin());
278   PetscCall(MatSeqAIJKokkosSyncDevice(A));
279   PetscCall(VecGetKokkosView(xx, &xv));
280   PetscCall(VecGetKokkosViewWrite(yy, &yv));
281   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
282   PetscCallCXX(KokkosSparse::spmv("N", 1.0 /*alpha*/, aijkok->csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A x + beta y */
283   PetscCall(VecRestoreKokkosView(xx, &xv));
284   PetscCall(VecRestoreKokkosViewWrite(yy, &yv));
285   /* 2.0*nnz - numRows seems more accurate here but assumes there are no zero-rows. So a little sloppy here. */
286   PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz()));
287   PetscCall(PetscLogGpuTimeEnd());
288   PetscFunctionReturn(0);
289 }
290 
291 /* y = A^T x */
292 static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy)
293 {
294   Mat_SeqAIJKokkos          *aijkok;
295   const char                *mode;
296   ConstPetscScalarKokkosView xv;
297   PetscScalarKokkosView      yv;
298   KokkosCsrMatrix           *csrmat;
299 
300   PetscFunctionBegin;
301   PetscCall(PetscLogGpuTimeBegin());
302   PetscCall(MatSeqAIJKokkosSyncDevice(A));
303   PetscCall(VecGetKokkosView(xx, &xv));
304   PetscCall(VecGetKokkosViewWrite(yy, &yv));
305   if (A->form_explicit_transpose) {
306     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat));
307     mode = "N";
308   } else {
309     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
310     csrmat = &aijkok->csrmat;
311     mode   = "T";
312   }
313   PetscCallCXX(KokkosSparse::spmv(mode, 1.0 /*alpha*/, *csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^T x + beta y */
314   PetscCall(VecRestoreKokkosView(xx, &xv));
315   PetscCall(VecRestoreKokkosViewWrite(yy, &yv));
316   PetscCall(PetscLogGpuFlops(2.0 * csrmat->nnz()));
317   PetscCall(PetscLogGpuTimeEnd());
318   PetscFunctionReturn(0);
319 }
320 
321 /* y = A^H x */
322 static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy)
323 {
324   Mat_SeqAIJKokkos          *aijkok;
325   const char                *mode;
326   ConstPetscScalarKokkosView xv;
327   PetscScalarKokkosView      yv;
328   KokkosCsrMatrix           *csrmat;
329 
330   PetscFunctionBegin;
331   PetscCall(PetscLogGpuTimeBegin());
332   PetscCall(MatSeqAIJKokkosSyncDevice(A));
333   PetscCall(VecGetKokkosView(xx, &xv));
334   PetscCall(VecGetKokkosViewWrite(yy, &yv));
335   if (A->form_explicit_transpose) {
336     PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat));
337     mode = "N";
338   } else {
339     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
340     csrmat = &aijkok->csrmat;
341     mode   = "C";
342   }
343   PetscCallCXX(KokkosSparse::spmv(mode, 1.0 /*alpha*/, *csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^H x + beta y */
344   PetscCall(VecRestoreKokkosView(xx, &xv));
345   PetscCall(VecRestoreKokkosViewWrite(yy, &yv));
346   PetscCall(PetscLogGpuFlops(2.0 * csrmat->nnz()));
347   PetscCall(PetscLogGpuTimeEnd());
348   PetscFunctionReturn(0);
349 }
350 
351 /* z = A x + y */
352 static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz)
353 {
354   Mat_SeqAIJKokkos          *aijkok;
355   ConstPetscScalarKokkosView xv, yv;
356   PetscScalarKokkosView      zv;
357 
358   PetscFunctionBegin;
359   PetscCall(PetscLogGpuTimeBegin());
360   PetscCall(MatSeqAIJKokkosSyncDevice(A));
361   PetscCall(VecGetKokkosView(xx, &xv));
362   PetscCall(VecGetKokkosView(yy, &yv));
363   PetscCall(VecGetKokkosViewWrite(zz, &zv));
364   if (zz != yy) Kokkos::deep_copy(zv, yv);
365   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
366   PetscCallCXX(KokkosSparse::spmv("N", 1.0 /*alpha*/, aijkok->csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A x + beta z */
367   PetscCall(VecRestoreKokkosView(xx, &xv));
368   PetscCall(VecRestoreKokkosView(yy, &yv));
369   PetscCall(VecRestoreKokkosViewWrite(zz, &zv));
370   PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz()));
371   PetscCall(PetscLogGpuTimeEnd());
372   PetscFunctionReturn(0);
373 }
374 
375 /* z = A^T x + y */
376 static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz)
377 {
378   Mat_SeqAIJKokkos          *aijkok;
379   const char                *mode;
380   ConstPetscScalarKokkosView xv, yv;
381   PetscScalarKokkosView      zv;
382   KokkosCsrMatrix           *csrmat;
383 
384   PetscFunctionBegin;
385   PetscCall(PetscLogGpuTimeBegin());
386   PetscCall(MatSeqAIJKokkosSyncDevice(A));
387   PetscCall(VecGetKokkosView(xx, &xv));
388   PetscCall(VecGetKokkosView(yy, &yv));
389   PetscCall(VecGetKokkosViewWrite(zz, &zv));
390   if (zz != yy) Kokkos::deep_copy(zv, yv);
391   if (A->form_explicit_transpose) {
392     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat));
393     mode = "N";
394   } else {
395     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
396     csrmat = &aijkok->csrmat;
397     mode   = "T";
398   }
399   PetscCallCXX(KokkosSparse::spmv(mode, 1.0 /*alpha*/, *csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^T x + beta z */
400   PetscCall(VecRestoreKokkosView(xx, &xv));
401   PetscCall(VecRestoreKokkosView(yy, &yv));
402   PetscCall(VecRestoreKokkosViewWrite(zz, &zv));
403   PetscCall(PetscLogGpuFlops(2.0 * csrmat->nnz()));
404   PetscCall(PetscLogGpuTimeEnd());
405   PetscFunctionReturn(0);
406 }
407 
408 /* z = A^H x + y */
409 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz)
410 {
411   Mat_SeqAIJKokkos          *aijkok;
412   const char                *mode;
413   ConstPetscScalarKokkosView xv, yv;
414   PetscScalarKokkosView      zv;
415   KokkosCsrMatrix           *csrmat;
416 
417   PetscFunctionBegin;
418   PetscCall(PetscLogGpuTimeBegin());
419   PetscCall(MatSeqAIJKokkosSyncDevice(A));
420   PetscCall(VecGetKokkosView(xx, &xv));
421   PetscCall(VecGetKokkosView(yy, &yv));
422   PetscCall(VecGetKokkosViewWrite(zz, &zv));
423   if (zz != yy) Kokkos::deep_copy(zv, yv);
424   if (A->form_explicit_transpose) {
425     PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat));
426     mode = "N";
427   } else {
428     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
429     csrmat = &aijkok->csrmat;
430     mode   = "C";
431   }
432   PetscCallCXX(KokkosSparse::spmv(mode, 1.0 /*alpha*/, *csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^H x + beta z */
433   PetscCall(VecRestoreKokkosView(xx, &xv));
434   PetscCall(VecRestoreKokkosView(yy, &yv));
435   PetscCall(VecRestoreKokkosViewWrite(zz, &zv));
436   PetscCall(PetscLogGpuFlops(2.0 * csrmat->nnz()));
437   PetscCall(PetscLogGpuTimeEnd());
438   PetscFunctionReturn(0);
439 }
440 
441 PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A, MatOption op, PetscBool flg)
442 {
443   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
444 
445   PetscFunctionBegin;
446   switch (op) {
447   case MAT_FORM_EXPLICIT_TRANSPOSE:
448     /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
449     if (A->form_explicit_transpose && !flg && aijkok) PetscCall(aijkok->DestroyMatTranspose());
450     A->form_explicit_transpose = flg;
451     break;
452   default:
453     PetscCall(MatSetOption_SeqAIJ(A, op, flg));
454     break;
455   }
456   PetscFunctionReturn(0);
457 }
458 
459 /* Depending on reuse, either build a new mat, or use the existing mat */
460 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat *newmat)
461 {
462   Mat_SeqAIJ *aseq;
463 
464   PetscFunctionBegin;
465   PetscCall(PetscKokkosInitializeCheck());
466   if (reuse == MAT_INITIAL_MATRIX) {                      /* Build a brand new mat */
467     PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat));  /* the returned newmat is a SeqAIJKokkos */
468   } else if (reuse == MAT_REUSE_MATRIX) {                 /* Reuse the mat created before */
469     PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN)); /* newmat is already a SeqAIJKokkos */
470   } else if (reuse == MAT_INPLACE_MATRIX) {               /* newmat is A */
471     PetscCheck(A == *newmat, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "A != *newmat with MAT_INPLACE_MATRIX");
472     PetscCall(PetscFree(A->defaultvectype));
473     PetscCall(PetscStrallocpy(VECKOKKOS, &A->defaultvectype)); /* Allocate and copy the string */
474     PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATSEQAIJKOKKOS));
475     PetscCall(MatSetOps_SeqAIJKokkos(A));
476     aseq = static_cast<Mat_SeqAIJ *>(A->data);
477     if (A->assembled) { /* Copy i, j (but not values) to device for an assembled matrix if not yet */
478       PetscCheck(!A->spptr, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Expect NULL (Mat_SeqAIJKokkos*)A->spptr");
479       A->spptr = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aseq->nz, aseq->i, aseq->j, aseq->a, A->nonzerostate, PETSC_FALSE);
480     }
481   }
482   PetscFunctionReturn(0);
483 }
484 
485 /* MatDuplicate always creates a new matrix. MatDuplicate can be called either on an assembled matrix or
486    an unassembled matrix, even though MAT_COPY_VALUES is not allowed for unassembled matrix.
487  */
488 static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A, MatDuplicateOption dupOption, Mat *B)
489 {
490   Mat_SeqAIJ       *bseq;
491   Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr), *bkok;
492   Mat               mat;
493 
494   PetscFunctionBegin;
495   /* Do not copy values on host as A's latest values might be on device. We don't want to do sync blindly */
496   PetscCall(MatDuplicate_SeqAIJ(A, MAT_DO_NOT_COPY_VALUES, B));
497   mat = *B;
498   if (A->assembled) {
499     bseq = static_cast<Mat_SeqAIJ *>(mat->data);
500     bkok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, bseq->nz, bseq->i, bseq->j, bseq->a, mat->nonzerostate, PETSC_FALSE);
501     bkok->a_dual.clear_sync_state(); /* Clear B's sync state as it will be decided below */
502     /* Now copy values to B if needed */
503     if (dupOption == MAT_COPY_VALUES) {
504       if (akok->a_dual.need_sync_device()) {
505         Kokkos::deep_copy(bkok->a_dual.view_host(), akok->a_dual.view_host());
506         bkok->a_dual.modify_host();
507       } else { /* If device has the latest data, we only copy data on device */
508         Kokkos::deep_copy(bkok->a_dual.view_device(), akok->a_dual.view_device());
509         bkok->a_dual.modify_device();
510       }
511     } else { /* MAT_DO_NOT_COPY_VALUES or MAT_SHARE_NONZERO_PATTERN. B's values should be zeroed */
512       /* B's values on host should be already zeroed by MatDuplicate_SeqAIJ() */
513       bkok->a_dual.modify_host();
514     }
515     mat->spptr = bkok;
516   }
517 
518   PetscCall(PetscFree(mat->defaultvectype));
519   PetscCall(PetscStrallocpy(VECKOKKOS, &mat->defaultvectype)); /* Allocate and copy the string */
520   PetscCall(PetscObjectChangeTypeName((PetscObject)mat, MATSEQAIJKOKKOS));
521   PetscCall(MatSetOps_SeqAIJKokkos(mat));
522   PetscFunctionReturn(0);
523 }
524 
525 static PetscErrorCode MatTranspose_SeqAIJKokkos(Mat A, MatReuse reuse, Mat *B)
526 {
527   Mat               At;
528   KokkosCsrMatrix  *internT;
529   Mat_SeqAIJKokkos *atkok, *bkok;
530 
531   PetscFunctionBegin;
532   if (reuse == MAT_REUSE_MATRIX) PetscCall(MatTransposeCheckNonzeroState_Private(A, *B));
533   PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &internT)); /* Generate a transpose internally */
534   if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_INPLACE_MATRIX) {
535     /* Deep copy internT, as we want to isolate the internal transpose */
536     PetscCallCXX(atkok = new Mat_SeqAIJKokkos(KokkosCsrMatrix("csrmat", *internT)));
537     PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PetscObjectComm((PetscObject)A), atkok, &At));
538     if (reuse == MAT_INITIAL_MATRIX) *B = At;
539     else PetscCall(MatHeaderReplace(A, &At)); /* Replace A with At inplace */
540   } else {                                    /* MAT_REUSE_MATRIX, just need to copy values to B on device */
541     if ((*B)->assembled) {
542       bkok = static_cast<Mat_SeqAIJKokkos *>((*B)->spptr);
543       PetscCallCXX(Kokkos::deep_copy(bkok->a_dual.view_device(), internT->values));
544       PetscCall(MatSeqAIJKokkosModifyDevice(*B));
545     } else if ((*B)->preallocated) { /* It is ok for B to be only preallocated, as needed in MatTranspose_MPIAIJ */
546       Mat_SeqAIJ             *bseq = static_cast<Mat_SeqAIJ *>((*B)->data);
547       MatScalarKokkosViewHost a_h(bseq->a, internT->nnz()); /* bseq->nz = 0 if unassembled */
548       MatColIdxKokkosViewHost j_h(bseq->j, internT->nnz());
549       PetscCallCXX(Kokkos::deep_copy(a_h, internT->values));
550       PetscCallCXX(Kokkos::deep_copy(j_h, internT->graph.entries));
551     } else SETERRQ(PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "B must be assembled or preallocated");
552   }
553   PetscFunctionReturn(0);
554 }
555 
556 static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A)
557 {
558   Mat_SeqAIJKokkos *aijkok;
559 
560   PetscFunctionBegin;
561   if (A->factortype == MAT_FACTOR_NONE) {
562     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
563     delete aijkok;
564   } else {
565     delete static_cast<Mat_SeqAIJKokkosTriFactors *>(A->spptr);
566   }
567   A->spptr = NULL;
568   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatFactorGetSolverType_C", NULL));
569   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
570   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
571   PetscCall(MatDestroy_SeqAIJ(A));
572   PetscFunctionReturn(0);
573 }
574 
575 /*MC
576    MATSEQAIJKOKKOS - MATAIJKOKKOS = "(seq)aijkokkos" - A matrix type to be used for sparse matrices with Kokkos
577 
578    A matrix type type using Kokkos-Kernels CrsMatrix type for portability across different device types
579 
580    Options Database Keys:
581 .  -mat_type aijkokkos - sets the matrix type to `MATSEQAIJKOKKOS` during a call to `MatSetFromOptions()`
582 
583   Level: beginner
584 
585 .seealso: `MatCreateSeqAIJKokkos()`, `MATMPIAIJKOKKOS`
586 M*/
587 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A)
588 {
589   PetscFunctionBegin;
590   PetscCall(PetscKokkosInitializeCheck());
591   PetscCall(MatCreate_SeqAIJ(A));
592   PetscCall(MatConvert_SeqAIJ_SeqAIJKokkos(A, MATSEQAIJKOKKOS, MAT_INPLACE_MATRIX, &A));
593   PetscFunctionReturn(0);
594 }
595 
596 /* Merge A, B into a matrix C. A is put before B. C's size would be A->rmap->n by (A->cmap->n + B->cmap->n) */
597 PetscErrorCode MatSeqAIJKokkosMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C)
598 {
599   Mat_SeqAIJ         *a, *b;
600   Mat_SeqAIJKokkos   *akok, *bkok, *ckok;
601   MatScalarKokkosView aa, ba, ca;
602   MatRowMapKokkosView ai, bi, ci;
603   MatColIdxKokkosView aj, bj, cj;
604   PetscInt            m, n, nnz, aN;
605 
606   PetscFunctionBegin;
607   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
608   PetscValidHeaderSpecific(B, MAT_CLASSID, 2);
609   PetscValidPointer(C, 4);
610   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
611   PetscCheckTypeName(B, MATSEQAIJKOKKOS);
612   PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n);
613   PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported");
614 
615   PetscCall(MatSeqAIJKokkosSyncDevice(A));
616   PetscCall(MatSeqAIJKokkosSyncDevice(B));
617   a    = static_cast<Mat_SeqAIJ *>(A->data);
618   b    = static_cast<Mat_SeqAIJ *>(B->data);
619   akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
620   bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
621   aa   = akok->a_dual.view_device();
622   ai   = akok->i_dual.view_device();
623   ba   = bkok->a_dual.view_device();
624   bi   = bkok->i_dual.view_device();
625   m    = A->rmap->n; /* M, N and nnz of C */
626   n    = A->cmap->n + B->cmap->n;
627   nnz  = a->nz + b->nz;
628   aN   = A->cmap->n; /* N of A */
629   if (reuse == MAT_INITIAL_MATRIX) {
630     aj           = akok->j_dual.view_device();
631     bj           = bkok->j_dual.view_device();
632     auto ca_dual = MatScalarKokkosDualView("a", aa.extent(0) + ba.extent(0));
633     auto ci_dual = MatRowMapKokkosDualView("i", ai.extent(0));
634     auto cj_dual = MatColIdxKokkosDualView("j", aj.extent(0) + bj.extent(0));
635     ca           = ca_dual.view_device();
636     ci           = ci_dual.view_device();
637     cj           = cj_dual.view_device();
638 
639     /* Concatenate A and B in parallel using Kokkos hierarchical parallelism */
640     Kokkos::parallel_for(
641       Kokkos::TeamPolicy<>(m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
642         PetscInt i       = t.league_rank(); /* row i */
643         PetscInt coffset = ai(i) + bi(i), alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i);
644 
645         Kokkos::single(Kokkos::PerTeam(t), [=]() { /* this side effect only happens once per whole team */
646                                                    ci(i) = coffset;
647                                                    if (i == m - 1) ci(m) = ai(m) + bi(m);
648         });
649 
650         Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) {
651           if (k < alen) {
652             ca(coffset + k) = aa(ai(i) + k);
653             cj(coffset + k) = aj(ai(i) + k);
654           } else {
655             ca(coffset + k) = ba(bi(i) + k - alen);
656             cj(coffset + k) = bj(bi(i) + k - alen) + aN; /* Entries in B get new column indices in C */
657           }
658         });
659       });
660     ca_dual.modify_device();
661     ci_dual.modify_device();
662     cj_dual.modify_device();
663     PetscCallCXX(ckok = new Mat_SeqAIJKokkos(m, n, nnz, ci_dual, cj_dual, ca_dual));
664     PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, ckok, C));
665   } else if (reuse == MAT_REUSE_MATRIX) {
666     PetscValidHeaderSpecific(*C, MAT_CLASSID, 4);
667     PetscCheckTypeName(*C, MATSEQAIJKOKKOS);
668     ckok = static_cast<Mat_SeqAIJKokkos *>((*C)->spptr);
669     ca   = ckok->a_dual.view_device();
670     ci   = ckok->i_dual.view_device();
671 
672     Kokkos::parallel_for(
673       Kokkos::TeamPolicy<>(m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
674         PetscInt i    = t.league_rank(); /* row i */
675         PetscInt alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i);
676         Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) {
677           if (k < alen) ca(ci(i) + k) = aa(ai(i) + k);
678           else ca(ci(i) + k) = ba(bi(i) + k - alen);
679         });
680       });
681     PetscCall(MatSeqAIJKokkosModifyDevice(*C));
682   }
683   PetscFunctionReturn(0);
684 }
685 
686 static PetscErrorCode MatProductDataDestroy_SeqAIJKokkos(void *pdata)
687 {
688   PetscFunctionBegin;
689   delete static_cast<MatProductData_SeqAIJKokkos *>(pdata);
690   PetscFunctionReturn(0);
691 }
692 
693 static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C)
694 {
695   Mat_Product                 *product = C->product;
696   Mat                          A, B;
697   bool                         transA, transB; /* use bool, since KK needs this type */
698   Mat_SeqAIJKokkos            *akok, *bkok, *ckok;
699   Mat_SeqAIJ                  *c;
700   MatProductData_SeqAIJKokkos *pdata;
701   KokkosCsrMatrix             *csrmatA, *csrmatB;
702 
703   PetscFunctionBegin;
704   MatCheckProduct(C, 1);
705   PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Product data empty");
706   pdata = static_cast<MatProductData_SeqAIJKokkos *>(C->product->data);
707 
708   if (pdata->reusesym) {           /* We reached here through e.g., MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C), where symbolic/numeric are combined */
709     pdata->reusesym = PETSC_FALSE; /* So that next time when user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C), we still do numeric  */
710     PetscFunctionReturn(0);
711   }
712 
713   switch (product->type) {
714   case MATPRODUCT_AB:
715     transA = false;
716     transB = false;
717     break;
718   case MATPRODUCT_AtB:
719     transA = true;
720     transB = false;
721     break;
722   case MATPRODUCT_ABt:
723     transA = false;
724     transB = true;
725     break;
726   default:
727     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]);
728   }
729 
730   A = product->A;
731   B = product->B;
732   PetscCall(MatSeqAIJKokkosSyncDevice(A));
733   PetscCall(MatSeqAIJKokkosSyncDevice(B));
734   akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
735   bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
736   ckok = static_cast<Mat_SeqAIJKokkos *>(C->spptr);
737 
738   PetscCheck(ckok, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Device data structure spptr is empty");
739 
740   csrmatA = &akok->csrmat;
741   csrmatB = &bkok->csrmat;
742 
743   /* TODO: Once KK spgemm implements transpose, we can get rid of the explicit transpose here */
744   if (transA) {
745     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA));
746     transA = false;
747   }
748 
749   if (transB) {
750     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB));
751     transB = false;
752   }
753   PetscCall(PetscLogGpuTimeBegin());
754   PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, *csrmatA, transA, *csrmatB, transB, ckok->csrmat));
755   auto spgemmHandle = pdata->kh.get_spgemm_handle();
756   if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(ckok->csrmat)); /* without sort, mat_tests-ex62_14_seqaijkokkos fails */
757 
758   PetscCall(PetscLogGpuTimeEnd());
759   PetscCall(MatSeqAIJKokkosModifyDevice(C));
760   /* shorter version of MatAssemblyEnd_SeqAIJ */
761   c = (Mat_SeqAIJ *)C->data;
762   PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz));
763   PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n"));
764   PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax));
765   c->reallocs         = 0;
766   C->info.mallocs     = 0;
767   C->info.nz_unneeded = 0;
768   C->assembled = C->was_assembled = PETSC_TRUE;
769   C->num_ass++;
770   PetscFunctionReturn(0);
771 }
772 
773 static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C)
774 {
775   Mat_Product                 *product = C->product;
776   MatProductType               ptype;
777   Mat                          A, B;
778   bool                         transA, transB;
779   Mat_SeqAIJKokkos            *akok, *bkok, *ckok;
780   MatProductData_SeqAIJKokkos *pdata;
781   MPI_Comm                     comm;
782   KokkosCsrMatrix             *csrmatA, *csrmatB, csrmatC;
783 
784   PetscFunctionBegin;
785   MatCheckProduct(C, 1);
786   PetscCall(PetscObjectGetComm((PetscObject)C, &comm));
787   PetscCheck(!product->data, comm, PETSC_ERR_PLIB, "Product data not empty");
788   A = product->A;
789   B = product->B;
790   PetscCall(MatSeqAIJKokkosSyncDevice(A));
791   PetscCall(MatSeqAIJKokkosSyncDevice(B));
792   akok    = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
793   bkok    = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
794   csrmatA = &akok->csrmat;
795   csrmatB = &bkok->csrmat;
796 
797   ptype = product->type;
798   switch (ptype) {
799   case MATPRODUCT_AB:
800     transA = false;
801     transB = false;
802     break;
803   case MATPRODUCT_AtB:
804     transA = true;
805     transB = false;
806     break;
807   case MATPRODUCT_ABt:
808     transA = false;
809     transB = true;
810     break;
811   default:
812     SETERRQ(comm, PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]);
813   }
814 
815   product->data = pdata = new MatProductData_SeqAIJKokkos();
816   pdata->kh.set_team_work_size(16);
817   pdata->kh.set_dynamic_scheduling(true);
818   pdata->reusesym = product->api_user;
819 
820   /* TODO: add command line options to select spgemm algorithms */
821   auto spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_DEFAULT; /* default alg is TPL if enabled, otherwise KK */
822 
823   /* CUDA-10.2's spgemm has bugs. We prefer the SpGEMMreuse APIs introduced in cuda-11.4 */
824 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
825   #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
826   spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_KK;
827   #endif
828 #endif
829 
830   pdata->kh.create_spgemm_handle(spgemm_alg);
831 
832   PetscCall(PetscLogGpuTimeBegin());
833   /* TODO: Get rid of the explicit transpose once KK-spgemm implements the transpose option */
834   if (transA) {
835     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA));
836     transA = false;
837   }
838 
839   if (transB) {
840     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB));
841     transB = false;
842   }
843 
844   PetscCallCXX(KokkosSparse::spgemm_symbolic(pdata->kh, *csrmatA, transA, *csrmatB, transB, csrmatC));
845 
846   /* spgemm_symbolic() only populates C's rowmap, but not C's column indices.
847     So we have to do a fake spgemm_numeric() here to get csrmatC.j_d setup, before
848     calling new Mat_SeqAIJKokkos().
849     TODO: Remove the fake spgemm_numeric() after KK fixed this problem.
850   */
851   PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, *csrmatA, transA, *csrmatB, transB, csrmatC));
852   /* Query if KK outputs a sorted matrix. If not, we need to sort it */
853   auto spgemmHandle = pdata->kh.get_spgemm_handle();
854   if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(csrmatC)); /* sort_option defaults to -1 in KK!*/
855   PetscCall(PetscLogGpuTimeEnd());
856 
857   PetscCallCXX(ckok = new Mat_SeqAIJKokkos(csrmatC));
858   PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(C, ckok));
859   C->product->destroy = MatProductDataDestroy_SeqAIJKokkos;
860   PetscFunctionReturn(0);
861 }
862 
863 /* handles sparse matrix matrix ops */
864 static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat)
865 {
866   Mat_Product *product = mat->product;
867   PetscBool    Biskok = PETSC_FALSE, Ciskok = PETSC_TRUE;
868 
869   PetscFunctionBegin;
870   MatCheckProduct(mat, 1);
871   PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJKOKKOS, &Biskok));
872   if (product->type == MATPRODUCT_ABC) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJKOKKOS, &Ciskok));
873   if (Biskok && Ciskok) {
874     switch (product->type) {
875     case MATPRODUCT_AB:
876     case MATPRODUCT_AtB:
877     case MATPRODUCT_ABt:
878       mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos;
879       break;
880     case MATPRODUCT_PtAP:
881     case MATPRODUCT_RARt:
882     case MATPRODUCT_ABC:
883       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
884       break;
885     default:
886       break;
887     }
888   } else { /* fallback for AIJ */
889     PetscCall(MatProductSetFromOptions_SeqAIJ(mat));
890   }
891   PetscFunctionReturn(0);
892 }
893 
894 static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a)
895 {
896   Mat_SeqAIJKokkos *aijkok;
897 
898   PetscFunctionBegin;
899   PetscCall(PetscLogGpuTimeBegin());
900   PetscCall(MatSeqAIJKokkosSyncDevice(A));
901   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
902   KokkosBlas::scal(aijkok->a_dual.view_device(), a, aijkok->a_dual.view_device());
903   PetscCall(MatSeqAIJKokkosModifyDevice(A));
904   PetscCall(PetscLogGpuFlops(aijkok->a_dual.extent(0)));
905   PetscCall(PetscLogGpuTimeEnd());
906   PetscFunctionReturn(0);
907 }
908 
909 static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A)
910 {
911   Mat_SeqAIJKokkos *aijkok;
912 
913   PetscFunctionBegin;
914   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
915   if (aijkok) { /* Only zero the device if data is already there */
916     KokkosBlas::fill(aijkok->a_dual.view_device(), 0.0);
917     PetscCall(MatSeqAIJKokkosModifyDevice(A));
918   } else { /* Might be preallocated but not assembled */
919     PetscCall(MatZeroEntries_SeqAIJ(A));
920   }
921   PetscFunctionReturn(0);
922 }
923 
924 static PetscErrorCode MatGetDiagonal_SeqAIJKokkos(Mat A, Vec x)
925 {
926   Mat_SeqAIJ           *aijseq;
927   Mat_SeqAIJKokkos     *aijkok;
928   PetscInt              n;
929   PetscScalarKokkosView xv;
930 
931   PetscFunctionBegin;
932   PetscCall(VecGetLocalSize(x, &n));
933   PetscCheck(n == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector");
934   PetscCheck(A->factortype == MAT_FACTOR_NONE, PETSC_COMM_SELF, PETSC_ERR_SUP, "MatGetDiagonal_SeqAIJKokkos not supported on factored matrices");
935 
936   PetscCall(MatSeqAIJKokkosSyncDevice(A));
937   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
938 
939   if (A->rmap->n && aijkok->diag_dual.extent(0) == 0) { /* Set the diagonal pointer if not already */
940     PetscCall(MatMarkDiagonal_SeqAIJ(A));
941     aijseq = static_cast<Mat_SeqAIJ *>(A->data);
942     aijkok->SetDiagonal(aijseq->diag);
943   }
944 
945   const auto &Aa    = aijkok->a_dual.view_device();
946   const auto &Ai    = aijkok->i_dual.view_device();
947   const auto &Adiag = aijkok->diag_dual.view_device();
948 
949   PetscCall(VecGetKokkosViewWrite(x, &xv));
950   Kokkos::parallel_for(
951     n, KOKKOS_LAMBDA(const PetscInt i) {
952       if (Adiag(i) < Ai(i + 1)) xv(i) = Aa(Adiag(i));
953       else xv(i) = 0;
954     });
955   PetscCall(VecRestoreKokkosViewWrite(x, &xv));
956   PetscFunctionReturn(0);
957 }
958 
959 /* Get a Kokkos View from a mat of type MatSeqAIJKokkos */
960 PetscErrorCode MatSeqAIJGetKokkosView(Mat A, ConstMatScalarKokkosView *kv)
961 {
962   Mat_SeqAIJKokkos *aijkok;
963 
964   PetscFunctionBegin;
965   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
966   PetscValidPointer(kv, 2);
967   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
968   PetscCall(MatSeqAIJKokkosSyncDevice(A));
969   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
970   *kv    = aijkok->a_dual.view_device();
971   PetscFunctionReturn(0);
972 }
973 
974 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, ConstMatScalarKokkosView *kv)
975 {
976   PetscFunctionBegin;
977   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
978   PetscValidPointer(kv, 2);
979   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
980   PetscFunctionReturn(0);
981 }
982 
983 PetscErrorCode MatSeqAIJGetKokkosView(Mat A, MatScalarKokkosView *kv)
984 {
985   Mat_SeqAIJKokkos *aijkok;
986 
987   PetscFunctionBegin;
988   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
989   PetscValidPointer(kv, 2);
990   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
991   PetscCall(MatSeqAIJKokkosSyncDevice(A));
992   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
993   *kv    = aijkok->a_dual.view_device();
994   PetscFunctionReturn(0);
995 }
996 
997 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, MatScalarKokkosView *kv)
998 {
999   PetscFunctionBegin;
1000   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1001   PetscValidPointer(kv, 2);
1002   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1003   PetscCall(MatSeqAIJKokkosModifyDevice(A));
1004   PetscFunctionReturn(0);
1005 }
1006 
1007 PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A, MatScalarKokkosView *kv)
1008 {
1009   Mat_SeqAIJKokkos *aijkok;
1010 
1011   PetscFunctionBegin;
1012   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1013   PetscValidPointer(kv, 2);
1014   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1015   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1016   *kv    = aijkok->a_dual.view_device();
1017   PetscFunctionReturn(0);
1018 }
1019 
1020 PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A, MatScalarKokkosView *kv)
1021 {
1022   PetscFunctionBegin;
1023   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1024   PetscValidPointer(kv, 2);
1025   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1026   PetscCall(MatSeqAIJKokkosModifyDevice(A));
1027   PetscFunctionReturn(0);
1028 }
1029 
1030 /* Computes Y += alpha X */
1031 static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y, PetscScalar alpha, Mat X, MatStructure pattern)
1032 {
1033   Mat_SeqAIJ              *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data;
1034   Mat_SeqAIJKokkos        *xkok, *ykok, *zkok;
1035   ConstMatScalarKokkosView Xa;
1036   MatScalarKokkosView      Ya;
1037 
1038   PetscFunctionBegin;
1039   PetscCheckTypeName(Y, MATSEQAIJKOKKOS);
1040   PetscCheckTypeName(X, MATSEQAIJKOKKOS);
1041   PetscCall(MatSeqAIJKokkosSyncDevice(Y));
1042   PetscCall(MatSeqAIJKokkosSyncDevice(X));
1043   PetscCall(PetscLogGpuTimeBegin());
1044 
1045   if (pattern != SAME_NONZERO_PATTERN && x->nz == y->nz) {
1046     /* We could compare on device, but have to get the comparison result on host. So compare on host instead. */
1047     PetscBool e;
1048     PetscCall(PetscArraycmp(x->i, y->i, Y->rmap->n + 1, &e));
1049     if (e) {
1050       PetscCall(PetscArraycmp(x->j, y->j, y->nz, &e));
1051       if (e) pattern = SAME_NONZERO_PATTERN;
1052     }
1053   }
1054 
1055   /* cusparseDcsrgeam2() computes C = alpha A + beta B. If one knew sparsity pattern of C, one can skip
1056     cusparseScsrgeam2_bufferSizeExt() / cusparseXcsrgeam2Nnz(), and directly call cusparseScsrgeam2().
1057     If X is SUBSET_NONZERO_PATTERN of Y, we could take advantage of this cusparse feature. However,
1058     KokkosSparse::spadd(alpha,A,beta,B,C) has symbolic and numeric phases, MatAXPY does not.
1059   */
1060   ykok = static_cast<Mat_SeqAIJKokkos *>(Y->spptr);
1061   xkok = static_cast<Mat_SeqAIJKokkos *>(X->spptr);
1062   Xa   = xkok->a_dual.view_device();
1063   Ya   = ykok->a_dual.view_device();
1064 
1065   if (pattern == SAME_NONZERO_PATTERN) {
1066     KokkosBlas::axpy(alpha, Xa, Ya);
1067     PetscCall(MatSeqAIJKokkosModifyDevice(Y));
1068   } else if (pattern == SUBSET_NONZERO_PATTERN) {
1069     MatRowMapKokkosView Xi = xkok->i_dual.view_device(), Yi = ykok->i_dual.view_device();
1070     MatColIdxKokkosView Xj = xkok->j_dual.view_device(), Yj = ykok->j_dual.view_device();
1071 
1072     Kokkos::parallel_for(
1073       Kokkos::TeamPolicy<>(Y->rmap->n, 1), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
1074         PetscInt i = t.league_rank();              /* row i */
1075         Kokkos::single(Kokkos::PerTeam(t), [=]() { /* Only one thread works in a team */
1076                                                    PetscInt p, q = Yi(i);
1077                                                    for (p = Xi(i); p < Xi(i + 1); p++) {          /* For each nonzero on row i of X */
1078                                                      while (Xj(p) != Yj(q) && q < Yi(i + 1)) q++; /* find the matching nonzero on row i of Y */
1079                                                      if (Xj(p) == Yj(q)) {                        /* Found it */
1080                                                        Ya(q) += alpha * Xa(p);
1081                                                        q++;
1082                                                      } else {
1083                                                        /* If not found, it indicates the input is wrong (X is not a SUBSET_NONZERO_PATTERN of Y).
1084                Just insert a NaN at the beginning of row i if it is not empty, to make the result wrong.
1085             */
1086                                                        if (Yi(i) != Yi(i + 1))
1087                                                          Ya(Yi(i)) =
1088 #if PETSC_PKG_KOKKOS_VERSION_GE(3, 6, 99)
1089                                                            Kokkos::nan("1"); /* auto promote the double NaN if needed */
1090 #else
1091               Kokkos::Experimental::nan("1");
1092 #endif
1093                                                      }
1094                                                    }
1095         });
1096       });
1097     PetscCall(MatSeqAIJKokkosModifyDevice(Y));
1098   } else { /* different nonzero patterns */
1099     Mat             Z;
1100     KokkosCsrMatrix zcsr;
1101     KernelHandle    kh;
1102     kh.create_spadd_handle(false);
1103     KokkosSparse::spadd_symbolic(&kh, xkok->csrmat, ykok->csrmat, zcsr);
1104     KokkosSparse::spadd_numeric(&kh, alpha, xkok->csrmat, (PetscScalar)1.0, ykok->csrmat, zcsr);
1105     zkok = new Mat_SeqAIJKokkos(zcsr);
1106     PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, zkok, &Z));
1107     PetscCall(MatHeaderReplace(Y, &Z));
1108     kh.destroy_spadd_handle();
1109   }
1110   PetscCall(PetscLogGpuTimeEnd());
1111   PetscCall(PetscLogGpuFlops(xkok->a_dual.extent(0) * 2)); /* Because we scaled X and then added it to Y */
1112   PetscFunctionReturn(0);
1113 }
1114 
1115 static PetscErrorCode MatSetPreallocationCOO_SeqAIJKokkos(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
1116 {
1117   Mat_SeqAIJKokkos *akok;
1118   Mat_SeqAIJ       *aseq;
1119 
1120   PetscFunctionBegin;
1121   PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, coo_i, coo_j));
1122   aseq = static_cast<Mat_SeqAIJ *>(mat->data);
1123   akok = static_cast<Mat_SeqAIJKokkos *>(mat->spptr);
1124   delete akok;
1125   mat->spptr = akok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, aseq->nz, aseq->i, aseq->j, aseq->a, mat->nonzerostate + 1, PETSC_FALSE);
1126   PetscCall(MatZeroEntries_SeqAIJKokkos(mat));
1127   akok->SetUpCOO(aseq);
1128   PetscFunctionReturn(0);
1129 }
1130 
1131 static PetscErrorCode MatSetValuesCOO_SeqAIJKokkos(Mat A, const PetscScalar v[], InsertMode imode)
1132 {
1133   Mat_SeqAIJ                 *aseq = static_cast<Mat_SeqAIJ *>(A->data);
1134   Mat_SeqAIJKokkos           *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1135   PetscCount                  Annz = aseq->nz;
1136   const PetscCountKokkosView &jmap = akok->jmap_d;
1137   const PetscCountKokkosView &perm = akok->perm_d;
1138   MatScalarKokkosView         Aa;
1139   ConstMatScalarKokkosView    kv;
1140   PetscMemType                memtype;
1141 
1142   PetscFunctionBegin;
1143   PetscCall(PetscGetMemType(v, &memtype));
1144   if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */
1145     kv = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), ConstMatScalarKokkosViewHost(v, aseq->coo_n));
1146   } else {
1147     kv = ConstMatScalarKokkosView(v, aseq->coo_n); /* Directly use v[]'s memory */
1148   }
1149 
1150   if (imode == INSERT_VALUES) PetscCall(MatSeqAIJGetKokkosViewWrite(A, &Aa)); /* write matrix values */
1151   else PetscCall(MatSeqAIJGetKokkosView(A, &Aa));                             /* read & write matrix values */
1152 
1153   Kokkos::parallel_for(
1154     Annz, KOKKOS_LAMBDA(const PetscCount i) {
1155       PetscScalar sum = 0.0;
1156       for (PetscCount k = jmap(i); k < jmap(i + 1); k++) sum += kv(perm(k));
1157       Aa(i) = (imode == INSERT_VALUES ? 0.0 : Aa(i)) + sum;
1158     });
1159 
1160   if (imode == INSERT_VALUES) PetscCall(MatSeqAIJRestoreKokkosViewWrite(A, &Aa));
1161   else PetscCall(MatSeqAIJRestoreKokkosView(A, &Aa));
1162   PetscFunctionReturn(0);
1163 }
1164 
1165 PETSC_INTERN PetscErrorCode MatSeqAIJMoveDiagonalValuesFront_SeqAIJKokkos(Mat A, const PetscInt *diag)
1166 {
1167   Mat_SeqAIJKokkos          *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1168   MatScalarKokkosView        Aa;
1169   const MatRowMapKokkosView &Ai = akok->i_dual.view_device();
1170   PetscInt                   m  = A->rmap->n;
1171   ConstMatRowMapKokkosView   Adiag(diag, m); /* diag is a device pointer */
1172 
1173   PetscFunctionBegin;
1174   PetscCall(MatSeqAIJGetKokkosViewWrite(A, &Aa));
1175   Kokkos::parallel_for(
1176     m, KOKKOS_LAMBDA(const PetscInt i) {
1177       PetscScalar tmp;
1178       if (Adiag(i) >= Ai(i) && Adiag(i) < Ai(i + 1)) { /* The diagonal element exists */
1179         tmp          = Aa(Ai(i));
1180         Aa(Ai(i))    = Aa(Adiag(i));
1181         Aa(Adiag(i)) = tmp;
1182       }
1183     });
1184   PetscCall(MatSeqAIJRestoreKokkosViewWrite(A, &Aa));
1185   PetscFunctionReturn(0);
1186 }
1187 
1188 static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info)
1189 {
1190   PetscFunctionBegin;
1191   PetscCall(MatSeqAIJKokkosSyncHost(A));
1192   PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info));
1193   B->offloadmask = PETSC_OFFLOAD_CPU;
1194   PetscFunctionReturn(0);
1195 }
1196 
1197 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A)
1198 {
1199   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
1200 
1201   PetscFunctionBegin;
1202   A->offloadmask = PETSC_OFFLOAD_KOKKOS; /* We do not really use this flag */
1203   A->boundtocpu  = PETSC_FALSE;
1204 
1205   A->ops->assemblyend               = MatAssemblyEnd_SeqAIJKokkos;
1206   A->ops->destroy                   = MatDestroy_SeqAIJKokkos;
1207   A->ops->duplicate                 = MatDuplicate_SeqAIJKokkos;
1208   A->ops->axpy                      = MatAXPY_SeqAIJKokkos;
1209   A->ops->scale                     = MatScale_SeqAIJKokkos;
1210   A->ops->zeroentries               = MatZeroEntries_SeqAIJKokkos;
1211   A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJKokkos;
1212   A->ops->mult                      = MatMult_SeqAIJKokkos;
1213   A->ops->multadd                   = MatMultAdd_SeqAIJKokkos;
1214   A->ops->multtranspose             = MatMultTranspose_SeqAIJKokkos;
1215   A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJKokkos;
1216   A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJKokkos;
1217   A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos;
1218   A->ops->productnumeric            = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos;
1219   A->ops->transpose                 = MatTranspose_SeqAIJKokkos;
1220   A->ops->setoption                 = MatSetOption_SeqAIJKokkos;
1221   A->ops->getdiagonal               = MatGetDiagonal_SeqAIJKokkos;
1222   a->ops->getarray                  = MatSeqAIJGetArray_SeqAIJKokkos;
1223   a->ops->restorearray              = MatSeqAIJRestoreArray_SeqAIJKokkos;
1224   a->ops->getarrayread              = MatSeqAIJGetArrayRead_SeqAIJKokkos;
1225   a->ops->restorearrayread          = MatSeqAIJRestoreArrayRead_SeqAIJKokkos;
1226   a->ops->getarraywrite             = MatSeqAIJGetArrayWrite_SeqAIJKokkos;
1227   a->ops->restorearraywrite         = MatSeqAIJRestoreArrayWrite_SeqAIJKokkos;
1228   a->ops->getcsrandmemtype          = MatSeqAIJGetCSRAndMemType_SeqAIJKokkos;
1229 
1230   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJKokkos));
1231   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJKokkos));
1232   PetscFunctionReturn(0);
1233 }
1234 
1235 PETSC_INTERN PetscErrorCode MatSetSeqAIJKokkosWithCSRMatrix(Mat A, Mat_SeqAIJKokkos *akok)
1236 {
1237   Mat_SeqAIJ *aseq;
1238   PetscInt    i, m, n;
1239 
1240   PetscFunctionBegin;
1241   PetscCheck(!A->spptr, PETSC_COMM_SELF, PETSC_ERR_PLIB, "A->spptr is supposed to be empty");
1242 
1243   m = akok->nrows();
1244   n = akok->ncols();
1245   PetscCall(MatSetSizes(A, m, n, m, n));
1246   PetscCall(MatSetType(A, MATSEQAIJKOKKOS));
1247 
1248   /* Set up data structures of A as a MATSEQAIJ */
1249   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(A, MAT_SKIP_ALLOCATION, NULL));
1250   aseq = (Mat_SeqAIJ *)(A)->data;
1251 
1252   akok->i_dual.sync_host(); /* We always need sync'ed i, j on host */
1253   akok->j_dual.sync_host();
1254 
1255   aseq->i            = akok->i_host_data();
1256   aseq->j            = akok->j_host_data();
1257   aseq->a            = akok->a_host_data();
1258   aseq->nonew        = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/
1259   aseq->singlemalloc = PETSC_FALSE;
1260   aseq->free_a       = PETSC_FALSE;
1261   aseq->free_ij      = PETSC_FALSE;
1262   aseq->nz           = akok->nnz();
1263   aseq->maxnz        = aseq->nz;
1264 
1265   PetscCall(PetscMalloc1(m, &aseq->imax));
1266   PetscCall(PetscMalloc1(m, &aseq->ilen));
1267   for (i = 0; i < m; i++) aseq->ilen[i] = aseq->imax[i] = aseq->i[i + 1] - aseq->i[i];
1268 
1269   /* It is critical to set the nonzerostate, as we use it to check if sparsity pattern (hence data) has changed on host in MatAssemblyEnd */
1270   akok->nonzerostate = A->nonzerostate;
1271   A->spptr           = akok; /* Set A->spptr before MatAssembly so that A->spptr won't be allocated again there */
1272   PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY));
1273   PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY));
1274   PetscFunctionReturn(0);
1275 }
1276 
1277 /* Crete a SEQAIJKOKKOS matrix with a Mat_SeqAIJKokkos data structure
1278 
1279    Note we have names like MatSeqAIJSetPreallocationCSR, so I use capitalized CSR
1280  */
1281 PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithCSRMatrix(MPI_Comm comm, Mat_SeqAIJKokkos *akok, Mat *A)
1282 {
1283   PetscFunctionBegin;
1284   PetscCall(MatCreate(comm, A));
1285   PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok));
1286   PetscFunctionReturn(0);
1287 }
1288 
1289 /* --------------------------------------------------------------------------------*/
1290 /*@C
1291    MatCreateSeqAIJKokkos - Creates a sparse matrix in `MATSEQAIJKOKKOS` (compressed row) format
1292    (the default parallel PETSc format). This matrix will ultimately be handled by
1293    Kokkos for calculations. For good matrix
1294    assembly performance the user should preallocate the matrix storage by setting
1295    the parameter nz (or the array nnz).  By setting these parameters accurately,
1296    performance during matrix assembly can be increased by more than a factor of 50.
1297 
1298    Collective
1299 
1300    Input Parameters:
1301 +  comm - MPI communicator, set to `PETSC_COMM_SELF`
1302 .  m - number of rows
1303 .  n - number of columns
1304 .  nz - number of nonzeros per row (same for all rows)
1305 -  nnz - array containing the number of nonzeros in the various rows
1306          (possibly different for each row) or NULL
1307 
1308    Output Parameter:
1309 .  A - the matrix
1310 
1311    It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
1312    MatXXXXSetPreallocation() paradgm instead of this routine directly.
1313    [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]
1314 
1315    Notes:
1316    If nnz is given then nz is ignored
1317 
1318    The AIJ format, also called
1319    compressed row storage, is fully compatible with standard Fortran 77
1320    storage.  That is, the stored row and column indices can begin at
1321    either one (as in Fortran) or zero.  See the users' manual for details.
1322 
1323    Specify the preallocated storage with either nz or nnz (not both).
1324    Set nz = `PETSC_DEFAULT` and nnz = NULL for PETSc to control dynamic memory
1325    allocation.  For large problems you MUST preallocate memory or you
1326    will get TERRIBLE performance, see the users' manual chapter on matrices.
1327 
1328    By default, this format uses inodes (identical nodes) when possible, to
1329    improve numerical efficiency of matrix-vector products and solves. We
1330    search for consecutive rows with the same nonzero structure, thereby
1331    reusing matrix information to achieve increased efficiency.
1332 
1333    Level: intermediate
1334 
1335 .seealso: `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MatCreateAIJ()`
1336 @*/
1337 PetscErrorCode MatCreateSeqAIJKokkos(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A)
1338 {
1339   PetscFunctionBegin;
1340   PetscCall(PetscKokkosInitializeCheck());
1341   PetscCall(MatCreate(comm, A));
1342   PetscCall(MatSetSizes(*A, m, n, m, n));
1343   PetscCall(MatSetType(*A, MATSEQAIJKOKKOS));
1344   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz));
1345   PetscFunctionReturn(0);
1346 }
1347 
1348 typedef Kokkos::TeamPolicy<>::member_type team_member;
1349 //
1350 // This factorization exploits block diagonal matrices with "Nf" (not used).
1351 // Use -pc_factor_mat_ordering_type rcm to order decouple blocks of size N/Nf for this optimization
1352 //
1353 static PetscErrorCode MatLUFactorNumeric_SeqAIJKOKKOSDEVICE(Mat B, Mat A, const MatFactorInfo *info)
1354 {
1355   Mat_SeqAIJ       *b      = (Mat_SeqAIJ *)B->data;
1356   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr), *baijkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
1357   IS                isrow = b->row, isicol = b->icol;
1358   const PetscInt   *r_h, *ic_h;
1359   const PetscInt n = A->rmap->n, *ai_d = aijkok->i_dual.view_device().data(), *aj_d = aijkok->j_dual.view_device().data(), *bi_d = baijkok->i_dual.view_device().data(), *bj_d = baijkok->j_dual.view_device().data(), *bdiag_d = baijkok->diag_d.data();
1360   const PetscScalar *aa_d = aijkok->a_dual.view_device().data();
1361   PetscScalar       *ba_d = baijkok->a_dual.view_device().data();
1362   PetscBool          row_identity, col_identity;
1363   PetscInt           nc, Nf = 1, nVec = 32; // should be a parameter, Nf is batch size - not used
1364 
1365   PetscFunctionBegin;
1366   PetscCheck(A->rmap->n == n, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "square matrices only supported %" PetscInt_FMT " %" PetscInt_FMT, A->rmap->n, n);
1367   PetscCall(MatIsStructurallySymmetric(A, &row_identity));
1368   PetscCheck(row_identity, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "structurally symmetric matrices only supported");
1369   PetscCall(ISGetIndices(isrow, &r_h));
1370   PetscCall(ISGetIndices(isicol, &ic_h));
1371   PetscCall(ISGetSize(isicol, &nc));
1372   PetscCall(PetscLogGpuTimeBegin());
1373   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1374   {
1375 #define KOKKOS_SHARED_LEVEL 1
1376     using scr_mem_t    = Kokkos::DefaultExecutionSpace::scratch_memory_space;
1377     using sizet_scr_t  = Kokkos::View<size_t, scr_mem_t>;
1378     using scalar_scr_t = Kokkos::View<PetscScalar, scr_mem_t>;
1379     const Kokkos::View<const PetscInt *, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged>> h_r_k(r_h, n);
1380     Kokkos::View<PetscInt *, Kokkos::LayoutLeft>                                                                         d_r_k("r", n);
1381     const Kokkos::View<const PetscInt *, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged>> h_ic_k(ic_h, nc);
1382     Kokkos::View<PetscInt *, Kokkos::LayoutLeft>                                                                         d_ic_k("ic", nc);
1383     size_t                                                                                                               flops_h = 0.0;
1384     Kokkos::View<size_t, Kokkos::HostSpace>                                                                              h_flops_k(&flops_h);
1385     Kokkos::View<size_t>                                                                                                 d_flops_k("flops");
1386     const int                                                                                                            conc = Kokkos::DefaultExecutionSpace().concurrency(), team_size = conc > 1 ? 16 : 1; // 8*32 = 256
1387     const int                                                                                                            nloc = n / Nf, Ni = (conc > 8) ? 1 /* some intelegent number of SMs -- but need league_barrier */ : 1;
1388     Kokkos::deep_copy(d_flops_k, h_flops_k);
1389     Kokkos::deep_copy(d_r_k, h_r_k);
1390     Kokkos::deep_copy(d_ic_k, h_ic_k);
1391     // Fill A --> fact
1392     Kokkos::parallel_for(
1393       Kokkos::TeamPolicy<>(Nf * Ni, team_size, nVec), KOKKOS_LAMBDA(const team_member team) {
1394         const PetscInt  field = team.league_rank() / Ni, field_block = team.league_rank() % Ni; // use grid.x/y in CUDA
1395         const PetscInt  nloc_i = (nloc / Ni + !!(nloc % Ni)), start_i = field * nloc + field_block * nloc_i, end_i = (start_i + nloc_i) > (field + 1) * nloc ? (field + 1) * nloc : (start_i + nloc_i);
1396         const PetscInt *ic = d_ic_k.data(), *r = d_r_k.data();
1397         // zero rows of B
1398         Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=](const int &rowb) {
1399           PetscInt     nzbL = bi_d[rowb + 1] - bi_d[rowb], nzbU = bdiag_d[rowb] - bdiag_d[rowb + 1]; // with diag
1400           PetscScalar *baL = ba_d + bi_d[rowb];
1401           PetscScalar *baU = ba_d + bdiag_d[rowb + 1] + 1;
1402           /* zero (unfactored row) */
1403           for (int j = 0; j < nzbL; j++) baL[j] = 0;
1404           for (int j = 0; j < nzbU; j++) baU[j] = 0;
1405         });
1406         // copy A into B
1407         Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=](const int &rowb) {
1408           PetscInt           rowa = r[rowb], nza = ai_d[rowa + 1] - ai_d[rowa];
1409           const PetscScalar *av    = aa_d + ai_d[rowa];
1410           const PetscInt    *ajtmp = aj_d + ai_d[rowa];
1411           /* load in initial (unfactored row) */
1412           for (int j = 0; j < nza; j++) {
1413             PetscInt    colb = ic[ajtmp[j]];
1414             PetscScalar vala = av[j];
1415             if (colb == rowb) {
1416               *(ba_d + bdiag_d[rowb]) = vala;
1417             } else {
1418               const PetscInt *pbj = bj_d + ((colb > rowb) ? bdiag_d[rowb + 1] + 1 : bi_d[rowb]);
1419               PetscScalar    *pba = ba_d + ((colb > rowb) ? bdiag_d[rowb + 1] + 1 : bi_d[rowb]);
1420               PetscInt        nz = (colb > rowb) ? bdiag_d[rowb] - (bdiag_d[rowb + 1] + 1) : bi_d[rowb + 1] - bi_d[rowb], set = 0;
1421               for (int j = 0; j < nz; j++) {
1422                 if (pbj[j] == colb) {
1423                   pba[j] = vala;
1424                   set++;
1425                   break;
1426                 }
1427               }
1428 #if !defined(PETSC_HAVE_SYCL)
1429               if (set != 1) printf("\t\t\t ERROR DID NOT SET ?????\n");
1430 #endif
1431             }
1432           }
1433         });
1434       });
1435     Kokkos::fence();
1436 
1437     Kokkos::parallel_for(
1438       Kokkos::TeamPolicy<>(Nf * Ni, team_size, nVec).set_scratch_size(KOKKOS_SHARED_LEVEL, Kokkos::PerThread(sizet_scr_t::shmem_size() + scalar_scr_t::shmem_size()), Kokkos::PerTeam(sizet_scr_t::shmem_size())), KOKKOS_LAMBDA(const team_member team) {
1439         sizet_scr_t    colkIdx(team.thread_scratch(KOKKOS_SHARED_LEVEL));
1440         scalar_scr_t   L_ki(team.thread_scratch(KOKKOS_SHARED_LEVEL));
1441         sizet_scr_t    flops(team.team_scratch(KOKKOS_SHARED_LEVEL));
1442         const PetscInt field = team.league_rank() / Ni, field_block_idx = team.league_rank() % Ni; // use grid.x/y in CUDA
1443         const PetscInt start = field * nloc, end = start + nloc;
1444         Kokkos::single(Kokkos::PerTeam(team), [=]() { flops() = 0; });
1445         // A22 panel update for each row A(1,:) and col A(:,1)
1446         for (int ii = start; ii < end - 1; ii++) {
1447           const PetscInt    *bjUi = bj_d + bdiag_d[ii + 1] + 1, nzUi = bdiag_d[ii] - (bdiag_d[ii + 1] + 1); // vector, and vector size, of column indices of U(i,(i+1):end)
1448           const PetscScalar *baUi    = ba_d + bdiag_d[ii + 1] + 1;                                          // vector of data  U(i,i+1:end)
1449           const PetscInt     nUi_its = nzUi / Ni + !!(nzUi % Ni);
1450           const PetscScalar  Bii     = *(ba_d + bdiag_d[ii]); // diagonal in its special place
1451           Kokkos::parallel_for(Kokkos::TeamThreadRange(team, nUi_its), [=](const int j) {
1452             PetscInt kIdx = j * Ni + field_block_idx;
1453             if (kIdx >= nzUi) /* void */
1454               ;
1455             else {
1456               const PetscInt  myk = bjUi[kIdx];                // assume symmetric structure, need a transposed meta-data here in general
1457               const PetscInt *pjL = bj_d + bi_d[myk];          // look for L(myk,ii) in start of row
1458               const PetscInt  nzL = bi_d[myk + 1] - bi_d[myk]; // size of L_k(:)
1459               size_t          st_idx;
1460               // find and do L(k,i) = A(:k,i) / A(i,i)
1461               Kokkos::single(Kokkos::PerThread(team), [&]() { colkIdx() = PETSC_MAX_INT; });
1462               // get column, there has got to be a better way
1463               Kokkos::parallel_reduce(
1464                 Kokkos::ThreadVectorRange(team, nzL),
1465                 [&](const int &j, size_t &idx) {
1466                   if (pjL[j] == ii) {
1467                     PetscScalar *pLki = ba_d + bi_d[myk] + j;
1468                     idx               = j;           // output
1469                     *pLki             = *pLki / Bii; // column scaling:  L(k,i) = A(:k,i) / A(i,i)
1470                   }
1471                 },
1472                 st_idx);
1473               Kokkos::single(Kokkos::PerThread(team), [=]() {
1474                 colkIdx() = st_idx;
1475                 L_ki()    = *(ba_d + bi_d[myk] + st_idx);
1476               });
1477 #if defined(PETSC_USE_DEBUG) && !defined(PETSC_HAVE_SYCL)
1478               if (colkIdx() == PETSC_MAX_INT) printf("\t\t\t\t\t\t\tERROR: failed to find L_ki(%d,%d)\n", (int)myk, ii); // uses a register
1479 #endif
1480               // active row k, do  A_kj -= Lki * U_ij; j \in U(i,:) j != i
1481               // U(i+1,:end)
1482               Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, nzUi), [=](const int &uiIdx) { // index into i (U)
1483                 PetscScalar Uij = baUi[uiIdx];
1484                 PetscInt    col = bjUi[uiIdx];
1485                 if (col == myk) {
1486                   // A_kk = A_kk - L_ki * U_ij(k)
1487                   PetscScalar *Akkv = (ba_d + bdiag_d[myk]); // diagonal in its special place
1488                   *Akkv             = *Akkv - L_ki() * Uij;  // UiK
1489                 } else {
1490                   PetscScalar    *start, *end, *pAkjv = NULL;
1491                   PetscInt        high, low;
1492                   const PetscInt *startj;
1493                   if (col < myk) { // L
1494                     PetscScalar *pLki = ba_d + bi_d[myk] + colkIdx();
1495                     PetscInt     idx  = (pLki + 1) - (ba_d + bi_d[myk]); // index into row
1496                     start             = pLki + 1;                        // start at pLki+1, A22(myk,1)
1497                     startj            = bj_d + bi_d[myk] + idx;
1498                     end               = ba_d + bi_d[myk + 1];
1499                   } else {
1500                     PetscInt idx = bdiag_d[myk + 1] + 1;
1501                     start        = ba_d + idx;
1502                     startj       = bj_d + idx;
1503                     end          = ba_d + bdiag_d[myk];
1504                   }
1505                   // search for 'col', use bisection search - TODO
1506                   low  = 0;
1507                   high = (PetscInt)(end - start);
1508                   while (high - low > 5) {
1509                     int t = (low + high) / 2;
1510                     if (startj[t] > col) high = t;
1511                     else low = t;
1512                   }
1513                   for (pAkjv = start + low; pAkjv < start + high; pAkjv++) {
1514                     if (startj[pAkjv - start] == col) break;
1515                   }
1516 #if defined(PETSC_USE_DEBUG) && !defined(PETSC_HAVE_SYCL)
1517                   if (pAkjv == start + high) printf("\t\t\t\t\t\t\t\t\t\t\tERROR: *** failed to find Akj(%d,%d)\n", (int)myk, (int)col); // uses a register
1518 #endif
1519                   *pAkjv = *pAkjv - L_ki() * Uij; // A_kj = A_kj - L_ki * U_ij
1520                 }
1521               });
1522             }
1523           });
1524           team.team_barrier(); // this needs to be a league barrier to use more that one SM per block
1525           if (field_block_idx == 0) Kokkos::single(Kokkos::PerTeam(team), [&]() { Kokkos::atomic_add(flops.data(), (size_t)(2 * (nzUi * nzUi) + 2)); });
1526         } /* endof for (i=0; i<n; i++) { */
1527         Kokkos::single(Kokkos::PerTeam(team), [=]() {
1528           Kokkos::atomic_add(&d_flops_k(), flops());
1529           flops() = 0;
1530         });
1531       });
1532     Kokkos::fence();
1533     Kokkos::deep_copy(h_flops_k, d_flops_k);
1534     PetscCall(PetscLogGpuFlops((PetscLogDouble)h_flops_k()));
1535     Kokkos::parallel_for(
1536       Kokkos::TeamPolicy<>(Nf * Ni, 1, 256), KOKKOS_LAMBDA(const team_member team) {
1537         const PetscInt lg_rank = team.league_rank(), field = lg_rank / Ni;                            //, field_offset = lg_rank%Ni;
1538         const PetscInt start = field * nloc, end = start + nloc, n_its = (nloc / Ni + !!(nloc % Ni)); // 1/Ni iters
1539         /* Invert diagonal for simpler triangular solves */
1540         Kokkos::parallel_for(Kokkos::TeamVectorRange(team, n_its), [=](int outer_index) {
1541           int i = start + outer_index * Ni + lg_rank % Ni;
1542           if (i < end) {
1543             PetscScalar *pv = ba_d + bdiag_d[i];
1544             *pv             = 1.0 / (*pv);
1545           }
1546         });
1547       });
1548   }
1549   PetscCall(PetscLogGpuTimeEnd());
1550   PetscCall(ISRestoreIndices(isicol, &ic_h));
1551   PetscCall(ISRestoreIndices(isrow, &r_h));
1552 
1553   PetscCall(ISIdentity(isrow, &row_identity));
1554   PetscCall(ISIdentity(isicol, &col_identity));
1555   if (b->inode.size) {
1556     B->ops->solve = MatSolve_SeqAIJ_Inode;
1557   } else if (row_identity && col_identity) {
1558     B->ops->solve = MatSolve_SeqAIJ_NaturalOrdering;
1559   } else {
1560     B->ops->solve = MatSolve_SeqAIJ; // at least this needs to be in Kokkos
1561   }
1562   B->offloadmask = PETSC_OFFLOAD_GPU;
1563   PetscCall(MatSeqAIJKokkosSyncHost(B));          // solve on CPU
1564   B->ops->solveadd          = MatSolveAdd_SeqAIJ; // and this
1565   B->ops->solvetranspose    = MatSolveTranspose_SeqAIJ;
1566   B->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ;
1567   B->ops->matsolve          = MatMatSolve_SeqAIJ;
1568   B->assembled              = PETSC_TRUE;
1569   B->preallocated           = PETSC_TRUE;
1570 
1571   PetscFunctionReturn(0);
1572 }
1573 
1574 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
1575 {
1576   PetscFunctionBegin;
1577   PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
1578   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos;
1579   PetscFunctionReturn(0);
1580 }
1581 
1582 static PetscErrorCode MatSeqAIJKokkosSymbolicSolveCheck(Mat A)
1583 {
1584   Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1585 
1586   PetscFunctionBegin;
1587   if (!factors->sptrsv_symbolic_completed) {
1588     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khU, factors->iU_d, factors->jU_d, factors->aU_d);
1589     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khL, factors->iL_d, factors->jL_d, factors->aL_d);
1590     factors->sptrsv_symbolic_completed = PETSC_TRUE;
1591   }
1592   PetscFunctionReturn(0);
1593 }
1594 
1595 /* Check if we need to update factors etc for transpose solve */
1596 static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A)
1597 {
1598   Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1599   MatColIdxType               n       = A->rmap->n;
1600 
1601   PetscFunctionBegin;
1602   if (!factors->transpose_updated) { /* TODO: KK needs to provide functions to do numeric transpose only */
1603     /* Update L^T and do sptrsv symbolic */
1604     factors->iLt_d = MatRowMapKokkosView("factors->iLt_d", n + 1);
1605     Kokkos::deep_copy(factors->iLt_d, 0); /* KK requires 0 */
1606     factors->jLt_d = MatColIdxKokkosView("factors->jLt_d", factors->jL_d.extent(0));
1607     factors->aLt_d = MatScalarKokkosView("factors->aLt_d", factors->aL_d.extent(0));
1608 
1609     transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iL_d, factors->jL_d, factors->aL_d,
1610                                                                                                                                                                                                               factors->iLt_d, factors->jLt_d, factors->aLt_d);
1611 
1612     /* TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices.
1613       We have to sort the indices, until KK provides finer control options.
1614     */
1615     sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iLt_d, factors->jLt_d, factors->aLt_d);
1616 
1617     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d);
1618 
1619     /* Update U^T and do sptrsv symbolic */
1620     factors->iUt_d = MatRowMapKokkosView("factors->iUt_d", n + 1);
1621     Kokkos::deep_copy(factors->iUt_d, 0); /* KK requires 0 */
1622     factors->jUt_d = MatColIdxKokkosView("factors->jUt_d", factors->jU_d.extent(0));
1623     factors->aUt_d = MatScalarKokkosView("factors->aUt_d", factors->aU_d.extent(0));
1624 
1625     transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iU_d, factors->jU_d, factors->aU_d,
1626                                                                                                                                                                                                               factors->iUt_d, factors->jUt_d, factors->aUt_d);
1627 
1628     /* Sort indices. See comments above */
1629     sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iUt_d, factors->jUt_d, factors->aUt_d);
1630 
1631     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d);
1632     factors->transpose_updated = PETSC_TRUE;
1633   }
1634   PetscFunctionReturn(0);
1635 }
1636 
1637 /* Solve Ax = b, with A = LU */
1638 static PetscErrorCode MatSolve_SeqAIJKokkos(Mat A, Vec b, Vec x)
1639 {
1640   ConstPetscScalarKokkosView  bv;
1641   PetscScalarKokkosView       xv;
1642   Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1643 
1644   PetscFunctionBegin;
1645   PetscCall(PetscLogGpuTimeBegin());
1646   PetscCall(MatSeqAIJKokkosSymbolicSolveCheck(A));
1647   PetscCall(VecGetKokkosView(b, &bv));
1648   PetscCall(VecGetKokkosViewWrite(x, &xv));
1649   /* Solve L tmpv = b */
1650   PetscCallCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khL, factors->iL_d, factors->jL_d, factors->aL_d, bv, factors->workVector));
1651   /* Solve Ux = tmpv */
1652   PetscCallCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khU, factors->iU_d, factors->jU_d, factors->aU_d, factors->workVector, xv));
1653   PetscCall(VecRestoreKokkosView(b, &bv));
1654   PetscCall(VecRestoreKokkosViewWrite(x, &xv));
1655   PetscCall(PetscLogGpuTimeEnd());
1656   PetscFunctionReturn(0);
1657 }
1658 
1659 /* Solve A^T x = b, where A^T = U^T L^T */
1660 static PetscErrorCode MatSolveTranspose_SeqAIJKokkos(Mat A, Vec b, Vec x)
1661 {
1662   ConstPetscScalarKokkosView  bv;
1663   PetscScalarKokkosView       xv;
1664   Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1665 
1666   PetscFunctionBegin;
1667   PetscCall(PetscLogGpuTimeBegin());
1668   PetscCall(MatSeqAIJKokkosTransposeSolveCheck(A));
1669   PetscCall(VecGetKokkosView(b, &bv));
1670   PetscCall(VecGetKokkosViewWrite(x, &xv));
1671   /* Solve U^T tmpv = b */
1672   KokkosSparse::Experimental::sptrsv_solve(&factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d, bv, factors->workVector);
1673 
1674   /* Solve L^T x = tmpv */
1675   KokkosSparse::Experimental::sptrsv_solve(&factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d, factors->workVector, xv);
1676   PetscCall(VecRestoreKokkosView(b, &bv));
1677   PetscCall(VecRestoreKokkosViewWrite(x, &xv));
1678   PetscCall(PetscLogGpuTimeEnd());
1679   PetscFunctionReturn(0);
1680 }
1681 
1682 static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info)
1683 {
1684   Mat_SeqAIJKokkos           *aijkok   = (Mat_SeqAIJKokkos *)A->spptr;
1685   Mat_SeqAIJKokkosTriFactors *factors  = (Mat_SeqAIJKokkosTriFactors *)B->spptr;
1686   PetscInt                    fill_lev = info->levels;
1687 
1688   PetscFunctionBegin;
1689   PetscCall(PetscLogGpuTimeBegin());
1690   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1691 
1692   auto a_d = aijkok->a_dual.view_device();
1693   auto i_d = aijkok->i_dual.view_device();
1694   auto j_d = aijkok->j_dual.view_device();
1695 
1696   KokkosSparse::Experimental::spiluk_numeric(&factors->kh, fill_lev, i_d, j_d, a_d, factors->iL_d, factors->jL_d, factors->aL_d, factors->iU_d, factors->jU_d, factors->aU_d);
1697 
1698   B->assembled              = PETSC_TRUE;
1699   B->preallocated           = PETSC_TRUE;
1700   B->ops->solve             = MatSolve_SeqAIJKokkos;
1701   B->ops->solvetranspose    = MatSolveTranspose_SeqAIJKokkos;
1702   B->ops->matsolve          = NULL;
1703   B->ops->matsolvetranspose = NULL;
1704   B->offloadmask            = PETSC_OFFLOAD_GPU;
1705 
1706   /* Once the factors' value changed, we need to update their transpose and sptrsv handle */
1707   factors->transpose_updated         = PETSC_FALSE;
1708   factors->sptrsv_symbolic_completed = PETSC_FALSE;
1709   /* TODO: log flops, but how to know that? */
1710   PetscCall(PetscLogGpuTimeEnd());
1711   PetscFunctionReturn(0);
1712 }
1713 
1714 static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
1715 {
1716   Mat_SeqAIJKokkos           *aijkok;
1717   Mat_SeqAIJ                 *b;
1718   Mat_SeqAIJKokkosTriFactors *factors  = (Mat_SeqAIJKokkosTriFactors *)B->spptr;
1719   PetscInt                    fill_lev = info->levels;
1720   PetscInt                    nnzA     = ((Mat_SeqAIJ *)A->data)->nz, nnzL, nnzU;
1721   PetscInt                    n        = A->rmap->n;
1722 
1723   PetscFunctionBegin;
1724   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1725   /* Rebuild factors */
1726   if (factors) {
1727     factors->Destroy();
1728   } /* Destroy the old if it exists */
1729   else {
1730     B->spptr = factors = new Mat_SeqAIJKokkosTriFactors(n);
1731   }
1732 
1733   /* Create a spiluk handle and then do symbolic factorization */
1734   nnzL = nnzU = PetscRealIntMultTruncate(info->fill, nnzA);
1735   factors->kh.create_spiluk_handle(KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1, n, nnzL, nnzU);
1736 
1737   auto spiluk_handle = factors->kh.get_spiluk_handle();
1738 
1739   Kokkos::realloc(factors->iL_d, n + 1); /* Free old arrays and realloc */
1740   Kokkos::realloc(factors->jL_d, spiluk_handle->get_nnzL());
1741   Kokkos::realloc(factors->iU_d, n + 1);
1742   Kokkos::realloc(factors->jU_d, spiluk_handle->get_nnzU());
1743 
1744   aijkok   = (Mat_SeqAIJKokkos *)A->spptr;
1745   auto i_d = aijkok->i_dual.view_device();
1746   auto j_d = aijkok->j_dual.view_device();
1747   KokkosSparse::Experimental::spiluk_symbolic(&factors->kh, fill_lev, i_d, j_d, factors->iL_d, factors->jL_d, factors->iU_d, factors->jU_d);
1748   /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */
1749 
1750   Kokkos::resize(factors->jL_d, spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */
1751   Kokkos::resize(factors->jU_d, spiluk_handle->get_nnzU());
1752   Kokkos::realloc(factors->aL_d, spiluk_handle->get_nnzL()); /* No need to retain old value */
1753   Kokkos::realloc(factors->aU_d, spiluk_handle->get_nnzU());
1754 
1755   /* TODO: add options to select sptrsv algorithms */
1756   /* Create sptrsv handles for L, U and their transpose */
1757 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
1758   auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SPTRSV_CUSPARSE;
1759 #else
1760   auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1;
1761 #endif
1762 
1763   factors->khL.create_sptrsv_handle(sptrsv_alg, n, true /* L is lower tri */);
1764   factors->khU.create_sptrsv_handle(sptrsv_alg, n, false /* U is not lower tri */);
1765   factors->khLt.create_sptrsv_handle(sptrsv_alg, n, false /* L^T is not lower tri */);
1766   factors->khUt.create_sptrsv_handle(sptrsv_alg, n, true /* U^T is lower tri */);
1767 
1768   /* Fill fields of the factor matrix B */
1769   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(B, MAT_SKIP_ALLOCATION, NULL));
1770   b     = (Mat_SeqAIJ *)B->data;
1771   b->nz = b->maxnz          = spiluk_handle->get_nnzL() + spiluk_handle->get_nnzU();
1772   B->info.fill_ratio_given  = info->fill;
1773   B->info.fill_ratio_needed = ((PetscReal)b->nz) / ((PetscReal)nnzA);
1774 
1775   B->offloadmask          = PETSC_OFFLOAD_GPU;
1776   B->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJKokkos;
1777   PetscFunctionReturn(0);
1778 }
1779 
1780 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
1781 {
1782   Mat_SeqAIJ    *b     = (Mat_SeqAIJ *)B->data;
1783   const PetscInt nrows = A->rmap->n;
1784 
1785   PetscFunctionBegin;
1786   PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
1787   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKOKKOSDEVICE;
1788   // move B data into Kokkos
1789   PetscCall(MatSeqAIJKokkosSyncDevice(B)); // create aijkok
1790   PetscCall(MatSeqAIJKokkosSyncDevice(A)); // create aijkok
1791   {
1792     Mat_SeqAIJKokkos *baijkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
1793     if (!baijkok->diag_d.extent(0)) {
1794       const Kokkos::View<PetscInt *, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged>> h_diag(b->diag, nrows + 1);
1795       baijkok->diag_d = Kokkos::View<PetscInt *>(Kokkos::create_mirror(DefaultMemorySpace(), h_diag));
1796       Kokkos::deep_copy(baijkok->diag_d, h_diag);
1797     }
1798   }
1799   PetscFunctionReturn(0);
1800 }
1801 
1802 static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos(Mat A, MatSolverType *type)
1803 {
1804   PetscFunctionBegin;
1805   *type = MATSOLVERKOKKOS;
1806   PetscFunctionReturn(0);
1807 }
1808 
1809 static PetscErrorCode MatFactorGetSolverType_seqaij_kokkos_device(Mat A, MatSolverType *type)
1810 {
1811   PetscFunctionBegin;
1812   *type = MATSOLVERKOKKOSDEVICE;
1813   PetscFunctionReturn(0);
1814 }
1815 
1816 /*MC
1817   MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices
1818   on a single GPU of type, `MATSEQAIJKOKKOS`, `MATAIJKOKKOS`.
1819 
1820   Level: beginner
1821 
1822 .seealso: `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJKokkos()`, `MATAIJKOKKOS`, `MatKokkosSetFormat()`, `MatKokkosStorageFormat`, `MatKokkosFormatOperation`
1823 M*/
1824 PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A, MatFactorType ftype, Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */
1825 {
1826   PetscInt n = A->rmap->n;
1827 
1828   PetscFunctionBegin;
1829   PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B));
1830   PetscCall(MatSetSizes(*B, n, n, n, n));
1831   (*B)->factortype = ftype;
1832   PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU]));
1833   PetscCall(MatSetType(*B, MATSEQAIJKOKKOS));
1834 
1835   if (ftype == MAT_FACTOR_LU) {
1836     PetscCall(MatSetBlockSizesFromMats(*B, A, A));
1837     (*B)->canuseordering        = PETSC_TRUE;
1838     (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKokkos;
1839   } else if (ftype == MAT_FACTOR_ILU) {
1840     PetscCall(MatSetBlockSizesFromMats(*B, A, A));
1841     (*B)->canuseordering         = PETSC_FALSE;
1842     (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos;
1843   } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]);
1844 
1845   PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL));
1846   PetscCall(PetscObjectComposeFunction((PetscObject)(*B), "MatFactorGetSolverType_C", MatFactorGetSolverType_SeqAIJKokkos));
1847   PetscFunctionReturn(0);
1848 }
1849 
1850 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijkokkos_kokkos_device(Mat A, MatFactorType ftype, Mat *B)
1851 {
1852   PetscInt n = A->rmap->n;
1853 
1854   PetscFunctionBegin;
1855   PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B));
1856   PetscCall(MatSetSizes(*B, n, n, n, n));
1857   (*B)->factortype     = ftype;
1858   (*B)->canuseordering = PETSC_TRUE;
1859   PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU]));
1860   PetscCall(MatSetType(*B, MATSEQAIJKOKKOS));
1861 
1862   if (ftype == MAT_FACTOR_LU) {
1863     PetscCall(MatSetBlockSizesFromMats(*B, A, A));
1864     (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE;
1865   } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Factor type not supported for KOKKOS Matrix Types");
1866 
1867   PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL));
1868   PetscCall(PetscObjectComposeFunction((PetscObject)(*B), "MatFactorGetSolverType_C", MatFactorGetSolverType_seqaij_kokkos_device));
1869   PetscFunctionReturn(0);
1870 }
1871 
1872 PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_KOKKOS(void)
1873 {
1874   PetscFunctionBegin;
1875   PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_LU, MatGetFactor_SeqAIJKokkos_Kokkos));
1876   PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_ILU, MatGetFactor_SeqAIJKokkos_Kokkos));
1877   PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOSDEVICE, MATSEQAIJKOKKOS, MAT_FACTOR_LU, MatGetFactor_seqaijkokkos_kokkos_device));
1878   PetscFunctionReturn(0);
1879 }
1880 
1881 /* Utility to print out a KokkosCsrMatrix for debugging */
1882 PETSC_INTERN PetscErrorCode PrintCsrMatrix(const KokkosCsrMatrix &csrmat)
1883 {
1884   const auto        &iv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), csrmat.graph.row_map);
1885   const auto        &jv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), csrmat.graph.entries);
1886   const auto        &av = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), csrmat.values);
1887   const PetscInt    *i  = iv.data();
1888   const PetscInt    *j  = jv.data();
1889   const PetscScalar *a  = av.data();
1890   PetscInt           m = csrmat.numRows(), n = csrmat.numCols(), nnz = csrmat.nnz();
1891 
1892   PetscFunctionBegin;
1893   PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT " x %" PetscInt_FMT " SeqAIJKokkos, with %" PetscInt_FMT " nonzeros\n", m, n, nnz));
1894   for (PetscInt k = 0; k < m; k++) {
1895     PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT ": ", k));
1896     for (PetscInt p = i[k]; p < i[k + 1]; p++) PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT "(%.1f), ", j[p], (double)PetscRealPart(a[p])));
1897     PetscCall(PetscPrintf(PETSC_COMM_SELF, "\n"));
1898   }
1899   PetscFunctionReturn(0);
1900 }
1901