xref: /petsc/src/mat/impls/aij/seq/kokkos/aijkok.kokkos.cxx (revision d2522c19e8fa9bca20aaca277941d9a63e71db6a)
1 #include <petscvec_kokkos.hpp>
2 #include <petscpkg_version.h>
3 #include <petsc/private/petscimpl.h>
4 #include <petsc/private/sfimpl.h>
5 #include <petscsystypes.h>
6 #include <petscerror.h>
7 
8 #include <Kokkos_Core.hpp>
9 #include <KokkosBlas.hpp>
10 #include <KokkosSparse_CrsMatrix.hpp>
11 #include <KokkosSparse_spmv.hpp>
12 #include <KokkosSparse_spiluk.hpp>
13 #include <KokkosSparse_sptrsv.hpp>
14 #include <KokkosSparse_spgemm.hpp>
15 #include <KokkosSparse_spadd.hpp>
16 
17 #include <../src/mat/impls/aij/seq/kokkos/aijkok.hpp>
18 
19 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_GE(3, 6, 99)
20 #include <KokkosSparse_Utils.hpp>
21 using KokkosSparse::sort_crs_matrix;
22 using KokkosSparse::Impl::transpose_matrix;
23 #else
24 #include <KokkosKernels_Sorting.hpp>
25 using KokkosKernels::sort_crs_matrix;
26 using KokkosKernels::Impl::transpose_matrix;
27 #endif
28 
29 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */
30 
31 /* MatAssemblyEnd_SeqAIJKokkos() happens when we finalized nonzeros of the matrix, either after
32    we assembled the matrix on host, or after we directly produced the matrix data on device (ex., through MatMatMult).
33    In the latter case, it is important to set a_dual's sync state correctly.
34  */
35 static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A, MatAssemblyType mode) {
36   Mat_SeqAIJ       *aijseq;
37   Mat_SeqAIJKokkos *aijkok;
38 
39   PetscFunctionBegin;
40   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0);
41   PetscCall(MatAssemblyEnd_SeqAIJ(A, mode));
42 
43   aijseq = static_cast<Mat_SeqAIJ *>(A->data);
44   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
45 
46   /* If aijkok does not exist, we just copy i, j to device.
47      If aijkok already exists, but the device's nonzero pattern does not match with the host's, we assume the latest data is on host.
48      In both cases, we build a new aijkok structure.
49   */
50   if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { /* aijkok might not exist yet or nonzero pattern has changed */
51     delete aijkok;
52     aijkok   = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aijseq->nz, aijseq->i, aijseq->j, aijseq->a, A->nonzerostate, PETSC_FALSE /*don't copy mat values to device*/);
53     A->spptr = aijkok;
54   }
55 
56   if (aijkok->device_mat_d.data()) {
57     A->offloadmask = PETSC_OFFLOAD_GPU; // in GPU mode, no going back. MatSetValues checks this
58   }
59   PetscFunctionReturn(0);
60 }
61 
62 /* Sync CSR data to device if not yet */
63 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A) {
64   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
65 
66   PetscFunctionBegin;
67   PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Cann't sync factorized matrix from host to device");
68   PetscCheck(aijkok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
69   if (aijkok->a_dual.need_sync_device()) {
70     aijkok->a_dual.sync_device();
71     aijkok->transpose_updated = PETSC_FALSE; /* values of the transpose is out-of-date */
72     aijkok->hermitian_updated = PETSC_FALSE;
73   }
74   PetscFunctionReturn(0);
75 }
76 
77 /* Mark the CSR data on device as modified */
78 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosModifyDevice(Mat A) {
79   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
80 
81   PetscFunctionBegin;
82   PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Not supported for factorized matries");
83   aijkok->a_dual.clear_sync_state();
84   aijkok->a_dual.modify_device();
85   aijkok->transpose_updated = PETSC_FALSE;
86   aijkok->hermitian_updated = PETSC_FALSE;
87   PetscCall(MatSeqAIJInvalidateDiagonal(A));
88   PetscCall(PetscObjectStateIncrease((PetscObject)A));
89   PetscFunctionReturn(0);
90 }
91 
92 static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A) {
93   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
94 
95   PetscFunctionBegin;
96   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
97   /* We do not expect one needs factors on host  */
98   PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Cann't sync factorized matrix from device to host");
99   PetscCheck(aijkok, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing AIJKOK");
100   aijkok->a_dual.sync_host();
101   PetscFunctionReturn(0);
102 }
103 
104 static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A, PetscScalar *array[]) {
105   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
106 
107   PetscFunctionBegin;
108   /* aijkok contains valid pointers only if the host's nonzerostate matches with the device's.
109     Calling MatSeqAIJSetPreallocation() or MatSetValues() on host, where aijseq->{i,j,a} might be
110     reallocated, will lead to stale {i,j,a}_dual in aijkok. In both operations, the hosts's nonzerostate
111     must have been updated. The stale aijkok will be rebuilt during MatAssemblyEnd.
112   */
113   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
114     aijkok->a_dual.sync_host();
115     *array = aijkok->a_dual.view_host().data();
116   } else { /* Happens when calling MatSetValues on a newly created matrix */
117     *array = static_cast<Mat_SeqAIJ *>(A->data)->a;
118   }
119   PetscFunctionReturn(0);
120 }
121 
122 static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJKokkos(Mat A, PetscScalar *array[]) {
123   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
124 
125   PetscFunctionBegin;
126   if (aijkok && A->nonzerostate == aijkok->nonzerostate) aijkok->a_dual.modify_host();
127   PetscFunctionReturn(0);
128 }
129 
130 static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[]) {
131   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
132 
133   PetscFunctionBegin;
134   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
135     aijkok->a_dual.sync_host();
136     *array = aijkok->a_dual.view_host().data();
137   } else {
138     *array = static_cast<Mat_SeqAIJ *>(A->data)->a;
139   }
140   PetscFunctionReturn(0);
141 }
142 
143 static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[]) {
144   PetscFunctionBegin;
145   *array = NULL;
146   PetscFunctionReturn(0);
147 }
148 
149 static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[]) {
150   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
151 
152   PetscFunctionBegin;
153   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
154     *array = aijkok->a_dual.view_host().data();
155   } else { /* Ex. happens with MatZeroEntries on a preallocated but not assembled matrix */
156     *array = static_cast<Mat_SeqAIJ *>(A->data)->a;
157   }
158   PetscFunctionReturn(0);
159 }
160 
161 static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[]) {
162   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
163 
164   PetscFunctionBegin;
165   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
166     aijkok->a_dual.clear_sync_state();
167     aijkok->a_dual.modify_host();
168   }
169   PetscFunctionReturn(0);
170 }
171 
172 static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJKokkos(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype) {
173   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
174 
175   PetscFunctionBegin;
176   PetscCheck(aijkok != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "aijkok is NULL");
177 
178   if (i) *i = aijkok->i_device_data();
179   if (j) *j = aijkok->j_device_data();
180   if (a) {
181     aijkok->a_dual.sync_device();
182     *a = aijkok->a_device_data();
183   }
184   if (mtype) *mtype = PETSC_MEMTYPE_KOKKOS;
185   PetscFunctionReturn(0);
186 }
187 
188 // MatSeqAIJKokkosSetDeviceMat takes a PetscSplitCSRDataStructure with device data and copies it to the device. Note, "deep_copy" here is really a shallow copy
189 PetscErrorCode MatSeqAIJKokkosSetDeviceMat(Mat A, PetscSplitCSRDataStructure h_mat) {
190   Mat_SeqAIJKokkos                            *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
191   Kokkos::View<SplitCSRMat, Kokkos::HostSpace> h_mat_k(h_mat);
192 
193   PetscFunctionBegin;
194   PetscCheck(aijkok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
195   aijkok->device_mat_d = create_mirror(DefaultMemorySpace(), h_mat_k);
196   Kokkos::deep_copy(aijkok->device_mat_d, h_mat_k);
197   PetscFunctionReturn(0);
198 }
199 
200 // MatSeqAIJKokkosGetDeviceMat gets the device if it is here, otherwise it creates a place for it and returns NULL
201 PetscErrorCode MatSeqAIJKokkosGetDeviceMat(Mat A, PetscSplitCSRDataStructure *d_mat) {
202   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
203 
204   PetscFunctionBegin;
205   if (aijkok && aijkok->device_mat_d.data()) {
206     *d_mat = aijkok->device_mat_d.data();
207   } else {
208     PetscCall(MatSeqAIJKokkosSyncDevice(A)); // create aijkok (we are making d_mat now so make a place for it)
209     *d_mat = NULL;
210   }
211   PetscFunctionReturn(0);
212 }
213 
214 /* Generate the transpose on device and cache it internally */
215 static PetscErrorCode MatSeqAIJKokkosGenerateTranspose_Private(Mat A, KokkosCsrMatrix **csrmatT) {
216   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
217 
218   PetscFunctionBegin;
219   PetscCheck(aijkok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
220   if (!aijkok->csrmatT.nnz() || !aijkok->transpose_updated) { /* Generate At for the first time OR just update its values */
221     /* FIXME: KK does not separate symbolic/numeric transpose. We could have a permutation array to help value-only update */
222     PetscCallCXX(aijkok->a_dual.sync_device());
223     PetscCallCXX(aijkok->csrmatT = transpose_matrix(aijkok->csrmat));
224     PetscCallCXX(sort_crs_matrix(aijkok->csrmatT));
225     aijkok->transpose_updated = PETSC_TRUE;
226   }
227   *csrmatT = &aijkok->csrmatT;
228   PetscFunctionReturn(0);
229 }
230 
231 /* Generate the Hermitian on device and cache it internally */
232 static PetscErrorCode MatSeqAIJKokkosGenerateHermitian_Private(Mat A, KokkosCsrMatrix **csrmatH) {
233   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
234 
235   PetscFunctionBegin;
236   PetscCall(PetscLogGpuTimeBegin());
237   PetscCheck(aijkok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
238   if (!aijkok->csrmatH.nnz() || !aijkok->hermitian_updated) { /* Generate Ah for the first time OR just update its values */
239     PetscCallCXX(aijkok->a_dual.sync_device());
240     PetscCallCXX(aijkok->csrmatH = transpose_matrix(aijkok->csrmat));
241     PetscCallCXX(sort_crs_matrix(aijkok->csrmatH));
242 #if defined(PETSC_USE_COMPLEX)
243     const auto &a = aijkok->csrmatH.values;
244     Kokkos::parallel_for(
245       a.extent(0), KOKKOS_LAMBDA(MatRowMapType i) { a(i) = PetscConj(a(i)); });
246 #endif
247     aijkok->hermitian_updated = PETSC_TRUE;
248   }
249   *csrmatH = &aijkok->csrmatH;
250   PetscCall(PetscLogGpuTimeEnd());
251   PetscFunctionReturn(0);
252 }
253 
254 /* y = A x */
255 static PetscErrorCode MatMult_SeqAIJKokkos(Mat A, Vec xx, Vec yy) {
256   Mat_SeqAIJKokkos          *aijkok;
257   ConstPetscScalarKokkosView xv;
258   PetscScalarKokkosView      yv;
259 
260   PetscFunctionBegin;
261   PetscCall(PetscLogGpuTimeBegin());
262   PetscCall(MatSeqAIJKokkosSyncDevice(A));
263   PetscCall(VecGetKokkosView(xx, &xv));
264   PetscCall(VecGetKokkosViewWrite(yy, &yv));
265   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
266   KokkosSparse::spmv("N", 1.0 /*alpha*/, aijkok->csrmat, xv, 0.0 /*beta*/, yv); /* y = alpha A x + beta y */
267   PetscCall(VecRestoreKokkosView(xx, &xv));
268   PetscCall(VecRestoreKokkosViewWrite(yy, &yv));
269   /* 2.0*nnz - numRows seems more accurate here but assumes there are no zero-rows. So a little sloppy here. */
270   PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz()));
271   PetscCall(PetscLogGpuTimeEnd());
272   PetscFunctionReturn(0);
273 }
274 
275 /* y = A^T x */
276 static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy) {
277   Mat_SeqAIJKokkos          *aijkok;
278   const char                *mode;
279   ConstPetscScalarKokkosView xv;
280   PetscScalarKokkosView      yv;
281   KokkosCsrMatrix           *csrmat;
282 
283   PetscFunctionBegin;
284   PetscCall(PetscLogGpuTimeBegin());
285   PetscCall(MatSeqAIJKokkosSyncDevice(A));
286   PetscCall(VecGetKokkosView(xx, &xv));
287   PetscCall(VecGetKokkosViewWrite(yy, &yv));
288   if (A->form_explicit_transpose) {
289     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat));
290     mode = "N";
291   } else {
292     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
293     csrmat = &aijkok->csrmat;
294     mode   = "T";
295   }
296   KokkosSparse::spmv(mode, 1.0 /*alpha*/, *csrmat, xv, 0.0 /*beta*/, yv); /* y = alpha A^T x + beta y */
297   PetscCall(VecRestoreKokkosView(xx, &xv));
298   PetscCall(VecRestoreKokkosViewWrite(yy, &yv));
299   PetscCall(PetscLogGpuFlops(2.0 * csrmat->nnz()));
300   PetscCall(PetscLogGpuTimeEnd());
301   PetscFunctionReturn(0);
302 }
303 
304 /* y = A^H x */
305 static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy) {
306   Mat_SeqAIJKokkos          *aijkok;
307   const char                *mode;
308   ConstPetscScalarKokkosView xv;
309   PetscScalarKokkosView      yv;
310   KokkosCsrMatrix           *csrmat;
311 
312   PetscFunctionBegin;
313   PetscCall(PetscLogGpuTimeBegin());
314   PetscCall(MatSeqAIJKokkosSyncDevice(A));
315   PetscCall(VecGetKokkosView(xx, &xv));
316   PetscCall(VecGetKokkosViewWrite(yy, &yv));
317   if (A->form_explicit_transpose) {
318     PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat));
319     mode = "N";
320   } else {
321     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
322     csrmat = &aijkok->csrmat;
323     mode   = "C";
324   }
325   KokkosSparse::spmv(mode, 1.0 /*alpha*/, *csrmat, xv, 0.0 /*beta*/, yv); /* y = alpha A^H x + beta y */
326   PetscCall(VecRestoreKokkosView(xx, &xv));
327   PetscCall(VecRestoreKokkosViewWrite(yy, &yv));
328   PetscCall(PetscLogGpuFlops(2.0 * csrmat->nnz()));
329   PetscCall(PetscLogGpuTimeEnd());
330   PetscFunctionReturn(0);
331 }
332 
333 /* z = A x + y */
334 static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz) {
335   Mat_SeqAIJKokkos          *aijkok;
336   ConstPetscScalarKokkosView xv, yv;
337   PetscScalarKokkosView      zv;
338 
339   PetscFunctionBegin;
340   PetscCall(PetscLogGpuTimeBegin());
341   PetscCall(MatSeqAIJKokkosSyncDevice(A));
342   PetscCall(VecGetKokkosView(xx, &xv));
343   PetscCall(VecGetKokkosView(yy, &yv));
344   PetscCall(VecGetKokkosViewWrite(zz, &zv));
345   if (zz != yy) Kokkos::deep_copy(zv, yv);
346   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
347   KokkosSparse::spmv("N", 1.0 /*alpha*/, aijkok->csrmat, xv, 1.0 /*beta*/, zv); /* z = alpha A x + beta z */
348   PetscCall(VecRestoreKokkosView(xx, &xv));
349   PetscCall(VecRestoreKokkosView(yy, &yv));
350   PetscCall(VecRestoreKokkosViewWrite(zz, &zv));
351   PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz()));
352   PetscCall(PetscLogGpuTimeEnd());
353   PetscFunctionReturn(0);
354 }
355 
356 /* z = A^T x + y */
357 static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz) {
358   Mat_SeqAIJKokkos          *aijkok;
359   const char                *mode;
360   ConstPetscScalarKokkosView xv, yv;
361   PetscScalarKokkosView      zv;
362   KokkosCsrMatrix           *csrmat;
363 
364   PetscFunctionBegin;
365   PetscCall(PetscLogGpuTimeBegin());
366   PetscCall(MatSeqAIJKokkosSyncDevice(A));
367   PetscCall(VecGetKokkosView(xx, &xv));
368   PetscCall(VecGetKokkosView(yy, &yv));
369   PetscCall(VecGetKokkosViewWrite(zz, &zv));
370   if (zz != yy) Kokkos::deep_copy(zv, yv);
371   if (A->form_explicit_transpose) {
372     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat));
373     mode = "N";
374   } else {
375     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
376     csrmat = &aijkok->csrmat;
377     mode   = "T";
378   }
379   KokkosSparse::spmv(mode, 1.0 /*alpha*/, *csrmat, xv, 1.0 /*beta*/, zv); /* z = alpha A^T x + beta z */
380   PetscCall(VecRestoreKokkosView(xx, &xv));
381   PetscCall(VecRestoreKokkosView(yy, &yv));
382   PetscCall(VecRestoreKokkosViewWrite(zz, &zv));
383   PetscCall(PetscLogGpuFlops(2.0 * csrmat->nnz()));
384   PetscCall(PetscLogGpuTimeEnd());
385   PetscFunctionReturn(0);
386 }
387 
388 /* z = A^H x + y */
389 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz) {
390   Mat_SeqAIJKokkos          *aijkok;
391   const char                *mode;
392   ConstPetscScalarKokkosView xv, yv;
393   PetscScalarKokkosView      zv;
394   KokkosCsrMatrix           *csrmat;
395 
396   PetscFunctionBegin;
397   PetscCall(PetscLogGpuTimeBegin());
398   PetscCall(MatSeqAIJKokkosSyncDevice(A));
399   PetscCall(VecGetKokkosView(xx, &xv));
400   PetscCall(VecGetKokkosView(yy, &yv));
401   PetscCall(VecGetKokkosViewWrite(zz, &zv));
402   if (zz != yy) Kokkos::deep_copy(zv, yv);
403   if (A->form_explicit_transpose) {
404     PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat));
405     mode = "N";
406   } else {
407     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
408     csrmat = &aijkok->csrmat;
409     mode   = "C";
410   }
411   KokkosSparse::spmv(mode, 1.0 /*alpha*/, *csrmat, xv, 1.0 /*beta*/, zv); /* z = alpha A^H x + beta z */
412   PetscCall(VecRestoreKokkosView(xx, &xv));
413   PetscCall(VecRestoreKokkosView(yy, &yv));
414   PetscCall(VecRestoreKokkosViewWrite(zz, &zv));
415   PetscCall(PetscLogGpuFlops(2.0 * csrmat->nnz()));
416   PetscCall(PetscLogGpuTimeEnd());
417   PetscFunctionReturn(0);
418 }
419 
420 PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A, MatOption op, PetscBool flg) {
421   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
422 
423   PetscFunctionBegin;
424   switch (op) {
425   case MAT_FORM_EXPLICIT_TRANSPOSE:
426     /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
427     if (A->form_explicit_transpose && !flg && aijkok) PetscCall(aijkok->DestroyMatTranspose());
428     A->form_explicit_transpose = flg;
429     break;
430   default: PetscCall(MatSetOption_SeqAIJ(A, op, flg)); break;
431   }
432   PetscFunctionReturn(0);
433 }
434 
435 /* Depending on reuse, either build a new mat, or use the existing mat */
436 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat *newmat) {
437   Mat_SeqAIJ *aseq;
438 
439   PetscFunctionBegin;
440   PetscCall(PetscKokkosInitializeCheck());
441   if (reuse == MAT_INITIAL_MATRIX) {                      /* Build a brand new mat */
442     PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat));  /* the returned newmat is a SeqAIJKokkos */
443   } else if (reuse == MAT_REUSE_MATRIX) {                 /* Reuse the mat created before */
444     PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN)); /* newmat is already a SeqAIJKokkos */
445   } else if (reuse == MAT_INPLACE_MATRIX) {               /* newmat is A */
446     PetscCheck(A == *newmat, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "A != *newmat with MAT_INPLACE_MATRIX");
447     PetscCall(PetscFree(A->defaultvectype));
448     PetscCall(PetscStrallocpy(VECKOKKOS, &A->defaultvectype)); /* Allocate and copy the string */
449     PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATSEQAIJKOKKOS));
450     PetscCall(MatSetOps_SeqAIJKokkos(A));
451     aseq = static_cast<Mat_SeqAIJ *>(A->data);
452     if (A->assembled) { /* Copy i, j (but not values) to device for an assembled matrix if not yet */
453       PetscCheck(!A->spptr, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Expect NULL (Mat_SeqAIJKokkos*)A->spptr");
454       A->spptr = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aseq->nz, aseq->i, aseq->j, aseq->a, A->nonzerostate, PETSC_FALSE);
455     }
456   }
457   PetscFunctionReturn(0);
458 }
459 
460 /* MatDuplicate always creates a new matrix. MatDuplicate can be called either on an assembled matrix or
461    an unassembled matrix, even though MAT_COPY_VALUES is not allowed for unassembled matrix.
462  */
463 static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A, MatDuplicateOption dupOption, Mat *B) {
464   Mat_SeqAIJ       *bseq;
465   Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr), *bkok;
466   Mat               mat;
467 
468   PetscFunctionBegin;
469   /* Do not copy values on host as A's latest values might be on device. We don't want to do sync blindly */
470   PetscCall(MatDuplicate_SeqAIJ(A, MAT_DO_NOT_COPY_VALUES, B));
471   mat = *B;
472   if (A->assembled) {
473     bseq = static_cast<Mat_SeqAIJ *>(mat->data);
474     bkok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, bseq->nz, bseq->i, bseq->j, bseq->a, mat->nonzerostate, PETSC_FALSE);
475     bkok->a_dual.clear_sync_state(); /* Clear B's sync state as it will be decided below */
476     /* Now copy values to B if needed */
477     if (dupOption == MAT_COPY_VALUES) {
478       if (akok->a_dual.need_sync_device()) {
479         Kokkos::deep_copy(bkok->a_dual.view_host(), akok->a_dual.view_host());
480         bkok->a_dual.modify_host();
481       } else { /* If device has the latest data, we only copy data on device */
482         Kokkos::deep_copy(bkok->a_dual.view_device(), akok->a_dual.view_device());
483         bkok->a_dual.modify_device();
484       }
485     } else { /* MAT_DO_NOT_COPY_VALUES or MAT_SHARE_NONZERO_PATTERN. B's values should be zeroed */
486       /* B's values on host should be already zeroed by MatDuplicate_SeqAIJ() */
487       bkok->a_dual.modify_host();
488     }
489     mat->spptr = bkok;
490   }
491 
492   PetscCall(PetscFree(mat->defaultvectype));
493   PetscCall(PetscStrallocpy(VECKOKKOS, &mat->defaultvectype)); /* Allocate and copy the string */
494   PetscCall(PetscObjectChangeTypeName((PetscObject)mat, MATSEQAIJKOKKOS));
495   PetscCall(MatSetOps_SeqAIJKokkos(mat));
496   PetscFunctionReturn(0);
497 }
498 
499 static PetscErrorCode MatTranspose_SeqAIJKokkos(Mat A, MatReuse reuse, Mat *B) {
500   Mat               At;
501   KokkosCsrMatrix  *internT;
502   Mat_SeqAIJKokkos *atkok, *bkok;
503 
504   PetscFunctionBegin;
505   if (reuse == MAT_REUSE_MATRIX) PetscCall(MatTransposeCheckNonzeroState_Private(A, *B));
506   PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &internT)); /* Generate a transpose internally */
507   if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_INPLACE_MATRIX) {
508     /* Deep copy internT, as we want to isolate the internal transpose */
509     PetscCallCXX(atkok = new Mat_SeqAIJKokkos(KokkosCsrMatrix("csrmat", *internT)));
510     PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PetscObjectComm((PetscObject)A), atkok, &At));
511     if (reuse == MAT_INITIAL_MATRIX) *B = At;
512     else PetscCall(MatHeaderReplace(A, &At)); /* Replace A with At inplace */
513   } else {                                    /* MAT_REUSE_MATRIX, just need to copy values to B on device */
514     if ((*B)->assembled) {
515       bkok = static_cast<Mat_SeqAIJKokkos *>((*B)->spptr);
516       PetscCallCXX(Kokkos::deep_copy(bkok->a_dual.view_device(), internT->values));
517       PetscCall(MatSeqAIJKokkosModifyDevice(*B));
518     } else if ((*B)->preallocated) { /* It is ok for B to be only preallocated, as needed in MatTranspose_MPIAIJ */
519       Mat_SeqAIJ             *bseq = static_cast<Mat_SeqAIJ *>((*B)->data);
520       MatScalarKokkosViewHost a_h(bseq->a, internT->nnz()); /* bseq->nz = 0 if unassembled */
521       MatColIdxKokkosViewHost j_h(bseq->j, internT->nnz());
522       PetscCallCXX(Kokkos::deep_copy(a_h, internT->values));
523       PetscCallCXX(Kokkos::deep_copy(j_h, internT->graph.entries));
524     } else SETERRQ(PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "B must be assembled or preallocated");
525   }
526   PetscFunctionReturn(0);
527 }
528 
529 static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A) {
530   Mat_SeqAIJKokkos *aijkok;
531 
532   PetscFunctionBegin;
533   if (A->factortype == MAT_FACTOR_NONE) {
534     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
535     delete aijkok;
536   } else {
537     delete static_cast<Mat_SeqAIJKokkosTriFactors *>(A->spptr);
538   }
539   A->spptr = NULL;
540   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatFactorGetSolverType_C", NULL));
541   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
542   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
543   PetscCall(MatDestroy_SeqAIJ(A));
544   PetscFunctionReturn(0);
545 }
546 
547 /*MC
548    MATSEQAIJKOKKOS - MATAIJKOKKOS = "(seq)aijkokkos" - A matrix type to be used for sparse matrices with Kokkos
549 
550    A matrix type type using Kokkos-Kernels CrsMatrix type for portability across different device types
551 
552    Options Database Keys:
553 .  -mat_type aijkokkos - sets the matrix type to "aijkokkos" during a call to MatSetFromOptions()
554 
555   Level: beginner
556 
557 .seealso: `MatCreateSeqAIJKokkos()`, `MATMPIAIJKOKKOS`
558 M*/
559 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A) {
560   PetscFunctionBegin;
561   PetscCall(PetscKokkosInitializeCheck());
562   PetscCall(MatCreate_SeqAIJ(A));
563   PetscCall(MatConvert_SeqAIJ_SeqAIJKokkos(A, MATSEQAIJKOKKOS, MAT_INPLACE_MATRIX, &A));
564   PetscFunctionReturn(0);
565 }
566 
567 /* Merge A, B into a matrix C. A is put before B. C's size would be A->rmap->n by (A->cmap->n + B->cmap->n) */
568 PetscErrorCode MatSeqAIJKokkosMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C) {
569   Mat_SeqAIJ         *a, *b;
570   Mat_SeqAIJKokkos   *akok, *bkok, *ckok;
571   MatScalarKokkosView aa, ba, ca;
572   MatRowMapKokkosView ai, bi, ci;
573   MatColIdxKokkosView aj, bj, cj;
574   PetscInt            m, n, nnz, aN;
575 
576   PetscFunctionBegin;
577   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
578   PetscValidHeaderSpecific(B, MAT_CLASSID, 2);
579   PetscValidPointer(C, 4);
580   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
581   PetscCheckTypeName(B, MATSEQAIJKOKKOS);
582   PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n);
583   PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported");
584 
585   PetscCall(MatSeqAIJKokkosSyncDevice(A));
586   PetscCall(MatSeqAIJKokkosSyncDevice(B));
587   a    = static_cast<Mat_SeqAIJ *>(A->data);
588   b    = static_cast<Mat_SeqAIJ *>(B->data);
589   akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
590   bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
591   aa   = akok->a_dual.view_device();
592   ai   = akok->i_dual.view_device();
593   ba   = bkok->a_dual.view_device();
594   bi   = bkok->i_dual.view_device();
595   m    = A->rmap->n; /* M, N and nnz of C */
596   n    = A->cmap->n + B->cmap->n;
597   nnz  = a->nz + b->nz;
598   aN   = A->cmap->n; /* N of A */
599   if (reuse == MAT_INITIAL_MATRIX) {
600     aj           = akok->j_dual.view_device();
601     bj           = bkok->j_dual.view_device();
602     auto ca_dual = MatScalarKokkosDualView("a", aa.extent(0) + ba.extent(0));
603     auto ci_dual = MatRowMapKokkosDualView("i", ai.extent(0));
604     auto cj_dual = MatColIdxKokkosDualView("j", aj.extent(0) + bj.extent(0));
605     ca           = ca_dual.view_device();
606     ci           = ci_dual.view_device();
607     cj           = cj_dual.view_device();
608 
609     /* Concatenate A and B in parallel using Kokkos hierarchical parallelism */
610     Kokkos::parallel_for(
611       Kokkos::TeamPolicy<>(m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
612         PetscInt i       = t.league_rank(); /* row i */
613         PetscInt coffset = ai(i) + bi(i), alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i);
614 
615         Kokkos::single(Kokkos::PerTeam(t), [=]() { /* this side effect only happens once per whole team */
616                                                    ci(i) = coffset;
617                                                    if (i == m - 1) ci(m) = ai(m) + bi(m);
618         });
619 
620         Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) {
621           if (k < alen) {
622             ca(coffset + k) = aa(ai(i) + k);
623             cj(coffset + k) = aj(ai(i) + k);
624           } else {
625             ca(coffset + k) = ba(bi(i) + k - alen);
626             cj(coffset + k) = bj(bi(i) + k - alen) + aN; /* Entries in B get new column indices in C */
627           }
628         });
629       });
630     ca_dual.modify_device();
631     ci_dual.modify_device();
632     cj_dual.modify_device();
633     PetscCallCXX(ckok = new Mat_SeqAIJKokkos(m, n, nnz, ci_dual, cj_dual, ca_dual));
634     PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, ckok, C));
635   } else if (reuse == MAT_REUSE_MATRIX) {
636     PetscValidHeaderSpecific(*C, MAT_CLASSID, 4);
637     PetscCheckTypeName(*C, MATSEQAIJKOKKOS);
638     ckok = static_cast<Mat_SeqAIJKokkos *>((*C)->spptr);
639     ca   = ckok->a_dual.view_device();
640     ci   = ckok->i_dual.view_device();
641 
642     Kokkos::parallel_for(
643       Kokkos::TeamPolicy<>(m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
644         PetscInt i    = t.league_rank(); /* row i */
645         PetscInt alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i);
646         Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) {
647           if (k < alen) ca(ci(i) + k) = aa(ai(i) + k);
648           else ca(ci(i) + k) = ba(bi(i) + k - alen);
649         });
650       });
651     PetscCall(MatSeqAIJKokkosModifyDevice(*C));
652   }
653   PetscFunctionReturn(0);
654 }
655 
656 static PetscErrorCode MatProductDataDestroy_SeqAIJKokkos(void *pdata) {
657   PetscFunctionBegin;
658   delete static_cast<MatProductData_SeqAIJKokkos *>(pdata);
659   PetscFunctionReturn(0);
660 }
661 
662 static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C) {
663   Mat_Product                 *product = C->product;
664   Mat                          A, B;
665   bool                         transA, transB; /* use bool, since KK needs this type */
666   Mat_SeqAIJKokkos            *akok, *bkok, *ckok;
667   Mat_SeqAIJ                  *c;
668   MatProductData_SeqAIJKokkos *pdata;
669   KokkosCsrMatrix             *csrmatA, *csrmatB;
670 
671   PetscFunctionBegin;
672   MatCheckProduct(C, 1);
673   PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Product data empty");
674   pdata = static_cast<MatProductData_SeqAIJKokkos *>(C->product->data);
675 
676   if (pdata->reusesym) {           /* We reached here through e.g., MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C), where symbolic/numeric are combined */
677     pdata->reusesym = PETSC_FALSE; /* So that next time when user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C), we still do numeric  */
678     PetscFunctionReturn(0);
679   }
680 
681   switch (product->type) {
682   case MATPRODUCT_AB:
683     transA = false;
684     transB = false;
685     break;
686   case MATPRODUCT_AtB:
687     transA = true;
688     transB = false;
689     break;
690   case MATPRODUCT_ABt:
691     transA = false;
692     transB = true;
693     break;
694   default: SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]);
695   }
696 
697   A = product->A;
698   B = product->B;
699   PetscCall(MatSeqAIJKokkosSyncDevice(A));
700   PetscCall(MatSeqAIJKokkosSyncDevice(B));
701   akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
702   bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
703   ckok = static_cast<Mat_SeqAIJKokkos *>(C->spptr);
704 
705   PetscCheck(ckok, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Device data structure spptr is empty");
706 
707   csrmatA = &akok->csrmat;
708   csrmatB = &bkok->csrmat;
709 
710   /* TODO: Once KK spgemm implements transpose, we can get rid of the explicit transpose here */
711   if (transA) {
712     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA));
713     transA = false;
714   }
715 
716   if (transB) {
717     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB));
718     transB = false;
719   }
720   PetscCall(PetscLogGpuTimeBegin());
721   PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, *csrmatA, transA, *csrmatB, transB, ckok->csrmat));
722   PetscCallCXX(sort_crs_matrix(ckok->csrmat)); /* without the sort, mat_tests-ex62_14_seqaijkokkos failed */
723   PetscCall(PetscLogGpuTimeEnd());
724   PetscCall(MatSeqAIJKokkosModifyDevice(C));
725   /* shorter version of MatAssemblyEnd_SeqAIJ */
726   c = (Mat_SeqAIJ *)C->data;
727   PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz));
728   PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n"));
729   PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax));
730   c->reallocs         = 0;
731   C->info.mallocs     = 0;
732   C->info.nz_unneeded = 0;
733   C->assembled = C->was_assembled = PETSC_TRUE;
734   C->num_ass++;
735   PetscFunctionReturn(0);
736 }
737 
738 static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C) {
739   Mat_Product                 *product = C->product;
740   MatProductType               ptype;
741   Mat                          A, B;
742   bool                         transA, transB;
743   Mat_SeqAIJKokkos            *akok, *bkok, *ckok;
744   MatProductData_SeqAIJKokkos *pdata;
745   MPI_Comm                     comm;
746   KokkosCsrMatrix             *csrmatA, *csrmatB, csrmatC;
747 
748   PetscFunctionBegin;
749   MatCheckProduct(C, 1);
750   PetscCall(PetscObjectGetComm((PetscObject)C, &comm));
751   PetscCheck(!product->data, comm, PETSC_ERR_PLIB, "Product data not empty");
752   A = product->A;
753   B = product->B;
754   PetscCall(MatSeqAIJKokkosSyncDevice(A));
755   PetscCall(MatSeqAIJKokkosSyncDevice(B));
756   akok    = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
757   bkok    = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
758   csrmatA = &akok->csrmat;
759   csrmatB = &bkok->csrmat;
760 
761   ptype = product->type;
762   switch (ptype) {
763   case MATPRODUCT_AB:
764     transA = false;
765     transB = false;
766     break;
767   case MATPRODUCT_AtB:
768     transA = true;
769     transB = false;
770     break;
771   case MATPRODUCT_ABt:
772     transA = false;
773     transB = true;
774     break;
775   default: SETERRQ(comm, PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]);
776   }
777 
778   product->data = pdata = new MatProductData_SeqAIJKokkos();
779   pdata->kh.set_team_work_size(16);
780   pdata->kh.set_dynamic_scheduling(true);
781   pdata->reusesym = product->api_user;
782 
783   /* TODO: add command line options to select spgemm algorithms */
784   auto spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_KK;
785   /* CUDA-10.2's spgemm has bugs. As as of 2022-01-19, KK does not support CUDA-11.x's newer spgemm API.
786      We can default to SPGEMMAlgorithm::SPGEMM_CUSPARSE with CUDA-11+ when KK adds the support.
787    */
788   pdata->kh.create_spgemm_handle(spgemm_alg);
789 
790   PetscCall(PetscLogGpuTimeBegin());
791   /* TODO: Get rid of the explicit transpose once KK-spgemm implements the transpose option */
792   if (transA) {
793     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA));
794     transA = false;
795   }
796 
797   if (transB) {
798     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB));
799     transB = false;
800   }
801 
802   PetscCallCXX(KokkosSparse::spgemm_symbolic(pdata->kh, *csrmatA, transA, *csrmatB, transB, csrmatC));
803   /* spgemm_symbolic() only populates C's rowmap, but not C's column indices.
804     So we have to do a fake spgemm_numeric() here to get csrmatC.j_d setup, before
805     calling new Mat_SeqAIJKokkos().
806     TODO: Remove the fake spgemm_numeric() after KK fixed this problem.
807   */
808   PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, *csrmatA, transA, *csrmatB, transB, csrmatC));
809   PetscCallCXX(sort_crs_matrix(csrmatC));
810   PetscCall(PetscLogGpuTimeEnd());
811 
812   PetscCallCXX(ckok = new Mat_SeqAIJKokkos(csrmatC));
813   PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(C, ckok));
814   C->product->destroy = MatProductDataDestroy_SeqAIJKokkos;
815   PetscFunctionReturn(0);
816 }
817 
818 /* handles sparse matrix matrix ops */
819 static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat) {
820   Mat_Product *product = mat->product;
821   PetscBool    Biskok = PETSC_FALSE, Ciskok = PETSC_TRUE;
822 
823   PetscFunctionBegin;
824   MatCheckProduct(mat, 1);
825   PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJKOKKOS, &Biskok));
826   if (product->type == MATPRODUCT_ABC) { PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJKOKKOS, &Ciskok)); }
827   if (Biskok && Ciskok) {
828     switch (product->type) {
829     case MATPRODUCT_AB:
830     case MATPRODUCT_AtB:
831     case MATPRODUCT_ABt: mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos; break;
832     case MATPRODUCT_PtAP:
833     case MATPRODUCT_RARt:
834     case MATPRODUCT_ABC: mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; break;
835     default: break;
836     }
837   } else { /* fallback for AIJ */
838     PetscCall(MatProductSetFromOptions_SeqAIJ(mat));
839   }
840   PetscFunctionReturn(0);
841 }
842 
843 static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a) {
844   Mat_SeqAIJKokkos *aijkok;
845 
846   PetscFunctionBegin;
847   PetscCall(PetscLogGpuTimeBegin());
848   PetscCall(MatSeqAIJKokkosSyncDevice(A));
849   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
850   KokkosBlas::scal(aijkok->a_dual.view_device(), a, aijkok->a_dual.view_device());
851   PetscCall(MatSeqAIJKokkosModifyDevice(A));
852   PetscCall(PetscLogGpuFlops(aijkok->a_dual.extent(0)));
853   PetscCall(PetscLogGpuTimeEnd());
854   PetscFunctionReturn(0);
855 }
856 
857 static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A) {
858   Mat_SeqAIJKokkos *aijkok;
859 
860   PetscFunctionBegin;
861   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
862   if (aijkok) { /* Only zero the device if data is already there */
863     KokkosBlas::fill(aijkok->a_dual.view_device(), 0.0);
864     PetscCall(MatSeqAIJKokkosModifyDevice(A));
865   } else { /* Might be preallocated but not assembled */
866     PetscCall(MatZeroEntries_SeqAIJ(A));
867   }
868   PetscFunctionReturn(0);
869 }
870 
871 static PetscErrorCode MatGetDiagonal_SeqAIJKokkos(Mat A, Vec x) {
872   Mat_SeqAIJ           *aijseq;
873   Mat_SeqAIJKokkos     *aijkok;
874   PetscInt              n;
875   PetscScalarKokkosView xv;
876 
877   PetscFunctionBegin;
878   PetscCall(VecGetLocalSize(x, &n));
879   PetscCheck(n == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector");
880   PetscCheck(A->factortype == MAT_FACTOR_NONE, PETSC_COMM_SELF, PETSC_ERR_SUP, "MatGetDiagonal_SeqAIJKokkos not supported on factored matrices");
881 
882   PetscCall(MatSeqAIJKokkosSyncDevice(A));
883   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
884 
885   if (A->rmap->n && aijkok->diag_dual.extent(0) == 0) { /* Set the diagonal pointer if not already */
886     PetscCall(MatMarkDiagonal_SeqAIJ(A));
887     aijseq = static_cast<Mat_SeqAIJ *>(A->data);
888     aijkok->SetDiagonal(aijseq->diag);
889   }
890 
891   const auto &Aa    = aijkok->a_dual.view_device();
892   const auto &Ai    = aijkok->i_dual.view_device();
893   const auto &Adiag = aijkok->diag_dual.view_device();
894 
895   PetscCall(VecGetKokkosViewWrite(x, &xv));
896   Kokkos::parallel_for(
897     n, KOKKOS_LAMBDA(const PetscInt i) {
898       if (Adiag(i) < Ai(i + 1)) xv(i) = Aa(Adiag(i));
899       else xv(i) = 0;
900     });
901   PetscCall(VecRestoreKokkosViewWrite(x, &xv));
902   PetscFunctionReturn(0);
903 }
904 
905 /* Get a Kokkos View from a mat of type MatSeqAIJKokkos */
906 PetscErrorCode MatSeqAIJGetKokkosView(Mat A, ConstMatScalarKokkosView *kv) {
907   Mat_SeqAIJKokkos *aijkok;
908 
909   PetscFunctionBegin;
910   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
911   PetscValidPointer(kv, 2);
912   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
913   PetscCall(MatSeqAIJKokkosSyncDevice(A));
914   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
915   *kv    = aijkok->a_dual.view_device();
916   PetscFunctionReturn(0);
917 }
918 
919 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, ConstMatScalarKokkosView *kv) {
920   PetscFunctionBegin;
921   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
922   PetscValidPointer(kv, 2);
923   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
924   PetscFunctionReturn(0);
925 }
926 
927 PetscErrorCode MatSeqAIJGetKokkosView(Mat A, MatScalarKokkosView *kv) {
928   Mat_SeqAIJKokkos *aijkok;
929 
930   PetscFunctionBegin;
931   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
932   PetscValidPointer(kv, 2);
933   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
934   PetscCall(MatSeqAIJKokkosSyncDevice(A));
935   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
936   *kv    = aijkok->a_dual.view_device();
937   PetscFunctionReturn(0);
938 }
939 
940 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, MatScalarKokkosView *kv) {
941   PetscFunctionBegin;
942   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
943   PetscValidPointer(kv, 2);
944   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
945   PetscCall(MatSeqAIJKokkosModifyDevice(A));
946   PetscFunctionReturn(0);
947 }
948 
949 PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A, MatScalarKokkosView *kv) {
950   Mat_SeqAIJKokkos *aijkok;
951 
952   PetscFunctionBegin;
953   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
954   PetscValidPointer(kv, 2);
955   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
956   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
957   *kv    = aijkok->a_dual.view_device();
958   PetscFunctionReturn(0);
959 }
960 
961 PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A, MatScalarKokkosView *kv) {
962   PetscFunctionBegin;
963   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
964   PetscValidPointer(kv, 2);
965   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
966   PetscCall(MatSeqAIJKokkosModifyDevice(A));
967   PetscFunctionReturn(0);
968 }
969 
970 /* Computes Y += alpha X */
971 static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y, PetscScalar alpha, Mat X, MatStructure pattern) {
972   Mat_SeqAIJ              *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data;
973   Mat_SeqAIJKokkos        *xkok, *ykok, *zkok;
974   ConstMatScalarKokkosView Xa;
975   MatScalarKokkosView      Ya;
976 
977   PetscFunctionBegin;
978   PetscCheckTypeName(Y, MATSEQAIJKOKKOS);
979   PetscCheckTypeName(X, MATSEQAIJKOKKOS);
980   PetscCall(MatSeqAIJKokkosSyncDevice(Y));
981   PetscCall(MatSeqAIJKokkosSyncDevice(X));
982   PetscCall(PetscLogGpuTimeBegin());
983 
984   if (pattern != SAME_NONZERO_PATTERN && x->nz == y->nz) {
985     /* We could compare on device, but have to get the comparison result on host. So compare on host instead. */
986     PetscBool e;
987     PetscCall(PetscArraycmp(x->i, y->i, Y->rmap->n + 1, &e));
988     if (e) {
989       PetscCall(PetscArraycmp(x->j, y->j, y->nz, &e));
990       if (e) pattern = SAME_NONZERO_PATTERN;
991     }
992   }
993 
994   /* cusparseDcsrgeam2() computes C = alpha A + beta B. If one knew sparsity pattern of C, one can skip
995     cusparseScsrgeam2_bufferSizeExt() / cusparseXcsrgeam2Nnz(), and directly call cusparseScsrgeam2().
996     If X is SUBSET_NONZERO_PATTERN of Y, we could take advantage of this cusparse feature. However,
997     KokkosSparse::spadd(alpha,A,beta,B,C) has symbolic and numeric phases, MatAXPY does not.
998   */
999   ykok = static_cast<Mat_SeqAIJKokkos *>(Y->spptr);
1000   xkok = static_cast<Mat_SeqAIJKokkos *>(X->spptr);
1001   Xa   = xkok->a_dual.view_device();
1002   Ya   = ykok->a_dual.view_device();
1003 
1004   if (pattern == SAME_NONZERO_PATTERN) {
1005     KokkosBlas::axpy(alpha, Xa, Ya);
1006     PetscCall(MatSeqAIJKokkosModifyDevice(Y));
1007   } else if (pattern == SUBSET_NONZERO_PATTERN) {
1008     MatRowMapKokkosView Xi = xkok->i_dual.view_device(), Yi = ykok->i_dual.view_device();
1009     MatColIdxKokkosView Xj = xkok->j_dual.view_device(), Yj = ykok->j_dual.view_device();
1010 
1011     Kokkos::parallel_for(
1012       Kokkos::TeamPolicy<>(Y->rmap->n, 1), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
1013         PetscInt i = t.league_rank();              /* row i */
1014         Kokkos::single(Kokkos::PerTeam(t), [=]() { /* Only one thread works in a team */
1015                                                    PetscInt p, q = Yi(i);
1016                                                    for (p = Xi(i); p < Xi(i + 1); p++) {          /* For each nonzero on row i of X */
1017                                                      while (Xj(p) != Yj(q) && q < Yi(i + 1)) q++; /* find the matching nonzero on row i of Y */
1018                                                      if (Xj(p) == Yj(q)) {                        /* Found it */
1019                                                        Ya(q) += alpha * Xa(p);
1020                                                        q++;
1021                                                      } else {
1022                                                        /* If not found, it indicates the input is wrong (X is not a SUBSET_NONZERO_PATTERN of Y).
1023                Just insert a NaN at the beginning of row i if it is not empty, to make the result wrong.
1024             */
1025                                                        if (Yi(i) != Yi(i + 1))
1026                                                          Ya(Yi(i)) =
1027 #if PETSC_PKG_KOKKOS_VERSION_GE(3, 6, 99)
1028                                                            Kokkos::nan("1"); /* auto promote the double NaN if needed */
1029 #else
1030               Kokkos::Experimental::nan("1");
1031 #endif
1032                                                      }
1033                                                    }
1034         });
1035       });
1036     PetscCall(MatSeqAIJKokkosModifyDevice(Y));
1037   } else { /* different nonzero patterns */
1038     Mat             Z;
1039     KokkosCsrMatrix zcsr;
1040     KernelHandle    kh;
1041     kh.create_spadd_handle(false);
1042     KokkosSparse::spadd_symbolic(&kh, xkok->csrmat, ykok->csrmat, zcsr);
1043     KokkosSparse::spadd_numeric(&kh, alpha, xkok->csrmat, (PetscScalar)1.0, ykok->csrmat, zcsr);
1044     zkok = new Mat_SeqAIJKokkos(zcsr);
1045     PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, zkok, &Z));
1046     PetscCall(MatHeaderReplace(Y, &Z));
1047     kh.destroy_spadd_handle();
1048   }
1049   PetscCall(PetscLogGpuTimeEnd());
1050   PetscCall(PetscLogGpuFlops(xkok->a_dual.extent(0) * 2)); /* Because we scaled X and then added it to Y */
1051   PetscFunctionReturn(0);
1052 }
1053 
1054 static PetscErrorCode MatSetPreallocationCOO_SeqAIJKokkos(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[]) {
1055   Mat_SeqAIJKokkos *akok;
1056   Mat_SeqAIJ       *aseq;
1057 
1058   PetscFunctionBegin;
1059   PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, coo_i, coo_j));
1060   aseq = static_cast<Mat_SeqAIJ *>(mat->data);
1061   akok = static_cast<Mat_SeqAIJKokkos *>(mat->spptr);
1062   delete akok;
1063   mat->spptr = akok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, aseq->nz, aseq->i, aseq->j, aseq->a, mat->nonzerostate + 1, PETSC_FALSE);
1064   PetscCall(MatZeroEntries_SeqAIJKokkos(mat));
1065   akok->SetUpCOO(aseq);
1066   PetscFunctionReturn(0);
1067 }
1068 
1069 static PetscErrorCode MatSetValuesCOO_SeqAIJKokkos(Mat A, const PetscScalar v[], InsertMode imode) {
1070   Mat_SeqAIJ                 *aseq = static_cast<Mat_SeqAIJ *>(A->data);
1071   Mat_SeqAIJKokkos           *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1072   PetscCount                  Annz = aseq->nz;
1073   const PetscCountKokkosView &jmap = akok->jmap_d;
1074   const PetscCountKokkosView &perm = akok->perm_d;
1075   MatScalarKokkosView         Aa;
1076   ConstMatScalarKokkosView    kv;
1077   PetscMemType                memtype;
1078 
1079   PetscFunctionBegin;
1080   PetscCall(PetscGetMemType(v, &memtype));
1081   if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */
1082     kv = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), ConstMatScalarKokkosViewHost(v, aseq->coo_n));
1083   } else {
1084     kv = ConstMatScalarKokkosView(v, aseq->coo_n); /* Directly use v[]'s memory */
1085   }
1086 
1087   if (imode == INSERT_VALUES) PetscCall(MatSeqAIJGetKokkosViewWrite(A, &Aa)); /* write matrix values */
1088   else PetscCall(MatSeqAIJGetKokkosView(A, &Aa));                             /* read & write matrix values */
1089 
1090   Kokkos::parallel_for(
1091     Annz, KOKKOS_LAMBDA(const PetscCount i) {
1092       PetscScalar sum = 0.0;
1093       for (PetscCount k = jmap(i); k < jmap(i + 1); k++) sum += kv(perm(k));
1094       Aa(i) = (imode == INSERT_VALUES ? 0.0 : Aa(i)) + sum;
1095     });
1096 
1097   if (imode == INSERT_VALUES) PetscCall(MatSeqAIJRestoreKokkosViewWrite(A, &Aa));
1098   else PetscCall(MatSeqAIJRestoreKokkosView(A, &Aa));
1099   PetscFunctionReturn(0);
1100 }
1101 
1102 PETSC_INTERN PetscErrorCode MatSeqAIJMoveDiagonalValuesFront_SeqAIJKokkos(Mat A, const PetscInt *diag) {
1103   Mat_SeqAIJKokkos          *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1104   MatScalarKokkosView        Aa;
1105   const MatRowMapKokkosView &Ai = akok->i_dual.view_device();
1106   PetscInt                   m  = A->rmap->n;
1107   ConstMatRowMapKokkosView   Adiag(diag, m); /* diag is a device pointer */
1108 
1109   PetscFunctionBegin;
1110   PetscCall(MatSeqAIJGetKokkosViewWrite(A, &Aa));
1111   Kokkos::parallel_for(
1112     m, KOKKOS_LAMBDA(const PetscInt i) {
1113       PetscScalar tmp;
1114       if (Adiag(i) >= Ai(i) && Adiag(i) < Ai(i + 1)) { /* The diagonal element exists */
1115         tmp          = Aa(Ai(i));
1116         Aa(Ai(i))    = Aa(Adiag(i));
1117         Aa(Adiag(i)) = tmp;
1118       }
1119     });
1120   PetscCall(MatSeqAIJRestoreKokkosViewWrite(A, &Aa));
1121   PetscFunctionReturn(0);
1122 }
1123 
1124 static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info) {
1125   PetscFunctionBegin;
1126   PetscCall(MatSeqAIJKokkosSyncHost(A));
1127   PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info));
1128   B->offloadmask = PETSC_OFFLOAD_CPU;
1129   PetscFunctionReturn(0);
1130 }
1131 
1132 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A) {
1133   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
1134 
1135   PetscFunctionBegin;
1136   A->offloadmask = PETSC_OFFLOAD_KOKKOS; /* We do not really use this flag */
1137   A->boundtocpu  = PETSC_FALSE;
1138 
1139   A->ops->assemblyend               = MatAssemblyEnd_SeqAIJKokkos;
1140   A->ops->destroy                   = MatDestroy_SeqAIJKokkos;
1141   A->ops->duplicate                 = MatDuplicate_SeqAIJKokkos;
1142   A->ops->axpy                      = MatAXPY_SeqAIJKokkos;
1143   A->ops->scale                     = MatScale_SeqAIJKokkos;
1144   A->ops->zeroentries               = MatZeroEntries_SeqAIJKokkos;
1145   A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJKokkos;
1146   A->ops->mult                      = MatMult_SeqAIJKokkos;
1147   A->ops->multadd                   = MatMultAdd_SeqAIJKokkos;
1148   A->ops->multtranspose             = MatMultTranspose_SeqAIJKokkos;
1149   A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJKokkos;
1150   A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJKokkos;
1151   A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos;
1152   A->ops->productnumeric            = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos;
1153   A->ops->transpose                 = MatTranspose_SeqAIJKokkos;
1154   A->ops->setoption                 = MatSetOption_SeqAIJKokkos;
1155   A->ops->getdiagonal               = MatGetDiagonal_SeqAIJKokkos;
1156   a->ops->getarray                  = MatSeqAIJGetArray_SeqAIJKokkos;
1157   a->ops->restorearray              = MatSeqAIJRestoreArray_SeqAIJKokkos;
1158   a->ops->getarrayread              = MatSeqAIJGetArrayRead_SeqAIJKokkos;
1159   a->ops->restorearrayread          = MatSeqAIJRestoreArrayRead_SeqAIJKokkos;
1160   a->ops->getarraywrite             = MatSeqAIJGetArrayWrite_SeqAIJKokkos;
1161   a->ops->restorearraywrite         = MatSeqAIJRestoreArrayWrite_SeqAIJKokkos;
1162   a->ops->getcsrandmemtype          = MatSeqAIJGetCSRAndMemType_SeqAIJKokkos;
1163 
1164   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJKokkos));
1165   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJKokkos));
1166   PetscFunctionReturn(0);
1167 }
1168 
1169 PETSC_INTERN PetscErrorCode MatSetSeqAIJKokkosWithCSRMatrix(Mat A, Mat_SeqAIJKokkos *akok) {
1170   Mat_SeqAIJ *aseq;
1171   PetscInt    i, m, n;
1172 
1173   PetscFunctionBegin;
1174   PetscCheck(!A->spptr, PETSC_COMM_SELF, PETSC_ERR_PLIB, "A->spptr is supposed to be empty");
1175 
1176   m = akok->nrows();
1177   n = akok->ncols();
1178   PetscCall(MatSetSizes(A, m, n, m, n));
1179   PetscCall(MatSetType(A, MATSEQAIJKOKKOS));
1180 
1181   /* Set up data structures of A as a MATSEQAIJ */
1182   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(A, MAT_SKIP_ALLOCATION, NULL));
1183   aseq = (Mat_SeqAIJ *)(A)->data;
1184 
1185   akok->i_dual.sync_host(); /* We always need sync'ed i, j on host */
1186   akok->j_dual.sync_host();
1187 
1188   aseq->i            = akok->i_host_data();
1189   aseq->j            = akok->j_host_data();
1190   aseq->a            = akok->a_host_data();
1191   aseq->nonew        = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/
1192   aseq->singlemalloc = PETSC_FALSE;
1193   aseq->free_a       = PETSC_FALSE;
1194   aseq->free_ij      = PETSC_FALSE;
1195   aseq->nz           = akok->nnz();
1196   aseq->maxnz        = aseq->nz;
1197 
1198   PetscCall(PetscMalloc1(m, &aseq->imax));
1199   PetscCall(PetscMalloc1(m, &aseq->ilen));
1200   for (i = 0; i < m; i++) { aseq->ilen[i] = aseq->imax[i] = aseq->i[i + 1] - aseq->i[i]; }
1201 
1202   /* It is critical to set the nonzerostate, as we use it to check if sparsity pattern (hence data) has changed on host in MatAssemblyEnd */
1203   akok->nonzerostate = A->nonzerostate;
1204   A->spptr           = akok; /* Set A->spptr before MatAssembly so that A->spptr won't be allocated again there */
1205   PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY));
1206   PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY));
1207   PetscFunctionReturn(0);
1208 }
1209 
1210 /* Crete a SEQAIJKOKKOS matrix with a Mat_SeqAIJKokkos data structure
1211 
1212    Note we have names like MatSeqAIJSetPreallocationCSR, so I use capitalized CSR
1213  */
1214 PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithCSRMatrix(MPI_Comm comm, Mat_SeqAIJKokkos *akok, Mat *A) {
1215   PetscFunctionBegin;
1216   PetscCall(MatCreate(comm, A));
1217   PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok));
1218   PetscFunctionReturn(0);
1219 }
1220 
1221 /* --------------------------------------------------------------------------------*/
1222 /*@C
1223    MatCreateSeqAIJKokkos - Creates a sparse matrix in AIJ (compressed row) format
1224    (the default parallel PETSc format). This matrix will ultimately be handled by
1225    Kokkos for calculations. For good matrix
1226    assembly performance the user should preallocate the matrix storage by setting
1227    the parameter nz (or the array nnz).  By setting these parameters accurately,
1228    performance during matrix assembly can be increased by more than a factor of 50.
1229 
1230    Collective
1231 
1232    Input Parameters:
1233 +  comm - MPI communicator, set to PETSC_COMM_SELF
1234 .  m - number of rows
1235 .  n - number of columns
1236 .  nz - number of nonzeros per row (same for all rows)
1237 -  nnz - array containing the number of nonzeros in the various rows
1238          (possibly different for each row) or NULL
1239 
1240    Output Parameter:
1241 .  A - the matrix
1242 
1243    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
1244    MatXXXXSetPreallocation() paradgm instead of this routine directly.
1245    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
1246 
1247    Notes:
1248    If nnz is given then nz is ignored
1249 
1250    The AIJ format (also called the Yale sparse matrix format or
1251    compressed row storage), is fully compatible with standard Fortran 77
1252    storage.  That is, the stored row and column indices can begin at
1253    either one (as in Fortran) or zero.  See the users' manual for details.
1254 
1255    Specify the preallocated storage with either nz or nnz (not both).
1256    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
1257    allocation.  For large problems you MUST preallocate memory or you
1258    will get TERRIBLE performance, see the users' manual chapter on matrices.
1259 
1260    By default, this format uses inodes (identical nodes) when possible, to
1261    improve numerical efficiency of matrix-vector products and solves. We
1262    search for consecutive rows with the same nonzero structure, thereby
1263    reusing matrix information to achieve increased efficiency.
1264 
1265    Level: intermediate
1266 
1267 .seealso: `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MatCreateAIJ()`
1268 @*/
1269 PetscErrorCode MatCreateSeqAIJKokkos(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A) {
1270   PetscFunctionBegin;
1271   PetscCall(PetscKokkosInitializeCheck());
1272   PetscCall(MatCreate(comm, A));
1273   PetscCall(MatSetSizes(*A, m, n, m, n));
1274   PetscCall(MatSetType(*A, MATSEQAIJKOKKOS));
1275   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz));
1276   PetscFunctionReturn(0);
1277 }
1278 
1279 typedef Kokkos::TeamPolicy<>::member_type team_member;
1280 //
1281 // This factorization exploits block diagonal matrices with "Nf" (not used).
1282 // Use -pc_factor_mat_ordering_type rcm to order decouple blocks of size N/Nf for this optimization
1283 //
1284 static PetscErrorCode                     MatLUFactorNumeric_SeqAIJKOKKOSDEVICE(Mat B, Mat A, const MatFactorInfo *info) {
1285                       Mat_SeqAIJ       *b      = (Mat_SeqAIJ *)B->data;
1286                       Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr), *baijkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
1287                       IS                isrow = b->row, isicol = b->icol;
1288                       const PetscInt   *r_h, *ic_h;
1289                       const PetscInt n = A->rmap->n, *ai_d = aijkok->i_dual.view_device().data(), *aj_d = aijkok->j_dual.view_device().data(), *bi_d = baijkok->i_dual.view_device().data(), *bj_d = baijkok->j_dual.view_device().data(), *bdiag_d = baijkok->diag_d.data();
1290                       const PetscScalar *aa_d = aijkok->a_dual.view_device().data();
1291                       PetscScalar       *ba_d = baijkok->a_dual.view_device().data();
1292                       PetscBool          row_identity, col_identity;
1293                       PetscInt           nc, Nf = 1, nVec = 32; // should be a parameter, Nf is batch size - not used
1294 
1295                       PetscFunctionBegin;
1296                       PetscCheck(A->rmap->n == n, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "square matrices only supported %" PetscInt_FMT " %" PetscInt_FMT, A->rmap->n, n);
1297                       PetscCall(MatIsStructurallySymmetric(A, &row_identity));
1298                       PetscCheck(row_identity, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "structurally symmetric matrices only supported");
1299                       PetscCall(ISGetIndices(isrow, &r_h));
1300                       PetscCall(ISGetIndices(isicol, &ic_h));
1301                       PetscCall(ISGetSize(isicol, &nc));
1302                       PetscCall(PetscLogGpuTimeBegin());
1303                       PetscCall(MatSeqAIJKokkosSyncDevice(A));
1304                       {
1305 #define KOKKOS_SHARED_LEVEL 1
1306     using scr_mem_t    = Kokkos::DefaultExecutionSpace::scratch_memory_space;
1307     using sizet_scr_t  = Kokkos::View<size_t, scr_mem_t>;
1308     using scalar_scr_t = Kokkos::View<PetscScalar, scr_mem_t>;
1309     const Kokkos::View<const PetscInt *, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged>> h_r_k(r_h, n);
1310     Kokkos::View<PetscInt *, Kokkos::LayoutLeft>                                                                         d_r_k("r", n);
1311     const Kokkos::View<const PetscInt *, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged>> h_ic_k(ic_h, nc);
1312     Kokkos::View<PetscInt *, Kokkos::LayoutLeft>                                                                         d_ic_k("ic", nc);
1313     size_t                                                                                                               flops_h = 0.0;
1314     Kokkos::View<size_t, Kokkos::HostSpace>                                                                              h_flops_k(&flops_h);
1315     Kokkos::View<size_t>                                                                                                 d_flops_k("flops");
1316     const int                                                                                                            conc = Kokkos::DefaultExecutionSpace().concurrency(), team_size = conc > 1 ? 16 : 1; // 8*32 = 256
1317     const int                                                                                                            nloc = n / Nf, Ni = (conc > 8) ? 1 /* some intelegent number of SMs -- but need league_barrier */ : 1;
1318     Kokkos::deep_copy(d_flops_k, h_flops_k);
1319     Kokkos::deep_copy(d_r_k, h_r_k);
1320     Kokkos::deep_copy(d_ic_k, h_ic_k);
1321     // Fill A --> fact
1322     Kokkos::parallel_for(
1323       Kokkos::TeamPolicy<>(Nf * Ni, team_size, nVec), KOKKOS_LAMBDA(const team_member team) {
1324         const PetscInt  field = team.league_rank() / Ni, field_block = team.league_rank() % Ni; // use grid.x/y in CUDA
1325         const PetscInt  nloc_i = (nloc / Ni + !!(nloc % Ni)), start_i = field * nloc + field_block * nloc_i, end_i = (start_i + nloc_i) > (field + 1) * nloc ? (field + 1) * nloc : (start_i + nloc_i);
1326         const PetscInt *ic = d_ic_k.data(), *r = d_r_k.data();
1327         // zero rows of B
1328         Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=](const int &rowb) {
1329           PetscInt     nzbL = bi_d[rowb + 1] - bi_d[rowb], nzbU = bdiag_d[rowb] - bdiag_d[rowb + 1]; // with diag
1330           PetscScalar *baL = ba_d + bi_d[rowb];
1331           PetscScalar *baU = ba_d + bdiag_d[rowb + 1] + 1;
1332           /* zero (unfactored row) */
1333           for (int j = 0; j < nzbL; j++) baL[j] = 0;
1334           for (int j = 0; j < nzbU; j++) baU[j] = 0;
1335         });
1336         // copy A into B
1337         Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=](const int &rowb) {
1338           PetscInt           rowa = r[rowb], nza = ai_d[rowa + 1] - ai_d[rowa];
1339           const PetscScalar *av    = aa_d + ai_d[rowa];
1340           const PetscInt    *ajtmp = aj_d + ai_d[rowa];
1341           /* load in initial (unfactored row) */
1342           for (int j = 0; j < nza; j++) {
1343             PetscInt    colb = ic[ajtmp[j]];
1344             PetscScalar vala = av[j];
1345             if (colb == rowb) {
1346               *(ba_d + bdiag_d[rowb]) = vala;
1347             } else {
1348               const PetscInt *pbj = bj_d + ((colb > rowb) ? bdiag_d[rowb + 1] + 1 : bi_d[rowb]);
1349               PetscScalar    *pba = ba_d + ((colb > rowb) ? bdiag_d[rowb + 1] + 1 : bi_d[rowb]);
1350               PetscInt        nz = (colb > rowb) ? bdiag_d[rowb] - (bdiag_d[rowb + 1] + 1) : bi_d[rowb + 1] - bi_d[rowb], set = 0;
1351               for (int j = 0; j < nz; j++) {
1352                 if (pbj[j] == colb) {
1353                   pba[j] = vala;
1354                   set++;
1355                   break;
1356                 }
1357               }
1358 #if !defined(PETSC_HAVE_SYCL)
1359               if (set != 1) printf("\t\t\t ERROR DID NOT SET ?????\n");
1360 #endif
1361             }
1362           }
1363         });
1364       });
1365     Kokkos::fence();
1366 
1367     Kokkos::parallel_for(
1368       Kokkos::TeamPolicy<>(Nf * Ni, team_size, nVec).set_scratch_size(KOKKOS_SHARED_LEVEL, Kokkos::PerThread(sizet_scr_t::shmem_size() + scalar_scr_t::shmem_size()), Kokkos::PerTeam(sizet_scr_t::shmem_size())), KOKKOS_LAMBDA(const team_member team) {
1369         sizet_scr_t    colkIdx(team.thread_scratch(KOKKOS_SHARED_LEVEL));
1370         scalar_scr_t   L_ki(team.thread_scratch(KOKKOS_SHARED_LEVEL));
1371         sizet_scr_t    flops(team.team_scratch(KOKKOS_SHARED_LEVEL));
1372         const PetscInt field = team.league_rank() / Ni, field_block_idx = team.league_rank() % Ni; // use grid.x/y in CUDA
1373         const PetscInt start = field * nloc, end = start + nloc;
1374         Kokkos::single(Kokkos::PerTeam(team), [=]() { flops() = 0; });
1375         // A22 panel update for each row A(1,:) and col A(:,1)
1376         for (int ii = start; ii < end - 1; ii++) {
1377           const PetscInt    *bjUi = bj_d + bdiag_d[ii + 1] + 1, nzUi = bdiag_d[ii] - (bdiag_d[ii + 1] + 1); // vector, and vector size, of column indices of U(i,(i+1):end)
1378           const PetscScalar *baUi    = ba_d + bdiag_d[ii + 1] + 1;                                          // vector of data  U(i,i+1:end)
1379           const PetscInt     nUi_its = nzUi / Ni + !!(nzUi % Ni);
1380           const PetscScalar  Bii     = *(ba_d + bdiag_d[ii]); // diagonal in its special place
1381           Kokkos::parallel_for(Kokkos::TeamThreadRange(team, nUi_its), [=](const int j) {
1382             PetscInt kIdx = j * Ni + field_block_idx;
1383             if (kIdx >= nzUi) /* void */
1384               ;
1385             else {
1386               const PetscInt  myk = bjUi[kIdx];                // assume symmetric structure, need a transposed meta-data here in general
1387               const PetscInt *pjL = bj_d + bi_d[myk];          // look for L(myk,ii) in start of row
1388               const PetscInt  nzL = bi_d[myk + 1] - bi_d[myk]; // size of L_k(:)
1389               size_t          st_idx;
1390               // find and do L(k,i) = A(:k,i) / A(i,i)
1391               Kokkos::single(Kokkos::PerThread(team), [&]() { colkIdx() = PETSC_MAX_INT; });
1392               // get column, there has got to be a better way
1393               Kokkos::parallel_reduce(
1394                 Kokkos::ThreadVectorRange(team, nzL),
1395                 [&](const int &j, size_t &idx) {
1396                   if (pjL[j] == ii) {
1397                     PetscScalar *pLki = ba_d + bi_d[myk] + j;
1398                     idx               = j;           // output
1399                     *pLki             = *pLki / Bii; // column scaling:  L(k,i) = A(:k,i) / A(i,i)
1400                   }
1401                 },
1402                 st_idx);
1403               Kokkos::single(Kokkos::PerThread(team), [=]() {
1404                 colkIdx() = st_idx;
1405                 L_ki()    = *(ba_d + bi_d[myk] + st_idx);
1406               });
1407 #if defined(PETSC_USE_DEBUG) && !defined(PETSC_HAVE_SYCL)
1408               if (colkIdx() == PETSC_MAX_INT) printf("\t\t\t\t\t\t\tERROR: failed to find L_ki(%d,%d)\n", (int)myk, ii); // uses a register
1409 #endif
1410               // active row k, do  A_kj -= Lki * U_ij; j \in U(i,:) j != i
1411               // U(i+1,:end)
1412               Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, nzUi), [=](const int &uiIdx) { // index into i (U)
1413                 PetscScalar Uij = baUi[uiIdx];
1414                 PetscInt    col = bjUi[uiIdx];
1415                 if (col == myk) {
1416                   // A_kk = A_kk - L_ki * U_ij(k)
1417                   PetscScalar *Akkv = (ba_d + bdiag_d[myk]); // diagonal in its special place
1418                   *Akkv             = *Akkv - L_ki() * Uij;  // UiK
1419                 } else {
1420                   PetscScalar    *start, *end, *pAkjv = NULL;
1421                   PetscInt        high, low;
1422                   const PetscInt *startj;
1423                   if (col < myk) { // L
1424                     PetscScalar *pLki = ba_d + bi_d[myk] + colkIdx();
1425                     PetscInt     idx  = (pLki + 1) - (ba_d + bi_d[myk]); // index into row
1426                     start             = pLki + 1;                        // start at pLki+1, A22(myk,1)
1427                     startj            = bj_d + bi_d[myk] + idx;
1428                     end               = ba_d + bi_d[myk + 1];
1429                   } else {
1430                     PetscInt idx = bdiag_d[myk + 1] + 1;
1431                     start        = ba_d + idx;
1432                     startj       = bj_d + idx;
1433                     end          = ba_d + bdiag_d[myk];
1434                   }
1435                   // search for 'col', use bisection search - TODO
1436                   low  = 0;
1437                   high = (PetscInt)(end - start);
1438                   while (high - low > 5) {
1439                     int t = (low + high) / 2;
1440                     if (startj[t] > col) high = t;
1441                     else low = t;
1442                   }
1443                   for (pAkjv = start + low; pAkjv < start + high; pAkjv++) {
1444                     if (startj[pAkjv - start] == col) break;
1445                   }
1446 #if defined(PETSC_USE_DEBUG) && !defined(PETSC_HAVE_SYCL)
1447                   if (pAkjv == start + high) printf("\t\t\t\t\t\t\t\t\t\t\tERROR: *** failed to find Akj(%d,%d)\n", (int)myk, (int)col); // uses a register
1448 #endif
1449                   *pAkjv = *pAkjv - L_ki() * Uij; // A_kj = A_kj - L_ki * U_ij
1450                 }
1451               });
1452             }
1453           });
1454           team.team_barrier(); // this needs to be a league barrier to use more that one SM per block
1455           if (field_block_idx == 0) Kokkos::single(Kokkos::PerTeam(team), [&]() { Kokkos::atomic_add(flops.data(), (size_t)(2 * (nzUi * nzUi) + 2)); });
1456         } /* endof for (i=0; i<n; i++) { */
1457         Kokkos::single(Kokkos::PerTeam(team), [=]() {
1458           Kokkos::atomic_add(&d_flops_k(), flops());
1459           flops() = 0;
1460         });
1461       });
1462     Kokkos::fence();
1463     Kokkos::deep_copy(h_flops_k, d_flops_k);
1464     PetscCall(PetscLogGpuFlops((PetscLogDouble)h_flops_k()));
1465     Kokkos::parallel_for(
1466       Kokkos::TeamPolicy<>(Nf * Ni, 1, 256), KOKKOS_LAMBDA(const team_member team) {
1467         const PetscInt lg_rank = team.league_rank(), field = lg_rank / Ni;                            //, field_offset = lg_rank%Ni;
1468         const PetscInt start = field * nloc, end = start + nloc, n_its = (nloc / Ni + !!(nloc % Ni)); // 1/Ni iters
1469         /* Invert diagonal for simpler triangular solves */
1470         Kokkos::parallel_for(Kokkos::TeamVectorRange(team, n_its), [=](int outer_index) {
1471           int i = start + outer_index * Ni + lg_rank % Ni;
1472           if (i < end) {
1473             PetscScalar *pv = ba_d + bdiag_d[i];
1474             *pv             = 1.0 / (*pv);
1475           }
1476         });
1477       });
1478   }
1479   PetscCall(PetscLogGpuTimeEnd());
1480   PetscCall(ISRestoreIndices(isicol, &ic_h));
1481   PetscCall(ISRestoreIndices(isrow, &r_h));
1482 
1483   PetscCall(ISIdentity(isrow, &row_identity));
1484   PetscCall(ISIdentity(isicol, &col_identity));
1485   if (b->inode.size) {
1486     B->ops->solve = MatSolve_SeqAIJ_Inode;
1487   } else if (row_identity && col_identity) {
1488     B->ops->solve = MatSolve_SeqAIJ_NaturalOrdering;
1489   } else {
1490     B->ops->solve = MatSolve_SeqAIJ; // at least this needs to be in Kokkos
1491   }
1492   B->offloadmask = PETSC_OFFLOAD_GPU;
1493   PetscCall(MatSeqAIJKokkosSyncHost(B));          // solve on CPU
1494   B->ops->solveadd          = MatSolveAdd_SeqAIJ; // and this
1495   B->ops->solvetranspose    = MatSolveTranspose_SeqAIJ;
1496   B->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ;
1497   B->ops->matsolve          = MatMatSolve_SeqAIJ;
1498   B->assembled              = PETSC_TRUE;
1499   B->preallocated           = PETSC_TRUE;
1500 
1501   PetscFunctionReturn(0);
1502 }
1503 
1504 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info) {
1505   PetscFunctionBegin;
1506   PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
1507   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos;
1508   PetscFunctionReturn(0);
1509 }
1510 
1511 static PetscErrorCode MatSeqAIJKokkosSymbolicSolveCheck(Mat A) {
1512   Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1513 
1514   PetscFunctionBegin;
1515   if (!factors->sptrsv_symbolic_completed) {
1516     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khU, factors->iU_d, factors->jU_d, factors->aU_d);
1517     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khL, factors->iL_d, factors->jL_d, factors->aL_d);
1518     factors->sptrsv_symbolic_completed = PETSC_TRUE;
1519   }
1520   PetscFunctionReturn(0);
1521 }
1522 
1523 /* Check if we need to update factors etc for transpose solve */
1524 static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A) {
1525   Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1526   MatColIdxType               n       = A->rmap->n;
1527 
1528   PetscFunctionBegin;
1529   if (!factors->transpose_updated) { /* TODO: KK needs to provide functions to do numeric transpose only */
1530     /* Update L^T and do sptrsv symbolic */
1531     factors->iLt_d = MatRowMapKokkosView("factors->iLt_d", n + 1);
1532     Kokkos::deep_copy(factors->iLt_d, 0); /* KK requires 0 */
1533     factors->jLt_d = MatColIdxKokkosView("factors->jLt_d", factors->jL_d.extent(0));
1534     factors->aLt_d = MatScalarKokkosView("factors->aLt_d", factors->aL_d.extent(0));
1535 
1536     transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iL_d, factors->jL_d, factors->aL_d,
1537                                                                                                                                                                                                               factors->iLt_d, factors->jLt_d, factors->aLt_d);
1538 
1539     /* TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices.
1540       We have to sort the indices, until KK provides finer control options.
1541     */
1542     sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iLt_d, factors->jLt_d, factors->aLt_d);
1543 
1544     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d);
1545 
1546     /* Update U^T and do sptrsv symbolic */
1547     factors->iUt_d = MatRowMapKokkosView("factors->iUt_d", n + 1);
1548     Kokkos::deep_copy(factors->iUt_d, 0); /* KK requires 0 */
1549     factors->jUt_d = MatColIdxKokkosView("factors->jUt_d", factors->jU_d.extent(0));
1550     factors->aUt_d = MatScalarKokkosView("factors->aUt_d", factors->aU_d.extent(0));
1551 
1552     transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iU_d, factors->jU_d, factors->aU_d,
1553                                                                                                                                                                                                               factors->iUt_d, factors->jUt_d, factors->aUt_d);
1554 
1555     /* Sort indices. See comments above */
1556     sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iUt_d, factors->jUt_d, factors->aUt_d);
1557 
1558     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d);
1559     factors->transpose_updated = PETSC_TRUE;
1560   }
1561   PetscFunctionReturn(0);
1562 }
1563 
1564 /* Solve Ax = b, with A = LU */
1565 static PetscErrorCode MatSolve_SeqAIJKokkos(Mat A, Vec b, Vec x) {
1566   ConstPetscScalarKokkosView  bv;
1567   PetscScalarKokkosView       xv;
1568   Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1569 
1570   PetscFunctionBegin;
1571   PetscCall(PetscLogGpuTimeBegin());
1572   PetscCall(MatSeqAIJKokkosSymbolicSolveCheck(A));
1573   PetscCall(VecGetKokkosView(b, &bv));
1574   PetscCall(VecGetKokkosViewWrite(x, &xv));
1575   /* Solve L tmpv = b */
1576   PetscCallCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khL, factors->iL_d, factors->jL_d, factors->aL_d, bv, factors->workVector));
1577   /* Solve Ux = tmpv */
1578   PetscCallCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khU, factors->iU_d, factors->jU_d, factors->aU_d, factors->workVector, xv));
1579   PetscCall(VecRestoreKokkosView(b, &bv));
1580   PetscCall(VecRestoreKokkosViewWrite(x, &xv));
1581   PetscCall(PetscLogGpuTimeEnd());
1582   PetscFunctionReturn(0);
1583 }
1584 
1585 /* Solve A^T x = b, where A^T = U^T L^T */
1586 static PetscErrorCode MatSolveTranspose_SeqAIJKokkos(Mat A, Vec b, Vec x) {
1587   ConstPetscScalarKokkosView  bv;
1588   PetscScalarKokkosView       xv;
1589   Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1590 
1591   PetscFunctionBegin;
1592   PetscCall(PetscLogGpuTimeBegin());
1593   PetscCall(MatSeqAIJKokkosTransposeSolveCheck(A));
1594   PetscCall(VecGetKokkosView(b, &bv));
1595   PetscCall(VecGetKokkosViewWrite(x, &xv));
1596   /* Solve U^T tmpv = b */
1597   KokkosSparse::Experimental::sptrsv_solve(&factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d, bv, factors->workVector);
1598 
1599   /* Solve L^T x = tmpv */
1600   KokkosSparse::Experimental::sptrsv_solve(&factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d, factors->workVector, xv);
1601   PetscCall(VecRestoreKokkosView(b, &bv));
1602   PetscCall(VecRestoreKokkosViewWrite(x, &xv));
1603   PetscCall(PetscLogGpuTimeEnd());
1604   PetscFunctionReturn(0);
1605 }
1606 
1607 static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info) {
1608   Mat_SeqAIJKokkos           *aijkok   = (Mat_SeqAIJKokkos *)A->spptr;
1609   Mat_SeqAIJKokkosTriFactors *factors  = (Mat_SeqAIJKokkosTriFactors *)B->spptr;
1610   PetscInt                    fill_lev = info->levels;
1611 
1612   PetscFunctionBegin;
1613   PetscCall(PetscLogGpuTimeBegin());
1614   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1615 
1616   auto a_d = aijkok->a_dual.view_device();
1617   auto i_d = aijkok->i_dual.view_device();
1618   auto j_d = aijkok->j_dual.view_device();
1619 
1620   KokkosSparse::Experimental::spiluk_numeric(&factors->kh, fill_lev, i_d, j_d, a_d, factors->iL_d, factors->jL_d, factors->aL_d, factors->iU_d, factors->jU_d, factors->aU_d);
1621 
1622   B->assembled              = PETSC_TRUE;
1623   B->preallocated           = PETSC_TRUE;
1624   B->ops->solve             = MatSolve_SeqAIJKokkos;
1625   B->ops->solvetranspose    = MatSolveTranspose_SeqAIJKokkos;
1626   B->ops->matsolve          = NULL;
1627   B->ops->matsolvetranspose = NULL;
1628   B->offloadmask            = PETSC_OFFLOAD_GPU;
1629 
1630   /* Once the factors' value changed, we need to update their transpose and sptrsv handle */
1631   factors->transpose_updated         = PETSC_FALSE;
1632   factors->sptrsv_symbolic_completed = PETSC_FALSE;
1633   /* TODO: log flops, but how to know that? */
1634   PetscCall(PetscLogGpuTimeEnd());
1635   PetscFunctionReturn(0);
1636 }
1637 
1638 static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info) {
1639   Mat_SeqAIJKokkos           *aijkok;
1640   Mat_SeqAIJ                 *b;
1641   Mat_SeqAIJKokkosTriFactors *factors  = (Mat_SeqAIJKokkosTriFactors *)B->spptr;
1642   PetscInt                    fill_lev = info->levels;
1643   PetscInt                    nnzA     = ((Mat_SeqAIJ *)A->data)->nz, nnzL, nnzU;
1644   PetscInt                    n        = A->rmap->n;
1645 
1646   PetscFunctionBegin;
1647   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1648   /* Rebuild factors */
1649   if (factors) {
1650     factors->Destroy();
1651   } /* Destroy the old if it exists */
1652   else {
1653     B->spptr = factors = new Mat_SeqAIJKokkosTriFactors(n);
1654   }
1655 
1656   /* Create a spiluk handle and then do symbolic factorization */
1657   nnzL = nnzU = PetscRealIntMultTruncate(info->fill, nnzA);
1658   factors->kh.create_spiluk_handle(KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1, n, nnzL, nnzU);
1659 
1660   auto spiluk_handle = factors->kh.get_spiluk_handle();
1661 
1662   Kokkos::realloc(factors->iL_d, n + 1); /* Free old arrays and realloc */
1663   Kokkos::realloc(factors->jL_d, spiluk_handle->get_nnzL());
1664   Kokkos::realloc(factors->iU_d, n + 1);
1665   Kokkos::realloc(factors->jU_d, spiluk_handle->get_nnzU());
1666 
1667   aijkok   = (Mat_SeqAIJKokkos *)A->spptr;
1668   auto i_d = aijkok->i_dual.view_device();
1669   auto j_d = aijkok->j_dual.view_device();
1670   KokkosSparse::Experimental::spiluk_symbolic(&factors->kh, fill_lev, i_d, j_d, factors->iL_d, factors->jL_d, factors->iU_d, factors->jU_d);
1671   /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */
1672 
1673   Kokkos::resize(factors->jL_d, spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */
1674   Kokkos::resize(factors->jU_d, spiluk_handle->get_nnzU());
1675   Kokkos::realloc(factors->aL_d, spiluk_handle->get_nnzL()); /* No need to retain old value */
1676   Kokkos::realloc(factors->aU_d, spiluk_handle->get_nnzU());
1677 
1678   /* TODO: add options to select sptrsv algorithms */
1679   /* Create sptrsv handles for L, U and their transpose */
1680 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
1681   auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SPTRSV_CUSPARSE;
1682 #else
1683   auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1;
1684 #endif
1685 
1686   factors->khL.create_sptrsv_handle(sptrsv_alg, n, true /* L is lower tri */);
1687   factors->khU.create_sptrsv_handle(sptrsv_alg, n, false /* U is not lower tri */);
1688   factors->khLt.create_sptrsv_handle(sptrsv_alg, n, false /* L^T is not lower tri */);
1689   factors->khUt.create_sptrsv_handle(sptrsv_alg, n, true /* U^T is lower tri */);
1690 
1691   /* Fill fields of the factor matrix B */
1692   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(B, MAT_SKIP_ALLOCATION, NULL));
1693   b     = (Mat_SeqAIJ *)B->data;
1694   b->nz = b->maxnz          = spiluk_handle->get_nnzL() + spiluk_handle->get_nnzU();
1695   B->info.fill_ratio_given  = info->fill;
1696   B->info.fill_ratio_needed = ((PetscReal)b->nz) / ((PetscReal)nnzA);
1697 
1698   B->offloadmask          = PETSC_OFFLOAD_GPU;
1699   B->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJKokkos;
1700   PetscFunctionReturn(0);
1701 }
1702 
1703 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info) {
1704   Mat_SeqAIJ    *b     = (Mat_SeqAIJ *)B->data;
1705   const PetscInt nrows = A->rmap->n;
1706 
1707   PetscFunctionBegin;
1708   PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
1709   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKOKKOSDEVICE;
1710   // move B data into Kokkos
1711   PetscCall(MatSeqAIJKokkosSyncDevice(B)); // create aijkok
1712   PetscCall(MatSeqAIJKokkosSyncDevice(A)); // create aijkok
1713   {
1714     Mat_SeqAIJKokkos *baijkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
1715     if (!baijkok->diag_d.extent(0)) {
1716       const Kokkos::View<PetscInt *, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged>> h_diag(b->diag, nrows + 1);
1717       baijkok->diag_d = Kokkos::View<PetscInt *>(Kokkos::create_mirror(DefaultMemorySpace(), h_diag));
1718       Kokkos::deep_copy(baijkok->diag_d, h_diag);
1719     }
1720   }
1721   PetscFunctionReturn(0);
1722 }
1723 
1724 static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos(Mat A, MatSolverType *type) {
1725   PetscFunctionBegin;
1726   *type = MATSOLVERKOKKOS;
1727   PetscFunctionReturn(0);
1728 }
1729 
1730 static PetscErrorCode MatFactorGetSolverType_seqaij_kokkos_device(Mat A, MatSolverType *type) {
1731   PetscFunctionBegin;
1732   *type = MATSOLVERKOKKOSDEVICE;
1733   PetscFunctionReturn(0);
1734 }
1735 
1736 /*MC
1737   MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices
1738   on a single GPU of type, SeqAIJKokkos, AIJKokkos.
1739 
1740   Level: beginner
1741 
1742 .seealso: `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJKokkos()`, `MATAIJKOKKOS`, `MatKokkosSetFormat()`, `MatKokkosStorageFormat`, `MatKokkosFormatOperation`
1743 M*/
1744 PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A, MatFactorType ftype, Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */
1745 {
1746   PetscInt n = A->rmap->n;
1747 
1748   PetscFunctionBegin;
1749   PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B));
1750   PetscCall(MatSetSizes(*B, n, n, n, n));
1751   (*B)->factortype = ftype;
1752   PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU]));
1753   PetscCall(MatSetType(*B, MATSEQAIJKOKKOS));
1754 
1755   if (ftype == MAT_FACTOR_LU) {
1756     PetscCall(MatSetBlockSizesFromMats(*B, A, A));
1757     (*B)->canuseordering        = PETSC_TRUE;
1758     (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKokkos;
1759   } else if (ftype == MAT_FACTOR_ILU) {
1760     PetscCall(MatSetBlockSizesFromMats(*B, A, A));
1761     (*B)->canuseordering         = PETSC_FALSE;
1762     (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos;
1763   } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]);
1764 
1765   PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL));
1766   PetscCall(PetscObjectComposeFunction((PetscObject)(*B), "MatFactorGetSolverType_C", MatFactorGetSolverType_SeqAIJKokkos));
1767   PetscFunctionReturn(0);
1768 }
1769 
1770 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijkokkos_kokkos_device(Mat A, MatFactorType ftype, Mat *B) {
1771   PetscInt n = A->rmap->n;
1772 
1773   PetscFunctionBegin;
1774   PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B));
1775   PetscCall(MatSetSizes(*B, n, n, n, n));
1776   (*B)->factortype     = ftype;
1777   (*B)->canuseordering = PETSC_TRUE;
1778   PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU]));
1779   PetscCall(MatSetType(*B, MATSEQAIJKOKKOS));
1780 
1781   if (ftype == MAT_FACTOR_LU) {
1782     PetscCall(MatSetBlockSizesFromMats(*B, A, A));
1783     (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE;
1784   } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Factor type not supported for KOKKOS Matrix Types");
1785 
1786   PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL));
1787   PetscCall(PetscObjectComposeFunction((PetscObject)(*B), "MatFactorGetSolverType_C", MatFactorGetSolverType_seqaij_kokkos_device));
1788   PetscFunctionReturn(0);
1789 }
1790 
1791 PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_KOKKOS(void) {
1792   PetscFunctionBegin;
1793   PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_LU, MatGetFactor_SeqAIJKokkos_Kokkos));
1794   PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_ILU, MatGetFactor_SeqAIJKokkos_Kokkos));
1795   PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOSDEVICE, MATSEQAIJKOKKOS, MAT_FACTOR_LU, MatGetFactor_seqaijkokkos_kokkos_device));
1796   PetscFunctionReturn(0);
1797 }
1798 
1799 /* Utility to print out a KokkosCsrMatrix for debugging */
1800 PETSC_INTERN PetscErrorCode PrintCsrMatrix(const KokkosCsrMatrix &csrmat) {
1801   const auto        &iv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), csrmat.graph.row_map);
1802   const auto        &jv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), csrmat.graph.entries);
1803   const auto        &av = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), csrmat.values);
1804   const PetscInt    *i  = iv.data();
1805   const PetscInt    *j  = jv.data();
1806   const PetscScalar *a  = av.data();
1807   PetscInt           m = csrmat.numRows(), n = csrmat.numCols(), nnz = csrmat.nnz();
1808 
1809   PetscFunctionBegin;
1810   PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT " x %" PetscInt_FMT " SeqAIJKokkos, with %" PetscInt_FMT " nonzeros\n", m, n, nnz));
1811   for (PetscInt k = 0; k < m; k++) {
1812     PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT ": ", k));
1813     for (PetscInt p = i[k]; p < i[k + 1]; p++) { PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT "(%.1f), ", j[p], (double)PetscRealPart(a[p]))); }
1814     PetscCall(PetscPrintf(PETSC_COMM_SELF, "\n"));
1815   }
1816   PetscFunctionReturn(0);
1817 }
1818