xref: /petsc/src/mat/impls/aij/seq/kokkos/aijkok.kokkos.cxx (revision b6e6beb40ac0a73ccc19b70153df96d5058dce43)
1 #include <petscvec_kokkos.hpp>
2 #include <petscpkg_version.h>
3 #include <petsc/private/petscimpl.h>
4 #include <petsc/private/sfimpl.h>
5 #include <petscsystypes.h>
6 #include <petscerror.h>
7 
8 #include <Kokkos_Core.hpp>
9 #include <KokkosBlas.hpp>
10 #include <KokkosSparse_CrsMatrix.hpp>
11 #include <KokkosSparse_spmv.hpp>
12 #include <KokkosSparse_spiluk.hpp>
13 #include <KokkosSparse_sptrsv.hpp>
14 #include <KokkosSparse_spgemm.hpp>
15 #include <KokkosSparse_spadd.hpp>
16 
17 #include <../src/mat/impls/aij/seq/kokkos/aijkok.hpp>
18 
19 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_GE(3, 6, 99)
20   #include <KokkosSparse_Utils.hpp>
21 using KokkosSparse::sort_crs_matrix;
22 using KokkosSparse::Impl::transpose_matrix;
23 #else
24   #include <KokkosKernels_Sorting.hpp>
25 using KokkosKernels::sort_crs_matrix;
26 using KokkosKernels::Impl::transpose_matrix;
27 #endif
28 
29 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */
30 
31 /* MatAssemblyEnd_SeqAIJKokkos() happens when we finalized nonzeros of the matrix, either after
32    we assembled the matrix on host, or after we directly produced the matrix data on device (ex., through MatMatMult).
33    In the latter case, it is important to set a_dual's sync state correctly.
34  */
35 static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A, MatAssemblyType mode)
36 {
37   Mat_SeqAIJ       *aijseq;
38   Mat_SeqAIJKokkos *aijkok;
39 
40   PetscFunctionBegin;
41   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS);
42   PetscCall(MatAssemblyEnd_SeqAIJ(A, mode));
43 
44   aijseq = static_cast<Mat_SeqAIJ *>(A->data);
45   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
46 
47   /* If aijkok does not exist, we just copy i, j to device.
48      If aijkok already exists, but the device's nonzero pattern does not match with the host's, we assume the latest data is on host.
49      In both cases, we build a new aijkok structure.
50   */
51   if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { /* aijkok might not exist yet or nonzero pattern has changed */
52     delete aijkok;
53     aijkok   = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aijseq->nz, aijseq->i, aijseq->j, aijseq->a, A->nonzerostate, PETSC_FALSE /*don't copy mat values to device*/);
54     A->spptr = aijkok;
55   }
56 
57   if (aijkok->device_mat_d.data()) {
58     A->offloadmask = PETSC_OFFLOAD_GPU; // in GPU mode, no going back. MatSetValues checks this
59   }
60   PetscFunctionReturn(PETSC_SUCCESS);
61 }
62 
63 /* Sync CSR data to device if not yet */
64 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A)
65 {
66   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
67 
68   PetscFunctionBegin;
69   PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Cann't sync factorized matrix from host to device");
70   PetscCheck(aijkok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
71   if (aijkok->a_dual.need_sync_device()) {
72     aijkok->a_dual.sync_device();
73     aijkok->transpose_updated = PETSC_FALSE; /* values of the transpose is out-of-date */
74     aijkok->hermitian_updated = PETSC_FALSE;
75   }
76   PetscFunctionReturn(PETSC_SUCCESS);
77 }
78 
79 /* Mark the CSR data on device as modified */
80 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosModifyDevice(Mat A)
81 {
82   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
83 
84   PetscFunctionBegin;
85   PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Not supported for factorized matries");
86   aijkok->a_dual.clear_sync_state();
87   aijkok->a_dual.modify_device();
88   aijkok->transpose_updated = PETSC_FALSE;
89   aijkok->hermitian_updated = PETSC_FALSE;
90   PetscCall(MatSeqAIJInvalidateDiagonal(A));
91   PetscCall(PetscObjectStateIncrease((PetscObject)A));
92   PetscFunctionReturn(PETSC_SUCCESS);
93 }
94 
95 static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A)
96 {
97   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
98 
99   PetscFunctionBegin;
100   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
101   /* We do not expect one needs factors on host  */
102   PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Cann't sync factorized matrix from device to host");
103   PetscCheck(aijkok, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing AIJKOK");
104   aijkok->a_dual.sync_host();
105   PetscFunctionReturn(PETSC_SUCCESS);
106 }
107 
108 static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A, PetscScalar *array[])
109 {
110   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
111 
112   PetscFunctionBegin;
113   /* aijkok contains valid pointers only if the host's nonzerostate matches with the device's.
114     Calling MatSeqAIJSetPreallocation() or MatSetValues() on host, where aijseq->{i,j,a} might be
115     reallocated, will lead to stale {i,j,a}_dual in aijkok. In both operations, the hosts's nonzerostate
116     must have been updated. The stale aijkok will be rebuilt during MatAssemblyEnd.
117   */
118   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
119     aijkok->a_dual.sync_host();
120     *array = aijkok->a_dual.view_host().data();
121   } else { /* Happens when calling MatSetValues on a newly created matrix */
122     *array = static_cast<Mat_SeqAIJ *>(A->data)->a;
123   }
124   PetscFunctionReturn(PETSC_SUCCESS);
125 }
126 
127 static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJKokkos(Mat A, PetscScalar *array[])
128 {
129   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
130 
131   PetscFunctionBegin;
132   if (aijkok && A->nonzerostate == aijkok->nonzerostate) aijkok->a_dual.modify_host();
133   PetscFunctionReturn(PETSC_SUCCESS);
134 }
135 
136 static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[])
137 {
138   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
139 
140   PetscFunctionBegin;
141   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
142     aijkok->a_dual.sync_host();
143     *array = aijkok->a_dual.view_host().data();
144   } else {
145     *array = static_cast<Mat_SeqAIJ *>(A->data)->a;
146   }
147   PetscFunctionReturn(PETSC_SUCCESS);
148 }
149 
150 static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[])
151 {
152   PetscFunctionBegin;
153   *array = NULL;
154   PetscFunctionReturn(PETSC_SUCCESS);
155 }
156 
157 static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[])
158 {
159   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
160 
161   PetscFunctionBegin;
162   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
163     *array = aijkok->a_dual.view_host().data();
164   } else { /* Ex. happens with MatZeroEntries on a preallocated but not assembled matrix */
165     *array = static_cast<Mat_SeqAIJ *>(A->data)->a;
166   }
167   PetscFunctionReturn(PETSC_SUCCESS);
168 }
169 
170 static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[])
171 {
172   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
173 
174   PetscFunctionBegin;
175   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
176     aijkok->a_dual.clear_sync_state();
177     aijkok->a_dual.modify_host();
178   }
179   PetscFunctionReturn(PETSC_SUCCESS);
180 }
181 
182 static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJKokkos(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype)
183 {
184   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
185 
186   PetscFunctionBegin;
187   PetscCheck(aijkok != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "aijkok is NULL");
188 
189   if (i) *i = aijkok->i_device_data();
190   if (j) *j = aijkok->j_device_data();
191   if (a) {
192     aijkok->a_dual.sync_device();
193     *a = aijkok->a_device_data();
194   }
195   if (mtype) *mtype = PETSC_MEMTYPE_KOKKOS;
196   PetscFunctionReturn(PETSC_SUCCESS);
197 }
198 
199 // MatSeqAIJKokkosSetDeviceMat takes a PetscSplitCSRDataStructure with device data and copies it to the device. Note, "deep_copy" here is really a shallow copy
200 PetscErrorCode MatSeqAIJKokkosSetDeviceMat(Mat A, PetscSplitCSRDataStructure h_mat)
201 {
202   Mat_SeqAIJKokkos                            *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
203   Kokkos::View<SplitCSRMat, Kokkos::HostSpace> h_mat_k(h_mat);
204 
205   PetscFunctionBegin;
206   PetscCheck(aijkok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
207   aijkok->device_mat_d = create_mirror(DefaultMemorySpace(), h_mat_k);
208   Kokkos::deep_copy(aijkok->device_mat_d, h_mat_k);
209   PetscFunctionReturn(PETSC_SUCCESS);
210 }
211 
212 // MatSeqAIJKokkosGetDeviceMat gets the device if it is here, otherwise it creates a place for it and returns NULL
213 PetscErrorCode MatSeqAIJKokkosGetDeviceMat(Mat A, PetscSplitCSRDataStructure *d_mat)
214 {
215   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
216 
217   PetscFunctionBegin;
218   if (aijkok && aijkok->device_mat_d.data()) {
219     *d_mat = aijkok->device_mat_d.data();
220   } else {
221     PetscCall(MatSeqAIJKokkosSyncDevice(A)); // create aijkok (we are making d_mat now so make a place for it)
222     *d_mat = NULL;
223   }
224   PetscFunctionReturn(PETSC_SUCCESS);
225 }
226 
227 /* Generate the transpose on device and cache it internally */
228 static PetscErrorCode MatSeqAIJKokkosGenerateTranspose_Private(Mat A, KokkosCsrMatrix **csrmatT)
229 {
230   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
231 
232   PetscFunctionBegin;
233   PetscCheck(aijkok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
234   if (!aijkok->csrmatT.nnz() || !aijkok->transpose_updated) { /* Generate At for the first time OR just update its values */
235     /* FIXME: KK does not separate symbolic/numeric transpose. We could have a permutation array to help value-only update */
236     PetscCallCXX(aijkok->a_dual.sync_device());
237     PetscCallCXX(aijkok->csrmatT = transpose_matrix(aijkok->csrmat));
238     PetscCallCXX(sort_crs_matrix(aijkok->csrmatT));
239     aijkok->transpose_updated = PETSC_TRUE;
240   }
241   *csrmatT = &aijkok->csrmatT;
242   PetscFunctionReturn(PETSC_SUCCESS);
243 }
244 
245 /* Generate the Hermitian on device and cache it internally */
246 static PetscErrorCode MatSeqAIJKokkosGenerateHermitian_Private(Mat A, KokkosCsrMatrix **csrmatH)
247 {
248   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
249 
250   PetscFunctionBegin;
251   PetscCall(PetscLogGpuTimeBegin());
252   PetscCheck(aijkok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
253   if (!aijkok->csrmatH.nnz() || !aijkok->hermitian_updated) { /* Generate Ah for the first time OR just update its values */
254     PetscCallCXX(aijkok->a_dual.sync_device());
255     PetscCallCXX(aijkok->csrmatH = transpose_matrix(aijkok->csrmat));
256     PetscCallCXX(sort_crs_matrix(aijkok->csrmatH));
257 #if defined(PETSC_USE_COMPLEX)
258     const auto &a = aijkok->csrmatH.values;
259     Kokkos::parallel_for(
260       a.extent(0), KOKKOS_LAMBDA(MatRowMapType i) { a(i) = PetscConj(a(i)); });
261 #endif
262     aijkok->hermitian_updated = PETSC_TRUE;
263   }
264   *csrmatH = &aijkok->csrmatH;
265   PetscCall(PetscLogGpuTimeEnd());
266   PetscFunctionReturn(PETSC_SUCCESS);
267 }
268 
269 /* y = A x */
270 static PetscErrorCode MatMult_SeqAIJKokkos(Mat A, Vec xx, Vec yy)
271 {
272   Mat_SeqAIJKokkos          *aijkok;
273   ConstPetscScalarKokkosView xv;
274   PetscScalarKokkosView      yv;
275 
276   PetscFunctionBegin;
277   PetscCall(PetscLogGpuTimeBegin());
278   PetscCall(MatSeqAIJKokkosSyncDevice(A));
279   PetscCall(VecGetKokkosView(xx, &xv));
280   PetscCall(VecGetKokkosViewWrite(yy, &yv));
281   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
282   PetscCallCXX(KokkosSparse::spmv("N", 1.0 /*alpha*/, aijkok->csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A x + beta y */
283   PetscCall(VecRestoreKokkosView(xx, &xv));
284   PetscCall(VecRestoreKokkosViewWrite(yy, &yv));
285   /* 2.0*nnz - numRows seems more accurate here but assumes there are no zero-rows. So a little sloppy here. */
286   PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz()));
287   PetscCall(PetscLogGpuTimeEnd());
288   PetscFunctionReturn(PETSC_SUCCESS);
289 }
290 
291 /* y = A^T x */
292 static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy)
293 {
294   Mat_SeqAIJKokkos          *aijkok;
295   const char                *mode;
296   ConstPetscScalarKokkosView xv;
297   PetscScalarKokkosView      yv;
298   KokkosCsrMatrix           *csrmat;
299 
300   PetscFunctionBegin;
301   PetscCall(PetscLogGpuTimeBegin());
302   PetscCall(MatSeqAIJKokkosSyncDevice(A));
303   PetscCall(VecGetKokkosView(xx, &xv));
304   PetscCall(VecGetKokkosViewWrite(yy, &yv));
305   if (A->form_explicit_transpose) {
306     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat));
307     mode = "N";
308   } else {
309     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
310     csrmat = &aijkok->csrmat;
311     mode   = "T";
312   }
313   PetscCallCXX(KokkosSparse::spmv(mode, 1.0 /*alpha*/, *csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^T x + beta y */
314   PetscCall(VecRestoreKokkosView(xx, &xv));
315   PetscCall(VecRestoreKokkosViewWrite(yy, &yv));
316   PetscCall(PetscLogGpuFlops(2.0 * csrmat->nnz()));
317   PetscCall(PetscLogGpuTimeEnd());
318   PetscFunctionReturn(PETSC_SUCCESS);
319 }
320 
321 /* y = A^H x */
322 static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy)
323 {
324   Mat_SeqAIJKokkos          *aijkok;
325   const char                *mode;
326   ConstPetscScalarKokkosView xv;
327   PetscScalarKokkosView      yv;
328   KokkosCsrMatrix           *csrmat;
329 
330   PetscFunctionBegin;
331   PetscCall(PetscLogGpuTimeBegin());
332   PetscCall(MatSeqAIJKokkosSyncDevice(A));
333   PetscCall(VecGetKokkosView(xx, &xv));
334   PetscCall(VecGetKokkosViewWrite(yy, &yv));
335   if (A->form_explicit_transpose) {
336     PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat));
337     mode = "N";
338   } else {
339     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
340     csrmat = &aijkok->csrmat;
341     mode   = "C";
342   }
343   PetscCallCXX(KokkosSparse::spmv(mode, 1.0 /*alpha*/, *csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^H x + beta y */
344   PetscCall(VecRestoreKokkosView(xx, &xv));
345   PetscCall(VecRestoreKokkosViewWrite(yy, &yv));
346   PetscCall(PetscLogGpuFlops(2.0 * csrmat->nnz()));
347   PetscCall(PetscLogGpuTimeEnd());
348   PetscFunctionReturn(PETSC_SUCCESS);
349 }
350 
351 /* z = A x + y */
352 static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz)
353 {
354   Mat_SeqAIJKokkos          *aijkok;
355   ConstPetscScalarKokkosView xv, yv;
356   PetscScalarKokkosView      zv;
357 
358   PetscFunctionBegin;
359   PetscCall(PetscLogGpuTimeBegin());
360   PetscCall(MatSeqAIJKokkosSyncDevice(A));
361   PetscCall(VecGetKokkosView(xx, &xv));
362   PetscCall(VecGetKokkosView(yy, &yv));
363   PetscCall(VecGetKokkosViewWrite(zz, &zv));
364   if (zz != yy) Kokkos::deep_copy(zv, yv);
365   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
366   PetscCallCXX(KokkosSparse::spmv("N", 1.0 /*alpha*/, aijkok->csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A x + beta z */
367   PetscCall(VecRestoreKokkosView(xx, &xv));
368   PetscCall(VecRestoreKokkosView(yy, &yv));
369   PetscCall(VecRestoreKokkosViewWrite(zz, &zv));
370   PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz()));
371   PetscCall(PetscLogGpuTimeEnd());
372   PetscFunctionReturn(PETSC_SUCCESS);
373 }
374 
375 /* z = A^T x + y */
376 static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz)
377 {
378   Mat_SeqAIJKokkos          *aijkok;
379   const char                *mode;
380   ConstPetscScalarKokkosView xv, yv;
381   PetscScalarKokkosView      zv;
382   KokkosCsrMatrix           *csrmat;
383 
384   PetscFunctionBegin;
385   PetscCall(PetscLogGpuTimeBegin());
386   PetscCall(MatSeqAIJKokkosSyncDevice(A));
387   PetscCall(VecGetKokkosView(xx, &xv));
388   PetscCall(VecGetKokkosView(yy, &yv));
389   PetscCall(VecGetKokkosViewWrite(zz, &zv));
390   if (zz != yy) Kokkos::deep_copy(zv, yv);
391   if (A->form_explicit_transpose) {
392     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat));
393     mode = "N";
394   } else {
395     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
396     csrmat = &aijkok->csrmat;
397     mode   = "T";
398   }
399   PetscCallCXX(KokkosSparse::spmv(mode, 1.0 /*alpha*/, *csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^T x + beta z */
400   PetscCall(VecRestoreKokkosView(xx, &xv));
401   PetscCall(VecRestoreKokkosView(yy, &yv));
402   PetscCall(VecRestoreKokkosViewWrite(zz, &zv));
403   PetscCall(PetscLogGpuFlops(2.0 * csrmat->nnz()));
404   PetscCall(PetscLogGpuTimeEnd());
405   PetscFunctionReturn(PETSC_SUCCESS);
406 }
407 
408 /* z = A^H x + y */
409 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz)
410 {
411   Mat_SeqAIJKokkos          *aijkok;
412   const char                *mode;
413   ConstPetscScalarKokkosView xv, yv;
414   PetscScalarKokkosView      zv;
415   KokkosCsrMatrix           *csrmat;
416 
417   PetscFunctionBegin;
418   PetscCall(PetscLogGpuTimeBegin());
419   PetscCall(MatSeqAIJKokkosSyncDevice(A));
420   PetscCall(VecGetKokkosView(xx, &xv));
421   PetscCall(VecGetKokkosView(yy, &yv));
422   PetscCall(VecGetKokkosViewWrite(zz, &zv));
423   if (zz != yy) Kokkos::deep_copy(zv, yv);
424   if (A->form_explicit_transpose) {
425     PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat));
426     mode = "N";
427   } else {
428     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
429     csrmat = &aijkok->csrmat;
430     mode   = "C";
431   }
432   PetscCallCXX(KokkosSparse::spmv(mode, 1.0 /*alpha*/, *csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^H x + beta z */
433   PetscCall(VecRestoreKokkosView(xx, &xv));
434   PetscCall(VecRestoreKokkosView(yy, &yv));
435   PetscCall(VecRestoreKokkosViewWrite(zz, &zv));
436   PetscCall(PetscLogGpuFlops(2.0 * csrmat->nnz()));
437   PetscCall(PetscLogGpuTimeEnd());
438   PetscFunctionReturn(PETSC_SUCCESS);
439 }
440 
441 PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A, MatOption op, PetscBool flg)
442 {
443   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
444 
445   PetscFunctionBegin;
446   switch (op) {
447   case MAT_FORM_EXPLICIT_TRANSPOSE:
448     /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
449     if (A->form_explicit_transpose && !flg && aijkok) PetscCall(aijkok->DestroyMatTranspose());
450     A->form_explicit_transpose = flg;
451     break;
452   default:
453     PetscCall(MatSetOption_SeqAIJ(A, op, flg));
454     break;
455   }
456   PetscFunctionReturn(PETSC_SUCCESS);
457 }
458 
459 /* Depending on reuse, either build a new mat, or use the existing mat */
460 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat *newmat)
461 {
462   Mat_SeqAIJ *aseq;
463 
464   PetscFunctionBegin;
465   PetscCall(PetscKokkosInitializeCheck());
466   if (reuse == MAT_INITIAL_MATRIX) {                      /* Build a brand new mat */
467     PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat));  /* the returned newmat is a SeqAIJKokkos */
468   } else if (reuse == MAT_REUSE_MATRIX) {                 /* Reuse the mat created before */
469     PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN)); /* newmat is already a SeqAIJKokkos */
470   } else if (reuse == MAT_INPLACE_MATRIX) {               /* newmat is A */
471     PetscCheck(A == *newmat, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "A != *newmat with MAT_INPLACE_MATRIX");
472     PetscCall(PetscFree(A->defaultvectype));
473     PetscCall(PetscStrallocpy(VECKOKKOS, &A->defaultvectype)); /* Allocate and copy the string */
474     PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATSEQAIJKOKKOS));
475     PetscCall(MatSetOps_SeqAIJKokkos(A));
476     aseq = static_cast<Mat_SeqAIJ *>(A->data);
477     if (A->assembled) { /* Copy i, j (but not values) to device for an assembled matrix if not yet */
478       PetscCheck(!A->spptr, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Expect NULL (Mat_SeqAIJKokkos*)A->spptr");
479       A->spptr = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aseq->nz, aseq->i, aseq->j, aseq->a, A->nonzerostate, PETSC_FALSE);
480     }
481   }
482   PetscFunctionReturn(PETSC_SUCCESS);
483 }
484 
485 /* MatDuplicate always creates a new matrix. MatDuplicate can be called either on an assembled matrix or
486    an unassembled matrix, even though MAT_COPY_VALUES is not allowed for unassembled matrix.
487  */
488 static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A, MatDuplicateOption dupOption, Mat *B)
489 {
490   Mat_SeqAIJ       *bseq;
491   Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr), *bkok;
492   Mat               mat;
493 
494   PetscFunctionBegin;
495   /* Do not copy values on host as A's latest values might be on device. We don't want to do sync blindly */
496   PetscCall(MatDuplicate_SeqAIJ(A, MAT_DO_NOT_COPY_VALUES, B));
497   mat = *B;
498   if (A->assembled) {
499     bseq = static_cast<Mat_SeqAIJ *>(mat->data);
500     bkok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, bseq->nz, bseq->i, bseq->j, bseq->a, mat->nonzerostate, PETSC_FALSE);
501     bkok->a_dual.clear_sync_state(); /* Clear B's sync state as it will be decided below */
502     /* Now copy values to B if needed */
503     if (dupOption == MAT_COPY_VALUES) {
504       if (akok->a_dual.need_sync_device()) {
505         Kokkos::deep_copy(bkok->a_dual.view_host(), akok->a_dual.view_host());
506         bkok->a_dual.modify_host();
507       } else { /* If device has the latest data, we only copy data on device */
508         Kokkos::deep_copy(bkok->a_dual.view_device(), akok->a_dual.view_device());
509         bkok->a_dual.modify_device();
510       }
511     } else { /* MAT_DO_NOT_COPY_VALUES or MAT_SHARE_NONZERO_PATTERN. B's values should be zeroed */
512       /* B's values on host should be already zeroed by MatDuplicate_SeqAIJ() */
513       bkok->a_dual.modify_host();
514     }
515     mat->spptr = bkok;
516   }
517 
518   PetscCall(PetscFree(mat->defaultvectype));
519   PetscCall(PetscStrallocpy(VECKOKKOS, &mat->defaultvectype)); /* Allocate and copy the string */
520   PetscCall(PetscObjectChangeTypeName((PetscObject)mat, MATSEQAIJKOKKOS));
521   PetscCall(MatSetOps_SeqAIJKokkos(mat));
522   PetscFunctionReturn(PETSC_SUCCESS);
523 }
524 
525 static PetscErrorCode MatTranspose_SeqAIJKokkos(Mat A, MatReuse reuse, Mat *B)
526 {
527   Mat               At;
528   KokkosCsrMatrix  *internT;
529   Mat_SeqAIJKokkos *atkok, *bkok;
530 
531   PetscFunctionBegin;
532   if (reuse == MAT_REUSE_MATRIX) PetscCall(MatTransposeCheckNonzeroState_Private(A, *B));
533   PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &internT)); /* Generate a transpose internally */
534   if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_INPLACE_MATRIX) {
535     /* Deep copy internT, as we want to isolate the internal transpose */
536     PetscCallCXX(atkok = new Mat_SeqAIJKokkos(KokkosCsrMatrix("csrmat", *internT)));
537     PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PetscObjectComm((PetscObject)A), atkok, &At));
538     if (reuse == MAT_INITIAL_MATRIX) *B = At;
539     else PetscCall(MatHeaderReplace(A, &At)); /* Replace A with At inplace */
540   } else {                                    /* MAT_REUSE_MATRIX, just need to copy values to B on device */
541     if ((*B)->assembled) {
542       bkok = static_cast<Mat_SeqAIJKokkos *>((*B)->spptr);
543       PetscCallCXX(Kokkos::deep_copy(bkok->a_dual.view_device(), internT->values));
544       PetscCall(MatSeqAIJKokkosModifyDevice(*B));
545     } else if ((*B)->preallocated) { /* It is ok for B to be only preallocated, as needed in MatTranspose_MPIAIJ */
546       Mat_SeqAIJ             *bseq = static_cast<Mat_SeqAIJ *>((*B)->data);
547       MatScalarKokkosViewHost a_h(bseq->a, internT->nnz()); /* bseq->nz = 0 if unassembled */
548       MatColIdxKokkosViewHost j_h(bseq->j, internT->nnz());
549       PetscCallCXX(Kokkos::deep_copy(a_h, internT->values));
550       PetscCallCXX(Kokkos::deep_copy(j_h, internT->graph.entries));
551     } else SETERRQ(PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "B must be assembled or preallocated");
552   }
553   PetscFunctionReturn(PETSC_SUCCESS);
554 }
555 
556 static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A)
557 {
558   Mat_SeqAIJKokkos *aijkok;
559 
560   PetscFunctionBegin;
561   if (A->factortype == MAT_FACTOR_NONE) {
562     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
563     delete aijkok;
564   } else {
565     delete static_cast<Mat_SeqAIJKokkosTriFactors *>(A->spptr);
566   }
567   A->spptr = NULL;
568   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatFactorGetSolverType_C", NULL));
569   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
570   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
571   PetscCall(MatDestroy_SeqAIJ(A));
572   PetscFunctionReturn(PETSC_SUCCESS);
573 }
574 
575 /*MC
576    MATSEQAIJKOKKOS - MATAIJKOKKOS = "(seq)aijkokkos" - A matrix type to be used for sparse matrices with Kokkos
577 
578    A matrix type type using Kokkos-Kernels CrsMatrix type for portability across different device types
579 
580    Options Database Keys:
581 .  -mat_type aijkokkos - sets the matrix type to `MATSEQAIJKOKKOS` during a call to `MatSetFromOptions()`
582 
583   Level: beginner
584 
585 .seealso: `MatCreateSeqAIJKokkos()`, `MATMPIAIJKOKKOS`
586 M*/
587 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A)
588 {
589   PetscFunctionBegin;
590   PetscCall(PetscKokkosInitializeCheck());
591   PetscCall(MatCreate_SeqAIJ(A));
592   PetscCall(MatConvert_SeqAIJ_SeqAIJKokkos(A, MATSEQAIJKOKKOS, MAT_INPLACE_MATRIX, &A));
593   PetscFunctionReturn(PETSC_SUCCESS);
594 }
595 
596 /* Merge A, B into a matrix C. A is put before B. C's size would be A->rmap->n by (A->cmap->n + B->cmap->n) */
597 PetscErrorCode MatSeqAIJKokkosMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C)
598 {
599   Mat_SeqAIJ         *a, *b;
600   Mat_SeqAIJKokkos   *akok, *bkok, *ckok;
601   MatScalarKokkosView aa, ba, ca;
602   MatRowMapKokkosView ai, bi, ci;
603   MatColIdxKokkosView aj, bj, cj;
604   PetscInt            m, n, nnz, aN;
605 
606   PetscFunctionBegin;
607   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
608   PetscValidHeaderSpecific(B, MAT_CLASSID, 2);
609   PetscValidPointer(C, 4);
610   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
611   PetscCheckTypeName(B, MATSEQAIJKOKKOS);
612   PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n);
613   PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported");
614 
615   PetscCall(MatSeqAIJKokkosSyncDevice(A));
616   PetscCall(MatSeqAIJKokkosSyncDevice(B));
617   a    = static_cast<Mat_SeqAIJ *>(A->data);
618   b    = static_cast<Mat_SeqAIJ *>(B->data);
619   akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
620   bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
621   aa   = akok->a_dual.view_device();
622   ai   = akok->i_dual.view_device();
623   ba   = bkok->a_dual.view_device();
624   bi   = bkok->i_dual.view_device();
625   m    = A->rmap->n; /* M, N and nnz of C */
626   n    = A->cmap->n + B->cmap->n;
627   nnz  = a->nz + b->nz;
628   aN   = A->cmap->n; /* N of A */
629   if (reuse == MAT_INITIAL_MATRIX) {
630     aj           = akok->j_dual.view_device();
631     bj           = bkok->j_dual.view_device();
632     auto ca_dual = MatScalarKokkosDualView("a", aa.extent(0) + ba.extent(0));
633     auto ci_dual = MatRowMapKokkosDualView("i", ai.extent(0));
634     auto cj_dual = MatColIdxKokkosDualView("j", aj.extent(0) + bj.extent(0));
635     ca           = ca_dual.view_device();
636     ci           = ci_dual.view_device();
637     cj           = cj_dual.view_device();
638 
639     /* Concatenate A and B in parallel using Kokkos hierarchical parallelism */
640     Kokkos::parallel_for(
641       Kokkos::TeamPolicy<>(m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
642         PetscInt i       = t.league_rank(); /* row i */
643         PetscInt coffset = ai(i) + bi(i), alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i);
644 
645         Kokkos::single(Kokkos::PerTeam(t), [=]() { /* this side effect only happens once per whole team */
646                                                    ci(i) = coffset;
647                                                    if (i == m - 1) ci(m) = ai(m) + bi(m);
648         });
649 
650         Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) {
651           if (k < alen) {
652             ca(coffset + k) = aa(ai(i) + k);
653             cj(coffset + k) = aj(ai(i) + k);
654           } else {
655             ca(coffset + k) = ba(bi(i) + k - alen);
656             cj(coffset + k) = bj(bi(i) + k - alen) + aN; /* Entries in B get new column indices in C */
657           }
658         });
659       });
660     ca_dual.modify_device();
661     ci_dual.modify_device();
662     cj_dual.modify_device();
663     PetscCallCXX(ckok = new Mat_SeqAIJKokkos(m, n, nnz, ci_dual, cj_dual, ca_dual));
664     PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, ckok, C));
665   } else if (reuse == MAT_REUSE_MATRIX) {
666     PetscValidHeaderSpecific(*C, MAT_CLASSID, 4);
667     PetscCheckTypeName(*C, MATSEQAIJKOKKOS);
668     ckok = static_cast<Mat_SeqAIJKokkos *>((*C)->spptr);
669     ca   = ckok->a_dual.view_device();
670     ci   = ckok->i_dual.view_device();
671 
672     Kokkos::parallel_for(
673       Kokkos::TeamPolicy<>(m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
674         PetscInt i    = t.league_rank(); /* row i */
675         PetscInt alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i);
676         Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) {
677           if (k < alen) ca(ci(i) + k) = aa(ai(i) + k);
678           else ca(ci(i) + k) = ba(bi(i) + k - alen);
679         });
680       });
681     PetscCall(MatSeqAIJKokkosModifyDevice(*C));
682   }
683   PetscFunctionReturn(PETSC_SUCCESS);
684 }
685 
686 static PetscErrorCode MatProductDataDestroy_SeqAIJKokkos(void *pdata)
687 {
688   PetscFunctionBegin;
689   delete static_cast<MatProductData_SeqAIJKokkos *>(pdata);
690   PetscFunctionReturn(PETSC_SUCCESS);
691 }
692 
693 static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C)
694 {
695   Mat_Product                 *product = C->product;
696   Mat                          A, B;
697   bool                         transA, transB; /* use bool, since KK needs this type */
698   Mat_SeqAIJKokkos            *akok, *bkok, *ckok;
699   Mat_SeqAIJ                  *c;
700   MatProductData_SeqAIJKokkos *pdata;
701   KokkosCsrMatrix             *csrmatA, *csrmatB;
702 
703   PetscFunctionBegin;
704   MatCheckProduct(C, 1);
705   PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Product data empty");
706   pdata = static_cast<MatProductData_SeqAIJKokkos *>(C->product->data);
707 
708   if (pdata->reusesym) {           /* We reached here through e.g., MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C), where symbolic/numeric are combined */
709     pdata->reusesym = PETSC_FALSE; /* So that next time when user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C), we still do numeric  */
710     PetscFunctionReturn(PETSC_SUCCESS);
711   }
712 
713   switch (product->type) {
714   case MATPRODUCT_AB:
715     transA = false;
716     transB = false;
717     break;
718   case MATPRODUCT_AtB:
719     transA = true;
720     transB = false;
721     break;
722   case MATPRODUCT_ABt:
723     transA = false;
724     transB = true;
725     break;
726   default:
727     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]);
728   }
729 
730   A = product->A;
731   B = product->B;
732   PetscCall(MatSeqAIJKokkosSyncDevice(A));
733   PetscCall(MatSeqAIJKokkosSyncDevice(B));
734   akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
735   bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
736   ckok = static_cast<Mat_SeqAIJKokkos *>(C->spptr);
737 
738   PetscCheck(ckok, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Device data structure spptr is empty");
739 
740   csrmatA = &akok->csrmat;
741   csrmatB = &bkok->csrmat;
742 
743   /* TODO: Once KK spgemm implements transpose, we can get rid of the explicit transpose here */
744   if (transA) {
745     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA));
746     transA = false;
747   }
748 
749   if (transB) {
750     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB));
751     transB = false;
752   }
753   PetscCall(PetscLogGpuTimeBegin());
754   PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, *csrmatA, transA, *csrmatB, transB, ckok->csrmat));
755 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(3, 7, 99)
756   auto spgemmHandle = pdata->kh.get_spgemm_handle();
757   if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(ckok->csrmat)); /* without sort, mat_tests-ex62_14_seqaijkokkos fails */
758 #endif
759 
760   PetscCall(PetscLogGpuTimeEnd());
761   PetscCall(MatSeqAIJKokkosModifyDevice(C));
762   /* shorter version of MatAssemblyEnd_SeqAIJ */
763   c = (Mat_SeqAIJ *)C->data;
764   PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz));
765   PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n"));
766   PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax));
767   c->reallocs         = 0;
768   C->info.mallocs     = 0;
769   C->info.nz_unneeded = 0;
770   C->assembled = C->was_assembled = PETSC_TRUE;
771   C->num_ass++;
772   PetscFunctionReturn(PETSC_SUCCESS);
773 }
774 
775 static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C)
776 {
777   Mat_Product                 *product = C->product;
778   MatProductType               ptype;
779   Mat                          A, B;
780   bool                         transA, transB;
781   Mat_SeqAIJKokkos            *akok, *bkok, *ckok;
782   MatProductData_SeqAIJKokkos *pdata;
783   MPI_Comm                     comm;
784   KokkosCsrMatrix             *csrmatA, *csrmatB, csrmatC;
785 
786   PetscFunctionBegin;
787   MatCheckProduct(C, 1);
788   PetscCall(PetscObjectGetComm((PetscObject)C, &comm));
789   PetscCheck(!product->data, comm, PETSC_ERR_PLIB, "Product data not empty");
790   A = product->A;
791   B = product->B;
792   PetscCall(MatSeqAIJKokkosSyncDevice(A));
793   PetscCall(MatSeqAIJKokkosSyncDevice(B));
794   akok    = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
795   bkok    = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
796   csrmatA = &akok->csrmat;
797   csrmatB = &bkok->csrmat;
798 
799   ptype = product->type;
800   switch (ptype) {
801   case MATPRODUCT_AB:
802     transA = false;
803     transB = false;
804     break;
805   case MATPRODUCT_AtB:
806     transA = true;
807     transB = false;
808     break;
809   case MATPRODUCT_ABt:
810     transA = false;
811     transB = true;
812     break;
813   default:
814     SETERRQ(comm, PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]);
815   }
816 
817   product->data = pdata = new MatProductData_SeqAIJKokkos();
818   pdata->kh.set_team_work_size(16);
819   pdata->kh.set_dynamic_scheduling(true);
820   pdata->reusesym = product->api_user;
821 
822   /* TODO: add command line options to select spgemm algorithms */
823   auto spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_DEFAULT; /* default alg is TPL if enabled, otherwise KK */
824 
825   /* CUDA-10.2's spgemm has bugs. We prefer the SpGEMMreuse APIs introduced in cuda-11.4 */
826 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
827   #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
828   spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_KK;
829   #endif
830 #endif
831 
832   pdata->kh.create_spgemm_handle(spgemm_alg);
833 
834   PetscCall(PetscLogGpuTimeBegin());
835   /* TODO: Get rid of the explicit transpose once KK-spgemm implements the transpose option */
836   if (transA) {
837     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA));
838     transA = false;
839   }
840 
841   if (transB) {
842     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB));
843     transB = false;
844   }
845 
846   PetscCallCXX(KokkosSparse::spgemm_symbolic(pdata->kh, *csrmatA, transA, *csrmatB, transB, csrmatC));
847 
848   /* spgemm_symbolic() only populates C's rowmap, but not C's column indices.
849     So we have to do a fake spgemm_numeric() here to get csrmatC.j_d setup, before
850     calling new Mat_SeqAIJKokkos().
851     TODO: Remove the fake spgemm_numeric() after KK fixed this problem.
852   */
853   PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, *csrmatA, transA, *csrmatB, transB, csrmatC));
854 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(3, 7, 99)
855   /* Query if KK outputs a sorted matrix. If not, we need to sort it */
856   auto spgemmHandle = pdata->kh.get_spgemm_handle();
857   if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(csrmatC)); /* sort_option defaults to -1 in KK!*/
858 #endif
859   PetscCall(PetscLogGpuTimeEnd());
860 
861   PetscCallCXX(ckok = new Mat_SeqAIJKokkos(csrmatC));
862   PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(C, ckok));
863   C->product->destroy = MatProductDataDestroy_SeqAIJKokkos;
864   PetscFunctionReturn(PETSC_SUCCESS);
865 }
866 
867 /* handles sparse matrix matrix ops */
868 static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat)
869 {
870   Mat_Product *product = mat->product;
871   PetscBool    Biskok = PETSC_FALSE, Ciskok = PETSC_TRUE;
872 
873   PetscFunctionBegin;
874   MatCheckProduct(mat, 1);
875   PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJKOKKOS, &Biskok));
876   if (product->type == MATPRODUCT_ABC) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJKOKKOS, &Ciskok));
877   if (Biskok && Ciskok) {
878     switch (product->type) {
879     case MATPRODUCT_AB:
880     case MATPRODUCT_AtB:
881     case MATPRODUCT_ABt:
882       mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos;
883       break;
884     case MATPRODUCT_PtAP:
885     case MATPRODUCT_RARt:
886     case MATPRODUCT_ABC:
887       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
888       break;
889     default:
890       break;
891     }
892   } else { /* fallback for AIJ */
893     PetscCall(MatProductSetFromOptions_SeqAIJ(mat));
894   }
895   PetscFunctionReturn(PETSC_SUCCESS);
896 }
897 
898 static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a)
899 {
900   Mat_SeqAIJKokkos *aijkok;
901 
902   PetscFunctionBegin;
903   PetscCall(PetscLogGpuTimeBegin());
904   PetscCall(MatSeqAIJKokkosSyncDevice(A));
905   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
906   KokkosBlas::scal(aijkok->a_dual.view_device(), a, aijkok->a_dual.view_device());
907   PetscCall(MatSeqAIJKokkosModifyDevice(A));
908   PetscCall(PetscLogGpuFlops(aijkok->a_dual.extent(0)));
909   PetscCall(PetscLogGpuTimeEnd());
910   PetscFunctionReturn(PETSC_SUCCESS);
911 }
912 
913 static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A)
914 {
915   Mat_SeqAIJKokkos *aijkok;
916 
917   PetscFunctionBegin;
918   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
919   if (aijkok) { /* Only zero the device if data is already there */
920     KokkosBlas::fill(aijkok->a_dual.view_device(), 0.0);
921     PetscCall(MatSeqAIJKokkosModifyDevice(A));
922   } else { /* Might be preallocated but not assembled */
923     PetscCall(MatZeroEntries_SeqAIJ(A));
924   }
925   PetscFunctionReturn(PETSC_SUCCESS);
926 }
927 
928 static PetscErrorCode MatGetDiagonal_SeqAIJKokkos(Mat A, Vec x)
929 {
930   Mat_SeqAIJ           *aijseq;
931   Mat_SeqAIJKokkos     *aijkok;
932   PetscInt              n;
933   PetscScalarKokkosView xv;
934 
935   PetscFunctionBegin;
936   PetscCall(VecGetLocalSize(x, &n));
937   PetscCheck(n == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector");
938   PetscCheck(A->factortype == MAT_FACTOR_NONE, PETSC_COMM_SELF, PETSC_ERR_SUP, "MatGetDiagonal_SeqAIJKokkos not supported on factored matrices");
939 
940   PetscCall(MatSeqAIJKokkosSyncDevice(A));
941   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
942 
943   if (A->rmap->n && aijkok->diag_dual.extent(0) == 0) { /* Set the diagonal pointer if not already */
944     PetscCall(MatMarkDiagonal_SeqAIJ(A));
945     aijseq = static_cast<Mat_SeqAIJ *>(A->data);
946     aijkok->SetDiagonal(aijseq->diag);
947   }
948 
949   const auto &Aa    = aijkok->a_dual.view_device();
950   const auto &Ai    = aijkok->i_dual.view_device();
951   const auto &Adiag = aijkok->diag_dual.view_device();
952 
953   PetscCall(VecGetKokkosViewWrite(x, &xv));
954   Kokkos::parallel_for(
955     n, KOKKOS_LAMBDA(const PetscInt i) {
956       if (Adiag(i) < Ai(i + 1)) xv(i) = Aa(Adiag(i));
957       else xv(i) = 0;
958     });
959   PetscCall(VecRestoreKokkosViewWrite(x, &xv));
960   PetscFunctionReturn(PETSC_SUCCESS);
961 }
962 
963 /* Get a Kokkos View from a mat of type MatSeqAIJKokkos */
964 PetscErrorCode MatSeqAIJGetKokkosView(Mat A, ConstMatScalarKokkosView *kv)
965 {
966   Mat_SeqAIJKokkos *aijkok;
967 
968   PetscFunctionBegin;
969   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
970   PetscValidPointer(kv, 2);
971   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
972   PetscCall(MatSeqAIJKokkosSyncDevice(A));
973   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
974   *kv    = aijkok->a_dual.view_device();
975   PetscFunctionReturn(PETSC_SUCCESS);
976 }
977 
978 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, ConstMatScalarKokkosView *kv)
979 {
980   PetscFunctionBegin;
981   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
982   PetscValidPointer(kv, 2);
983   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
984   PetscFunctionReturn(PETSC_SUCCESS);
985 }
986 
987 PetscErrorCode MatSeqAIJGetKokkosView(Mat A, MatScalarKokkosView *kv)
988 {
989   Mat_SeqAIJKokkos *aijkok;
990 
991   PetscFunctionBegin;
992   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
993   PetscValidPointer(kv, 2);
994   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
995   PetscCall(MatSeqAIJKokkosSyncDevice(A));
996   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
997   *kv    = aijkok->a_dual.view_device();
998   PetscFunctionReturn(PETSC_SUCCESS);
999 }
1000 
1001 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, MatScalarKokkosView *kv)
1002 {
1003   PetscFunctionBegin;
1004   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1005   PetscValidPointer(kv, 2);
1006   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1007   PetscCall(MatSeqAIJKokkosModifyDevice(A));
1008   PetscFunctionReturn(PETSC_SUCCESS);
1009 }
1010 
1011 PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A, MatScalarKokkosView *kv)
1012 {
1013   Mat_SeqAIJKokkos *aijkok;
1014 
1015   PetscFunctionBegin;
1016   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1017   PetscValidPointer(kv, 2);
1018   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1019   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1020   *kv    = aijkok->a_dual.view_device();
1021   PetscFunctionReturn(PETSC_SUCCESS);
1022 }
1023 
1024 PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A, MatScalarKokkosView *kv)
1025 {
1026   PetscFunctionBegin;
1027   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1028   PetscValidPointer(kv, 2);
1029   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1030   PetscCall(MatSeqAIJKokkosModifyDevice(A));
1031   PetscFunctionReturn(PETSC_SUCCESS);
1032 }
1033 
1034 /* Computes Y += alpha X */
1035 static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y, PetscScalar alpha, Mat X, MatStructure pattern)
1036 {
1037   Mat_SeqAIJ              *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data;
1038   Mat_SeqAIJKokkos        *xkok, *ykok, *zkok;
1039   ConstMatScalarKokkosView Xa;
1040   MatScalarKokkosView      Ya;
1041 
1042   PetscFunctionBegin;
1043   PetscCheckTypeName(Y, MATSEQAIJKOKKOS);
1044   PetscCheckTypeName(X, MATSEQAIJKOKKOS);
1045   PetscCall(MatSeqAIJKokkosSyncDevice(Y));
1046   PetscCall(MatSeqAIJKokkosSyncDevice(X));
1047   PetscCall(PetscLogGpuTimeBegin());
1048 
1049   if (pattern != SAME_NONZERO_PATTERN && x->nz == y->nz) {
1050     /* We could compare on device, but have to get the comparison result on host. So compare on host instead. */
1051     PetscBool e;
1052     PetscCall(PetscArraycmp(x->i, y->i, Y->rmap->n + 1, &e));
1053     if (e) {
1054       PetscCall(PetscArraycmp(x->j, y->j, y->nz, &e));
1055       if (e) pattern = SAME_NONZERO_PATTERN;
1056     }
1057   }
1058 
1059   /* cusparseDcsrgeam2() computes C = alpha A + beta B. If one knew sparsity pattern of C, one can skip
1060     cusparseScsrgeam2_bufferSizeExt() / cusparseXcsrgeam2Nnz(), and directly call cusparseScsrgeam2().
1061     If X is SUBSET_NONZERO_PATTERN of Y, we could take advantage of this cusparse feature. However,
1062     KokkosSparse::spadd(alpha,A,beta,B,C) has symbolic and numeric phases, MatAXPY does not.
1063   */
1064   ykok = static_cast<Mat_SeqAIJKokkos *>(Y->spptr);
1065   xkok = static_cast<Mat_SeqAIJKokkos *>(X->spptr);
1066   Xa   = xkok->a_dual.view_device();
1067   Ya   = ykok->a_dual.view_device();
1068 
1069   if (pattern == SAME_NONZERO_PATTERN) {
1070     KokkosBlas::axpy(alpha, Xa, Ya);
1071     PetscCall(MatSeqAIJKokkosModifyDevice(Y));
1072   } else if (pattern == SUBSET_NONZERO_PATTERN) {
1073     MatRowMapKokkosView Xi = xkok->i_dual.view_device(), Yi = ykok->i_dual.view_device();
1074     MatColIdxKokkosView Xj = xkok->j_dual.view_device(), Yj = ykok->j_dual.view_device();
1075 
1076     Kokkos::parallel_for(
1077       Kokkos::TeamPolicy<>(Y->rmap->n, 1), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
1078         PetscInt i = t.league_rank();              /* row i */
1079         Kokkos::single(Kokkos::PerTeam(t), [=]() { /* Only one thread works in a team */
1080                                                    PetscInt p, q = Yi(i);
1081                                                    for (p = Xi(i); p < Xi(i + 1); p++) {          /* For each nonzero on row i of X */
1082                                                      while (Xj(p) != Yj(q) && q < Yi(i + 1)) q++; /* find the matching nonzero on row i of Y */
1083                                                      if (Xj(p) == Yj(q)) {                        /* Found it */
1084                                                        Ya(q) += alpha * Xa(p);
1085                                                        q++;
1086                                                      } else {
1087                                                        /* If not found, it indicates the input is wrong (X is not a SUBSET_NONZERO_PATTERN of Y).
1088                Just insert a NaN at the beginning of row i if it is not empty, to make the result wrong.
1089             */
1090                                                        if (Yi(i) != Yi(i + 1))
1091                                                          Ya(Yi(i)) =
1092 #if PETSC_PKG_KOKKOS_VERSION_GE(3, 6, 99)
1093                                                            Kokkos::ArithTraits<PetscScalar>::nan();
1094 #else
1095               Kokkos::Experimental::nan("1");
1096 #endif
1097                                                      }
1098                                                    }
1099         });
1100       });
1101     PetscCall(MatSeqAIJKokkosModifyDevice(Y));
1102   } else { /* different nonzero patterns */
1103     Mat             Z;
1104     KokkosCsrMatrix zcsr;
1105     KernelHandle    kh;
1106     kh.create_spadd_handle(false);
1107     KokkosSparse::spadd_symbolic(&kh, xkok->csrmat, ykok->csrmat, zcsr);
1108     KokkosSparse::spadd_numeric(&kh, alpha, xkok->csrmat, (PetscScalar)1.0, ykok->csrmat, zcsr);
1109     zkok = new Mat_SeqAIJKokkos(zcsr);
1110     PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, zkok, &Z));
1111     PetscCall(MatHeaderReplace(Y, &Z));
1112     kh.destroy_spadd_handle();
1113   }
1114   PetscCall(PetscLogGpuTimeEnd());
1115   PetscCall(PetscLogGpuFlops(xkok->a_dual.extent(0) * 2)); /* Because we scaled X and then added it to Y */
1116   PetscFunctionReturn(PETSC_SUCCESS);
1117 }
1118 
1119 static PetscErrorCode MatSetPreallocationCOO_SeqAIJKokkos(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
1120 {
1121   Mat_SeqAIJKokkos *akok;
1122   Mat_SeqAIJ       *aseq;
1123 
1124   PetscFunctionBegin;
1125   PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, coo_i, coo_j));
1126   aseq = static_cast<Mat_SeqAIJ *>(mat->data);
1127   akok = static_cast<Mat_SeqAIJKokkos *>(mat->spptr);
1128   delete akok;
1129   mat->spptr = akok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, aseq->nz, aseq->i, aseq->j, aseq->a, mat->nonzerostate + 1, PETSC_FALSE);
1130   PetscCall(MatZeroEntries_SeqAIJKokkos(mat));
1131   akok->SetUpCOO(aseq);
1132   PetscFunctionReturn(PETSC_SUCCESS);
1133 }
1134 
1135 static PetscErrorCode MatSetValuesCOO_SeqAIJKokkos(Mat A, const PetscScalar v[], InsertMode imode)
1136 {
1137   Mat_SeqAIJ                 *aseq = static_cast<Mat_SeqAIJ *>(A->data);
1138   Mat_SeqAIJKokkos           *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1139   PetscCount                  Annz = aseq->nz;
1140   const PetscCountKokkosView &jmap = akok->jmap_d;
1141   const PetscCountKokkosView &perm = akok->perm_d;
1142   MatScalarKokkosView         Aa;
1143   ConstMatScalarKokkosView    kv;
1144   PetscMemType                memtype;
1145 
1146   PetscFunctionBegin;
1147   PetscCall(PetscGetMemType(v, &memtype));
1148   if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */
1149     kv = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), ConstMatScalarKokkosViewHost(v, aseq->coo_n));
1150   } else {
1151     kv = ConstMatScalarKokkosView(v, aseq->coo_n); /* Directly use v[]'s memory */
1152   }
1153 
1154   if (imode == INSERT_VALUES) PetscCall(MatSeqAIJGetKokkosViewWrite(A, &Aa)); /* write matrix values */
1155   else PetscCall(MatSeqAIJGetKokkosView(A, &Aa));                             /* read & write matrix values */
1156 
1157   Kokkos::parallel_for(
1158     Annz, KOKKOS_LAMBDA(const PetscCount i) {
1159       PetscScalar sum = 0.0;
1160       for (PetscCount k = jmap(i); k < jmap(i + 1); k++) sum += kv(perm(k));
1161       Aa(i) = (imode == INSERT_VALUES ? 0.0 : Aa(i)) + sum;
1162     });
1163 
1164   if (imode == INSERT_VALUES) PetscCall(MatSeqAIJRestoreKokkosViewWrite(A, &Aa));
1165   else PetscCall(MatSeqAIJRestoreKokkosView(A, &Aa));
1166   PetscFunctionReturn(PETSC_SUCCESS);
1167 }
1168 
1169 PETSC_INTERN PetscErrorCode MatSeqAIJMoveDiagonalValuesFront_SeqAIJKokkos(Mat A, const PetscInt *diag)
1170 {
1171   Mat_SeqAIJKokkos          *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1172   MatScalarKokkosView        Aa;
1173   const MatRowMapKokkosView &Ai = akok->i_dual.view_device();
1174   PetscInt                   m  = A->rmap->n;
1175   ConstMatRowMapKokkosView   Adiag(diag, m); /* diag is a device pointer */
1176 
1177   PetscFunctionBegin;
1178   PetscCall(MatSeqAIJGetKokkosViewWrite(A, &Aa));
1179   Kokkos::parallel_for(
1180     m, KOKKOS_LAMBDA(const PetscInt i) {
1181       PetscScalar tmp;
1182       if (Adiag(i) >= Ai(i) && Adiag(i) < Ai(i + 1)) { /* The diagonal element exists */
1183         tmp          = Aa(Ai(i));
1184         Aa(Ai(i))    = Aa(Adiag(i));
1185         Aa(Adiag(i)) = tmp;
1186       }
1187     });
1188   PetscCall(MatSeqAIJRestoreKokkosViewWrite(A, &Aa));
1189   PetscFunctionReturn(PETSC_SUCCESS);
1190 }
1191 
1192 static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info)
1193 {
1194   PetscFunctionBegin;
1195   PetscCall(MatSeqAIJKokkosSyncHost(A));
1196   PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info));
1197   B->offloadmask = PETSC_OFFLOAD_CPU;
1198   PetscFunctionReturn(PETSC_SUCCESS);
1199 }
1200 
1201 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A)
1202 {
1203   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
1204 
1205   PetscFunctionBegin;
1206   A->offloadmask = PETSC_OFFLOAD_KOKKOS; /* We do not really use this flag */
1207   A->boundtocpu  = PETSC_FALSE;
1208 
1209   A->ops->assemblyend               = MatAssemblyEnd_SeqAIJKokkos;
1210   A->ops->destroy                   = MatDestroy_SeqAIJKokkos;
1211   A->ops->duplicate                 = MatDuplicate_SeqAIJKokkos;
1212   A->ops->axpy                      = MatAXPY_SeqAIJKokkos;
1213   A->ops->scale                     = MatScale_SeqAIJKokkos;
1214   A->ops->zeroentries               = MatZeroEntries_SeqAIJKokkos;
1215   A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJKokkos;
1216   A->ops->mult                      = MatMult_SeqAIJKokkos;
1217   A->ops->multadd                   = MatMultAdd_SeqAIJKokkos;
1218   A->ops->multtranspose             = MatMultTranspose_SeqAIJKokkos;
1219   A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJKokkos;
1220   A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJKokkos;
1221   A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos;
1222   A->ops->productnumeric            = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos;
1223   A->ops->transpose                 = MatTranspose_SeqAIJKokkos;
1224   A->ops->setoption                 = MatSetOption_SeqAIJKokkos;
1225   A->ops->getdiagonal               = MatGetDiagonal_SeqAIJKokkos;
1226   a->ops->getarray                  = MatSeqAIJGetArray_SeqAIJKokkos;
1227   a->ops->restorearray              = MatSeqAIJRestoreArray_SeqAIJKokkos;
1228   a->ops->getarrayread              = MatSeqAIJGetArrayRead_SeqAIJKokkos;
1229   a->ops->restorearrayread          = MatSeqAIJRestoreArrayRead_SeqAIJKokkos;
1230   a->ops->getarraywrite             = MatSeqAIJGetArrayWrite_SeqAIJKokkos;
1231   a->ops->restorearraywrite         = MatSeqAIJRestoreArrayWrite_SeqAIJKokkos;
1232   a->ops->getcsrandmemtype          = MatSeqAIJGetCSRAndMemType_SeqAIJKokkos;
1233 
1234   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJKokkos));
1235   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJKokkos));
1236   PetscFunctionReturn(PETSC_SUCCESS);
1237 }
1238 
1239 PETSC_INTERN PetscErrorCode MatSetSeqAIJKokkosWithCSRMatrix(Mat A, Mat_SeqAIJKokkos *akok)
1240 {
1241   Mat_SeqAIJ *aseq;
1242   PetscInt    i, m, n;
1243 
1244   PetscFunctionBegin;
1245   PetscCheck(!A->spptr, PETSC_COMM_SELF, PETSC_ERR_PLIB, "A->spptr is supposed to be empty");
1246 
1247   m = akok->nrows();
1248   n = akok->ncols();
1249   PetscCall(MatSetSizes(A, m, n, m, n));
1250   PetscCall(MatSetType(A, MATSEQAIJKOKKOS));
1251 
1252   /* Set up data structures of A as a MATSEQAIJ */
1253   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(A, MAT_SKIP_ALLOCATION, NULL));
1254   aseq = (Mat_SeqAIJ *)(A)->data;
1255 
1256   akok->i_dual.sync_host(); /* We always need sync'ed i, j on host */
1257   akok->j_dual.sync_host();
1258 
1259   aseq->i            = akok->i_host_data();
1260   aseq->j            = akok->j_host_data();
1261   aseq->a            = akok->a_host_data();
1262   aseq->nonew        = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/
1263   aseq->singlemalloc = PETSC_FALSE;
1264   aseq->free_a       = PETSC_FALSE;
1265   aseq->free_ij      = PETSC_FALSE;
1266   aseq->nz           = akok->nnz();
1267   aseq->maxnz        = aseq->nz;
1268 
1269   PetscCall(PetscMalloc1(m, &aseq->imax));
1270   PetscCall(PetscMalloc1(m, &aseq->ilen));
1271   for (i = 0; i < m; i++) aseq->ilen[i] = aseq->imax[i] = aseq->i[i + 1] - aseq->i[i];
1272 
1273   /* It is critical to set the nonzerostate, as we use it to check if sparsity pattern (hence data) has changed on host in MatAssemblyEnd */
1274   akok->nonzerostate = A->nonzerostate;
1275   A->spptr           = akok; /* Set A->spptr before MatAssembly so that A->spptr won't be allocated again there */
1276   PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY));
1277   PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY));
1278   PetscFunctionReturn(PETSC_SUCCESS);
1279 }
1280 
1281 /* Crete a SEQAIJKOKKOS matrix with a Mat_SeqAIJKokkos data structure
1282 
1283    Note we have names like MatSeqAIJSetPreallocationCSR, so I use capitalized CSR
1284  */
1285 PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithCSRMatrix(MPI_Comm comm, Mat_SeqAIJKokkos *akok, Mat *A)
1286 {
1287   PetscFunctionBegin;
1288   PetscCall(MatCreate(comm, A));
1289   PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok));
1290   PetscFunctionReturn(PETSC_SUCCESS);
1291 }
1292 
1293 /* --------------------------------------------------------------------------------*/
1294 /*@C
1295    MatCreateSeqAIJKokkos - Creates a sparse matrix in `MATSEQAIJKOKKOS` (compressed row) format
1296    (the default parallel PETSc format). This matrix will ultimately be handled by
1297    Kokkos for calculations. For good matrix
1298    assembly performance the user should preallocate the matrix storage by setting
1299    the parameter nz (or the array nnz).  By setting these parameters accurately,
1300    performance during matrix assembly can be increased by more than a factor of 50.
1301 
1302    Collective
1303 
1304    Input Parameters:
1305 +  comm - MPI communicator, set to `PETSC_COMM_SELF`
1306 .  m - number of rows
1307 .  n - number of columns
1308 .  nz - number of nonzeros per row (same for all rows)
1309 -  nnz - array containing the number of nonzeros in the various rows
1310          (possibly different for each row) or NULL
1311 
1312    Output Parameter:
1313 .  A - the matrix
1314 
1315    It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
1316    MatXXXXSetPreallocation() paradgm instead of this routine directly.
1317    [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]
1318 
1319    Notes:
1320    If nnz is given then nz is ignored
1321 
1322    The AIJ format, also called
1323    compressed row storage, is fully compatible with standard Fortran 77
1324    storage.  That is, the stored row and column indices can begin at
1325    either one (as in Fortran) or zero.  See the users' manual for details.
1326 
1327    Specify the preallocated storage with either nz or nnz (not both).
1328    Set nz = `PETSC_DEFAULT` and nnz = NULL for PETSc to control dynamic memory
1329    allocation.  For large problems you MUST preallocate memory or you
1330    will get TERRIBLE performance, see the users' manual chapter on matrices.
1331 
1332    By default, this format uses inodes (identical nodes) when possible, to
1333    improve numerical efficiency of matrix-vector products and solves. We
1334    search for consecutive rows with the same nonzero structure, thereby
1335    reusing matrix information to achieve increased efficiency.
1336 
1337    Level: intermediate
1338 
1339 .seealso: `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MatCreateAIJ()`
1340 @*/
1341 PetscErrorCode MatCreateSeqAIJKokkos(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A)
1342 {
1343   PetscFunctionBegin;
1344   PetscCall(PetscKokkosInitializeCheck());
1345   PetscCall(MatCreate(comm, A));
1346   PetscCall(MatSetSizes(*A, m, n, m, n));
1347   PetscCall(MatSetType(*A, MATSEQAIJKOKKOS));
1348   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz));
1349   PetscFunctionReturn(PETSC_SUCCESS);
1350 }
1351 
1352 typedef Kokkos::TeamPolicy<>::member_type team_member;
1353 //
1354 // This factorization exploits block diagonal matrices with "Nf" (not used).
1355 // Use -pc_factor_mat_ordering_type rcm to order decouple blocks of size N/Nf for this optimization
1356 //
1357 static PetscErrorCode MatLUFactorNumeric_SeqAIJKOKKOSDEVICE(Mat B, Mat A, const MatFactorInfo *info)
1358 {
1359   Mat_SeqAIJ       *b      = (Mat_SeqAIJ *)B->data;
1360   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr), *baijkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
1361   IS                isrow = b->row, isicol = b->icol;
1362   const PetscInt   *r_h, *ic_h;
1363   const PetscInt n = A->rmap->n, *ai_d = aijkok->i_dual.view_device().data(), *aj_d = aijkok->j_dual.view_device().data(), *bi_d = baijkok->i_dual.view_device().data(), *bj_d = baijkok->j_dual.view_device().data(), *bdiag_d = baijkok->diag_d.data();
1364   const PetscScalar *aa_d = aijkok->a_dual.view_device().data();
1365   PetscScalar       *ba_d = baijkok->a_dual.view_device().data();
1366   PetscBool          row_identity, col_identity;
1367   PetscInt           nc, Nf = 1, nVec = 32; // should be a parameter, Nf is batch size - not used
1368 
1369   PetscFunctionBegin;
1370   PetscCheck(A->rmap->n == n, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "square matrices only supported %" PetscInt_FMT " %" PetscInt_FMT, A->rmap->n, n);
1371   PetscCall(MatIsStructurallySymmetric(A, &row_identity));
1372   PetscCheck(row_identity, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "structurally symmetric matrices only supported");
1373   PetscCall(ISGetIndices(isrow, &r_h));
1374   PetscCall(ISGetIndices(isicol, &ic_h));
1375   PetscCall(ISGetSize(isicol, &nc));
1376   PetscCall(PetscLogGpuTimeBegin());
1377   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1378   {
1379 #define KOKKOS_SHARED_LEVEL 1
1380     using scr_mem_t    = Kokkos::DefaultExecutionSpace::scratch_memory_space;
1381     using sizet_scr_t  = Kokkos::View<size_t, scr_mem_t>;
1382     using scalar_scr_t = Kokkos::View<PetscScalar, scr_mem_t>;
1383     const Kokkos::View<const PetscInt *, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged>> h_r_k(r_h, n);
1384     Kokkos::View<PetscInt *, Kokkos::LayoutLeft>                                                                         d_r_k("r", n);
1385     const Kokkos::View<const PetscInt *, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged>> h_ic_k(ic_h, nc);
1386     Kokkos::View<PetscInt *, Kokkos::LayoutLeft>                                                                         d_ic_k("ic", nc);
1387     size_t                                                                                                               flops_h = 0.0;
1388     Kokkos::View<size_t, Kokkos::HostSpace>                                                                              h_flops_k(&flops_h);
1389     Kokkos::View<size_t>                                                                                                 d_flops_k("flops");
1390     const int                                                                                                            conc = Kokkos::DefaultExecutionSpace().concurrency(), team_size = conc > 1 ? 16 : 1; // 8*32 = 256
1391     const int                                                                                                            nloc = n / Nf, Ni = (conc > 8) ? 1 /* some intelligent number of SMs -- but need league_barrier */ : 1;
1392     Kokkos::deep_copy(d_flops_k, h_flops_k);
1393     Kokkos::deep_copy(d_r_k, h_r_k);
1394     Kokkos::deep_copy(d_ic_k, h_ic_k);
1395     // Fill A --> fact
1396     Kokkos::parallel_for(
1397       Kokkos::TeamPolicy<>(Nf * Ni, team_size, nVec), KOKKOS_LAMBDA(const team_member team) {
1398         const PetscInt  field = team.league_rank() / Ni, field_block = team.league_rank() % Ni; // use grid.x/y in CUDA
1399         const PetscInt  nloc_i = (nloc / Ni + !!(nloc % Ni)), start_i = field * nloc + field_block * nloc_i, end_i = (start_i + nloc_i) > (field + 1) * nloc ? (field + 1) * nloc : (start_i + nloc_i);
1400         const PetscInt *ic = d_ic_k.data(), *r = d_r_k.data();
1401         // zero rows of B
1402         Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=](const int &rowb) {
1403           PetscInt     nzbL = bi_d[rowb + 1] - bi_d[rowb], nzbU = bdiag_d[rowb] - bdiag_d[rowb + 1]; // with diag
1404           PetscScalar *baL = ba_d + bi_d[rowb];
1405           PetscScalar *baU = ba_d + bdiag_d[rowb + 1] + 1;
1406           /* zero (unfactored row) */
1407           for (int j = 0; j < nzbL; j++) baL[j] = 0;
1408           for (int j = 0; j < nzbU; j++) baU[j] = 0;
1409         });
1410         // copy A into B
1411         Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=](const int &rowb) {
1412           PetscInt           rowa = r[rowb], nza = ai_d[rowa + 1] - ai_d[rowa];
1413           const PetscScalar *av    = aa_d + ai_d[rowa];
1414           const PetscInt    *ajtmp = aj_d + ai_d[rowa];
1415           /* load in initial (unfactored row) */
1416           for (int j = 0; j < nza; j++) {
1417             PetscInt    colb = ic[ajtmp[j]];
1418             PetscScalar vala = av[j];
1419             if (colb == rowb) {
1420               *(ba_d + bdiag_d[rowb]) = vala;
1421             } else {
1422               const PetscInt *pbj = bj_d + ((colb > rowb) ? bdiag_d[rowb + 1] + 1 : bi_d[rowb]);
1423               PetscScalar    *pba = ba_d + ((colb > rowb) ? bdiag_d[rowb + 1] + 1 : bi_d[rowb]);
1424               PetscInt        nz = (colb > rowb) ? bdiag_d[rowb] - (bdiag_d[rowb + 1] + 1) : bi_d[rowb + 1] - bi_d[rowb], set = 0;
1425               for (int j = 0; j < nz; j++) {
1426                 if (pbj[j] == colb) {
1427                   pba[j] = vala;
1428                   set++;
1429                   break;
1430                 }
1431               }
1432 #if !defined(PETSC_HAVE_SYCL)
1433               if (set != 1) printf("\t\t\t ERROR DID NOT SET ?????\n");
1434 #endif
1435             }
1436           }
1437         });
1438       });
1439     Kokkos::fence();
1440 
1441     Kokkos::parallel_for(
1442       Kokkos::TeamPolicy<>(Nf * Ni, team_size, nVec).set_scratch_size(KOKKOS_SHARED_LEVEL, Kokkos::PerThread(sizet_scr_t::shmem_size() + scalar_scr_t::shmem_size()), Kokkos::PerTeam(sizet_scr_t::shmem_size())), KOKKOS_LAMBDA(const team_member team) {
1443         sizet_scr_t    colkIdx(team.thread_scratch(KOKKOS_SHARED_LEVEL));
1444         scalar_scr_t   L_ki(team.thread_scratch(KOKKOS_SHARED_LEVEL));
1445         sizet_scr_t    flops(team.team_scratch(KOKKOS_SHARED_LEVEL));
1446         const PetscInt field = team.league_rank() / Ni, field_block_idx = team.league_rank() % Ni; // use grid.x/y in CUDA
1447         const PetscInt start = field * nloc, end = start + nloc;
1448         Kokkos::single(Kokkos::PerTeam(team), [=]() { flops() = 0; });
1449         // A22 panel update for each row A(1,:) and col A(:,1)
1450         for (int ii = start; ii < end - 1; ii++) {
1451           const PetscInt    *bjUi = bj_d + bdiag_d[ii + 1] + 1, nzUi = bdiag_d[ii] - (bdiag_d[ii + 1] + 1); // vector, and vector size, of column indices of U(i,(i+1):end)
1452           const PetscScalar *baUi    = ba_d + bdiag_d[ii + 1] + 1;                                          // vector of data  U(i,i+1:end)
1453           const PetscInt     nUi_its = nzUi / Ni + !!(nzUi % Ni);
1454           const PetscScalar  Bii     = *(ba_d + bdiag_d[ii]); // diagonal in its special place
1455           Kokkos::parallel_for(Kokkos::TeamThreadRange(team, nUi_its), [=](const int j) {
1456             PetscInt kIdx = j * Ni + field_block_idx;
1457             if (kIdx >= nzUi) /* void */
1458               ;
1459             else {
1460               const PetscInt  myk = bjUi[kIdx];                // assume symmetric structure, need a transposed meta-data here in general
1461               const PetscInt *pjL = bj_d + bi_d[myk];          // look for L(myk,ii) in start of row
1462               const PetscInt  nzL = bi_d[myk + 1] - bi_d[myk]; // size of L_k(:)
1463               size_t          st_idx;
1464               // find and do L(k,i) = A(:k,i) / A(i,i)
1465               Kokkos::single(Kokkos::PerThread(team), [&]() { colkIdx() = PETSC_MAX_INT; });
1466               // get column, there has got to be a better way
1467               Kokkos::parallel_reduce(
1468                 Kokkos::ThreadVectorRange(team, nzL),
1469                 [&](const int &j, size_t &idx) {
1470                   if (pjL[j] == ii) {
1471                     PetscScalar *pLki = ba_d + bi_d[myk] + j;
1472                     idx               = j;           // output
1473                     *pLki             = *pLki / Bii; // column scaling:  L(k,i) = A(:k,i) / A(i,i)
1474                   }
1475                 },
1476                 st_idx);
1477               Kokkos::single(Kokkos::PerThread(team), [=]() {
1478                 colkIdx() = st_idx;
1479                 L_ki()    = *(ba_d + bi_d[myk] + st_idx);
1480               });
1481 #if defined(PETSC_USE_DEBUG) && !defined(PETSC_HAVE_SYCL)
1482               if (colkIdx() == PETSC_MAX_INT) printf("\t\t\t\t\t\t\tERROR: failed to find L_ki(%d,%d)\n", (int)myk, ii); // uses a register
1483 #endif
1484               // active row k, do  A_kj -= Lki * U_ij; j \in U(i,:) j != i
1485               // U(i+1,:end)
1486               Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, nzUi), [=](const int &uiIdx) { // index into i (U)
1487                 PetscScalar Uij = baUi[uiIdx];
1488                 PetscInt    col = bjUi[uiIdx];
1489                 if (col == myk) {
1490                   // A_kk = A_kk - L_ki * U_ij(k)
1491                   PetscScalar *Akkv = (ba_d + bdiag_d[myk]); // diagonal in its special place
1492                   *Akkv             = *Akkv - L_ki() * Uij;  // UiK
1493                 } else {
1494                   PetscScalar    *start, *end, *pAkjv = NULL;
1495                   PetscInt        high, low;
1496                   const PetscInt *startj;
1497                   if (col < myk) { // L
1498                     PetscScalar *pLki = ba_d + bi_d[myk] + colkIdx();
1499                     PetscInt     idx  = (pLki + 1) - (ba_d + bi_d[myk]); // index into row
1500                     start             = pLki + 1;                        // start at pLki+1, A22(myk,1)
1501                     startj            = bj_d + bi_d[myk] + idx;
1502                     end               = ba_d + bi_d[myk + 1];
1503                   } else {
1504                     PetscInt idx = bdiag_d[myk + 1] + 1;
1505                     start        = ba_d + idx;
1506                     startj       = bj_d + idx;
1507                     end          = ba_d + bdiag_d[myk];
1508                   }
1509                   // search for 'col', use bisection search - TODO
1510                   low  = 0;
1511                   high = (PetscInt)(end - start);
1512                   while (high - low > 5) {
1513                     int t = (low + high) / 2;
1514                     if (startj[t] > col) high = t;
1515                     else low = t;
1516                   }
1517                   for (pAkjv = start + low; pAkjv < start + high; pAkjv++) {
1518                     if (startj[pAkjv - start] == col) break;
1519                   }
1520 #if defined(PETSC_USE_DEBUG) && !defined(PETSC_HAVE_SYCL)
1521                   if (pAkjv == start + high) printf("\t\t\t\t\t\t\t\t\t\t\tERROR: *** failed to find Akj(%d,%d)\n", (int)myk, (int)col); // uses a register
1522 #endif
1523                   *pAkjv = *pAkjv - L_ki() * Uij; // A_kj = A_kj - L_ki * U_ij
1524                 }
1525               });
1526             }
1527           });
1528           team.team_barrier(); // this needs to be a league barrier to use more that one SM per block
1529           if (field_block_idx == 0) Kokkos::single(Kokkos::PerTeam(team), [&]() { Kokkos::atomic_add(flops.data(), (size_t)(2 * (nzUi * nzUi) + 2)); });
1530         } /* endof for (i=0; i<n; i++) { */
1531         Kokkos::single(Kokkos::PerTeam(team), [=]() {
1532           Kokkos::atomic_add(&d_flops_k(), flops());
1533           flops() = 0;
1534         });
1535       });
1536     Kokkos::fence();
1537     Kokkos::deep_copy(h_flops_k, d_flops_k);
1538     PetscCall(PetscLogGpuFlops((PetscLogDouble)h_flops_k()));
1539     Kokkos::parallel_for(
1540       Kokkos::TeamPolicy<>(Nf * Ni, 1, 256), KOKKOS_LAMBDA(const team_member team) {
1541         const PetscInt lg_rank = team.league_rank(), field = lg_rank / Ni;                            //, field_offset = lg_rank%Ni;
1542         const PetscInt start = field * nloc, end = start + nloc, n_its = (nloc / Ni + !!(nloc % Ni)); // 1/Ni iters
1543         /* Invert diagonal for simpler triangular solves */
1544         Kokkos::parallel_for(Kokkos::TeamVectorRange(team, n_its), [=](int outer_index) {
1545           int i = start + outer_index * Ni + lg_rank % Ni;
1546           if (i < end) {
1547             PetscScalar *pv = ba_d + bdiag_d[i];
1548             *pv             = 1.0 / (*pv);
1549           }
1550         });
1551       });
1552   }
1553   PetscCall(PetscLogGpuTimeEnd());
1554   PetscCall(ISRestoreIndices(isicol, &ic_h));
1555   PetscCall(ISRestoreIndices(isrow, &r_h));
1556 
1557   PetscCall(ISIdentity(isrow, &row_identity));
1558   PetscCall(ISIdentity(isicol, &col_identity));
1559   if (b->inode.size) {
1560     B->ops->solve = MatSolve_SeqAIJ_Inode;
1561   } else if (row_identity && col_identity) {
1562     B->ops->solve = MatSolve_SeqAIJ_NaturalOrdering;
1563   } else {
1564     B->ops->solve = MatSolve_SeqAIJ; // at least this needs to be in Kokkos
1565   }
1566   B->offloadmask = PETSC_OFFLOAD_GPU;
1567   PetscCall(MatSeqAIJKokkosSyncHost(B));          // solve on CPU
1568   B->ops->solveadd          = MatSolveAdd_SeqAIJ; // and this
1569   B->ops->solvetranspose    = MatSolveTranspose_SeqAIJ;
1570   B->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ;
1571   B->ops->matsolve          = MatMatSolve_SeqAIJ;
1572   B->assembled              = PETSC_TRUE;
1573   B->preallocated           = PETSC_TRUE;
1574 
1575   PetscFunctionReturn(PETSC_SUCCESS);
1576 }
1577 
1578 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
1579 {
1580   PetscFunctionBegin;
1581   PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
1582   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos;
1583   PetscFunctionReturn(PETSC_SUCCESS);
1584 }
1585 
1586 static PetscErrorCode MatSeqAIJKokkosSymbolicSolveCheck(Mat A)
1587 {
1588   Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1589 
1590   PetscFunctionBegin;
1591   if (!factors->sptrsv_symbolic_completed) {
1592     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khU, factors->iU_d, factors->jU_d, factors->aU_d);
1593     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khL, factors->iL_d, factors->jL_d, factors->aL_d);
1594     factors->sptrsv_symbolic_completed = PETSC_TRUE;
1595   }
1596   PetscFunctionReturn(PETSC_SUCCESS);
1597 }
1598 
1599 /* Check if we need to update factors etc for transpose solve */
1600 static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A)
1601 {
1602   Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1603   MatColIdxType               n       = A->rmap->n;
1604 
1605   PetscFunctionBegin;
1606   if (!factors->transpose_updated) { /* TODO: KK needs to provide functions to do numeric transpose only */
1607     /* Update L^T and do sptrsv symbolic */
1608     factors->iLt_d = MatRowMapKokkosView("factors->iLt_d", n + 1);
1609     Kokkos::deep_copy(factors->iLt_d, 0); /* KK requires 0 */
1610     factors->jLt_d = MatColIdxKokkosView("factors->jLt_d", factors->jL_d.extent(0));
1611     factors->aLt_d = MatScalarKokkosView("factors->aLt_d", factors->aL_d.extent(0));
1612 
1613     transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iL_d, factors->jL_d, factors->aL_d,
1614                                                                                                                                                                                                               factors->iLt_d, factors->jLt_d, factors->aLt_d);
1615 
1616     /* TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices.
1617       We have to sort the indices, until KK provides finer control options.
1618     */
1619     sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iLt_d, factors->jLt_d, factors->aLt_d);
1620 
1621     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d);
1622 
1623     /* Update U^T and do sptrsv symbolic */
1624     factors->iUt_d = MatRowMapKokkosView("factors->iUt_d", n + 1);
1625     Kokkos::deep_copy(factors->iUt_d, 0); /* KK requires 0 */
1626     factors->jUt_d = MatColIdxKokkosView("factors->jUt_d", factors->jU_d.extent(0));
1627     factors->aUt_d = MatScalarKokkosView("factors->aUt_d", factors->aU_d.extent(0));
1628 
1629     transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iU_d, factors->jU_d, factors->aU_d,
1630                                                                                                                                                                                                               factors->iUt_d, factors->jUt_d, factors->aUt_d);
1631 
1632     /* Sort indices. See comments above */
1633     sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iUt_d, factors->jUt_d, factors->aUt_d);
1634 
1635     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d);
1636     factors->transpose_updated = PETSC_TRUE;
1637   }
1638   PetscFunctionReturn(PETSC_SUCCESS);
1639 }
1640 
1641 /* Solve Ax = b, with A = LU */
1642 static PetscErrorCode MatSolve_SeqAIJKokkos(Mat A, Vec b, Vec x)
1643 {
1644   ConstPetscScalarKokkosView  bv;
1645   PetscScalarKokkosView       xv;
1646   Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1647 
1648   PetscFunctionBegin;
1649   PetscCall(PetscLogGpuTimeBegin());
1650   PetscCall(MatSeqAIJKokkosSymbolicSolveCheck(A));
1651   PetscCall(VecGetKokkosView(b, &bv));
1652   PetscCall(VecGetKokkosViewWrite(x, &xv));
1653   /* Solve L tmpv = b */
1654   PetscCallCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khL, factors->iL_d, factors->jL_d, factors->aL_d, bv, factors->workVector));
1655   /* Solve Ux = tmpv */
1656   PetscCallCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khU, factors->iU_d, factors->jU_d, factors->aU_d, factors->workVector, xv));
1657   PetscCall(VecRestoreKokkosView(b, &bv));
1658   PetscCall(VecRestoreKokkosViewWrite(x, &xv));
1659   PetscCall(PetscLogGpuTimeEnd());
1660   PetscFunctionReturn(PETSC_SUCCESS);
1661 }
1662 
1663 /* Solve A^T x = b, where A^T = U^T L^T */
1664 static PetscErrorCode MatSolveTranspose_SeqAIJKokkos(Mat A, Vec b, Vec x)
1665 {
1666   ConstPetscScalarKokkosView  bv;
1667   PetscScalarKokkosView       xv;
1668   Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1669 
1670   PetscFunctionBegin;
1671   PetscCall(PetscLogGpuTimeBegin());
1672   PetscCall(MatSeqAIJKokkosTransposeSolveCheck(A));
1673   PetscCall(VecGetKokkosView(b, &bv));
1674   PetscCall(VecGetKokkosViewWrite(x, &xv));
1675   /* Solve U^T tmpv = b */
1676   KokkosSparse::Experimental::sptrsv_solve(&factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d, bv, factors->workVector);
1677 
1678   /* Solve L^T x = tmpv */
1679   KokkosSparse::Experimental::sptrsv_solve(&factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d, factors->workVector, xv);
1680   PetscCall(VecRestoreKokkosView(b, &bv));
1681   PetscCall(VecRestoreKokkosViewWrite(x, &xv));
1682   PetscCall(PetscLogGpuTimeEnd());
1683   PetscFunctionReturn(PETSC_SUCCESS);
1684 }
1685 
1686 static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info)
1687 {
1688   Mat_SeqAIJKokkos           *aijkok   = (Mat_SeqAIJKokkos *)A->spptr;
1689   Mat_SeqAIJKokkosTriFactors *factors  = (Mat_SeqAIJKokkosTriFactors *)B->spptr;
1690   PetscInt                    fill_lev = info->levels;
1691 
1692   PetscFunctionBegin;
1693   PetscCall(PetscLogGpuTimeBegin());
1694   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1695 
1696   auto a_d = aijkok->a_dual.view_device();
1697   auto i_d = aijkok->i_dual.view_device();
1698   auto j_d = aijkok->j_dual.view_device();
1699 
1700   KokkosSparse::Experimental::spiluk_numeric(&factors->kh, fill_lev, i_d, j_d, a_d, factors->iL_d, factors->jL_d, factors->aL_d, factors->iU_d, factors->jU_d, factors->aU_d);
1701 
1702   B->assembled              = PETSC_TRUE;
1703   B->preallocated           = PETSC_TRUE;
1704   B->ops->solve             = MatSolve_SeqAIJKokkos;
1705   B->ops->solvetranspose    = MatSolveTranspose_SeqAIJKokkos;
1706   B->ops->matsolve          = NULL;
1707   B->ops->matsolvetranspose = NULL;
1708   B->offloadmask            = PETSC_OFFLOAD_GPU;
1709 
1710   /* Once the factors' value changed, we need to update their transpose and sptrsv handle */
1711   factors->transpose_updated         = PETSC_FALSE;
1712   factors->sptrsv_symbolic_completed = PETSC_FALSE;
1713   /* TODO: log flops, but how to know that? */
1714   PetscCall(PetscLogGpuTimeEnd());
1715   PetscFunctionReturn(PETSC_SUCCESS);
1716 }
1717 
1718 static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
1719 {
1720   Mat_SeqAIJKokkos           *aijkok;
1721   Mat_SeqAIJ                 *b;
1722   Mat_SeqAIJKokkosTriFactors *factors  = (Mat_SeqAIJKokkosTriFactors *)B->spptr;
1723   PetscInt                    fill_lev = info->levels;
1724   PetscInt                    nnzA     = ((Mat_SeqAIJ *)A->data)->nz, nnzL, nnzU;
1725   PetscInt                    n        = A->rmap->n;
1726 
1727   PetscFunctionBegin;
1728   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1729   /* Rebuild factors */
1730   if (factors) {
1731     factors->Destroy();
1732   } /* Destroy the old if it exists */
1733   else {
1734     B->spptr = factors = new Mat_SeqAIJKokkosTriFactors(n);
1735   }
1736 
1737   /* Create a spiluk handle and then do symbolic factorization */
1738   nnzL = nnzU = PetscRealIntMultTruncate(info->fill, nnzA);
1739   factors->kh.create_spiluk_handle(KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1, n, nnzL, nnzU);
1740 
1741   auto spiluk_handle = factors->kh.get_spiluk_handle();
1742 
1743   Kokkos::realloc(factors->iL_d, n + 1); /* Free old arrays and realloc */
1744   Kokkos::realloc(factors->jL_d, spiluk_handle->get_nnzL());
1745   Kokkos::realloc(factors->iU_d, n + 1);
1746   Kokkos::realloc(factors->jU_d, spiluk_handle->get_nnzU());
1747 
1748   aijkok   = (Mat_SeqAIJKokkos *)A->spptr;
1749   auto i_d = aijkok->i_dual.view_device();
1750   auto j_d = aijkok->j_dual.view_device();
1751   KokkosSparse::Experimental::spiluk_symbolic(&factors->kh, fill_lev, i_d, j_d, factors->iL_d, factors->jL_d, factors->iU_d, factors->jU_d);
1752   /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */
1753 
1754   Kokkos::resize(factors->jL_d, spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */
1755   Kokkos::resize(factors->jU_d, spiluk_handle->get_nnzU());
1756   Kokkos::realloc(factors->aL_d, spiluk_handle->get_nnzL()); /* No need to retain old value */
1757   Kokkos::realloc(factors->aU_d, spiluk_handle->get_nnzU());
1758 
1759   /* TODO: add options to select sptrsv algorithms */
1760   /* Create sptrsv handles for L, U and their transpose */
1761 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
1762   auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SPTRSV_CUSPARSE;
1763 #else
1764   auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1;
1765 #endif
1766 
1767   factors->khL.create_sptrsv_handle(sptrsv_alg, n, true /* L is lower tri */);
1768   factors->khU.create_sptrsv_handle(sptrsv_alg, n, false /* U is not lower tri */);
1769   factors->khLt.create_sptrsv_handle(sptrsv_alg, n, false /* L^T is not lower tri */);
1770   factors->khUt.create_sptrsv_handle(sptrsv_alg, n, true /* U^T is lower tri */);
1771 
1772   /* Fill fields of the factor matrix B */
1773   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(B, MAT_SKIP_ALLOCATION, NULL));
1774   b     = (Mat_SeqAIJ *)B->data;
1775   b->nz = b->maxnz          = spiluk_handle->get_nnzL() + spiluk_handle->get_nnzU();
1776   B->info.fill_ratio_given  = info->fill;
1777   B->info.fill_ratio_needed = ((PetscReal)b->nz) / ((PetscReal)nnzA);
1778 
1779   B->offloadmask          = PETSC_OFFLOAD_GPU;
1780   B->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJKokkos;
1781   PetscFunctionReturn(PETSC_SUCCESS);
1782 }
1783 
1784 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
1785 {
1786   Mat_SeqAIJ    *b     = (Mat_SeqAIJ *)B->data;
1787   const PetscInt nrows = A->rmap->n;
1788 
1789   PetscFunctionBegin;
1790   PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
1791   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKOKKOSDEVICE;
1792   // move B data into Kokkos
1793   PetscCall(MatSeqAIJKokkosSyncDevice(B)); // create aijkok
1794   PetscCall(MatSeqAIJKokkosSyncDevice(A)); // create aijkok
1795   {
1796     Mat_SeqAIJKokkos *baijkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
1797     if (!baijkok->diag_d.extent(0)) {
1798       const Kokkos::View<PetscInt *, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged>> h_diag(b->diag, nrows + 1);
1799       baijkok->diag_d = Kokkos::View<PetscInt *>(Kokkos::create_mirror(DefaultMemorySpace(), h_diag));
1800       Kokkos::deep_copy(baijkok->diag_d, h_diag);
1801     }
1802   }
1803   PetscFunctionReturn(PETSC_SUCCESS);
1804 }
1805 
1806 static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos(Mat A, MatSolverType *type)
1807 {
1808   PetscFunctionBegin;
1809   *type = MATSOLVERKOKKOS;
1810   PetscFunctionReturn(PETSC_SUCCESS);
1811 }
1812 
1813 static PetscErrorCode MatFactorGetSolverType_seqaij_kokkos_device(Mat A, MatSolverType *type)
1814 {
1815   PetscFunctionBegin;
1816   *type = MATSOLVERKOKKOSDEVICE;
1817   PetscFunctionReturn(PETSC_SUCCESS);
1818 }
1819 
1820 /*MC
1821   MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices
1822   on a single GPU of type, `MATSEQAIJKOKKOS`, `MATAIJKOKKOS`.
1823 
1824   Level: beginner
1825 
1826 .seealso: `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJKokkos()`, `MATAIJKOKKOS`, `MatKokkosSetFormat()`, `MatKokkosStorageFormat`, `MatKokkosFormatOperation`
1827 M*/
1828 PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A, MatFactorType ftype, Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */
1829 {
1830   PetscInt n = A->rmap->n;
1831 
1832   PetscFunctionBegin;
1833   PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B));
1834   PetscCall(MatSetSizes(*B, n, n, n, n));
1835   (*B)->factortype = ftype;
1836   PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU]));
1837   PetscCall(MatSetType(*B, MATSEQAIJKOKKOS));
1838 
1839   if (ftype == MAT_FACTOR_LU) {
1840     PetscCall(MatSetBlockSizesFromMats(*B, A, A));
1841     (*B)->canuseordering        = PETSC_TRUE;
1842     (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKokkos;
1843   } else if (ftype == MAT_FACTOR_ILU) {
1844     PetscCall(MatSetBlockSizesFromMats(*B, A, A));
1845     (*B)->canuseordering         = PETSC_FALSE;
1846     (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos;
1847   } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]);
1848 
1849   PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL));
1850   PetscCall(PetscObjectComposeFunction((PetscObject)(*B), "MatFactorGetSolverType_C", MatFactorGetSolverType_SeqAIJKokkos));
1851   PetscFunctionReturn(PETSC_SUCCESS);
1852 }
1853 
1854 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijkokkos_kokkos_device(Mat A, MatFactorType ftype, Mat *B)
1855 {
1856   PetscInt n = A->rmap->n;
1857 
1858   PetscFunctionBegin;
1859   PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B));
1860   PetscCall(MatSetSizes(*B, n, n, n, n));
1861   (*B)->factortype     = ftype;
1862   (*B)->canuseordering = PETSC_TRUE;
1863   PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU]));
1864   PetscCall(MatSetType(*B, MATSEQAIJKOKKOS));
1865 
1866   if (ftype == MAT_FACTOR_LU) {
1867     PetscCall(MatSetBlockSizesFromMats(*B, A, A));
1868     (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE;
1869   } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Factor type not supported for KOKKOS Matrix Types");
1870 
1871   PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL));
1872   PetscCall(PetscObjectComposeFunction((PetscObject)(*B), "MatFactorGetSolverType_C", MatFactorGetSolverType_seqaij_kokkos_device));
1873   PetscFunctionReturn(PETSC_SUCCESS);
1874 }
1875 
1876 PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_KOKKOS(void)
1877 {
1878   PetscFunctionBegin;
1879   PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_LU, MatGetFactor_SeqAIJKokkos_Kokkos));
1880   PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_ILU, MatGetFactor_SeqAIJKokkos_Kokkos));
1881   PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOSDEVICE, MATSEQAIJKOKKOS, MAT_FACTOR_LU, MatGetFactor_seqaijkokkos_kokkos_device));
1882   PetscFunctionReturn(PETSC_SUCCESS);
1883 }
1884 
1885 /* Utility to print out a KokkosCsrMatrix for debugging */
1886 PETSC_INTERN PetscErrorCode PrintCsrMatrix(const KokkosCsrMatrix &csrmat)
1887 {
1888   const auto        &iv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), csrmat.graph.row_map);
1889   const auto        &jv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), csrmat.graph.entries);
1890   const auto        &av = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), csrmat.values);
1891   const PetscInt    *i  = iv.data();
1892   const PetscInt    *j  = jv.data();
1893   const PetscScalar *a  = av.data();
1894   PetscInt           m = csrmat.numRows(), n = csrmat.numCols(), nnz = csrmat.nnz();
1895 
1896   PetscFunctionBegin;
1897   PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT " x %" PetscInt_FMT " SeqAIJKokkos, with %" PetscInt_FMT " nonzeros\n", m, n, nnz));
1898   for (PetscInt k = 0; k < m; k++) {
1899     PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT ": ", k));
1900     for (PetscInt p = i[k]; p < i[k + 1]; p++) PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT "(%.1f), ", j[p], (double)PetscRealPart(a[p])));
1901     PetscCall(PetscPrintf(PETSC_COMM_SELF, "\n"));
1902   }
1903   PetscFunctionReturn(PETSC_SUCCESS);
1904 }
1905