xref: /petsc/src/mat/impls/aij/seq/kokkos/aijkok.kokkos.cxx (revision bcee047adeeb73090d7e36cc71e39fc287cdbb97)
1 #include <petscvec_kokkos.hpp>
2 #include <petscpkg_version.h>
3 #include <petsc/private/petscimpl.h>
4 #include <petsc/private/sfimpl.h>
5 #include <petscsystypes.h>
6 #include <petscerror.h>
7 
8 #include <Kokkos_Core.hpp>
9 #include <KokkosBlas.hpp>
10 #include <KokkosSparse_CrsMatrix.hpp>
11 #include <KokkosSparse_spmv.hpp>
12 #include <KokkosSparse_spiluk.hpp>
13 #include <KokkosSparse_sptrsv.hpp>
14 #include <KokkosSparse_spgemm.hpp>
15 #include <KokkosSparse_spadd.hpp>
16 
17 #include <../src/mat/impls/aij/seq/kokkos/aijkok.hpp>
18 
19 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_GE(3, 7, 0)
20   #include <KokkosSparse_Utils.hpp>
21 using KokkosSparse::sort_crs_matrix;
22 using KokkosSparse::Impl::transpose_matrix;
23 #else
24   #include <KokkosKernels_Sorting.hpp>
25 using KokkosKernels::sort_crs_matrix;
26 using KokkosKernels::Impl::transpose_matrix;
27 #endif
28 
29 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */
30 
31 /* MatAssemblyEnd_SeqAIJKokkos() happens when we finalized nonzeros of the matrix, either after
32    we assembled the matrix on host, or after we directly produced the matrix data on device (ex., through MatMatMult).
33    In the latter case, it is important to set a_dual's sync state correctly.
34  */
35 static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A, MatAssemblyType mode)
36 {
37   Mat_SeqAIJ       *aijseq;
38   Mat_SeqAIJKokkos *aijkok;
39 
40   PetscFunctionBegin;
41   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS);
42   PetscCall(MatAssemblyEnd_SeqAIJ(A, mode));
43 
44   aijseq = static_cast<Mat_SeqAIJ *>(A->data);
45   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
46 
47   /* If aijkok does not exist, we just copy i, j to device.
48      If aijkok already exists, but the device's nonzero pattern does not match with the host's, we assume the latest data is on host.
49      In both cases, we build a new aijkok structure.
50   */
51   if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { /* aijkok might not exist yet or nonzero pattern has changed */
52     delete aijkok;
53     aijkok   = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aijseq->nz, aijseq->i, aijseq->j, aijseq->a, A->nonzerostate, PETSC_FALSE /*don't copy mat values to device*/);
54     A->spptr = aijkok;
55   }
56 
57   PetscFunctionReturn(PETSC_SUCCESS);
58 }
59 
60 /* Sync CSR data to device if not yet */
61 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A)
62 {
63   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
64 
65   PetscFunctionBegin;
66   PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Can't sync factorized matrix from host to device");
67   PetscCheck(aijkok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
68   if (aijkok->a_dual.need_sync_device()) {
69     aijkok->a_dual.sync_device();
70     aijkok->transpose_updated = PETSC_FALSE; /* values of the transpose is out-of-date */
71     aijkok->hermitian_updated = PETSC_FALSE;
72   }
73   PetscFunctionReturn(PETSC_SUCCESS);
74 }
75 
76 /* Mark the CSR data on device as modified */
77 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosModifyDevice(Mat A)
78 {
79   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
80 
81   PetscFunctionBegin;
82   PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Not supported for factorized matries");
83   aijkok->a_dual.clear_sync_state();
84   aijkok->a_dual.modify_device();
85   aijkok->transpose_updated = PETSC_FALSE;
86   aijkok->hermitian_updated = PETSC_FALSE;
87   PetscCall(MatSeqAIJInvalidateDiagonal(A));
88   PetscCall(PetscObjectStateIncrease((PetscObject)A));
89   PetscFunctionReturn(PETSC_SUCCESS);
90 }
91 
92 static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A)
93 {
94   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
95 
96   PetscFunctionBegin;
97   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
98   /* We do not expect one needs factors on host  */
99   PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Can't sync factorized matrix from device to host");
100   PetscCheck(aijkok, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing AIJKOK");
101   aijkok->a_dual.sync_host();
102   PetscFunctionReturn(PETSC_SUCCESS);
103 }
104 
105 static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A, PetscScalar *array[])
106 {
107   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
108 
109   PetscFunctionBegin;
110   /* aijkok contains valid pointers only if the host's nonzerostate matches with the device's.
111     Calling MatSeqAIJSetPreallocation() or MatSetValues() on host, where aijseq->{i,j,a} might be
112     reallocated, will lead to stale {i,j,a}_dual in aijkok. In both operations, the hosts's nonzerostate
113     must have been updated. The stale aijkok will be rebuilt during MatAssemblyEnd.
114   */
115   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
116     aijkok->a_dual.sync_host();
117     *array = aijkok->a_dual.view_host().data();
118   } else { /* Happens when calling MatSetValues on a newly created matrix */
119     *array = static_cast<Mat_SeqAIJ *>(A->data)->a;
120   }
121   PetscFunctionReturn(PETSC_SUCCESS);
122 }
123 
124 static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJKokkos(Mat A, PetscScalar *array[])
125 {
126   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
127 
128   PetscFunctionBegin;
129   if (aijkok && A->nonzerostate == aijkok->nonzerostate) aijkok->a_dual.modify_host();
130   PetscFunctionReturn(PETSC_SUCCESS);
131 }
132 
133 static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[])
134 {
135   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
136 
137   PetscFunctionBegin;
138   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
139     aijkok->a_dual.sync_host();
140     *array = aijkok->a_dual.view_host().data();
141   } else {
142     *array = static_cast<Mat_SeqAIJ *>(A->data)->a;
143   }
144   PetscFunctionReturn(PETSC_SUCCESS);
145 }
146 
147 static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[])
148 {
149   PetscFunctionBegin;
150   *array = NULL;
151   PetscFunctionReturn(PETSC_SUCCESS);
152 }
153 
154 static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[])
155 {
156   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
157 
158   PetscFunctionBegin;
159   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
160     *array = aijkok->a_dual.view_host().data();
161   } else { /* Ex. happens with MatZeroEntries on a preallocated but not assembled matrix */
162     *array = static_cast<Mat_SeqAIJ *>(A->data)->a;
163   }
164   PetscFunctionReturn(PETSC_SUCCESS);
165 }
166 
167 static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[])
168 {
169   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
170 
171   PetscFunctionBegin;
172   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
173     aijkok->a_dual.clear_sync_state();
174     aijkok->a_dual.modify_host();
175   }
176   PetscFunctionReturn(PETSC_SUCCESS);
177 }
178 
179 static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJKokkos(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype)
180 {
181   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
182 
183   PetscFunctionBegin;
184   PetscCheck(aijkok != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "aijkok is NULL");
185 
186   if (i) *i = aijkok->i_device_data();
187   if (j) *j = aijkok->j_device_data();
188   if (a) {
189     aijkok->a_dual.sync_device();
190     *a = aijkok->a_device_data();
191   }
192   if (mtype) *mtype = PETSC_MEMTYPE_KOKKOS;
193   PetscFunctionReturn(PETSC_SUCCESS);
194 }
195 
196 /*
197   Generate the sparsity pattern of a MatSeqAIJKokkos matrix's transpose on device.
198 
199   Input Parameter:
200 .  A       - the MATSEQAIJKOKKOS matrix
201 
202   Output Parameters:
203 +  perm_d - the permutation array on device, which connects Ta(i) = Aa(perm(i))
204 -  T_d    - the transpose on device, whose value array is allocated but not initialized
205 */
206 static PetscErrorCode MatSeqAIJKokkosGenerateTransposeStructure(Mat A, MatRowMapKokkosView &perm_d, KokkosCsrMatrix &T_d)
207 {
208   Mat_SeqAIJ             *aseq = static_cast<Mat_SeqAIJ *>(A->data);
209   PetscInt                nz = aseq->nz, m = A->rmap->N, n = A->cmap->n;
210   const PetscInt         *Ai = aseq->i, *Aj = aseq->j;
211   MatRowMapKokkosViewHost Ti_h(NoInit("Ti"), n + 1);
212   MatRowMapType          *Ti = Ti_h.data();
213   MatColIdxKokkosViewHost Tj_h(NoInit("Tj"), nz);
214   MatRowMapKokkosViewHost perm_h(NoInit("permutation"), nz);
215   PetscInt               *Tj   = Tj_h.data();
216   PetscInt               *perm = perm_h.data();
217   PetscInt               *offset;
218 
219   PetscFunctionBegin;
220   // Populate Ti
221   PetscCallCXX(Kokkos::deep_copy(Ti_h, 0));
222   Ti++;
223   for (PetscInt i = 0; i < nz; i++) Ti[Aj[i]]++;
224   Ti--;
225   for (PetscInt i = 0; i < n; i++) Ti[i + 1] += Ti[i];
226 
227   // Populate Tj and the permutation array
228   PetscCall(PetscCalloc1(n, &offset)); // offset in each T row to fill in its column indices
229   for (PetscInt i = 0; i < m; i++) {
230     for (PetscInt j = Ai[i]; j < Ai[i + 1]; j++) { // A's (i,j) is T's (j,i)
231       PetscInt r    = Aj[j];                       // row r of T
232       PetscInt disp = Ti[r] + offset[r];
233 
234       Tj[disp]   = i; // col i of T
235       perm[disp] = j;
236       offset[r]++;
237     }
238   }
239   PetscCall(PetscFree(offset));
240 
241   // Sort each row of T, along with the permutation array
242   for (PetscInt i = 0; i < n; i++) PetscCall(PetscSortIntWithArray(Ti[i + 1] - Ti[i], Tj + Ti[i], perm + Ti[i]));
243 
244   // Output perm and T on device
245   auto Ti_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), Ti_h);
246   auto Tj_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), Tj_h);
247   PetscCallCXX(T_d = KokkosCsrMatrix("csrmatT", n, m, nz, MatScalarKokkosView("Ta", nz), Ti_d, Tj_d));
248   PetscCallCXX(perm_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), perm_h));
249   PetscFunctionReturn(PETSC_SUCCESS);
250 }
251 
252 // Generate the transpose on device and cache it internally
253 // Note: KK transpose_matrix() does not have support symbolic/numeric transpose, so we do it on our own
254 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGenerateTranspose_Private(Mat A, KokkosCsrMatrix *csrmatT)
255 {
256   Mat_SeqAIJ       *aseq = static_cast<Mat_SeqAIJ *>(A->data);
257   Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
258   PetscInt          nz = aseq->nz, m = A->rmap->N, n = A->cmap->n;
259   KokkosCsrMatrix  &T = akok->csrmatT;
260 
261   PetscFunctionBegin;
262   PetscCheck(akok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
263   PetscCallCXX(akok->a_dual.sync_device()); // Sync A's valeus since we are going to access them on device
264 
265   const auto &Aa = akok->a_dual.view_device();
266 
267   if (A->symmetric == PETSC_BOOL3_TRUE) {
268     *csrmatT = akok->csrmat;
269   } else {
270     // See if we already have a cached transpose and its value is up to date
271     if (T.numRows() == n && T.numCols() == m) {  // this indicates csrmatT had been generated before, otherwise T has 0 rows/cols after construction
272       if (!akok->transpose_updated) {            // if the value is out of date, update the cached version
273         const auto &perm = akok->transpose_perm; // get the permutation array
274         auto       &Ta   = T.values;
275 
276         PetscCallCXX(Kokkos::parallel_for(
277           nz, KOKKOS_LAMBDA(const PetscInt i) { Ta(i) = Aa(perm(i)); }));
278       }
279     } else { // Generate T of size n x m for the first time
280       MatRowMapKokkosView perm;
281 
282       PetscCall(MatSeqAIJKokkosGenerateTransposeStructure(A, perm, T));
283       akok->transpose_perm = perm; // cache the perm in this matrix for reuse
284       PetscCallCXX(Kokkos::parallel_for(
285         nz, KOKKOS_LAMBDA(const PetscInt i) { T.values(i) = Aa(perm(i)); }));
286     }
287     akok->transpose_updated = PETSC_TRUE;
288     *csrmatT                = akok->csrmatT;
289   }
290   PetscFunctionReturn(PETSC_SUCCESS);
291 }
292 
293 // Generate the Hermitian on device and cache it internally
294 static PetscErrorCode MatSeqAIJKokkosGenerateHermitian_Private(Mat A, KokkosCsrMatrix *csrmatH)
295 {
296   Mat_SeqAIJ       *aseq = static_cast<Mat_SeqAIJ *>(A->data);
297   Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
298   PetscInt          nz = aseq->nz, m = A->rmap->N, n = A->cmap->n;
299   KokkosCsrMatrix  &T = akok->csrmatH;
300 
301   PetscFunctionBegin;
302   PetscCheck(akok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
303   PetscCallCXX(akok->a_dual.sync_device()); // Sync A's values since we are going to access them on device
304 
305   const auto &Aa = akok->a_dual.view_device();
306 
307   if (A->hermitian == PETSC_BOOL3_TRUE) {
308     *csrmatH = akok->csrmat;
309   } else {
310     // See if we already have a cached hermitian and its value is up to date
311     if (T.numRows() == n && T.numCols() == m) {  // this indicates csrmatT had been generated before, otherwise T has 0 rows/cols after construction
312       if (!akok->hermitian_updated) {            // if the value is out of date, update the cached version
313         const auto &perm = akok->transpose_perm; // get the permutation array
314         auto       &Ta   = T.values;
315 
316         PetscCallCXX(Kokkos::parallel_for(
317           nz, KOKKOS_LAMBDA(const PetscInt i) { Ta(i) = PetscConj(Aa(perm(i))); }));
318       }
319     } else { // Generate T of size n x m for the first time
320       MatRowMapKokkosView perm;
321 
322       PetscCall(MatSeqAIJKokkosGenerateTransposeStructure(A, perm, T));
323       akok->transpose_perm = perm; // cache the perm in this matrix for reuse
324       PetscCallCXX(Kokkos::parallel_for(
325         nz, KOKKOS_LAMBDA(const PetscInt i) { T.values(i) = PetscConj(Aa(perm(i))); }));
326     }
327     akok->hermitian_updated = PETSC_TRUE;
328     *csrmatH                = akok->csrmatH;
329   }
330   PetscFunctionReturn(PETSC_SUCCESS);
331 }
332 
333 /* y = A x */
334 static PetscErrorCode MatMult_SeqAIJKokkos(Mat A, Vec xx, Vec yy)
335 {
336   Mat_SeqAIJKokkos          *aijkok;
337   ConstPetscScalarKokkosView xv;
338   PetscScalarKokkosView      yv;
339 
340   PetscFunctionBegin;
341   PetscCall(PetscLogGpuTimeBegin());
342   PetscCall(MatSeqAIJKokkosSyncDevice(A));
343   PetscCall(VecGetKokkosView(xx, &xv));
344   PetscCall(VecGetKokkosViewWrite(yy, &yv));
345   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
346   PetscCallCXX(KokkosSparse::spmv("N", 1.0 /*alpha*/, aijkok->csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A x + beta y */
347   PetscCall(VecRestoreKokkosView(xx, &xv));
348   PetscCall(VecRestoreKokkosViewWrite(yy, &yv));
349   /* 2.0*nnz - numRows seems more accurate here but assumes there are no zero-rows. So a little sloppy here. */
350   PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz()));
351   PetscCall(PetscLogGpuTimeEnd());
352   PetscFunctionReturn(PETSC_SUCCESS);
353 }
354 
355 /* y = A^T x */
356 static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy)
357 {
358   Mat_SeqAIJKokkos          *aijkok;
359   const char                *mode;
360   ConstPetscScalarKokkosView xv;
361   PetscScalarKokkosView      yv;
362   KokkosCsrMatrix            csrmat;
363 
364   PetscFunctionBegin;
365   PetscCall(PetscLogGpuTimeBegin());
366   PetscCall(MatSeqAIJKokkosSyncDevice(A));
367   PetscCall(VecGetKokkosView(xx, &xv));
368   PetscCall(VecGetKokkosViewWrite(yy, &yv));
369   if (A->form_explicit_transpose) {
370     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat));
371     mode = "N";
372   } else {
373     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
374     csrmat = aijkok->csrmat;
375     mode   = "T";
376   }
377   PetscCallCXX(KokkosSparse::spmv(mode, 1.0 /*alpha*/, csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^T x + beta y */
378   PetscCall(VecRestoreKokkosView(xx, &xv));
379   PetscCall(VecRestoreKokkosViewWrite(yy, &yv));
380   PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
381   PetscCall(PetscLogGpuTimeEnd());
382   PetscFunctionReturn(PETSC_SUCCESS);
383 }
384 
385 /* y = A^H x */
386 static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy)
387 {
388   Mat_SeqAIJKokkos          *aijkok;
389   const char                *mode;
390   ConstPetscScalarKokkosView xv;
391   PetscScalarKokkosView      yv;
392   KokkosCsrMatrix            csrmat;
393 
394   PetscFunctionBegin;
395   PetscCall(PetscLogGpuTimeBegin());
396   PetscCall(MatSeqAIJKokkosSyncDevice(A));
397   PetscCall(VecGetKokkosView(xx, &xv));
398   PetscCall(VecGetKokkosViewWrite(yy, &yv));
399   if (A->form_explicit_transpose) {
400     PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat));
401     mode = "N";
402   } else {
403     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
404     csrmat = aijkok->csrmat;
405     mode   = "C";
406   }
407   PetscCallCXX(KokkosSparse::spmv(mode, 1.0 /*alpha*/, csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^H x + beta y */
408   PetscCall(VecRestoreKokkosView(xx, &xv));
409   PetscCall(VecRestoreKokkosViewWrite(yy, &yv));
410   PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
411   PetscCall(PetscLogGpuTimeEnd());
412   PetscFunctionReturn(PETSC_SUCCESS);
413 }
414 
415 /* z = A x + y */
416 static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz)
417 {
418   Mat_SeqAIJKokkos          *aijkok;
419   ConstPetscScalarKokkosView xv, yv;
420   PetscScalarKokkosView      zv;
421 
422   PetscFunctionBegin;
423   PetscCall(PetscLogGpuTimeBegin());
424   PetscCall(MatSeqAIJKokkosSyncDevice(A));
425   PetscCall(VecGetKokkosView(xx, &xv));
426   PetscCall(VecGetKokkosView(yy, &yv));
427   PetscCall(VecGetKokkosViewWrite(zz, &zv));
428   if (zz != yy) Kokkos::deep_copy(zv, yv);
429   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
430   PetscCallCXX(KokkosSparse::spmv("N", 1.0 /*alpha*/, aijkok->csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A x + beta z */
431   PetscCall(VecRestoreKokkosView(xx, &xv));
432   PetscCall(VecRestoreKokkosView(yy, &yv));
433   PetscCall(VecRestoreKokkosViewWrite(zz, &zv));
434   PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz()));
435   PetscCall(PetscLogGpuTimeEnd());
436   PetscFunctionReturn(PETSC_SUCCESS);
437 }
438 
439 /* z = A^T x + y */
440 static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz)
441 {
442   Mat_SeqAIJKokkos          *aijkok;
443   const char                *mode;
444   ConstPetscScalarKokkosView xv, yv;
445   PetscScalarKokkosView      zv;
446   KokkosCsrMatrix            csrmat;
447 
448   PetscFunctionBegin;
449   PetscCall(PetscLogGpuTimeBegin());
450   PetscCall(MatSeqAIJKokkosSyncDevice(A));
451   PetscCall(VecGetKokkosView(xx, &xv));
452   PetscCall(VecGetKokkosView(yy, &yv));
453   PetscCall(VecGetKokkosViewWrite(zz, &zv));
454   if (zz != yy) Kokkos::deep_copy(zv, yv);
455   if (A->form_explicit_transpose) {
456     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat));
457     mode = "N";
458   } else {
459     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
460     csrmat = aijkok->csrmat;
461     mode   = "T";
462   }
463   PetscCallCXX(KokkosSparse::spmv(mode, 1.0 /*alpha*/, csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^T x + beta z */
464   PetscCall(VecRestoreKokkosView(xx, &xv));
465   PetscCall(VecRestoreKokkosView(yy, &yv));
466   PetscCall(VecRestoreKokkosViewWrite(zz, &zv));
467   PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
468   PetscCall(PetscLogGpuTimeEnd());
469   PetscFunctionReturn(PETSC_SUCCESS);
470 }
471 
472 /* z = A^H x + y */
473 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz)
474 {
475   Mat_SeqAIJKokkos          *aijkok;
476   const char                *mode;
477   ConstPetscScalarKokkosView xv, yv;
478   PetscScalarKokkosView      zv;
479   KokkosCsrMatrix            csrmat;
480 
481   PetscFunctionBegin;
482   PetscCall(PetscLogGpuTimeBegin());
483   PetscCall(MatSeqAIJKokkosSyncDevice(A));
484   PetscCall(VecGetKokkosView(xx, &xv));
485   PetscCall(VecGetKokkosView(yy, &yv));
486   PetscCall(VecGetKokkosViewWrite(zz, &zv));
487   if (zz != yy) Kokkos::deep_copy(zv, yv);
488   if (A->form_explicit_transpose) {
489     PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat));
490     mode = "N";
491   } else {
492     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
493     csrmat = aijkok->csrmat;
494     mode   = "C";
495   }
496   PetscCallCXX(KokkosSparse::spmv(mode, 1.0 /*alpha*/, csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^H x + beta z */
497   PetscCall(VecRestoreKokkosView(xx, &xv));
498   PetscCall(VecRestoreKokkosView(yy, &yv));
499   PetscCall(VecRestoreKokkosViewWrite(zz, &zv));
500   PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
501   PetscCall(PetscLogGpuTimeEnd());
502   PetscFunctionReturn(PETSC_SUCCESS);
503 }
504 
505 PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A, MatOption op, PetscBool flg)
506 {
507   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
508 
509   PetscFunctionBegin;
510   switch (op) {
511   case MAT_FORM_EXPLICIT_TRANSPOSE:
512     /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
513     if (A->form_explicit_transpose && !flg && aijkok) PetscCall(aijkok->DestroyMatTranspose());
514     A->form_explicit_transpose = flg;
515     break;
516   default:
517     PetscCall(MatSetOption_SeqAIJ(A, op, flg));
518     break;
519   }
520   PetscFunctionReturn(PETSC_SUCCESS);
521 }
522 
523 /* Depending on reuse, either build a new mat, or use the existing mat */
524 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat *newmat)
525 {
526   Mat_SeqAIJ *aseq;
527 
528   PetscFunctionBegin;
529   PetscCall(PetscKokkosInitializeCheck());
530   if (reuse == MAT_INITIAL_MATRIX) {                      /* Build a brand new mat */
531     PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat));  /* the returned newmat is a SeqAIJKokkos */
532   } else if (reuse == MAT_REUSE_MATRIX) {                 /* Reuse the mat created before */
533     PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN)); /* newmat is already a SeqAIJKokkos */
534   } else if (reuse == MAT_INPLACE_MATRIX) {               /* newmat is A */
535     PetscCheck(A == *newmat, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "A != *newmat with MAT_INPLACE_MATRIX");
536     PetscCall(PetscFree(A->defaultvectype));
537     PetscCall(PetscStrallocpy(VECKOKKOS, &A->defaultvectype)); /* Allocate and copy the string */
538     PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATSEQAIJKOKKOS));
539     PetscCall(MatSetOps_SeqAIJKokkos(A));
540     aseq = static_cast<Mat_SeqAIJ *>(A->data);
541     if (A->assembled) { /* Copy i, j (but not values) to device for an assembled matrix if not yet */
542       PetscCheck(!A->spptr, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Expect NULL (Mat_SeqAIJKokkos*)A->spptr");
543       A->spptr = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aseq->nz, aseq->i, aseq->j, aseq->a, A->nonzerostate, PETSC_FALSE);
544     }
545   }
546   PetscFunctionReturn(PETSC_SUCCESS);
547 }
548 
549 /* MatDuplicate always creates a new matrix. MatDuplicate can be called either on an assembled matrix or
550    an unassembled matrix, even though MAT_COPY_VALUES is not allowed for unassembled matrix.
551  */
552 static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A, MatDuplicateOption dupOption, Mat *B)
553 {
554   Mat_SeqAIJ       *bseq;
555   Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr), *bkok;
556   Mat               mat;
557 
558   PetscFunctionBegin;
559   /* Do not copy values on host as A's latest values might be on device. We don't want to do sync blindly */
560   PetscCall(MatDuplicate_SeqAIJ(A, MAT_DO_NOT_COPY_VALUES, B));
561   mat = *B;
562   if (A->preallocated) {
563     bseq = static_cast<Mat_SeqAIJ *>(mat->data);
564     bkok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, bseq->nz, bseq->i, bseq->j, bseq->a, mat->nonzerostate, PETSC_FALSE);
565     bkok->a_dual.clear_sync_state(); /* Clear B's sync state as it will be decided below */
566     /* Now copy values to B if needed */
567     if (dupOption == MAT_COPY_VALUES) {
568       if (akok->a_dual.need_sync_device()) {
569         Kokkos::deep_copy(bkok->a_dual.view_host(), akok->a_dual.view_host());
570         bkok->a_dual.modify_host();
571       } else { /* If device has the latest data, we only copy data on device */
572         Kokkos::deep_copy(bkok->a_dual.view_device(), akok->a_dual.view_device());
573         bkok->a_dual.modify_device();
574       }
575     } else { /* MAT_DO_NOT_COPY_VALUES or MAT_SHARE_NONZERO_PATTERN. B's values should be zeroed */
576       /* B's values on host should be already zeroed by MatDuplicate_SeqAIJ() */
577       bkok->a_dual.modify_host();
578     }
579     mat->spptr = bkok;
580   }
581 
582   PetscCall(PetscFree(mat->defaultvectype));
583   PetscCall(PetscStrallocpy(VECKOKKOS, &mat->defaultvectype)); /* Allocate and copy the string */
584   PetscCall(PetscObjectChangeTypeName((PetscObject)mat, MATSEQAIJKOKKOS));
585   PetscCall(MatSetOps_SeqAIJKokkos(mat));
586   PetscFunctionReturn(PETSC_SUCCESS);
587 }
588 
589 static PetscErrorCode MatTranspose_SeqAIJKokkos(Mat A, MatReuse reuse, Mat *B)
590 {
591   Mat               At;
592   KokkosCsrMatrix   internT;
593   Mat_SeqAIJKokkos *atkok, *bkok;
594 
595   PetscFunctionBegin;
596   if (reuse == MAT_REUSE_MATRIX) PetscCall(MatTransposeCheckNonzeroState_Private(A, *B));
597   PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &internT)); /* Generate a transpose internally */
598   if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_INPLACE_MATRIX) {
599     /* Deep copy internT, as we want to isolate the internal transpose */
600     PetscCallCXX(atkok = new Mat_SeqAIJKokkos(KokkosCsrMatrix("csrmat", internT)));
601     PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PetscObjectComm((PetscObject)A), atkok, &At));
602     if (reuse == MAT_INITIAL_MATRIX) *B = At;
603     else PetscCall(MatHeaderReplace(A, &At)); /* Replace A with At inplace */
604   } else {                                    /* MAT_REUSE_MATRIX, just need to copy values to B on device */
605     if ((*B)->assembled) {
606       bkok = static_cast<Mat_SeqAIJKokkos *>((*B)->spptr);
607       PetscCallCXX(Kokkos::deep_copy(bkok->a_dual.view_device(), internT.values));
608       PetscCall(MatSeqAIJKokkosModifyDevice(*B));
609     } else if ((*B)->preallocated) { /* It is ok for B to be only preallocated, as needed in MatTranspose_MPIAIJ */
610       Mat_SeqAIJ             *bseq = static_cast<Mat_SeqAIJ *>((*B)->data);
611       MatScalarKokkosViewHost a_h(bseq->a, internT.nnz()); /* bseq->nz = 0 if unassembled */
612       MatColIdxKokkosViewHost j_h(bseq->j, internT.nnz());
613       PetscCallCXX(Kokkos::deep_copy(a_h, internT.values));
614       PetscCallCXX(Kokkos::deep_copy(j_h, internT.graph.entries));
615     } else SETERRQ(PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "B must be assembled or preallocated");
616   }
617   PetscFunctionReturn(PETSC_SUCCESS);
618 }
619 
620 static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A)
621 {
622   Mat_SeqAIJKokkos *aijkok;
623 
624   PetscFunctionBegin;
625   if (A->factortype == MAT_FACTOR_NONE) {
626     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
627     delete aijkok;
628   } else {
629     delete static_cast<Mat_SeqAIJKokkosTriFactors *>(A->spptr);
630   }
631   A->spptr = NULL;
632   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatFactorGetSolverType_C", NULL));
633   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
634   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
635   PetscCall(MatDestroy_SeqAIJ(A));
636   PetscFunctionReturn(PETSC_SUCCESS);
637 }
638 
639 /*MC
640    MATSEQAIJKOKKOS - MATAIJKOKKOS = "(seq)aijkokkos" - A matrix type to be used for sparse matrices with Kokkos
641 
642    A matrix type type using Kokkos-Kernels CrsMatrix type for portability across different device types
643 
644    Options Database Key:
645 .  -mat_type aijkokkos - sets the matrix type to `MATSEQAIJKOKKOS` during a call to `MatSetFromOptions()`
646 
647   Level: beginner
648 
649 .seealso: [](ch_matrices), `Mat`, `MatCreateSeqAIJKokkos()`, `MATMPIAIJKOKKOS`
650 M*/
651 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A)
652 {
653   PetscFunctionBegin;
654   PetscCall(PetscKokkosInitializeCheck());
655   PetscCall(MatCreate_SeqAIJ(A));
656   PetscCall(MatConvert_SeqAIJ_SeqAIJKokkos(A, MATSEQAIJKOKKOS, MAT_INPLACE_MATRIX, &A));
657   PetscFunctionReturn(PETSC_SUCCESS);
658 }
659 
660 /* Merge A, B into a matrix C. A is put before B. C's size would be A->rmap->n by (A->cmap->n + B->cmap->n) */
661 PetscErrorCode MatSeqAIJKokkosMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C)
662 {
663   Mat_SeqAIJ         *a, *b;
664   Mat_SeqAIJKokkos   *akok, *bkok, *ckok;
665   MatScalarKokkosView aa, ba, ca;
666   MatRowMapKokkosView ai, bi, ci;
667   MatColIdxKokkosView aj, bj, cj;
668   PetscInt            m, n, nnz, aN;
669 
670   PetscFunctionBegin;
671   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
672   PetscValidHeaderSpecific(B, MAT_CLASSID, 2);
673   PetscValidPointer(C, 4);
674   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
675   PetscCheckTypeName(B, MATSEQAIJKOKKOS);
676   PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n);
677   PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported");
678 
679   PetscCall(MatSeqAIJKokkosSyncDevice(A));
680   PetscCall(MatSeqAIJKokkosSyncDevice(B));
681   a    = static_cast<Mat_SeqAIJ *>(A->data);
682   b    = static_cast<Mat_SeqAIJ *>(B->data);
683   akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
684   bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
685   aa   = akok->a_dual.view_device();
686   ai   = akok->i_dual.view_device();
687   ba   = bkok->a_dual.view_device();
688   bi   = bkok->i_dual.view_device();
689   m    = A->rmap->n; /* M, N and nnz of C */
690   n    = A->cmap->n + B->cmap->n;
691   nnz  = a->nz + b->nz;
692   aN   = A->cmap->n; /* N of A */
693   if (reuse == MAT_INITIAL_MATRIX) {
694     aj           = akok->j_dual.view_device();
695     bj           = bkok->j_dual.view_device();
696     auto ca_dual = MatScalarKokkosDualView("a", aa.extent(0) + ba.extent(0));
697     auto ci_dual = MatRowMapKokkosDualView("i", ai.extent(0));
698     auto cj_dual = MatColIdxKokkosDualView("j", aj.extent(0) + bj.extent(0));
699     ca           = ca_dual.view_device();
700     ci           = ci_dual.view_device();
701     cj           = cj_dual.view_device();
702 
703     /* Concatenate A and B in parallel using Kokkos hierarchical parallelism */
704     Kokkos::parallel_for(
705       Kokkos::TeamPolicy<>(m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
706         PetscInt i       = t.league_rank(); /* row i */
707         PetscInt coffset = ai(i) + bi(i), alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i);
708 
709         Kokkos::single(Kokkos::PerTeam(t), [=]() { /* this side effect only happens once per whole team */
710                                                    ci(i) = coffset;
711                                                    if (i == m - 1) ci(m) = ai(m) + bi(m);
712         });
713 
714         Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) {
715           if (k < alen) {
716             ca(coffset + k) = aa(ai(i) + k);
717             cj(coffset + k) = aj(ai(i) + k);
718           } else {
719             ca(coffset + k) = ba(bi(i) + k - alen);
720             cj(coffset + k) = bj(bi(i) + k - alen) + aN; /* Entries in B get new column indices in C */
721           }
722         });
723       });
724     ca_dual.modify_device();
725     ci_dual.modify_device();
726     cj_dual.modify_device();
727     PetscCallCXX(ckok = new Mat_SeqAIJKokkos(m, n, nnz, ci_dual, cj_dual, ca_dual));
728     PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, ckok, C));
729   } else if (reuse == MAT_REUSE_MATRIX) {
730     PetscValidHeaderSpecific(*C, MAT_CLASSID, 4);
731     PetscCheckTypeName(*C, MATSEQAIJKOKKOS);
732     ckok = static_cast<Mat_SeqAIJKokkos *>((*C)->spptr);
733     ca   = ckok->a_dual.view_device();
734     ci   = ckok->i_dual.view_device();
735 
736     Kokkos::parallel_for(
737       Kokkos::TeamPolicy<>(m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
738         PetscInt i    = t.league_rank(); /* row i */
739         PetscInt alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i);
740         Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) {
741           if (k < alen) ca(ci(i) + k) = aa(ai(i) + k);
742           else ca(ci(i) + k) = ba(bi(i) + k - alen);
743         });
744       });
745     PetscCall(MatSeqAIJKokkosModifyDevice(*C));
746   }
747   PetscFunctionReturn(PETSC_SUCCESS);
748 }
749 
750 static PetscErrorCode MatProductDataDestroy_SeqAIJKokkos(void *pdata)
751 {
752   PetscFunctionBegin;
753   delete static_cast<MatProductData_SeqAIJKokkos *>(pdata);
754   PetscFunctionReturn(PETSC_SUCCESS);
755 }
756 
757 static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C)
758 {
759   Mat_Product                 *product = C->product;
760   Mat                          A, B;
761   bool                         transA, transB; /* use bool, since KK needs this type */
762   Mat_SeqAIJKokkos            *akok, *bkok, *ckok;
763   Mat_SeqAIJ                  *c;
764   MatProductData_SeqAIJKokkos *pdata;
765   KokkosCsrMatrix              csrmatA, csrmatB;
766 
767   PetscFunctionBegin;
768   MatCheckProduct(C, 1);
769   PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Product data empty");
770   pdata = static_cast<MatProductData_SeqAIJKokkos *>(C->product->data);
771 
772   // See if numeric has already been done in symbolic (e.g., user calls MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C)).
773   // If yes, skip the numeric, but reset the flag so that next time when user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C),
774   // we still do numeric.
775   if (pdata->reusesym) { // numeric reuses results from symbolic
776     pdata->reusesym = PETSC_FALSE;
777     PetscFunctionReturn(PETSC_SUCCESS);
778   }
779 
780   switch (product->type) {
781   case MATPRODUCT_AB:
782     transA = false;
783     transB = false;
784     break;
785   case MATPRODUCT_AtB:
786     transA = true;
787     transB = false;
788     break;
789   case MATPRODUCT_ABt:
790     transA = false;
791     transB = true;
792     break;
793   default:
794     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]);
795   }
796 
797   A = product->A;
798   B = product->B;
799   PetscCall(MatSeqAIJKokkosSyncDevice(A));
800   PetscCall(MatSeqAIJKokkosSyncDevice(B));
801   akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
802   bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
803   ckok = static_cast<Mat_SeqAIJKokkos *>(C->spptr);
804 
805   PetscCheck(ckok, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Device data structure spptr is empty");
806 
807   csrmatA = akok->csrmat;
808   csrmatB = bkok->csrmat;
809 
810   /* TODO: Once KK spgemm implements transpose, we can get rid of the explicit transpose here */
811   if (transA) {
812     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA));
813     transA = false;
814   }
815 
816   if (transB) {
817     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB));
818     transB = false;
819   }
820   PetscCall(PetscLogGpuTimeBegin());
821   PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, csrmatA, transA, csrmatB, transB, ckok->csrmat));
822 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(4, 0, 0)
823   auto spgemmHandle = pdata->kh.get_spgemm_handle();
824   if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(ckok->csrmat)); /* without sort, mat_tests-ex62_14_seqaijkokkos fails */
825 #endif
826 
827   PetscCall(PetscLogGpuTimeEnd());
828   PetscCall(MatSeqAIJKokkosModifyDevice(C));
829   /* shorter version of MatAssemblyEnd_SeqAIJ */
830   c = (Mat_SeqAIJ *)C->data;
831   PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz));
832   PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n"));
833   PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax));
834   c->reallocs         = 0;
835   C->info.mallocs     = 0;
836   C->info.nz_unneeded = 0;
837   C->assembled = C->was_assembled = PETSC_TRUE;
838   C->num_ass++;
839   PetscFunctionReturn(PETSC_SUCCESS);
840 }
841 
842 static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C)
843 {
844   Mat_Product                 *product = C->product;
845   MatProductType               ptype;
846   Mat                          A, B;
847   bool                         transA, transB;
848   Mat_SeqAIJKokkos            *akok, *bkok, *ckok;
849   MatProductData_SeqAIJKokkos *pdata;
850   MPI_Comm                     comm;
851   KokkosCsrMatrix              csrmatA, csrmatB, csrmatC;
852 
853   PetscFunctionBegin;
854   MatCheckProduct(C, 1);
855   PetscCall(PetscObjectGetComm((PetscObject)C, &comm));
856   PetscCheck(!product->data, comm, PETSC_ERR_PLIB, "Product data not empty");
857   A = product->A;
858   B = product->B;
859   PetscCall(MatSeqAIJKokkosSyncDevice(A));
860   PetscCall(MatSeqAIJKokkosSyncDevice(B));
861   akok    = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
862   bkok    = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
863   csrmatA = akok->csrmat;
864   csrmatB = bkok->csrmat;
865 
866   ptype = product->type;
867   // Take advantage of the symmetry if true
868   if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
869     ptype                                          = MATPRODUCT_AB;
870     product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE;
871   }
872   if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
873     ptype                                          = MATPRODUCT_AB;
874     product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE;
875   }
876 
877   switch (ptype) {
878   case MATPRODUCT_AB:
879     transA = false;
880     transB = false;
881     break;
882   case MATPRODUCT_AtB:
883     transA = true;
884     transB = false;
885     break;
886   case MATPRODUCT_ABt:
887     transA = false;
888     transB = true;
889     break;
890   default:
891     SETERRQ(comm, PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]);
892   }
893   PetscCallCXX(product->data = pdata = new MatProductData_SeqAIJKokkos());
894   pdata->reusesym = product->api_user;
895 
896   /* TODO: add command line options to select spgemm algorithms */
897   auto spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_DEFAULT; /* default alg is TPL if enabled, otherwise KK */
898 
899   /* CUDA-10.2's spgemm has bugs. We prefer the SpGEMMreuse APIs introduced in cuda-11.4 */
900 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
901   #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
902   spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_KK;
903   #endif
904 #endif
905   PetscCallCXX(pdata->kh.create_spgemm_handle(spgemm_alg));
906 
907   PetscCall(PetscLogGpuTimeBegin());
908   /* TODO: Get rid of the explicit transpose once KK-spgemm implements the transpose option */
909   if (transA) {
910     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA));
911     transA = false;
912   }
913 
914   if (transB) {
915     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB));
916     transB = false;
917   }
918 
919   PetscCallCXX(KokkosSparse::spgemm_symbolic(pdata->kh, csrmatA, transA, csrmatB, transB, csrmatC));
920   /* spgemm_symbolic() only populates C's rowmap, but not C's column indices.
921     So we have to do a fake spgemm_numeric() here to get csrmatC.j_d setup, before
922     calling new Mat_SeqAIJKokkos().
923     TODO: Remove the fake spgemm_numeric() after KK fixed this problem.
924   */
925   PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, csrmatA, transA, csrmatB, transB, csrmatC));
926 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(4, 0, 0)
927   /* Query if KK outputs a sorted matrix. If not, we need to sort it */
928   auto spgemmHandle = pdata->kh.get_spgemm_handle();
929   if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(csrmatC)); /* sort_option defaults to -1 in KK!*/
930 #endif
931   PetscCall(PetscLogGpuTimeEnd());
932 
933   PetscCallCXX(ckok = new Mat_SeqAIJKokkos(csrmatC));
934   PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(C, ckok));
935   C->product->destroy = MatProductDataDestroy_SeqAIJKokkos;
936   PetscFunctionReturn(PETSC_SUCCESS);
937 }
938 
939 /* handles sparse matrix matrix ops */
940 static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat)
941 {
942   Mat_Product *product = mat->product;
943   PetscBool    Biskok = PETSC_FALSE, Ciskok = PETSC_TRUE;
944 
945   PetscFunctionBegin;
946   MatCheckProduct(mat, 1);
947   PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJKOKKOS, &Biskok));
948   if (product->type == MATPRODUCT_ABC) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJKOKKOS, &Ciskok));
949   if (Biskok && Ciskok) {
950     switch (product->type) {
951     case MATPRODUCT_AB:
952     case MATPRODUCT_AtB:
953     case MATPRODUCT_ABt:
954       mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos;
955       break;
956     case MATPRODUCT_PtAP:
957     case MATPRODUCT_RARt:
958     case MATPRODUCT_ABC:
959       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
960       break;
961     default:
962       break;
963     }
964   } else { /* fallback for AIJ */
965     PetscCall(MatProductSetFromOptions_SeqAIJ(mat));
966   }
967   PetscFunctionReturn(PETSC_SUCCESS);
968 }
969 
970 static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a)
971 {
972   Mat_SeqAIJKokkos *aijkok;
973 
974   PetscFunctionBegin;
975   PetscCall(PetscLogGpuTimeBegin());
976   PetscCall(MatSeqAIJKokkosSyncDevice(A));
977   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
978   KokkosBlas::scal(aijkok->a_dual.view_device(), a, aijkok->a_dual.view_device());
979   PetscCall(MatSeqAIJKokkosModifyDevice(A));
980   PetscCall(PetscLogGpuFlops(aijkok->a_dual.extent(0)));
981   PetscCall(PetscLogGpuTimeEnd());
982   PetscFunctionReturn(PETSC_SUCCESS);
983 }
984 
985 static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A)
986 {
987   Mat_SeqAIJKokkos *aijkok;
988 
989   PetscFunctionBegin;
990   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
991   if (aijkok) { /* Only zero the device if data is already there */
992     KokkosBlas::fill(aijkok->a_dual.view_device(), 0.0);
993     PetscCall(MatSeqAIJKokkosModifyDevice(A));
994   } else { /* Might be preallocated but not assembled */
995     PetscCall(MatZeroEntries_SeqAIJ(A));
996   }
997   PetscFunctionReturn(PETSC_SUCCESS);
998 }
999 
1000 static PetscErrorCode MatGetDiagonal_SeqAIJKokkos(Mat A, Vec x)
1001 {
1002   Mat_SeqAIJ           *aijseq;
1003   Mat_SeqAIJKokkos     *aijkok;
1004   PetscInt              n;
1005   PetscScalarKokkosView xv;
1006 
1007   PetscFunctionBegin;
1008   PetscCall(VecGetLocalSize(x, &n));
1009   PetscCheck(n == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector");
1010   PetscCheck(A->factortype == MAT_FACTOR_NONE, PETSC_COMM_SELF, PETSC_ERR_SUP, "MatGetDiagonal_SeqAIJKokkos not supported on factored matrices");
1011 
1012   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1013   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1014 
1015   if (A->rmap->n && aijkok->diag_dual.extent(0) == 0) { /* Set the diagonal pointer if not already */
1016     PetscCall(MatMarkDiagonal_SeqAIJ(A));
1017     aijseq = static_cast<Mat_SeqAIJ *>(A->data);
1018     aijkok->SetDiagonal(aijseq->diag);
1019   }
1020 
1021   const auto &Aa    = aijkok->a_dual.view_device();
1022   const auto &Ai    = aijkok->i_dual.view_device();
1023   const auto &Adiag = aijkok->diag_dual.view_device();
1024 
1025   PetscCall(VecGetKokkosViewWrite(x, &xv));
1026   Kokkos::parallel_for(
1027     n, KOKKOS_LAMBDA(const PetscInt i) {
1028       if (Adiag(i) < Ai(i + 1)) xv(i) = Aa(Adiag(i));
1029       else xv(i) = 0;
1030     });
1031   PetscCall(VecRestoreKokkosViewWrite(x, &xv));
1032   PetscFunctionReturn(PETSC_SUCCESS);
1033 }
1034 
1035 /* Get a Kokkos View from a mat of type MatSeqAIJKokkos */
1036 PetscErrorCode MatSeqAIJGetKokkosView(Mat A, ConstMatScalarKokkosView *kv)
1037 {
1038   Mat_SeqAIJKokkos *aijkok;
1039 
1040   PetscFunctionBegin;
1041   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1042   PetscValidPointer(kv, 2);
1043   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1044   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1045   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1046   *kv    = aijkok->a_dual.view_device();
1047   PetscFunctionReturn(PETSC_SUCCESS);
1048 }
1049 
1050 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, ConstMatScalarKokkosView *kv)
1051 {
1052   PetscFunctionBegin;
1053   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1054   PetscValidPointer(kv, 2);
1055   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1056   PetscFunctionReturn(PETSC_SUCCESS);
1057 }
1058 
1059 PetscErrorCode MatSeqAIJGetKokkosView(Mat A, MatScalarKokkosView *kv)
1060 {
1061   Mat_SeqAIJKokkos *aijkok;
1062 
1063   PetscFunctionBegin;
1064   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1065   PetscValidPointer(kv, 2);
1066   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1067   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1068   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1069   *kv    = aijkok->a_dual.view_device();
1070   PetscFunctionReturn(PETSC_SUCCESS);
1071 }
1072 
1073 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, MatScalarKokkosView *kv)
1074 {
1075   PetscFunctionBegin;
1076   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1077   PetscValidPointer(kv, 2);
1078   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1079   PetscCall(MatSeqAIJKokkosModifyDevice(A));
1080   PetscFunctionReturn(PETSC_SUCCESS);
1081 }
1082 
1083 PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A, MatScalarKokkosView *kv)
1084 {
1085   Mat_SeqAIJKokkos *aijkok;
1086 
1087   PetscFunctionBegin;
1088   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1089   PetscValidPointer(kv, 2);
1090   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1091   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1092   *kv    = aijkok->a_dual.view_device();
1093   PetscFunctionReturn(PETSC_SUCCESS);
1094 }
1095 
1096 PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A, MatScalarKokkosView *kv)
1097 {
1098   PetscFunctionBegin;
1099   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1100   PetscValidPointer(kv, 2);
1101   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1102   PetscCall(MatSeqAIJKokkosModifyDevice(A));
1103   PetscFunctionReturn(PETSC_SUCCESS);
1104 }
1105 
1106 /* Computes Y += alpha X */
1107 static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y, PetscScalar alpha, Mat X, MatStructure pattern)
1108 {
1109   Mat_SeqAIJ              *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data;
1110   Mat_SeqAIJKokkos        *xkok, *ykok, *zkok;
1111   ConstMatScalarKokkosView Xa;
1112   MatScalarKokkosView      Ya;
1113 
1114   PetscFunctionBegin;
1115   PetscCheckTypeName(Y, MATSEQAIJKOKKOS);
1116   PetscCheckTypeName(X, MATSEQAIJKOKKOS);
1117   PetscCall(MatSeqAIJKokkosSyncDevice(Y));
1118   PetscCall(MatSeqAIJKokkosSyncDevice(X));
1119   PetscCall(PetscLogGpuTimeBegin());
1120 
1121   if (pattern != SAME_NONZERO_PATTERN && x->nz == y->nz) {
1122     PetscBool e;
1123     PetscCall(PetscArraycmp(x->i, y->i, Y->rmap->n + 1, &e));
1124     if (e) {
1125       PetscCall(PetscArraycmp(x->j, y->j, y->nz, &e));
1126       if (e) pattern = SAME_NONZERO_PATTERN;
1127     }
1128   }
1129 
1130   /* cusparseDcsrgeam2() computes C = alpha A + beta B. If one knew sparsity pattern of C, one can skip
1131     cusparseScsrgeam2_bufferSizeExt() / cusparseXcsrgeam2Nnz(), and directly call cusparseScsrgeam2().
1132     If X is SUBSET_NONZERO_PATTERN of Y, we could take advantage of this cusparse feature. However,
1133     KokkosSparse::spadd(alpha,A,beta,B,C) has symbolic and numeric phases, MatAXPY does not.
1134   */
1135   ykok = static_cast<Mat_SeqAIJKokkos *>(Y->spptr);
1136   xkok = static_cast<Mat_SeqAIJKokkos *>(X->spptr);
1137   Xa   = xkok->a_dual.view_device();
1138   Ya   = ykok->a_dual.view_device();
1139 
1140   if (pattern == SAME_NONZERO_PATTERN) {
1141     KokkosBlas::axpy(alpha, Xa, Ya);
1142     PetscCall(MatSeqAIJKokkosModifyDevice(Y));
1143   } else if (pattern == SUBSET_NONZERO_PATTERN) {
1144     MatRowMapKokkosView Xi = xkok->i_dual.view_device(), Yi = ykok->i_dual.view_device();
1145     MatColIdxKokkosView Xj = xkok->j_dual.view_device(), Yj = ykok->j_dual.view_device();
1146 
1147     Kokkos::parallel_for(
1148       Kokkos::TeamPolicy<>(Y->rmap->n, 1), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
1149         PetscInt i = t.league_rank(); // row i
1150         Kokkos::single(Kokkos::PerTeam(t), [=]() {
1151           // Only one thread works in a team
1152           PetscInt p, q = Yi(i);
1153           for (p = Xi(i); p < Xi(i + 1); p++) {          // For each nonzero on row i of X,
1154             while (Xj(p) != Yj(q) && q < Yi(i + 1)) q++; // find the matching nonzero on row i of Y.
1155             if (Xj(p) == Yj(q)) {                        // Found it
1156               Ya(q) += alpha * Xa(p);
1157               q++;
1158             } else {
1159             // If not found, it indicates the input is wrong (X is not a SUBSET_NONZERO_PATTERN of Y).
1160             // Just insert a NaN at the beginning of row i if it is not empty, to make the result wrong.
1161 #if PETSC_PKG_KOKKOS_VERSION_GE(3, 7, 0)
1162               if (Yi(i) != Yi(i + 1)) Ya(Yi(i)) = Kokkos::ArithTraits<PetscScalar>::nan();
1163 #else
1164               if (Yi(i) != Yi(i + 1)) Ya(Yi(i)) = Kokkos::Experimental::nan("1");
1165 #endif
1166             }
1167           }
1168         });
1169       });
1170     PetscCall(MatSeqAIJKokkosModifyDevice(Y));
1171   } else { // different nonzero patterns
1172     Mat             Z;
1173     KokkosCsrMatrix zcsr;
1174     KernelHandle    kh;
1175     kh.create_spadd_handle(true); // X, Y are sorted
1176     KokkosSparse::spadd_symbolic(&kh, xkok->csrmat, ykok->csrmat, zcsr);
1177     KokkosSparse::spadd_numeric(&kh, alpha, xkok->csrmat, (PetscScalar)1.0, ykok->csrmat, zcsr);
1178     zkok = new Mat_SeqAIJKokkos(zcsr);
1179     PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, zkok, &Z));
1180     PetscCall(MatHeaderReplace(Y, &Z));
1181     kh.destroy_spadd_handle();
1182   }
1183   PetscCall(PetscLogGpuTimeEnd());
1184   PetscCall(PetscLogGpuFlops(xkok->a_dual.extent(0) * 2)); // Because we scaled X and then added it to Y
1185   PetscFunctionReturn(PETSC_SUCCESS);
1186 }
1187 
1188 struct MatCOOStruct_SeqAIJKokkos {
1189   PetscCount           n;
1190   PetscCount           Atot;
1191   PetscInt             nz;
1192   PetscCountKokkosView jmap;
1193   PetscCountKokkosView perm;
1194 
1195   MatCOOStruct_SeqAIJKokkos(const MatCOOStruct_SeqAIJ *coo_h)
1196   {
1197     nz   = coo_h->nz;
1198     n    = coo_h->n;
1199     Atot = coo_h->Atot;
1200     jmap = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), PetscCountKokkosViewHost(coo_h->jmap, nz + 1));
1201     perm = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), PetscCountKokkosViewHost(coo_h->perm, Atot));
1202   }
1203 };
1204 
1205 static PetscErrorCode MatCOOStructDestroy_SeqAIJKokkos(void *data)
1206 {
1207   PetscFunctionBegin;
1208   PetscCallCXX(delete static_cast<MatCOOStruct_SeqAIJKokkos *>(data));
1209   PetscFunctionReturn(PETSC_SUCCESS);
1210 }
1211 
1212 static PetscErrorCode MatSetPreallocationCOO_SeqAIJKokkos(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
1213 {
1214   Mat_SeqAIJKokkos          *akok;
1215   Mat_SeqAIJ                *aseq;
1216   PetscContainer             container_h, container_d;
1217   MatCOOStruct_SeqAIJ       *coo_h;
1218   MatCOOStruct_SeqAIJKokkos *coo_d;
1219 
1220   PetscFunctionBegin;
1221   PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, coo_i, coo_j));
1222   aseq = static_cast<Mat_SeqAIJ *>(mat->data);
1223   akok = static_cast<Mat_SeqAIJKokkos *>(mat->spptr);
1224   delete akok;
1225   mat->spptr = akok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, aseq->nz, aseq->i, aseq->j, aseq->a, mat->nonzerostate + 1, PETSC_FALSE);
1226   PetscCall(MatZeroEntries_SeqAIJKokkos(mat));
1227 
1228   // Copy the COO struct to device
1229   PetscCall(PetscObjectQuery((PetscObject)mat, "__PETSc_MatCOOStruct_Host", (PetscObject *)&container_h));
1230   PetscCall(PetscContainerGetPointer(container_h, (void **)&coo_h));
1231   PetscCallCXX(coo_d = new MatCOOStruct_SeqAIJKokkos(coo_h));
1232 
1233   // Put the COO struct in a container and then attach that to the matrix
1234   PetscCall(PetscContainerCreate(PETSC_COMM_SELF, &container_d));
1235   PetscCall(PetscContainerSetPointer(container_d, coo_d));
1236   PetscCall(PetscContainerSetUserDestroy(container_d, MatCOOStructDestroy_SeqAIJKokkos));
1237   PetscCall(PetscObjectCompose((PetscObject)mat, "__PETSc_MatCOOStruct_Device", (PetscObject)container_d));
1238   PetscCall(PetscContainerDestroy(&container_d));
1239   PetscFunctionReturn(PETSC_SUCCESS);
1240 }
1241 
1242 static PetscErrorCode MatSetValuesCOO_SeqAIJKokkos(Mat A, const PetscScalar v[], InsertMode imode)
1243 {
1244   MatScalarKokkosView        Aa;
1245   ConstMatScalarKokkosView   kv;
1246   PetscMemType               memtype;
1247   PetscContainer             container;
1248   MatCOOStruct_SeqAIJKokkos *coo;
1249 
1250   PetscFunctionBegin;
1251   PetscCall(PetscObjectQuery((PetscObject)A, "__PETSc_MatCOOStruct_Device", (PetscObject *)&container));
1252   PetscCall(PetscContainerGetPointer(container, (void **)&coo));
1253 
1254   const auto &n    = coo->n;
1255   const auto &Annz = coo->nz;
1256   const auto &jmap = coo->jmap;
1257   const auto &perm = coo->perm;
1258 
1259   PetscCall(PetscGetMemType(v, &memtype));
1260   if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */
1261     kv = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), ConstMatScalarKokkosViewHost(v, n));
1262   } else {
1263     kv = ConstMatScalarKokkosView(v, n); /* Directly use v[]'s memory */
1264   }
1265 
1266   if (imode == INSERT_VALUES) PetscCall(MatSeqAIJGetKokkosViewWrite(A, &Aa)); /* write matrix values */
1267   else PetscCall(MatSeqAIJGetKokkosView(A, &Aa));                             /* read & write matrix values */
1268 
1269   Kokkos::parallel_for(
1270     Annz, KOKKOS_LAMBDA(const PetscCount i) {
1271       PetscScalar sum = 0.0;
1272       for (PetscCount k = jmap(i); k < jmap(i + 1); k++) sum += kv(perm(k));
1273       Aa(i) = (imode == INSERT_VALUES ? 0.0 : Aa(i)) + sum;
1274     });
1275 
1276   if (imode == INSERT_VALUES) PetscCall(MatSeqAIJRestoreKokkosViewWrite(A, &Aa));
1277   else PetscCall(MatSeqAIJRestoreKokkosView(A, &Aa));
1278   PetscFunctionReturn(PETSC_SUCCESS);
1279 }
1280 
1281 static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info)
1282 {
1283   PetscFunctionBegin;
1284   PetscCall(MatSeqAIJKokkosSyncHost(A));
1285   PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info));
1286   B->offloadmask = PETSC_OFFLOAD_CPU;
1287   PetscFunctionReturn(PETSC_SUCCESS);
1288 }
1289 
1290 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A)
1291 {
1292   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
1293 
1294   PetscFunctionBegin;
1295   A->offloadmask = PETSC_OFFLOAD_KOKKOS; /* We do not really use this flag */
1296   A->boundtocpu  = PETSC_FALSE;
1297 
1298   A->ops->assemblyend               = MatAssemblyEnd_SeqAIJKokkos;
1299   A->ops->destroy                   = MatDestroy_SeqAIJKokkos;
1300   A->ops->duplicate                 = MatDuplicate_SeqAIJKokkos;
1301   A->ops->axpy                      = MatAXPY_SeqAIJKokkos;
1302   A->ops->scale                     = MatScale_SeqAIJKokkos;
1303   A->ops->zeroentries               = MatZeroEntries_SeqAIJKokkos;
1304   A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJKokkos;
1305   A->ops->mult                      = MatMult_SeqAIJKokkos;
1306   A->ops->multadd                   = MatMultAdd_SeqAIJKokkos;
1307   A->ops->multtranspose             = MatMultTranspose_SeqAIJKokkos;
1308   A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJKokkos;
1309   A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJKokkos;
1310   A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos;
1311   A->ops->productnumeric            = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos;
1312   A->ops->transpose                 = MatTranspose_SeqAIJKokkos;
1313   A->ops->setoption                 = MatSetOption_SeqAIJKokkos;
1314   A->ops->getdiagonal               = MatGetDiagonal_SeqAIJKokkos;
1315   a->ops->getarray                  = MatSeqAIJGetArray_SeqAIJKokkos;
1316   a->ops->restorearray              = MatSeqAIJRestoreArray_SeqAIJKokkos;
1317   a->ops->getarrayread              = MatSeqAIJGetArrayRead_SeqAIJKokkos;
1318   a->ops->restorearrayread          = MatSeqAIJRestoreArrayRead_SeqAIJKokkos;
1319   a->ops->getarraywrite             = MatSeqAIJGetArrayWrite_SeqAIJKokkos;
1320   a->ops->restorearraywrite         = MatSeqAIJRestoreArrayWrite_SeqAIJKokkos;
1321   a->ops->getcsrandmemtype          = MatSeqAIJGetCSRAndMemType_SeqAIJKokkos;
1322 
1323   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJKokkos));
1324   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJKokkos));
1325   PetscFunctionReturn(PETSC_SUCCESS);
1326 }
1327 
1328 PETSC_INTERN PetscErrorCode MatSetSeqAIJKokkosWithCSRMatrix(Mat A, Mat_SeqAIJKokkos *akok)
1329 {
1330   Mat_SeqAIJ *aseq;
1331   PetscInt    i, m, n;
1332 
1333   PetscFunctionBegin;
1334   PetscCheck(!A->spptr, PETSC_COMM_SELF, PETSC_ERR_PLIB, "A->spptr is supposed to be empty");
1335 
1336   m = akok->nrows();
1337   n = akok->ncols();
1338   PetscCall(MatSetSizes(A, m, n, m, n));
1339   PetscCall(MatSetType(A, MATSEQAIJKOKKOS));
1340 
1341   /* Set up data structures of A as a MATSEQAIJ */
1342   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(A, MAT_SKIP_ALLOCATION, NULL));
1343   aseq = (Mat_SeqAIJ *)(A)->data;
1344 
1345   akok->i_dual.sync_host(); /* We always need sync'ed i, j on host */
1346   akok->j_dual.sync_host();
1347 
1348   aseq->i            = akok->i_host_data();
1349   aseq->j            = akok->j_host_data();
1350   aseq->a            = akok->a_host_data();
1351   aseq->nonew        = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/
1352   aseq->singlemalloc = PETSC_FALSE;
1353   aseq->free_a       = PETSC_FALSE;
1354   aseq->free_ij      = PETSC_FALSE;
1355   aseq->nz           = akok->nnz();
1356   aseq->maxnz        = aseq->nz;
1357 
1358   PetscCall(PetscMalloc1(m, &aseq->imax));
1359   PetscCall(PetscMalloc1(m, &aseq->ilen));
1360   for (i = 0; i < m; i++) aseq->ilen[i] = aseq->imax[i] = aseq->i[i + 1] - aseq->i[i];
1361 
1362   /* It is critical to set the nonzerostate, as we use it to check if sparsity pattern (hence data) has changed on host in MatAssemblyEnd */
1363   akok->nonzerostate = A->nonzerostate;
1364   A->spptr           = akok; /* Set A->spptr before MatAssembly so that A->spptr won't be allocated again there */
1365   PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY));
1366   PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY));
1367   PetscFunctionReturn(PETSC_SUCCESS);
1368 }
1369 
1370 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGetKokkosCsrMatrix(Mat A, KokkosCsrMatrix *csr)
1371 {
1372   PetscFunctionBegin;
1373   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1374   *csr = static_cast<Mat_SeqAIJKokkos *>(A->spptr)->csrmat;
1375   PetscFunctionReturn(PETSC_SUCCESS);
1376 }
1377 
1378 PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithKokkosCsrMatrix(MPI_Comm comm, KokkosCsrMatrix csr, Mat *A)
1379 {
1380   Mat_SeqAIJKokkos *akok;
1381   PetscFunctionBegin;
1382   PetscCallCXX(akok = new Mat_SeqAIJKokkos(csr));
1383   PetscCall(MatCreate(comm, A));
1384   PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok));
1385   PetscFunctionReturn(PETSC_SUCCESS);
1386 }
1387 
1388 /* Crete a SEQAIJKOKKOS matrix with a Mat_SeqAIJKokkos data structure
1389 
1390    Note we have names like MatSeqAIJSetPreallocationCSR, so I use capitalized CSR
1391  */
1392 PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithCSRMatrix(MPI_Comm comm, Mat_SeqAIJKokkos *akok, Mat *A)
1393 {
1394   PetscFunctionBegin;
1395   PetscCall(MatCreate(comm, A));
1396   PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok));
1397   PetscFunctionReturn(PETSC_SUCCESS);
1398 }
1399 
1400 /*@C
1401    MatCreateSeqAIJKokkos - Creates a sparse matrix in `MATSEQAIJKOKKOS` (compressed row) format
1402    (the default parallel PETSc format). This matrix will ultimately be handled by
1403    Kokkos for calculations.
1404 
1405    Collective
1406 
1407    Input Parameters:
1408 +  comm - MPI communicator, set to `PETSC_COMM_SELF`
1409 .  m - number of rows
1410 .  n - number of columns
1411 .  nz - number of nonzeros per row (same for all rows), ignored if `nnz` is provided
1412 -  nnz - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL`
1413 
1414    Output Parameter:
1415 .  A - the matrix
1416 
1417    Level: intermediate
1418 
1419    Notes:
1420    It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
1421    MatXXXXSetPreallocation() paradgm instead of this routine directly.
1422    [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]
1423 
1424    The AIJ format, also called
1425    compressed row storage, is fully compatible with standard Fortran
1426    storage.  That is, the stored row and column indices can begin at
1427    either one (as in Fortran) or zero.
1428 
1429    Specify the preallocated storage with either `nz` or `nnz` (not both).
1430    Set `nz` = `PETSC_DEFAULT` and `nnz` = `NULL` for PETSc to control dynamic memory
1431    allocation.
1432 
1433 .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MatCreateAIJ()`
1434 @*/
1435 PetscErrorCode MatCreateSeqAIJKokkos(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A)
1436 {
1437   PetscFunctionBegin;
1438   PetscCall(PetscKokkosInitializeCheck());
1439   PetscCall(MatCreate(comm, A));
1440   PetscCall(MatSetSizes(*A, m, n, m, n));
1441   PetscCall(MatSetType(*A, MATSEQAIJKOKKOS));
1442   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz));
1443   PetscFunctionReturn(PETSC_SUCCESS);
1444 }
1445 
1446 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
1447 {
1448   PetscFunctionBegin;
1449   PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
1450   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos;
1451   PetscFunctionReturn(PETSC_SUCCESS);
1452 }
1453 
1454 static PetscErrorCode MatSeqAIJKokkosSymbolicSolveCheck(Mat A)
1455 {
1456   Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1457 
1458   PetscFunctionBegin;
1459   if (!factors->sptrsv_symbolic_completed) {
1460     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khU, factors->iU_d, factors->jU_d, factors->aU_d);
1461     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khL, factors->iL_d, factors->jL_d, factors->aL_d);
1462     factors->sptrsv_symbolic_completed = PETSC_TRUE;
1463   }
1464   PetscFunctionReturn(PETSC_SUCCESS);
1465 }
1466 
1467 /* Check if we need to update factors etc for transpose solve */
1468 static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A)
1469 {
1470   Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1471   MatColIdxType               n       = A->rmap->n;
1472 
1473   PetscFunctionBegin;
1474   if (!factors->transpose_updated) { /* TODO: KK needs to provide functions to do numeric transpose only */
1475     /* Update L^T and do sptrsv symbolic */
1476     factors->iLt_d = MatRowMapKokkosView("factors->iLt_d", n + 1); // KK requires 0
1477     factors->jLt_d = MatColIdxKokkosView(NoInit("factors->jLt_d"), factors->jL_d.extent(0));
1478     factors->aLt_d = MatScalarKokkosView(NoInit("factors->aLt_d"), factors->aL_d.extent(0));
1479 
1480     transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iL_d, factors->jL_d, factors->aL_d,
1481                                                                                                                                                                                                               factors->iLt_d, factors->jLt_d, factors->aLt_d);
1482 
1483     /* TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices.
1484       We have to sort the indices, until KK provides finer control options.
1485     */
1486     sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iLt_d, factors->jLt_d, factors->aLt_d);
1487 
1488     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d);
1489 
1490     /* Update U^T and do sptrsv symbolic */
1491     factors->iUt_d = MatRowMapKokkosView("factors->iUt_d", n + 1); // KK requires 0
1492     factors->jUt_d = MatColIdxKokkosView(NoInit("factors->jUt_d"), factors->jU_d.extent(0));
1493     factors->aUt_d = MatScalarKokkosView(NoInit("factors->aUt_d"), factors->aU_d.extent(0));
1494 
1495     transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iU_d, factors->jU_d, factors->aU_d,
1496                                                                                                                                                                                                               factors->iUt_d, factors->jUt_d, factors->aUt_d);
1497 
1498     /* Sort indices. See comments above */
1499     sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iUt_d, factors->jUt_d, factors->aUt_d);
1500 
1501     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d);
1502     factors->transpose_updated = PETSC_TRUE;
1503   }
1504   PetscFunctionReturn(PETSC_SUCCESS);
1505 }
1506 
1507 /* Solve Ax = b, with A = LU */
1508 static PetscErrorCode MatSolve_SeqAIJKokkos(Mat A, Vec b, Vec x)
1509 {
1510   ConstPetscScalarKokkosView  bv;
1511   PetscScalarKokkosView       xv;
1512   Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1513 
1514   PetscFunctionBegin;
1515   PetscCall(PetscLogGpuTimeBegin());
1516   PetscCall(MatSeqAIJKokkosSymbolicSolveCheck(A));
1517   PetscCall(VecGetKokkosView(b, &bv));
1518   PetscCall(VecGetKokkosViewWrite(x, &xv));
1519   /* Solve L tmpv = b */
1520   PetscCallCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khL, factors->iL_d, factors->jL_d, factors->aL_d, bv, factors->workVector));
1521   /* Solve Ux = tmpv */
1522   PetscCallCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khU, factors->iU_d, factors->jU_d, factors->aU_d, factors->workVector, xv));
1523   PetscCall(VecRestoreKokkosView(b, &bv));
1524   PetscCall(VecRestoreKokkosViewWrite(x, &xv));
1525   PetscCall(PetscLogGpuTimeEnd());
1526   PetscFunctionReturn(PETSC_SUCCESS);
1527 }
1528 
1529 /* Solve A^T x = b, where A^T = U^T L^T */
1530 static PetscErrorCode MatSolveTranspose_SeqAIJKokkos(Mat A, Vec b, Vec x)
1531 {
1532   ConstPetscScalarKokkosView  bv;
1533   PetscScalarKokkosView       xv;
1534   Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1535 
1536   PetscFunctionBegin;
1537   PetscCall(PetscLogGpuTimeBegin());
1538   PetscCall(MatSeqAIJKokkosTransposeSolveCheck(A));
1539   PetscCall(VecGetKokkosView(b, &bv));
1540   PetscCall(VecGetKokkosViewWrite(x, &xv));
1541   /* Solve U^T tmpv = b */
1542   KokkosSparse::Experimental::sptrsv_solve(&factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d, bv, factors->workVector);
1543 
1544   /* Solve L^T x = tmpv */
1545   KokkosSparse::Experimental::sptrsv_solve(&factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d, factors->workVector, xv);
1546   PetscCall(VecRestoreKokkosView(b, &bv));
1547   PetscCall(VecRestoreKokkosViewWrite(x, &xv));
1548   PetscCall(PetscLogGpuTimeEnd());
1549   PetscFunctionReturn(PETSC_SUCCESS);
1550 }
1551 
1552 static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info)
1553 {
1554   Mat_SeqAIJKokkos           *aijkok   = (Mat_SeqAIJKokkos *)A->spptr;
1555   Mat_SeqAIJKokkosTriFactors *factors  = (Mat_SeqAIJKokkosTriFactors *)B->spptr;
1556   PetscInt                    fill_lev = info->levels;
1557 
1558   PetscFunctionBegin;
1559   PetscCall(PetscLogGpuTimeBegin());
1560   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1561 
1562   auto a_d = aijkok->a_dual.view_device();
1563   auto i_d = aijkok->i_dual.view_device();
1564   auto j_d = aijkok->j_dual.view_device();
1565 
1566   KokkosSparse::Experimental::spiluk_numeric(&factors->kh, fill_lev, i_d, j_d, a_d, factors->iL_d, factors->jL_d, factors->aL_d, factors->iU_d, factors->jU_d, factors->aU_d);
1567 
1568   B->assembled              = PETSC_TRUE;
1569   B->preallocated           = PETSC_TRUE;
1570   B->ops->solve             = MatSolve_SeqAIJKokkos;
1571   B->ops->solvetranspose    = MatSolveTranspose_SeqAIJKokkos;
1572   B->ops->matsolve          = NULL;
1573   B->ops->matsolvetranspose = NULL;
1574   B->offloadmask            = PETSC_OFFLOAD_GPU;
1575 
1576   /* Once the factors' value changed, we need to update their transpose and sptrsv handle */
1577   factors->transpose_updated         = PETSC_FALSE;
1578   factors->sptrsv_symbolic_completed = PETSC_FALSE;
1579   /* TODO: log flops, but how to know that? */
1580   PetscCall(PetscLogGpuTimeEnd());
1581   PetscFunctionReturn(PETSC_SUCCESS);
1582 }
1583 
1584 static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
1585 {
1586   Mat_SeqAIJKokkos           *aijkok;
1587   Mat_SeqAIJ                 *b;
1588   Mat_SeqAIJKokkosTriFactors *factors  = (Mat_SeqAIJKokkosTriFactors *)B->spptr;
1589   PetscInt                    fill_lev = info->levels;
1590   PetscInt                    nnzA     = ((Mat_SeqAIJ *)A->data)->nz, nnzL, nnzU;
1591   PetscInt                    n        = A->rmap->n;
1592 
1593   PetscFunctionBegin;
1594   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1595   /* Rebuild factors */
1596   if (factors) {
1597     factors->Destroy();
1598   } /* Destroy the old if it exists */
1599   else {
1600     B->spptr = factors = new Mat_SeqAIJKokkosTriFactors(n);
1601   }
1602 
1603   /* Create a spiluk handle and then do symbolic factorization */
1604   nnzL = nnzU = PetscRealIntMultTruncate(info->fill, nnzA);
1605   factors->kh.create_spiluk_handle(KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1, n, nnzL, nnzU);
1606 
1607   auto spiluk_handle = factors->kh.get_spiluk_handle();
1608 
1609   Kokkos::realloc(factors->iL_d, n + 1); /* Free old arrays and realloc */
1610   Kokkos::realloc(factors->jL_d, spiluk_handle->get_nnzL());
1611   Kokkos::realloc(factors->iU_d, n + 1);
1612   Kokkos::realloc(factors->jU_d, spiluk_handle->get_nnzU());
1613 
1614   aijkok   = (Mat_SeqAIJKokkos *)A->spptr;
1615   auto i_d = aijkok->i_dual.view_device();
1616   auto j_d = aijkok->j_dual.view_device();
1617   KokkosSparse::Experimental::spiluk_symbolic(&factors->kh, fill_lev, i_d, j_d, factors->iL_d, factors->jL_d, factors->iU_d, factors->jU_d);
1618   /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */
1619 
1620   Kokkos::resize(factors->jL_d, spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */
1621   Kokkos::resize(factors->jU_d, spiluk_handle->get_nnzU());
1622   Kokkos::realloc(factors->aL_d, spiluk_handle->get_nnzL()); /* No need to retain old value */
1623   Kokkos::realloc(factors->aU_d, spiluk_handle->get_nnzU());
1624 
1625   /* TODO: add options to select sptrsv algorithms */
1626   /* Create sptrsv handles for L, U and their transpose */
1627 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
1628   auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SPTRSV_CUSPARSE;
1629 #else
1630   auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1;
1631 #endif
1632 
1633   factors->khL.create_sptrsv_handle(sptrsv_alg, n, true /* L is lower tri */);
1634   factors->khU.create_sptrsv_handle(sptrsv_alg, n, false /* U is not lower tri */);
1635   factors->khLt.create_sptrsv_handle(sptrsv_alg, n, false /* L^T is not lower tri */);
1636   factors->khUt.create_sptrsv_handle(sptrsv_alg, n, true /* U^T is lower tri */);
1637 
1638   /* Fill fields of the factor matrix B */
1639   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(B, MAT_SKIP_ALLOCATION, NULL));
1640   b     = (Mat_SeqAIJ *)B->data;
1641   b->nz = b->maxnz          = spiluk_handle->get_nnzL() + spiluk_handle->get_nnzU();
1642   B->info.fill_ratio_given  = info->fill;
1643   B->info.fill_ratio_needed = nnzA > 0 ? ((PetscReal)b->nz) / ((PetscReal)nnzA) : 1.0;
1644 
1645   B->offloadmask          = PETSC_OFFLOAD_GPU;
1646   B->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJKokkos;
1647   PetscFunctionReturn(PETSC_SUCCESS);
1648 }
1649 
1650 static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos(Mat A, MatSolverType *type)
1651 {
1652   PetscFunctionBegin;
1653   *type = MATSOLVERKOKKOS;
1654   PetscFunctionReturn(PETSC_SUCCESS);
1655 }
1656 
1657 /*MC
1658   MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices
1659   on a single GPU of type, `MATSEQAIJKOKKOS`, `MATAIJKOKKOS`.
1660 
1661   Level: beginner
1662 
1663 .seealso: [](ch_matrices), `Mat`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJKokkos()`, `MATAIJKOKKOS`, `MatKokkosSetFormat()`, `MatKokkosStorageFormat`, `MatKokkosFormatOperation`
1664 M*/
1665 PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A, MatFactorType ftype, Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */
1666 {
1667   PetscInt n = A->rmap->n;
1668 
1669   PetscFunctionBegin;
1670   PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B));
1671   PetscCall(MatSetSizes(*B, n, n, n, n));
1672   (*B)->factortype = ftype;
1673   PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU]));
1674   PetscCall(MatSetType(*B, MATSEQAIJKOKKOS));
1675 
1676   if (ftype == MAT_FACTOR_LU) {
1677     PetscCall(MatSetBlockSizesFromMats(*B, A, A));
1678     (*B)->canuseordering        = PETSC_TRUE;
1679     (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKokkos;
1680   } else if (ftype == MAT_FACTOR_ILU) {
1681     PetscCall(MatSetBlockSizesFromMats(*B, A, A));
1682     (*B)->canuseordering         = PETSC_FALSE;
1683     (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos;
1684   } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]);
1685 
1686   PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL));
1687   PetscCall(PetscObjectComposeFunction((PetscObject)(*B), "MatFactorGetSolverType_C", MatFactorGetSolverType_SeqAIJKokkos));
1688   PetscFunctionReturn(PETSC_SUCCESS);
1689 }
1690 
1691 PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_KOKKOS(void)
1692 {
1693   PetscFunctionBegin;
1694   PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_LU, MatGetFactor_SeqAIJKokkos_Kokkos));
1695   PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_ILU, MatGetFactor_SeqAIJKokkos_Kokkos));
1696   PetscFunctionReturn(PETSC_SUCCESS);
1697 }
1698 
1699 /* Utility to print out a KokkosCsrMatrix for debugging */
1700 PETSC_INTERN PetscErrorCode PrintCsrMatrix(const KokkosCsrMatrix &csrmat)
1701 {
1702   const auto        &iv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), csrmat.graph.row_map);
1703   const auto        &jv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), csrmat.graph.entries);
1704   const auto        &av = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), csrmat.values);
1705   const PetscInt    *i  = iv.data();
1706   const PetscInt    *j  = jv.data();
1707   const PetscScalar *a  = av.data();
1708   PetscInt           m = csrmat.numRows(), n = csrmat.numCols(), nnz = csrmat.nnz();
1709 
1710   PetscFunctionBegin;
1711   PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT " x %" PetscInt_FMT " SeqAIJKokkos, with %" PetscInt_FMT " nonzeros\n", m, n, nnz));
1712   for (PetscInt k = 0; k < m; k++) {
1713     PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT ": ", k));
1714     for (PetscInt p = i[k]; p < i[k + 1]; p++) PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT "(%.1f), ", j[p], (double)PetscRealPart(a[p])));
1715     PetscCall(PetscPrintf(PETSC_COMM_SELF, "\n"));
1716   }
1717   PetscFunctionReturn(PETSC_SUCCESS);
1718 }
1719