xref: /petsc/src/mat/impls/aij/seq/kokkos/aijkok.kokkos.cxx (revision 70faa4e68e85355a5b9d00c7669f5865fa0fdf3e)
1 #include <petscvec_kokkos.hpp>
2 #include <petscpkg_version.h>
3 #include <petsc/private/petscimpl.h>
4 #include <petsc/private/sfimpl.h>
5 #include <petscsystypes.h>
6 #include <petscerror.h>
7 
8 #include <Kokkos_Core.hpp>
9 #include <KokkosBlas.hpp>
10 #include <KokkosKernels_Sorting.hpp>
11 #include <KokkosSparse_CrsMatrix.hpp>
12 #include <KokkosSparse_spmv.hpp>
13 #include <KokkosSparse_spiluk.hpp>
14 #include <KokkosSparse_sptrsv.hpp>
15 #include <KokkosSparse_spgemm.hpp>
16 #include <KokkosSparse_spadd.hpp>
17 
18 #include <../src/mat/impls/aij/seq/kokkos/aijkok.hpp>
19 
20 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */
21 
22 /* MatAssemblyEnd_SeqAIJKokkos() happens when we finalized nonzeros of the matrix, either after
23    we assembled the matrix on host, or after we directly produced the matrix data on device (ex., through MatMatMult).
24    In the latter case, it is important to set a_dual's sync state correctly.
25  */
26 static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A,MatAssemblyType mode)
27 {
28   Mat_SeqAIJ       *aijseq;
29   Mat_SeqAIJKokkos *aijkok;
30 
31   PetscFunctionBegin;
32   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0);
33   PetscCall(MatAssemblyEnd_SeqAIJ(A,mode));
34 
35   aijseq = static_cast<Mat_SeqAIJ*>(A->data);
36   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
37 
38   /* If aijkok does not exist, we just copy i, j to device.
39      If aijkok already exists, but the device's nonzero pattern does not match with the host's, we assume the latest data is on host.
40      In both cases, we build a new aijkok structure.
41   */
42   if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { /* aijkok might not exist yet or nonzero pattern has changed */
43     delete aijkok;
44     aijkok   = new Mat_SeqAIJKokkos(A->rmap->n,A->cmap->n,aijseq->nz,aijseq->i,aijseq->j,aijseq->a,A->nonzerostate,PETSC_FALSE/*don't copy mat values to device*/);
45     A->spptr = aijkok;
46   }
47 
48   if (aijkok && aijkok->device_mat_d.data()) {
49     A->offloadmask = PETSC_OFFLOAD_GPU; // in GPU mode, no going back. MatSetValues checks this
50   }
51   PetscFunctionReturn(0);
52 }
53 
54 /* Sync CSR data to device if not yet */
55 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A)
56 {
57   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
58 
59   PetscFunctionBegin;
60   PetscCheck(A->factortype == MAT_FACTOR_NONE,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync factorized matrix from host to device");
61   PetscCheck(aijkok,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
62   if (aijkok->a_dual.need_sync_device()) {
63     aijkok->a_dual.sync_device();
64     aijkok->transpose_updated = PETSC_FALSE; /* values of the transpose is out-of-date */
65     aijkok->hermitian_updated = PETSC_FALSE;
66   }
67   PetscFunctionReturn(0);
68 }
69 
70 /* Mark the CSR data on device as modified */
71 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosModifyDevice(Mat A)
72 {
73   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
74 
75   PetscFunctionBegin;
76   PetscCheck(A->factortype == MAT_FACTOR_NONE,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not supported for factorized matries");
77   aijkok->a_dual.clear_sync_state();
78   aijkok->a_dual.modify_device();
79   aijkok->transpose_updated = PETSC_FALSE;
80   aijkok->hermitian_updated = PETSC_FALSE;
81   PetscCall(MatSeqAIJInvalidateDiagonal(A));
82   PetscCall(PetscObjectStateIncrease((PetscObject)A));
83   PetscFunctionReturn(0);
84 }
85 
86 static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A)
87 {
88   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
89 
90   PetscFunctionBegin;
91   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
92   /* We do not expect one needs factors on host  */
93   PetscCheck(A->factortype == MAT_FACTOR_NONE,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync factorized matrix from device to host");
94   PetscCheck(aijkok,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing AIJKOK");
95   aijkok->a_dual.sync_host();
96   PetscFunctionReturn(0);
97 }
98 
99 static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A,PetscScalar *array[])
100 {
101   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
102 
103   PetscFunctionBegin;
104   if (aijkok) {
105     aijkok->a_dual.sync_host();
106     *array = aijkok->a_dual.view_host().data();
107   } else { /* Happens when calling MatSetValues on a newly created matrix */
108     *array = static_cast<Mat_SeqAIJ*>(A->data)->a;
109   }
110   PetscFunctionReturn(0);
111 }
112 
113 static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJKokkos(Mat A,PetscScalar *array[])
114 {
115   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
116 
117   PetscFunctionBegin;
118   if (aijkok) aijkok->a_dual.modify_host();
119   PetscFunctionReturn(0);
120 }
121 
122 static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJKokkos(Mat A,const PetscScalar *array[])
123 {
124   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
125 
126   PetscFunctionBegin;
127   if (aijkok) {
128     aijkok->a_dual.sync_host();
129     *array = aijkok->a_dual.view_host().data();
130   } else {
131     *array = static_cast<Mat_SeqAIJ*>(A->data)->a;
132   }
133   PetscFunctionReturn(0);
134 }
135 
136 static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJKokkos(Mat A,const PetscScalar *array[])
137 {
138   PetscFunctionBegin;
139   *array = NULL;
140   PetscFunctionReturn(0);
141 }
142 
143 static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJKokkos(Mat A,PetscScalar *array[])
144 {
145   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
146 
147   PetscFunctionBegin;
148   if (aijkok) {
149     *array = aijkok->a_dual.view_host().data();
150   } else { /* Ex. happens with MatZeroEntries on a preallocated but not assembled matrix */
151     *array = static_cast<Mat_SeqAIJ*>(A->data)->a;
152   }
153   PetscFunctionReturn(0);
154 }
155 
156 static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJKokkos(Mat A,PetscScalar *array[])
157 {
158   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
159 
160   PetscFunctionBegin;
161   if (aijkok) {
162     aijkok->a_dual.clear_sync_state();
163     aijkok->a_dual.modify_host();
164   }
165   PetscFunctionReturn(0);
166 }
167 
168 static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJKokkos(Mat A,const PetscInt **i,const PetscInt **j,PetscScalar **a,PetscMemType *mtype)
169 {
170   Mat_SeqAIJKokkos  *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
171 
172   PetscFunctionBegin;
173   PetscCheck(aijkok != NULL,PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"aijkok is NULL");
174 
175   if (i) *i = aijkok->i_device_data();
176   if (j) *j = aijkok->j_device_data();
177   if (a) {
178     aijkok->a_dual.sync_device();
179     *a = aijkok->a_device_data();
180   }
181   if (mtype) *mtype = PETSC_MEMTYPE_KOKKOS;
182   PetscFunctionReturn(0);
183 }
184 
185 // MatSeqAIJKokkosSetDeviceMat takes a PetscSplitCSRDataStructure with device data and copies it to the device. Note, "deep_copy" here is really a shallow copy
186 PetscErrorCode MatSeqAIJKokkosSetDeviceMat(Mat A, PetscSplitCSRDataStructure h_mat)
187 {
188   Mat_SeqAIJKokkos                             *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
189   Kokkos::View<SplitCSRMat, Kokkos::HostSpace> h_mat_k(h_mat);
190 
191   PetscFunctionBegin;
192   PetscCheck(aijkok,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
193   aijkok->device_mat_d = create_mirror(DefaultMemorySpace(),h_mat_k);
194   Kokkos::deep_copy (aijkok->device_mat_d, h_mat_k);
195   PetscFunctionReturn(0);
196 }
197 
198 // MatSeqAIJKokkosGetDeviceMat gets the device if it is here, otherwise it creates a place for it and returns NULL
199 PetscErrorCode MatSeqAIJKokkosGetDeviceMat(Mat A, PetscSplitCSRDataStructure *d_mat)
200 {
201   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
202 
203   PetscFunctionBegin;
204   if (aijkok && aijkok->device_mat_d.data()) {
205     *d_mat = aijkok->device_mat_d.data();
206   } else {
207     PetscCall(MatSeqAIJKokkosSyncDevice(A)); // create aijkok (we are making d_mat now so make a place for it)
208     *d_mat  = NULL;
209   }
210   PetscFunctionReturn(0);
211 }
212 
213 /* Generate the transpose on device and cache it internally */
214 static PetscErrorCode MatSeqAIJKokkosGenerateTranspose_Private(Mat A, KokkosCsrMatrix **csrmatT)
215 {
216   Mat_SeqAIJKokkos                 *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
217 
218   PetscFunctionBegin;
219   PetscCheck(aijkok,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
220   if (!aijkok->csrmatT.nnz() || !aijkok->transpose_updated) { /* Generate At for the first time OR just update its values */
221     /* FIXME: KK does not separate symbolic/numeric transpose. We could have a permutation array to help value-only update */
222     PetscCallCXX(aijkok->a_dual.sync_device());
223     PetscCallCXX(aijkok->csrmatT = KokkosKernels::Impl::transpose_matrix(aijkok->csrmat));
224     PetscCallCXX(KokkosKernels::sort_crs_matrix(aijkok->csrmatT));
225     aijkok->transpose_updated = PETSC_TRUE;
226   }
227   *csrmatT = &aijkok->csrmatT;
228   PetscFunctionReturn(0);
229 }
230 
231 /* Generate the Hermitian on device and cache it internally */
232 static PetscErrorCode MatSeqAIJKokkosGenerateHermitian_Private(Mat A, KokkosCsrMatrix **csrmatH)
233 {
234   Mat_SeqAIJKokkos                 *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
235 
236   PetscFunctionBegin;
237   PetscCall(PetscLogGpuTimeBegin());
238   PetscCheck(aijkok,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
239   if (!aijkok->csrmatH.nnz() || !aijkok->hermitian_updated) { /* Generate Ah for the first time OR just update its values */
240     PetscCallCXX(aijkok->a_dual.sync_device());
241     PetscCallCXX(aijkok->csrmatH = KokkosKernels::Impl::transpose_matrix(aijkok->csrmat));
242     PetscCallCXX(KokkosKernels::sort_crs_matrix(aijkok->csrmatH));
243    #if defined(PETSC_USE_COMPLEX)
244     const auto& a = aijkok->csrmatH.values;
245     Kokkos::parallel_for(a.extent(0),KOKKOS_LAMBDA(MatRowMapType i) {a(i) = PetscConj(a(i));});
246    #endif
247     aijkok->hermitian_updated = PETSC_TRUE;
248   }
249   *csrmatH = &aijkok->csrmatH;
250   PetscCall(PetscLogGpuTimeEnd());
251   PetscFunctionReturn(0);
252 }
253 
254 /* y = A x */
255 static PetscErrorCode MatMult_SeqAIJKokkos(Mat A,Vec xx,Vec yy)
256 {
257   Mat_SeqAIJKokkos                 *aijkok;
258   ConstPetscScalarKokkosView       xv;
259   PetscScalarKokkosView            yv;
260 
261   PetscFunctionBegin;
262   PetscCall(PetscLogGpuTimeBegin());
263   PetscCall(MatSeqAIJKokkosSyncDevice(A));
264   PetscCall(VecGetKokkosView(xx,&xv));
265   PetscCall(VecGetKokkosViewWrite(yy,&yv));
266   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
267   KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csrmat,xv,0.0/*beta*/,yv); /* y = alpha A x + beta y */
268   PetscCall(VecRestoreKokkosView(xx,&xv));
269   PetscCall(VecRestoreKokkosViewWrite(yy,&yv));
270   /* 2.0*nnz - numRows seems more accurate here but assumes there are no zero-rows. So a little sloppy here. */
271   PetscCall(PetscLogGpuFlops(2.0*aijkok->csrmat.nnz()));
272   PetscCall(PetscLogGpuTimeEnd());
273   PetscFunctionReturn(0);
274 }
275 
276 /* y = A^T x */
277 static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy)
278 {
279   Mat_SeqAIJKokkos                 *aijkok;
280   const char                       *mode;
281   ConstPetscScalarKokkosView       xv;
282   PetscScalarKokkosView            yv;
283   KokkosCsrMatrix                  *csrmat;
284 
285   PetscFunctionBegin;
286   PetscCall(PetscLogGpuTimeBegin());
287   PetscCall(MatSeqAIJKokkosSyncDevice(A));
288   PetscCall(VecGetKokkosView(xx,&xv));
289   PetscCall(VecGetKokkosViewWrite(yy,&yv));
290   if (A->form_explicit_transpose) {
291     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmat));
292     mode = "N";
293   } else {
294     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
295     csrmat = &aijkok->csrmat;
296     mode = "T";
297   }
298   KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,0.0/*beta*/,yv); /* y = alpha A^T x + beta y */
299   PetscCall(VecRestoreKokkosView(xx,&xv));
300   PetscCall(VecRestoreKokkosViewWrite(yy,&yv));
301   PetscCall(PetscLogGpuFlops(2.0*csrmat->nnz()));
302   PetscCall(PetscLogGpuTimeEnd());
303   PetscFunctionReturn(0);
304 }
305 
306 /* y = A^H x */
307 static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy)
308 {
309   Mat_SeqAIJKokkos                 *aijkok;
310   const char                       *mode;
311   ConstPetscScalarKokkosView       xv;
312   PetscScalarKokkosView            yv;
313   KokkosCsrMatrix                  *csrmat;
314 
315   PetscFunctionBegin;
316   PetscCall(PetscLogGpuTimeBegin());
317   PetscCall(MatSeqAIJKokkosSyncDevice(A));
318   PetscCall(VecGetKokkosView(xx,&xv));
319   PetscCall(VecGetKokkosViewWrite(yy,&yv));
320   if (A->form_explicit_transpose) {
321     PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A,&csrmat));
322     mode = "N";
323   } else {
324     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
325     csrmat = &aijkok->csrmat;
326     mode = "C";
327   }
328   KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,0.0/*beta*/,yv); /* y = alpha A^H x + beta y */
329   PetscCall(VecRestoreKokkosView(xx,&xv));
330   PetscCall(VecRestoreKokkosViewWrite(yy,&yv));
331   PetscCall(PetscLogGpuFlops(2.0*csrmat->nnz()));
332   PetscCall(PetscLogGpuTimeEnd());
333   PetscFunctionReturn(0);
334 }
335 
336 /* z = A x + y */
337 static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy, Vec zz)
338 {
339   Mat_SeqAIJKokkos                 *aijkok;
340   ConstPetscScalarKokkosView       xv,yv;
341   PetscScalarKokkosView            zv;
342 
343   PetscFunctionBegin;
344   PetscCall(PetscLogGpuTimeBegin());
345   PetscCall(MatSeqAIJKokkosSyncDevice(A));
346   PetscCall(VecGetKokkosView(xx,&xv));
347   PetscCall(VecGetKokkosView(yy,&yv));
348   PetscCall(VecGetKokkosViewWrite(zz,&zv));
349   if (zz != yy) Kokkos::deep_copy(zv,yv);
350   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
351   KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csrmat,xv,1.0/*beta*/,zv); /* z = alpha A x + beta z */
352   PetscCall(VecRestoreKokkosView(xx,&xv));
353   PetscCall(VecRestoreKokkosView(yy,&yv));
354   PetscCall(VecRestoreKokkosViewWrite(zz,&zv));
355   PetscCall(PetscLogGpuFlops(2.0*aijkok->csrmat.nnz()));
356   PetscCall(PetscLogGpuTimeEnd());
357   PetscFunctionReturn(0);
358 }
359 
360 /* z = A^T x + y */
361 static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz)
362 {
363   Mat_SeqAIJKokkos                 *aijkok;
364   const char                       *mode;
365   ConstPetscScalarKokkosView       xv,yv;
366   PetscScalarKokkosView            zv;
367   KokkosCsrMatrix                  *csrmat;
368 
369   PetscFunctionBegin;
370   PetscCall(PetscLogGpuTimeBegin());
371   PetscCall(MatSeqAIJKokkosSyncDevice(A));
372   PetscCall(VecGetKokkosView(xx,&xv));
373   PetscCall(VecGetKokkosView(yy,&yv));
374   PetscCall(VecGetKokkosViewWrite(zz,&zv));
375   if (zz != yy) Kokkos::deep_copy(zv,yv);
376   if (A->form_explicit_transpose) {
377     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmat));
378     mode = "N";
379   } else {
380     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
381     csrmat = &aijkok->csrmat;
382     mode = "T";
383   }
384   KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,1.0/*beta*/,zv); /* z = alpha A^T x + beta z */
385   PetscCall(VecRestoreKokkosView(xx,&xv));
386   PetscCall(VecRestoreKokkosView(yy,&yv));
387   PetscCall(VecRestoreKokkosViewWrite(zz,&zv));
388   PetscCall(PetscLogGpuFlops(2.0*csrmat->nnz()));
389   PetscCall(PetscLogGpuTimeEnd());
390   PetscFunctionReturn(0);
391 }
392 
393 /* z = A^H x + y */
394 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz)
395 {
396   Mat_SeqAIJKokkos                 *aijkok;
397   const char                       *mode;
398   ConstPetscScalarKokkosView       xv,yv;
399   PetscScalarKokkosView            zv;
400   KokkosCsrMatrix                  *csrmat;
401 
402   PetscFunctionBegin;
403   PetscCall(PetscLogGpuTimeBegin());
404   PetscCall(MatSeqAIJKokkosSyncDevice(A));
405   PetscCall(VecGetKokkosView(xx,&xv));
406   PetscCall(VecGetKokkosView(yy,&yv));
407   PetscCall(VecGetKokkosViewWrite(zz,&zv));
408   if (zz != yy) Kokkos::deep_copy(zv,yv);
409   if (A->form_explicit_transpose) {
410     PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A,&csrmat));
411     mode = "N";
412   } else {
413     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
414     csrmat = &aijkok->csrmat;
415     mode = "C";
416   }
417   KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,1.0/*beta*/,zv); /* z = alpha A^H x + beta z */
418   PetscCall(VecRestoreKokkosView(xx,&xv));
419   PetscCall(VecRestoreKokkosView(yy,&yv));
420   PetscCall(VecRestoreKokkosViewWrite(zz,&zv));
421   PetscCall(PetscLogGpuFlops(2.0*csrmat->nnz()));
422   PetscCall(PetscLogGpuTimeEnd());
423   PetscFunctionReturn(0);
424 }
425 
426 PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A,MatOption op,PetscBool flg)
427 {
428   Mat_SeqAIJKokkos          *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
429 
430   PetscFunctionBegin;
431   switch (op) {
432     case MAT_FORM_EXPLICIT_TRANSPOSE:
433       /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
434       if (A->form_explicit_transpose && !flg && aijkok) PetscCall(aijkok->DestroyMatTranspose());
435       A->form_explicit_transpose = flg;
436       break;
437     default:
438       PetscCall(MatSetOption_SeqAIJ(A,op,flg));
439       break;
440   }
441   PetscFunctionReturn(0);
442 }
443 
444 /* Depending on reuse, either build a new mat, or use the existing mat */
445 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
446 {
447   Mat_SeqAIJ       *aseq;
448 
449   PetscFunctionBegin;
450   PetscCall(PetscKokkosInitializeCheck());
451   if (reuse == MAT_INITIAL_MATRIX) { /* Build a brand new mat */
452     PetscCall(MatDuplicate(A,MAT_COPY_VALUES,newmat)); /* the returned newmat is a SeqAIJKokkos */
453   } else if (reuse == MAT_REUSE_MATRIX) { /* Reuse the mat created before */
454     PetscCall(MatCopy(A,*newmat,SAME_NONZERO_PATTERN)); /* newmat is already a SeqAIJKokkos */
455   } else if (reuse == MAT_INPLACE_MATRIX) { /* newmat is A */
456     PetscCheck(A == *newmat,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"A != *newmat with MAT_INPLACE_MATRIX");
457     PetscCall(PetscFree(A->defaultvectype));
458     PetscCall(PetscStrallocpy(VECKOKKOS,&A->defaultvectype)); /* Allocate and copy the string */
459     PetscCall(PetscObjectChangeTypeName((PetscObject)A,MATSEQAIJKOKKOS));
460     PetscCall(MatSetOps_SeqAIJKokkos(A));
461     aseq = static_cast<Mat_SeqAIJ*>(A->data);
462     if (A->assembled) { /* Copy i, j (but not values) to device for an assembled matrix if not yet */
463       PetscCheck(!A->spptr,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Expect NULL (Mat_SeqAIJKokkos*)A->spptr");
464       A->spptr = new Mat_SeqAIJKokkos(A->rmap->n,A->cmap->n,aseq->nz,aseq->i,aseq->j,aseq->a,A->nonzerostate,PETSC_FALSE);
465     }
466   }
467   PetscFunctionReturn(0);
468 }
469 
470 /* MatDuplicate always creates a new matrix. MatDuplicate can be called either on an assembled matrix or
471    an unassembled matrix, even though MAT_COPY_VALUES is not allowed for unassembled matrix.
472  */
473 static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A,MatDuplicateOption dupOption,Mat *B)
474 {
475   Mat_SeqAIJ            *bseq;
476   Mat_SeqAIJKokkos      *akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr),*bkok;
477   Mat                   mat;
478 
479   PetscFunctionBegin;
480   /* Do not copy values on host as A's latest values might be on device. We don't want to do sync blindly */
481   PetscCall(MatDuplicate_SeqAIJ(A,MAT_DO_NOT_COPY_VALUES,B));
482   mat  = *B;
483   if (A->assembled) {
484     bseq = static_cast<Mat_SeqAIJ*>(mat->data);
485     bkok = new Mat_SeqAIJKokkos(mat->rmap->n,mat->cmap->n,bseq->nz,bseq->i,bseq->j,bseq->a,mat->nonzerostate,PETSC_FALSE);
486     bkok->a_dual.clear_sync_state(); /* Clear B's sync state as it will be decided below */
487     /* Now copy values to B if needed */
488     if (dupOption == MAT_COPY_VALUES) {
489       if (akok->a_dual.need_sync_device()) {
490         Kokkos::deep_copy(bkok->a_dual.view_host(),akok->a_dual.view_host());
491         bkok->a_dual.modify_host();
492       } else { /* If device has the latest data, we only copy data on device */
493         Kokkos::deep_copy(bkok->a_dual.view_device(),akok->a_dual.view_device());
494         bkok->a_dual.modify_device();
495       }
496     } else { /* MAT_DO_NOT_COPY_VALUES or MAT_SHARE_NONZERO_PATTERN. B's values should be zeroed */
497       /* B's values on host should be already zeroed by MatDuplicate_SeqAIJ() */
498       bkok->a_dual.modify_host();
499     }
500     mat->spptr = bkok;
501   }
502 
503   PetscCall(PetscFree(mat->defaultvectype));
504   PetscCall(PetscStrallocpy(VECKOKKOS,&mat->defaultvectype)); /* Allocate and copy the string */
505   PetscCall(PetscObjectChangeTypeName((PetscObject)mat,MATSEQAIJKOKKOS));
506   PetscCall(MatSetOps_SeqAIJKokkos(mat));
507   PetscFunctionReturn(0);
508 }
509 
510 static PetscErrorCode MatTranspose_SeqAIJKokkos(Mat A,MatReuse reuse,Mat *B)
511 {
512   Mat               At;
513   KokkosCsrMatrix   *internT;
514   Mat_SeqAIJKokkos  *atkok,*bkok;
515 
516   PetscFunctionBegin;
517   PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A,&internT)); /* Generate a transpose internally */
518   if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_INPLACE_MATRIX) {
519     /* Deep copy internT, as we want to isolate the internal transpose */
520     PetscCallCXX(atkok = new Mat_SeqAIJKokkos(KokkosCsrMatrix("csrmat",*internT)));
521     PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PetscObjectComm((PetscObject)A),atkok,&At));
522     if (reuse == MAT_INITIAL_MATRIX) *B = At;
523     else PetscCall(MatHeaderReplace(A,&At)); /* Replace A with At inplace */
524   } else { /* MAT_REUSE_MATRIX, just need to copy values to B on device */
525     if ((*B)->assembled) {
526       bkok = static_cast<Mat_SeqAIJKokkos*>((*B)->spptr);
527       PetscCallCXX(Kokkos::deep_copy(bkok->a_dual.view_device(),internT->values));
528       PetscCall(MatSeqAIJKokkosModifyDevice(*B));
529     } else if ((*B)->preallocated) { /* It is ok for B to be only preallocated, as needed in MatTranspose_MPIAIJ */
530       Mat_SeqAIJ              *bseq = static_cast<Mat_SeqAIJ*>((*B)->data);
531       MatScalarKokkosViewHost a_h(bseq->a,internT->nnz()); /* bseq->nz = 0 if unassembled */
532       MatColIdxKokkosViewHost j_h(bseq->j,internT->nnz());
533       PetscCallCXX(Kokkos::deep_copy(a_h,internT->values));
534       PetscCallCXX(Kokkos::deep_copy(j_h,internT->graph.entries));
535     } else SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"B must be assembled or preallocated");
536   }
537   PetscFunctionReturn(0);
538 }
539 
540 static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A)
541 {
542   Mat_SeqAIJKokkos           *aijkok;
543 
544   PetscFunctionBegin;
545   if (A->factortype == MAT_FACTOR_NONE) {
546     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
547     delete aijkok;
548   } else {
549     delete static_cast<Mat_SeqAIJKokkosTriFactors*>(A->spptr);
550   }
551   A->spptr = NULL;
552   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL));
553   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL));
554   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL));
555   PetscCall(MatDestroy_SeqAIJ(A));
556   PetscFunctionReturn(0);
557 }
558 
559 /*MC
560    MATSEQAIJKOKKOS - MATAIJKOKKOS = "(seq)aijkokkos" - A matrix type to be used for sparse matrices with Kokkos
561 
562    A matrix type type using Kokkos-Kernels CrsMatrix type for portability across different device types
563 
564    Options Database Keys:
565 .  -mat_type aijkokkos - sets the matrix type to "aijkokkos" during a call to MatSetFromOptions()
566 
567   Level: beginner
568 
569 .seealso: MatCreateSeqAIJKokkos(), MATMPIAIJKOKKOS
570 M*/
571 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A)
572 {
573   PetscFunctionBegin;
574   PetscCall(PetscKokkosInitializeCheck());
575   PetscCall(MatCreate_SeqAIJ(A));
576   PetscCall(MatConvert_SeqAIJ_SeqAIJKokkos(A,MATSEQAIJKOKKOS,MAT_INPLACE_MATRIX,&A));
577   PetscFunctionReturn(0);
578 }
579 
580 /* Merge A, B into a matrix C. A is put before B. C's size would be A->rmap->n by (A->cmap->n + B->cmap->n) */
581 PetscErrorCode MatSeqAIJKokkosMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C)
582 {
583   Mat_SeqAIJ                   *a,*b;
584   Mat_SeqAIJKokkos             *akok,*bkok,*ckok;
585   MatScalarKokkosView          aa,ba,ca;
586   MatRowMapKokkosView          ai,bi,ci;
587   MatColIdxKokkosView          aj,bj,cj;
588   PetscInt                     m,n,nnz,aN;
589 
590   PetscFunctionBegin;
591   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
592   PetscValidHeaderSpecific(B,MAT_CLASSID,2);
593   PetscValidPointer(C,4);
594   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
595   PetscCheckTypeName(B,MATSEQAIJKOKKOS);
596   PetscCheck(A->rmap->n == B->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT,A->rmap->n,B->rmap->n);
597   PetscCheck(reuse != MAT_INPLACE_MATRIX,PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported");
598 
599   PetscCall(MatSeqAIJKokkosSyncDevice(A));
600   PetscCall(MatSeqAIJKokkosSyncDevice(B));
601   a    = static_cast<Mat_SeqAIJ*>(A->data);
602   b    = static_cast<Mat_SeqAIJ*>(B->data);
603   akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
604   bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
605   aa   = akok->a_dual.view_device();
606   ai   = akok->i_dual.view_device();
607   ba   = bkok->a_dual.view_device();
608   bi   = bkok->i_dual.view_device();
609   m    = A->rmap->n; /* M, N and nnz of C */
610   n    = A->cmap->n + B->cmap->n;
611   nnz  = a->nz + b->nz;
612   aN   = A->cmap->n; /* N of A */
613   if (reuse == MAT_INITIAL_MATRIX) {
614     aj = akok->j_dual.view_device();
615     bj = bkok->j_dual.view_device();
616     auto ca_dual = MatScalarKokkosDualView("a",aa.extent(0)+ba.extent(0));
617     auto ci_dual = MatRowMapKokkosDualView("i",ai.extent(0));
618     auto cj_dual = MatColIdxKokkosDualView("j",aj.extent(0)+bj.extent(0));
619     ca = ca_dual.view_device();
620     ci = ci_dual.view_device();
621     cj = cj_dual.view_device();
622 
623     /* Concatenate A and B in parallel using Kokkos hierarchical parallelism */
624     Kokkos::parallel_for(Kokkos::TeamPolicy<>(m, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) {
625       PetscInt i = t.league_rank(); /* row i */
626       PetscInt coffset = ai(i) + bi(i), alen = ai(i+1)-ai(i), blen = bi(i+1)-bi(i);
627 
628       Kokkos::single(Kokkos::PerTeam(t), [=]() { /* this side effect only happens once per whole team */
629         ci(i) = coffset;
630         if (i == m-1) ci(m) = ai(m) + bi(m);
631       });
632 
633       Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen+blen), [&](PetscInt k) {
634         if (k < alen) {
635           ca(coffset+k) = aa(ai(i)+k);
636           cj(coffset+k) = aj(ai(i)+k);
637         } else {
638           ca(coffset+k) = ba(bi(i)+k-alen);
639           cj(coffset+k) = bj(bi(i)+k-alen) + aN; /* Entries in B get new column indices in C */
640         }
641       });
642     });
643     ca_dual.modify_device();
644     ci_dual.modify_device();
645     cj_dual.modify_device();
646     PetscCallCXX(ckok = new Mat_SeqAIJKokkos(m,n,nnz,ci_dual,cj_dual,ca_dual));
647     PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,ckok,C));
648   } else if (reuse == MAT_REUSE_MATRIX) {
649     PetscValidHeaderSpecific(*C,MAT_CLASSID,4);
650     PetscCheckTypeName(*C,MATSEQAIJKOKKOS);
651     ckok = static_cast<Mat_SeqAIJKokkos*>((*C)->spptr);
652     ca   = ckok->a_dual.view_device();
653     ci   = ckok->i_dual.view_device();
654 
655     Kokkos::parallel_for(Kokkos::TeamPolicy<>(m, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) {
656       PetscInt i = t.league_rank(); /* row i */
657       PetscInt alen = ai(i+1)-ai(i), blen = bi(i+1)-bi(i);
658       Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen+blen), [&](PetscInt k) {
659         if (k < alen) ca(ci(i)+k) = aa(ai(i)+k);
660         else          ca(ci(i)+k) = ba(bi(i)+k-alen);
661       });
662     });
663     PetscCall(MatSeqAIJKokkosModifyDevice(*C));
664   }
665   PetscFunctionReturn(0);
666 }
667 
668 static PetscErrorCode MatProductDataDestroy_SeqAIJKokkos(void* pdata)
669 {
670   PetscFunctionBegin;
671   delete static_cast<MatProductData_SeqAIJKokkos*>(pdata);
672   PetscFunctionReturn(0);
673 }
674 
675 static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C)
676 {
677   Mat_Product                    *product = C->product;
678   Mat                            A,B;
679   bool                           transA,transB; /* use bool, since KK needs this type */
680   Mat_SeqAIJKokkos               *akok,*bkok,*ckok;
681   Mat_SeqAIJ                     *c;
682   MatProductData_SeqAIJKokkos    *pdata;
683   KokkosCsrMatrix                *csrmatA,*csrmatB;
684 
685   PetscFunctionBegin;
686   MatCheckProduct(C,1);
687   PetscCheck(C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty");
688   pdata = static_cast<MatProductData_SeqAIJKokkos*>(C->product->data);
689 
690   if (pdata->reusesym) { /* We reached here through e.g., MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C), where symbolic/numeric are combined */
691     pdata->reusesym = PETSC_FALSE; /* So that next time when user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C), we still do numeric  */
692     PetscFunctionReturn(0);
693   }
694 
695   switch (product->type) {
696     case MATPRODUCT_AB:  transA = false; transB = false; break;
697     case MATPRODUCT_AtB: transA = true;  transB = false; break;
698     case MATPRODUCT_ABt: transA = false; transB = true;  break;
699     default:
700       SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
701   }
702 
703   A     = product->A;
704   B     = product->B;
705   PetscCall(MatSeqAIJKokkosSyncDevice(A));
706   PetscCall(MatSeqAIJKokkosSyncDevice(B));
707   akok  = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
708   bkok  = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
709   ckok  = static_cast<Mat_SeqAIJKokkos*>(C->spptr);
710 
711   PetscCheck(ckok,PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Device data structure spptr is empty");
712 
713   csrmatA = &akok->csrmat;
714   csrmatB = &bkok->csrmat;
715 
716   /* TODO: Once KK spgemm implements transpose, we can get rid of the explicit transpose here */
717   if (transA) {
718     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmatA));
719     transA = false;
720   }
721 
722   if (transB) {
723     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B,&csrmatB));
724     transB = false;
725   }
726   PetscCall(PetscLogGpuTimeBegin());
727   PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh,*csrmatA,transA,*csrmatB,transB,ckok->csrmat));
728   PetscCallCXX(KokkosKernels::sort_crs_matrix(ckok->csrmat)); /* without the sort, mat_tests-ex62_14_seqaijkokkos failed */
729   PetscCall(PetscLogGpuTimeEnd());
730   PetscCall(MatSeqAIJKokkosModifyDevice(C));
731   /* shorter version of MatAssemblyEnd_SeqAIJ */
732   c = (Mat_SeqAIJ*)C->data;
733   PetscCall(PetscInfo(C,"Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n",C->rmap->n,C->cmap->n,c->nz));
734   PetscCall(PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n"));
735   PetscCall(PetscInfo(C,"Maximum nonzeros in any row is %" PetscInt_FMT "\n",c->rmax));
736   c->reallocs         = 0;
737   C->info.mallocs     = 0;
738   C->info.nz_unneeded = 0;
739   C->assembled        = C->was_assembled = PETSC_TRUE;
740   C->num_ass++;
741   PetscFunctionReturn(0);
742 }
743 
744 static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C)
745 {
746   Mat_Product                    *product = C->product;
747   MatProductType                 ptype;
748   Mat                            A,B;
749   bool                           transA,transB;
750   Mat_SeqAIJKokkos               *akok,*bkok,*ckok;
751   MatProductData_SeqAIJKokkos    *pdata;
752   MPI_Comm                       comm;
753   KokkosCsrMatrix                *csrmatA,*csrmatB,csrmatC;
754 
755   PetscFunctionBegin;
756   MatCheckProduct(C,1);
757   PetscCall(PetscObjectGetComm((PetscObject)C,&comm));
758   PetscCheck(!product->data,comm,PETSC_ERR_PLIB,"Product data not empty");
759   A       = product->A;
760   B       = product->B;
761   PetscCall(MatSeqAIJKokkosSyncDevice(A));
762   PetscCall(MatSeqAIJKokkosSyncDevice(B));
763   akok    = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
764   bkok    = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
765   csrmatA = &akok->csrmat;
766   csrmatB = &bkok->csrmat;
767 
768   ptype   = product->type;
769   switch (ptype) {
770     case MATPRODUCT_AB:  transA = false; transB = false; break;
771     case MATPRODUCT_AtB: transA = true;  transB = false; break;
772     case MATPRODUCT_ABt: transA = false; transB = true;  break;
773     default:
774       SETERRQ(comm,PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
775   }
776 
777   product->data = pdata = new MatProductData_SeqAIJKokkos();
778   pdata->kh.set_team_work_size(16);
779   pdata->kh.set_dynamic_scheduling(true);
780   pdata->reusesym = product->api_user;
781 
782   /* TODO: add command line options to select spgemm algorithms */
783   auto spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_KK;
784   /* CUDA-10.2's spgemm has bugs. As as of 2022-01-19, KK does not support CUDA-11.x's newer spgemm API.
785      We can default to SPGEMMAlgorithm::SPGEMM_CUSPARSE with CUDA-11+ when KK adds the support.
786    */
787   pdata->kh.create_spgemm_handle(spgemm_alg);
788 
789   PetscCall(PetscLogGpuTimeBegin());
790   /* TODO: Get rid of the explicit transpose once KK-spgemm implements the transpose option */
791   if (transA) {
792     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmatA));
793     transA = false;
794   }
795 
796   if (transB) {
797     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B,&csrmatB));
798     transB = false;
799   }
800 
801   PetscCallCXX(KokkosSparse::spgemm_symbolic(pdata->kh,*csrmatA,transA,*csrmatB,transB,csrmatC));
802   /* spgemm_symbolic() only populates C's rowmap, but not C's column indices.
803     So we have to do a fake spgemm_numeric() here to get csrmatC.j_d setup, before
804     calling new Mat_SeqAIJKokkos().
805     TODO: Remove the fake spgemm_numeric() after KK fixed this problem.
806   */
807   PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh,*csrmatA,transA,*csrmatB,transB,csrmatC));
808   PetscCallCXX(KokkosKernels::sort_crs_matrix(csrmatC));
809   PetscCall(PetscLogGpuTimeEnd());
810 
811   PetscCallCXX(ckok = new Mat_SeqAIJKokkos(csrmatC));
812   PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(C,ckok));
813   C->product->destroy = MatProductDataDestroy_SeqAIJKokkos;
814   PetscFunctionReturn(0);
815 }
816 
817 /* handles sparse matrix matrix ops */
818 static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat)
819 {
820   Mat_Product    *product = mat->product;
821   PetscBool      Biskok = PETSC_FALSE,Ciskok = PETSC_TRUE;
822 
823   PetscFunctionBegin;
824   MatCheckProduct(mat,1);
825   PetscCall(PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJKOKKOS,&Biskok));
826   if (product->type == MATPRODUCT_ABC) {
827     PetscCall(PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJKOKKOS,&Ciskok));
828   }
829   if (Biskok && Ciskok) {
830     switch (product->type) {
831     case MATPRODUCT_AB:
832     case MATPRODUCT_AtB:
833     case MATPRODUCT_ABt:
834       mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos;
835       break;
836     case MATPRODUCT_PtAP:
837     case MATPRODUCT_RARt:
838     case MATPRODUCT_ABC:
839       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
840       break;
841     default:
842       break;
843     }
844   } else { /* fallback for AIJ */
845     PetscCall(MatProductSetFromOptions_SeqAIJ(mat));
846   }
847   PetscFunctionReturn(0);
848 }
849 
850 static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a)
851 {
852   Mat_SeqAIJKokkos *aijkok;
853 
854   PetscFunctionBegin;
855   PetscCall(PetscLogGpuTimeBegin());
856   PetscCall(MatSeqAIJKokkosSyncDevice(A));
857   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
858   KokkosBlas::scal(aijkok->a_dual.view_device(),a,aijkok->a_dual.view_device());
859   PetscCall(MatSeqAIJKokkosModifyDevice(A));
860   PetscCall(PetscLogGpuFlops(aijkok->a_dual.extent(0)));
861   PetscCall(PetscLogGpuTimeEnd());
862   PetscFunctionReturn(0);
863 }
864 
865 static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A)
866 {
867   Mat_SeqAIJKokkos *aijkok;
868 
869   PetscFunctionBegin;
870   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
871   if (aijkok) { /* Only zero the device if data is already there */
872     KokkosBlas::fill(aijkok->a_dual.view_device(),0.0);
873     PetscCall(MatSeqAIJKokkosModifyDevice(A));
874   } else { /* Might be preallocated but not assembled */
875     PetscCall(MatZeroEntries_SeqAIJ(A));
876   }
877   PetscFunctionReturn(0);
878 }
879 
880 /* Get a Kokkos View from a mat of type MatSeqAIJKokkos */
881 PetscErrorCode MatSeqAIJGetKokkosView(Mat A,ConstMatScalarKokkosView* kv)
882 {
883   Mat_SeqAIJKokkos   *aijkok;
884 
885   PetscFunctionBegin;
886   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
887   PetscValidPointer(kv,2);
888   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
889   PetscCall(MatSeqAIJKokkosSyncDevice(A));
890   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
891   *kv    = aijkok->a_dual.view_device();
892   PetscFunctionReturn(0);
893 }
894 
895 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A,ConstMatScalarKokkosView* kv)
896 {
897   PetscFunctionBegin;
898   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
899   PetscValidPointer(kv,2);
900   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
901   PetscFunctionReturn(0);
902 }
903 
904 PetscErrorCode MatSeqAIJGetKokkosView(Mat A,MatScalarKokkosView* kv)
905 {
906   Mat_SeqAIJKokkos   *aijkok;
907 
908   PetscFunctionBegin;
909   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
910   PetscValidPointer(kv,2);
911   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
912   PetscCall(MatSeqAIJKokkosSyncDevice(A));
913   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
914   *kv    = aijkok->a_dual.view_device();
915   PetscFunctionReturn(0);
916 }
917 
918 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A,MatScalarKokkosView* kv)
919 {
920   PetscFunctionBegin;
921   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
922   PetscValidPointer(kv,2);
923   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
924   PetscCall(MatSeqAIJKokkosModifyDevice(A));
925   PetscFunctionReturn(0);
926 }
927 
928 PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A,MatScalarKokkosView* kv)
929 {
930   Mat_SeqAIJKokkos   *aijkok;
931 
932   PetscFunctionBegin;
933   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
934   PetscValidPointer(kv,2);
935   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
936   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
937   *kv    = aijkok->a_dual.view_device();
938   PetscFunctionReturn(0);
939 }
940 
941 PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A,MatScalarKokkosView* kv)
942 {
943   PetscFunctionBegin;
944   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
945   PetscValidPointer(kv,2);
946   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
947   PetscCall(MatSeqAIJKokkosModifyDevice(A));
948   PetscFunctionReturn(0);
949 }
950 
951 /* Computes Y += alpha X */
952 static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y,PetscScalar alpha,Mat X,MatStructure pattern)
953 {
954   Mat_SeqAIJ                 *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data;
955   Mat_SeqAIJKokkos           *xkok,*ykok,*zkok;
956   ConstMatScalarKokkosView   Xa;
957   MatScalarKokkosView        Ya;
958 
959   PetscFunctionBegin;
960   PetscCheckTypeName(Y,MATSEQAIJKOKKOS);
961   PetscCheckTypeName(X,MATSEQAIJKOKKOS);
962   PetscCall(MatSeqAIJKokkosSyncDevice(Y));
963   PetscCall(MatSeqAIJKokkosSyncDevice(X));
964   PetscCall(PetscLogGpuTimeBegin());
965 
966   if (pattern != SAME_NONZERO_PATTERN && x->nz == y->nz) {
967     /* We could compare on device, but have to get the comparison result on host. So compare on host instead. */
968     PetscBool e;
969     PetscCall(PetscArraycmp(x->i,y->i,Y->rmap->n+1,&e));
970     if (e) {
971       PetscCall(PetscArraycmp(x->j,y->j,y->nz,&e));
972       if (e) pattern = SAME_NONZERO_PATTERN;
973     }
974   }
975 
976   /* cusparseDcsrgeam2() computes C = alpha A + beta B. If one knew sparsity pattern of C, one can skip
977     cusparseScsrgeam2_bufferSizeExt() / cusparseXcsrgeam2Nnz(), and directly call cusparseScsrgeam2().
978     If X is SUBSET_NONZERO_PATTERN of Y, we could take advantage of this cusparse feature. However,
979     KokkosSparse::spadd(alpha,A,beta,B,C) has symbolic and numeric phases, MatAXPY does not.
980   */
981   ykok = static_cast<Mat_SeqAIJKokkos*>(Y->spptr);
982   xkok = static_cast<Mat_SeqAIJKokkos*>(X->spptr);
983   Xa   = xkok->a_dual.view_device();
984   Ya   = ykok->a_dual.view_device();
985 
986   if (pattern == SAME_NONZERO_PATTERN) {
987     KokkosBlas::axpy(alpha,Xa,Ya);
988     PetscCall(MatSeqAIJKokkosModifyDevice(Y));
989   } else if (pattern == SUBSET_NONZERO_PATTERN) {
990     MatRowMapKokkosView  Xi = xkok->i_dual.view_device(),Yi = ykok->i_dual.view_device();
991     MatColIdxKokkosView  Xj = xkok->j_dual.view_device(),Yj = ykok->j_dual.view_device();
992 
993     Kokkos::parallel_for(Kokkos::TeamPolicy<>(Y->rmap->n, 1),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) {
994       PetscInt i = t.league_rank(); /* row i */
995       Kokkos::single(Kokkos::PerTeam(t), [=] () { /* Only one thread works in a team */
996         PetscInt p,q = Yi(i);
997         for (p=Xi(i); p<Xi(i+1); p++) { /* For each nonzero on row i of X */
998           while (Xj(p) != Yj(q) && q < Yi(i+1)) q++; /* find the matching nonzero on row i of Y */
999           if (Xj(p) == Yj(q)) { /* Found it */
1000             Ya(q) += alpha * Xa(p);
1001             q++;
1002           } else {
1003             /* If not found, it indicates the input is wrong (X is not a SUBSET_NONZERO_PATTERN of Y).
1004                Just insert a NaN at the beginning of row i if it is not empty, to make the result wrong.
1005             */
1006             if (Yi(i) != Yi(i+1)) Ya(Yi(i)) = Kokkos::Experimental::nan("1"); /* auto promote the double NaN if needed */
1007           }
1008         }
1009       });
1010     });
1011     PetscCall(MatSeqAIJKokkosModifyDevice(Y));
1012   } else { /* different nonzero patterns */
1013     Mat             Z;
1014     KokkosCsrMatrix zcsr;
1015     KernelHandle    kh;
1016     kh.create_spadd_handle(false);
1017     KokkosSparse::spadd_symbolic(&kh,xkok->csrmat,ykok->csrmat,zcsr);
1018     KokkosSparse::spadd_numeric(&kh,alpha,xkok->csrmat,(PetscScalar)1.0,ykok->csrmat,zcsr);
1019     zkok = new Mat_SeqAIJKokkos(zcsr);
1020     PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,zkok,&Z));
1021     PetscCall(MatHeaderReplace(Y,&Z));
1022     kh.destroy_spadd_handle();
1023   }
1024   PetscCall(PetscLogGpuTimeEnd());
1025   PetscCall(PetscLogGpuFlops(xkok->a_dual.extent(0)*2)); /* Because we scaled X and then added it to Y */
1026   PetscFunctionReturn(0);
1027 }
1028 
1029 static PetscErrorCode MatSetPreallocationCOO_SeqAIJKokkos(Mat mat, PetscCount coo_n, const PetscInt coo_i[], const PetscInt coo_j[])
1030 {
1031   Mat_SeqAIJKokkos *akok;
1032   Mat_SeqAIJ       *aseq;
1033 
1034   PetscFunctionBegin;
1035   PetscCall(MatSetPreallocationCOO_SeqAIJ(mat,coo_n,coo_i,coo_j));
1036   aseq = static_cast<Mat_SeqAIJ*>(mat->data);
1037   akok = static_cast<Mat_SeqAIJKokkos*>(mat->spptr);
1038   delete akok;
1039   mat->spptr = akok = new Mat_SeqAIJKokkos(mat->rmap->n,mat->cmap->n,aseq->nz,aseq->i,aseq->j,aseq->a,mat->nonzerostate+1,PETSC_FALSE);
1040   PetscCall(MatZeroEntries_SeqAIJKokkos(mat));
1041   akok->SetUpCOO(aseq);
1042   PetscFunctionReturn(0);
1043 }
1044 
1045 static PetscErrorCode MatSetValuesCOO_SeqAIJKokkos(Mat A,const PetscScalar v[],InsertMode imode)
1046 {
1047   Mat_SeqAIJ                  *aseq = static_cast<Mat_SeqAIJ*>(A->data);
1048   Mat_SeqAIJKokkos            *akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
1049   PetscCount                  Annz = aseq->nz;
1050   const PetscCountKokkosView& jmap = akok->jmap_d;
1051   const PetscCountKokkosView& perm = akok->perm_d;
1052   MatScalarKokkosView         Aa;
1053   ConstMatScalarKokkosView    kv;
1054   PetscMemType                memtype;
1055 
1056   PetscFunctionBegin;
1057   PetscCall(PetscGetMemType(v,&memtype));
1058   if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */
1059     kv = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(),ConstMatScalarKokkosViewHost(v,aseq->coo_n));
1060   } else {
1061     kv = ConstMatScalarKokkosView(v,aseq->coo_n); /* Directly use v[]'s memory */
1062   }
1063 
1064   if (imode == INSERT_VALUES) {
1065     PetscCall(MatSeqAIJGetKokkosViewWrite(A,&Aa)); /* write matrix values */
1066     Kokkos::deep_copy(Aa,0.0); /* Zero matrix values since INSERT_VALUES still requires summing replicated values in v[] */
1067   } else PetscCall(MatSeqAIJGetKokkosView(A,&Aa)); /* read & write matrix values */
1068 
1069   Kokkos::parallel_for(Annz,KOKKOS_LAMBDA(const PetscCount i) {for (PetscCount k=jmap(i); k<jmap(i+1); k++) Aa(i) += kv(perm(k));});
1070 
1071   if (imode == INSERT_VALUES) PetscCall(MatSeqAIJRestoreKokkosViewWrite(A,&Aa));
1072   else PetscCall(MatSeqAIJRestoreKokkosView(A,&Aa));
1073   PetscFunctionReturn(0);
1074 }
1075 
1076 PETSC_INTERN PetscErrorCode MatSeqAIJMoveDiagonalValuesFront_SeqAIJKokkos(Mat A,const PetscInt *diag)
1077 {
1078   Mat_SeqAIJKokkos            *akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
1079   MatScalarKokkosView         Aa;
1080   const MatRowMapKokkosView&  Ai = akok->i_dual.view_device();
1081   PetscInt                    m = A->rmap->n;
1082   ConstMatRowMapKokkosView    Adiag(diag,m); /* diag is a device pointer */
1083 
1084   PetscFunctionBegin;
1085   PetscCall(MatSeqAIJGetKokkosViewWrite(A,&Aa));
1086   Kokkos::parallel_for(m,KOKKOS_LAMBDA(const PetscInt i) {
1087     PetscScalar tmp;
1088     if (Adiag(i) >= Ai(i) && Adiag(i) < Ai(i+1)) { /* The diagonal element exists */
1089       tmp          = Aa(Ai(i));
1090       Aa(Ai(i))    = Aa(Adiag(i));
1091       Aa(Adiag(i)) = tmp;
1092     }
1093   });
1094   PetscCall(MatSeqAIJRestoreKokkosViewWrite(A,&Aa));
1095   PetscFunctionReturn(0);
1096 }
1097 
1098 static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B,Mat A,const MatFactorInfo *info)
1099 {
1100   PetscFunctionBegin;
1101   PetscCall(MatSeqAIJKokkosSyncHost(A));
1102   PetscCall(MatLUFactorNumeric_SeqAIJ(B,A,info));
1103   B->offloadmask = PETSC_OFFLOAD_CPU;
1104   PetscFunctionReturn(0);
1105 }
1106 
1107 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A)
1108 {
1109   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
1110 
1111   PetscFunctionBegin;
1112   A->offloadmask = PETSC_OFFLOAD_KOKKOS; /* We do not really use this flag */
1113   A->boundtocpu  = PETSC_FALSE;
1114 
1115   A->ops->assemblyend               = MatAssemblyEnd_SeqAIJKokkos;
1116   A->ops->destroy                   = MatDestroy_SeqAIJKokkos;
1117   A->ops->duplicate                 = MatDuplicate_SeqAIJKokkos;
1118   A->ops->axpy                      = MatAXPY_SeqAIJKokkos;
1119   A->ops->scale                     = MatScale_SeqAIJKokkos;
1120   A->ops->zeroentries               = MatZeroEntries_SeqAIJKokkos;
1121   A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJKokkos;
1122   A->ops->mult                      = MatMult_SeqAIJKokkos;
1123   A->ops->multadd                   = MatMultAdd_SeqAIJKokkos;
1124   A->ops->multtranspose             = MatMultTranspose_SeqAIJKokkos;
1125   A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJKokkos;
1126   A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJKokkos;
1127   A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos;
1128   A->ops->productnumeric            = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos;
1129   A->ops->transpose                 = MatTranspose_SeqAIJKokkos;
1130   A->ops->setoption                 = MatSetOption_SeqAIJKokkos;
1131   a->ops->getarray                  = MatSeqAIJGetArray_SeqAIJKokkos;
1132   a->ops->restorearray              = MatSeqAIJRestoreArray_SeqAIJKokkos;
1133   a->ops->getarrayread              = MatSeqAIJGetArrayRead_SeqAIJKokkos;
1134   a->ops->restorearrayread          = MatSeqAIJRestoreArrayRead_SeqAIJKokkos;
1135   a->ops->getarraywrite             = MatSeqAIJGetArrayWrite_SeqAIJKokkos;
1136   a->ops->restorearraywrite         = MatSeqAIJRestoreArrayWrite_SeqAIJKokkos;
1137   a->ops->getcsrandmemtype          = MatSeqAIJGetCSRAndMemType_SeqAIJKokkos;
1138 
1139   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJKokkos));
1140   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_SeqAIJKokkos));
1141   PetscFunctionReturn(0);
1142 }
1143 
1144 PETSC_INTERN PetscErrorCode  MatSetSeqAIJKokkosWithCSRMatrix(Mat A,Mat_SeqAIJKokkos *akok)
1145 {
1146   Mat_SeqAIJ *aseq;
1147   PetscInt    i,m,n;
1148 
1149   PetscFunctionBegin;
1150   PetscCheck(!A->spptr,PETSC_COMM_SELF,PETSC_ERR_PLIB,"A->spptr is supposed to be empty");
1151 
1152   m = akok->nrows();
1153   n = akok->ncols();
1154   PetscCall(MatSetSizes(A,m,n,m,n));
1155   PetscCall(MatSetType(A,MATSEQAIJKOKKOS));
1156 
1157   /* Set up data structures of A as a MATSEQAIJ */
1158   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(A,MAT_SKIP_ALLOCATION,NULL));
1159   aseq = (Mat_SeqAIJ*)(A)->data;
1160 
1161   akok->i_dual.sync_host(); /* We always need sync'ed i, j on host */
1162   akok->j_dual.sync_host();
1163 
1164   aseq->i            = akok->i_host_data();
1165   aseq->j            = akok->j_host_data();
1166   aseq->a            = akok->a_host_data();
1167   aseq->nonew        = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/
1168   aseq->singlemalloc = PETSC_FALSE;
1169   aseq->free_a       = PETSC_FALSE;
1170   aseq->free_ij      = PETSC_FALSE;
1171   aseq->nz           = akok->nnz();
1172   aseq->maxnz        = aseq->nz;
1173 
1174   PetscCall(PetscMalloc1(m,&aseq->imax));
1175   PetscCall(PetscMalloc1(m,&aseq->ilen));
1176   for (i=0; i<m; i++) {
1177     aseq->ilen[i] = aseq->imax[i] = aseq->i[i+1] - aseq->i[i];
1178   }
1179 
1180   /* It is critical to set the nonzerostate, as we use it to check if sparsity pattern (hence data) has changed on host in MatAssemblyEnd */
1181   akok->nonzerostate = A->nonzerostate;
1182   A->spptr = akok; /* Set A->spptr before MatAssembly so that A->spptr won't be allocated again there */
1183   PetscCall(MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY));
1184   PetscCall(MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY));
1185   PetscFunctionReturn(0);
1186 }
1187 
1188 /* Crete a SEQAIJKOKKOS matrix with a Mat_SeqAIJKokkos data structure
1189 
1190    Note we have names like MatSeqAIJSetPreallocationCSR, so I use capitalized CSR
1191  */
1192 PETSC_INTERN PetscErrorCode  MatCreateSeqAIJKokkosWithCSRMatrix(MPI_Comm comm,Mat_SeqAIJKokkos *akok,Mat *A)
1193 {
1194   PetscFunctionBegin;
1195   PetscCall(MatCreate(comm,A));
1196   PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A,akok));
1197   PetscFunctionReturn(0);
1198 }
1199 
1200 /* --------------------------------------------------------------------------------*/
1201 /*@C
1202    MatCreateSeqAIJKokkos - Creates a sparse matrix in AIJ (compressed row) format
1203    (the default parallel PETSc format). This matrix will ultimately be handled by
1204    Kokkos for calculations. For good matrix
1205    assembly performance the user should preallocate the matrix storage by setting
1206    the parameter nz (or the array nnz).  By setting these parameters accurately,
1207    performance during matrix assembly can be increased by more than a factor of 50.
1208 
1209    Collective
1210 
1211    Input Parameters:
1212 +  comm - MPI communicator, set to PETSC_COMM_SELF
1213 .  m - number of rows
1214 .  n - number of columns
1215 .  nz - number of nonzeros per row (same for all rows)
1216 -  nnz - array containing the number of nonzeros in the various rows
1217          (possibly different for each row) or NULL
1218 
1219    Output Parameter:
1220 .  A - the matrix
1221 
1222    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
1223    MatXXXXSetPreallocation() paradgm instead of this routine directly.
1224    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
1225 
1226    Notes:
1227    If nnz is given then nz is ignored
1228 
1229    The AIJ format (also called the Yale sparse matrix format or
1230    compressed row storage), is fully compatible with standard Fortran 77
1231    storage.  That is, the stored row and column indices can begin at
1232    either one (as in Fortran) or zero.  See the users' manual for details.
1233 
1234    Specify the preallocated storage with either nz or nnz (not both).
1235    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
1236    allocation.  For large problems you MUST preallocate memory or you
1237    will get TERRIBLE performance, see the users' manual chapter on matrices.
1238 
1239    By default, this format uses inodes (identical nodes) when possible, to
1240    improve numerical efficiency of matrix-vector products and solves. We
1241    search for consecutive rows with the same nonzero structure, thereby
1242    reusing matrix information to achieve increased efficiency.
1243 
1244    Level: intermediate
1245 
1246 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ()
1247 @*/
1248 PetscErrorCode  MatCreateSeqAIJKokkos(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
1249 {
1250   PetscFunctionBegin;
1251   PetscCall(PetscKokkosInitializeCheck());
1252   PetscCall(MatCreate(comm,A));
1253   PetscCall(MatSetSizes(*A,m,n,m,n));
1254   PetscCall(MatSetType(*A,MATSEQAIJKOKKOS));
1255   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz));
1256   PetscFunctionReturn(0);
1257 }
1258 
1259 typedef Kokkos::TeamPolicy<>::member_type team_member;
1260 //
1261 // This factorization exploits block diagonal matrices with "Nf" (not used).
1262 // Use -pc_factor_mat_ordering_type rcm to order decouple blocks of size N/Nf for this optimization
1263 //
1264 static PetscErrorCode MatLUFactorNumeric_SeqAIJKOKKOSDEVICE(Mat B,Mat A,const MatFactorInfo *info)
1265 {
1266   Mat_SeqAIJ         *b=(Mat_SeqAIJ*)B->data;
1267   Mat_SeqAIJKokkos   *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr), *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
1268   IS                 isrow = b->row,isicol = b->icol;
1269   const PetscInt     *r_h,*ic_h;
1270   const PetscInt     n=A->rmap->n, *ai_d=aijkok->i_dual.view_device().data(), *aj_d=aijkok->j_dual.view_device().data(), *bi_d=baijkok->i_dual.view_device().data(), *bj_d=baijkok->j_dual.view_device().data(), *bdiag_d = baijkok->diag_d.data();
1271   const PetscScalar  *aa_d = aijkok->a_dual.view_device().data();
1272   PetscScalar        *ba_d = baijkok->a_dual.view_device().data();
1273   PetscBool          row_identity,col_identity;
1274   PetscInt           nc, Nf=1, nVec=32; // should be a parameter, Nf is batch size - not used
1275 
1276   PetscFunctionBegin;
1277   PetscCheck(A->rmap->n == n,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"square matrices only supported %" PetscInt_FMT " %" PetscInt_FMT,A->rmap->n,n);
1278   PetscCall(MatGetOption(A,MAT_STRUCTURALLY_SYMMETRIC,&row_identity));
1279   PetscCheck(row_identity,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"structurally symmetric matrices only supported");
1280   PetscCall(ISGetIndices(isrow,&r_h));
1281   PetscCall(ISGetIndices(isicol,&ic_h));
1282   PetscCall(ISGetSize(isicol,&nc));
1283   PetscCall(PetscLogGpuTimeBegin());
1284   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1285   {
1286 #define KOKKOS_SHARED_LEVEL 1
1287     using scr_mem_t = Kokkos::DefaultExecutionSpace::scratch_memory_space;
1288     using sizet_scr_t = Kokkos::View<size_t, scr_mem_t>;
1289     using scalar_scr_t = Kokkos::View<PetscScalar, scr_mem_t>;
1290     const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_r_k (r_h, n);
1291     Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_r_k ("r", n);
1292     const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_ic_k (ic_h, nc);
1293     Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_ic_k ("ic", nc);
1294     size_t flops_h = 0.0;
1295     Kokkos::View<size_t, Kokkos::HostSpace> h_flops_k (&flops_h);
1296     Kokkos::View<size_t> d_flops_k ("flops");
1297     const int conc = Kokkos::DefaultExecutionSpace().concurrency(), team_size = conc > 1 ? 16 : 1; // 8*32 = 256
1298     const int nloc = n/Nf, Ni = (conc > 8) ? 1 /* some intelegent number of SMs -- but need league_barrier */ : 1;
1299     Kokkos::deep_copy (d_flops_k, h_flops_k);
1300     Kokkos::deep_copy (d_r_k, h_r_k);
1301     Kokkos::deep_copy (d_ic_k, h_ic_k);
1302     // Fill A --> fact
1303     Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec), KOKKOS_LAMBDA (const team_member team) {
1304         const PetscInt  field = team.league_rank()/Ni, field_block = team.league_rank()%Ni; // use grid.x/y in CUDA
1305         const PetscInt  nloc_i =  (nloc/Ni + !!(nloc%Ni)), start_i = field*nloc + field_block*nloc_i, end_i = (start_i + nloc_i) > (field+1)*nloc ? (field+1)*nloc : (start_i + nloc_i);
1306         const PetscInt  *ic = d_ic_k.data(), *r = d_r_k.data();
1307         // zero rows of B
1308         Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) {
1309             PetscInt    nzbL = bi_d[rowb+1] - bi_d[rowb], nzbU = bdiag_d[rowb] - bdiag_d[rowb+1]; // with diag
1310             PetscScalar *baL = ba_d + bi_d[rowb];
1311             PetscScalar *baU = ba_d + bdiag_d[rowb+1]+1;
1312             /* zero (unfactored row) */
1313             for (int j=0;j<nzbL;j++) baL[j] = 0;
1314             for (int j=0;j<nzbU;j++) baU[j] = 0;
1315           });
1316         // copy A into B
1317         Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) {
1318             PetscInt          rowa = r[rowb], nza = ai_d[rowa+1] - ai_d[rowa];
1319             const PetscScalar *av    = aa_d + ai_d[rowa];
1320             const PetscInt    *ajtmp = aj_d + ai_d[rowa];
1321             /* load in initial (unfactored row) */
1322             for (int j=0;j<nza;j++) {
1323               PetscInt    colb = ic[ajtmp[j]];
1324               PetscScalar vala = av[j];
1325               if (colb == rowb) {
1326                 *(ba_d + bdiag_d[rowb]) = vala;
1327               } else {
1328                 const PetscInt    *pbj = bj_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]);
1329                 PetscScalar       *pba = ba_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]);
1330                 PetscInt          nz   = (colb > rowb) ? bdiag_d[rowb] - (bdiag_d[rowb+1]+1) : bi_d[rowb+1] - bi_d[rowb], set=0;
1331                 for (int j=0; j<nz ; j++) {
1332                   if (pbj[j] == colb) {
1333                     pba[j] = vala;
1334                     set++;
1335                     break;
1336                   }
1337                 }
1338                #if !defined(PETSC_HAVE_SYCL)
1339                 if (set!=1) printf("\t\t\t ERROR DID NOT SET ?????\n");
1340                #endif
1341               }
1342             }
1343           });
1344       });
1345     Kokkos::fence();
1346 
1347     Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec).set_scratch_size(KOKKOS_SHARED_LEVEL, Kokkos::PerThread(sizet_scr_t::shmem_size()+scalar_scr_t::shmem_size()), Kokkos::PerTeam(sizet_scr_t::shmem_size())), KOKKOS_LAMBDA (const team_member team) {
1348         sizet_scr_t     colkIdx(team.thread_scratch(KOKKOS_SHARED_LEVEL));
1349         scalar_scr_t    L_ki(team.thread_scratch(KOKKOS_SHARED_LEVEL));
1350         sizet_scr_t     flops(team.team_scratch(KOKKOS_SHARED_LEVEL));
1351         const PetscInt  field = team.league_rank()/Ni, field_block_idx = team.league_rank()%Ni; // use grid.x/y in CUDA
1352         const PetscInt  start = field*nloc, end = start + nloc;
1353         Kokkos::single(Kokkos::PerTeam(team), [=]() { flops() = 0; });
1354         // A22 panel update for each row A(1,:) and col A(:,1)
1355         for (int ii=start; ii<end-1; ii++) {
1356           const PetscInt    *bjUi = bj_d + bdiag_d[ii+1]+1, nzUi = bdiag_d[ii] - (bdiag_d[ii+1]+1); // vector, and vector size, of column indices of U(i,(i+1):end)
1357           const PetscScalar *baUi = ba_d + bdiag_d[ii+1]+1; // vector of data  U(i,i+1:end)
1358           const PetscInt    nUi_its = nzUi/Ni + !!(nzUi%Ni);
1359           const PetscScalar Bii = *(ba_d + bdiag_d[ii]); // diagonal in its special place
1360           Kokkos::parallel_for(Kokkos::TeamThreadRange(team, nUi_its), [=] (const int j) {
1361               PetscInt kIdx = j*Ni + field_block_idx;
1362               if (kIdx >= nzUi) /* void */ ;
1363               else {
1364                 const PetscInt myk  = bjUi[kIdx]; // assume symmetric structure, need a transposed meta-data here in general
1365                 const PetscInt *pjL = bj_d + bi_d[myk]; // look for L(myk,ii) in start of row
1366                 const PetscInt nzL  = bi_d[myk+1] - bi_d[myk]; // size of L_k(:)
1367                 size_t         st_idx;
1368                 // find and do L(k,i) = A(:k,i) / A(i,i)
1369                 Kokkos::single(Kokkos::PerThread(team), [&]() { colkIdx() = PETSC_MAX_INT; });
1370                 // get column, there has got to be a better way
1371                 Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,nzL), [&] (const int &j, size_t &idx) {
1372                     if (pjL[j] == ii) {
1373                       PetscScalar *pLki = ba_d + bi_d[myk] + j;
1374                       idx = j; // output
1375                       *pLki = *pLki/Bii; // column scaling:  L(k,i) = A(:k,i) / A(i,i)
1376                     }
1377                 }, st_idx);
1378                 Kokkos::single(Kokkos::PerThread(team), [=]() { colkIdx() = st_idx; L_ki() = *(ba_d + bi_d[myk] + st_idx); });
1379 #if defined(PETSC_USE_DEBUG) && !defined(PETSC_HAVE_SYCL)
1380                 if (colkIdx() == PETSC_MAX_INT) printf("\t\t\t\t\t\t\tERROR: failed to find L_ki(%d,%d)\n",(int)myk,ii); // uses a register
1381 #endif
1382                 // active row k, do  A_kj -= Lki * U_ij; j \in U(i,:) j != i
1383                 // U(i+1,:end)
1384                 Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,nzUi), [=] (const int &uiIdx) { // index into i (U)
1385                       PetscScalar Uij = baUi[uiIdx];
1386                       PetscInt    col = bjUi[uiIdx];
1387                       if (col==myk) {
1388                         // A_kk = A_kk - L_ki * U_ij(k)
1389                         PetscScalar *Akkv = (ba_d + bdiag_d[myk]); // diagonal in its special place
1390                         *Akkv = *Akkv - L_ki() * Uij; // UiK
1391                       } else {
1392                         PetscScalar    *start, *end, *pAkjv=NULL;
1393                         PetscInt       high, low;
1394                         const PetscInt *startj;
1395                         if (col<myk) { // L
1396                           PetscScalar *pLki = ba_d + bi_d[myk] + colkIdx();
1397                           PetscInt idx = (pLki+1) - (ba_d + bi_d[myk]); // index into row
1398                           start = pLki+1; // start at pLki+1, A22(myk,1)
1399                           startj= bj_d + bi_d[myk] + idx;
1400                           end   = ba_d + bi_d[myk+1];
1401                         } else {
1402                           PetscInt idx = bdiag_d[myk+1]+1;
1403                           start = ba_d + idx;
1404                           startj= bj_d + idx;
1405                           end   = ba_d + bdiag_d[myk];
1406                         }
1407                         // search for 'col', use bisection search - TODO
1408                         low  = 0;
1409                         high = (PetscInt)(end-start);
1410                         while (high-low > 5) {
1411                           int t = (low+high)/2;
1412                           if (startj[t] > col) high = t;
1413                           else                 low  = t;
1414                         }
1415                         for (pAkjv=start+low; pAkjv<start+high; pAkjv++) {
1416                           if (startj[pAkjv-start] == col) break;
1417                         }
1418 #if defined(PETSC_USE_DEBUG) && !defined(PETSC_HAVE_SYCL)
1419                         if (pAkjv==start+high) printf("\t\t\t\t\t\t\t\t\t\t\tERROR: *** failed to find Akj(%d,%d)\n",(int)myk,(int)col); // uses a register
1420 #endif
1421                         *pAkjv = *pAkjv - L_ki() * Uij; // A_kj = A_kj - L_ki * U_ij
1422                       }
1423                     });
1424               }
1425             });
1426           team.team_barrier(); // this needs to be a league barrier to use more that one SM per block
1427           if (field_block_idx==0) Kokkos::single(Kokkos::PerTeam(team), [&]() { Kokkos::atomic_add( flops.data(), (size_t)(2*(nzUi*nzUi)+2)); });
1428         } /* endof for (i=0; i<n; i++) { */
1429         Kokkos::single(Kokkos::PerTeam(team), [=]() { Kokkos::atomic_add( &d_flops_k(), flops()); flops() = 0; });
1430       });
1431     Kokkos::fence();
1432     Kokkos::deep_copy (h_flops_k, d_flops_k);
1433     PetscCall(PetscLogGpuFlops((PetscLogDouble)h_flops_k()));
1434     Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, 1, 256), KOKKOS_LAMBDA (const team_member team) {
1435         const PetscInt  lg_rank = team.league_rank(), field = lg_rank/Ni; //, field_offset = lg_rank%Ni;
1436         const PetscInt  start = field*nloc, end = start + nloc, n_its = (nloc/Ni + !!(nloc%Ni)); // 1/Ni iters
1437         /* Invert diagonal for simpler triangular solves */
1438         Kokkos::parallel_for(Kokkos::TeamVectorRange(team, n_its), [=] (int outer_index) {
1439             int i = start + outer_index*Ni + lg_rank%Ni;
1440             if (i < end) {
1441               PetscScalar *pv = ba_d + bdiag_d[i];
1442               *pv = 1.0/(*pv);
1443             }
1444           });
1445       });
1446   }
1447   PetscCall(PetscLogGpuTimeEnd());
1448   PetscCall(ISRestoreIndices(isicol,&ic_h));
1449   PetscCall(ISRestoreIndices(isrow,&r_h));
1450 
1451   PetscCall(ISIdentity(isrow,&row_identity));
1452   PetscCall(ISIdentity(isicol,&col_identity));
1453   if (b->inode.size) {
1454     B->ops->solve = MatSolve_SeqAIJ_Inode;
1455   } else if (row_identity && col_identity) {
1456     B->ops->solve = MatSolve_SeqAIJ_NaturalOrdering;
1457   } else {
1458     B->ops->solve = MatSolve_SeqAIJ; // at least this needs to be in Kokkos
1459   }
1460   B->offloadmask = PETSC_OFFLOAD_GPU;
1461   PetscCall(MatSeqAIJKokkosSyncHost(B)); // solve on CPU
1462   B->ops->solveadd          = MatSolveAdd_SeqAIJ; // and this
1463   B->ops->solvetranspose    = MatSolveTranspose_SeqAIJ;
1464   B->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ;
1465   B->ops->matsolve          = MatMatSolve_SeqAIJ;
1466   B->assembled              = PETSC_TRUE;
1467   B->preallocated           = PETSC_TRUE;
1468 
1469   PetscFunctionReturn(0);
1470 }
1471 
1472 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
1473 {
1474   PetscFunctionBegin;
1475   PetscCall(MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info));
1476   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos;
1477   PetscFunctionReturn(0);
1478 }
1479 
1480 static PetscErrorCode MatSeqAIJKokkosSymbolicSolveCheck(Mat A)
1481 {
1482   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr;
1483 
1484   PetscFunctionBegin;
1485   if (!factors->sptrsv_symbolic_completed) {
1486     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khU,factors->iU_d,factors->jU_d,factors->aU_d);
1487     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khL,factors->iL_d,factors->jL_d,factors->aL_d);
1488     factors->sptrsv_symbolic_completed = PETSC_TRUE;
1489   }
1490   PetscFunctionReturn(0);
1491 }
1492 
1493 /* Check if we need to update factors etc for transpose solve */
1494 static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A)
1495 {
1496   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr;
1497   MatColIdxType             n = A->rmap->n;
1498 
1499   PetscFunctionBegin;
1500   if (!factors->transpose_updated) { /* TODO: KK needs to provide functions to do numeric transpose only */
1501     /* Update L^T and do sptrsv symbolic */
1502     factors->iLt_d = MatRowMapKokkosView("factors->iLt_d",n+1);
1503     Kokkos::deep_copy(factors->iLt_d,0); /* KK requires 0 */
1504     factors->jLt_d = MatColIdxKokkosView("factors->jLt_d",factors->jL_d.extent(0));
1505     factors->aLt_d = MatScalarKokkosView("factors->aLt_d",factors->aL_d.extent(0));
1506 
1507     KokkosKernels::Impl::transpose_matrix<
1508       ConstMatRowMapKokkosView,ConstMatColIdxKokkosView,ConstMatScalarKokkosView,
1509       MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView,
1510       MatRowMapKokkosView,DefaultExecutionSpace>(
1511         n,n,factors->iL_d,factors->jL_d,factors->aL_d,
1512         factors->iLt_d,factors->jLt_d,factors->aLt_d);
1513 
1514     /* TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices.
1515       We have to sort the indices, until KK provides finer control options.
1516     */
1517     KokkosKernels::sort_crs_matrix<DefaultExecutionSpace,
1518       MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView>(
1519         factors->iLt_d,factors->jLt_d,factors->aLt_d);
1520 
1521     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khLt,factors->iLt_d,factors->jLt_d,factors->aLt_d);
1522 
1523     /* Update U^T and do sptrsv symbolic */
1524     factors->iUt_d = MatRowMapKokkosView("factors->iUt_d",n+1);
1525     Kokkos::deep_copy(factors->iUt_d,0); /* KK requires 0 */
1526     factors->jUt_d = MatColIdxKokkosView("factors->jUt_d",factors->jU_d.extent(0));
1527     factors->aUt_d = MatScalarKokkosView("factors->aUt_d",factors->aU_d.extent(0));
1528 
1529     KokkosKernels::Impl::transpose_matrix<
1530       ConstMatRowMapKokkosView,ConstMatColIdxKokkosView,ConstMatScalarKokkosView,
1531       MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView,
1532       MatRowMapKokkosView,DefaultExecutionSpace>(
1533         n,n,factors->iU_d, factors->jU_d, factors->aU_d,
1534         factors->iUt_d,factors->jUt_d,factors->aUt_d);
1535 
1536     /* Sort indices. See comments above */
1537     KokkosKernels::sort_crs_matrix<DefaultExecutionSpace,
1538       MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView>(
1539         factors->iUt_d,factors->jUt_d,factors->aUt_d);
1540 
1541     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khUt,factors->iUt_d,factors->jUt_d,factors->aUt_d);
1542     factors->transpose_updated = PETSC_TRUE;
1543   }
1544   PetscFunctionReturn(0);
1545 }
1546 
1547 /* Solve Ax = b, with A = LU */
1548 static PetscErrorCode MatSolve_SeqAIJKokkos(Mat A,Vec b,Vec x)
1549 {
1550   ConstPetscScalarKokkosView     bv;
1551   PetscScalarKokkosView          xv;
1552   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr;
1553 
1554   PetscFunctionBegin;
1555   PetscCall(PetscLogGpuTimeBegin());
1556   PetscCall(MatSeqAIJKokkosSymbolicSolveCheck(A));
1557   PetscCall(VecGetKokkosView(b,&bv));
1558   PetscCall(VecGetKokkosViewWrite(x,&xv));
1559   /* Solve L tmpv = b */
1560   PetscCallCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khL,factors->iL_d,factors->jL_d,factors->aL_d,bv,factors->workVector));
1561   /* Solve Ux = tmpv */
1562   PetscCallCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khU,factors->iU_d,factors->jU_d,factors->aU_d,factors->workVector,xv));
1563   PetscCall(VecRestoreKokkosView(b,&bv));
1564   PetscCall(VecRestoreKokkosViewWrite(x,&xv));
1565   PetscCall(PetscLogGpuTimeEnd());
1566   PetscFunctionReturn(0);
1567 }
1568 
1569 /* Solve A^T x = b, where A^T = U^T L^T */
1570 static PetscErrorCode MatSolveTranspose_SeqAIJKokkos(Mat A,Vec b,Vec x)
1571 {
1572   ConstPetscScalarKokkosView     bv;
1573   PetscScalarKokkosView          xv;
1574   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr;
1575 
1576   PetscFunctionBegin;
1577   PetscCall(PetscLogGpuTimeBegin());
1578   PetscCall(MatSeqAIJKokkosTransposeSolveCheck(A));
1579   PetscCall(VecGetKokkosView(b,&bv));
1580   PetscCall(VecGetKokkosViewWrite(x,&xv));
1581   /* Solve U^T tmpv = b */
1582   KokkosSparse::Experimental::sptrsv_solve(&factors->khUt,factors->iUt_d,factors->jUt_d,factors->aUt_d,bv,factors->workVector);
1583 
1584   /* Solve L^T x = tmpv */
1585   KokkosSparse::Experimental::sptrsv_solve(&factors->khLt,factors->iLt_d,factors->jLt_d,factors->aLt_d,factors->workVector,xv);
1586   PetscCall(VecRestoreKokkosView(b,&bv));
1587   PetscCall(VecRestoreKokkosViewWrite(x,&xv));
1588   PetscCall(PetscLogGpuTimeEnd());
1589   PetscFunctionReturn(0);
1590 }
1591 
1592 static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos(Mat B,Mat A,const MatFactorInfo *info)
1593 {
1594   Mat_SeqAIJKokkos               *aijkok = (Mat_SeqAIJKokkos*)A->spptr;
1595   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)B->spptr;
1596   PetscInt                       fill_lev = info->levels;
1597 
1598   PetscFunctionBegin;
1599   PetscCall(PetscLogGpuTimeBegin());
1600   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1601 
1602   auto a_d = aijkok->a_dual.view_device();
1603   auto i_d = aijkok->i_dual.view_device();
1604   auto j_d = aijkok->j_dual.view_device();
1605 
1606   KokkosSparse::Experimental::spiluk_numeric(&factors->kh,fill_lev,i_d,j_d,a_d,factors->iL_d,factors->jL_d,factors->aL_d,factors->iU_d,factors->jU_d,factors->aU_d);
1607 
1608   B->assembled                       = PETSC_TRUE;
1609   B->preallocated                    = PETSC_TRUE;
1610   B->ops->solve                      = MatSolve_SeqAIJKokkos;
1611   B->ops->solvetranspose             = MatSolveTranspose_SeqAIJKokkos;
1612   B->ops->matsolve                   = NULL;
1613   B->ops->matsolvetranspose          = NULL;
1614   B->offloadmask                     = PETSC_OFFLOAD_GPU;
1615 
1616   /* Once the factors' value changed, we need to update their transpose and sptrsv handle */
1617   factors->transpose_updated         = PETSC_FALSE;
1618   factors->sptrsv_symbolic_completed = PETSC_FALSE;
1619   /* TODO: log flops, but how to know that? */
1620   PetscCall(PetscLogGpuTimeEnd());
1621   PetscFunctionReturn(0);
1622 }
1623 
1624 static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
1625 {
1626   Mat_SeqAIJKokkos               *aijkok;
1627   Mat_SeqAIJ                     *b;
1628   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)B->spptr;
1629   PetscInt                       fill_lev = info->levels;
1630   PetscInt                       nnzA = ((Mat_SeqAIJ*)A->data)->nz,nnzL,nnzU;
1631   PetscInt                       n = A->rmap->n;
1632 
1633   PetscFunctionBegin;
1634   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1635   /* Rebuild factors */
1636   if (factors) {factors->Destroy();} /* Destroy the old if it exists */
1637   else {B->spptr = factors = new Mat_SeqAIJKokkosTriFactors(n);}
1638 
1639   /* Create a spiluk handle and then do symbolic factorization */
1640   nnzL = nnzU = PetscRealIntMultTruncate(info->fill,nnzA);
1641   factors->kh.create_spiluk_handle(KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1,n,nnzL,nnzU);
1642 
1643   auto spiluk_handle = factors->kh.get_spiluk_handle();
1644 
1645   Kokkos::realloc(factors->iL_d,n+1); /* Free old arrays and realloc */
1646   Kokkos::realloc(factors->jL_d,spiluk_handle->get_nnzL());
1647   Kokkos::realloc(factors->iU_d,n+1);
1648   Kokkos::realloc(factors->jU_d,spiluk_handle->get_nnzU());
1649 
1650   aijkok = (Mat_SeqAIJKokkos*)A->spptr;
1651   auto i_d = aijkok->i_dual.view_device();
1652   auto j_d = aijkok->j_dual.view_device();
1653   KokkosSparse::Experimental::spiluk_symbolic(&factors->kh,fill_lev,i_d,j_d,factors->iL_d,factors->jL_d,factors->iU_d,factors->jU_d);
1654   /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */
1655 
1656   Kokkos::resize (factors->jL_d,spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */
1657   Kokkos::resize (factors->jU_d,spiluk_handle->get_nnzU());
1658   Kokkos::realloc(factors->aL_d,spiluk_handle->get_nnzL()); /* No need to retain old value */
1659   Kokkos::realloc(factors->aU_d,spiluk_handle->get_nnzU());
1660 
1661   /* TODO: add options to select sptrsv algorithms */
1662   /* Create sptrsv handles for L, U and their transpose */
1663  #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
1664   auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SPTRSV_CUSPARSE;
1665  #else
1666   auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1;
1667  #endif
1668 
1669   factors->khL.create_sptrsv_handle(sptrsv_alg,n,true/* L is lower tri */);
1670   factors->khU.create_sptrsv_handle(sptrsv_alg,n,false/* U is not lower tri */);
1671   factors->khLt.create_sptrsv_handle(sptrsv_alg,n,false/* L^T is not lower tri */);
1672   factors->khUt.create_sptrsv_handle(sptrsv_alg,n,true/* U^T is lower tri */);
1673 
1674   /* Fill fields of the factor matrix B */
1675   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(B,MAT_SKIP_ALLOCATION,NULL));
1676   b     = (Mat_SeqAIJ*)B->data;
1677   b->nz = b->maxnz = spiluk_handle->get_nnzL()+spiluk_handle->get_nnzU();
1678   B->info.fill_ratio_given  = info->fill;
1679   B->info.fill_ratio_needed = ((PetscReal)b->nz)/((PetscReal)nnzA);
1680 
1681   B->offloadmask            = PETSC_OFFLOAD_GPU;
1682   B->ops->lufactornumeric   = MatILUFactorNumeric_SeqAIJKokkos;
1683   PetscFunctionReturn(0);
1684 }
1685 
1686 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
1687 {
1688   Mat_SeqAIJ       *b=(Mat_SeqAIJ*)B->data;
1689   const PetscInt   nrows   = A->rmap->n;
1690 
1691   PetscFunctionBegin;
1692   PetscCall(MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info));
1693   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKOKKOSDEVICE;
1694   // move B data into Kokkos
1695   PetscCall(MatSeqAIJKokkosSyncDevice(B)); // create aijkok
1696   PetscCall(MatSeqAIJKokkosSyncDevice(A)); // create aijkok
1697   {
1698     Mat_SeqAIJKokkos *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
1699     if (!baijkok->diag_d.extent(0)) {
1700       const Kokkos::View<PetscInt*, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_diag (b->diag,nrows+1);
1701       baijkok->diag_d = Kokkos::View<PetscInt*>(Kokkos::create_mirror(DefaultMemorySpace(),h_diag));
1702       Kokkos::deep_copy (baijkok->diag_d, h_diag);
1703     }
1704   }
1705   PetscFunctionReturn(0);
1706 }
1707 
1708 static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos(Mat A,MatSolverType *type)
1709 {
1710   PetscFunctionBegin;
1711   *type = MATSOLVERKOKKOS;
1712   PetscFunctionReturn(0);
1713 }
1714 
1715 static PetscErrorCode MatFactorGetSolverType_seqaij_kokkos_device(Mat A,MatSolverType *type)
1716 {
1717   PetscFunctionBegin;
1718   *type = MATSOLVERKOKKOSDEVICE;
1719   PetscFunctionReturn(0);
1720 }
1721 
1722 /*MC
1723   MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices
1724   on a single GPU of type, SeqAIJKokkos, AIJKokkos.
1725 
1726   Level: beginner
1727 
1728 .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJKokkos(), MATAIJKOKKOS, MatKokkosSetFormat(), MatKokkosStorageFormat, MatKokkosFormatOperation
1729 M*/
1730 PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A,MatFactorType ftype,Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */
1731 {
1732   PetscInt       n = A->rmap->n;
1733 
1734   PetscFunctionBegin;
1735   PetscCall(MatCreate(PetscObjectComm((PetscObject)A),B));
1736   PetscCall(MatSetSizes(*B,n,n,n,n));
1737   (*B)->factortype = ftype;
1738   PetscCall(PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]));
1739   PetscCall(MatSetType(*B,MATSEQAIJKOKKOS));
1740 
1741   if (ftype == MAT_FACTOR_LU) {
1742     PetscCall(MatSetBlockSizesFromMats(*B,A,A));
1743     (*B)->canuseordering         = PETSC_TRUE;
1744     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJKokkos;
1745   } else if (ftype == MAT_FACTOR_ILU) {
1746     PetscCall(MatSetBlockSizesFromMats(*B,A,A));
1747     (*B)->canuseordering         = PETSC_FALSE;
1748     (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos;
1749   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]);
1750 
1751   PetscCall(MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL));
1752   PetscCall(PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_SeqAIJKokkos));
1753   PetscFunctionReturn(0);
1754 }
1755 
1756 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijkokkos_kokkos_device(Mat A,MatFactorType ftype,Mat *B)
1757 {
1758   PetscInt       n = A->rmap->n;
1759 
1760   PetscFunctionBegin;
1761   PetscCall(MatCreate(PetscObjectComm((PetscObject)A),B));
1762   PetscCall(MatSetSizes(*B,n,n,n,n));
1763   (*B)->factortype = ftype;
1764   (*B)->canuseordering = PETSC_TRUE;
1765   PetscCall(PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]));
1766   PetscCall(MatSetType(*B,MATSEQAIJKOKKOS));
1767 
1768   if (ftype == MAT_FACTOR_LU) {
1769     PetscCall(MatSetBlockSizesFromMats(*B,A,A));
1770     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE;
1771   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for KOKKOS Matrix Types");
1772 
1773   PetscCall(MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL));
1774   PetscCall(PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_kokkos_device));
1775   PetscFunctionReturn(0);
1776 }
1777 
1778 PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_KOKKOS(void)
1779 {
1780   PetscFunctionBegin;
1781   PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_SeqAIJKokkos_Kokkos));
1782   PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_ILU,MatGetFactor_SeqAIJKokkos_Kokkos));
1783   PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOSDEVICE,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_seqaijkokkos_kokkos_device));
1784   PetscFunctionReturn(0);
1785 }
1786 
1787 /* Utility to print out a KokkosCsrMatrix for debugging */
1788 PETSC_INTERN PetscErrorCode PrintCsrMatrix(const KokkosCsrMatrix& csrmat)
1789 {
1790   const auto&       iv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.graph.row_map);
1791   const auto&       jv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.graph.entries);
1792   const auto&       av = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.values);
1793   const PetscInt    *i = iv.data();
1794   const PetscInt    *j = jv.data();
1795   const PetscScalar *a = av.data();
1796   PetscInt          m = csrmat.numRows(),n = csrmat.numCols(),nnz = csrmat.nnz();
1797 
1798   PetscFunctionBegin;
1799   PetscCall(PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT " x %" PetscInt_FMT " SeqAIJKokkos, with %" PetscInt_FMT " nonzeros\n",m,n,nnz));
1800   for (PetscInt k=0; k<m; k++) {
1801     PetscCall(PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT ": ",k));
1802     for (PetscInt p=i[k]; p<i[k+1]; p++) {
1803       PetscCall(PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT "(%.1f), ",j[p],a[p]));
1804     }
1805     PetscCall(PetscPrintf(PETSC_COMM_SELF,"\n"));
1806   }
1807   PetscFunctionReturn(0);
1808 }
1809