xref: /petsc/src/mat/impls/aij/seq/kokkos/aijkok.kokkos.cxx (revision ac38520c177ea222939a5883f974ba09f9da28d3)
1 #include <petscvec_kokkos.hpp>
2 #include <petscpkg_version.h>
3 #include <petsc/private/petscimpl.h>
4 #include <petsc/private/sfimpl.h>
5 #include <petscsystypes.h>
6 #include <petscerror.h>
7 
8 #include <Kokkos_Core.hpp>
9 #include <KokkosBlas.hpp>
10 #include <KokkosKernels_Sorting.hpp>
11 #include <KokkosSparse_CrsMatrix.hpp>
12 #include <KokkosSparse_spmv.hpp>
13 #include <KokkosSparse_spiluk.hpp>
14 #include <KokkosSparse_sptrsv.hpp>
15 #include <KokkosSparse_spgemm.hpp>
16 #include <KokkosSparse_spadd.hpp>
17 
18 #include <../src/mat/impls/aij/seq/kokkos/aijkok.hpp>
19 
20 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */
21 
22 /* MatAssemblyEnd_SeqAIJKokkos() happens when we finalized nonzeros of the matrix, either after
23    we assembled the matrix on host, or after we directly produced the matrix data on device (ex., through MatMatMult).
24    In the latter case, it is important to set a_dual's sync state correctly.
25  */
26 static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A,MatAssemblyType mode)
27 {
28   Mat_SeqAIJ       *aijseq;
29   Mat_SeqAIJKokkos *aijkok;
30 
31   PetscFunctionBegin;
32   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0);
33   PetscCall(MatAssemblyEnd_SeqAIJ(A,mode));
34 
35   aijseq = static_cast<Mat_SeqAIJ*>(A->data);
36   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
37 
38   /* If aijkok does not exist, we just copy i, j to device.
39      If aijkok already exists, but the device's nonzero pattern does not match with the host's, we assume the latest data is on host.
40      In both cases, we build a new aijkok structure.
41   */
42   if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { /* aijkok might not exist yet or nonzero pattern has changed */
43     delete aijkok;
44     aijkok   = new Mat_SeqAIJKokkos(A->rmap->n,A->cmap->n,aijseq->nz,aijseq->i,aijseq->j,aijseq->a,A->nonzerostate,PETSC_FALSE/*don't copy mat values to device*/);
45     A->spptr = aijkok;
46   }
47 
48   if (aijkok && aijkok->device_mat_d.data()) {
49     A->offloadmask = PETSC_OFFLOAD_GPU; // in GPU mode, no going back. MatSetValues checks this
50   }
51   PetscFunctionReturn(0);
52 }
53 
54 /* Sync CSR data to device if not yet */
55 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A)
56 {
57   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
58 
59   PetscFunctionBegin;
60   PetscCheck(A->factortype == MAT_FACTOR_NONE,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync factorized matrix from host to device");
61   PetscCheck(aijkok,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
62   if (aijkok->a_dual.need_sync_device()) {
63     aijkok->a_dual.sync_device();
64     aijkok->transpose_updated = PETSC_FALSE; /* values of the transpose is out-of-date */
65     aijkok->hermitian_updated = PETSC_FALSE;
66   }
67   PetscFunctionReturn(0);
68 }
69 
70 /* Mark the CSR data on device as modified */
71 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosModifyDevice(Mat A)
72 {
73   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
74 
75   PetscFunctionBegin;
76   PetscCheck(A->factortype == MAT_FACTOR_NONE,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not supported for factorized matries");
77   aijkok->a_dual.clear_sync_state();
78   aijkok->a_dual.modify_device();
79   aijkok->transpose_updated = PETSC_FALSE;
80   aijkok->hermitian_updated = PETSC_FALSE;
81   PetscCall(MatSeqAIJInvalidateDiagonal(A));
82   PetscCall(PetscObjectStateIncrease((PetscObject)A));
83   PetscFunctionReturn(0);
84 }
85 
86 static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A)
87 {
88   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
89 
90   PetscFunctionBegin;
91   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
92   /* We do not expect one needs factors on host  */
93   PetscCheck(A->factortype == MAT_FACTOR_NONE,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync factorized matrix from device to host");
94   PetscCheck(aijkok,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing AIJKOK");
95   aijkok->a_dual.sync_host();
96   PetscFunctionReturn(0);
97 }
98 
99 static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A,PetscScalar *array[])
100 {
101   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
102 
103   PetscFunctionBegin;
104   if (aijkok) {
105     aijkok->a_dual.sync_host();
106     *array = aijkok->a_dual.view_host().data();
107   } else { /* Happens when calling MatSetValues on a newly created matrix */
108     *array = static_cast<Mat_SeqAIJ*>(A->data)->a;
109   }
110   PetscFunctionReturn(0);
111 }
112 
113 static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJKokkos(Mat A,PetscScalar *array[])
114 {
115   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
116 
117   PetscFunctionBegin;
118   if (aijkok) aijkok->a_dual.modify_host();
119   PetscFunctionReturn(0);
120 }
121 
122 static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJKokkos(Mat A,const PetscScalar *array[])
123 {
124   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
125 
126   PetscFunctionBegin;
127   if (aijkok) {
128     aijkok->a_dual.sync_host();
129     *array = aijkok->a_dual.view_host().data();
130   } else {
131     *array = static_cast<Mat_SeqAIJ*>(A->data)->a;
132   }
133   PetscFunctionReturn(0);
134 }
135 
136 static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJKokkos(Mat A,const PetscScalar *array[])
137 {
138   PetscFunctionBegin;
139   *array = NULL;
140   PetscFunctionReturn(0);
141 }
142 
143 static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJKokkos(Mat A,PetscScalar *array[])
144 {
145   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
146 
147   PetscFunctionBegin;
148   if (aijkok) {
149     *array = aijkok->a_dual.view_host().data();
150   } else { /* Ex. happens with MatZeroEntries on a preallocated but not assembled matrix */
151     *array = static_cast<Mat_SeqAIJ*>(A->data)->a;
152   }
153   PetscFunctionReturn(0);
154 }
155 
156 static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJKokkos(Mat A,PetscScalar *array[])
157 {
158   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
159 
160   PetscFunctionBegin;
161   if (aijkok) {
162     aijkok->a_dual.clear_sync_state();
163     aijkok->a_dual.modify_host();
164   }
165   PetscFunctionReturn(0);
166 }
167 
168 static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJKokkos(Mat A,const PetscInt **i,const PetscInt **j,PetscScalar **a,PetscMemType *mtype)
169 {
170   Mat_SeqAIJKokkos  *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
171 
172   PetscFunctionBegin;
173   PetscCheck(aijkok != NULL,PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"aijkok is NULL");
174 
175   if (i) *i = aijkok->i_device_data();
176   if (j) *j = aijkok->j_device_data();
177   if (a) {
178     aijkok->a_dual.sync_device();
179     *a = aijkok->a_device_data();
180   }
181   if (mtype) *mtype = PETSC_MEMTYPE_KOKKOS;
182   PetscFunctionReturn(0);
183 }
184 
185 // MatSeqAIJKokkosSetDeviceMat takes a PetscSplitCSRDataStructure with device data and copies it to the device. Note, "deep_copy" here is really a shallow copy
186 PetscErrorCode MatSeqAIJKokkosSetDeviceMat(Mat A, PetscSplitCSRDataStructure h_mat)
187 {
188   Mat_SeqAIJKokkos                             *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
189   Kokkos::View<SplitCSRMat, Kokkos::HostSpace> h_mat_k(h_mat);
190 
191   PetscFunctionBegin;
192   PetscCheck(aijkok,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
193   aijkok->device_mat_d = create_mirror(DefaultMemorySpace(),h_mat_k);
194   Kokkos::deep_copy (aijkok->device_mat_d, h_mat_k);
195   PetscFunctionReturn(0);
196 }
197 
198 // MatSeqAIJKokkosGetDeviceMat gets the device if it is here, otherwise it creates a place for it and returns NULL
199 PetscErrorCode MatSeqAIJKokkosGetDeviceMat(Mat A, PetscSplitCSRDataStructure *d_mat)
200 {
201   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
202 
203   PetscFunctionBegin;
204   if (aijkok && aijkok->device_mat_d.data()) {
205     *d_mat = aijkok->device_mat_d.data();
206   } else {
207     PetscCall(MatSeqAIJKokkosSyncDevice(A)); // create aijkok (we are making d_mat now so make a place for it)
208     *d_mat  = NULL;
209   }
210   PetscFunctionReturn(0);
211 }
212 
213 /* Generate the transpose on device and cache it internally */
214 static PetscErrorCode MatSeqAIJKokkosGenerateTranspose_Private(Mat A, KokkosCsrMatrix **csrmatT)
215 {
216   Mat_SeqAIJKokkos                 *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
217 
218   PetscFunctionBegin;
219   PetscCheck(aijkok,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
220   if (!aijkok->csrmatT.nnz() || !aijkok->transpose_updated) { /* Generate At for the first time OR just update its values */
221     /* FIXME: KK does not separate symbolic/numeric transpose. We could have a permutation array to help value-only update */
222     PetscCallCXX(aijkok->a_dual.sync_device());
223     PetscCallCXX(aijkok->csrmatT = KokkosKernels::Impl::transpose_matrix(aijkok->csrmat));
224     PetscCallCXX(KokkosKernels::sort_crs_matrix(aijkok->csrmatT));
225     aijkok->transpose_updated = PETSC_TRUE;
226   }
227   *csrmatT = &aijkok->csrmatT;
228   PetscFunctionReturn(0);
229 }
230 
231 /* Generate the Hermitian on device and cache it internally */
232 static PetscErrorCode MatSeqAIJKokkosGenerateHermitian_Private(Mat A, KokkosCsrMatrix **csrmatH)
233 {
234   Mat_SeqAIJKokkos                 *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
235 
236   PetscFunctionBegin;
237   PetscCall(PetscLogGpuTimeBegin());
238   PetscCheck(aijkok,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
239   if (!aijkok->csrmatH.nnz() || !aijkok->hermitian_updated) { /* Generate Ah for the first time OR just update its values */
240     PetscCallCXX(aijkok->a_dual.sync_device());
241     PetscCallCXX(aijkok->csrmatH = KokkosKernels::Impl::transpose_matrix(aijkok->csrmat));
242     PetscCallCXX(KokkosKernels::sort_crs_matrix(aijkok->csrmatH));
243    #if defined(PETSC_USE_COMPLEX)
244     const auto& a = aijkok->csrmatH.values;
245     Kokkos::parallel_for(a.extent(0),KOKKOS_LAMBDA(MatRowMapType i) {a(i) = PetscConj(a(i));});
246    #endif
247     aijkok->hermitian_updated = PETSC_TRUE;
248   }
249   *csrmatH = &aijkok->csrmatH;
250   PetscCall(PetscLogGpuTimeEnd());
251   PetscFunctionReturn(0);
252 }
253 
254 /* y = A x */
255 static PetscErrorCode MatMult_SeqAIJKokkos(Mat A,Vec xx,Vec yy)
256 {
257   Mat_SeqAIJKokkos                 *aijkok;
258   ConstPetscScalarKokkosView       xv;
259   PetscScalarKokkosView            yv;
260 
261   PetscFunctionBegin;
262   PetscCall(PetscLogGpuTimeBegin());
263   PetscCall(MatSeqAIJKokkosSyncDevice(A));
264   PetscCall(VecGetKokkosView(xx,&xv));
265   PetscCall(VecGetKokkosViewWrite(yy,&yv));
266   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
267   KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csrmat,xv,0.0/*beta*/,yv); /* y = alpha A x + beta y */
268   PetscCall(VecRestoreKokkosView(xx,&xv));
269   PetscCall(VecRestoreKokkosViewWrite(yy,&yv));
270   /* 2.0*nnz - numRows seems more accurate here but assumes there are no zero-rows. So a little sloppy here. */
271   PetscCall(PetscLogGpuFlops(2.0*aijkok->csrmat.nnz()));
272   PetscCall(PetscLogGpuTimeEnd());
273   PetscFunctionReturn(0);
274 }
275 
276 /* y = A^T x */
277 static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy)
278 {
279   Mat_SeqAIJKokkos                 *aijkok;
280   const char                       *mode;
281   ConstPetscScalarKokkosView       xv;
282   PetscScalarKokkosView            yv;
283   KokkosCsrMatrix                  *csrmat;
284 
285   PetscFunctionBegin;
286   PetscCall(PetscLogGpuTimeBegin());
287   PetscCall(MatSeqAIJKokkosSyncDevice(A));
288   PetscCall(VecGetKokkosView(xx,&xv));
289   PetscCall(VecGetKokkosViewWrite(yy,&yv));
290   if (A->form_explicit_transpose) {
291     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmat));
292     mode = "N";
293   } else {
294     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
295     csrmat = &aijkok->csrmat;
296     mode = "T";
297   }
298   KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,0.0/*beta*/,yv); /* y = alpha A^T x + beta y */
299   PetscCall(VecRestoreKokkosView(xx,&xv));
300   PetscCall(VecRestoreKokkosViewWrite(yy,&yv));
301   PetscCall(PetscLogGpuFlops(2.0*csrmat->nnz()));
302   PetscCall(PetscLogGpuTimeEnd());
303   PetscFunctionReturn(0);
304 }
305 
306 /* y = A^H x */
307 static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy)
308 {
309   Mat_SeqAIJKokkos                 *aijkok;
310   const char                       *mode;
311   ConstPetscScalarKokkosView       xv;
312   PetscScalarKokkosView            yv;
313   KokkosCsrMatrix                  *csrmat;
314 
315   PetscFunctionBegin;
316   PetscCall(PetscLogGpuTimeBegin());
317   PetscCall(MatSeqAIJKokkosSyncDevice(A));
318   PetscCall(VecGetKokkosView(xx,&xv));
319   PetscCall(VecGetKokkosViewWrite(yy,&yv));
320   if (A->form_explicit_transpose) {
321     PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A,&csrmat));
322     mode = "N";
323   } else {
324     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
325     csrmat = &aijkok->csrmat;
326     mode = "C";
327   }
328   KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,0.0/*beta*/,yv); /* y = alpha A^H x + beta y */
329   PetscCall(VecRestoreKokkosView(xx,&xv));
330   PetscCall(VecRestoreKokkosViewWrite(yy,&yv));
331   PetscCall(PetscLogGpuFlops(2.0*csrmat->nnz()));
332   PetscCall(PetscLogGpuTimeEnd());
333   PetscFunctionReturn(0);
334 }
335 
336 /* z = A x + y */
337 static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy, Vec zz)
338 {
339   Mat_SeqAIJKokkos                 *aijkok;
340   ConstPetscScalarKokkosView       xv,yv;
341   PetscScalarKokkosView            zv;
342 
343   PetscFunctionBegin;
344   PetscCall(PetscLogGpuTimeBegin());
345   PetscCall(MatSeqAIJKokkosSyncDevice(A));
346   PetscCall(VecGetKokkosView(xx,&xv));
347   PetscCall(VecGetKokkosView(yy,&yv));
348   PetscCall(VecGetKokkosViewWrite(zz,&zv));
349   if (zz != yy) Kokkos::deep_copy(zv,yv);
350   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
351   KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csrmat,xv,1.0/*beta*/,zv); /* z = alpha A x + beta z */
352   PetscCall(VecRestoreKokkosView(xx,&xv));
353   PetscCall(VecRestoreKokkosView(yy,&yv));
354   PetscCall(VecRestoreKokkosViewWrite(zz,&zv));
355   PetscCall(PetscLogGpuFlops(2.0*aijkok->csrmat.nnz()));
356   PetscCall(PetscLogGpuTimeEnd());
357   PetscFunctionReturn(0);
358 }
359 
360 /* z = A^T x + y */
361 static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz)
362 {
363   Mat_SeqAIJKokkos                 *aijkok;
364   const char                       *mode;
365   ConstPetscScalarKokkosView       xv,yv;
366   PetscScalarKokkosView            zv;
367   KokkosCsrMatrix                  *csrmat;
368 
369   PetscFunctionBegin;
370   PetscCall(PetscLogGpuTimeBegin());
371   PetscCall(MatSeqAIJKokkosSyncDevice(A));
372   PetscCall(VecGetKokkosView(xx,&xv));
373   PetscCall(VecGetKokkosView(yy,&yv));
374   PetscCall(VecGetKokkosViewWrite(zz,&zv));
375   if (zz != yy) Kokkos::deep_copy(zv,yv);
376   if (A->form_explicit_transpose) {
377     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmat));
378     mode = "N";
379   } else {
380     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
381     csrmat = &aijkok->csrmat;
382     mode = "T";
383   }
384   KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,1.0/*beta*/,zv); /* z = alpha A^T x + beta z */
385   PetscCall(VecRestoreKokkosView(xx,&xv));
386   PetscCall(VecRestoreKokkosView(yy,&yv));
387   PetscCall(VecRestoreKokkosViewWrite(zz,&zv));
388   PetscCall(PetscLogGpuFlops(2.0*csrmat->nnz()));
389   PetscCall(PetscLogGpuTimeEnd());
390   PetscFunctionReturn(0);
391 }
392 
393 /* z = A^H x + y */
394 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz)
395 {
396   Mat_SeqAIJKokkos                 *aijkok;
397   const char                       *mode;
398   ConstPetscScalarKokkosView       xv,yv;
399   PetscScalarKokkosView            zv;
400   KokkosCsrMatrix                  *csrmat;
401 
402   PetscFunctionBegin;
403   PetscCall(PetscLogGpuTimeBegin());
404   PetscCall(MatSeqAIJKokkosSyncDevice(A));
405   PetscCall(VecGetKokkosView(xx,&xv));
406   PetscCall(VecGetKokkosView(yy,&yv));
407   PetscCall(VecGetKokkosViewWrite(zz,&zv));
408   if (zz != yy) Kokkos::deep_copy(zv,yv);
409   if (A->form_explicit_transpose) {
410     PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A,&csrmat));
411     mode = "N";
412   } else {
413     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
414     csrmat = &aijkok->csrmat;
415     mode = "C";
416   }
417   KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,1.0/*beta*/,zv); /* z = alpha A^H x + beta z */
418   PetscCall(VecRestoreKokkosView(xx,&xv));
419   PetscCall(VecRestoreKokkosView(yy,&yv));
420   PetscCall(VecRestoreKokkosViewWrite(zz,&zv));
421   PetscCall(PetscLogGpuFlops(2.0*csrmat->nnz()));
422   PetscCall(PetscLogGpuTimeEnd());
423   PetscFunctionReturn(0);
424 }
425 
426 PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A,MatOption op,PetscBool flg)
427 {
428   Mat_SeqAIJKokkos          *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
429 
430   PetscFunctionBegin;
431   switch (op) {
432     case MAT_FORM_EXPLICIT_TRANSPOSE:
433       /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
434       if (A->form_explicit_transpose && !flg && aijkok) PetscCall(aijkok->DestroyMatTranspose());
435       A->form_explicit_transpose = flg;
436       break;
437     default:
438       PetscCall(MatSetOption_SeqAIJ(A,op,flg));
439       break;
440   }
441   PetscFunctionReturn(0);
442 }
443 
444 /* Depending on reuse, either build a new mat, or use the existing mat */
445 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
446 {
447   Mat_SeqAIJ       *aseq;
448 
449   PetscFunctionBegin;
450   PetscCall(PetscKokkosInitializeCheck());
451   if (reuse == MAT_INITIAL_MATRIX) { /* Build a brand new mat */
452     PetscCall(MatDuplicate(A,MAT_COPY_VALUES,newmat)); /* the returned newmat is a SeqAIJKokkos */
453   } else if (reuse == MAT_REUSE_MATRIX) { /* Reuse the mat created before */
454     PetscCall(MatCopy(A,*newmat,SAME_NONZERO_PATTERN)); /* newmat is already a SeqAIJKokkos */
455   } else if (reuse == MAT_INPLACE_MATRIX) { /* newmat is A */
456     PetscCheck(A == *newmat,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"A != *newmat with MAT_INPLACE_MATRIX");
457     PetscCall(PetscFree(A->defaultvectype));
458     PetscCall(PetscStrallocpy(VECKOKKOS,&A->defaultvectype)); /* Allocate and copy the string */
459     PetscCall(PetscObjectChangeTypeName((PetscObject)A,MATSEQAIJKOKKOS));
460     PetscCall(MatSetOps_SeqAIJKokkos(A));
461     aseq = static_cast<Mat_SeqAIJ*>(A->data);
462     if (A->assembled) { /* Copy i, j (but not values) to device for an assembled matrix if not yet */
463       PetscCheck(!A->spptr,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Expect NULL (Mat_SeqAIJKokkos*)A->spptr");
464       A->spptr = new Mat_SeqAIJKokkos(A->rmap->n,A->cmap->n,aseq->nz,aseq->i,aseq->j,aseq->a,A->nonzerostate,PETSC_FALSE);
465     }
466   }
467   PetscFunctionReturn(0);
468 }
469 
470 /* MatDuplicate always creates a new matrix. MatDuplicate can be called either on an assembled matrix or
471    an unassembled matrix, even though MAT_COPY_VALUES is not allowed for unassembled matrix.
472  */
473 static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A,MatDuplicateOption dupOption,Mat *B)
474 {
475   Mat_SeqAIJ            *bseq;
476   Mat_SeqAIJKokkos      *akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr),*bkok;
477   Mat                   mat;
478 
479   PetscFunctionBegin;
480   /* Do not copy values on host as A's latest values might be on device. We don't want to do sync blindly */
481   PetscCall(MatDuplicate_SeqAIJ(A,MAT_DO_NOT_COPY_VALUES,B));
482   mat  = *B;
483   if (A->assembled) {
484     bseq = static_cast<Mat_SeqAIJ*>(mat->data);
485     bkok = new Mat_SeqAIJKokkos(mat->rmap->n,mat->cmap->n,bseq->nz,bseq->i,bseq->j,bseq->a,mat->nonzerostate,PETSC_FALSE);
486     bkok->a_dual.clear_sync_state(); /* Clear B's sync state as it will be decided below */
487     /* Now copy values to B if needed */
488     if (dupOption == MAT_COPY_VALUES) {
489       if (akok->a_dual.need_sync_device()) {
490         Kokkos::deep_copy(bkok->a_dual.view_host(),akok->a_dual.view_host());
491         bkok->a_dual.modify_host();
492       } else { /* If device has the latest data, we only copy data on device */
493         Kokkos::deep_copy(bkok->a_dual.view_device(),akok->a_dual.view_device());
494         bkok->a_dual.modify_device();
495       }
496     } else { /* MAT_DO_NOT_COPY_VALUES or MAT_SHARE_NONZERO_PATTERN. B's values should be zeroed */
497       /* B's values on host should be already zeroed by MatDuplicate_SeqAIJ() */
498       bkok->a_dual.modify_host();
499     }
500     mat->spptr = bkok;
501   }
502 
503   PetscCall(PetscFree(mat->defaultvectype));
504   PetscCall(PetscStrallocpy(VECKOKKOS,&mat->defaultvectype)); /* Allocate and copy the string */
505   PetscCall(PetscObjectChangeTypeName((PetscObject)mat,MATSEQAIJKOKKOS));
506   PetscCall(MatSetOps_SeqAIJKokkos(mat));
507   PetscFunctionReturn(0);
508 }
509 
510 static PetscErrorCode MatTranspose_SeqAIJKokkos(Mat A,MatReuse reuse,Mat *B)
511 {
512   Mat               At;
513   KokkosCsrMatrix   *internT;
514   Mat_SeqAIJKokkos  *atkok,*bkok;
515 
516   PetscFunctionBegin;
517   PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A,&internT)); /* Generate a transpose internally */
518   if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_INPLACE_MATRIX) {
519     /* Deep copy internT, as we want to isolate the internal transpose */
520     PetscCallCXX(atkok = new Mat_SeqAIJKokkos(KokkosCsrMatrix("csrmat",*internT)));
521     PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PetscObjectComm((PetscObject)A),atkok,&At));
522     if (reuse == MAT_INITIAL_MATRIX) *B = At;
523     else PetscCall(MatHeaderReplace(A,&At)); /* Replace A with At inplace */
524   } else { /* MAT_REUSE_MATRIX, just need to copy values to B on device */
525     if ((*B)->assembled) {
526       bkok = static_cast<Mat_SeqAIJKokkos*>((*B)->spptr);
527       PetscCallCXX(Kokkos::deep_copy(bkok->a_dual.view_device(),internT->values));
528       PetscCall(MatSeqAIJKokkosModifyDevice(*B));
529     } else if ((*B)->preallocated) { /* It is ok for B to be only preallocated, as needed in MatTranspose_MPIAIJ */
530       Mat_SeqAIJ              *bseq = static_cast<Mat_SeqAIJ*>((*B)->data);
531       MatScalarKokkosViewHost a_h(bseq->a,internT->nnz()); /* bseq->nz = 0 if unassembled */
532       MatColIdxKokkosViewHost j_h(bseq->j,internT->nnz());
533       PetscCallCXX(Kokkos::deep_copy(a_h,internT->values));
534       PetscCallCXX(Kokkos::deep_copy(j_h,internT->graph.entries));
535     } else SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"B must be assembled or preallocated");
536   }
537   PetscFunctionReturn(0);
538 }
539 
540 static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A)
541 {
542   Mat_SeqAIJKokkos           *aijkok;
543 
544   PetscFunctionBegin;
545   if (A->factortype == MAT_FACTOR_NONE) {
546     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
547     delete aijkok;
548   } else {
549     delete static_cast<Mat_SeqAIJKokkosTriFactors*>(A->spptr);
550   }
551   A->spptr = NULL;
552   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL));
553   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL));
554   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL));
555   PetscCall(MatDestroy_SeqAIJ(A));
556   PetscFunctionReturn(0);
557 }
558 
559 /*MC
560    MATSEQAIJKOKKOS - MATAIJKOKKOS = "(seq)aijkokkos" - A matrix type to be used for sparse matrices with Kokkos
561 
562    A matrix type type using Kokkos-Kernels CrsMatrix type for portability across different device types
563 
564    Options Database Keys:
565 .  -mat_type aijkokkos - sets the matrix type to "aijkokkos" during a call to MatSetFromOptions()
566 
567   Level: beginner
568 
569 .seealso: MatCreateSeqAIJKokkos(), MATMPIAIJKOKKOS
570 M*/
571 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A)
572 {
573   PetscFunctionBegin;
574   PetscCall(PetscKokkosInitializeCheck());
575   PetscCall(MatCreate_SeqAIJ(A));
576   PetscCall(MatConvert_SeqAIJ_SeqAIJKokkos(A,MATSEQAIJKOKKOS,MAT_INPLACE_MATRIX,&A));
577   PetscFunctionReturn(0);
578 }
579 
580 /* Merge A, B into a matrix C. A is put before B. C's size would be A->rmap->n by (A->cmap->n + B->cmap->n) */
581 PetscErrorCode MatSeqAIJKokkosMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C)
582 {
583   Mat_SeqAIJ                   *a,*b;
584   Mat_SeqAIJKokkos             *akok,*bkok,*ckok;
585   MatScalarKokkosView          aa,ba,ca;
586   MatRowMapKokkosView          ai,bi,ci;
587   MatColIdxKokkosView          aj,bj,cj;
588   PetscInt                     m,n,nnz,aN;
589 
590   PetscFunctionBegin;
591   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
592   PetscValidHeaderSpecific(B,MAT_CLASSID,2);
593   PetscValidPointer(C,4);
594   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
595   PetscCheckTypeName(B,MATSEQAIJKOKKOS);
596   PetscCheck(A->rmap->n == B->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT,A->rmap->n,B->rmap->n);
597   PetscCheck(reuse != MAT_INPLACE_MATRIX,PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported");
598 
599   PetscCall(MatSeqAIJKokkosSyncDevice(A));
600   PetscCall(MatSeqAIJKokkosSyncDevice(B));
601   a    = static_cast<Mat_SeqAIJ*>(A->data);
602   b    = static_cast<Mat_SeqAIJ*>(B->data);
603   akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
604   bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
605   aa   = akok->a_dual.view_device();
606   ai   = akok->i_dual.view_device();
607   ba   = bkok->a_dual.view_device();
608   bi   = bkok->i_dual.view_device();
609   m    = A->rmap->n; /* M, N and nnz of C */
610   n    = A->cmap->n + B->cmap->n;
611   nnz  = a->nz + b->nz;
612   aN   = A->cmap->n; /* N of A */
613   if (reuse == MAT_INITIAL_MATRIX) {
614     aj = akok->j_dual.view_device();
615     bj = bkok->j_dual.view_device();
616     auto ca_dual = MatScalarKokkosDualView("a",aa.extent(0)+ba.extent(0));
617     auto ci_dual = MatRowMapKokkosDualView("i",ai.extent(0));
618     auto cj_dual = MatColIdxKokkosDualView("j",aj.extent(0)+bj.extent(0));
619     ca = ca_dual.view_device();
620     ci = ci_dual.view_device();
621     cj = cj_dual.view_device();
622 
623     /* Concatenate A and B in parallel using Kokkos hierarchical parallelism */
624     Kokkos::parallel_for(Kokkos::TeamPolicy<>(m, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) {
625       PetscInt i = t.league_rank(); /* row i */
626       PetscInt coffset = ai(i) + bi(i), alen = ai(i+1)-ai(i), blen = bi(i+1)-bi(i);
627 
628       Kokkos::single(Kokkos::PerTeam(t), [=]() { /* this side effect only happens once per whole team */
629         ci(i) = coffset;
630         if (i == m-1) ci(m) = ai(m) + bi(m);
631       });
632 
633       Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen+blen), [&](PetscInt k) {
634         if (k < alen) {
635           ca(coffset+k) = aa(ai(i)+k);
636           cj(coffset+k) = aj(ai(i)+k);
637         } else {
638           ca(coffset+k) = ba(bi(i)+k-alen);
639           cj(coffset+k) = bj(bi(i)+k-alen) + aN; /* Entries in B get new column indices in C */
640         }
641       });
642     });
643     ca_dual.modify_device();
644     ci_dual.modify_device();
645     cj_dual.modify_device();
646     PetscCallCXX(ckok = new Mat_SeqAIJKokkos(m,n,nnz,ci_dual,cj_dual,ca_dual));
647     PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,ckok,C));
648   } else if (reuse == MAT_REUSE_MATRIX) {
649     PetscValidHeaderSpecific(*C,MAT_CLASSID,4);
650     PetscCheckTypeName(*C,MATSEQAIJKOKKOS);
651     ckok = static_cast<Mat_SeqAIJKokkos*>((*C)->spptr);
652     ca   = ckok->a_dual.view_device();
653     ci   = ckok->i_dual.view_device();
654 
655     Kokkos::parallel_for(Kokkos::TeamPolicy<>(m, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) {
656       PetscInt i = t.league_rank(); /* row i */
657       PetscInt alen = ai(i+1)-ai(i), blen = bi(i+1)-bi(i);
658       Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen+blen), [&](PetscInt k) {
659         if (k < alen) ca(ci(i)+k) = aa(ai(i)+k);
660         else          ca(ci(i)+k) = ba(bi(i)+k-alen);
661       });
662     });
663     PetscCall(MatSeqAIJKokkosModifyDevice(*C));
664   }
665   PetscFunctionReturn(0);
666 }
667 
668 static PetscErrorCode MatProductDataDestroy_SeqAIJKokkos(void* pdata)
669 {
670   PetscFunctionBegin;
671   delete static_cast<MatProductData_SeqAIJKokkos*>(pdata);
672   PetscFunctionReturn(0);
673 }
674 
675 static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C)
676 {
677   Mat_Product                    *product = C->product;
678   Mat                            A,B;
679   bool                           transA,transB; /* use bool, since KK needs this type */
680   Mat_SeqAIJKokkos               *akok,*bkok,*ckok;
681   Mat_SeqAIJ                     *c;
682   MatProductData_SeqAIJKokkos    *pdata;
683   KokkosCsrMatrix                *csrmatA,*csrmatB;
684 
685   PetscFunctionBegin;
686   MatCheckProduct(C,1);
687   PetscCheck(C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty");
688   pdata = static_cast<MatProductData_SeqAIJKokkos*>(C->product->data);
689 
690   if (pdata->reusesym) { /* We reached here through e.g., MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C), where symbolic/numeric are combined */
691     pdata->reusesym = PETSC_FALSE; /* So that next time when user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C), we still do numeric  */
692     PetscFunctionReturn(0);
693   }
694 
695   switch (product->type) {
696     case MATPRODUCT_AB:  transA = false; transB = false; break;
697     case MATPRODUCT_AtB: transA = true;  transB = false; break;
698     case MATPRODUCT_ABt: transA = false; transB = true;  break;
699     default:
700       SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
701   }
702 
703   A     = product->A;
704   B     = product->B;
705   PetscCall(MatSeqAIJKokkosSyncDevice(A));
706   PetscCall(MatSeqAIJKokkosSyncDevice(B));
707   akok  = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
708   bkok  = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
709   ckok  = static_cast<Mat_SeqAIJKokkos*>(C->spptr);
710 
711   PetscCheck(ckok,PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Device data structure spptr is empty");
712 
713   csrmatA = &akok->csrmat;
714   csrmatB = &bkok->csrmat;
715 
716   /* TODO: Once KK spgemm implements transpose, we can get rid of the explicit transpose here */
717   if (transA) {
718     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmatA));
719     transA = false;
720   }
721 
722   if (transB) {
723     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B,&csrmatB));
724     transB = false;
725   }
726   PetscCall(PetscLogGpuTimeBegin());
727   PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh,*csrmatA,transA,*csrmatB,transB,ckok->csrmat));
728   PetscCallCXX(KokkosKernels::sort_crs_matrix(ckok->csrmat)); /* without the sort, mat_tests-ex62_14_seqaijkokkos failed */
729   PetscCall(PetscLogGpuTimeEnd());
730   PetscCall(MatSeqAIJKokkosModifyDevice(C));
731   /* shorter version of MatAssemblyEnd_SeqAIJ */
732   c = (Mat_SeqAIJ*)C->data;
733   PetscCall(PetscInfo(C,"Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n",C->rmap->n,C->cmap->n,c->nz));
734   PetscCall(PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n"));
735   PetscCall(PetscInfo(C,"Maximum nonzeros in any row is %" PetscInt_FMT "\n",c->rmax));
736   c->reallocs         = 0;
737   C->info.mallocs     = 0;
738   C->info.nz_unneeded = 0;
739   C->assembled        = C->was_assembled = PETSC_TRUE;
740   C->num_ass++;
741   PetscFunctionReturn(0);
742 }
743 
744 static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C)
745 {
746   Mat_Product                    *product = C->product;
747   MatProductType                 ptype;
748   Mat                            A,B;
749   bool                           transA,transB;
750   Mat_SeqAIJKokkos               *akok,*bkok,*ckok;
751   MatProductData_SeqAIJKokkos    *pdata;
752   MPI_Comm                       comm;
753   KokkosCsrMatrix                *csrmatA,*csrmatB,csrmatC;
754 
755   PetscFunctionBegin;
756   MatCheckProduct(C,1);
757   PetscCall(PetscObjectGetComm((PetscObject)C,&comm));
758   PetscCheck(!product->data,comm,PETSC_ERR_PLIB,"Product data not empty");
759   A       = product->A;
760   B       = product->B;
761   PetscCall(MatSeqAIJKokkosSyncDevice(A));
762   PetscCall(MatSeqAIJKokkosSyncDevice(B));
763   akok    = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
764   bkok    = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
765   csrmatA = &akok->csrmat;
766   csrmatB = &bkok->csrmat;
767 
768   ptype   = product->type;
769   switch (ptype) {
770     case MATPRODUCT_AB:  transA = false; transB = false; break;
771     case MATPRODUCT_AtB: transA = true;  transB = false; break;
772     case MATPRODUCT_ABt: transA = false; transB = true;  break;
773     default:
774       SETERRQ(comm,PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
775   }
776 
777   product->data = pdata = new MatProductData_SeqAIJKokkos();
778   pdata->kh.set_team_work_size(16);
779   pdata->kh.set_dynamic_scheduling(true);
780   pdata->reusesym = product->api_user;
781 
782   /* TODO: add command line options to select spgemm algorithms */
783   auto spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_KK;
784   /* CUDA-10.2's spgemm has bugs. As as of 2022-01-19, KK does not support CUDA-11.x's newer spgemm API.
785      We can default to SPGEMMAlgorithm::SPGEMM_CUSPARSE with CUDA-11+ when KK adds the support.
786    */
787   pdata->kh.create_spgemm_handle(spgemm_alg);
788 
789   PetscCall(PetscLogGpuTimeBegin());
790   /* TODO: Get rid of the explicit transpose once KK-spgemm implements the transpose option */
791   if (transA) {
792     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmatA));
793     transA = false;
794   }
795 
796   if (transB) {
797     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B,&csrmatB));
798     transB = false;
799   }
800 
801   PetscCallCXX(KokkosSparse::spgemm_symbolic(pdata->kh,*csrmatA,transA,*csrmatB,transB,csrmatC));
802   /* spgemm_symbolic() only populates C's rowmap, but not C's column indices.
803     So we have to do a fake spgemm_numeric() here to get csrmatC.j_d setup, before
804     calling new Mat_SeqAIJKokkos().
805     TODO: Remove the fake spgemm_numeric() after KK fixed this problem.
806   */
807   PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh,*csrmatA,transA,*csrmatB,transB,csrmatC));
808   PetscCallCXX(KokkosKernels::sort_crs_matrix(csrmatC));
809   PetscCall(PetscLogGpuTimeEnd());
810 
811   PetscCallCXX(ckok = new Mat_SeqAIJKokkos(csrmatC));
812   PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(C,ckok));
813   C->product->destroy = MatProductDataDestroy_SeqAIJKokkos;
814   PetscFunctionReturn(0);
815 }
816 
817 /* handles sparse matrix matrix ops */
818 static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat)
819 {
820   Mat_Product    *product = mat->product;
821   PetscBool      Biskok = PETSC_FALSE,Ciskok = PETSC_TRUE;
822 
823   PetscFunctionBegin;
824   MatCheckProduct(mat,1);
825   PetscCall(PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJKOKKOS,&Biskok));
826   if (product->type == MATPRODUCT_ABC) {
827     PetscCall(PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJKOKKOS,&Ciskok));
828   }
829   if (Biskok && Ciskok) {
830     switch (product->type) {
831     case MATPRODUCT_AB:
832     case MATPRODUCT_AtB:
833     case MATPRODUCT_ABt:
834       mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos;
835       break;
836     case MATPRODUCT_PtAP:
837     case MATPRODUCT_RARt:
838     case MATPRODUCT_ABC:
839       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
840       break;
841     default:
842       break;
843     }
844   } else { /* fallback for AIJ */
845     PetscCall(MatProductSetFromOptions_SeqAIJ(mat));
846   }
847   PetscFunctionReturn(0);
848 }
849 
850 static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a)
851 {
852   Mat_SeqAIJKokkos *aijkok;
853 
854   PetscFunctionBegin;
855   PetscCall(PetscLogGpuTimeBegin());
856   PetscCall(MatSeqAIJKokkosSyncDevice(A));
857   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
858   KokkosBlas::scal(aijkok->a_dual.view_device(),a,aijkok->a_dual.view_device());
859   PetscCall(MatSeqAIJKokkosModifyDevice(A));
860   PetscCall(PetscLogGpuFlops(aijkok->a_dual.extent(0)));
861   PetscCall(PetscLogGpuTimeEnd());
862   PetscFunctionReturn(0);
863 }
864 
865 static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A)
866 {
867   Mat_SeqAIJKokkos *aijkok;
868 
869   PetscFunctionBegin;
870   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
871   if (aijkok) { /* Only zero the device if data is already there */
872     KokkosBlas::fill(aijkok->a_dual.view_device(),0.0);
873     PetscCall(MatSeqAIJKokkosModifyDevice(A));
874   } else { /* Might be preallocated but not assembled */
875     PetscCall(MatZeroEntries_SeqAIJ(A));
876   }
877   PetscFunctionReturn(0);
878 }
879 
880 /* Get a Kokkos View from a mat of type MatSeqAIJKokkos */
881 PetscErrorCode MatSeqAIJGetKokkosView(Mat A,ConstMatScalarKokkosView* kv)
882 {
883   Mat_SeqAIJKokkos   *aijkok;
884 
885   PetscFunctionBegin;
886   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
887   PetscValidPointer(kv,2);
888   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
889   PetscCall(MatSeqAIJKokkosSyncDevice(A));
890   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
891   *kv    = aijkok->a_dual.view_device();
892   PetscFunctionReturn(0);
893 }
894 
895 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A,ConstMatScalarKokkosView* kv)
896 {
897   PetscFunctionBegin;
898   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
899   PetscValidPointer(kv,2);
900   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
901   PetscFunctionReturn(0);
902 }
903 
904 PetscErrorCode MatSeqAIJGetKokkosView(Mat A,MatScalarKokkosView* kv)
905 {
906   Mat_SeqAIJKokkos   *aijkok;
907 
908   PetscFunctionBegin;
909   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
910   PetscValidPointer(kv,2);
911   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
912   PetscCall(MatSeqAIJKokkosSyncDevice(A));
913   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
914   *kv    = aijkok->a_dual.view_device();
915   PetscFunctionReturn(0);
916 }
917 
918 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A,MatScalarKokkosView* kv)
919 {
920   PetscFunctionBegin;
921   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
922   PetscValidPointer(kv,2);
923   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
924   PetscCall(MatSeqAIJKokkosModifyDevice(A));
925   PetscFunctionReturn(0);
926 }
927 
928 PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A,MatScalarKokkosView* kv)
929 {
930   Mat_SeqAIJKokkos   *aijkok;
931 
932   PetscFunctionBegin;
933   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
934   PetscValidPointer(kv,2);
935   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
936   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
937   *kv    = aijkok->a_dual.view_device();
938   PetscFunctionReturn(0);
939 }
940 
941 PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A,MatScalarKokkosView* kv)
942 {
943   PetscFunctionBegin;
944   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
945   PetscValidPointer(kv,2);
946   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
947   PetscCall(MatSeqAIJKokkosModifyDevice(A));
948   PetscFunctionReturn(0);
949 }
950 
951 /* Computes Y += alpha X */
952 static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y,PetscScalar alpha,Mat X,MatStructure pattern)
953 {
954   Mat_SeqAIJ                 *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data;
955   Mat_SeqAIJKokkos           *xkok,*ykok,*zkok;
956   ConstMatScalarKokkosView   Xa;
957   MatScalarKokkosView        Ya;
958 
959   PetscFunctionBegin;
960   PetscCheckTypeName(Y,MATSEQAIJKOKKOS);
961   PetscCheckTypeName(X,MATSEQAIJKOKKOS);
962   PetscCall(MatSeqAIJKokkosSyncDevice(Y));
963   PetscCall(MatSeqAIJKokkosSyncDevice(X));
964   PetscCall(PetscLogGpuTimeBegin());
965 
966   if (pattern != SAME_NONZERO_PATTERN && x->nz == y->nz) {
967     /* We could compare on device, but have to get the comparison result on host. So compare on host instead. */
968     PetscBool e;
969     PetscCall(PetscArraycmp(x->i,y->i,Y->rmap->n+1,&e));
970     if (e) {
971       PetscCall(PetscArraycmp(x->j,y->j,y->nz,&e));
972       if (e) pattern = SAME_NONZERO_PATTERN;
973     }
974   }
975 
976   /* cusparseDcsrgeam2() computes C = alpha A + beta B. If one knew sparsity pattern of C, one can skip
977     cusparseScsrgeam2_bufferSizeExt() / cusparseXcsrgeam2Nnz(), and directly call cusparseScsrgeam2().
978     If X is SUBSET_NONZERO_PATTERN of Y, we could take advantage of this cusparse feature. However,
979     KokkosSparse::spadd(alpha,A,beta,B,C) has symbolic and numeric phases, MatAXPY does not.
980   */
981   ykok = static_cast<Mat_SeqAIJKokkos*>(Y->spptr);
982   xkok = static_cast<Mat_SeqAIJKokkos*>(X->spptr);
983   Xa   = xkok->a_dual.view_device();
984   Ya   = ykok->a_dual.view_device();
985 
986   if (pattern == SAME_NONZERO_PATTERN) {
987     KokkosBlas::axpy(alpha,Xa,Ya);
988     PetscCall(MatSeqAIJKokkosModifyDevice(Y));
989   } else if (pattern == SUBSET_NONZERO_PATTERN) {
990     MatRowMapKokkosView  Xi = xkok->i_dual.view_device(),Yi = ykok->i_dual.view_device();
991     MatColIdxKokkosView  Xj = xkok->j_dual.view_device(),Yj = ykok->j_dual.view_device();
992 
993     Kokkos::parallel_for(Kokkos::TeamPolicy<>(Y->rmap->n, 1),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) {
994       PetscInt i = t.league_rank(); /* row i */
995       Kokkos::single(Kokkos::PerTeam(t), [=] () { /* Only one thread works in a team */
996         PetscInt p,q = Yi(i);
997         for (p=Xi(i); p<Xi(i+1); p++) { /* For each nonzero on row i of X */
998           while (Xj(p) != Yj(q) && q < Yi(i+1)) q++; /* find the matching nonzero on row i of Y */
999           if (Xj(p) == Yj(q)) { /* Found it */
1000             Ya(q) += alpha * Xa(p);
1001             q++;
1002           } else {
1003             /* If not found, it indicates the input is wrong (X is not a SUBSET_NONZERO_PATTERN of Y).
1004                Just insert a NaN at the beginning of row i if it is not empty, to make the result wrong.
1005             */
1006             if (Yi(i) != Yi(i+1)) Ya(Yi(i)) = Kokkos::Experimental::nan("1"); /* auto promote the double NaN if needed */
1007           }
1008         }
1009       });
1010     });
1011     PetscCall(MatSeqAIJKokkosModifyDevice(Y));
1012   } else { /* different nonzero patterns */
1013     Mat             Z;
1014     KokkosCsrMatrix zcsr;
1015     KernelHandle    kh;
1016     kh.create_spadd_handle(false);
1017     KokkosSparse::spadd_symbolic(&kh,xkok->csrmat,ykok->csrmat,zcsr);
1018     KokkosSparse::spadd_numeric(&kh,alpha,xkok->csrmat,(PetscScalar)1.0,ykok->csrmat,zcsr);
1019     zkok = new Mat_SeqAIJKokkos(zcsr);
1020     PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,zkok,&Z));
1021     PetscCall(MatHeaderReplace(Y,&Z));
1022     kh.destroy_spadd_handle();
1023   }
1024   PetscCall(PetscLogGpuTimeEnd());
1025   PetscCall(PetscLogGpuFlops(xkok->a_dual.extent(0)*2)); /* Because we scaled X and then added it to Y */
1026   PetscFunctionReturn(0);
1027 }
1028 
1029 static PetscErrorCode MatSetPreallocationCOO_SeqAIJKokkos(Mat mat, PetscCount coo_n, const PetscInt coo_i[], const PetscInt coo_j[])
1030 {
1031   Mat_SeqAIJKokkos *akok;
1032   Mat_SeqAIJ       *aseq;
1033 
1034   PetscFunctionBegin;
1035   PetscCall(MatSetPreallocationCOO_SeqAIJ(mat,coo_n,coo_i,coo_j));
1036   aseq = static_cast<Mat_SeqAIJ*>(mat->data);
1037   akok = static_cast<Mat_SeqAIJKokkos*>(mat->spptr);
1038   delete akok;
1039   mat->spptr = akok = new Mat_SeqAIJKokkos(mat->rmap->n,mat->cmap->n,aseq->nz,aseq->i,aseq->j,aseq->a,mat->nonzerostate+1,PETSC_FALSE);
1040   PetscCall(MatZeroEntries_SeqAIJKokkos(mat));
1041   akok->SetUpCOO(aseq);
1042   PetscFunctionReturn(0);
1043 }
1044 
1045 static PetscErrorCode MatSetValuesCOO_SeqAIJKokkos(Mat A,const PetscScalar v[],InsertMode imode)
1046 {
1047   Mat_SeqAIJ                  *aseq = static_cast<Mat_SeqAIJ*>(A->data);
1048   Mat_SeqAIJKokkos            *akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
1049   PetscCount                  Annz = aseq->nz;
1050   const PetscCountKokkosView& jmap = akok->jmap_d;
1051   const PetscCountKokkosView& perm = akok->perm_d;
1052   MatScalarKokkosView         Aa;
1053   ConstMatScalarKokkosView    kv;
1054   PetscMemType                memtype;
1055 
1056   PetscFunctionBegin;
1057   PetscCall(PetscGetMemType(v,&memtype));
1058   if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */
1059     kv = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(),ConstMatScalarKokkosViewHost(v,aseq->coo_n));
1060   } else {
1061     kv = ConstMatScalarKokkosView(v,aseq->coo_n); /* Directly use v[]'s memory */
1062   }
1063 
1064   if (imode == INSERT_VALUES) PetscCall(MatSeqAIJGetKokkosViewWrite(A,&Aa)); /* write matrix values */
1065   else PetscCall(MatSeqAIJGetKokkosView(A,&Aa)); /* read & write matrix values */
1066 
1067   Kokkos::parallel_for(Annz,KOKKOS_LAMBDA(const PetscCount i) {
1068     PetscScalar sum = 0.0;
1069     for (PetscCount k=jmap(i); k<jmap(i+1); k++) sum += kv(perm(k));
1070     Aa(i) = (imode == INSERT_VALUES? 0.0 : Aa(i)) + sum;
1071   });
1072 
1073   if (imode == INSERT_VALUES) PetscCall(MatSeqAIJRestoreKokkosViewWrite(A,&Aa));
1074   else PetscCall(MatSeqAIJRestoreKokkosView(A,&Aa));
1075   PetscFunctionReturn(0);
1076 }
1077 
1078 PETSC_INTERN PetscErrorCode MatSeqAIJMoveDiagonalValuesFront_SeqAIJKokkos(Mat A,const PetscInt *diag)
1079 {
1080   Mat_SeqAIJKokkos            *akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
1081   MatScalarKokkosView         Aa;
1082   const MatRowMapKokkosView&  Ai = akok->i_dual.view_device();
1083   PetscInt                    m = A->rmap->n;
1084   ConstMatRowMapKokkosView    Adiag(diag,m); /* diag is a device pointer */
1085 
1086   PetscFunctionBegin;
1087   PetscCall(MatSeqAIJGetKokkosViewWrite(A,&Aa));
1088   Kokkos::parallel_for(m,KOKKOS_LAMBDA(const PetscInt i) {
1089     PetscScalar tmp;
1090     if (Adiag(i) >= Ai(i) && Adiag(i) < Ai(i+1)) { /* The diagonal element exists */
1091       tmp          = Aa(Ai(i));
1092       Aa(Ai(i))    = Aa(Adiag(i));
1093       Aa(Adiag(i)) = tmp;
1094     }
1095   });
1096   PetscCall(MatSeqAIJRestoreKokkosViewWrite(A,&Aa));
1097   PetscFunctionReturn(0);
1098 }
1099 
1100 static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B,Mat A,const MatFactorInfo *info)
1101 {
1102   PetscFunctionBegin;
1103   PetscCall(MatSeqAIJKokkosSyncHost(A));
1104   PetscCall(MatLUFactorNumeric_SeqAIJ(B,A,info));
1105   B->offloadmask = PETSC_OFFLOAD_CPU;
1106   PetscFunctionReturn(0);
1107 }
1108 
1109 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A)
1110 {
1111   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
1112 
1113   PetscFunctionBegin;
1114   A->offloadmask = PETSC_OFFLOAD_KOKKOS; /* We do not really use this flag */
1115   A->boundtocpu  = PETSC_FALSE;
1116 
1117   A->ops->assemblyend               = MatAssemblyEnd_SeqAIJKokkos;
1118   A->ops->destroy                   = MatDestroy_SeqAIJKokkos;
1119   A->ops->duplicate                 = MatDuplicate_SeqAIJKokkos;
1120   A->ops->axpy                      = MatAXPY_SeqAIJKokkos;
1121   A->ops->scale                     = MatScale_SeqAIJKokkos;
1122   A->ops->zeroentries               = MatZeroEntries_SeqAIJKokkos;
1123   A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJKokkos;
1124   A->ops->mult                      = MatMult_SeqAIJKokkos;
1125   A->ops->multadd                   = MatMultAdd_SeqAIJKokkos;
1126   A->ops->multtranspose             = MatMultTranspose_SeqAIJKokkos;
1127   A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJKokkos;
1128   A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJKokkos;
1129   A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos;
1130   A->ops->productnumeric            = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos;
1131   A->ops->transpose                 = MatTranspose_SeqAIJKokkos;
1132   A->ops->setoption                 = MatSetOption_SeqAIJKokkos;
1133   a->ops->getarray                  = MatSeqAIJGetArray_SeqAIJKokkos;
1134   a->ops->restorearray              = MatSeqAIJRestoreArray_SeqAIJKokkos;
1135   a->ops->getarrayread              = MatSeqAIJGetArrayRead_SeqAIJKokkos;
1136   a->ops->restorearrayread          = MatSeqAIJRestoreArrayRead_SeqAIJKokkos;
1137   a->ops->getarraywrite             = MatSeqAIJGetArrayWrite_SeqAIJKokkos;
1138   a->ops->restorearraywrite         = MatSeqAIJRestoreArrayWrite_SeqAIJKokkos;
1139   a->ops->getcsrandmemtype          = MatSeqAIJGetCSRAndMemType_SeqAIJKokkos;
1140 
1141   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJKokkos));
1142   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_SeqAIJKokkos));
1143   PetscFunctionReturn(0);
1144 }
1145 
1146 PETSC_INTERN PetscErrorCode  MatSetSeqAIJKokkosWithCSRMatrix(Mat A,Mat_SeqAIJKokkos *akok)
1147 {
1148   Mat_SeqAIJ *aseq;
1149   PetscInt    i,m,n;
1150 
1151   PetscFunctionBegin;
1152   PetscCheck(!A->spptr,PETSC_COMM_SELF,PETSC_ERR_PLIB,"A->spptr is supposed to be empty");
1153 
1154   m = akok->nrows();
1155   n = akok->ncols();
1156   PetscCall(MatSetSizes(A,m,n,m,n));
1157   PetscCall(MatSetType(A,MATSEQAIJKOKKOS));
1158 
1159   /* Set up data structures of A as a MATSEQAIJ */
1160   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(A,MAT_SKIP_ALLOCATION,NULL));
1161   aseq = (Mat_SeqAIJ*)(A)->data;
1162 
1163   akok->i_dual.sync_host(); /* We always need sync'ed i, j on host */
1164   akok->j_dual.sync_host();
1165 
1166   aseq->i            = akok->i_host_data();
1167   aseq->j            = akok->j_host_data();
1168   aseq->a            = akok->a_host_data();
1169   aseq->nonew        = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/
1170   aseq->singlemalloc = PETSC_FALSE;
1171   aseq->free_a       = PETSC_FALSE;
1172   aseq->free_ij      = PETSC_FALSE;
1173   aseq->nz           = akok->nnz();
1174   aseq->maxnz        = aseq->nz;
1175 
1176   PetscCall(PetscMalloc1(m,&aseq->imax));
1177   PetscCall(PetscMalloc1(m,&aseq->ilen));
1178   for (i=0; i<m; i++) {
1179     aseq->ilen[i] = aseq->imax[i] = aseq->i[i+1] - aseq->i[i];
1180   }
1181 
1182   /* It is critical to set the nonzerostate, as we use it to check if sparsity pattern (hence data) has changed on host in MatAssemblyEnd */
1183   akok->nonzerostate = A->nonzerostate;
1184   A->spptr = akok; /* Set A->spptr before MatAssembly so that A->spptr won't be allocated again there */
1185   PetscCall(MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY));
1186   PetscCall(MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY));
1187   PetscFunctionReturn(0);
1188 }
1189 
1190 /* Crete a SEQAIJKOKKOS matrix with a Mat_SeqAIJKokkos data structure
1191 
1192    Note we have names like MatSeqAIJSetPreallocationCSR, so I use capitalized CSR
1193  */
1194 PETSC_INTERN PetscErrorCode  MatCreateSeqAIJKokkosWithCSRMatrix(MPI_Comm comm,Mat_SeqAIJKokkos *akok,Mat *A)
1195 {
1196   PetscFunctionBegin;
1197   PetscCall(MatCreate(comm,A));
1198   PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A,akok));
1199   PetscFunctionReturn(0);
1200 }
1201 
1202 /* --------------------------------------------------------------------------------*/
1203 /*@C
1204    MatCreateSeqAIJKokkos - Creates a sparse matrix in AIJ (compressed row) format
1205    (the default parallel PETSc format). This matrix will ultimately be handled by
1206    Kokkos for calculations. For good matrix
1207    assembly performance the user should preallocate the matrix storage by setting
1208    the parameter nz (or the array nnz).  By setting these parameters accurately,
1209    performance during matrix assembly can be increased by more than a factor of 50.
1210 
1211    Collective
1212 
1213    Input Parameters:
1214 +  comm - MPI communicator, set to PETSC_COMM_SELF
1215 .  m - number of rows
1216 .  n - number of columns
1217 .  nz - number of nonzeros per row (same for all rows)
1218 -  nnz - array containing the number of nonzeros in the various rows
1219          (possibly different for each row) or NULL
1220 
1221    Output Parameter:
1222 .  A - the matrix
1223 
1224    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
1225    MatXXXXSetPreallocation() paradgm instead of this routine directly.
1226    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
1227 
1228    Notes:
1229    If nnz is given then nz is ignored
1230 
1231    The AIJ format (also called the Yale sparse matrix format or
1232    compressed row storage), is fully compatible with standard Fortran 77
1233    storage.  That is, the stored row and column indices can begin at
1234    either one (as in Fortran) or zero.  See the users' manual for details.
1235 
1236    Specify the preallocated storage with either nz or nnz (not both).
1237    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
1238    allocation.  For large problems you MUST preallocate memory or you
1239    will get TERRIBLE performance, see the users' manual chapter on matrices.
1240 
1241    By default, this format uses inodes (identical nodes) when possible, to
1242    improve numerical efficiency of matrix-vector products and solves. We
1243    search for consecutive rows with the same nonzero structure, thereby
1244    reusing matrix information to achieve increased efficiency.
1245 
1246    Level: intermediate
1247 
1248 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ()
1249 @*/
1250 PetscErrorCode  MatCreateSeqAIJKokkos(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
1251 {
1252   PetscFunctionBegin;
1253   PetscCall(PetscKokkosInitializeCheck());
1254   PetscCall(MatCreate(comm,A));
1255   PetscCall(MatSetSizes(*A,m,n,m,n));
1256   PetscCall(MatSetType(*A,MATSEQAIJKOKKOS));
1257   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz));
1258   PetscFunctionReturn(0);
1259 }
1260 
1261 typedef Kokkos::TeamPolicy<>::member_type team_member;
1262 //
1263 // This factorization exploits block diagonal matrices with "Nf" (not used).
1264 // Use -pc_factor_mat_ordering_type rcm to order decouple blocks of size N/Nf for this optimization
1265 //
1266 static PetscErrorCode MatLUFactorNumeric_SeqAIJKOKKOSDEVICE(Mat B,Mat A,const MatFactorInfo *info)
1267 {
1268   Mat_SeqAIJ         *b=(Mat_SeqAIJ*)B->data;
1269   Mat_SeqAIJKokkos   *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr), *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
1270   IS                 isrow = b->row,isicol = b->icol;
1271   const PetscInt     *r_h,*ic_h;
1272   const PetscInt     n=A->rmap->n, *ai_d=aijkok->i_dual.view_device().data(), *aj_d=aijkok->j_dual.view_device().data(), *bi_d=baijkok->i_dual.view_device().data(), *bj_d=baijkok->j_dual.view_device().data(), *bdiag_d = baijkok->diag_d.data();
1273   const PetscScalar  *aa_d = aijkok->a_dual.view_device().data();
1274   PetscScalar        *ba_d = baijkok->a_dual.view_device().data();
1275   PetscBool          row_identity,col_identity;
1276   PetscInt           nc, Nf=1, nVec=32; // should be a parameter, Nf is batch size - not used
1277 
1278   PetscFunctionBegin;
1279   PetscCheck(A->rmap->n == n,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"square matrices only supported %" PetscInt_FMT " %" PetscInt_FMT,A->rmap->n,n);
1280   PetscCall(MatGetOption(A,MAT_STRUCTURALLY_SYMMETRIC,&row_identity));
1281   PetscCheck(row_identity,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"structurally symmetric matrices only supported");
1282   PetscCall(ISGetIndices(isrow,&r_h));
1283   PetscCall(ISGetIndices(isicol,&ic_h));
1284   PetscCall(ISGetSize(isicol,&nc));
1285   PetscCall(PetscLogGpuTimeBegin());
1286   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1287   {
1288 #define KOKKOS_SHARED_LEVEL 1
1289     using scr_mem_t = Kokkos::DefaultExecutionSpace::scratch_memory_space;
1290     using sizet_scr_t = Kokkos::View<size_t, scr_mem_t>;
1291     using scalar_scr_t = Kokkos::View<PetscScalar, scr_mem_t>;
1292     const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_r_k (r_h, n);
1293     Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_r_k ("r", n);
1294     const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_ic_k (ic_h, nc);
1295     Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_ic_k ("ic", nc);
1296     size_t flops_h = 0.0;
1297     Kokkos::View<size_t, Kokkos::HostSpace> h_flops_k (&flops_h);
1298     Kokkos::View<size_t> d_flops_k ("flops");
1299     const int conc = Kokkos::DefaultExecutionSpace().concurrency(), team_size = conc > 1 ? 16 : 1; // 8*32 = 256
1300     const int nloc = n/Nf, Ni = (conc > 8) ? 1 /* some intelegent number of SMs -- but need league_barrier */ : 1;
1301     Kokkos::deep_copy (d_flops_k, h_flops_k);
1302     Kokkos::deep_copy (d_r_k, h_r_k);
1303     Kokkos::deep_copy (d_ic_k, h_ic_k);
1304     // Fill A --> fact
1305     Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec), KOKKOS_LAMBDA (const team_member team) {
1306         const PetscInt  field = team.league_rank()/Ni, field_block = team.league_rank()%Ni; // use grid.x/y in CUDA
1307         const PetscInt  nloc_i =  (nloc/Ni + !!(nloc%Ni)), start_i = field*nloc + field_block*nloc_i, end_i = (start_i + nloc_i) > (field+1)*nloc ? (field+1)*nloc : (start_i + nloc_i);
1308         const PetscInt  *ic = d_ic_k.data(), *r = d_r_k.data();
1309         // zero rows of B
1310         Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) {
1311             PetscInt    nzbL = bi_d[rowb+1] - bi_d[rowb], nzbU = bdiag_d[rowb] - bdiag_d[rowb+1]; // with diag
1312             PetscScalar *baL = ba_d + bi_d[rowb];
1313             PetscScalar *baU = ba_d + bdiag_d[rowb+1]+1;
1314             /* zero (unfactored row) */
1315             for (int j=0;j<nzbL;j++) baL[j] = 0;
1316             for (int j=0;j<nzbU;j++) baU[j] = 0;
1317           });
1318         // copy A into B
1319         Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) {
1320             PetscInt          rowa = r[rowb], nza = ai_d[rowa+1] - ai_d[rowa];
1321             const PetscScalar *av    = aa_d + ai_d[rowa];
1322             const PetscInt    *ajtmp = aj_d + ai_d[rowa];
1323             /* load in initial (unfactored row) */
1324             for (int j=0;j<nza;j++) {
1325               PetscInt    colb = ic[ajtmp[j]];
1326               PetscScalar vala = av[j];
1327               if (colb == rowb) {
1328                 *(ba_d + bdiag_d[rowb]) = vala;
1329               } else {
1330                 const PetscInt    *pbj = bj_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]);
1331                 PetscScalar       *pba = ba_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]);
1332                 PetscInt          nz   = (colb > rowb) ? bdiag_d[rowb] - (bdiag_d[rowb+1]+1) : bi_d[rowb+1] - bi_d[rowb], set=0;
1333                 for (int j=0; j<nz ; j++) {
1334                   if (pbj[j] == colb) {
1335                     pba[j] = vala;
1336                     set++;
1337                     break;
1338                   }
1339                 }
1340                #if !defined(PETSC_HAVE_SYCL)
1341                 if (set!=1) printf("\t\t\t ERROR DID NOT SET ?????\n");
1342                #endif
1343               }
1344             }
1345           });
1346       });
1347     Kokkos::fence();
1348 
1349     Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec).set_scratch_size(KOKKOS_SHARED_LEVEL, Kokkos::PerThread(sizet_scr_t::shmem_size()+scalar_scr_t::shmem_size()), Kokkos::PerTeam(sizet_scr_t::shmem_size())), KOKKOS_LAMBDA (const team_member team) {
1350         sizet_scr_t     colkIdx(team.thread_scratch(KOKKOS_SHARED_LEVEL));
1351         scalar_scr_t    L_ki(team.thread_scratch(KOKKOS_SHARED_LEVEL));
1352         sizet_scr_t     flops(team.team_scratch(KOKKOS_SHARED_LEVEL));
1353         const PetscInt  field = team.league_rank()/Ni, field_block_idx = team.league_rank()%Ni; // use grid.x/y in CUDA
1354         const PetscInt  start = field*nloc, end = start + nloc;
1355         Kokkos::single(Kokkos::PerTeam(team), [=]() { flops() = 0; });
1356         // A22 panel update for each row A(1,:) and col A(:,1)
1357         for (int ii=start; ii<end-1; ii++) {
1358           const PetscInt    *bjUi = bj_d + bdiag_d[ii+1]+1, nzUi = bdiag_d[ii] - (bdiag_d[ii+1]+1); // vector, and vector size, of column indices of U(i,(i+1):end)
1359           const PetscScalar *baUi = ba_d + bdiag_d[ii+1]+1; // vector of data  U(i,i+1:end)
1360           const PetscInt    nUi_its = nzUi/Ni + !!(nzUi%Ni);
1361           const PetscScalar Bii = *(ba_d + bdiag_d[ii]); // diagonal in its special place
1362           Kokkos::parallel_for(Kokkos::TeamThreadRange(team, nUi_its), [=] (const int j) {
1363               PetscInt kIdx = j*Ni + field_block_idx;
1364               if (kIdx >= nzUi) /* void */ ;
1365               else {
1366                 const PetscInt myk  = bjUi[kIdx]; // assume symmetric structure, need a transposed meta-data here in general
1367                 const PetscInt *pjL = bj_d + bi_d[myk]; // look for L(myk,ii) in start of row
1368                 const PetscInt nzL  = bi_d[myk+1] - bi_d[myk]; // size of L_k(:)
1369                 size_t         st_idx;
1370                 // find and do L(k,i) = A(:k,i) / A(i,i)
1371                 Kokkos::single(Kokkos::PerThread(team), [&]() { colkIdx() = PETSC_MAX_INT; });
1372                 // get column, there has got to be a better way
1373                 Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,nzL), [&] (const int &j, size_t &idx) {
1374                     if (pjL[j] == ii) {
1375                       PetscScalar *pLki = ba_d + bi_d[myk] + j;
1376                       idx = j; // output
1377                       *pLki = *pLki/Bii; // column scaling:  L(k,i) = A(:k,i) / A(i,i)
1378                     }
1379                 }, st_idx);
1380                 Kokkos::single(Kokkos::PerThread(team), [=]() { colkIdx() = st_idx; L_ki() = *(ba_d + bi_d[myk] + st_idx); });
1381 #if defined(PETSC_USE_DEBUG) && !defined(PETSC_HAVE_SYCL)
1382                 if (colkIdx() == PETSC_MAX_INT) printf("\t\t\t\t\t\t\tERROR: failed to find L_ki(%d,%d)\n",(int)myk,ii); // uses a register
1383 #endif
1384                 // active row k, do  A_kj -= Lki * U_ij; j \in U(i,:) j != i
1385                 // U(i+1,:end)
1386                 Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,nzUi), [=] (const int &uiIdx) { // index into i (U)
1387                       PetscScalar Uij = baUi[uiIdx];
1388                       PetscInt    col = bjUi[uiIdx];
1389                       if (col==myk) {
1390                         // A_kk = A_kk - L_ki * U_ij(k)
1391                         PetscScalar *Akkv = (ba_d + bdiag_d[myk]); // diagonal in its special place
1392                         *Akkv = *Akkv - L_ki() * Uij; // UiK
1393                       } else {
1394                         PetscScalar    *start, *end, *pAkjv=NULL;
1395                         PetscInt       high, low;
1396                         const PetscInt *startj;
1397                         if (col<myk) { // L
1398                           PetscScalar *pLki = ba_d + bi_d[myk] + colkIdx();
1399                           PetscInt idx = (pLki+1) - (ba_d + bi_d[myk]); // index into row
1400                           start = pLki+1; // start at pLki+1, A22(myk,1)
1401                           startj= bj_d + bi_d[myk] + idx;
1402                           end   = ba_d + bi_d[myk+1];
1403                         } else {
1404                           PetscInt idx = bdiag_d[myk+1]+1;
1405                           start = ba_d + idx;
1406                           startj= bj_d + idx;
1407                           end   = ba_d + bdiag_d[myk];
1408                         }
1409                         // search for 'col', use bisection search - TODO
1410                         low  = 0;
1411                         high = (PetscInt)(end-start);
1412                         while (high-low > 5) {
1413                           int t = (low+high)/2;
1414                           if (startj[t] > col) high = t;
1415                           else                 low  = t;
1416                         }
1417                         for (pAkjv=start+low; pAkjv<start+high; pAkjv++) {
1418                           if (startj[pAkjv-start] == col) break;
1419                         }
1420 #if defined(PETSC_USE_DEBUG) && !defined(PETSC_HAVE_SYCL)
1421                         if (pAkjv==start+high) printf("\t\t\t\t\t\t\t\t\t\t\tERROR: *** failed to find Akj(%d,%d)\n",(int)myk,(int)col); // uses a register
1422 #endif
1423                         *pAkjv = *pAkjv - L_ki() * Uij; // A_kj = A_kj - L_ki * U_ij
1424                       }
1425                     });
1426               }
1427             });
1428           team.team_barrier(); // this needs to be a league barrier to use more that one SM per block
1429           if (field_block_idx==0) Kokkos::single(Kokkos::PerTeam(team), [&]() { Kokkos::atomic_add( flops.data(), (size_t)(2*(nzUi*nzUi)+2)); });
1430         } /* endof for (i=0; i<n; i++) { */
1431         Kokkos::single(Kokkos::PerTeam(team), [=]() { Kokkos::atomic_add( &d_flops_k(), flops()); flops() = 0; });
1432       });
1433     Kokkos::fence();
1434     Kokkos::deep_copy (h_flops_k, d_flops_k);
1435     PetscCall(PetscLogGpuFlops((PetscLogDouble)h_flops_k()));
1436     Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, 1, 256), KOKKOS_LAMBDA (const team_member team) {
1437         const PetscInt  lg_rank = team.league_rank(), field = lg_rank/Ni; //, field_offset = lg_rank%Ni;
1438         const PetscInt  start = field*nloc, end = start + nloc, n_its = (nloc/Ni + !!(nloc%Ni)); // 1/Ni iters
1439         /* Invert diagonal for simpler triangular solves */
1440         Kokkos::parallel_for(Kokkos::TeamVectorRange(team, n_its), [=] (int outer_index) {
1441             int i = start + outer_index*Ni + lg_rank%Ni;
1442             if (i < end) {
1443               PetscScalar *pv = ba_d + bdiag_d[i];
1444               *pv = 1.0/(*pv);
1445             }
1446           });
1447       });
1448   }
1449   PetscCall(PetscLogGpuTimeEnd());
1450   PetscCall(ISRestoreIndices(isicol,&ic_h));
1451   PetscCall(ISRestoreIndices(isrow,&r_h));
1452 
1453   PetscCall(ISIdentity(isrow,&row_identity));
1454   PetscCall(ISIdentity(isicol,&col_identity));
1455   if (b->inode.size) {
1456     B->ops->solve = MatSolve_SeqAIJ_Inode;
1457   } else if (row_identity && col_identity) {
1458     B->ops->solve = MatSolve_SeqAIJ_NaturalOrdering;
1459   } else {
1460     B->ops->solve = MatSolve_SeqAIJ; // at least this needs to be in Kokkos
1461   }
1462   B->offloadmask = PETSC_OFFLOAD_GPU;
1463   PetscCall(MatSeqAIJKokkosSyncHost(B)); // solve on CPU
1464   B->ops->solveadd          = MatSolveAdd_SeqAIJ; // and this
1465   B->ops->solvetranspose    = MatSolveTranspose_SeqAIJ;
1466   B->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ;
1467   B->ops->matsolve          = MatMatSolve_SeqAIJ;
1468   B->assembled              = PETSC_TRUE;
1469   B->preallocated           = PETSC_TRUE;
1470 
1471   PetscFunctionReturn(0);
1472 }
1473 
1474 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
1475 {
1476   PetscFunctionBegin;
1477   PetscCall(MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info));
1478   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos;
1479   PetscFunctionReturn(0);
1480 }
1481 
1482 static PetscErrorCode MatSeqAIJKokkosSymbolicSolveCheck(Mat A)
1483 {
1484   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr;
1485 
1486   PetscFunctionBegin;
1487   if (!factors->sptrsv_symbolic_completed) {
1488     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khU,factors->iU_d,factors->jU_d,factors->aU_d);
1489     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khL,factors->iL_d,factors->jL_d,factors->aL_d);
1490     factors->sptrsv_symbolic_completed = PETSC_TRUE;
1491   }
1492   PetscFunctionReturn(0);
1493 }
1494 
1495 /* Check if we need to update factors etc for transpose solve */
1496 static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A)
1497 {
1498   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr;
1499   MatColIdxType             n = A->rmap->n;
1500 
1501   PetscFunctionBegin;
1502   if (!factors->transpose_updated) { /* TODO: KK needs to provide functions to do numeric transpose only */
1503     /* Update L^T and do sptrsv symbolic */
1504     factors->iLt_d = MatRowMapKokkosView("factors->iLt_d",n+1);
1505     Kokkos::deep_copy(factors->iLt_d,0); /* KK requires 0 */
1506     factors->jLt_d = MatColIdxKokkosView("factors->jLt_d",factors->jL_d.extent(0));
1507     factors->aLt_d = MatScalarKokkosView("factors->aLt_d",factors->aL_d.extent(0));
1508 
1509     KokkosKernels::Impl::transpose_matrix<
1510       ConstMatRowMapKokkosView,ConstMatColIdxKokkosView,ConstMatScalarKokkosView,
1511       MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView,
1512       MatRowMapKokkosView,DefaultExecutionSpace>(
1513         n,n,factors->iL_d,factors->jL_d,factors->aL_d,
1514         factors->iLt_d,factors->jLt_d,factors->aLt_d);
1515 
1516     /* TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices.
1517       We have to sort the indices, until KK provides finer control options.
1518     */
1519     KokkosKernels::sort_crs_matrix<DefaultExecutionSpace,
1520       MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView>(
1521         factors->iLt_d,factors->jLt_d,factors->aLt_d);
1522 
1523     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khLt,factors->iLt_d,factors->jLt_d,factors->aLt_d);
1524 
1525     /* Update U^T and do sptrsv symbolic */
1526     factors->iUt_d = MatRowMapKokkosView("factors->iUt_d",n+1);
1527     Kokkos::deep_copy(factors->iUt_d,0); /* KK requires 0 */
1528     factors->jUt_d = MatColIdxKokkosView("factors->jUt_d",factors->jU_d.extent(0));
1529     factors->aUt_d = MatScalarKokkosView("factors->aUt_d",factors->aU_d.extent(0));
1530 
1531     KokkosKernels::Impl::transpose_matrix<
1532       ConstMatRowMapKokkosView,ConstMatColIdxKokkosView,ConstMatScalarKokkosView,
1533       MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView,
1534       MatRowMapKokkosView,DefaultExecutionSpace>(
1535         n,n,factors->iU_d, factors->jU_d, factors->aU_d,
1536         factors->iUt_d,factors->jUt_d,factors->aUt_d);
1537 
1538     /* Sort indices. See comments above */
1539     KokkosKernels::sort_crs_matrix<DefaultExecutionSpace,
1540       MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView>(
1541         factors->iUt_d,factors->jUt_d,factors->aUt_d);
1542 
1543     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khUt,factors->iUt_d,factors->jUt_d,factors->aUt_d);
1544     factors->transpose_updated = PETSC_TRUE;
1545   }
1546   PetscFunctionReturn(0);
1547 }
1548 
1549 /* Solve Ax = b, with A = LU */
1550 static PetscErrorCode MatSolve_SeqAIJKokkos(Mat A,Vec b,Vec x)
1551 {
1552   ConstPetscScalarKokkosView     bv;
1553   PetscScalarKokkosView          xv;
1554   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr;
1555 
1556   PetscFunctionBegin;
1557   PetscCall(PetscLogGpuTimeBegin());
1558   PetscCall(MatSeqAIJKokkosSymbolicSolveCheck(A));
1559   PetscCall(VecGetKokkosView(b,&bv));
1560   PetscCall(VecGetKokkosViewWrite(x,&xv));
1561   /* Solve L tmpv = b */
1562   PetscCallCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khL,factors->iL_d,factors->jL_d,factors->aL_d,bv,factors->workVector));
1563   /* Solve Ux = tmpv */
1564   PetscCallCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khU,factors->iU_d,factors->jU_d,factors->aU_d,factors->workVector,xv));
1565   PetscCall(VecRestoreKokkosView(b,&bv));
1566   PetscCall(VecRestoreKokkosViewWrite(x,&xv));
1567   PetscCall(PetscLogGpuTimeEnd());
1568   PetscFunctionReturn(0);
1569 }
1570 
1571 /* Solve A^T x = b, where A^T = U^T L^T */
1572 static PetscErrorCode MatSolveTranspose_SeqAIJKokkos(Mat A,Vec b,Vec x)
1573 {
1574   ConstPetscScalarKokkosView     bv;
1575   PetscScalarKokkosView          xv;
1576   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr;
1577 
1578   PetscFunctionBegin;
1579   PetscCall(PetscLogGpuTimeBegin());
1580   PetscCall(MatSeqAIJKokkosTransposeSolveCheck(A));
1581   PetscCall(VecGetKokkosView(b,&bv));
1582   PetscCall(VecGetKokkosViewWrite(x,&xv));
1583   /* Solve U^T tmpv = b */
1584   KokkosSparse::Experimental::sptrsv_solve(&factors->khUt,factors->iUt_d,factors->jUt_d,factors->aUt_d,bv,factors->workVector);
1585 
1586   /* Solve L^T x = tmpv */
1587   KokkosSparse::Experimental::sptrsv_solve(&factors->khLt,factors->iLt_d,factors->jLt_d,factors->aLt_d,factors->workVector,xv);
1588   PetscCall(VecRestoreKokkosView(b,&bv));
1589   PetscCall(VecRestoreKokkosViewWrite(x,&xv));
1590   PetscCall(PetscLogGpuTimeEnd());
1591   PetscFunctionReturn(0);
1592 }
1593 
1594 static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos(Mat B,Mat A,const MatFactorInfo *info)
1595 {
1596   Mat_SeqAIJKokkos               *aijkok = (Mat_SeqAIJKokkos*)A->spptr;
1597   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)B->spptr;
1598   PetscInt                       fill_lev = info->levels;
1599 
1600   PetscFunctionBegin;
1601   PetscCall(PetscLogGpuTimeBegin());
1602   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1603 
1604   auto a_d = aijkok->a_dual.view_device();
1605   auto i_d = aijkok->i_dual.view_device();
1606   auto j_d = aijkok->j_dual.view_device();
1607 
1608   KokkosSparse::Experimental::spiluk_numeric(&factors->kh,fill_lev,i_d,j_d,a_d,factors->iL_d,factors->jL_d,factors->aL_d,factors->iU_d,factors->jU_d,factors->aU_d);
1609 
1610   B->assembled                       = PETSC_TRUE;
1611   B->preallocated                    = PETSC_TRUE;
1612   B->ops->solve                      = MatSolve_SeqAIJKokkos;
1613   B->ops->solvetranspose             = MatSolveTranspose_SeqAIJKokkos;
1614   B->ops->matsolve                   = NULL;
1615   B->ops->matsolvetranspose          = NULL;
1616   B->offloadmask                     = PETSC_OFFLOAD_GPU;
1617 
1618   /* Once the factors' value changed, we need to update their transpose and sptrsv handle */
1619   factors->transpose_updated         = PETSC_FALSE;
1620   factors->sptrsv_symbolic_completed = PETSC_FALSE;
1621   /* TODO: log flops, but how to know that? */
1622   PetscCall(PetscLogGpuTimeEnd());
1623   PetscFunctionReturn(0);
1624 }
1625 
1626 static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
1627 {
1628   Mat_SeqAIJKokkos               *aijkok;
1629   Mat_SeqAIJ                     *b;
1630   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)B->spptr;
1631   PetscInt                       fill_lev = info->levels;
1632   PetscInt                       nnzA = ((Mat_SeqAIJ*)A->data)->nz,nnzL,nnzU;
1633   PetscInt                       n = A->rmap->n;
1634 
1635   PetscFunctionBegin;
1636   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1637   /* Rebuild factors */
1638   if (factors) {factors->Destroy();} /* Destroy the old if it exists */
1639   else {B->spptr = factors = new Mat_SeqAIJKokkosTriFactors(n);}
1640 
1641   /* Create a spiluk handle and then do symbolic factorization */
1642   nnzL = nnzU = PetscRealIntMultTruncate(info->fill,nnzA);
1643   factors->kh.create_spiluk_handle(KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1,n,nnzL,nnzU);
1644 
1645   auto spiluk_handle = factors->kh.get_spiluk_handle();
1646 
1647   Kokkos::realloc(factors->iL_d,n+1); /* Free old arrays and realloc */
1648   Kokkos::realloc(factors->jL_d,spiluk_handle->get_nnzL());
1649   Kokkos::realloc(factors->iU_d,n+1);
1650   Kokkos::realloc(factors->jU_d,spiluk_handle->get_nnzU());
1651 
1652   aijkok = (Mat_SeqAIJKokkos*)A->spptr;
1653   auto i_d = aijkok->i_dual.view_device();
1654   auto j_d = aijkok->j_dual.view_device();
1655   KokkosSparse::Experimental::spiluk_symbolic(&factors->kh,fill_lev,i_d,j_d,factors->iL_d,factors->jL_d,factors->iU_d,factors->jU_d);
1656   /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */
1657 
1658   Kokkos::resize (factors->jL_d,spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */
1659   Kokkos::resize (factors->jU_d,spiluk_handle->get_nnzU());
1660   Kokkos::realloc(factors->aL_d,spiluk_handle->get_nnzL()); /* No need to retain old value */
1661   Kokkos::realloc(factors->aU_d,spiluk_handle->get_nnzU());
1662 
1663   /* TODO: add options to select sptrsv algorithms */
1664   /* Create sptrsv handles for L, U and their transpose */
1665  #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
1666   auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SPTRSV_CUSPARSE;
1667  #else
1668   auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1;
1669  #endif
1670 
1671   factors->khL.create_sptrsv_handle(sptrsv_alg,n,true/* L is lower tri */);
1672   factors->khU.create_sptrsv_handle(sptrsv_alg,n,false/* U is not lower tri */);
1673   factors->khLt.create_sptrsv_handle(sptrsv_alg,n,false/* L^T is not lower tri */);
1674   factors->khUt.create_sptrsv_handle(sptrsv_alg,n,true/* U^T is lower tri */);
1675 
1676   /* Fill fields of the factor matrix B */
1677   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(B,MAT_SKIP_ALLOCATION,NULL));
1678   b     = (Mat_SeqAIJ*)B->data;
1679   b->nz = b->maxnz = spiluk_handle->get_nnzL()+spiluk_handle->get_nnzU();
1680   B->info.fill_ratio_given  = info->fill;
1681   B->info.fill_ratio_needed = ((PetscReal)b->nz)/((PetscReal)nnzA);
1682 
1683   B->offloadmask            = PETSC_OFFLOAD_GPU;
1684   B->ops->lufactornumeric   = MatILUFactorNumeric_SeqAIJKokkos;
1685   PetscFunctionReturn(0);
1686 }
1687 
1688 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
1689 {
1690   Mat_SeqAIJ       *b=(Mat_SeqAIJ*)B->data;
1691   const PetscInt   nrows   = A->rmap->n;
1692 
1693   PetscFunctionBegin;
1694   PetscCall(MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info));
1695   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKOKKOSDEVICE;
1696   // move B data into Kokkos
1697   PetscCall(MatSeqAIJKokkosSyncDevice(B)); // create aijkok
1698   PetscCall(MatSeqAIJKokkosSyncDevice(A)); // create aijkok
1699   {
1700     Mat_SeqAIJKokkos *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
1701     if (!baijkok->diag_d.extent(0)) {
1702       const Kokkos::View<PetscInt*, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_diag (b->diag,nrows+1);
1703       baijkok->diag_d = Kokkos::View<PetscInt*>(Kokkos::create_mirror(DefaultMemorySpace(),h_diag));
1704       Kokkos::deep_copy (baijkok->diag_d, h_diag);
1705     }
1706   }
1707   PetscFunctionReturn(0);
1708 }
1709 
1710 static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos(Mat A,MatSolverType *type)
1711 {
1712   PetscFunctionBegin;
1713   *type = MATSOLVERKOKKOS;
1714   PetscFunctionReturn(0);
1715 }
1716 
1717 static PetscErrorCode MatFactorGetSolverType_seqaij_kokkos_device(Mat A,MatSolverType *type)
1718 {
1719   PetscFunctionBegin;
1720   *type = MATSOLVERKOKKOSDEVICE;
1721   PetscFunctionReturn(0);
1722 }
1723 
1724 /*MC
1725   MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices
1726   on a single GPU of type, SeqAIJKokkos, AIJKokkos.
1727 
1728   Level: beginner
1729 
1730 .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJKokkos(), MATAIJKOKKOS, MatKokkosSetFormat(), MatKokkosStorageFormat, MatKokkosFormatOperation
1731 M*/
1732 PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A,MatFactorType ftype,Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */
1733 {
1734   PetscInt       n = A->rmap->n;
1735 
1736   PetscFunctionBegin;
1737   PetscCall(MatCreate(PetscObjectComm((PetscObject)A),B));
1738   PetscCall(MatSetSizes(*B,n,n,n,n));
1739   (*B)->factortype = ftype;
1740   PetscCall(PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]));
1741   PetscCall(MatSetType(*B,MATSEQAIJKOKKOS));
1742 
1743   if (ftype == MAT_FACTOR_LU) {
1744     PetscCall(MatSetBlockSizesFromMats(*B,A,A));
1745     (*B)->canuseordering         = PETSC_TRUE;
1746     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJKokkos;
1747   } else if (ftype == MAT_FACTOR_ILU) {
1748     PetscCall(MatSetBlockSizesFromMats(*B,A,A));
1749     (*B)->canuseordering         = PETSC_FALSE;
1750     (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos;
1751   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]);
1752 
1753   PetscCall(MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL));
1754   PetscCall(PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_SeqAIJKokkos));
1755   PetscFunctionReturn(0);
1756 }
1757 
1758 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijkokkos_kokkos_device(Mat A,MatFactorType ftype,Mat *B)
1759 {
1760   PetscInt       n = A->rmap->n;
1761 
1762   PetscFunctionBegin;
1763   PetscCall(MatCreate(PetscObjectComm((PetscObject)A),B));
1764   PetscCall(MatSetSizes(*B,n,n,n,n));
1765   (*B)->factortype = ftype;
1766   (*B)->canuseordering = PETSC_TRUE;
1767   PetscCall(PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]));
1768   PetscCall(MatSetType(*B,MATSEQAIJKOKKOS));
1769 
1770   if (ftype == MAT_FACTOR_LU) {
1771     PetscCall(MatSetBlockSizesFromMats(*B,A,A));
1772     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE;
1773   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for KOKKOS Matrix Types");
1774 
1775   PetscCall(MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL));
1776   PetscCall(PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_kokkos_device));
1777   PetscFunctionReturn(0);
1778 }
1779 
1780 PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_KOKKOS(void)
1781 {
1782   PetscFunctionBegin;
1783   PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_SeqAIJKokkos_Kokkos));
1784   PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_ILU,MatGetFactor_SeqAIJKokkos_Kokkos));
1785   PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOSDEVICE,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_seqaijkokkos_kokkos_device));
1786   PetscFunctionReturn(0);
1787 }
1788 
1789 /* Utility to print out a KokkosCsrMatrix for debugging */
1790 PETSC_INTERN PetscErrorCode PrintCsrMatrix(const KokkosCsrMatrix& csrmat)
1791 {
1792   const auto&       iv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.graph.row_map);
1793   const auto&       jv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.graph.entries);
1794   const auto&       av = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.values);
1795   const PetscInt    *i = iv.data();
1796   const PetscInt    *j = jv.data();
1797   const PetscScalar *a = av.data();
1798   PetscInt          m = csrmat.numRows(),n = csrmat.numCols(),nnz = csrmat.nnz();
1799 
1800   PetscFunctionBegin;
1801   PetscCall(PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT " x %" PetscInt_FMT " SeqAIJKokkos, with %" PetscInt_FMT " nonzeros\n",m,n,nnz));
1802   for (PetscInt k=0; k<m; k++) {
1803     PetscCall(PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT ": ",k));
1804     for (PetscInt p=i[k]; p<i[k+1]; p++) {
1805       PetscCall(PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT "(%.1f), ",j[p],(double)PetscRealPart(a[p])));
1806     }
1807     PetscCall(PetscPrintf(PETSC_COMM_SELF,"\n"));
1808   }
1809   PetscFunctionReturn(0);
1810 }
1811