xref: /petsc/src/mat/impls/aij/seq/kokkos/aijkok.kokkos.cxx (revision 6ffe77eaecce1557e50d00ca5347a7f48e598865)
1 #include <petscvec_kokkos.hpp>
2 #include <petscpkg_version.h>
3 #include <petsc/private/petscimpl.h>
4 #include <petsc/private/sfimpl.h>
5 #include <petscsystypes.h>
6 #include <petscerror.h>
7 
8 #include <Kokkos_Core.hpp>
9 #include <KokkosBlas.hpp>
10 #include <KokkosKernels_Sorting.hpp>
11 #include <KokkosSparse_CrsMatrix.hpp>
12 #include <KokkosSparse_spmv.hpp>
13 #include <KokkosSparse_spiluk.hpp>
14 #include <KokkosSparse_sptrsv.hpp>
15 #include <KokkosSparse_spgemm.hpp>
16 #include <KokkosSparse_spadd.hpp>
17 
18 #include <../src/mat/impls/aij/seq/kokkos/aijkok.hpp>
19 
20 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */
21 
22 /* MatAssemblyEnd_SeqAIJKokkos() happens when we finalized nonzeros of the matrix, either after
23    we assembled the matrix on host, or after we directly produced the matrix data on device (ex., through MatMatMult).
24    In the latter case, it is important to set a_dual's sync state correctly.
25  */
26 static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A,MatAssemblyType mode)
27 {
28   Mat_SeqAIJ       *aijseq;
29   Mat_SeqAIJKokkos *aijkok;
30 
31   PetscFunctionBegin;
32   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0);
33   PetscCall(MatAssemblyEnd_SeqAIJ(A,mode));
34 
35   aijseq = static_cast<Mat_SeqAIJ*>(A->data);
36   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
37 
38   /* If aijkok does not exist, we just copy i, j to device.
39      If aijkok already exists, but the device's nonzero pattern does not match with the host's, we assume the latest data is on host.
40      In both cases, we build a new aijkok structure.
41   */
42   if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { /* aijkok might not exist yet or nonzero pattern has changed */
43     delete aijkok;
44     aijkok   = new Mat_SeqAIJKokkos(A->rmap->n,A->cmap->n,aijseq->nz,aijseq->i,aijseq->j,aijseq->a,A->nonzerostate,PETSC_FALSE/*don't copy mat values to device*/);
45     A->spptr = aijkok;
46   }
47 
48   if (aijkok->device_mat_d.data()) {
49     A->offloadmask = PETSC_OFFLOAD_GPU; // in GPU mode, no going back. MatSetValues checks this
50   }
51   PetscFunctionReturn(0);
52 }
53 
54 /* Sync CSR data to device if not yet */
55 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A)
56 {
57   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
58 
59   PetscFunctionBegin;
60   PetscCheck(A->factortype == MAT_FACTOR_NONE,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync factorized matrix from host to device");
61   PetscCheck(aijkok,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
62   if (aijkok->a_dual.need_sync_device()) {
63     aijkok->a_dual.sync_device();
64     aijkok->transpose_updated = PETSC_FALSE; /* values of the transpose is out-of-date */
65     aijkok->hermitian_updated = PETSC_FALSE;
66   }
67   PetscFunctionReturn(0);
68 }
69 
70 /* Mark the CSR data on device as modified */
71 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosModifyDevice(Mat A)
72 {
73   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
74 
75   PetscFunctionBegin;
76   PetscCheck(A->factortype == MAT_FACTOR_NONE,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not supported for factorized matries");
77   aijkok->a_dual.clear_sync_state();
78   aijkok->a_dual.modify_device();
79   aijkok->transpose_updated = PETSC_FALSE;
80   aijkok->hermitian_updated = PETSC_FALSE;
81   PetscCall(MatSeqAIJInvalidateDiagonal(A));
82   PetscCall(PetscObjectStateIncrease((PetscObject)A));
83   PetscFunctionReturn(0);
84 }
85 
86 static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A)
87 {
88   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
89 
90   PetscFunctionBegin;
91   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
92   /* We do not expect one needs factors on host  */
93   PetscCheck(A->factortype == MAT_FACTOR_NONE,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync factorized matrix from device to host");
94   PetscCheck(aijkok,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing AIJKOK");
95   aijkok->a_dual.sync_host();
96   PetscFunctionReturn(0);
97 }
98 
99 static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A,PetscScalar *array[])
100 {
101   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
102 
103   PetscFunctionBegin;
104   /* aijkok contains valid pointers only if the host's nonzerostate matches with the device's.
105     Calling MatSeqAIJSetPreallocation() or MatSetValues() on host, where aijseq->{i,j,a} might be
106     reallocated, will lead to stale {i,j,a}_dual in aijkok. In both operations, the hosts's nonzerostate
107     must have been updated. The stale aijkok will be rebuilt during MatAssemblyEnd.
108   */
109   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
110     aijkok->a_dual.sync_host();
111     *array = aijkok->a_dual.view_host().data();
112   } else { /* Happens when calling MatSetValues on a newly created matrix */
113     *array = static_cast<Mat_SeqAIJ*>(A->data)->a;
114   }
115   PetscFunctionReturn(0);
116 }
117 
118 static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJKokkos(Mat A,PetscScalar *array[])
119 {
120   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
121 
122   PetscFunctionBegin;
123   if (aijkok && A->nonzerostate == aijkok->nonzerostate) aijkok->a_dual.modify_host();
124   PetscFunctionReturn(0);
125 }
126 
127 static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJKokkos(Mat A,const PetscScalar *array[])
128 {
129   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
130 
131   PetscFunctionBegin;
132   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
133     aijkok->a_dual.sync_host();
134     *array = aijkok->a_dual.view_host().data();
135   } else {
136     *array = static_cast<Mat_SeqAIJ*>(A->data)->a;
137   }
138   PetscFunctionReturn(0);
139 }
140 
141 static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJKokkos(Mat A,const PetscScalar *array[])
142 {
143   PetscFunctionBegin;
144   *array = NULL;
145   PetscFunctionReturn(0);
146 }
147 
148 static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJKokkos(Mat A,PetscScalar *array[])
149 {
150   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
151 
152   PetscFunctionBegin;
153   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
154     *array = aijkok->a_dual.view_host().data();
155   } else { /* Ex. happens with MatZeroEntries on a preallocated but not assembled matrix */
156     *array = static_cast<Mat_SeqAIJ*>(A->data)->a;
157   }
158   PetscFunctionReturn(0);
159 }
160 
161 static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJKokkos(Mat A,PetscScalar *array[])
162 {
163   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
164 
165   PetscFunctionBegin;
166   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
167     aijkok->a_dual.clear_sync_state();
168     aijkok->a_dual.modify_host();
169   }
170   PetscFunctionReturn(0);
171 }
172 
173 static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJKokkos(Mat A,const PetscInt **i,const PetscInt **j,PetscScalar **a,PetscMemType *mtype)
174 {
175   Mat_SeqAIJKokkos  *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
176 
177   PetscFunctionBegin;
178   PetscCheck(aijkok != NULL,PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"aijkok is NULL");
179 
180   if (i) *i = aijkok->i_device_data();
181   if (j) *j = aijkok->j_device_data();
182   if (a) {
183     aijkok->a_dual.sync_device();
184     *a = aijkok->a_device_data();
185   }
186   if (mtype) *mtype = PETSC_MEMTYPE_KOKKOS;
187   PetscFunctionReturn(0);
188 }
189 
190 // MatSeqAIJKokkosSetDeviceMat takes a PetscSplitCSRDataStructure with device data and copies it to the device. Note, "deep_copy" here is really a shallow copy
191 PetscErrorCode MatSeqAIJKokkosSetDeviceMat(Mat A, PetscSplitCSRDataStructure h_mat)
192 {
193   Mat_SeqAIJKokkos                             *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
194   Kokkos::View<SplitCSRMat, Kokkos::HostSpace> h_mat_k(h_mat);
195 
196   PetscFunctionBegin;
197   PetscCheck(aijkok,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
198   aijkok->device_mat_d = create_mirror(DefaultMemorySpace(),h_mat_k);
199   Kokkos::deep_copy (aijkok->device_mat_d, h_mat_k);
200   PetscFunctionReturn(0);
201 }
202 
203 // MatSeqAIJKokkosGetDeviceMat gets the device if it is here, otherwise it creates a place for it and returns NULL
204 PetscErrorCode MatSeqAIJKokkosGetDeviceMat(Mat A, PetscSplitCSRDataStructure *d_mat)
205 {
206   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
207 
208   PetscFunctionBegin;
209   if (aijkok && aijkok->device_mat_d.data()) {
210     *d_mat = aijkok->device_mat_d.data();
211   } else {
212     PetscCall(MatSeqAIJKokkosSyncDevice(A)); // create aijkok (we are making d_mat now so make a place for it)
213     *d_mat  = NULL;
214   }
215   PetscFunctionReturn(0);
216 }
217 
218 /* Generate the transpose on device and cache it internally */
219 static PetscErrorCode MatSeqAIJKokkosGenerateTranspose_Private(Mat A, KokkosCsrMatrix **csrmatT)
220 {
221   Mat_SeqAIJKokkos                 *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
222 
223   PetscFunctionBegin;
224   PetscCheck(aijkok,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
225   if (!aijkok->csrmatT.nnz() || !aijkok->transpose_updated) { /* Generate At for the first time OR just update its values */
226     /* FIXME: KK does not separate symbolic/numeric transpose. We could have a permutation array to help value-only update */
227     PetscCallCXX(aijkok->a_dual.sync_device());
228     PetscCallCXX(aijkok->csrmatT = KokkosKernels::Impl::transpose_matrix(aijkok->csrmat));
229     PetscCallCXX(KokkosKernels::sort_crs_matrix(aijkok->csrmatT));
230     aijkok->transpose_updated = PETSC_TRUE;
231   }
232   *csrmatT = &aijkok->csrmatT;
233   PetscFunctionReturn(0);
234 }
235 
236 /* Generate the Hermitian on device and cache it internally */
237 static PetscErrorCode MatSeqAIJKokkosGenerateHermitian_Private(Mat A, KokkosCsrMatrix **csrmatH)
238 {
239   Mat_SeqAIJKokkos                 *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
240 
241   PetscFunctionBegin;
242   PetscCall(PetscLogGpuTimeBegin());
243   PetscCheck(aijkok,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
244   if (!aijkok->csrmatH.nnz() || !aijkok->hermitian_updated) { /* Generate Ah for the first time OR just update its values */
245     PetscCallCXX(aijkok->a_dual.sync_device());
246     PetscCallCXX(aijkok->csrmatH = KokkosKernels::Impl::transpose_matrix(aijkok->csrmat));
247     PetscCallCXX(KokkosKernels::sort_crs_matrix(aijkok->csrmatH));
248    #if defined(PETSC_USE_COMPLEX)
249     const auto& a = aijkok->csrmatH.values;
250     Kokkos::parallel_for(a.extent(0),KOKKOS_LAMBDA(MatRowMapType i) {a(i) = PetscConj(a(i));});
251    #endif
252     aijkok->hermitian_updated = PETSC_TRUE;
253   }
254   *csrmatH = &aijkok->csrmatH;
255   PetscCall(PetscLogGpuTimeEnd());
256   PetscFunctionReturn(0);
257 }
258 
259 /* y = A x */
260 static PetscErrorCode MatMult_SeqAIJKokkos(Mat A,Vec xx,Vec yy)
261 {
262   Mat_SeqAIJKokkos                 *aijkok;
263   ConstPetscScalarKokkosView       xv;
264   PetscScalarKokkosView            yv;
265 
266   PetscFunctionBegin;
267   PetscCall(PetscLogGpuTimeBegin());
268   PetscCall(MatSeqAIJKokkosSyncDevice(A));
269   PetscCall(VecGetKokkosView(xx,&xv));
270   PetscCall(VecGetKokkosViewWrite(yy,&yv));
271   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
272   KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csrmat,xv,0.0/*beta*/,yv); /* y = alpha A x + beta y */
273   PetscCall(VecRestoreKokkosView(xx,&xv));
274   PetscCall(VecRestoreKokkosViewWrite(yy,&yv));
275   /* 2.0*nnz - numRows seems more accurate here but assumes there are no zero-rows. So a little sloppy here. */
276   PetscCall(PetscLogGpuFlops(2.0*aijkok->csrmat.nnz()));
277   PetscCall(PetscLogGpuTimeEnd());
278   PetscFunctionReturn(0);
279 }
280 
281 /* y = A^T x */
282 static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy)
283 {
284   Mat_SeqAIJKokkos                 *aijkok;
285   const char                       *mode;
286   ConstPetscScalarKokkosView       xv;
287   PetscScalarKokkosView            yv;
288   KokkosCsrMatrix                  *csrmat;
289 
290   PetscFunctionBegin;
291   PetscCall(PetscLogGpuTimeBegin());
292   PetscCall(MatSeqAIJKokkosSyncDevice(A));
293   PetscCall(VecGetKokkosView(xx,&xv));
294   PetscCall(VecGetKokkosViewWrite(yy,&yv));
295   if (A->form_explicit_transpose) {
296     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmat));
297     mode = "N";
298   } else {
299     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
300     csrmat = &aijkok->csrmat;
301     mode = "T";
302   }
303   KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,0.0/*beta*/,yv); /* y = alpha A^T x + beta y */
304   PetscCall(VecRestoreKokkosView(xx,&xv));
305   PetscCall(VecRestoreKokkosViewWrite(yy,&yv));
306   PetscCall(PetscLogGpuFlops(2.0*csrmat->nnz()));
307   PetscCall(PetscLogGpuTimeEnd());
308   PetscFunctionReturn(0);
309 }
310 
311 /* y = A^H x */
312 static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy)
313 {
314   Mat_SeqAIJKokkos                 *aijkok;
315   const char                       *mode;
316   ConstPetscScalarKokkosView       xv;
317   PetscScalarKokkosView            yv;
318   KokkosCsrMatrix                  *csrmat;
319 
320   PetscFunctionBegin;
321   PetscCall(PetscLogGpuTimeBegin());
322   PetscCall(MatSeqAIJKokkosSyncDevice(A));
323   PetscCall(VecGetKokkosView(xx,&xv));
324   PetscCall(VecGetKokkosViewWrite(yy,&yv));
325   if (A->form_explicit_transpose) {
326     PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A,&csrmat));
327     mode = "N";
328   } else {
329     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
330     csrmat = &aijkok->csrmat;
331     mode = "C";
332   }
333   KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,0.0/*beta*/,yv); /* y = alpha A^H x + beta y */
334   PetscCall(VecRestoreKokkosView(xx,&xv));
335   PetscCall(VecRestoreKokkosViewWrite(yy,&yv));
336   PetscCall(PetscLogGpuFlops(2.0*csrmat->nnz()));
337   PetscCall(PetscLogGpuTimeEnd());
338   PetscFunctionReturn(0);
339 }
340 
341 /* z = A x + y */
342 static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy, Vec zz)
343 {
344   Mat_SeqAIJKokkos                 *aijkok;
345   ConstPetscScalarKokkosView       xv,yv;
346   PetscScalarKokkosView            zv;
347 
348   PetscFunctionBegin;
349   PetscCall(PetscLogGpuTimeBegin());
350   PetscCall(MatSeqAIJKokkosSyncDevice(A));
351   PetscCall(VecGetKokkosView(xx,&xv));
352   PetscCall(VecGetKokkosView(yy,&yv));
353   PetscCall(VecGetKokkosViewWrite(zz,&zv));
354   if (zz != yy) Kokkos::deep_copy(zv,yv);
355   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
356   KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csrmat,xv,1.0/*beta*/,zv); /* z = alpha A x + beta z */
357   PetscCall(VecRestoreKokkosView(xx,&xv));
358   PetscCall(VecRestoreKokkosView(yy,&yv));
359   PetscCall(VecRestoreKokkosViewWrite(zz,&zv));
360   PetscCall(PetscLogGpuFlops(2.0*aijkok->csrmat.nnz()));
361   PetscCall(PetscLogGpuTimeEnd());
362   PetscFunctionReturn(0);
363 }
364 
365 /* z = A^T x + y */
366 static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz)
367 {
368   Mat_SeqAIJKokkos                 *aijkok;
369   const char                       *mode;
370   ConstPetscScalarKokkosView       xv,yv;
371   PetscScalarKokkosView            zv;
372   KokkosCsrMatrix                  *csrmat;
373 
374   PetscFunctionBegin;
375   PetscCall(PetscLogGpuTimeBegin());
376   PetscCall(MatSeqAIJKokkosSyncDevice(A));
377   PetscCall(VecGetKokkosView(xx,&xv));
378   PetscCall(VecGetKokkosView(yy,&yv));
379   PetscCall(VecGetKokkosViewWrite(zz,&zv));
380   if (zz != yy) Kokkos::deep_copy(zv,yv);
381   if (A->form_explicit_transpose) {
382     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmat));
383     mode = "N";
384   } else {
385     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
386     csrmat = &aijkok->csrmat;
387     mode = "T";
388   }
389   KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,1.0/*beta*/,zv); /* z = alpha A^T x + beta z */
390   PetscCall(VecRestoreKokkosView(xx,&xv));
391   PetscCall(VecRestoreKokkosView(yy,&yv));
392   PetscCall(VecRestoreKokkosViewWrite(zz,&zv));
393   PetscCall(PetscLogGpuFlops(2.0*csrmat->nnz()));
394   PetscCall(PetscLogGpuTimeEnd());
395   PetscFunctionReturn(0);
396 }
397 
398 /* z = A^H x + y */
399 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz)
400 {
401   Mat_SeqAIJKokkos                 *aijkok;
402   const char                       *mode;
403   ConstPetscScalarKokkosView       xv,yv;
404   PetscScalarKokkosView            zv;
405   KokkosCsrMatrix                  *csrmat;
406 
407   PetscFunctionBegin;
408   PetscCall(PetscLogGpuTimeBegin());
409   PetscCall(MatSeqAIJKokkosSyncDevice(A));
410   PetscCall(VecGetKokkosView(xx,&xv));
411   PetscCall(VecGetKokkosView(yy,&yv));
412   PetscCall(VecGetKokkosViewWrite(zz,&zv));
413   if (zz != yy) Kokkos::deep_copy(zv,yv);
414   if (A->form_explicit_transpose) {
415     PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A,&csrmat));
416     mode = "N";
417   } else {
418     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
419     csrmat = &aijkok->csrmat;
420     mode = "C";
421   }
422   KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,1.0/*beta*/,zv); /* z = alpha A^H x + beta z */
423   PetscCall(VecRestoreKokkosView(xx,&xv));
424   PetscCall(VecRestoreKokkosView(yy,&yv));
425   PetscCall(VecRestoreKokkosViewWrite(zz,&zv));
426   PetscCall(PetscLogGpuFlops(2.0*csrmat->nnz()));
427   PetscCall(PetscLogGpuTimeEnd());
428   PetscFunctionReturn(0);
429 }
430 
431 PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A,MatOption op,PetscBool flg)
432 {
433   Mat_SeqAIJKokkos          *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
434 
435   PetscFunctionBegin;
436   switch (op) {
437     case MAT_FORM_EXPLICIT_TRANSPOSE:
438       /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
439       if (A->form_explicit_transpose && !flg && aijkok) PetscCall(aijkok->DestroyMatTranspose());
440       A->form_explicit_transpose = flg;
441       break;
442     default:
443       PetscCall(MatSetOption_SeqAIJ(A,op,flg));
444       break;
445   }
446   PetscFunctionReturn(0);
447 }
448 
449 /* Depending on reuse, either build a new mat, or use the existing mat */
450 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
451 {
452   Mat_SeqAIJ       *aseq;
453 
454   PetscFunctionBegin;
455   PetscCall(PetscKokkosInitializeCheck());
456   if (reuse == MAT_INITIAL_MATRIX) { /* Build a brand new mat */
457     PetscCall(MatDuplicate(A,MAT_COPY_VALUES,newmat)); /* the returned newmat is a SeqAIJKokkos */
458   } else if (reuse == MAT_REUSE_MATRIX) { /* Reuse the mat created before */
459     PetscCall(MatCopy(A,*newmat,SAME_NONZERO_PATTERN)); /* newmat is already a SeqAIJKokkos */
460   } else if (reuse == MAT_INPLACE_MATRIX) { /* newmat is A */
461     PetscCheck(A == *newmat,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"A != *newmat with MAT_INPLACE_MATRIX");
462     PetscCall(PetscFree(A->defaultvectype));
463     PetscCall(PetscStrallocpy(VECKOKKOS,&A->defaultvectype)); /* Allocate and copy the string */
464     PetscCall(PetscObjectChangeTypeName((PetscObject)A,MATSEQAIJKOKKOS));
465     PetscCall(MatSetOps_SeqAIJKokkos(A));
466     aseq = static_cast<Mat_SeqAIJ*>(A->data);
467     if (A->assembled) { /* Copy i, j (but not values) to device for an assembled matrix if not yet */
468       PetscCheck(!A->spptr,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Expect NULL (Mat_SeqAIJKokkos*)A->spptr");
469       A->spptr = new Mat_SeqAIJKokkos(A->rmap->n,A->cmap->n,aseq->nz,aseq->i,aseq->j,aseq->a,A->nonzerostate,PETSC_FALSE);
470     }
471   }
472   PetscFunctionReturn(0);
473 }
474 
475 /* MatDuplicate always creates a new matrix. MatDuplicate can be called either on an assembled matrix or
476    an unassembled matrix, even though MAT_COPY_VALUES is not allowed for unassembled matrix.
477  */
478 static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A,MatDuplicateOption dupOption,Mat *B)
479 {
480   Mat_SeqAIJ            *bseq;
481   Mat_SeqAIJKokkos      *akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr),*bkok;
482   Mat                   mat;
483 
484   PetscFunctionBegin;
485   /* Do not copy values on host as A's latest values might be on device. We don't want to do sync blindly */
486   PetscCall(MatDuplicate_SeqAIJ(A,MAT_DO_NOT_COPY_VALUES,B));
487   mat  = *B;
488   if (A->assembled) {
489     bseq = static_cast<Mat_SeqAIJ*>(mat->data);
490     bkok = new Mat_SeqAIJKokkos(mat->rmap->n,mat->cmap->n,bseq->nz,bseq->i,bseq->j,bseq->a,mat->nonzerostate,PETSC_FALSE);
491     bkok->a_dual.clear_sync_state(); /* Clear B's sync state as it will be decided below */
492     /* Now copy values to B if needed */
493     if (dupOption == MAT_COPY_VALUES) {
494       if (akok->a_dual.need_sync_device()) {
495         Kokkos::deep_copy(bkok->a_dual.view_host(),akok->a_dual.view_host());
496         bkok->a_dual.modify_host();
497       } else { /* If device has the latest data, we only copy data on device */
498         Kokkos::deep_copy(bkok->a_dual.view_device(),akok->a_dual.view_device());
499         bkok->a_dual.modify_device();
500       }
501     } else { /* MAT_DO_NOT_COPY_VALUES or MAT_SHARE_NONZERO_PATTERN. B's values should be zeroed */
502       /* B's values on host should be already zeroed by MatDuplicate_SeqAIJ() */
503       bkok->a_dual.modify_host();
504     }
505     mat->spptr = bkok;
506   }
507 
508   PetscCall(PetscFree(mat->defaultvectype));
509   PetscCall(PetscStrallocpy(VECKOKKOS,&mat->defaultvectype)); /* Allocate and copy the string */
510   PetscCall(PetscObjectChangeTypeName((PetscObject)mat,MATSEQAIJKOKKOS));
511   PetscCall(MatSetOps_SeqAIJKokkos(mat));
512   PetscFunctionReturn(0);
513 }
514 
515 static PetscErrorCode MatTranspose_SeqAIJKokkos(Mat A,MatReuse reuse,Mat *B)
516 {
517   Mat               At;
518   KokkosCsrMatrix   *internT;
519   Mat_SeqAIJKokkos  *atkok,*bkok;
520 
521   PetscFunctionBegin;
522   PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A,&internT)); /* Generate a transpose internally */
523   if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_INPLACE_MATRIX) {
524     /* Deep copy internT, as we want to isolate the internal transpose */
525     PetscCallCXX(atkok = new Mat_SeqAIJKokkos(KokkosCsrMatrix("csrmat",*internT)));
526     PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PetscObjectComm((PetscObject)A),atkok,&At));
527     if (reuse == MAT_INITIAL_MATRIX) *B = At;
528     else PetscCall(MatHeaderReplace(A,&At)); /* Replace A with At inplace */
529   } else { /* MAT_REUSE_MATRIX, just need to copy values to B on device */
530     if ((*B)->assembled) {
531       bkok = static_cast<Mat_SeqAIJKokkos*>((*B)->spptr);
532       PetscCallCXX(Kokkos::deep_copy(bkok->a_dual.view_device(),internT->values));
533       PetscCall(MatSeqAIJKokkosModifyDevice(*B));
534     } else if ((*B)->preallocated) { /* It is ok for B to be only preallocated, as needed in MatTranspose_MPIAIJ */
535       Mat_SeqAIJ              *bseq = static_cast<Mat_SeqAIJ*>((*B)->data);
536       MatScalarKokkosViewHost a_h(bseq->a,internT->nnz()); /* bseq->nz = 0 if unassembled */
537       MatColIdxKokkosViewHost j_h(bseq->j,internT->nnz());
538       PetscCallCXX(Kokkos::deep_copy(a_h,internT->values));
539       PetscCallCXX(Kokkos::deep_copy(j_h,internT->graph.entries));
540     } else SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"B must be assembled or preallocated");
541   }
542   PetscFunctionReturn(0);
543 }
544 
545 static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A)
546 {
547   Mat_SeqAIJKokkos           *aijkok;
548 
549   PetscFunctionBegin;
550   if (A->factortype == MAT_FACTOR_NONE) {
551     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
552     delete aijkok;
553   } else {
554     delete static_cast<Mat_SeqAIJKokkosTriFactors*>(A->spptr);
555   }
556   A->spptr = NULL;
557   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL));
558   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL));
559   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL));
560   PetscCall(MatDestroy_SeqAIJ(A));
561   PetscFunctionReturn(0);
562 }
563 
564 /*MC
565    MATSEQAIJKOKKOS - MATAIJKOKKOS = "(seq)aijkokkos" - A matrix type to be used for sparse matrices with Kokkos
566 
567    A matrix type type using Kokkos-Kernels CrsMatrix type for portability across different device types
568 
569    Options Database Keys:
570 .  -mat_type aijkokkos - sets the matrix type to "aijkokkos" during a call to MatSetFromOptions()
571 
572   Level: beginner
573 
574 .seealso: `MatCreateSeqAIJKokkos()`, `MATMPIAIJKOKKOS`
575 M*/
576 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A)
577 {
578   PetscFunctionBegin;
579   PetscCall(PetscKokkosInitializeCheck());
580   PetscCall(MatCreate_SeqAIJ(A));
581   PetscCall(MatConvert_SeqAIJ_SeqAIJKokkos(A,MATSEQAIJKOKKOS,MAT_INPLACE_MATRIX,&A));
582   PetscFunctionReturn(0);
583 }
584 
585 /* Merge A, B into a matrix C. A is put before B. C's size would be A->rmap->n by (A->cmap->n + B->cmap->n) */
586 PetscErrorCode MatSeqAIJKokkosMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C)
587 {
588   Mat_SeqAIJ                   *a,*b;
589   Mat_SeqAIJKokkos             *akok,*bkok,*ckok;
590   MatScalarKokkosView          aa,ba,ca;
591   MatRowMapKokkosView          ai,bi,ci;
592   MatColIdxKokkosView          aj,bj,cj;
593   PetscInt                     m,n,nnz,aN;
594 
595   PetscFunctionBegin;
596   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
597   PetscValidHeaderSpecific(B,MAT_CLASSID,2);
598   PetscValidPointer(C,4);
599   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
600   PetscCheckTypeName(B,MATSEQAIJKOKKOS);
601   PetscCheck(A->rmap->n == B->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT,A->rmap->n,B->rmap->n);
602   PetscCheck(reuse != MAT_INPLACE_MATRIX,PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported");
603 
604   PetscCall(MatSeqAIJKokkosSyncDevice(A));
605   PetscCall(MatSeqAIJKokkosSyncDevice(B));
606   a    = static_cast<Mat_SeqAIJ*>(A->data);
607   b    = static_cast<Mat_SeqAIJ*>(B->data);
608   akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
609   bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
610   aa   = akok->a_dual.view_device();
611   ai   = akok->i_dual.view_device();
612   ba   = bkok->a_dual.view_device();
613   bi   = bkok->i_dual.view_device();
614   m    = A->rmap->n; /* M, N and nnz of C */
615   n    = A->cmap->n + B->cmap->n;
616   nnz  = a->nz + b->nz;
617   aN   = A->cmap->n; /* N of A */
618   if (reuse == MAT_INITIAL_MATRIX) {
619     aj = akok->j_dual.view_device();
620     bj = bkok->j_dual.view_device();
621     auto ca_dual = MatScalarKokkosDualView("a",aa.extent(0)+ba.extent(0));
622     auto ci_dual = MatRowMapKokkosDualView("i",ai.extent(0));
623     auto cj_dual = MatColIdxKokkosDualView("j",aj.extent(0)+bj.extent(0));
624     ca = ca_dual.view_device();
625     ci = ci_dual.view_device();
626     cj = cj_dual.view_device();
627 
628     /* Concatenate A and B in parallel using Kokkos hierarchical parallelism */
629     Kokkos::parallel_for(Kokkos::TeamPolicy<>(m, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) {
630       PetscInt i = t.league_rank(); /* row i */
631       PetscInt coffset = ai(i) + bi(i), alen = ai(i+1)-ai(i), blen = bi(i+1)-bi(i);
632 
633       Kokkos::single(Kokkos::PerTeam(t), [=]() { /* this side effect only happens once per whole team */
634         ci(i) = coffset;
635         if (i == m-1) ci(m) = ai(m) + bi(m);
636       });
637 
638       Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen+blen), [&](PetscInt k) {
639         if (k < alen) {
640           ca(coffset+k) = aa(ai(i)+k);
641           cj(coffset+k) = aj(ai(i)+k);
642         } else {
643           ca(coffset+k) = ba(bi(i)+k-alen);
644           cj(coffset+k) = bj(bi(i)+k-alen) + aN; /* Entries in B get new column indices in C */
645         }
646       });
647     });
648     ca_dual.modify_device();
649     ci_dual.modify_device();
650     cj_dual.modify_device();
651     PetscCallCXX(ckok = new Mat_SeqAIJKokkos(m,n,nnz,ci_dual,cj_dual,ca_dual));
652     PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,ckok,C));
653   } else if (reuse == MAT_REUSE_MATRIX) {
654     PetscValidHeaderSpecific(*C,MAT_CLASSID,4);
655     PetscCheckTypeName(*C,MATSEQAIJKOKKOS);
656     ckok = static_cast<Mat_SeqAIJKokkos*>((*C)->spptr);
657     ca   = ckok->a_dual.view_device();
658     ci   = ckok->i_dual.view_device();
659 
660     Kokkos::parallel_for(Kokkos::TeamPolicy<>(m, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) {
661       PetscInt i = t.league_rank(); /* row i */
662       PetscInt alen = ai(i+1)-ai(i), blen = bi(i+1)-bi(i);
663       Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen+blen), [&](PetscInt k) {
664         if (k < alen) ca(ci(i)+k) = aa(ai(i)+k);
665         else          ca(ci(i)+k) = ba(bi(i)+k-alen);
666       });
667     });
668     PetscCall(MatSeqAIJKokkosModifyDevice(*C));
669   }
670   PetscFunctionReturn(0);
671 }
672 
673 static PetscErrorCode MatProductDataDestroy_SeqAIJKokkos(void* pdata)
674 {
675   PetscFunctionBegin;
676   delete static_cast<MatProductData_SeqAIJKokkos*>(pdata);
677   PetscFunctionReturn(0);
678 }
679 
680 static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C)
681 {
682   Mat_Product                    *product = C->product;
683   Mat                            A,B;
684   bool                           transA,transB; /* use bool, since KK needs this type */
685   Mat_SeqAIJKokkos               *akok,*bkok,*ckok;
686   Mat_SeqAIJ                     *c;
687   MatProductData_SeqAIJKokkos    *pdata;
688   KokkosCsrMatrix                *csrmatA,*csrmatB;
689 
690   PetscFunctionBegin;
691   MatCheckProduct(C,1);
692   PetscCheck(C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty");
693   pdata = static_cast<MatProductData_SeqAIJKokkos*>(C->product->data);
694 
695   if (pdata->reusesym) { /* We reached here through e.g., MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C), where symbolic/numeric are combined */
696     pdata->reusesym = PETSC_FALSE; /* So that next time when user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C), we still do numeric  */
697     PetscFunctionReturn(0);
698   }
699 
700   switch (product->type) {
701     case MATPRODUCT_AB:  transA = false; transB = false; break;
702     case MATPRODUCT_AtB: transA = true;  transB = false; break;
703     case MATPRODUCT_ABt: transA = false; transB = true;  break;
704     default:
705       SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
706   }
707 
708   A     = product->A;
709   B     = product->B;
710   PetscCall(MatSeqAIJKokkosSyncDevice(A));
711   PetscCall(MatSeqAIJKokkosSyncDevice(B));
712   akok  = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
713   bkok  = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
714   ckok  = static_cast<Mat_SeqAIJKokkos*>(C->spptr);
715 
716   PetscCheck(ckok,PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Device data structure spptr is empty");
717 
718   csrmatA = &akok->csrmat;
719   csrmatB = &bkok->csrmat;
720 
721   /* TODO: Once KK spgemm implements transpose, we can get rid of the explicit transpose here */
722   if (transA) {
723     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmatA));
724     transA = false;
725   }
726 
727   if (transB) {
728     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B,&csrmatB));
729     transB = false;
730   }
731   PetscCall(PetscLogGpuTimeBegin());
732   PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh,*csrmatA,transA,*csrmatB,transB,ckok->csrmat));
733   PetscCallCXX(KokkosKernels::sort_crs_matrix(ckok->csrmat)); /* without the sort, mat_tests-ex62_14_seqaijkokkos failed */
734   PetscCall(PetscLogGpuTimeEnd());
735   PetscCall(MatSeqAIJKokkosModifyDevice(C));
736   /* shorter version of MatAssemblyEnd_SeqAIJ */
737   c = (Mat_SeqAIJ*)C->data;
738   PetscCall(PetscInfo(C,"Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n",C->rmap->n,C->cmap->n,c->nz));
739   PetscCall(PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n"));
740   PetscCall(PetscInfo(C,"Maximum nonzeros in any row is %" PetscInt_FMT "\n",c->rmax));
741   c->reallocs         = 0;
742   C->info.mallocs     = 0;
743   C->info.nz_unneeded = 0;
744   C->assembled        = C->was_assembled = PETSC_TRUE;
745   C->num_ass++;
746   PetscFunctionReturn(0);
747 }
748 
749 static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C)
750 {
751   Mat_Product                    *product = C->product;
752   MatProductType                 ptype;
753   Mat                            A,B;
754   bool                           transA,transB;
755   Mat_SeqAIJKokkos               *akok,*bkok,*ckok;
756   MatProductData_SeqAIJKokkos    *pdata;
757   MPI_Comm                       comm;
758   KokkosCsrMatrix                *csrmatA,*csrmatB,csrmatC;
759 
760   PetscFunctionBegin;
761   MatCheckProduct(C,1);
762   PetscCall(PetscObjectGetComm((PetscObject)C,&comm));
763   PetscCheck(!product->data,comm,PETSC_ERR_PLIB,"Product data not empty");
764   A       = product->A;
765   B       = product->B;
766   PetscCall(MatSeqAIJKokkosSyncDevice(A));
767   PetscCall(MatSeqAIJKokkosSyncDevice(B));
768   akok    = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
769   bkok    = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
770   csrmatA = &akok->csrmat;
771   csrmatB = &bkok->csrmat;
772 
773   ptype   = product->type;
774   switch (ptype) {
775     case MATPRODUCT_AB:  transA = false; transB = false; break;
776     case MATPRODUCT_AtB: transA = true;  transB = false; break;
777     case MATPRODUCT_ABt: transA = false; transB = true;  break;
778     default:
779       SETERRQ(comm,PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
780   }
781 
782   product->data = pdata = new MatProductData_SeqAIJKokkos();
783   pdata->kh.set_team_work_size(16);
784   pdata->kh.set_dynamic_scheduling(true);
785   pdata->reusesym = product->api_user;
786 
787   /* TODO: add command line options to select spgemm algorithms */
788   auto spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_KK;
789   /* CUDA-10.2's spgemm has bugs. As as of 2022-01-19, KK does not support CUDA-11.x's newer spgemm API.
790      We can default to SPGEMMAlgorithm::SPGEMM_CUSPARSE with CUDA-11+ when KK adds the support.
791    */
792   pdata->kh.create_spgemm_handle(spgemm_alg);
793 
794   PetscCall(PetscLogGpuTimeBegin());
795   /* TODO: Get rid of the explicit transpose once KK-spgemm implements the transpose option */
796   if (transA) {
797     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmatA));
798     transA = false;
799   }
800 
801   if (transB) {
802     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B,&csrmatB));
803     transB = false;
804   }
805 
806   PetscCallCXX(KokkosSparse::spgemm_symbolic(pdata->kh,*csrmatA,transA,*csrmatB,transB,csrmatC));
807   /* spgemm_symbolic() only populates C's rowmap, but not C's column indices.
808     So we have to do a fake spgemm_numeric() here to get csrmatC.j_d setup, before
809     calling new Mat_SeqAIJKokkos().
810     TODO: Remove the fake spgemm_numeric() after KK fixed this problem.
811   */
812   PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh,*csrmatA,transA,*csrmatB,transB,csrmatC));
813   PetscCallCXX(KokkosKernels::sort_crs_matrix(csrmatC));
814   PetscCall(PetscLogGpuTimeEnd());
815 
816   PetscCallCXX(ckok = new Mat_SeqAIJKokkos(csrmatC));
817   PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(C,ckok));
818   C->product->destroy = MatProductDataDestroy_SeqAIJKokkos;
819   PetscFunctionReturn(0);
820 }
821 
822 /* handles sparse matrix matrix ops */
823 static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat)
824 {
825   Mat_Product    *product = mat->product;
826   PetscBool      Biskok = PETSC_FALSE,Ciskok = PETSC_TRUE;
827 
828   PetscFunctionBegin;
829   MatCheckProduct(mat,1);
830   PetscCall(PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJKOKKOS,&Biskok));
831   if (product->type == MATPRODUCT_ABC) {
832     PetscCall(PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJKOKKOS,&Ciskok));
833   }
834   if (Biskok && Ciskok) {
835     switch (product->type) {
836     case MATPRODUCT_AB:
837     case MATPRODUCT_AtB:
838     case MATPRODUCT_ABt:
839       mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos;
840       break;
841     case MATPRODUCT_PtAP:
842     case MATPRODUCT_RARt:
843     case MATPRODUCT_ABC:
844       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
845       break;
846     default:
847       break;
848     }
849   } else { /* fallback for AIJ */
850     PetscCall(MatProductSetFromOptions_SeqAIJ(mat));
851   }
852   PetscFunctionReturn(0);
853 }
854 
855 static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a)
856 {
857   Mat_SeqAIJKokkos *aijkok;
858 
859   PetscFunctionBegin;
860   PetscCall(PetscLogGpuTimeBegin());
861   PetscCall(MatSeqAIJKokkosSyncDevice(A));
862   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
863   KokkosBlas::scal(aijkok->a_dual.view_device(),a,aijkok->a_dual.view_device());
864   PetscCall(MatSeqAIJKokkosModifyDevice(A));
865   PetscCall(PetscLogGpuFlops(aijkok->a_dual.extent(0)));
866   PetscCall(PetscLogGpuTimeEnd());
867   PetscFunctionReturn(0);
868 }
869 
870 static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A)
871 {
872   Mat_SeqAIJKokkos *aijkok;
873 
874   PetscFunctionBegin;
875   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
876   if (aijkok) { /* Only zero the device if data is already there */
877     KokkosBlas::fill(aijkok->a_dual.view_device(),0.0);
878     PetscCall(MatSeqAIJKokkosModifyDevice(A));
879   } else { /* Might be preallocated but not assembled */
880     PetscCall(MatZeroEntries_SeqAIJ(A));
881   }
882   PetscFunctionReturn(0);
883 }
884 
885 /* Get a Kokkos View from a mat of type MatSeqAIJKokkos */
886 PetscErrorCode MatSeqAIJGetKokkosView(Mat A,ConstMatScalarKokkosView* kv)
887 {
888   Mat_SeqAIJKokkos   *aijkok;
889 
890   PetscFunctionBegin;
891   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
892   PetscValidPointer(kv,2);
893   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
894   PetscCall(MatSeqAIJKokkosSyncDevice(A));
895   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
896   *kv    = aijkok->a_dual.view_device();
897   PetscFunctionReturn(0);
898 }
899 
900 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A,ConstMatScalarKokkosView* kv)
901 {
902   PetscFunctionBegin;
903   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
904   PetscValidPointer(kv,2);
905   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
906   PetscFunctionReturn(0);
907 }
908 
909 PetscErrorCode MatSeqAIJGetKokkosView(Mat A,MatScalarKokkosView* kv)
910 {
911   Mat_SeqAIJKokkos   *aijkok;
912 
913   PetscFunctionBegin;
914   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
915   PetscValidPointer(kv,2);
916   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
917   PetscCall(MatSeqAIJKokkosSyncDevice(A));
918   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
919   *kv    = aijkok->a_dual.view_device();
920   PetscFunctionReturn(0);
921 }
922 
923 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A,MatScalarKokkosView* kv)
924 {
925   PetscFunctionBegin;
926   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
927   PetscValidPointer(kv,2);
928   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
929   PetscCall(MatSeqAIJKokkosModifyDevice(A));
930   PetscFunctionReturn(0);
931 }
932 
933 PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A,MatScalarKokkosView* kv)
934 {
935   Mat_SeqAIJKokkos   *aijkok;
936 
937   PetscFunctionBegin;
938   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
939   PetscValidPointer(kv,2);
940   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
941   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
942   *kv    = aijkok->a_dual.view_device();
943   PetscFunctionReturn(0);
944 }
945 
946 PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A,MatScalarKokkosView* kv)
947 {
948   PetscFunctionBegin;
949   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
950   PetscValidPointer(kv,2);
951   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
952   PetscCall(MatSeqAIJKokkosModifyDevice(A));
953   PetscFunctionReturn(0);
954 }
955 
956 /* Computes Y += alpha X */
957 static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y,PetscScalar alpha,Mat X,MatStructure pattern)
958 {
959   Mat_SeqAIJ                 *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data;
960   Mat_SeqAIJKokkos           *xkok,*ykok,*zkok;
961   ConstMatScalarKokkosView   Xa;
962   MatScalarKokkosView        Ya;
963 
964   PetscFunctionBegin;
965   PetscCheckTypeName(Y,MATSEQAIJKOKKOS);
966   PetscCheckTypeName(X,MATSEQAIJKOKKOS);
967   PetscCall(MatSeqAIJKokkosSyncDevice(Y));
968   PetscCall(MatSeqAIJKokkosSyncDevice(X));
969   PetscCall(PetscLogGpuTimeBegin());
970 
971   if (pattern != SAME_NONZERO_PATTERN && x->nz == y->nz) {
972     /* We could compare on device, but have to get the comparison result on host. So compare on host instead. */
973     PetscBool e;
974     PetscCall(PetscArraycmp(x->i,y->i,Y->rmap->n+1,&e));
975     if (e) {
976       PetscCall(PetscArraycmp(x->j,y->j,y->nz,&e));
977       if (e) pattern = SAME_NONZERO_PATTERN;
978     }
979   }
980 
981   /* cusparseDcsrgeam2() computes C = alpha A + beta B. If one knew sparsity pattern of C, one can skip
982     cusparseScsrgeam2_bufferSizeExt() / cusparseXcsrgeam2Nnz(), and directly call cusparseScsrgeam2().
983     If X is SUBSET_NONZERO_PATTERN of Y, we could take advantage of this cusparse feature. However,
984     KokkosSparse::spadd(alpha,A,beta,B,C) has symbolic and numeric phases, MatAXPY does not.
985   */
986   ykok = static_cast<Mat_SeqAIJKokkos*>(Y->spptr);
987   xkok = static_cast<Mat_SeqAIJKokkos*>(X->spptr);
988   Xa   = xkok->a_dual.view_device();
989   Ya   = ykok->a_dual.view_device();
990 
991   if (pattern == SAME_NONZERO_PATTERN) {
992     KokkosBlas::axpy(alpha,Xa,Ya);
993     PetscCall(MatSeqAIJKokkosModifyDevice(Y));
994   } else if (pattern == SUBSET_NONZERO_PATTERN) {
995     MatRowMapKokkosView  Xi = xkok->i_dual.view_device(),Yi = ykok->i_dual.view_device();
996     MatColIdxKokkosView  Xj = xkok->j_dual.view_device(),Yj = ykok->j_dual.view_device();
997 
998     Kokkos::parallel_for(Kokkos::TeamPolicy<>(Y->rmap->n, 1),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) {
999       PetscInt i = t.league_rank(); /* row i */
1000       Kokkos::single(Kokkos::PerTeam(t), [=] () { /* Only one thread works in a team */
1001         PetscInt p,q = Yi(i);
1002         for (p=Xi(i); p<Xi(i+1); p++) { /* For each nonzero on row i of X */
1003           while (Xj(p) != Yj(q) && q < Yi(i+1)) q++; /* find the matching nonzero on row i of Y */
1004           if (Xj(p) == Yj(q)) { /* Found it */
1005             Ya(q) += alpha * Xa(p);
1006             q++;
1007           } else {
1008             /* If not found, it indicates the input is wrong (X is not a SUBSET_NONZERO_PATTERN of Y).
1009                Just insert a NaN at the beginning of row i if it is not empty, to make the result wrong.
1010             */
1011             if (Yi(i) != Yi(i+1)) Ya(Yi(i)) = Kokkos::Experimental::nan("1"); /* auto promote the double NaN if needed */
1012           }
1013         }
1014       });
1015     });
1016     PetscCall(MatSeqAIJKokkosModifyDevice(Y));
1017   } else { /* different nonzero patterns */
1018     Mat             Z;
1019     KokkosCsrMatrix zcsr;
1020     KernelHandle    kh;
1021     kh.create_spadd_handle(false);
1022     KokkosSparse::spadd_symbolic(&kh,xkok->csrmat,ykok->csrmat,zcsr);
1023     KokkosSparse::spadd_numeric(&kh,alpha,xkok->csrmat,(PetscScalar)1.0,ykok->csrmat,zcsr);
1024     zkok = new Mat_SeqAIJKokkos(zcsr);
1025     PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,zkok,&Z));
1026     PetscCall(MatHeaderReplace(Y,&Z));
1027     kh.destroy_spadd_handle();
1028   }
1029   PetscCall(PetscLogGpuTimeEnd());
1030   PetscCall(PetscLogGpuFlops(xkok->a_dual.extent(0)*2)); /* Because we scaled X and then added it to Y */
1031   PetscFunctionReturn(0);
1032 }
1033 
1034 static PetscErrorCode MatSetPreallocationCOO_SeqAIJKokkos(Mat mat, PetscCount coo_n, const PetscInt coo_i[], const PetscInt coo_j[])
1035 {
1036   Mat_SeqAIJKokkos *akok;
1037   Mat_SeqAIJ       *aseq;
1038 
1039   PetscFunctionBegin;
1040   PetscCall(MatSetPreallocationCOO_SeqAIJ(mat,coo_n,coo_i,coo_j));
1041   aseq = static_cast<Mat_SeqAIJ*>(mat->data);
1042   akok = static_cast<Mat_SeqAIJKokkos*>(mat->spptr);
1043   delete akok;
1044   mat->spptr = akok = new Mat_SeqAIJKokkos(mat->rmap->n,mat->cmap->n,aseq->nz,aseq->i,aseq->j,aseq->a,mat->nonzerostate+1,PETSC_FALSE);
1045   PetscCall(MatZeroEntries_SeqAIJKokkos(mat));
1046   akok->SetUpCOO(aseq);
1047   PetscFunctionReturn(0);
1048 }
1049 
1050 static PetscErrorCode MatSetValuesCOO_SeqAIJKokkos(Mat A,const PetscScalar v[],InsertMode imode)
1051 {
1052   Mat_SeqAIJ                  *aseq = static_cast<Mat_SeqAIJ*>(A->data);
1053   Mat_SeqAIJKokkos            *akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
1054   PetscCount                  Annz = aseq->nz;
1055   const PetscCountKokkosView& jmap = akok->jmap_d;
1056   const PetscCountKokkosView& perm = akok->perm_d;
1057   MatScalarKokkosView         Aa;
1058   ConstMatScalarKokkosView    kv;
1059   PetscMemType                memtype;
1060 
1061   PetscFunctionBegin;
1062   PetscCall(PetscGetMemType(v,&memtype));
1063   if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */
1064     kv = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(),ConstMatScalarKokkosViewHost(v,aseq->coo_n));
1065   } else {
1066     kv = ConstMatScalarKokkosView(v,aseq->coo_n); /* Directly use v[]'s memory */
1067   }
1068 
1069   if (imode == INSERT_VALUES) PetscCall(MatSeqAIJGetKokkosViewWrite(A,&Aa)); /* write matrix values */
1070   else PetscCall(MatSeqAIJGetKokkosView(A,&Aa)); /* read & write matrix values */
1071 
1072   Kokkos::parallel_for(Annz,KOKKOS_LAMBDA(const PetscCount i) {
1073     PetscScalar sum = 0.0;
1074     for (PetscCount k=jmap(i); k<jmap(i+1); k++) sum += kv(perm(k));
1075     Aa(i) = (imode == INSERT_VALUES? 0.0 : Aa(i)) + sum;
1076   });
1077 
1078   if (imode == INSERT_VALUES) PetscCall(MatSeqAIJRestoreKokkosViewWrite(A,&Aa));
1079   else PetscCall(MatSeqAIJRestoreKokkosView(A,&Aa));
1080   PetscFunctionReturn(0);
1081 }
1082 
1083 PETSC_INTERN PetscErrorCode MatSeqAIJMoveDiagonalValuesFront_SeqAIJKokkos(Mat A,const PetscInt *diag)
1084 {
1085   Mat_SeqAIJKokkos            *akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
1086   MatScalarKokkosView         Aa;
1087   const MatRowMapKokkosView&  Ai = akok->i_dual.view_device();
1088   PetscInt                    m = A->rmap->n;
1089   ConstMatRowMapKokkosView    Adiag(diag,m); /* diag is a device pointer */
1090 
1091   PetscFunctionBegin;
1092   PetscCall(MatSeqAIJGetKokkosViewWrite(A,&Aa));
1093   Kokkos::parallel_for(m,KOKKOS_LAMBDA(const PetscInt i) {
1094     PetscScalar tmp;
1095     if (Adiag(i) >= Ai(i) && Adiag(i) < Ai(i+1)) { /* The diagonal element exists */
1096       tmp          = Aa(Ai(i));
1097       Aa(Ai(i))    = Aa(Adiag(i));
1098       Aa(Adiag(i)) = tmp;
1099     }
1100   });
1101   PetscCall(MatSeqAIJRestoreKokkosViewWrite(A,&Aa));
1102   PetscFunctionReturn(0);
1103 }
1104 
1105 static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B,Mat A,const MatFactorInfo *info)
1106 {
1107   PetscFunctionBegin;
1108   PetscCall(MatSeqAIJKokkosSyncHost(A));
1109   PetscCall(MatLUFactorNumeric_SeqAIJ(B,A,info));
1110   B->offloadmask = PETSC_OFFLOAD_CPU;
1111   PetscFunctionReturn(0);
1112 }
1113 
1114 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A)
1115 {
1116   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
1117 
1118   PetscFunctionBegin;
1119   A->offloadmask = PETSC_OFFLOAD_KOKKOS; /* We do not really use this flag */
1120   A->boundtocpu  = PETSC_FALSE;
1121 
1122   A->ops->assemblyend               = MatAssemblyEnd_SeqAIJKokkos;
1123   A->ops->destroy                   = MatDestroy_SeqAIJKokkos;
1124   A->ops->duplicate                 = MatDuplicate_SeqAIJKokkos;
1125   A->ops->axpy                      = MatAXPY_SeqAIJKokkos;
1126   A->ops->scale                     = MatScale_SeqAIJKokkos;
1127   A->ops->zeroentries               = MatZeroEntries_SeqAIJKokkos;
1128   A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJKokkos;
1129   A->ops->mult                      = MatMult_SeqAIJKokkos;
1130   A->ops->multadd                   = MatMultAdd_SeqAIJKokkos;
1131   A->ops->multtranspose             = MatMultTranspose_SeqAIJKokkos;
1132   A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJKokkos;
1133   A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJKokkos;
1134   A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos;
1135   A->ops->productnumeric            = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos;
1136   A->ops->transpose                 = MatTranspose_SeqAIJKokkos;
1137   A->ops->setoption                 = MatSetOption_SeqAIJKokkos;
1138   a->ops->getarray                  = MatSeqAIJGetArray_SeqAIJKokkos;
1139   a->ops->restorearray              = MatSeqAIJRestoreArray_SeqAIJKokkos;
1140   a->ops->getarrayread              = MatSeqAIJGetArrayRead_SeqAIJKokkos;
1141   a->ops->restorearrayread          = MatSeqAIJRestoreArrayRead_SeqAIJKokkos;
1142   a->ops->getarraywrite             = MatSeqAIJGetArrayWrite_SeqAIJKokkos;
1143   a->ops->restorearraywrite         = MatSeqAIJRestoreArrayWrite_SeqAIJKokkos;
1144   a->ops->getcsrandmemtype          = MatSeqAIJGetCSRAndMemType_SeqAIJKokkos;
1145 
1146   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJKokkos));
1147   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_SeqAIJKokkos));
1148   PetscFunctionReturn(0);
1149 }
1150 
1151 PETSC_INTERN PetscErrorCode  MatSetSeqAIJKokkosWithCSRMatrix(Mat A,Mat_SeqAIJKokkos *akok)
1152 {
1153   Mat_SeqAIJ *aseq;
1154   PetscInt    i,m,n;
1155 
1156   PetscFunctionBegin;
1157   PetscCheck(!A->spptr,PETSC_COMM_SELF,PETSC_ERR_PLIB,"A->spptr is supposed to be empty");
1158 
1159   m = akok->nrows();
1160   n = akok->ncols();
1161   PetscCall(MatSetSizes(A,m,n,m,n));
1162   PetscCall(MatSetType(A,MATSEQAIJKOKKOS));
1163 
1164   /* Set up data structures of A as a MATSEQAIJ */
1165   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(A,MAT_SKIP_ALLOCATION,NULL));
1166   aseq = (Mat_SeqAIJ*)(A)->data;
1167 
1168   akok->i_dual.sync_host(); /* We always need sync'ed i, j on host */
1169   akok->j_dual.sync_host();
1170 
1171   aseq->i            = akok->i_host_data();
1172   aseq->j            = akok->j_host_data();
1173   aseq->a            = akok->a_host_data();
1174   aseq->nonew        = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/
1175   aseq->singlemalloc = PETSC_FALSE;
1176   aseq->free_a       = PETSC_FALSE;
1177   aseq->free_ij      = PETSC_FALSE;
1178   aseq->nz           = akok->nnz();
1179   aseq->maxnz        = aseq->nz;
1180 
1181   PetscCall(PetscMalloc1(m,&aseq->imax));
1182   PetscCall(PetscMalloc1(m,&aseq->ilen));
1183   for (i=0; i<m; i++) {
1184     aseq->ilen[i] = aseq->imax[i] = aseq->i[i+1] - aseq->i[i];
1185   }
1186 
1187   /* It is critical to set the nonzerostate, as we use it to check if sparsity pattern (hence data) has changed on host in MatAssemblyEnd */
1188   akok->nonzerostate = A->nonzerostate;
1189   A->spptr = akok; /* Set A->spptr before MatAssembly so that A->spptr won't be allocated again there */
1190   PetscCall(MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY));
1191   PetscCall(MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY));
1192   PetscFunctionReturn(0);
1193 }
1194 
1195 /* Crete a SEQAIJKOKKOS matrix with a Mat_SeqAIJKokkos data structure
1196 
1197    Note we have names like MatSeqAIJSetPreallocationCSR, so I use capitalized CSR
1198  */
1199 PETSC_INTERN PetscErrorCode  MatCreateSeqAIJKokkosWithCSRMatrix(MPI_Comm comm,Mat_SeqAIJKokkos *akok,Mat *A)
1200 {
1201   PetscFunctionBegin;
1202   PetscCall(MatCreate(comm,A));
1203   PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A,akok));
1204   PetscFunctionReturn(0);
1205 }
1206 
1207 /* --------------------------------------------------------------------------------*/
1208 /*@C
1209    MatCreateSeqAIJKokkos - Creates a sparse matrix in AIJ (compressed row) format
1210    (the default parallel PETSc format). This matrix will ultimately be handled by
1211    Kokkos for calculations. For good matrix
1212    assembly performance the user should preallocate the matrix storage by setting
1213    the parameter nz (or the array nnz).  By setting these parameters accurately,
1214    performance during matrix assembly can be increased by more than a factor of 50.
1215 
1216    Collective
1217 
1218    Input Parameters:
1219 +  comm - MPI communicator, set to PETSC_COMM_SELF
1220 .  m - number of rows
1221 .  n - number of columns
1222 .  nz - number of nonzeros per row (same for all rows)
1223 -  nnz - array containing the number of nonzeros in the various rows
1224          (possibly different for each row) or NULL
1225 
1226    Output Parameter:
1227 .  A - the matrix
1228 
1229    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
1230    MatXXXXSetPreallocation() paradgm instead of this routine directly.
1231    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
1232 
1233    Notes:
1234    If nnz is given then nz is ignored
1235 
1236    The AIJ format (also called the Yale sparse matrix format or
1237    compressed row storage), is fully compatible with standard Fortran 77
1238    storage.  That is, the stored row and column indices can begin at
1239    either one (as in Fortran) or zero.  See the users' manual for details.
1240 
1241    Specify the preallocated storage with either nz or nnz (not both).
1242    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
1243    allocation.  For large problems you MUST preallocate memory or you
1244    will get TERRIBLE performance, see the users' manual chapter on matrices.
1245 
1246    By default, this format uses inodes (identical nodes) when possible, to
1247    improve numerical efficiency of matrix-vector products and solves. We
1248    search for consecutive rows with the same nonzero structure, thereby
1249    reusing matrix information to achieve increased efficiency.
1250 
1251    Level: intermediate
1252 
1253 .seealso: `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MatCreateAIJ()`
1254 @*/
1255 PetscErrorCode  MatCreateSeqAIJKokkos(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
1256 {
1257   PetscFunctionBegin;
1258   PetscCall(PetscKokkosInitializeCheck());
1259   PetscCall(MatCreate(comm,A));
1260   PetscCall(MatSetSizes(*A,m,n,m,n));
1261   PetscCall(MatSetType(*A,MATSEQAIJKOKKOS));
1262   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz));
1263   PetscFunctionReturn(0);
1264 }
1265 
1266 typedef Kokkos::TeamPolicy<>::member_type team_member;
1267 //
1268 // This factorization exploits block diagonal matrices with "Nf" (not used).
1269 // Use -pc_factor_mat_ordering_type rcm to order decouple blocks of size N/Nf for this optimization
1270 //
1271 static PetscErrorCode MatLUFactorNumeric_SeqAIJKOKKOSDEVICE(Mat B,Mat A,const MatFactorInfo *info)
1272 {
1273   Mat_SeqAIJ         *b=(Mat_SeqAIJ*)B->data;
1274   Mat_SeqAIJKokkos   *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr), *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
1275   IS                 isrow = b->row,isicol = b->icol;
1276   const PetscInt     *r_h,*ic_h;
1277   const PetscInt     n=A->rmap->n, *ai_d=aijkok->i_dual.view_device().data(), *aj_d=aijkok->j_dual.view_device().data(), *bi_d=baijkok->i_dual.view_device().data(), *bj_d=baijkok->j_dual.view_device().data(), *bdiag_d = baijkok->diag_d.data();
1278   const PetscScalar  *aa_d = aijkok->a_dual.view_device().data();
1279   PetscScalar        *ba_d = baijkok->a_dual.view_device().data();
1280   PetscBool          row_identity,col_identity;
1281   PetscInt           nc, Nf=1, nVec=32; // should be a parameter, Nf is batch size - not used
1282 
1283   PetscFunctionBegin;
1284   PetscCheck(A->rmap->n == n,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"square matrices only supported %" PetscInt_FMT " %" PetscInt_FMT,A->rmap->n,n);
1285   PetscCall(MatGetOption(A,MAT_STRUCTURALLY_SYMMETRIC,&row_identity));
1286   PetscCheck(row_identity,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"structurally symmetric matrices only supported");
1287   PetscCall(ISGetIndices(isrow,&r_h));
1288   PetscCall(ISGetIndices(isicol,&ic_h));
1289   PetscCall(ISGetSize(isicol,&nc));
1290   PetscCall(PetscLogGpuTimeBegin());
1291   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1292   {
1293 #define KOKKOS_SHARED_LEVEL 1
1294     using scr_mem_t = Kokkos::DefaultExecutionSpace::scratch_memory_space;
1295     using sizet_scr_t = Kokkos::View<size_t, scr_mem_t>;
1296     using scalar_scr_t = Kokkos::View<PetscScalar, scr_mem_t>;
1297     const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_r_k (r_h, n);
1298     Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_r_k ("r", n);
1299     const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_ic_k (ic_h, nc);
1300     Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_ic_k ("ic", nc);
1301     size_t flops_h = 0.0;
1302     Kokkos::View<size_t, Kokkos::HostSpace> h_flops_k (&flops_h);
1303     Kokkos::View<size_t> d_flops_k ("flops");
1304     const int conc = Kokkos::DefaultExecutionSpace().concurrency(), team_size = conc > 1 ? 16 : 1; // 8*32 = 256
1305     const int nloc = n/Nf, Ni = (conc > 8) ? 1 /* some intelegent number of SMs -- but need league_barrier */ : 1;
1306     Kokkos::deep_copy (d_flops_k, h_flops_k);
1307     Kokkos::deep_copy (d_r_k, h_r_k);
1308     Kokkos::deep_copy (d_ic_k, h_ic_k);
1309     // Fill A --> fact
1310     Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec), KOKKOS_LAMBDA (const team_member team) {
1311         const PetscInt  field = team.league_rank()/Ni, field_block = team.league_rank()%Ni; // use grid.x/y in CUDA
1312         const PetscInt  nloc_i =  (nloc/Ni + !!(nloc%Ni)), start_i = field*nloc + field_block*nloc_i, end_i = (start_i + nloc_i) > (field+1)*nloc ? (field+1)*nloc : (start_i + nloc_i);
1313         const PetscInt  *ic = d_ic_k.data(), *r = d_r_k.data();
1314         // zero rows of B
1315         Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) {
1316             PetscInt    nzbL = bi_d[rowb+1] - bi_d[rowb], nzbU = bdiag_d[rowb] - bdiag_d[rowb+1]; // with diag
1317             PetscScalar *baL = ba_d + bi_d[rowb];
1318             PetscScalar *baU = ba_d + bdiag_d[rowb+1]+1;
1319             /* zero (unfactored row) */
1320             for (int j=0;j<nzbL;j++) baL[j] = 0;
1321             for (int j=0;j<nzbU;j++) baU[j] = 0;
1322           });
1323         // copy A into B
1324         Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) {
1325             PetscInt          rowa = r[rowb], nza = ai_d[rowa+1] - ai_d[rowa];
1326             const PetscScalar *av    = aa_d + ai_d[rowa];
1327             const PetscInt    *ajtmp = aj_d + ai_d[rowa];
1328             /* load in initial (unfactored row) */
1329             for (int j=0;j<nza;j++) {
1330               PetscInt    colb = ic[ajtmp[j]];
1331               PetscScalar vala = av[j];
1332               if (colb == rowb) {
1333                 *(ba_d + bdiag_d[rowb]) = vala;
1334               } else {
1335                 const PetscInt    *pbj = bj_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]);
1336                 PetscScalar       *pba = ba_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]);
1337                 PetscInt          nz   = (colb > rowb) ? bdiag_d[rowb] - (bdiag_d[rowb+1]+1) : bi_d[rowb+1] - bi_d[rowb], set=0;
1338                 for (int j=0; j<nz ; j++) {
1339                   if (pbj[j] == colb) {
1340                     pba[j] = vala;
1341                     set++;
1342                     break;
1343                   }
1344                 }
1345                #if !defined(PETSC_HAVE_SYCL)
1346                 if (set!=1) printf("\t\t\t ERROR DID NOT SET ?????\n");
1347                #endif
1348               }
1349             }
1350           });
1351       });
1352     Kokkos::fence();
1353 
1354     Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec).set_scratch_size(KOKKOS_SHARED_LEVEL, Kokkos::PerThread(sizet_scr_t::shmem_size()+scalar_scr_t::shmem_size()), Kokkos::PerTeam(sizet_scr_t::shmem_size())), KOKKOS_LAMBDA (const team_member team) {
1355         sizet_scr_t     colkIdx(team.thread_scratch(KOKKOS_SHARED_LEVEL));
1356         scalar_scr_t    L_ki(team.thread_scratch(KOKKOS_SHARED_LEVEL));
1357         sizet_scr_t     flops(team.team_scratch(KOKKOS_SHARED_LEVEL));
1358         const PetscInt  field = team.league_rank()/Ni, field_block_idx = team.league_rank()%Ni; // use grid.x/y in CUDA
1359         const PetscInt  start = field*nloc, end = start + nloc;
1360         Kokkos::single(Kokkos::PerTeam(team), [=]() { flops() = 0; });
1361         // A22 panel update for each row A(1,:) and col A(:,1)
1362         for (int ii=start; ii<end-1; ii++) {
1363           const PetscInt    *bjUi = bj_d + bdiag_d[ii+1]+1, nzUi = bdiag_d[ii] - (bdiag_d[ii+1]+1); // vector, and vector size, of column indices of U(i,(i+1):end)
1364           const PetscScalar *baUi = ba_d + bdiag_d[ii+1]+1; // vector of data  U(i,i+1:end)
1365           const PetscInt    nUi_its = nzUi/Ni + !!(nzUi%Ni);
1366           const PetscScalar Bii = *(ba_d + bdiag_d[ii]); // diagonal in its special place
1367           Kokkos::parallel_for(Kokkos::TeamThreadRange(team, nUi_its), [=] (const int j) {
1368               PetscInt kIdx = j*Ni + field_block_idx;
1369               if (kIdx >= nzUi) /* void */ ;
1370               else {
1371                 const PetscInt myk  = bjUi[kIdx]; // assume symmetric structure, need a transposed meta-data here in general
1372                 const PetscInt *pjL = bj_d + bi_d[myk]; // look for L(myk,ii) in start of row
1373                 const PetscInt nzL  = bi_d[myk+1] - bi_d[myk]; // size of L_k(:)
1374                 size_t         st_idx;
1375                 // find and do L(k,i) = A(:k,i) / A(i,i)
1376                 Kokkos::single(Kokkos::PerThread(team), [&]() { colkIdx() = PETSC_MAX_INT; });
1377                 // get column, there has got to be a better way
1378                 Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,nzL), [&] (const int &j, size_t &idx) {
1379                     if (pjL[j] == ii) {
1380                       PetscScalar *pLki = ba_d + bi_d[myk] + j;
1381                       idx = j; // output
1382                       *pLki = *pLki/Bii; // column scaling:  L(k,i) = A(:k,i) / A(i,i)
1383                     }
1384                 }, st_idx);
1385                 Kokkos::single(Kokkos::PerThread(team), [=]() { colkIdx() = st_idx; L_ki() = *(ba_d + bi_d[myk] + st_idx); });
1386 #if defined(PETSC_USE_DEBUG) && !defined(PETSC_HAVE_SYCL)
1387                 if (colkIdx() == PETSC_MAX_INT) printf("\t\t\t\t\t\t\tERROR: failed to find L_ki(%d,%d)\n",(int)myk,ii); // uses a register
1388 #endif
1389                 // active row k, do  A_kj -= Lki * U_ij; j \in U(i,:) j != i
1390                 // U(i+1,:end)
1391                 Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,nzUi), [=] (const int &uiIdx) { // index into i (U)
1392                       PetscScalar Uij = baUi[uiIdx];
1393                       PetscInt    col = bjUi[uiIdx];
1394                       if (col==myk) {
1395                         // A_kk = A_kk - L_ki * U_ij(k)
1396                         PetscScalar *Akkv = (ba_d + bdiag_d[myk]); // diagonal in its special place
1397                         *Akkv = *Akkv - L_ki() * Uij; // UiK
1398                       } else {
1399                         PetscScalar    *start, *end, *pAkjv=NULL;
1400                         PetscInt       high, low;
1401                         const PetscInt *startj;
1402                         if (col<myk) { // L
1403                           PetscScalar *pLki = ba_d + bi_d[myk] + colkIdx();
1404                           PetscInt idx = (pLki+1) - (ba_d + bi_d[myk]); // index into row
1405                           start = pLki+1; // start at pLki+1, A22(myk,1)
1406                           startj= bj_d + bi_d[myk] + idx;
1407                           end   = ba_d + bi_d[myk+1];
1408                         } else {
1409                           PetscInt idx = bdiag_d[myk+1]+1;
1410                           start = ba_d + idx;
1411                           startj= bj_d + idx;
1412                           end   = ba_d + bdiag_d[myk];
1413                         }
1414                         // search for 'col', use bisection search - TODO
1415                         low  = 0;
1416                         high = (PetscInt)(end-start);
1417                         while (high-low > 5) {
1418                           int t = (low+high)/2;
1419                           if (startj[t] > col) high = t;
1420                           else                 low  = t;
1421                         }
1422                         for (pAkjv=start+low; pAkjv<start+high; pAkjv++) {
1423                           if (startj[pAkjv-start] == col) break;
1424                         }
1425 #if defined(PETSC_USE_DEBUG) && !defined(PETSC_HAVE_SYCL)
1426                         if (pAkjv==start+high) printf("\t\t\t\t\t\t\t\t\t\t\tERROR: *** failed to find Akj(%d,%d)\n",(int)myk,(int)col); // uses a register
1427 #endif
1428                         *pAkjv = *pAkjv - L_ki() * Uij; // A_kj = A_kj - L_ki * U_ij
1429                       }
1430                     });
1431               }
1432             });
1433           team.team_barrier(); // this needs to be a league barrier to use more that one SM per block
1434           if (field_block_idx==0) Kokkos::single(Kokkos::PerTeam(team), [&]() { Kokkos::atomic_add( flops.data(), (size_t)(2*(nzUi*nzUi)+2)); });
1435         } /* endof for (i=0; i<n; i++) { */
1436         Kokkos::single(Kokkos::PerTeam(team), [=]() { Kokkos::atomic_add( &d_flops_k(), flops()); flops() = 0; });
1437       });
1438     Kokkos::fence();
1439     Kokkos::deep_copy (h_flops_k, d_flops_k);
1440     PetscCall(PetscLogGpuFlops((PetscLogDouble)h_flops_k()));
1441     Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, 1, 256), KOKKOS_LAMBDA (const team_member team) {
1442         const PetscInt  lg_rank = team.league_rank(), field = lg_rank/Ni; //, field_offset = lg_rank%Ni;
1443         const PetscInt  start = field*nloc, end = start + nloc, n_its = (nloc/Ni + !!(nloc%Ni)); // 1/Ni iters
1444         /* Invert diagonal for simpler triangular solves */
1445         Kokkos::parallel_for(Kokkos::TeamVectorRange(team, n_its), [=] (int outer_index) {
1446             int i = start + outer_index*Ni + lg_rank%Ni;
1447             if (i < end) {
1448               PetscScalar *pv = ba_d + bdiag_d[i];
1449               *pv = 1.0/(*pv);
1450             }
1451           });
1452       });
1453   }
1454   PetscCall(PetscLogGpuTimeEnd());
1455   PetscCall(ISRestoreIndices(isicol,&ic_h));
1456   PetscCall(ISRestoreIndices(isrow,&r_h));
1457 
1458   PetscCall(ISIdentity(isrow,&row_identity));
1459   PetscCall(ISIdentity(isicol,&col_identity));
1460   if (b->inode.size) {
1461     B->ops->solve = MatSolve_SeqAIJ_Inode;
1462   } else if (row_identity && col_identity) {
1463     B->ops->solve = MatSolve_SeqAIJ_NaturalOrdering;
1464   } else {
1465     B->ops->solve = MatSolve_SeqAIJ; // at least this needs to be in Kokkos
1466   }
1467   B->offloadmask = PETSC_OFFLOAD_GPU;
1468   PetscCall(MatSeqAIJKokkosSyncHost(B)); // solve on CPU
1469   B->ops->solveadd          = MatSolveAdd_SeqAIJ; // and this
1470   B->ops->solvetranspose    = MatSolveTranspose_SeqAIJ;
1471   B->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ;
1472   B->ops->matsolve          = MatMatSolve_SeqAIJ;
1473   B->assembled              = PETSC_TRUE;
1474   B->preallocated           = PETSC_TRUE;
1475 
1476   PetscFunctionReturn(0);
1477 }
1478 
1479 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
1480 {
1481   PetscFunctionBegin;
1482   PetscCall(MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info));
1483   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos;
1484   PetscFunctionReturn(0);
1485 }
1486 
1487 static PetscErrorCode MatSeqAIJKokkosSymbolicSolveCheck(Mat A)
1488 {
1489   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr;
1490 
1491   PetscFunctionBegin;
1492   if (!factors->sptrsv_symbolic_completed) {
1493     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khU,factors->iU_d,factors->jU_d,factors->aU_d);
1494     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khL,factors->iL_d,factors->jL_d,factors->aL_d);
1495     factors->sptrsv_symbolic_completed = PETSC_TRUE;
1496   }
1497   PetscFunctionReturn(0);
1498 }
1499 
1500 /* Check if we need to update factors etc for transpose solve */
1501 static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A)
1502 {
1503   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr;
1504   MatColIdxType             n = A->rmap->n;
1505 
1506   PetscFunctionBegin;
1507   if (!factors->transpose_updated) { /* TODO: KK needs to provide functions to do numeric transpose only */
1508     /* Update L^T and do sptrsv symbolic */
1509     factors->iLt_d = MatRowMapKokkosView("factors->iLt_d",n+1);
1510     Kokkos::deep_copy(factors->iLt_d,0); /* KK requires 0 */
1511     factors->jLt_d = MatColIdxKokkosView("factors->jLt_d",factors->jL_d.extent(0));
1512     factors->aLt_d = MatScalarKokkosView("factors->aLt_d",factors->aL_d.extent(0));
1513 
1514     KokkosKernels::Impl::transpose_matrix<
1515       ConstMatRowMapKokkosView,ConstMatColIdxKokkosView,ConstMatScalarKokkosView,
1516       MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView,
1517       MatRowMapKokkosView,DefaultExecutionSpace>(
1518         n,n,factors->iL_d,factors->jL_d,factors->aL_d,
1519         factors->iLt_d,factors->jLt_d,factors->aLt_d);
1520 
1521     /* TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices.
1522       We have to sort the indices, until KK provides finer control options.
1523     */
1524     KokkosKernels::sort_crs_matrix<DefaultExecutionSpace,
1525       MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView>(
1526         factors->iLt_d,factors->jLt_d,factors->aLt_d);
1527 
1528     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khLt,factors->iLt_d,factors->jLt_d,factors->aLt_d);
1529 
1530     /* Update U^T and do sptrsv symbolic */
1531     factors->iUt_d = MatRowMapKokkosView("factors->iUt_d",n+1);
1532     Kokkos::deep_copy(factors->iUt_d,0); /* KK requires 0 */
1533     factors->jUt_d = MatColIdxKokkosView("factors->jUt_d",factors->jU_d.extent(0));
1534     factors->aUt_d = MatScalarKokkosView("factors->aUt_d",factors->aU_d.extent(0));
1535 
1536     KokkosKernels::Impl::transpose_matrix<
1537       ConstMatRowMapKokkosView,ConstMatColIdxKokkosView,ConstMatScalarKokkosView,
1538       MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView,
1539       MatRowMapKokkosView,DefaultExecutionSpace>(
1540         n,n,factors->iU_d, factors->jU_d, factors->aU_d,
1541         factors->iUt_d,factors->jUt_d,factors->aUt_d);
1542 
1543     /* Sort indices. See comments above */
1544     KokkosKernels::sort_crs_matrix<DefaultExecutionSpace,
1545       MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView>(
1546         factors->iUt_d,factors->jUt_d,factors->aUt_d);
1547 
1548     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khUt,factors->iUt_d,factors->jUt_d,factors->aUt_d);
1549     factors->transpose_updated = PETSC_TRUE;
1550   }
1551   PetscFunctionReturn(0);
1552 }
1553 
1554 /* Solve Ax = b, with A = LU */
1555 static PetscErrorCode MatSolve_SeqAIJKokkos(Mat A,Vec b,Vec x)
1556 {
1557   ConstPetscScalarKokkosView     bv;
1558   PetscScalarKokkosView          xv;
1559   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr;
1560 
1561   PetscFunctionBegin;
1562   PetscCall(PetscLogGpuTimeBegin());
1563   PetscCall(MatSeqAIJKokkosSymbolicSolveCheck(A));
1564   PetscCall(VecGetKokkosView(b,&bv));
1565   PetscCall(VecGetKokkosViewWrite(x,&xv));
1566   /* Solve L tmpv = b */
1567   PetscCallCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khL,factors->iL_d,factors->jL_d,factors->aL_d,bv,factors->workVector));
1568   /* Solve Ux = tmpv */
1569   PetscCallCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khU,factors->iU_d,factors->jU_d,factors->aU_d,factors->workVector,xv));
1570   PetscCall(VecRestoreKokkosView(b,&bv));
1571   PetscCall(VecRestoreKokkosViewWrite(x,&xv));
1572   PetscCall(PetscLogGpuTimeEnd());
1573   PetscFunctionReturn(0);
1574 }
1575 
1576 /* Solve A^T x = b, where A^T = U^T L^T */
1577 static PetscErrorCode MatSolveTranspose_SeqAIJKokkos(Mat A,Vec b,Vec x)
1578 {
1579   ConstPetscScalarKokkosView     bv;
1580   PetscScalarKokkosView          xv;
1581   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr;
1582 
1583   PetscFunctionBegin;
1584   PetscCall(PetscLogGpuTimeBegin());
1585   PetscCall(MatSeqAIJKokkosTransposeSolveCheck(A));
1586   PetscCall(VecGetKokkosView(b,&bv));
1587   PetscCall(VecGetKokkosViewWrite(x,&xv));
1588   /* Solve U^T tmpv = b */
1589   KokkosSparse::Experimental::sptrsv_solve(&factors->khUt,factors->iUt_d,factors->jUt_d,factors->aUt_d,bv,factors->workVector);
1590 
1591   /* Solve L^T x = tmpv */
1592   KokkosSparse::Experimental::sptrsv_solve(&factors->khLt,factors->iLt_d,factors->jLt_d,factors->aLt_d,factors->workVector,xv);
1593   PetscCall(VecRestoreKokkosView(b,&bv));
1594   PetscCall(VecRestoreKokkosViewWrite(x,&xv));
1595   PetscCall(PetscLogGpuTimeEnd());
1596   PetscFunctionReturn(0);
1597 }
1598 
1599 static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos(Mat B,Mat A,const MatFactorInfo *info)
1600 {
1601   Mat_SeqAIJKokkos               *aijkok = (Mat_SeqAIJKokkos*)A->spptr;
1602   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)B->spptr;
1603   PetscInt                       fill_lev = info->levels;
1604 
1605   PetscFunctionBegin;
1606   PetscCall(PetscLogGpuTimeBegin());
1607   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1608 
1609   auto a_d = aijkok->a_dual.view_device();
1610   auto i_d = aijkok->i_dual.view_device();
1611   auto j_d = aijkok->j_dual.view_device();
1612 
1613   KokkosSparse::Experimental::spiluk_numeric(&factors->kh,fill_lev,i_d,j_d,a_d,factors->iL_d,factors->jL_d,factors->aL_d,factors->iU_d,factors->jU_d,factors->aU_d);
1614 
1615   B->assembled                       = PETSC_TRUE;
1616   B->preallocated                    = PETSC_TRUE;
1617   B->ops->solve                      = MatSolve_SeqAIJKokkos;
1618   B->ops->solvetranspose             = MatSolveTranspose_SeqAIJKokkos;
1619   B->ops->matsolve                   = NULL;
1620   B->ops->matsolvetranspose          = NULL;
1621   B->offloadmask                     = PETSC_OFFLOAD_GPU;
1622 
1623   /* Once the factors' value changed, we need to update their transpose and sptrsv handle */
1624   factors->transpose_updated         = PETSC_FALSE;
1625   factors->sptrsv_symbolic_completed = PETSC_FALSE;
1626   /* TODO: log flops, but how to know that? */
1627   PetscCall(PetscLogGpuTimeEnd());
1628   PetscFunctionReturn(0);
1629 }
1630 
1631 static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
1632 {
1633   Mat_SeqAIJKokkos               *aijkok;
1634   Mat_SeqAIJ                     *b;
1635   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)B->spptr;
1636   PetscInt                       fill_lev = info->levels;
1637   PetscInt                       nnzA = ((Mat_SeqAIJ*)A->data)->nz,nnzL,nnzU;
1638   PetscInt                       n = A->rmap->n;
1639 
1640   PetscFunctionBegin;
1641   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1642   /* Rebuild factors */
1643   if (factors) {factors->Destroy();} /* Destroy the old if it exists */
1644   else {B->spptr = factors = new Mat_SeqAIJKokkosTriFactors(n);}
1645 
1646   /* Create a spiluk handle and then do symbolic factorization */
1647   nnzL = nnzU = PetscRealIntMultTruncate(info->fill,nnzA);
1648   factors->kh.create_spiluk_handle(KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1,n,nnzL,nnzU);
1649 
1650   auto spiluk_handle = factors->kh.get_spiluk_handle();
1651 
1652   Kokkos::realloc(factors->iL_d,n+1); /* Free old arrays and realloc */
1653   Kokkos::realloc(factors->jL_d,spiluk_handle->get_nnzL());
1654   Kokkos::realloc(factors->iU_d,n+1);
1655   Kokkos::realloc(factors->jU_d,spiluk_handle->get_nnzU());
1656 
1657   aijkok = (Mat_SeqAIJKokkos*)A->spptr;
1658   auto i_d = aijkok->i_dual.view_device();
1659   auto j_d = aijkok->j_dual.view_device();
1660   KokkosSparse::Experimental::spiluk_symbolic(&factors->kh,fill_lev,i_d,j_d,factors->iL_d,factors->jL_d,factors->iU_d,factors->jU_d);
1661   /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */
1662 
1663   Kokkos::resize (factors->jL_d,spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */
1664   Kokkos::resize (factors->jU_d,spiluk_handle->get_nnzU());
1665   Kokkos::realloc(factors->aL_d,spiluk_handle->get_nnzL()); /* No need to retain old value */
1666   Kokkos::realloc(factors->aU_d,spiluk_handle->get_nnzU());
1667 
1668   /* TODO: add options to select sptrsv algorithms */
1669   /* Create sptrsv handles for L, U and their transpose */
1670  #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
1671   auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SPTRSV_CUSPARSE;
1672  #else
1673   auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1;
1674  #endif
1675 
1676   factors->khL.create_sptrsv_handle(sptrsv_alg,n,true/* L is lower tri */);
1677   factors->khU.create_sptrsv_handle(sptrsv_alg,n,false/* U is not lower tri */);
1678   factors->khLt.create_sptrsv_handle(sptrsv_alg,n,false/* L^T is not lower tri */);
1679   factors->khUt.create_sptrsv_handle(sptrsv_alg,n,true/* U^T is lower tri */);
1680 
1681   /* Fill fields of the factor matrix B */
1682   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(B,MAT_SKIP_ALLOCATION,NULL));
1683   b     = (Mat_SeqAIJ*)B->data;
1684   b->nz = b->maxnz = spiluk_handle->get_nnzL()+spiluk_handle->get_nnzU();
1685   B->info.fill_ratio_given  = info->fill;
1686   B->info.fill_ratio_needed = ((PetscReal)b->nz)/((PetscReal)nnzA);
1687 
1688   B->offloadmask            = PETSC_OFFLOAD_GPU;
1689   B->ops->lufactornumeric   = MatILUFactorNumeric_SeqAIJKokkos;
1690   PetscFunctionReturn(0);
1691 }
1692 
1693 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
1694 {
1695   Mat_SeqAIJ       *b=(Mat_SeqAIJ*)B->data;
1696   const PetscInt   nrows   = A->rmap->n;
1697 
1698   PetscFunctionBegin;
1699   PetscCall(MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info));
1700   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKOKKOSDEVICE;
1701   // move B data into Kokkos
1702   PetscCall(MatSeqAIJKokkosSyncDevice(B)); // create aijkok
1703   PetscCall(MatSeqAIJKokkosSyncDevice(A)); // create aijkok
1704   {
1705     Mat_SeqAIJKokkos *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
1706     if (!baijkok->diag_d.extent(0)) {
1707       const Kokkos::View<PetscInt*, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_diag (b->diag,nrows+1);
1708       baijkok->diag_d = Kokkos::View<PetscInt*>(Kokkos::create_mirror(DefaultMemorySpace(),h_diag));
1709       Kokkos::deep_copy (baijkok->diag_d, h_diag);
1710     }
1711   }
1712   PetscFunctionReturn(0);
1713 }
1714 
1715 static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos(Mat A,MatSolverType *type)
1716 {
1717   PetscFunctionBegin;
1718   *type = MATSOLVERKOKKOS;
1719   PetscFunctionReturn(0);
1720 }
1721 
1722 static PetscErrorCode MatFactorGetSolverType_seqaij_kokkos_device(Mat A,MatSolverType *type)
1723 {
1724   PetscFunctionBegin;
1725   *type = MATSOLVERKOKKOSDEVICE;
1726   PetscFunctionReturn(0);
1727 }
1728 
1729 /*MC
1730   MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices
1731   on a single GPU of type, SeqAIJKokkos, AIJKokkos.
1732 
1733   Level: beginner
1734 
1735 .seealso: `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJKokkos()`, `MATAIJKOKKOS`, `MatKokkosSetFormat()`, `MatKokkosStorageFormat`, `MatKokkosFormatOperation`
1736 M*/
1737 PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A,MatFactorType ftype,Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */
1738 {
1739   PetscInt       n = A->rmap->n;
1740 
1741   PetscFunctionBegin;
1742   PetscCall(MatCreate(PetscObjectComm((PetscObject)A),B));
1743   PetscCall(MatSetSizes(*B,n,n,n,n));
1744   (*B)->factortype = ftype;
1745   PetscCall(PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]));
1746   PetscCall(MatSetType(*B,MATSEQAIJKOKKOS));
1747 
1748   if (ftype == MAT_FACTOR_LU) {
1749     PetscCall(MatSetBlockSizesFromMats(*B,A,A));
1750     (*B)->canuseordering         = PETSC_TRUE;
1751     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJKokkos;
1752   } else if (ftype == MAT_FACTOR_ILU) {
1753     PetscCall(MatSetBlockSizesFromMats(*B,A,A));
1754     (*B)->canuseordering         = PETSC_FALSE;
1755     (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos;
1756   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]);
1757 
1758   PetscCall(MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL));
1759   PetscCall(PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_SeqAIJKokkos));
1760   PetscFunctionReturn(0);
1761 }
1762 
1763 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijkokkos_kokkos_device(Mat A,MatFactorType ftype,Mat *B)
1764 {
1765   PetscInt       n = A->rmap->n;
1766 
1767   PetscFunctionBegin;
1768   PetscCall(MatCreate(PetscObjectComm((PetscObject)A),B));
1769   PetscCall(MatSetSizes(*B,n,n,n,n));
1770   (*B)->factortype = ftype;
1771   (*B)->canuseordering = PETSC_TRUE;
1772   PetscCall(PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]));
1773   PetscCall(MatSetType(*B,MATSEQAIJKOKKOS));
1774 
1775   if (ftype == MAT_FACTOR_LU) {
1776     PetscCall(MatSetBlockSizesFromMats(*B,A,A));
1777     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE;
1778   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for KOKKOS Matrix Types");
1779 
1780   PetscCall(MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL));
1781   PetscCall(PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_kokkos_device));
1782   PetscFunctionReturn(0);
1783 }
1784 
1785 PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_KOKKOS(void)
1786 {
1787   PetscFunctionBegin;
1788   PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_SeqAIJKokkos_Kokkos));
1789   PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_ILU,MatGetFactor_SeqAIJKokkos_Kokkos));
1790   PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOSDEVICE,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_seqaijkokkos_kokkos_device));
1791   PetscFunctionReturn(0);
1792 }
1793 
1794 /* Utility to print out a KokkosCsrMatrix for debugging */
1795 PETSC_INTERN PetscErrorCode PrintCsrMatrix(const KokkosCsrMatrix& csrmat)
1796 {
1797   const auto&       iv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.graph.row_map);
1798   const auto&       jv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.graph.entries);
1799   const auto&       av = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.values);
1800   const PetscInt    *i = iv.data();
1801   const PetscInt    *j = jv.data();
1802   const PetscScalar *a = av.data();
1803   PetscInt          m = csrmat.numRows(),n = csrmat.numCols(),nnz = csrmat.nnz();
1804 
1805   PetscFunctionBegin;
1806   PetscCall(PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT " x %" PetscInt_FMT " SeqAIJKokkos, with %" PetscInt_FMT " nonzeros\n",m,n,nnz));
1807   for (PetscInt k=0; k<m; k++) {
1808     PetscCall(PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT ": ",k));
1809     for (PetscInt p=i[k]; p<i[k+1]; p++) {
1810       PetscCall(PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT "(%.1f), ",j[p],(double)PetscRealPart(a[p])));
1811     }
1812     PetscCall(PetscPrintf(PETSC_COMM_SELF,"\n"));
1813   }
1814   PetscFunctionReturn(0);
1815 }
1816