xref: /petsc/src/mat/impls/aij/seq/kokkos/aijkok.kokkos.cxx (revision a4158a15ad3e0d6d4d2df436db35861ca309dc5f)
1 #include <petscvec_kokkos.hpp>
2 #include <petscpkg_version.h>
3 #include <petsc/private/petscimpl.h>
4 #include <petsc/private/sfimpl.h>
5 #include <petscsystypes.h>
6 #include <petscerror.h>
7 
8 #include <Kokkos_Core.hpp>
9 #include <KokkosBlas.hpp>
10 #include <KokkosKernels_Sorting.hpp>
11 #include <KokkosSparse_CrsMatrix.hpp>
12 #include <KokkosSparse_spmv.hpp>
13 #include <KokkosSparse_spiluk.hpp>
14 #include <KokkosSparse_sptrsv.hpp>
15 #include <KokkosSparse_spgemm.hpp>
16 #include <KokkosSparse_spadd.hpp>
17 
18 #include <../src/mat/impls/aij/seq/kokkos/aijkok.hpp>
19 
20 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */
21 
22 /* MatAssemblyEnd_SeqAIJKokkos() happens when we finalized nonzeros of the matrix, either after
23    we assembled the matrix on host, or after we directly produced the matrix data on device (ex., through MatMatMult).
24    In the latter case, it is important to set a_dual's sync state correctly.
25  */
26 static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A,MatAssemblyType mode)
27 {
28   Mat_SeqAIJ       *aijseq;
29   Mat_SeqAIJKokkos *aijkok;
30 
31   PetscFunctionBegin;
32   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0);
33   PetscCall(MatAssemblyEnd_SeqAIJ(A,mode));
34 
35   aijseq = static_cast<Mat_SeqAIJ*>(A->data);
36   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
37 
38   /* If aijkok does not exist, we just copy i, j to device.
39      If aijkok already exists, but the device's nonzero pattern does not match with the host's, we assume the latest data is on host.
40      In both cases, we build a new aijkok structure.
41   */
42   if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { /* aijkok might not exist yet or nonzero pattern has changed */
43     delete aijkok;
44     aijkok   = new Mat_SeqAIJKokkos(A->rmap->n,A->cmap->n,aijseq->nz,aijseq->i,aijseq->j,aijseq->a,A->nonzerostate,PETSC_FALSE/*don't copy mat values to device*/);
45     A->spptr = aijkok;
46   }
47 
48   if (aijkok->device_mat_d.data()) {
49     A->offloadmask = PETSC_OFFLOAD_GPU; // in GPU mode, no going back. MatSetValues checks this
50   }
51   PetscFunctionReturn(0);
52 }
53 
54 /* Sync CSR data to device if not yet */
55 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A)
56 {
57   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
58 
59   PetscFunctionBegin;
60   PetscCheck(A->factortype == MAT_FACTOR_NONE,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync factorized matrix from host to device");
61   PetscCheck(aijkok,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
62   if (aijkok->a_dual.need_sync_device()) {
63     aijkok->a_dual.sync_device();
64     aijkok->transpose_updated = PETSC_FALSE; /* values of the transpose is out-of-date */
65     aijkok->hermitian_updated = PETSC_FALSE;
66   }
67   PetscFunctionReturn(0);
68 }
69 
70 /* Mark the CSR data on device as modified */
71 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosModifyDevice(Mat A)
72 {
73   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
74 
75   PetscFunctionBegin;
76   PetscCheck(A->factortype == MAT_FACTOR_NONE,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not supported for factorized matries");
77   aijkok->a_dual.clear_sync_state();
78   aijkok->a_dual.modify_device();
79   aijkok->transpose_updated = PETSC_FALSE;
80   aijkok->hermitian_updated = PETSC_FALSE;
81   PetscCall(MatSeqAIJInvalidateDiagonal(A));
82   PetscCall(PetscObjectStateIncrease((PetscObject)A));
83   PetscFunctionReturn(0);
84 }
85 
86 static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A)
87 {
88   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
89 
90   PetscFunctionBegin;
91   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
92   /* We do not expect one needs factors on host  */
93   PetscCheck(A->factortype == MAT_FACTOR_NONE,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync factorized matrix from device to host");
94   PetscCheck(aijkok,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing AIJKOK");
95   aijkok->a_dual.sync_host();
96   PetscFunctionReturn(0);
97 }
98 
99 static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A,PetscScalar *array[])
100 {
101   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
102 
103   PetscFunctionBegin;
104   /* aijkok contains valid pointers only if the host's nonzerostate matches with the device's.
105     Calling MatSeqAIJSetPreallocation() or MatSetValues() on host, where aijseq->{i,j,a} might be
106     reallocated, will lead to stale {i,j,a}_dual in aijkok. In both operations, the hosts's nonzerostate
107     must have been updated. The stale aijkok will be rebuilt during MatAssemblyEnd.
108   */
109   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
110     aijkok->a_dual.sync_host();
111     *array = aijkok->a_dual.view_host().data();
112   } else { /* Happens when calling MatSetValues on a newly created matrix */
113     *array = static_cast<Mat_SeqAIJ*>(A->data)->a;
114   }
115   PetscFunctionReturn(0);
116 }
117 
118 static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJKokkos(Mat A,PetscScalar *array[])
119 {
120   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
121 
122   PetscFunctionBegin;
123   if (aijkok && A->nonzerostate == aijkok->nonzerostate) aijkok->a_dual.modify_host();
124   PetscFunctionReturn(0);
125 }
126 
127 static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJKokkos(Mat A,const PetscScalar *array[])
128 {
129   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
130 
131   PetscFunctionBegin;
132   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
133     aijkok->a_dual.sync_host();
134     *array = aijkok->a_dual.view_host().data();
135   } else {
136     *array = static_cast<Mat_SeqAIJ*>(A->data)->a;
137   }
138   PetscFunctionReturn(0);
139 }
140 
141 static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJKokkos(Mat A,const PetscScalar *array[])
142 {
143   PetscFunctionBegin;
144   *array = NULL;
145   PetscFunctionReturn(0);
146 }
147 
148 static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJKokkos(Mat A,PetscScalar *array[])
149 {
150   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
151 
152   PetscFunctionBegin;
153   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
154     *array = aijkok->a_dual.view_host().data();
155   } else { /* Ex. happens with MatZeroEntries on a preallocated but not assembled matrix */
156     *array = static_cast<Mat_SeqAIJ*>(A->data)->a;
157   }
158   PetscFunctionReturn(0);
159 }
160 
161 static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJKokkos(Mat A,PetscScalar *array[])
162 {
163   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
164 
165   PetscFunctionBegin;
166   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
167     aijkok->a_dual.clear_sync_state();
168     aijkok->a_dual.modify_host();
169   }
170   PetscFunctionReturn(0);
171 }
172 
173 static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJKokkos(Mat A,const PetscInt **i,const PetscInt **j,PetscScalar **a,PetscMemType *mtype)
174 {
175   Mat_SeqAIJKokkos  *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
176 
177   PetscFunctionBegin;
178   PetscCheck(aijkok != NULL,PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"aijkok is NULL");
179 
180   if (i) *i = aijkok->i_device_data();
181   if (j) *j = aijkok->j_device_data();
182   if (a) {
183     aijkok->a_dual.sync_device();
184     *a = aijkok->a_device_data();
185   }
186   if (mtype) *mtype = PETSC_MEMTYPE_KOKKOS;
187   PetscFunctionReturn(0);
188 }
189 
190 // MatSeqAIJKokkosSetDeviceMat takes a PetscSplitCSRDataStructure with device data and copies it to the device. Note, "deep_copy" here is really a shallow copy
191 PetscErrorCode MatSeqAIJKokkosSetDeviceMat(Mat A, PetscSplitCSRDataStructure h_mat)
192 {
193   Mat_SeqAIJKokkos                             *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
194   Kokkos::View<SplitCSRMat, Kokkos::HostSpace> h_mat_k(h_mat);
195 
196   PetscFunctionBegin;
197   PetscCheck(aijkok,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
198   aijkok->device_mat_d = create_mirror(DefaultMemorySpace(),h_mat_k);
199   Kokkos::deep_copy (aijkok->device_mat_d, h_mat_k);
200   PetscFunctionReturn(0);
201 }
202 
203 // MatSeqAIJKokkosGetDeviceMat gets the device if it is here, otherwise it creates a place for it and returns NULL
204 PetscErrorCode MatSeqAIJKokkosGetDeviceMat(Mat A, PetscSplitCSRDataStructure *d_mat)
205 {
206   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
207 
208   PetscFunctionBegin;
209   if (aijkok && aijkok->device_mat_d.data()) {
210     *d_mat = aijkok->device_mat_d.data();
211   } else {
212     PetscCall(MatSeqAIJKokkosSyncDevice(A)); // create aijkok (we are making d_mat now so make a place for it)
213     *d_mat  = NULL;
214   }
215   PetscFunctionReturn(0);
216 }
217 
218 /* Generate the transpose on device and cache it internally */
219 static PetscErrorCode MatSeqAIJKokkosGenerateTranspose_Private(Mat A, KokkosCsrMatrix **csrmatT)
220 {
221   Mat_SeqAIJKokkos                 *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
222 
223   PetscFunctionBegin;
224   PetscCheck(aijkok,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
225   if (!aijkok->csrmatT.nnz() || !aijkok->transpose_updated) { /* Generate At for the first time OR just update its values */
226     /* FIXME: KK does not separate symbolic/numeric transpose. We could have a permutation array to help value-only update */
227     PetscCallCXX(aijkok->a_dual.sync_device());
228     PetscCallCXX(aijkok->csrmatT = KokkosKernels::Impl::transpose_matrix(aijkok->csrmat));
229     PetscCallCXX(KokkosKernels::sort_crs_matrix(aijkok->csrmatT));
230     aijkok->transpose_updated = PETSC_TRUE;
231   }
232   *csrmatT = &aijkok->csrmatT;
233   PetscFunctionReturn(0);
234 }
235 
236 /* Generate the Hermitian on device and cache it internally */
237 static PetscErrorCode MatSeqAIJKokkosGenerateHermitian_Private(Mat A, KokkosCsrMatrix **csrmatH)
238 {
239   Mat_SeqAIJKokkos                 *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
240 
241   PetscFunctionBegin;
242   PetscCall(PetscLogGpuTimeBegin());
243   PetscCheck(aijkok,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
244   if (!aijkok->csrmatH.nnz() || !aijkok->hermitian_updated) { /* Generate Ah for the first time OR just update its values */
245     PetscCallCXX(aijkok->a_dual.sync_device());
246     PetscCallCXX(aijkok->csrmatH = KokkosKernels::Impl::transpose_matrix(aijkok->csrmat));
247     PetscCallCXX(KokkosKernels::sort_crs_matrix(aijkok->csrmatH));
248    #if defined(PETSC_USE_COMPLEX)
249     const auto& a = aijkok->csrmatH.values;
250     Kokkos::parallel_for(a.extent(0),KOKKOS_LAMBDA(MatRowMapType i) {a(i) = PetscConj(a(i));});
251    #endif
252     aijkok->hermitian_updated = PETSC_TRUE;
253   }
254   *csrmatH = &aijkok->csrmatH;
255   PetscCall(PetscLogGpuTimeEnd());
256   PetscFunctionReturn(0);
257 }
258 
259 /* y = A x */
260 static PetscErrorCode MatMult_SeqAIJKokkos(Mat A,Vec xx,Vec yy)
261 {
262   Mat_SeqAIJKokkos                 *aijkok;
263   ConstPetscScalarKokkosView       xv;
264   PetscScalarKokkosView            yv;
265 
266   PetscFunctionBegin;
267   PetscCall(PetscLogGpuTimeBegin());
268   PetscCall(MatSeqAIJKokkosSyncDevice(A));
269   PetscCall(VecGetKokkosView(xx,&xv));
270   PetscCall(VecGetKokkosViewWrite(yy,&yv));
271   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
272   KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csrmat,xv,0.0/*beta*/,yv); /* y = alpha A x + beta y */
273   PetscCall(VecRestoreKokkosView(xx,&xv));
274   PetscCall(VecRestoreKokkosViewWrite(yy,&yv));
275   /* 2.0*nnz - numRows seems more accurate here but assumes there are no zero-rows. So a little sloppy here. */
276   PetscCall(PetscLogGpuFlops(2.0*aijkok->csrmat.nnz()));
277   PetscCall(PetscLogGpuTimeEnd());
278   PetscFunctionReturn(0);
279 }
280 
281 /* y = A^T x */
282 static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy)
283 {
284   Mat_SeqAIJKokkos                 *aijkok;
285   const char                       *mode;
286   ConstPetscScalarKokkosView       xv;
287   PetscScalarKokkosView            yv;
288   KokkosCsrMatrix                  *csrmat;
289 
290   PetscFunctionBegin;
291   PetscCall(PetscLogGpuTimeBegin());
292   PetscCall(MatSeqAIJKokkosSyncDevice(A));
293   PetscCall(VecGetKokkosView(xx,&xv));
294   PetscCall(VecGetKokkosViewWrite(yy,&yv));
295   if (A->form_explicit_transpose) {
296     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmat));
297     mode = "N";
298   } else {
299     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
300     csrmat = &aijkok->csrmat;
301     mode = "T";
302   }
303   KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,0.0/*beta*/,yv); /* y = alpha A^T x + beta y */
304   PetscCall(VecRestoreKokkosView(xx,&xv));
305   PetscCall(VecRestoreKokkosViewWrite(yy,&yv));
306   PetscCall(PetscLogGpuFlops(2.0*csrmat->nnz()));
307   PetscCall(PetscLogGpuTimeEnd());
308   PetscFunctionReturn(0);
309 }
310 
311 /* y = A^H x */
312 static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy)
313 {
314   Mat_SeqAIJKokkos                 *aijkok;
315   const char                       *mode;
316   ConstPetscScalarKokkosView       xv;
317   PetscScalarKokkosView            yv;
318   KokkosCsrMatrix                  *csrmat;
319 
320   PetscFunctionBegin;
321   PetscCall(PetscLogGpuTimeBegin());
322   PetscCall(MatSeqAIJKokkosSyncDevice(A));
323   PetscCall(VecGetKokkosView(xx,&xv));
324   PetscCall(VecGetKokkosViewWrite(yy,&yv));
325   if (A->form_explicit_transpose) {
326     PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A,&csrmat));
327     mode = "N";
328   } else {
329     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
330     csrmat = &aijkok->csrmat;
331     mode = "C";
332   }
333   KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,0.0/*beta*/,yv); /* y = alpha A^H x + beta y */
334   PetscCall(VecRestoreKokkosView(xx,&xv));
335   PetscCall(VecRestoreKokkosViewWrite(yy,&yv));
336   PetscCall(PetscLogGpuFlops(2.0*csrmat->nnz()));
337   PetscCall(PetscLogGpuTimeEnd());
338   PetscFunctionReturn(0);
339 }
340 
341 /* z = A x + y */
342 static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy, Vec zz)
343 {
344   Mat_SeqAIJKokkos                 *aijkok;
345   ConstPetscScalarKokkosView       xv,yv;
346   PetscScalarKokkosView            zv;
347 
348   PetscFunctionBegin;
349   PetscCall(PetscLogGpuTimeBegin());
350   PetscCall(MatSeqAIJKokkosSyncDevice(A));
351   PetscCall(VecGetKokkosView(xx,&xv));
352   PetscCall(VecGetKokkosView(yy,&yv));
353   PetscCall(VecGetKokkosViewWrite(zz,&zv));
354   if (zz != yy) Kokkos::deep_copy(zv,yv);
355   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
356   KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csrmat,xv,1.0/*beta*/,zv); /* z = alpha A x + beta z */
357   PetscCall(VecRestoreKokkosView(xx,&xv));
358   PetscCall(VecRestoreKokkosView(yy,&yv));
359   PetscCall(VecRestoreKokkosViewWrite(zz,&zv));
360   PetscCall(PetscLogGpuFlops(2.0*aijkok->csrmat.nnz()));
361   PetscCall(PetscLogGpuTimeEnd());
362   PetscFunctionReturn(0);
363 }
364 
365 /* z = A^T x + y */
366 static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz)
367 {
368   Mat_SeqAIJKokkos                 *aijkok;
369   const char                       *mode;
370   ConstPetscScalarKokkosView       xv,yv;
371   PetscScalarKokkosView            zv;
372   KokkosCsrMatrix                  *csrmat;
373 
374   PetscFunctionBegin;
375   PetscCall(PetscLogGpuTimeBegin());
376   PetscCall(MatSeqAIJKokkosSyncDevice(A));
377   PetscCall(VecGetKokkosView(xx,&xv));
378   PetscCall(VecGetKokkosView(yy,&yv));
379   PetscCall(VecGetKokkosViewWrite(zz,&zv));
380   if (zz != yy) Kokkos::deep_copy(zv,yv);
381   if (A->form_explicit_transpose) {
382     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmat));
383     mode = "N";
384   } else {
385     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
386     csrmat = &aijkok->csrmat;
387     mode = "T";
388   }
389   KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,1.0/*beta*/,zv); /* z = alpha A^T x + beta z */
390   PetscCall(VecRestoreKokkosView(xx,&xv));
391   PetscCall(VecRestoreKokkosView(yy,&yv));
392   PetscCall(VecRestoreKokkosViewWrite(zz,&zv));
393   PetscCall(PetscLogGpuFlops(2.0*csrmat->nnz()));
394   PetscCall(PetscLogGpuTimeEnd());
395   PetscFunctionReturn(0);
396 }
397 
398 /* z = A^H x + y */
399 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz)
400 {
401   Mat_SeqAIJKokkos                 *aijkok;
402   const char                       *mode;
403   ConstPetscScalarKokkosView       xv,yv;
404   PetscScalarKokkosView            zv;
405   KokkosCsrMatrix                  *csrmat;
406 
407   PetscFunctionBegin;
408   PetscCall(PetscLogGpuTimeBegin());
409   PetscCall(MatSeqAIJKokkosSyncDevice(A));
410   PetscCall(VecGetKokkosView(xx,&xv));
411   PetscCall(VecGetKokkosView(yy,&yv));
412   PetscCall(VecGetKokkosViewWrite(zz,&zv));
413   if (zz != yy) Kokkos::deep_copy(zv,yv);
414   if (A->form_explicit_transpose) {
415     PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A,&csrmat));
416     mode = "N";
417   } else {
418     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
419     csrmat = &aijkok->csrmat;
420     mode = "C";
421   }
422   KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,1.0/*beta*/,zv); /* z = alpha A^H x + beta z */
423   PetscCall(VecRestoreKokkosView(xx,&xv));
424   PetscCall(VecRestoreKokkosView(yy,&yv));
425   PetscCall(VecRestoreKokkosViewWrite(zz,&zv));
426   PetscCall(PetscLogGpuFlops(2.0*csrmat->nnz()));
427   PetscCall(PetscLogGpuTimeEnd());
428   PetscFunctionReturn(0);
429 }
430 
431 PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A,MatOption op,PetscBool flg)
432 {
433   Mat_SeqAIJKokkos          *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
434 
435   PetscFunctionBegin;
436   switch (op) {
437     case MAT_FORM_EXPLICIT_TRANSPOSE:
438       /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
439       if (A->form_explicit_transpose && !flg && aijkok) PetscCall(aijkok->DestroyMatTranspose());
440       A->form_explicit_transpose = flg;
441       break;
442     default:
443       PetscCall(MatSetOption_SeqAIJ(A,op,flg));
444       break;
445   }
446   PetscFunctionReturn(0);
447 }
448 
449 /* Depending on reuse, either build a new mat, or use the existing mat */
450 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
451 {
452   Mat_SeqAIJ       *aseq;
453 
454   PetscFunctionBegin;
455   PetscCall(PetscKokkosInitializeCheck());
456   if (reuse == MAT_INITIAL_MATRIX) { /* Build a brand new mat */
457     PetscCall(MatDuplicate(A,MAT_COPY_VALUES,newmat)); /* the returned newmat is a SeqAIJKokkos */
458   } else if (reuse == MAT_REUSE_MATRIX) { /* Reuse the mat created before */
459     PetscCall(MatCopy(A,*newmat,SAME_NONZERO_PATTERN)); /* newmat is already a SeqAIJKokkos */
460   } else if (reuse == MAT_INPLACE_MATRIX) { /* newmat is A */
461     PetscCheck(A == *newmat,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"A != *newmat with MAT_INPLACE_MATRIX");
462     PetscCall(PetscFree(A->defaultvectype));
463     PetscCall(PetscStrallocpy(VECKOKKOS,&A->defaultvectype)); /* Allocate and copy the string */
464     PetscCall(PetscObjectChangeTypeName((PetscObject)A,MATSEQAIJKOKKOS));
465     PetscCall(MatSetOps_SeqAIJKokkos(A));
466     aseq = static_cast<Mat_SeqAIJ*>(A->data);
467     if (A->assembled) { /* Copy i, j (but not values) to device for an assembled matrix if not yet */
468       PetscCheck(!A->spptr,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Expect NULL (Mat_SeqAIJKokkos*)A->spptr");
469       A->spptr = new Mat_SeqAIJKokkos(A->rmap->n,A->cmap->n,aseq->nz,aseq->i,aseq->j,aseq->a,A->nonzerostate,PETSC_FALSE);
470     }
471   }
472   PetscFunctionReturn(0);
473 }
474 
475 /* MatDuplicate always creates a new matrix. MatDuplicate can be called either on an assembled matrix or
476    an unassembled matrix, even though MAT_COPY_VALUES is not allowed for unassembled matrix.
477  */
478 static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A,MatDuplicateOption dupOption,Mat *B)
479 {
480   Mat_SeqAIJ            *bseq;
481   Mat_SeqAIJKokkos      *akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr),*bkok;
482   Mat                   mat;
483 
484   PetscFunctionBegin;
485   /* Do not copy values on host as A's latest values might be on device. We don't want to do sync blindly */
486   PetscCall(MatDuplicate_SeqAIJ(A,MAT_DO_NOT_COPY_VALUES,B));
487   mat  = *B;
488   if (A->assembled) {
489     bseq = static_cast<Mat_SeqAIJ*>(mat->data);
490     bkok = new Mat_SeqAIJKokkos(mat->rmap->n,mat->cmap->n,bseq->nz,bseq->i,bseq->j,bseq->a,mat->nonzerostate,PETSC_FALSE);
491     bkok->a_dual.clear_sync_state(); /* Clear B's sync state as it will be decided below */
492     /* Now copy values to B if needed */
493     if (dupOption == MAT_COPY_VALUES) {
494       if (akok->a_dual.need_sync_device()) {
495         Kokkos::deep_copy(bkok->a_dual.view_host(),akok->a_dual.view_host());
496         bkok->a_dual.modify_host();
497       } else { /* If device has the latest data, we only copy data on device */
498         Kokkos::deep_copy(bkok->a_dual.view_device(),akok->a_dual.view_device());
499         bkok->a_dual.modify_device();
500       }
501     } else { /* MAT_DO_NOT_COPY_VALUES or MAT_SHARE_NONZERO_PATTERN. B's values should be zeroed */
502       /* B's values on host should be already zeroed by MatDuplicate_SeqAIJ() */
503       bkok->a_dual.modify_host();
504     }
505     mat->spptr = bkok;
506   }
507 
508   PetscCall(PetscFree(mat->defaultvectype));
509   PetscCall(PetscStrallocpy(VECKOKKOS,&mat->defaultvectype)); /* Allocate and copy the string */
510   PetscCall(PetscObjectChangeTypeName((PetscObject)mat,MATSEQAIJKOKKOS));
511   PetscCall(MatSetOps_SeqAIJKokkos(mat));
512   PetscFunctionReturn(0);
513 }
514 
515 static PetscErrorCode MatTranspose_SeqAIJKokkos(Mat A,MatReuse reuse,Mat *B)
516 {
517   Mat               At;
518   KokkosCsrMatrix   *internT;
519   Mat_SeqAIJKokkos  *atkok,*bkok;
520 
521   PetscFunctionBegin;
522   PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A,&internT)); /* Generate a transpose internally */
523   if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_INPLACE_MATRIX) {
524     /* Deep copy internT, as we want to isolate the internal transpose */
525     PetscCallCXX(atkok = new Mat_SeqAIJKokkos(KokkosCsrMatrix("csrmat",*internT)));
526     PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PetscObjectComm((PetscObject)A),atkok,&At));
527     if (reuse == MAT_INITIAL_MATRIX) *B = At;
528     else PetscCall(MatHeaderReplace(A,&At)); /* Replace A with At inplace */
529   } else { /* MAT_REUSE_MATRIX, just need to copy values to B on device */
530     if ((*B)->assembled) {
531       bkok = static_cast<Mat_SeqAIJKokkos*>((*B)->spptr);
532       PetscCallCXX(Kokkos::deep_copy(bkok->a_dual.view_device(),internT->values));
533       PetscCall(MatSeqAIJKokkosModifyDevice(*B));
534     } else if ((*B)->preallocated) { /* It is ok for B to be only preallocated, as needed in MatTranspose_MPIAIJ */
535       Mat_SeqAIJ              *bseq = static_cast<Mat_SeqAIJ*>((*B)->data);
536       MatScalarKokkosViewHost a_h(bseq->a,internT->nnz()); /* bseq->nz = 0 if unassembled */
537       MatColIdxKokkosViewHost j_h(bseq->j,internT->nnz());
538       PetscCallCXX(Kokkos::deep_copy(a_h,internT->values));
539       PetscCallCXX(Kokkos::deep_copy(j_h,internT->graph.entries));
540     } else SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"B must be assembled or preallocated");
541   }
542   PetscFunctionReturn(0);
543 }
544 
545 static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A)
546 {
547   Mat_SeqAIJKokkos           *aijkok;
548 
549   PetscFunctionBegin;
550   if (A->factortype == MAT_FACTOR_NONE) {
551     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
552     delete aijkok;
553   } else {
554     delete static_cast<Mat_SeqAIJKokkosTriFactors*>(A->spptr);
555   }
556   A->spptr = NULL;
557   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL));
558   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL));
559   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL));
560   PetscCall(MatDestroy_SeqAIJ(A));
561   PetscFunctionReturn(0);
562 }
563 
564 /*MC
565    MATSEQAIJKOKKOS - MATAIJKOKKOS = "(seq)aijkokkos" - A matrix type to be used for sparse matrices with Kokkos
566 
567    A matrix type type using Kokkos-Kernels CrsMatrix type for portability across different device types
568 
569    Options Database Keys:
570 .  -mat_type aijkokkos - sets the matrix type to "aijkokkos" during a call to MatSetFromOptions()
571 
572   Level: beginner
573 
574 .seealso: `MatCreateSeqAIJKokkos()`, `MATMPIAIJKOKKOS`
575 M*/
576 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A)
577 {
578   PetscFunctionBegin;
579   PetscCall(PetscKokkosInitializeCheck());
580   PetscCall(MatCreate_SeqAIJ(A));
581   PetscCall(MatConvert_SeqAIJ_SeqAIJKokkos(A,MATSEQAIJKOKKOS,MAT_INPLACE_MATRIX,&A));
582   PetscFunctionReturn(0);
583 }
584 
585 /* Merge A, B into a matrix C. A is put before B. C's size would be A->rmap->n by (A->cmap->n + B->cmap->n) */
586 PetscErrorCode MatSeqAIJKokkosMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C)
587 {
588   Mat_SeqAIJ                   *a,*b;
589   Mat_SeqAIJKokkos             *akok,*bkok,*ckok;
590   MatScalarKokkosView          aa,ba,ca;
591   MatRowMapKokkosView          ai,bi,ci;
592   MatColIdxKokkosView          aj,bj,cj;
593   PetscInt                     m,n,nnz,aN;
594 
595   PetscFunctionBegin;
596   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
597   PetscValidHeaderSpecific(B,MAT_CLASSID,2);
598   PetscValidPointer(C,4);
599   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
600   PetscCheckTypeName(B,MATSEQAIJKOKKOS);
601   PetscCheck(A->rmap->n == B->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT,A->rmap->n,B->rmap->n);
602   PetscCheck(reuse != MAT_INPLACE_MATRIX,PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported");
603 
604   PetscCall(MatSeqAIJKokkosSyncDevice(A));
605   PetscCall(MatSeqAIJKokkosSyncDevice(B));
606   a    = static_cast<Mat_SeqAIJ*>(A->data);
607   b    = static_cast<Mat_SeqAIJ*>(B->data);
608   akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
609   bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
610   aa   = akok->a_dual.view_device();
611   ai   = akok->i_dual.view_device();
612   ba   = bkok->a_dual.view_device();
613   bi   = bkok->i_dual.view_device();
614   m    = A->rmap->n; /* M, N and nnz of C */
615   n    = A->cmap->n + B->cmap->n;
616   nnz  = a->nz + b->nz;
617   aN   = A->cmap->n; /* N of A */
618   if (reuse == MAT_INITIAL_MATRIX) {
619     aj = akok->j_dual.view_device();
620     bj = bkok->j_dual.view_device();
621     auto ca_dual = MatScalarKokkosDualView("a",aa.extent(0)+ba.extent(0));
622     auto ci_dual = MatRowMapKokkosDualView("i",ai.extent(0));
623     auto cj_dual = MatColIdxKokkosDualView("j",aj.extent(0)+bj.extent(0));
624     ca = ca_dual.view_device();
625     ci = ci_dual.view_device();
626     cj = cj_dual.view_device();
627 
628     /* Concatenate A and B in parallel using Kokkos hierarchical parallelism */
629     Kokkos::parallel_for(Kokkos::TeamPolicy<>(m, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) {
630       PetscInt i = t.league_rank(); /* row i */
631       PetscInt coffset = ai(i) + bi(i), alen = ai(i+1)-ai(i), blen = bi(i+1)-bi(i);
632 
633       Kokkos::single(Kokkos::PerTeam(t), [=]() { /* this side effect only happens once per whole team */
634         ci(i) = coffset;
635         if (i == m-1) ci(m) = ai(m) + bi(m);
636       });
637 
638       Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen+blen), [&](PetscInt k) {
639         if (k < alen) {
640           ca(coffset+k) = aa(ai(i)+k);
641           cj(coffset+k) = aj(ai(i)+k);
642         } else {
643           ca(coffset+k) = ba(bi(i)+k-alen);
644           cj(coffset+k) = bj(bi(i)+k-alen) + aN; /* Entries in B get new column indices in C */
645         }
646       });
647     });
648     ca_dual.modify_device();
649     ci_dual.modify_device();
650     cj_dual.modify_device();
651     PetscCallCXX(ckok = new Mat_SeqAIJKokkos(m,n,nnz,ci_dual,cj_dual,ca_dual));
652     PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,ckok,C));
653   } else if (reuse == MAT_REUSE_MATRIX) {
654     PetscValidHeaderSpecific(*C,MAT_CLASSID,4);
655     PetscCheckTypeName(*C,MATSEQAIJKOKKOS);
656     ckok = static_cast<Mat_SeqAIJKokkos*>((*C)->spptr);
657     ca   = ckok->a_dual.view_device();
658     ci   = ckok->i_dual.view_device();
659 
660     Kokkos::parallel_for(Kokkos::TeamPolicy<>(m, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) {
661       PetscInt i = t.league_rank(); /* row i */
662       PetscInt alen = ai(i+1)-ai(i), blen = bi(i+1)-bi(i);
663       Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen+blen), [&](PetscInt k) {
664         if (k < alen) ca(ci(i)+k) = aa(ai(i)+k);
665         else          ca(ci(i)+k) = ba(bi(i)+k-alen);
666       });
667     });
668     PetscCall(MatSeqAIJKokkosModifyDevice(*C));
669   }
670   PetscFunctionReturn(0);
671 }
672 
673 static PetscErrorCode MatProductDataDestroy_SeqAIJKokkos(void* pdata)
674 {
675   PetscFunctionBegin;
676   delete static_cast<MatProductData_SeqAIJKokkos*>(pdata);
677   PetscFunctionReturn(0);
678 }
679 
680 static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C)
681 {
682   Mat_Product                    *product = C->product;
683   Mat                            A,B;
684   bool                           transA,transB; /* use bool, since KK needs this type */
685   Mat_SeqAIJKokkos               *akok,*bkok,*ckok;
686   Mat_SeqAIJ                     *c;
687   MatProductData_SeqAIJKokkos    *pdata;
688   KokkosCsrMatrix                *csrmatA,*csrmatB;
689 
690   PetscFunctionBegin;
691   MatCheckProduct(C,1);
692   PetscCheck(C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty");
693   pdata = static_cast<MatProductData_SeqAIJKokkos*>(C->product->data);
694 
695   if (pdata->reusesym) { /* We reached here through e.g., MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C), where symbolic/numeric are combined */
696     pdata->reusesym = PETSC_FALSE; /* So that next time when user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C), we still do numeric  */
697     PetscFunctionReturn(0);
698   }
699 
700   switch (product->type) {
701     case MATPRODUCT_AB:  transA = false; transB = false; break;
702     case MATPRODUCT_AtB: transA = true;  transB = false; break;
703     case MATPRODUCT_ABt: transA = false; transB = true;  break;
704     default:
705       SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
706   }
707 
708   A     = product->A;
709   B     = product->B;
710   PetscCall(MatSeqAIJKokkosSyncDevice(A));
711   PetscCall(MatSeqAIJKokkosSyncDevice(B));
712   akok  = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
713   bkok  = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
714   ckok  = static_cast<Mat_SeqAIJKokkos*>(C->spptr);
715 
716   PetscCheck(ckok,PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Device data structure spptr is empty");
717 
718   csrmatA = &akok->csrmat;
719   csrmatB = &bkok->csrmat;
720 
721   /* TODO: Once KK spgemm implements transpose, we can get rid of the explicit transpose here */
722   if (transA) {
723     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmatA));
724     transA = false;
725   }
726 
727   if (transB) {
728     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B,&csrmatB));
729     transB = false;
730   }
731   PetscCall(PetscLogGpuTimeBegin());
732   PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh,*csrmatA,transA,*csrmatB,transB,ckok->csrmat));
733   PetscCallCXX(KokkosKernels::sort_crs_matrix(ckok->csrmat)); /* without the sort, mat_tests-ex62_14_seqaijkokkos failed */
734   PetscCall(PetscLogGpuTimeEnd());
735   PetscCall(MatSeqAIJKokkosModifyDevice(C));
736   /* shorter version of MatAssemblyEnd_SeqAIJ */
737   c = (Mat_SeqAIJ*)C->data;
738   PetscCall(PetscInfo(C,"Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n",C->rmap->n,C->cmap->n,c->nz));
739   PetscCall(PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n"));
740   PetscCall(PetscInfo(C,"Maximum nonzeros in any row is %" PetscInt_FMT "\n",c->rmax));
741   c->reallocs         = 0;
742   C->info.mallocs     = 0;
743   C->info.nz_unneeded = 0;
744   C->assembled        = C->was_assembled = PETSC_TRUE;
745   C->num_ass++;
746   PetscFunctionReturn(0);
747 }
748 
749 static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C)
750 {
751   Mat_Product                    *product = C->product;
752   MatProductType                 ptype;
753   Mat                            A,B;
754   bool                           transA,transB;
755   Mat_SeqAIJKokkos               *akok,*bkok,*ckok;
756   MatProductData_SeqAIJKokkos    *pdata;
757   MPI_Comm                       comm;
758   KokkosCsrMatrix                *csrmatA,*csrmatB,csrmatC;
759 
760   PetscFunctionBegin;
761   MatCheckProduct(C,1);
762   PetscCall(PetscObjectGetComm((PetscObject)C,&comm));
763   PetscCheck(!product->data,comm,PETSC_ERR_PLIB,"Product data not empty");
764   A       = product->A;
765   B       = product->B;
766   PetscCall(MatSeqAIJKokkosSyncDevice(A));
767   PetscCall(MatSeqAIJKokkosSyncDevice(B));
768   akok    = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
769   bkok    = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
770   csrmatA = &akok->csrmat;
771   csrmatB = &bkok->csrmat;
772 
773   ptype   = product->type;
774   switch (ptype) {
775     case MATPRODUCT_AB:  transA = false; transB = false; break;
776     case MATPRODUCT_AtB: transA = true;  transB = false; break;
777     case MATPRODUCT_ABt: transA = false; transB = true;  break;
778     default:
779       SETERRQ(comm,PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
780   }
781 
782   product->data = pdata = new MatProductData_SeqAIJKokkos();
783   pdata->kh.set_team_work_size(16);
784   pdata->kh.set_dynamic_scheduling(true);
785   pdata->reusesym = product->api_user;
786 
787   /* TODO: add command line options to select spgemm algorithms */
788   auto spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_KK;
789   /* CUDA-10.2's spgemm has bugs. As as of 2022-01-19, KK does not support CUDA-11.x's newer spgemm API.
790      We can default to SPGEMMAlgorithm::SPGEMM_CUSPARSE with CUDA-11+ when KK adds the support.
791    */
792   pdata->kh.create_spgemm_handle(spgemm_alg);
793 
794   PetscCall(PetscLogGpuTimeBegin());
795   /* TODO: Get rid of the explicit transpose once KK-spgemm implements the transpose option */
796   if (transA) {
797     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmatA));
798     transA = false;
799   }
800 
801   if (transB) {
802     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B,&csrmatB));
803     transB = false;
804   }
805 
806   PetscCallCXX(KokkosSparse::spgemm_symbolic(pdata->kh,*csrmatA,transA,*csrmatB,transB,csrmatC));
807   /* spgemm_symbolic() only populates C's rowmap, but not C's column indices.
808     So we have to do a fake spgemm_numeric() here to get csrmatC.j_d setup, before
809     calling new Mat_SeqAIJKokkos().
810     TODO: Remove the fake spgemm_numeric() after KK fixed this problem.
811   */
812   PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh,*csrmatA,transA,*csrmatB,transB,csrmatC));
813   PetscCallCXX(KokkosKernels::sort_crs_matrix(csrmatC));
814   PetscCall(PetscLogGpuTimeEnd());
815 
816   PetscCallCXX(ckok = new Mat_SeqAIJKokkos(csrmatC));
817   PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(C,ckok));
818   C->product->destroy = MatProductDataDestroy_SeqAIJKokkos;
819   PetscFunctionReturn(0);
820 }
821 
822 /* handles sparse matrix matrix ops */
823 static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat)
824 {
825   Mat_Product    *product = mat->product;
826   PetscBool      Biskok = PETSC_FALSE,Ciskok = PETSC_TRUE;
827 
828   PetscFunctionBegin;
829   MatCheckProduct(mat,1);
830   PetscCall(PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJKOKKOS,&Biskok));
831   if (product->type == MATPRODUCT_ABC) {
832     PetscCall(PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJKOKKOS,&Ciskok));
833   }
834   if (Biskok && Ciskok) {
835     switch (product->type) {
836     case MATPRODUCT_AB:
837     case MATPRODUCT_AtB:
838     case MATPRODUCT_ABt:
839       mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos;
840       break;
841     case MATPRODUCT_PtAP:
842     case MATPRODUCT_RARt:
843     case MATPRODUCT_ABC:
844       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
845       break;
846     default:
847       break;
848     }
849   } else { /* fallback for AIJ */
850     PetscCall(MatProductSetFromOptions_SeqAIJ(mat));
851   }
852   PetscFunctionReturn(0);
853 }
854 
855 static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a)
856 {
857   Mat_SeqAIJKokkos *aijkok;
858 
859   PetscFunctionBegin;
860   PetscCall(PetscLogGpuTimeBegin());
861   PetscCall(MatSeqAIJKokkosSyncDevice(A));
862   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
863   KokkosBlas::scal(aijkok->a_dual.view_device(),a,aijkok->a_dual.view_device());
864   PetscCall(MatSeqAIJKokkosModifyDevice(A));
865   PetscCall(PetscLogGpuFlops(aijkok->a_dual.extent(0)));
866   PetscCall(PetscLogGpuTimeEnd());
867   PetscFunctionReturn(0);
868 }
869 
870 static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A)
871 {
872   Mat_SeqAIJKokkos *aijkok;
873 
874   PetscFunctionBegin;
875   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
876   if (aijkok) { /* Only zero the device if data is already there */
877     KokkosBlas::fill(aijkok->a_dual.view_device(),0.0);
878     PetscCall(MatSeqAIJKokkosModifyDevice(A));
879   } else { /* Might be preallocated but not assembled */
880     PetscCall(MatZeroEntries_SeqAIJ(A));
881   }
882   PetscFunctionReturn(0);
883 }
884 
885 /* Get a Kokkos View from a mat of type MatSeqAIJKokkos */
886 PetscErrorCode MatSeqAIJGetKokkosView(Mat A,ConstMatScalarKokkosView* kv)
887 {
888   Mat_SeqAIJKokkos   *aijkok;
889 
890   PetscFunctionBegin;
891   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
892   PetscValidPointer(kv,2);
893   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
894   PetscCall(MatSeqAIJKokkosSyncDevice(A));
895   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
896   *kv    = aijkok->a_dual.view_device();
897   PetscFunctionReturn(0);
898 }
899 
900 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A,ConstMatScalarKokkosView* kv)
901 {
902   PetscFunctionBegin;
903   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
904   PetscValidPointer(kv,2);
905   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
906   PetscFunctionReturn(0);
907 }
908 
909 PetscErrorCode MatSeqAIJGetKokkosView(Mat A,MatScalarKokkosView* kv)
910 {
911   Mat_SeqAIJKokkos   *aijkok;
912 
913   PetscFunctionBegin;
914   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
915   PetscValidPointer(kv,2);
916   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
917   PetscCall(MatSeqAIJKokkosSyncDevice(A));
918   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
919   *kv    = aijkok->a_dual.view_device();
920   PetscFunctionReturn(0);
921 }
922 
923 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A,MatScalarKokkosView* kv)
924 {
925   PetscFunctionBegin;
926   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
927   PetscValidPointer(kv,2);
928   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
929   PetscCall(MatSeqAIJKokkosModifyDevice(A));
930   PetscFunctionReturn(0);
931 }
932 
933 PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A,MatScalarKokkosView* kv)
934 {
935   Mat_SeqAIJKokkos   *aijkok;
936 
937   PetscFunctionBegin;
938   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
939   PetscValidPointer(kv,2);
940   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
941   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
942   *kv    = aijkok->a_dual.view_device();
943   PetscFunctionReturn(0);
944 }
945 
946 PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A,MatScalarKokkosView* kv)
947 {
948   PetscFunctionBegin;
949   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
950   PetscValidPointer(kv,2);
951   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
952   PetscCall(MatSeqAIJKokkosModifyDevice(A));
953   PetscFunctionReturn(0);
954 }
955 
956 /* Computes Y += alpha X */
957 static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y,PetscScalar alpha,Mat X,MatStructure pattern)
958 {
959   Mat_SeqAIJ                 *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data;
960   Mat_SeqAIJKokkos           *xkok,*ykok,*zkok;
961   ConstMatScalarKokkosView   Xa;
962   MatScalarKokkosView        Ya;
963 
964   PetscFunctionBegin;
965   PetscCheckTypeName(Y,MATSEQAIJKOKKOS);
966   PetscCheckTypeName(X,MATSEQAIJKOKKOS);
967   PetscCall(MatSeqAIJKokkosSyncDevice(Y));
968   PetscCall(MatSeqAIJKokkosSyncDevice(X));
969   PetscCall(PetscLogGpuTimeBegin());
970 
971   if (pattern != SAME_NONZERO_PATTERN && x->nz == y->nz) {
972     /* We could compare on device, but have to get the comparison result on host. So compare on host instead. */
973     PetscBool e;
974     PetscCall(PetscArraycmp(x->i,y->i,Y->rmap->n+1,&e));
975     if (e) {
976       PetscCall(PetscArraycmp(x->j,y->j,y->nz,&e));
977       if (e) pattern = SAME_NONZERO_PATTERN;
978     }
979   }
980 
981   /* cusparseDcsrgeam2() computes C = alpha A + beta B. If one knew sparsity pattern of C, one can skip
982     cusparseScsrgeam2_bufferSizeExt() / cusparseXcsrgeam2Nnz(), and directly call cusparseScsrgeam2().
983     If X is SUBSET_NONZERO_PATTERN of Y, we could take advantage of this cusparse feature. However,
984     KokkosSparse::spadd(alpha,A,beta,B,C) has symbolic and numeric phases, MatAXPY does not.
985   */
986   ykok = static_cast<Mat_SeqAIJKokkos*>(Y->spptr);
987   xkok = static_cast<Mat_SeqAIJKokkos*>(X->spptr);
988   Xa   = xkok->a_dual.view_device();
989   Ya   = ykok->a_dual.view_device();
990 
991   if (pattern == SAME_NONZERO_PATTERN) {
992     KokkosBlas::axpy(alpha,Xa,Ya);
993     PetscCall(MatSeqAIJKokkosModifyDevice(Y));
994   } else if (pattern == SUBSET_NONZERO_PATTERN) {
995     MatRowMapKokkosView  Xi = xkok->i_dual.view_device(),Yi = ykok->i_dual.view_device();
996     MatColIdxKokkosView  Xj = xkok->j_dual.view_device(),Yj = ykok->j_dual.view_device();
997 
998     Kokkos::parallel_for(Kokkos::TeamPolicy<>(Y->rmap->n, 1),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) {
999       PetscInt i = t.league_rank(); /* row i */
1000       Kokkos::single(Kokkos::PerTeam(t), [=] () { /* Only one thread works in a team */
1001         PetscInt p,q = Yi(i);
1002         for (p=Xi(i); p<Xi(i+1); p++) { /* For each nonzero on row i of X */
1003           while (Xj(p) != Yj(q) && q < Yi(i+1)) q++; /* find the matching nonzero on row i of Y */
1004           if (Xj(p) == Yj(q)) { /* Found it */
1005             Ya(q) += alpha * Xa(p);
1006             q++;
1007           } else {
1008             /* If not found, it indicates the input is wrong (X is not a SUBSET_NONZERO_PATTERN of Y).
1009                Just insert a NaN at the beginning of row i if it is not empty, to make the result wrong.
1010             */
1011             if (Yi(i) != Yi(i+1)) Ya(Yi(i)) = Kokkos::Experimental::nan("1"); /* auto promote the double NaN if needed */
1012           }
1013         }
1014       });
1015     });
1016     PetscCall(MatSeqAIJKokkosModifyDevice(Y));
1017   } else { /* different nonzero patterns */
1018     Mat             Z;
1019     KokkosCsrMatrix zcsr;
1020     KernelHandle    kh;
1021     kh.create_spadd_handle(false);
1022     KokkosSparse::spadd_symbolic(&kh,xkok->csrmat,ykok->csrmat,zcsr);
1023     KokkosSparse::spadd_numeric(&kh,alpha,xkok->csrmat,(PetscScalar)1.0,ykok->csrmat,zcsr);
1024     zkok = new Mat_SeqAIJKokkos(zcsr);
1025     PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,zkok,&Z));
1026     PetscCall(MatHeaderReplace(Y,&Z));
1027     kh.destroy_spadd_handle();
1028   }
1029   PetscCall(PetscLogGpuTimeEnd());
1030   PetscCall(PetscLogGpuFlops(xkok->a_dual.extent(0)*2)); /* Because we scaled X and then added it to Y */
1031   PetscFunctionReturn(0);
1032 }
1033 
1034 static PetscErrorCode MatSetPreallocationCOO_SeqAIJKokkos(Mat mat, PetscCount coo_n, const PetscInt coo_i[], const PetscInt coo_j[])
1035 {
1036   Mat_SeqAIJKokkos *akok;
1037   Mat_SeqAIJ       *aseq;
1038 
1039   PetscFunctionBegin;
1040   PetscCall(MatSetPreallocationCOO_SeqAIJ(mat,coo_n,coo_i,coo_j));
1041   aseq = static_cast<Mat_SeqAIJ*>(mat->data);
1042   akok = static_cast<Mat_SeqAIJKokkos*>(mat->spptr);
1043   delete akok;
1044   mat->spptr = akok = new Mat_SeqAIJKokkos(mat->rmap->n,mat->cmap->n,aseq->nz,aseq->i,aseq->j,aseq->a,mat->nonzerostate+1,PETSC_FALSE);
1045   PetscCall(MatZeroEntries_SeqAIJKokkos(mat));
1046   akok->SetUpCOO(aseq);
1047   PetscFunctionReturn(0);
1048 }
1049 
1050 static PetscErrorCode MatSetValuesCOO_SeqAIJKokkos(Mat A,const PetscScalar v[],InsertMode imode)
1051 {
1052   Mat_SeqAIJ                  *aseq = static_cast<Mat_SeqAIJ*>(A->data);
1053   Mat_SeqAIJKokkos            *akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
1054   PetscCount                  Annz = aseq->nz;
1055   const PetscCountKokkosView& jmap = akok->jmap_d;
1056   const PetscCountKokkosView& perm = akok->perm_d;
1057   MatScalarKokkosView         Aa;
1058   ConstMatScalarKokkosView    kv;
1059   PetscMemType                memtype;
1060 
1061   PetscFunctionBegin;
1062   PetscCall(PetscGetMemType(v,&memtype));
1063   if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */
1064     kv = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(),ConstMatScalarKokkosViewHost(v,aseq->coo_n));
1065   } else {
1066     kv = ConstMatScalarKokkosView(v,aseq->coo_n); /* Directly use v[]'s memory */
1067   }
1068 
1069   if (imode == INSERT_VALUES) {
1070     PetscCall(MatSeqAIJGetKokkosViewWrite(A,&Aa)); /* write matrix values */
1071     Kokkos::deep_copy(Aa,0.0); /* Zero matrix values since INSERT_VALUES still requires summing replicated values in v[] */
1072   } else PetscCall(MatSeqAIJGetKokkosView(A,&Aa)); /* read & write matrix values */
1073 
1074   Kokkos::parallel_for(Annz,KOKKOS_LAMBDA(const PetscCount i) {for (PetscCount k=jmap(i); k<jmap(i+1); k++) Aa(i) += kv(perm(k));});
1075 
1076   if (imode == INSERT_VALUES) PetscCall(MatSeqAIJRestoreKokkosViewWrite(A,&Aa));
1077   else PetscCall(MatSeqAIJRestoreKokkosView(A,&Aa));
1078   PetscFunctionReturn(0);
1079 }
1080 
1081 PETSC_INTERN PetscErrorCode MatSeqAIJMoveDiagonalValuesFront_SeqAIJKokkos(Mat A,const PetscInt *diag)
1082 {
1083   Mat_SeqAIJKokkos            *akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
1084   MatScalarKokkosView         Aa;
1085   const MatRowMapKokkosView&  Ai = akok->i_dual.view_device();
1086   PetscInt                    m = A->rmap->n;
1087   ConstMatRowMapKokkosView    Adiag(diag,m); /* diag is a device pointer */
1088 
1089   PetscFunctionBegin;
1090   PetscCall(MatSeqAIJGetKokkosViewWrite(A,&Aa));
1091   Kokkos::parallel_for(m,KOKKOS_LAMBDA(const PetscInt i) {
1092     PetscScalar tmp;
1093     if (Adiag(i) >= Ai(i) && Adiag(i) < Ai(i+1)) { /* The diagonal element exists */
1094       tmp          = Aa(Ai(i));
1095       Aa(Ai(i))    = Aa(Adiag(i));
1096       Aa(Adiag(i)) = tmp;
1097     }
1098   });
1099   PetscCall(MatSeqAIJRestoreKokkosViewWrite(A,&Aa));
1100   PetscFunctionReturn(0);
1101 }
1102 
1103 static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B,Mat A,const MatFactorInfo *info)
1104 {
1105   PetscFunctionBegin;
1106   PetscCall(MatSeqAIJKokkosSyncHost(A));
1107   PetscCall(MatLUFactorNumeric_SeqAIJ(B,A,info));
1108   B->offloadmask = PETSC_OFFLOAD_CPU;
1109   PetscFunctionReturn(0);
1110 }
1111 
1112 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A)
1113 {
1114   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
1115 
1116   PetscFunctionBegin;
1117   A->offloadmask = PETSC_OFFLOAD_KOKKOS; /* We do not really use this flag */
1118   A->boundtocpu  = PETSC_FALSE;
1119 
1120   A->ops->assemblyend               = MatAssemblyEnd_SeqAIJKokkos;
1121   A->ops->destroy                   = MatDestroy_SeqAIJKokkos;
1122   A->ops->duplicate                 = MatDuplicate_SeqAIJKokkos;
1123   A->ops->axpy                      = MatAXPY_SeqAIJKokkos;
1124   A->ops->scale                     = MatScale_SeqAIJKokkos;
1125   A->ops->zeroentries               = MatZeroEntries_SeqAIJKokkos;
1126   A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJKokkos;
1127   A->ops->mult                      = MatMult_SeqAIJKokkos;
1128   A->ops->multadd                   = MatMultAdd_SeqAIJKokkos;
1129   A->ops->multtranspose             = MatMultTranspose_SeqAIJKokkos;
1130   A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJKokkos;
1131   A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJKokkos;
1132   A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos;
1133   A->ops->productnumeric            = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos;
1134   A->ops->transpose                 = MatTranspose_SeqAIJKokkos;
1135   A->ops->setoption                 = MatSetOption_SeqAIJKokkos;
1136   a->ops->getarray                  = MatSeqAIJGetArray_SeqAIJKokkos;
1137   a->ops->restorearray              = MatSeqAIJRestoreArray_SeqAIJKokkos;
1138   a->ops->getarrayread              = MatSeqAIJGetArrayRead_SeqAIJKokkos;
1139   a->ops->restorearrayread          = MatSeqAIJRestoreArrayRead_SeqAIJKokkos;
1140   a->ops->getarraywrite             = MatSeqAIJGetArrayWrite_SeqAIJKokkos;
1141   a->ops->restorearraywrite         = MatSeqAIJRestoreArrayWrite_SeqAIJKokkos;
1142   a->ops->getcsrandmemtype          = MatSeqAIJGetCSRAndMemType_SeqAIJKokkos;
1143 
1144   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJKokkos));
1145   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_SeqAIJKokkos));
1146   PetscFunctionReturn(0);
1147 }
1148 
1149 PETSC_INTERN PetscErrorCode  MatSetSeqAIJKokkosWithCSRMatrix(Mat A,Mat_SeqAIJKokkos *akok)
1150 {
1151   Mat_SeqAIJ *aseq;
1152   PetscInt    i,m,n;
1153 
1154   PetscFunctionBegin;
1155   PetscCheck(!A->spptr,PETSC_COMM_SELF,PETSC_ERR_PLIB,"A->spptr is supposed to be empty");
1156 
1157   m = akok->nrows();
1158   n = akok->ncols();
1159   PetscCall(MatSetSizes(A,m,n,m,n));
1160   PetscCall(MatSetType(A,MATSEQAIJKOKKOS));
1161 
1162   /* Set up data structures of A as a MATSEQAIJ */
1163   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(A,MAT_SKIP_ALLOCATION,NULL));
1164   aseq = (Mat_SeqAIJ*)(A)->data;
1165 
1166   akok->i_dual.sync_host(); /* We always need sync'ed i, j on host */
1167   akok->j_dual.sync_host();
1168 
1169   aseq->i            = akok->i_host_data();
1170   aseq->j            = akok->j_host_data();
1171   aseq->a            = akok->a_host_data();
1172   aseq->nonew        = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/
1173   aseq->singlemalloc = PETSC_FALSE;
1174   aseq->free_a       = PETSC_FALSE;
1175   aseq->free_ij      = PETSC_FALSE;
1176   aseq->nz           = akok->nnz();
1177   aseq->maxnz        = aseq->nz;
1178 
1179   PetscCall(PetscMalloc1(m,&aseq->imax));
1180   PetscCall(PetscMalloc1(m,&aseq->ilen));
1181   for (i=0; i<m; i++) {
1182     aseq->ilen[i] = aseq->imax[i] = aseq->i[i+1] - aseq->i[i];
1183   }
1184 
1185   /* It is critical to set the nonzerostate, as we use it to check if sparsity pattern (hence data) has changed on host in MatAssemblyEnd */
1186   akok->nonzerostate = A->nonzerostate;
1187   A->spptr = akok; /* Set A->spptr before MatAssembly so that A->spptr won't be allocated again there */
1188   PetscCall(MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY));
1189   PetscCall(MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY));
1190   PetscFunctionReturn(0);
1191 }
1192 
1193 /* Crete a SEQAIJKOKKOS matrix with a Mat_SeqAIJKokkos data structure
1194 
1195    Note we have names like MatSeqAIJSetPreallocationCSR, so I use capitalized CSR
1196  */
1197 PETSC_INTERN PetscErrorCode  MatCreateSeqAIJKokkosWithCSRMatrix(MPI_Comm comm,Mat_SeqAIJKokkos *akok,Mat *A)
1198 {
1199   PetscFunctionBegin;
1200   PetscCall(MatCreate(comm,A));
1201   PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A,akok));
1202   PetscFunctionReturn(0);
1203 }
1204 
1205 /* --------------------------------------------------------------------------------*/
1206 /*@C
1207    MatCreateSeqAIJKokkos - Creates a sparse matrix in AIJ (compressed row) format
1208    (the default parallel PETSc format). This matrix will ultimately be handled by
1209    Kokkos for calculations. For good matrix
1210    assembly performance the user should preallocate the matrix storage by setting
1211    the parameter nz (or the array nnz).  By setting these parameters accurately,
1212    performance during matrix assembly can be increased by more than a factor of 50.
1213 
1214    Collective
1215 
1216    Input Parameters:
1217 +  comm - MPI communicator, set to PETSC_COMM_SELF
1218 .  m - number of rows
1219 .  n - number of columns
1220 .  nz - number of nonzeros per row (same for all rows)
1221 -  nnz - array containing the number of nonzeros in the various rows
1222          (possibly different for each row) or NULL
1223 
1224    Output Parameter:
1225 .  A - the matrix
1226 
1227    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
1228    MatXXXXSetPreallocation() paradgm instead of this routine directly.
1229    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
1230 
1231    Notes:
1232    If nnz is given then nz is ignored
1233 
1234    The AIJ format (also called the Yale sparse matrix format or
1235    compressed row storage), is fully compatible with standard Fortran 77
1236    storage.  That is, the stored row and column indices can begin at
1237    either one (as in Fortran) or zero.  See the users' manual for details.
1238 
1239    Specify the preallocated storage with either nz or nnz (not both).
1240    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
1241    allocation.  For large problems you MUST preallocate memory or you
1242    will get TERRIBLE performance, see the users' manual chapter on matrices.
1243 
1244    By default, this format uses inodes (identical nodes) when possible, to
1245    improve numerical efficiency of matrix-vector products and solves. We
1246    search for consecutive rows with the same nonzero structure, thereby
1247    reusing matrix information to achieve increased efficiency.
1248 
1249    Level: intermediate
1250 
1251 .seealso: `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MatCreateAIJ()`
1252 @*/
1253 PetscErrorCode  MatCreateSeqAIJKokkos(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
1254 {
1255   PetscFunctionBegin;
1256   PetscCall(PetscKokkosInitializeCheck());
1257   PetscCall(MatCreate(comm,A));
1258   PetscCall(MatSetSizes(*A,m,n,m,n));
1259   PetscCall(MatSetType(*A,MATSEQAIJKOKKOS));
1260   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz));
1261   PetscFunctionReturn(0);
1262 }
1263 
1264 typedef Kokkos::TeamPolicy<>::member_type team_member;
1265 //
1266 // This factorization exploits block diagonal matrices with "Nf" (not used).
1267 // Use -pc_factor_mat_ordering_type rcm to order decouple blocks of size N/Nf for this optimization
1268 //
1269 static PetscErrorCode MatLUFactorNumeric_SeqAIJKOKKOSDEVICE(Mat B,Mat A,const MatFactorInfo *info)
1270 {
1271   Mat_SeqAIJ         *b=(Mat_SeqAIJ*)B->data;
1272   Mat_SeqAIJKokkos   *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr), *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
1273   IS                 isrow = b->row,isicol = b->icol;
1274   const PetscInt     *r_h,*ic_h;
1275   const PetscInt     n=A->rmap->n, *ai_d=aijkok->i_dual.view_device().data(), *aj_d=aijkok->j_dual.view_device().data(), *bi_d=baijkok->i_dual.view_device().data(), *bj_d=baijkok->j_dual.view_device().data(), *bdiag_d = baijkok->diag_d.data();
1276   const PetscScalar  *aa_d = aijkok->a_dual.view_device().data();
1277   PetscScalar        *ba_d = baijkok->a_dual.view_device().data();
1278   PetscBool          row_identity,col_identity;
1279   PetscInt           nc, Nf=1, nVec=32; // should be a parameter, Nf is batch size - not used
1280 
1281   PetscFunctionBegin;
1282   PetscCheck(A->rmap->n == n,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"square matrices only supported %" PetscInt_FMT " %" PetscInt_FMT,A->rmap->n,n);
1283   PetscCall(MatGetOption(A,MAT_STRUCTURALLY_SYMMETRIC,&row_identity));
1284   PetscCheck(row_identity,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"structurally symmetric matrices only supported");
1285   PetscCall(ISGetIndices(isrow,&r_h));
1286   PetscCall(ISGetIndices(isicol,&ic_h));
1287   PetscCall(ISGetSize(isicol,&nc));
1288   PetscCall(PetscLogGpuTimeBegin());
1289   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1290   {
1291 #define KOKKOS_SHARED_LEVEL 1
1292     using scr_mem_t = Kokkos::DefaultExecutionSpace::scratch_memory_space;
1293     using sizet_scr_t = Kokkos::View<size_t, scr_mem_t>;
1294     using scalar_scr_t = Kokkos::View<PetscScalar, scr_mem_t>;
1295     const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_r_k (r_h, n);
1296     Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_r_k ("r", n);
1297     const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_ic_k (ic_h, nc);
1298     Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_ic_k ("ic", nc);
1299     size_t flops_h = 0.0;
1300     Kokkos::View<size_t, Kokkos::HostSpace> h_flops_k (&flops_h);
1301     Kokkos::View<size_t> d_flops_k ("flops");
1302     const int conc = Kokkos::DefaultExecutionSpace().concurrency(), team_size = conc > 1 ? 16 : 1; // 8*32 = 256
1303     const int nloc = n/Nf, Ni = (conc > 8) ? 1 /* some intelegent number of SMs -- but need league_barrier */ : 1;
1304     Kokkos::deep_copy (d_flops_k, h_flops_k);
1305     Kokkos::deep_copy (d_r_k, h_r_k);
1306     Kokkos::deep_copy (d_ic_k, h_ic_k);
1307     // Fill A --> fact
1308     Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec), KOKKOS_LAMBDA (const team_member team) {
1309         const PetscInt  field = team.league_rank()/Ni, field_block = team.league_rank()%Ni; // use grid.x/y in CUDA
1310         const PetscInt  nloc_i =  (nloc/Ni + !!(nloc%Ni)), start_i = field*nloc + field_block*nloc_i, end_i = (start_i + nloc_i) > (field+1)*nloc ? (field+1)*nloc : (start_i + nloc_i);
1311         const PetscInt  *ic = d_ic_k.data(), *r = d_r_k.data();
1312         // zero rows of B
1313         Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) {
1314             PetscInt    nzbL = bi_d[rowb+1] - bi_d[rowb], nzbU = bdiag_d[rowb] - bdiag_d[rowb+1]; // with diag
1315             PetscScalar *baL = ba_d + bi_d[rowb];
1316             PetscScalar *baU = ba_d + bdiag_d[rowb+1]+1;
1317             /* zero (unfactored row) */
1318             for (int j=0;j<nzbL;j++) baL[j] = 0;
1319             for (int j=0;j<nzbU;j++) baU[j] = 0;
1320           });
1321         // copy A into B
1322         Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) {
1323             PetscInt          rowa = r[rowb], nza = ai_d[rowa+1] - ai_d[rowa];
1324             const PetscScalar *av    = aa_d + ai_d[rowa];
1325             const PetscInt    *ajtmp = aj_d + ai_d[rowa];
1326             /* load in initial (unfactored row) */
1327             for (int j=0;j<nza;j++) {
1328               PetscInt    colb = ic[ajtmp[j]];
1329               PetscScalar vala = av[j];
1330               if (colb == rowb) {
1331                 *(ba_d + bdiag_d[rowb]) = vala;
1332               } else {
1333                 const PetscInt    *pbj = bj_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]);
1334                 PetscScalar       *pba = ba_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]);
1335                 PetscInt          nz   = (colb > rowb) ? bdiag_d[rowb] - (bdiag_d[rowb+1]+1) : bi_d[rowb+1] - bi_d[rowb], set=0;
1336                 for (int j=0; j<nz ; j++) {
1337                   if (pbj[j] == colb) {
1338                     pba[j] = vala;
1339                     set++;
1340                     break;
1341                   }
1342                 }
1343                #if !defined(PETSC_HAVE_SYCL)
1344                 if (set!=1) printf("\t\t\t ERROR DID NOT SET ?????\n");
1345                #endif
1346               }
1347             }
1348           });
1349       });
1350     Kokkos::fence();
1351 
1352     Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec).set_scratch_size(KOKKOS_SHARED_LEVEL, Kokkos::PerThread(sizet_scr_t::shmem_size()+scalar_scr_t::shmem_size()), Kokkos::PerTeam(sizet_scr_t::shmem_size())), KOKKOS_LAMBDA (const team_member team) {
1353         sizet_scr_t     colkIdx(team.thread_scratch(KOKKOS_SHARED_LEVEL));
1354         scalar_scr_t    L_ki(team.thread_scratch(KOKKOS_SHARED_LEVEL));
1355         sizet_scr_t     flops(team.team_scratch(KOKKOS_SHARED_LEVEL));
1356         const PetscInt  field = team.league_rank()/Ni, field_block_idx = team.league_rank()%Ni; // use grid.x/y in CUDA
1357         const PetscInt  start = field*nloc, end = start + nloc;
1358         Kokkos::single(Kokkos::PerTeam(team), [=]() { flops() = 0; });
1359         // A22 panel update for each row A(1,:) and col A(:,1)
1360         for (int ii=start; ii<end-1; ii++) {
1361           const PetscInt    *bjUi = bj_d + bdiag_d[ii+1]+1, nzUi = bdiag_d[ii] - (bdiag_d[ii+1]+1); // vector, and vector size, of column indices of U(i,(i+1):end)
1362           const PetscScalar *baUi = ba_d + bdiag_d[ii+1]+1; // vector of data  U(i,i+1:end)
1363           const PetscInt    nUi_its = nzUi/Ni + !!(nzUi%Ni);
1364           const PetscScalar Bii = *(ba_d + bdiag_d[ii]); // diagonal in its special place
1365           Kokkos::parallel_for(Kokkos::TeamThreadRange(team, nUi_its), [=] (const int j) {
1366               PetscInt kIdx = j*Ni + field_block_idx;
1367               if (kIdx >= nzUi) /* void */ ;
1368               else {
1369                 const PetscInt myk  = bjUi[kIdx]; // assume symmetric structure, need a transposed meta-data here in general
1370                 const PetscInt *pjL = bj_d + bi_d[myk]; // look for L(myk,ii) in start of row
1371                 const PetscInt nzL  = bi_d[myk+1] - bi_d[myk]; // size of L_k(:)
1372                 size_t         st_idx;
1373                 // find and do L(k,i) = A(:k,i) / A(i,i)
1374                 Kokkos::single(Kokkos::PerThread(team), [&]() { colkIdx() = PETSC_MAX_INT; });
1375                 // get column, there has got to be a better way
1376                 Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,nzL), [&] (const int &j, size_t &idx) {
1377                     if (pjL[j] == ii) {
1378                       PetscScalar *pLki = ba_d + bi_d[myk] + j;
1379                       idx = j; // output
1380                       *pLki = *pLki/Bii; // column scaling:  L(k,i) = A(:k,i) / A(i,i)
1381                     }
1382                 }, st_idx);
1383                 Kokkos::single(Kokkos::PerThread(team), [=]() { colkIdx() = st_idx; L_ki() = *(ba_d + bi_d[myk] + st_idx); });
1384 #if defined(PETSC_USE_DEBUG) && !defined(PETSC_HAVE_SYCL)
1385                 if (colkIdx() == PETSC_MAX_INT) printf("\t\t\t\t\t\t\tERROR: failed to find L_ki(%d,%d)\n",(int)myk,ii); // uses a register
1386 #endif
1387                 // active row k, do  A_kj -= Lki * U_ij; j \in U(i,:) j != i
1388                 // U(i+1,:end)
1389                 Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,nzUi), [=] (const int &uiIdx) { // index into i (U)
1390                       PetscScalar Uij = baUi[uiIdx];
1391                       PetscInt    col = bjUi[uiIdx];
1392                       if (col==myk) {
1393                         // A_kk = A_kk - L_ki * U_ij(k)
1394                         PetscScalar *Akkv = (ba_d + bdiag_d[myk]); // diagonal in its special place
1395                         *Akkv = *Akkv - L_ki() * Uij; // UiK
1396                       } else {
1397                         PetscScalar    *start, *end, *pAkjv=NULL;
1398                         PetscInt       high, low;
1399                         const PetscInt *startj;
1400                         if (col<myk) { // L
1401                           PetscScalar *pLki = ba_d + bi_d[myk] + colkIdx();
1402                           PetscInt idx = (pLki+1) - (ba_d + bi_d[myk]); // index into row
1403                           start = pLki+1; // start at pLki+1, A22(myk,1)
1404                           startj= bj_d + bi_d[myk] + idx;
1405                           end   = ba_d + bi_d[myk+1];
1406                         } else {
1407                           PetscInt idx = bdiag_d[myk+1]+1;
1408                           start = ba_d + idx;
1409                           startj= bj_d + idx;
1410                           end   = ba_d + bdiag_d[myk];
1411                         }
1412                         // search for 'col', use bisection search - TODO
1413                         low  = 0;
1414                         high = (PetscInt)(end-start);
1415                         while (high-low > 5) {
1416                           int t = (low+high)/2;
1417                           if (startj[t] > col) high = t;
1418                           else                 low  = t;
1419                         }
1420                         for (pAkjv=start+low; pAkjv<start+high; pAkjv++) {
1421                           if (startj[pAkjv-start] == col) break;
1422                         }
1423 #if defined(PETSC_USE_DEBUG) && !defined(PETSC_HAVE_SYCL)
1424                         if (pAkjv==start+high) printf("\t\t\t\t\t\t\t\t\t\t\tERROR: *** failed to find Akj(%d,%d)\n",(int)myk,(int)col); // uses a register
1425 #endif
1426                         *pAkjv = *pAkjv - L_ki() * Uij; // A_kj = A_kj - L_ki * U_ij
1427                       }
1428                     });
1429               }
1430             });
1431           team.team_barrier(); // this needs to be a league barrier to use more that one SM per block
1432           if (field_block_idx==0) Kokkos::single(Kokkos::PerTeam(team), [&]() { Kokkos::atomic_add( flops.data(), (size_t)(2*(nzUi*nzUi)+2)); });
1433         } /* endof for (i=0; i<n; i++) { */
1434         Kokkos::single(Kokkos::PerTeam(team), [=]() { Kokkos::atomic_add( &d_flops_k(), flops()); flops() = 0; });
1435       });
1436     Kokkos::fence();
1437     Kokkos::deep_copy (h_flops_k, d_flops_k);
1438     PetscCall(PetscLogGpuFlops((PetscLogDouble)h_flops_k()));
1439     Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, 1, 256), KOKKOS_LAMBDA (const team_member team) {
1440         const PetscInt  lg_rank = team.league_rank(), field = lg_rank/Ni; //, field_offset = lg_rank%Ni;
1441         const PetscInt  start = field*nloc, end = start + nloc, n_its = (nloc/Ni + !!(nloc%Ni)); // 1/Ni iters
1442         /* Invert diagonal for simpler triangular solves */
1443         Kokkos::parallel_for(Kokkos::TeamVectorRange(team, n_its), [=] (int outer_index) {
1444             int i = start + outer_index*Ni + lg_rank%Ni;
1445             if (i < end) {
1446               PetscScalar *pv = ba_d + bdiag_d[i];
1447               *pv = 1.0/(*pv);
1448             }
1449           });
1450       });
1451   }
1452   PetscCall(PetscLogGpuTimeEnd());
1453   PetscCall(ISRestoreIndices(isicol,&ic_h));
1454   PetscCall(ISRestoreIndices(isrow,&r_h));
1455 
1456   PetscCall(ISIdentity(isrow,&row_identity));
1457   PetscCall(ISIdentity(isicol,&col_identity));
1458   if (b->inode.size) {
1459     B->ops->solve = MatSolve_SeqAIJ_Inode;
1460   } else if (row_identity && col_identity) {
1461     B->ops->solve = MatSolve_SeqAIJ_NaturalOrdering;
1462   } else {
1463     B->ops->solve = MatSolve_SeqAIJ; // at least this needs to be in Kokkos
1464   }
1465   B->offloadmask = PETSC_OFFLOAD_GPU;
1466   PetscCall(MatSeqAIJKokkosSyncHost(B)); // solve on CPU
1467   B->ops->solveadd          = MatSolveAdd_SeqAIJ; // and this
1468   B->ops->solvetranspose    = MatSolveTranspose_SeqAIJ;
1469   B->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ;
1470   B->ops->matsolve          = MatMatSolve_SeqAIJ;
1471   B->assembled              = PETSC_TRUE;
1472   B->preallocated           = PETSC_TRUE;
1473 
1474   PetscFunctionReturn(0);
1475 }
1476 
1477 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
1478 {
1479   PetscFunctionBegin;
1480   PetscCall(MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info));
1481   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos;
1482   PetscFunctionReturn(0);
1483 }
1484 
1485 static PetscErrorCode MatSeqAIJKokkosSymbolicSolveCheck(Mat A)
1486 {
1487   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr;
1488 
1489   PetscFunctionBegin;
1490   if (!factors->sptrsv_symbolic_completed) {
1491     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khU,factors->iU_d,factors->jU_d,factors->aU_d);
1492     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khL,factors->iL_d,factors->jL_d,factors->aL_d);
1493     factors->sptrsv_symbolic_completed = PETSC_TRUE;
1494   }
1495   PetscFunctionReturn(0);
1496 }
1497 
1498 /* Check if we need to update factors etc for transpose solve */
1499 static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A)
1500 {
1501   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr;
1502   MatColIdxType             n = A->rmap->n;
1503 
1504   PetscFunctionBegin;
1505   if (!factors->transpose_updated) { /* TODO: KK needs to provide functions to do numeric transpose only */
1506     /* Update L^T and do sptrsv symbolic */
1507     factors->iLt_d = MatRowMapKokkosView("factors->iLt_d",n+1);
1508     Kokkos::deep_copy(factors->iLt_d,0); /* KK requires 0 */
1509     factors->jLt_d = MatColIdxKokkosView("factors->jLt_d",factors->jL_d.extent(0));
1510     factors->aLt_d = MatScalarKokkosView("factors->aLt_d",factors->aL_d.extent(0));
1511 
1512     KokkosKernels::Impl::transpose_matrix<
1513       ConstMatRowMapKokkosView,ConstMatColIdxKokkosView,ConstMatScalarKokkosView,
1514       MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView,
1515       MatRowMapKokkosView,DefaultExecutionSpace>(
1516         n,n,factors->iL_d,factors->jL_d,factors->aL_d,
1517         factors->iLt_d,factors->jLt_d,factors->aLt_d);
1518 
1519     /* TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices.
1520       We have to sort the indices, until KK provides finer control options.
1521     */
1522     KokkosKernels::sort_crs_matrix<DefaultExecutionSpace,
1523       MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView>(
1524         factors->iLt_d,factors->jLt_d,factors->aLt_d);
1525 
1526     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khLt,factors->iLt_d,factors->jLt_d,factors->aLt_d);
1527 
1528     /* Update U^T and do sptrsv symbolic */
1529     factors->iUt_d = MatRowMapKokkosView("factors->iUt_d",n+1);
1530     Kokkos::deep_copy(factors->iUt_d,0); /* KK requires 0 */
1531     factors->jUt_d = MatColIdxKokkosView("factors->jUt_d",factors->jU_d.extent(0));
1532     factors->aUt_d = MatScalarKokkosView("factors->aUt_d",factors->aU_d.extent(0));
1533 
1534     KokkosKernels::Impl::transpose_matrix<
1535       ConstMatRowMapKokkosView,ConstMatColIdxKokkosView,ConstMatScalarKokkosView,
1536       MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView,
1537       MatRowMapKokkosView,DefaultExecutionSpace>(
1538         n,n,factors->iU_d, factors->jU_d, factors->aU_d,
1539         factors->iUt_d,factors->jUt_d,factors->aUt_d);
1540 
1541     /* Sort indices. See comments above */
1542     KokkosKernels::sort_crs_matrix<DefaultExecutionSpace,
1543       MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView>(
1544         factors->iUt_d,factors->jUt_d,factors->aUt_d);
1545 
1546     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khUt,factors->iUt_d,factors->jUt_d,factors->aUt_d);
1547     factors->transpose_updated = PETSC_TRUE;
1548   }
1549   PetscFunctionReturn(0);
1550 }
1551 
1552 /* Solve Ax = b, with A = LU */
1553 static PetscErrorCode MatSolve_SeqAIJKokkos(Mat A,Vec b,Vec x)
1554 {
1555   ConstPetscScalarKokkosView     bv;
1556   PetscScalarKokkosView          xv;
1557   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr;
1558 
1559   PetscFunctionBegin;
1560   PetscCall(PetscLogGpuTimeBegin());
1561   PetscCall(MatSeqAIJKokkosSymbolicSolveCheck(A));
1562   PetscCall(VecGetKokkosView(b,&bv));
1563   PetscCall(VecGetKokkosViewWrite(x,&xv));
1564   /* Solve L tmpv = b */
1565   PetscCallCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khL,factors->iL_d,factors->jL_d,factors->aL_d,bv,factors->workVector));
1566   /* Solve Ux = tmpv */
1567   PetscCallCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khU,factors->iU_d,factors->jU_d,factors->aU_d,factors->workVector,xv));
1568   PetscCall(VecRestoreKokkosView(b,&bv));
1569   PetscCall(VecRestoreKokkosViewWrite(x,&xv));
1570   PetscCall(PetscLogGpuTimeEnd());
1571   PetscFunctionReturn(0);
1572 }
1573 
1574 /* Solve A^T x = b, where A^T = U^T L^T */
1575 static PetscErrorCode MatSolveTranspose_SeqAIJKokkos(Mat A,Vec b,Vec x)
1576 {
1577   ConstPetscScalarKokkosView     bv;
1578   PetscScalarKokkosView          xv;
1579   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr;
1580 
1581   PetscFunctionBegin;
1582   PetscCall(PetscLogGpuTimeBegin());
1583   PetscCall(MatSeqAIJKokkosTransposeSolveCheck(A));
1584   PetscCall(VecGetKokkosView(b,&bv));
1585   PetscCall(VecGetKokkosViewWrite(x,&xv));
1586   /* Solve U^T tmpv = b */
1587   KokkosSparse::Experimental::sptrsv_solve(&factors->khUt,factors->iUt_d,factors->jUt_d,factors->aUt_d,bv,factors->workVector);
1588 
1589   /* Solve L^T x = tmpv */
1590   KokkosSparse::Experimental::sptrsv_solve(&factors->khLt,factors->iLt_d,factors->jLt_d,factors->aLt_d,factors->workVector,xv);
1591   PetscCall(VecRestoreKokkosView(b,&bv));
1592   PetscCall(VecRestoreKokkosViewWrite(x,&xv));
1593   PetscCall(PetscLogGpuTimeEnd());
1594   PetscFunctionReturn(0);
1595 }
1596 
1597 static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos(Mat B,Mat A,const MatFactorInfo *info)
1598 {
1599   Mat_SeqAIJKokkos               *aijkok = (Mat_SeqAIJKokkos*)A->spptr;
1600   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)B->spptr;
1601   PetscInt                       fill_lev = info->levels;
1602 
1603   PetscFunctionBegin;
1604   PetscCall(PetscLogGpuTimeBegin());
1605   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1606 
1607   auto a_d = aijkok->a_dual.view_device();
1608   auto i_d = aijkok->i_dual.view_device();
1609   auto j_d = aijkok->j_dual.view_device();
1610 
1611   KokkosSparse::Experimental::spiluk_numeric(&factors->kh,fill_lev,i_d,j_d,a_d,factors->iL_d,factors->jL_d,factors->aL_d,factors->iU_d,factors->jU_d,factors->aU_d);
1612 
1613   B->assembled                       = PETSC_TRUE;
1614   B->preallocated                    = PETSC_TRUE;
1615   B->ops->solve                      = MatSolve_SeqAIJKokkos;
1616   B->ops->solvetranspose             = MatSolveTranspose_SeqAIJKokkos;
1617   B->ops->matsolve                   = NULL;
1618   B->ops->matsolvetranspose          = NULL;
1619   B->offloadmask                     = PETSC_OFFLOAD_GPU;
1620 
1621   /* Once the factors' value changed, we need to update their transpose and sptrsv handle */
1622   factors->transpose_updated         = PETSC_FALSE;
1623   factors->sptrsv_symbolic_completed = PETSC_FALSE;
1624   /* TODO: log flops, but how to know that? */
1625   PetscCall(PetscLogGpuTimeEnd());
1626   PetscFunctionReturn(0);
1627 }
1628 
1629 static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
1630 {
1631   Mat_SeqAIJKokkos               *aijkok;
1632   Mat_SeqAIJ                     *b;
1633   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)B->spptr;
1634   PetscInt                       fill_lev = info->levels;
1635   PetscInt                       nnzA = ((Mat_SeqAIJ*)A->data)->nz,nnzL,nnzU;
1636   PetscInt                       n = A->rmap->n;
1637 
1638   PetscFunctionBegin;
1639   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1640   /* Rebuild factors */
1641   if (factors) {factors->Destroy();} /* Destroy the old if it exists */
1642   else {B->spptr = factors = new Mat_SeqAIJKokkosTriFactors(n);}
1643 
1644   /* Create a spiluk handle and then do symbolic factorization */
1645   nnzL = nnzU = PetscRealIntMultTruncate(info->fill,nnzA);
1646   factors->kh.create_spiluk_handle(KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1,n,nnzL,nnzU);
1647 
1648   auto spiluk_handle = factors->kh.get_spiluk_handle();
1649 
1650   Kokkos::realloc(factors->iL_d,n+1); /* Free old arrays and realloc */
1651   Kokkos::realloc(factors->jL_d,spiluk_handle->get_nnzL());
1652   Kokkos::realloc(factors->iU_d,n+1);
1653   Kokkos::realloc(factors->jU_d,spiluk_handle->get_nnzU());
1654 
1655   aijkok = (Mat_SeqAIJKokkos*)A->spptr;
1656   auto i_d = aijkok->i_dual.view_device();
1657   auto j_d = aijkok->j_dual.view_device();
1658   KokkosSparse::Experimental::spiluk_symbolic(&factors->kh,fill_lev,i_d,j_d,factors->iL_d,factors->jL_d,factors->iU_d,factors->jU_d);
1659   /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */
1660 
1661   Kokkos::resize (factors->jL_d,spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */
1662   Kokkos::resize (factors->jU_d,spiluk_handle->get_nnzU());
1663   Kokkos::realloc(factors->aL_d,spiluk_handle->get_nnzL()); /* No need to retain old value */
1664   Kokkos::realloc(factors->aU_d,spiluk_handle->get_nnzU());
1665 
1666   /* TODO: add options to select sptrsv algorithms */
1667   /* Create sptrsv handles for L, U and their transpose */
1668  #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
1669   auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SPTRSV_CUSPARSE;
1670  #else
1671   auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1;
1672  #endif
1673 
1674   factors->khL.create_sptrsv_handle(sptrsv_alg,n,true/* L is lower tri */);
1675   factors->khU.create_sptrsv_handle(sptrsv_alg,n,false/* U is not lower tri */);
1676   factors->khLt.create_sptrsv_handle(sptrsv_alg,n,false/* L^T is not lower tri */);
1677   factors->khUt.create_sptrsv_handle(sptrsv_alg,n,true/* U^T is lower tri */);
1678 
1679   /* Fill fields of the factor matrix B */
1680   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(B,MAT_SKIP_ALLOCATION,NULL));
1681   b     = (Mat_SeqAIJ*)B->data;
1682   b->nz = b->maxnz = spiluk_handle->get_nnzL()+spiluk_handle->get_nnzU();
1683   B->info.fill_ratio_given  = info->fill;
1684   B->info.fill_ratio_needed = ((PetscReal)b->nz)/((PetscReal)nnzA);
1685 
1686   B->offloadmask            = PETSC_OFFLOAD_GPU;
1687   B->ops->lufactornumeric   = MatILUFactorNumeric_SeqAIJKokkos;
1688   PetscFunctionReturn(0);
1689 }
1690 
1691 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
1692 {
1693   Mat_SeqAIJ       *b=(Mat_SeqAIJ*)B->data;
1694   const PetscInt   nrows   = A->rmap->n;
1695 
1696   PetscFunctionBegin;
1697   PetscCall(MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info));
1698   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKOKKOSDEVICE;
1699   // move B data into Kokkos
1700   PetscCall(MatSeqAIJKokkosSyncDevice(B)); // create aijkok
1701   PetscCall(MatSeqAIJKokkosSyncDevice(A)); // create aijkok
1702   {
1703     Mat_SeqAIJKokkos *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
1704     if (!baijkok->diag_d.extent(0)) {
1705       const Kokkos::View<PetscInt*, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_diag (b->diag,nrows+1);
1706       baijkok->diag_d = Kokkos::View<PetscInt*>(Kokkos::create_mirror(DefaultMemorySpace(),h_diag));
1707       Kokkos::deep_copy (baijkok->diag_d, h_diag);
1708     }
1709   }
1710   PetscFunctionReturn(0);
1711 }
1712 
1713 static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos(Mat A,MatSolverType *type)
1714 {
1715   PetscFunctionBegin;
1716   *type = MATSOLVERKOKKOS;
1717   PetscFunctionReturn(0);
1718 }
1719 
1720 static PetscErrorCode MatFactorGetSolverType_seqaij_kokkos_device(Mat A,MatSolverType *type)
1721 {
1722   PetscFunctionBegin;
1723   *type = MATSOLVERKOKKOSDEVICE;
1724   PetscFunctionReturn(0);
1725 }
1726 
1727 /*MC
1728   MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices
1729   on a single GPU of type, SeqAIJKokkos, AIJKokkos.
1730 
1731   Level: beginner
1732 
1733 .seealso: `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJKokkos()`, `MATAIJKOKKOS`, `MatKokkosSetFormat()`, `MatKokkosStorageFormat`, `MatKokkosFormatOperation`
1734 M*/
1735 PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A,MatFactorType ftype,Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */
1736 {
1737   PetscInt       n = A->rmap->n;
1738 
1739   PetscFunctionBegin;
1740   PetscCall(MatCreate(PetscObjectComm((PetscObject)A),B));
1741   PetscCall(MatSetSizes(*B,n,n,n,n));
1742   (*B)->factortype = ftype;
1743   PetscCall(PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]));
1744   PetscCall(MatSetType(*B,MATSEQAIJKOKKOS));
1745 
1746   if (ftype == MAT_FACTOR_LU) {
1747     PetscCall(MatSetBlockSizesFromMats(*B,A,A));
1748     (*B)->canuseordering         = PETSC_TRUE;
1749     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJKokkos;
1750   } else if (ftype == MAT_FACTOR_ILU) {
1751     PetscCall(MatSetBlockSizesFromMats(*B,A,A));
1752     (*B)->canuseordering         = PETSC_FALSE;
1753     (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos;
1754   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]);
1755 
1756   PetscCall(MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL));
1757   PetscCall(PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_SeqAIJKokkos));
1758   PetscFunctionReturn(0);
1759 }
1760 
1761 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijkokkos_kokkos_device(Mat A,MatFactorType ftype,Mat *B)
1762 {
1763   PetscInt       n = A->rmap->n;
1764 
1765   PetscFunctionBegin;
1766   PetscCall(MatCreate(PetscObjectComm((PetscObject)A),B));
1767   PetscCall(MatSetSizes(*B,n,n,n,n));
1768   (*B)->factortype = ftype;
1769   (*B)->canuseordering = PETSC_TRUE;
1770   PetscCall(PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]));
1771   PetscCall(MatSetType(*B,MATSEQAIJKOKKOS));
1772 
1773   if (ftype == MAT_FACTOR_LU) {
1774     PetscCall(MatSetBlockSizesFromMats(*B,A,A));
1775     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE;
1776   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for KOKKOS Matrix Types");
1777 
1778   PetscCall(MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL));
1779   PetscCall(PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_kokkos_device));
1780   PetscFunctionReturn(0);
1781 }
1782 
1783 PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_KOKKOS(void)
1784 {
1785   PetscFunctionBegin;
1786   PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_SeqAIJKokkos_Kokkos));
1787   PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_ILU,MatGetFactor_SeqAIJKokkos_Kokkos));
1788   PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOSDEVICE,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_seqaijkokkos_kokkos_device));
1789   PetscFunctionReturn(0);
1790 }
1791 
1792 /* Utility to print out a KokkosCsrMatrix for debugging */
1793 PETSC_INTERN PetscErrorCode PrintCsrMatrix(const KokkosCsrMatrix& csrmat)
1794 {
1795   const auto&       iv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.graph.row_map);
1796   const auto&       jv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.graph.entries);
1797   const auto&       av = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.values);
1798   const PetscInt    *i = iv.data();
1799   const PetscInt    *j = jv.data();
1800   const PetscScalar *a = av.data();
1801   PetscInt          m = csrmat.numRows(),n = csrmat.numCols(),nnz = csrmat.nnz();
1802 
1803   PetscFunctionBegin;
1804   PetscCall(PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT " x %" PetscInt_FMT " SeqAIJKokkos, with %" PetscInt_FMT " nonzeros\n",m,n,nnz));
1805   for (PetscInt k=0; k<m; k++) {
1806     PetscCall(PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT ": ",k));
1807     for (PetscInt p=i[k]; p<i[k+1]; p++) {
1808       PetscCall(PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT "(%.1f), ",j[p],(double)PetscRealPart(a[p])));
1809     }
1810     PetscCall(PetscPrintf(PETSC_COMM_SELF,"\n"));
1811   }
1812   PetscFunctionReturn(0);
1813 }
1814