xref: /petsc/src/mat/impls/aij/seq/kokkos/aijkok.kokkos.cxx (revision 5f9db2b25dbeb4dbaf539bdb264ccb4c01c47bc2)
1 #include <petscvec_kokkos.hpp>
2 #include <petscpkg_version.h>
3 #include <petsc/private/petscimpl.h>
4 #include <petsc/private/sfimpl.h>
5 #include <petscsystypes.h>
6 #include <petscerror.h>
7 
8 #include <Kokkos_Core.hpp>
9 #include <KokkosBlas.hpp>
10 #include <KokkosSparse_CrsMatrix.hpp>
11 #include <KokkosSparse_spmv.hpp>
12 #include <KokkosSparse_spiluk.hpp>
13 #include <KokkosSparse_sptrsv.hpp>
14 #include <KokkosSparse_spgemm.hpp>
15 #include <KokkosSparse_spadd.hpp>
16 
17 #include <../src/mat/impls/aij/seq/kokkos/aijkok.hpp>
18 
19 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_GE(3,6,99)
20   #include <KokkosSparse_Utils.hpp>
21   using KokkosSparse::Impl::transpose_matrix;
22   using KokkosSparse::sort_crs_matrix;
23 #else
24   #include <KokkosKernels_Sorting.hpp>
25   using KokkosKernels::Impl::transpose_matrix;
26   using KokkosKernels::sort_crs_matrix;
27 #endif
28 
29 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */
30 
31 /* MatAssemblyEnd_SeqAIJKokkos() happens when we finalized nonzeros of the matrix, either after
32    we assembled the matrix on host, or after we directly produced the matrix data on device (ex., through MatMatMult).
33    In the latter case, it is important to set a_dual's sync state correctly.
34  */
35 static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A,MatAssemblyType mode)
36 {
37   Mat_SeqAIJ       *aijseq;
38   Mat_SeqAIJKokkos *aijkok;
39 
40   PetscFunctionBegin;
41   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0);
42   PetscCall(MatAssemblyEnd_SeqAIJ(A,mode));
43 
44   aijseq = static_cast<Mat_SeqAIJ*>(A->data);
45   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
46 
47   /* If aijkok does not exist, we just copy i, j to device.
48      If aijkok already exists, but the device's nonzero pattern does not match with the host's, we assume the latest data is on host.
49      In both cases, we build a new aijkok structure.
50   */
51   if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { /* aijkok might not exist yet or nonzero pattern has changed */
52     delete aijkok;
53     aijkok   = new Mat_SeqAIJKokkos(A->rmap->n,A->cmap->n,aijseq->nz,aijseq->i,aijseq->j,aijseq->a,A->nonzerostate,PETSC_FALSE/*don't copy mat values to device*/);
54     A->spptr = aijkok;
55   }
56 
57   if (aijkok->device_mat_d.data()) {
58     A->offloadmask = PETSC_OFFLOAD_GPU; // in GPU mode, no going back. MatSetValues checks this
59   }
60   PetscFunctionReturn(0);
61 }
62 
63 /* Sync CSR data to device if not yet */
64 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A)
65 {
66   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
67 
68   PetscFunctionBegin;
69   PetscCheck(A->factortype == MAT_FACTOR_NONE,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync factorized matrix from host to device");
70   PetscCheck(aijkok,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
71   if (aijkok->a_dual.need_sync_device()) {
72     aijkok->a_dual.sync_device();
73     aijkok->transpose_updated = PETSC_FALSE; /* values of the transpose is out-of-date */
74     aijkok->hermitian_updated = PETSC_FALSE;
75   }
76   PetscFunctionReturn(0);
77 }
78 
79 /* Mark the CSR data on device as modified */
80 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosModifyDevice(Mat A)
81 {
82   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
83 
84   PetscFunctionBegin;
85   PetscCheck(A->factortype == MAT_FACTOR_NONE,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not supported for factorized matries");
86   aijkok->a_dual.clear_sync_state();
87   aijkok->a_dual.modify_device();
88   aijkok->transpose_updated = PETSC_FALSE;
89   aijkok->hermitian_updated = PETSC_FALSE;
90   PetscCall(MatSeqAIJInvalidateDiagonal(A));
91   PetscCall(PetscObjectStateIncrease((PetscObject)A));
92   PetscFunctionReturn(0);
93 }
94 
95 static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A)
96 {
97   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
98 
99   PetscFunctionBegin;
100   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
101   /* We do not expect one needs factors on host  */
102   PetscCheck(A->factortype == MAT_FACTOR_NONE,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync factorized matrix from device to host");
103   PetscCheck(aijkok,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing AIJKOK");
104   aijkok->a_dual.sync_host();
105   PetscFunctionReturn(0);
106 }
107 
108 static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A,PetscScalar *array[])
109 {
110   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
111 
112   PetscFunctionBegin;
113   /* aijkok contains valid pointers only if the host's nonzerostate matches with the device's.
114     Calling MatSeqAIJSetPreallocation() or MatSetValues() on host, where aijseq->{i,j,a} might be
115     reallocated, will lead to stale {i,j,a}_dual in aijkok. In both operations, the hosts's nonzerostate
116     must have been updated. The stale aijkok will be rebuilt during MatAssemblyEnd.
117   */
118   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
119     aijkok->a_dual.sync_host();
120     *array = aijkok->a_dual.view_host().data();
121   } else { /* Happens when calling MatSetValues on a newly created matrix */
122     *array = static_cast<Mat_SeqAIJ*>(A->data)->a;
123   }
124   PetscFunctionReturn(0);
125 }
126 
127 static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJKokkos(Mat A,PetscScalar *array[])
128 {
129   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
130 
131   PetscFunctionBegin;
132   if (aijkok && A->nonzerostate == aijkok->nonzerostate) aijkok->a_dual.modify_host();
133   PetscFunctionReturn(0);
134 }
135 
136 static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJKokkos(Mat A,const PetscScalar *array[])
137 {
138   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
139 
140   PetscFunctionBegin;
141   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
142     aijkok->a_dual.sync_host();
143     *array = aijkok->a_dual.view_host().data();
144   } else {
145     *array = static_cast<Mat_SeqAIJ*>(A->data)->a;
146   }
147   PetscFunctionReturn(0);
148 }
149 
150 static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJKokkos(Mat A,const PetscScalar *array[])
151 {
152   PetscFunctionBegin;
153   *array = NULL;
154   PetscFunctionReturn(0);
155 }
156 
157 static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJKokkos(Mat A,PetscScalar *array[])
158 {
159   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
160 
161   PetscFunctionBegin;
162   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
163     *array = aijkok->a_dual.view_host().data();
164   } else { /* Ex. happens with MatZeroEntries on a preallocated but not assembled matrix */
165     *array = static_cast<Mat_SeqAIJ*>(A->data)->a;
166   }
167   PetscFunctionReturn(0);
168 }
169 
170 static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJKokkos(Mat A,PetscScalar *array[])
171 {
172   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
173 
174   PetscFunctionBegin;
175   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
176     aijkok->a_dual.clear_sync_state();
177     aijkok->a_dual.modify_host();
178   }
179   PetscFunctionReturn(0);
180 }
181 
182 static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJKokkos(Mat A,const PetscInt **i,const PetscInt **j,PetscScalar **a,PetscMemType *mtype)
183 {
184   Mat_SeqAIJKokkos  *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
185 
186   PetscFunctionBegin;
187   PetscCheck(aijkok != NULL,PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"aijkok is NULL");
188 
189   if (i) *i = aijkok->i_device_data();
190   if (j) *j = aijkok->j_device_data();
191   if (a) {
192     aijkok->a_dual.sync_device();
193     *a = aijkok->a_device_data();
194   }
195   if (mtype) *mtype = PETSC_MEMTYPE_KOKKOS;
196   PetscFunctionReturn(0);
197 }
198 
199 // MatSeqAIJKokkosSetDeviceMat takes a PetscSplitCSRDataStructure with device data and copies it to the device. Note, "deep_copy" here is really a shallow copy
200 PetscErrorCode MatSeqAIJKokkosSetDeviceMat(Mat A, PetscSplitCSRDataStructure h_mat)
201 {
202   Mat_SeqAIJKokkos                             *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
203   Kokkos::View<SplitCSRMat, Kokkos::HostSpace> h_mat_k(h_mat);
204 
205   PetscFunctionBegin;
206   PetscCheck(aijkok,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
207   aijkok->device_mat_d = create_mirror(DefaultMemorySpace(),h_mat_k);
208   Kokkos::deep_copy (aijkok->device_mat_d, h_mat_k);
209   PetscFunctionReturn(0);
210 }
211 
212 // MatSeqAIJKokkosGetDeviceMat gets the device if it is here, otherwise it creates a place for it and returns NULL
213 PetscErrorCode MatSeqAIJKokkosGetDeviceMat(Mat A, PetscSplitCSRDataStructure *d_mat)
214 {
215   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
216 
217   PetscFunctionBegin;
218   if (aijkok && aijkok->device_mat_d.data()) {
219     *d_mat = aijkok->device_mat_d.data();
220   } else {
221     PetscCall(MatSeqAIJKokkosSyncDevice(A)); // create aijkok (we are making d_mat now so make a place for it)
222     *d_mat  = NULL;
223   }
224   PetscFunctionReturn(0);
225 }
226 
227 /* Generate the transpose on device and cache it internally */
228 static PetscErrorCode MatSeqAIJKokkosGenerateTranspose_Private(Mat A, KokkosCsrMatrix **csrmatT)
229 {
230   Mat_SeqAIJKokkos                 *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
231 
232   PetscFunctionBegin;
233   PetscCheck(aijkok,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
234   if (!aijkok->csrmatT.nnz() || !aijkok->transpose_updated) { /* Generate At for the first time OR just update its values */
235     /* FIXME: KK does not separate symbolic/numeric transpose. We could have a permutation array to help value-only update */
236     PetscCallCXX(aijkok->a_dual.sync_device());
237     PetscCallCXX(aijkok->csrmatT = transpose_matrix(aijkok->csrmat));
238     PetscCallCXX(sort_crs_matrix(aijkok->csrmatT));
239     aijkok->transpose_updated = PETSC_TRUE;
240   }
241   *csrmatT = &aijkok->csrmatT;
242   PetscFunctionReturn(0);
243 }
244 
245 /* Generate the Hermitian on device and cache it internally */
246 static PetscErrorCode MatSeqAIJKokkosGenerateHermitian_Private(Mat A, KokkosCsrMatrix **csrmatH)
247 {
248   Mat_SeqAIJKokkos                 *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
249 
250   PetscFunctionBegin;
251   PetscCall(PetscLogGpuTimeBegin());
252   PetscCheck(aijkok,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
253   if (!aijkok->csrmatH.nnz() || !aijkok->hermitian_updated) { /* Generate Ah for the first time OR just update its values */
254     PetscCallCXX(aijkok->a_dual.sync_device());
255     PetscCallCXX(aijkok->csrmatH = transpose_matrix(aijkok->csrmat));
256     PetscCallCXX(sort_crs_matrix(aijkok->csrmatH));
257    #if defined(PETSC_USE_COMPLEX)
258     const auto& a = aijkok->csrmatH.values;
259     Kokkos::parallel_for(a.extent(0),KOKKOS_LAMBDA(MatRowMapType i) {a(i) = PetscConj(a(i));});
260    #endif
261     aijkok->hermitian_updated = PETSC_TRUE;
262   }
263   *csrmatH = &aijkok->csrmatH;
264   PetscCall(PetscLogGpuTimeEnd());
265   PetscFunctionReturn(0);
266 }
267 
268 /* y = A x */
269 static PetscErrorCode MatMult_SeqAIJKokkos(Mat A,Vec xx,Vec yy)
270 {
271   Mat_SeqAIJKokkos                 *aijkok;
272   ConstPetscScalarKokkosView       xv;
273   PetscScalarKokkosView            yv;
274 
275   PetscFunctionBegin;
276   PetscCall(PetscLogGpuTimeBegin());
277   PetscCall(MatSeqAIJKokkosSyncDevice(A));
278   PetscCall(VecGetKokkosView(xx,&xv));
279   PetscCall(VecGetKokkosViewWrite(yy,&yv));
280   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
281   KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csrmat,xv,0.0/*beta*/,yv); /* y = alpha A x + beta y */
282   PetscCall(VecRestoreKokkosView(xx,&xv));
283   PetscCall(VecRestoreKokkosViewWrite(yy,&yv));
284   /* 2.0*nnz - numRows seems more accurate here but assumes there are no zero-rows. So a little sloppy here. */
285   PetscCall(PetscLogGpuFlops(2.0*aijkok->csrmat.nnz()));
286   PetscCall(PetscLogGpuTimeEnd());
287   PetscFunctionReturn(0);
288 }
289 
290 /* y = A^T x */
291 static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy)
292 {
293   Mat_SeqAIJKokkos                 *aijkok;
294   const char                       *mode;
295   ConstPetscScalarKokkosView       xv;
296   PetscScalarKokkosView            yv;
297   KokkosCsrMatrix                  *csrmat;
298 
299   PetscFunctionBegin;
300   PetscCall(PetscLogGpuTimeBegin());
301   PetscCall(MatSeqAIJKokkosSyncDevice(A));
302   PetscCall(VecGetKokkosView(xx,&xv));
303   PetscCall(VecGetKokkosViewWrite(yy,&yv));
304   if (A->form_explicit_transpose) {
305     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmat));
306     mode = "N";
307   } else {
308     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
309     csrmat = &aijkok->csrmat;
310     mode = "T";
311   }
312   KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,0.0/*beta*/,yv); /* y = alpha A^T x + beta y */
313   PetscCall(VecRestoreKokkosView(xx,&xv));
314   PetscCall(VecRestoreKokkosViewWrite(yy,&yv));
315   PetscCall(PetscLogGpuFlops(2.0*csrmat->nnz()));
316   PetscCall(PetscLogGpuTimeEnd());
317   PetscFunctionReturn(0);
318 }
319 
320 /* y = A^H x */
321 static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy)
322 {
323   Mat_SeqAIJKokkos                 *aijkok;
324   const char                       *mode;
325   ConstPetscScalarKokkosView       xv;
326   PetscScalarKokkosView            yv;
327   KokkosCsrMatrix                  *csrmat;
328 
329   PetscFunctionBegin;
330   PetscCall(PetscLogGpuTimeBegin());
331   PetscCall(MatSeqAIJKokkosSyncDevice(A));
332   PetscCall(VecGetKokkosView(xx,&xv));
333   PetscCall(VecGetKokkosViewWrite(yy,&yv));
334   if (A->form_explicit_transpose) {
335     PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A,&csrmat));
336     mode = "N";
337   } else {
338     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
339     csrmat = &aijkok->csrmat;
340     mode = "C";
341   }
342   KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,0.0/*beta*/,yv); /* y = alpha A^H x + beta y */
343   PetscCall(VecRestoreKokkosView(xx,&xv));
344   PetscCall(VecRestoreKokkosViewWrite(yy,&yv));
345   PetscCall(PetscLogGpuFlops(2.0*csrmat->nnz()));
346   PetscCall(PetscLogGpuTimeEnd());
347   PetscFunctionReturn(0);
348 }
349 
350 /* z = A x + y */
351 static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy, Vec zz)
352 {
353   Mat_SeqAIJKokkos                 *aijkok;
354   ConstPetscScalarKokkosView       xv,yv;
355   PetscScalarKokkosView            zv;
356 
357   PetscFunctionBegin;
358   PetscCall(PetscLogGpuTimeBegin());
359   PetscCall(MatSeqAIJKokkosSyncDevice(A));
360   PetscCall(VecGetKokkosView(xx,&xv));
361   PetscCall(VecGetKokkosView(yy,&yv));
362   PetscCall(VecGetKokkosViewWrite(zz,&zv));
363   if (zz != yy) Kokkos::deep_copy(zv,yv);
364   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
365   KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csrmat,xv,1.0/*beta*/,zv); /* z = alpha A x + beta z */
366   PetscCall(VecRestoreKokkosView(xx,&xv));
367   PetscCall(VecRestoreKokkosView(yy,&yv));
368   PetscCall(VecRestoreKokkosViewWrite(zz,&zv));
369   PetscCall(PetscLogGpuFlops(2.0*aijkok->csrmat.nnz()));
370   PetscCall(PetscLogGpuTimeEnd());
371   PetscFunctionReturn(0);
372 }
373 
374 /* z = A^T x + y */
375 static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz)
376 {
377   Mat_SeqAIJKokkos                 *aijkok;
378   const char                       *mode;
379   ConstPetscScalarKokkosView       xv,yv;
380   PetscScalarKokkosView            zv;
381   KokkosCsrMatrix                  *csrmat;
382 
383   PetscFunctionBegin;
384   PetscCall(PetscLogGpuTimeBegin());
385   PetscCall(MatSeqAIJKokkosSyncDevice(A));
386   PetscCall(VecGetKokkosView(xx,&xv));
387   PetscCall(VecGetKokkosView(yy,&yv));
388   PetscCall(VecGetKokkosViewWrite(zz,&zv));
389   if (zz != yy) Kokkos::deep_copy(zv,yv);
390   if (A->form_explicit_transpose) {
391     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmat));
392     mode = "N";
393   } else {
394     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
395     csrmat = &aijkok->csrmat;
396     mode = "T";
397   }
398   KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,1.0/*beta*/,zv); /* z = alpha A^T x + beta z */
399   PetscCall(VecRestoreKokkosView(xx,&xv));
400   PetscCall(VecRestoreKokkosView(yy,&yv));
401   PetscCall(VecRestoreKokkosViewWrite(zz,&zv));
402   PetscCall(PetscLogGpuFlops(2.0*csrmat->nnz()));
403   PetscCall(PetscLogGpuTimeEnd());
404   PetscFunctionReturn(0);
405 }
406 
407 /* z = A^H x + y */
408 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz)
409 {
410   Mat_SeqAIJKokkos                 *aijkok;
411   const char                       *mode;
412   ConstPetscScalarKokkosView       xv,yv;
413   PetscScalarKokkosView            zv;
414   KokkosCsrMatrix                  *csrmat;
415 
416   PetscFunctionBegin;
417   PetscCall(PetscLogGpuTimeBegin());
418   PetscCall(MatSeqAIJKokkosSyncDevice(A));
419   PetscCall(VecGetKokkosView(xx,&xv));
420   PetscCall(VecGetKokkosView(yy,&yv));
421   PetscCall(VecGetKokkosViewWrite(zz,&zv));
422   if (zz != yy) Kokkos::deep_copy(zv,yv);
423   if (A->form_explicit_transpose) {
424     PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A,&csrmat));
425     mode = "N";
426   } else {
427     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
428     csrmat = &aijkok->csrmat;
429     mode = "C";
430   }
431   KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,1.0/*beta*/,zv); /* z = alpha A^H x + beta z */
432   PetscCall(VecRestoreKokkosView(xx,&xv));
433   PetscCall(VecRestoreKokkosView(yy,&yv));
434   PetscCall(VecRestoreKokkosViewWrite(zz,&zv));
435   PetscCall(PetscLogGpuFlops(2.0*csrmat->nnz()));
436   PetscCall(PetscLogGpuTimeEnd());
437   PetscFunctionReturn(0);
438 }
439 
440 PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A,MatOption op,PetscBool flg)
441 {
442   Mat_SeqAIJKokkos          *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
443 
444   PetscFunctionBegin;
445   switch (op) {
446     case MAT_FORM_EXPLICIT_TRANSPOSE:
447       /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
448       if (A->form_explicit_transpose && !flg && aijkok) PetscCall(aijkok->DestroyMatTranspose());
449       A->form_explicit_transpose = flg;
450       break;
451     default:
452       PetscCall(MatSetOption_SeqAIJ(A,op,flg));
453       break;
454   }
455   PetscFunctionReturn(0);
456 }
457 
458 /* Depending on reuse, either build a new mat, or use the existing mat */
459 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
460 {
461   Mat_SeqAIJ       *aseq;
462 
463   PetscFunctionBegin;
464   PetscCall(PetscKokkosInitializeCheck());
465   if (reuse == MAT_INITIAL_MATRIX) { /* Build a brand new mat */
466     PetscCall(MatDuplicate(A,MAT_COPY_VALUES,newmat)); /* the returned newmat is a SeqAIJKokkos */
467   } else if (reuse == MAT_REUSE_MATRIX) { /* Reuse the mat created before */
468     PetscCall(MatCopy(A,*newmat,SAME_NONZERO_PATTERN)); /* newmat is already a SeqAIJKokkos */
469   } else if (reuse == MAT_INPLACE_MATRIX) { /* newmat is A */
470     PetscCheck(A == *newmat,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"A != *newmat with MAT_INPLACE_MATRIX");
471     PetscCall(PetscFree(A->defaultvectype));
472     PetscCall(PetscStrallocpy(VECKOKKOS,&A->defaultvectype)); /* Allocate and copy the string */
473     PetscCall(PetscObjectChangeTypeName((PetscObject)A,MATSEQAIJKOKKOS));
474     PetscCall(MatSetOps_SeqAIJKokkos(A));
475     aseq = static_cast<Mat_SeqAIJ*>(A->data);
476     if (A->assembled) { /* Copy i, j (but not values) to device for an assembled matrix if not yet */
477       PetscCheck(!A->spptr,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Expect NULL (Mat_SeqAIJKokkos*)A->spptr");
478       A->spptr = new Mat_SeqAIJKokkos(A->rmap->n,A->cmap->n,aseq->nz,aseq->i,aseq->j,aseq->a,A->nonzerostate,PETSC_FALSE);
479     }
480   }
481   PetscFunctionReturn(0);
482 }
483 
484 /* MatDuplicate always creates a new matrix. MatDuplicate can be called either on an assembled matrix or
485    an unassembled matrix, even though MAT_COPY_VALUES is not allowed for unassembled matrix.
486  */
487 static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A,MatDuplicateOption dupOption,Mat *B)
488 {
489   Mat_SeqAIJ            *bseq;
490   Mat_SeqAIJKokkos      *akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr),*bkok;
491   Mat                   mat;
492 
493   PetscFunctionBegin;
494   /* Do not copy values on host as A's latest values might be on device. We don't want to do sync blindly */
495   PetscCall(MatDuplicate_SeqAIJ(A,MAT_DO_NOT_COPY_VALUES,B));
496   mat  = *B;
497   if (A->assembled) {
498     bseq = static_cast<Mat_SeqAIJ*>(mat->data);
499     bkok = new Mat_SeqAIJKokkos(mat->rmap->n,mat->cmap->n,bseq->nz,bseq->i,bseq->j,bseq->a,mat->nonzerostate,PETSC_FALSE);
500     bkok->a_dual.clear_sync_state(); /* Clear B's sync state as it will be decided below */
501     /* Now copy values to B if needed */
502     if (dupOption == MAT_COPY_VALUES) {
503       if (akok->a_dual.need_sync_device()) {
504         Kokkos::deep_copy(bkok->a_dual.view_host(),akok->a_dual.view_host());
505         bkok->a_dual.modify_host();
506       } else { /* If device has the latest data, we only copy data on device */
507         Kokkos::deep_copy(bkok->a_dual.view_device(),akok->a_dual.view_device());
508         bkok->a_dual.modify_device();
509       }
510     } else { /* MAT_DO_NOT_COPY_VALUES or MAT_SHARE_NONZERO_PATTERN. B's values should be zeroed */
511       /* B's values on host should be already zeroed by MatDuplicate_SeqAIJ() */
512       bkok->a_dual.modify_host();
513     }
514     mat->spptr = bkok;
515   }
516 
517   PetscCall(PetscFree(mat->defaultvectype));
518   PetscCall(PetscStrallocpy(VECKOKKOS,&mat->defaultvectype)); /* Allocate and copy the string */
519   PetscCall(PetscObjectChangeTypeName((PetscObject)mat,MATSEQAIJKOKKOS));
520   PetscCall(MatSetOps_SeqAIJKokkos(mat));
521   PetscFunctionReturn(0);
522 }
523 
524 static PetscErrorCode MatTranspose_SeqAIJKokkos(Mat A,MatReuse reuse,Mat *B)
525 {
526   Mat               At;
527   KokkosCsrMatrix   *internT;
528   Mat_SeqAIJKokkos  *atkok,*bkok;
529 
530   PetscFunctionBegin;
531   if (reuse == MAT_REUSE_MATRIX) PetscCall(MatTransposeCheckNonzeroState_Private(A,*B));
532   PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A,&internT)); /* Generate a transpose internally */
533   if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_INPLACE_MATRIX) {
534     /* Deep copy internT, as we want to isolate the internal transpose */
535     PetscCallCXX(atkok = new Mat_SeqAIJKokkos(KokkosCsrMatrix("csrmat",*internT)));
536     PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PetscObjectComm((PetscObject)A),atkok,&At));
537     if (reuse == MAT_INITIAL_MATRIX) *B = At;
538     else PetscCall(MatHeaderReplace(A,&At)); /* Replace A with At inplace */
539   } else { /* MAT_REUSE_MATRIX, just need to copy values to B on device */
540     if ((*B)->assembled) {
541       bkok = static_cast<Mat_SeqAIJKokkos*>((*B)->spptr);
542       PetscCallCXX(Kokkos::deep_copy(bkok->a_dual.view_device(),internT->values));
543       PetscCall(MatSeqAIJKokkosModifyDevice(*B));
544     } else if ((*B)->preallocated) { /* It is ok for B to be only preallocated, as needed in MatTranspose_MPIAIJ */
545       Mat_SeqAIJ              *bseq = static_cast<Mat_SeqAIJ*>((*B)->data);
546       MatScalarKokkosViewHost a_h(bseq->a,internT->nnz()); /* bseq->nz = 0 if unassembled */
547       MatColIdxKokkosViewHost j_h(bseq->j,internT->nnz());
548       PetscCallCXX(Kokkos::deep_copy(a_h,internT->values));
549       PetscCallCXX(Kokkos::deep_copy(j_h,internT->graph.entries));
550     } else SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"B must be assembled or preallocated");
551   }
552   PetscFunctionReturn(0);
553 }
554 
555 static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A)
556 {
557   Mat_SeqAIJKokkos           *aijkok;
558 
559   PetscFunctionBegin;
560   if (A->factortype == MAT_FACTOR_NONE) {
561     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
562     delete aijkok;
563   } else {
564     delete static_cast<Mat_SeqAIJKokkosTriFactors*>(A->spptr);
565   }
566   A->spptr = NULL;
567   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL));
568   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL));
569   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL));
570   PetscCall(MatDestroy_SeqAIJ(A));
571   PetscFunctionReturn(0);
572 }
573 
574 /*MC
575    MATSEQAIJKOKKOS - MATAIJKOKKOS = "(seq)aijkokkos" - A matrix type to be used for sparse matrices with Kokkos
576 
577    A matrix type type using Kokkos-Kernels CrsMatrix type for portability across different device types
578 
579    Options Database Keys:
580 .  -mat_type aijkokkos - sets the matrix type to "aijkokkos" during a call to MatSetFromOptions()
581 
582   Level: beginner
583 
584 .seealso: `MatCreateSeqAIJKokkos()`, `MATMPIAIJKOKKOS`
585 M*/
586 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A)
587 {
588   PetscFunctionBegin;
589   PetscCall(PetscKokkosInitializeCheck());
590   PetscCall(MatCreate_SeqAIJ(A));
591   PetscCall(MatConvert_SeqAIJ_SeqAIJKokkos(A,MATSEQAIJKOKKOS,MAT_INPLACE_MATRIX,&A));
592   PetscFunctionReturn(0);
593 }
594 
595 /* Merge A, B into a matrix C. A is put before B. C's size would be A->rmap->n by (A->cmap->n + B->cmap->n) */
596 PetscErrorCode MatSeqAIJKokkosMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C)
597 {
598   Mat_SeqAIJ                   *a,*b;
599   Mat_SeqAIJKokkos             *akok,*bkok,*ckok;
600   MatScalarKokkosView          aa,ba,ca;
601   MatRowMapKokkosView          ai,bi,ci;
602   MatColIdxKokkosView          aj,bj,cj;
603   PetscInt                     m,n,nnz,aN;
604 
605   PetscFunctionBegin;
606   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
607   PetscValidHeaderSpecific(B,MAT_CLASSID,2);
608   PetscValidPointer(C,4);
609   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
610   PetscCheckTypeName(B,MATSEQAIJKOKKOS);
611   PetscCheck(A->rmap->n == B->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT,A->rmap->n,B->rmap->n);
612   PetscCheck(reuse != MAT_INPLACE_MATRIX,PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported");
613 
614   PetscCall(MatSeqAIJKokkosSyncDevice(A));
615   PetscCall(MatSeqAIJKokkosSyncDevice(B));
616   a    = static_cast<Mat_SeqAIJ*>(A->data);
617   b    = static_cast<Mat_SeqAIJ*>(B->data);
618   akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
619   bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
620   aa   = akok->a_dual.view_device();
621   ai   = akok->i_dual.view_device();
622   ba   = bkok->a_dual.view_device();
623   bi   = bkok->i_dual.view_device();
624   m    = A->rmap->n; /* M, N and nnz of C */
625   n    = A->cmap->n + B->cmap->n;
626   nnz  = a->nz + b->nz;
627   aN   = A->cmap->n; /* N of A */
628   if (reuse == MAT_INITIAL_MATRIX) {
629     aj = akok->j_dual.view_device();
630     bj = bkok->j_dual.view_device();
631     auto ca_dual = MatScalarKokkosDualView("a",aa.extent(0)+ba.extent(0));
632     auto ci_dual = MatRowMapKokkosDualView("i",ai.extent(0));
633     auto cj_dual = MatColIdxKokkosDualView("j",aj.extent(0)+bj.extent(0));
634     ca = ca_dual.view_device();
635     ci = ci_dual.view_device();
636     cj = cj_dual.view_device();
637 
638     /* Concatenate A and B in parallel using Kokkos hierarchical parallelism */
639     Kokkos::parallel_for(Kokkos::TeamPolicy<>(m, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) {
640       PetscInt i = t.league_rank(); /* row i */
641       PetscInt coffset = ai(i) + bi(i), alen = ai(i+1)-ai(i), blen = bi(i+1)-bi(i);
642 
643       Kokkos::single(Kokkos::PerTeam(t), [=]() { /* this side effect only happens once per whole team */
644         ci(i) = coffset;
645         if (i == m-1) ci(m) = ai(m) + bi(m);
646       });
647 
648       Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen+blen), [&](PetscInt k) {
649         if (k < alen) {
650           ca(coffset+k) = aa(ai(i)+k);
651           cj(coffset+k) = aj(ai(i)+k);
652         } else {
653           ca(coffset+k) = ba(bi(i)+k-alen);
654           cj(coffset+k) = bj(bi(i)+k-alen) + aN; /* Entries in B get new column indices in C */
655         }
656       });
657     });
658     ca_dual.modify_device();
659     ci_dual.modify_device();
660     cj_dual.modify_device();
661     PetscCallCXX(ckok = new Mat_SeqAIJKokkos(m,n,nnz,ci_dual,cj_dual,ca_dual));
662     PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,ckok,C));
663   } else if (reuse == MAT_REUSE_MATRIX) {
664     PetscValidHeaderSpecific(*C,MAT_CLASSID,4);
665     PetscCheckTypeName(*C,MATSEQAIJKOKKOS);
666     ckok = static_cast<Mat_SeqAIJKokkos*>((*C)->spptr);
667     ca   = ckok->a_dual.view_device();
668     ci   = ckok->i_dual.view_device();
669 
670     Kokkos::parallel_for(Kokkos::TeamPolicy<>(m, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) {
671       PetscInt i = t.league_rank(); /* row i */
672       PetscInt alen = ai(i+1)-ai(i), blen = bi(i+1)-bi(i);
673       Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen+blen), [&](PetscInt k) {
674         if (k < alen) ca(ci(i)+k) = aa(ai(i)+k);
675         else          ca(ci(i)+k) = ba(bi(i)+k-alen);
676       });
677     });
678     PetscCall(MatSeqAIJKokkosModifyDevice(*C));
679   }
680   PetscFunctionReturn(0);
681 }
682 
683 static PetscErrorCode MatProductDataDestroy_SeqAIJKokkos(void* pdata)
684 {
685   PetscFunctionBegin;
686   delete static_cast<MatProductData_SeqAIJKokkos*>(pdata);
687   PetscFunctionReturn(0);
688 }
689 
690 static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C)
691 {
692   Mat_Product                    *product = C->product;
693   Mat                            A,B;
694   bool                           transA,transB; /* use bool, since KK needs this type */
695   Mat_SeqAIJKokkos               *akok,*bkok,*ckok;
696   Mat_SeqAIJ                     *c;
697   MatProductData_SeqAIJKokkos    *pdata;
698   KokkosCsrMatrix                *csrmatA,*csrmatB;
699 
700   PetscFunctionBegin;
701   MatCheckProduct(C,1);
702   PetscCheck(C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty");
703   pdata = static_cast<MatProductData_SeqAIJKokkos*>(C->product->data);
704 
705   if (pdata->reusesym) { /* We reached here through e.g., MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C), where symbolic/numeric are combined */
706     pdata->reusesym = PETSC_FALSE; /* So that next time when user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C), we still do numeric  */
707     PetscFunctionReturn(0);
708   }
709 
710   switch (product->type) {
711     case MATPRODUCT_AB:  transA = false; transB = false; break;
712     case MATPRODUCT_AtB: transA = true;  transB = false; break;
713     case MATPRODUCT_ABt: transA = false; transB = true;  break;
714     default:
715       SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
716   }
717 
718   A     = product->A;
719   B     = product->B;
720   PetscCall(MatSeqAIJKokkosSyncDevice(A));
721   PetscCall(MatSeqAIJKokkosSyncDevice(B));
722   akok  = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
723   bkok  = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
724   ckok  = static_cast<Mat_SeqAIJKokkos*>(C->spptr);
725 
726   PetscCheck(ckok,PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Device data structure spptr is empty");
727 
728   csrmatA = &akok->csrmat;
729   csrmatB = &bkok->csrmat;
730 
731   /* TODO: Once KK spgemm implements transpose, we can get rid of the explicit transpose here */
732   if (transA) {
733     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmatA));
734     transA = false;
735   }
736 
737   if (transB) {
738     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B,&csrmatB));
739     transB = false;
740   }
741   PetscCall(PetscLogGpuTimeBegin());
742   PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh,*csrmatA,transA,*csrmatB,transB,ckok->csrmat));
743   PetscCallCXX(sort_crs_matrix(ckok->csrmat)); /* without the sort, mat_tests-ex62_14_seqaijkokkos failed */
744   PetscCall(PetscLogGpuTimeEnd());
745   PetscCall(MatSeqAIJKokkosModifyDevice(C));
746   /* shorter version of MatAssemblyEnd_SeqAIJ */
747   c = (Mat_SeqAIJ*)C->data;
748   PetscCall(PetscInfo(C,"Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n",C->rmap->n,C->cmap->n,c->nz));
749   PetscCall(PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n"));
750   PetscCall(PetscInfo(C,"Maximum nonzeros in any row is %" PetscInt_FMT "\n",c->rmax));
751   c->reallocs         = 0;
752   C->info.mallocs     = 0;
753   C->info.nz_unneeded = 0;
754   C->assembled        = C->was_assembled = PETSC_TRUE;
755   C->num_ass++;
756   PetscFunctionReturn(0);
757 }
758 
759 static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C)
760 {
761   Mat_Product                    *product = C->product;
762   MatProductType                 ptype;
763   Mat                            A,B;
764   bool                           transA,transB;
765   Mat_SeqAIJKokkos               *akok,*bkok,*ckok;
766   MatProductData_SeqAIJKokkos    *pdata;
767   MPI_Comm                       comm;
768   KokkosCsrMatrix                *csrmatA,*csrmatB,csrmatC;
769 
770   PetscFunctionBegin;
771   MatCheckProduct(C,1);
772   PetscCall(PetscObjectGetComm((PetscObject)C,&comm));
773   PetscCheck(!product->data,comm,PETSC_ERR_PLIB,"Product data not empty");
774   A       = product->A;
775   B       = product->B;
776   PetscCall(MatSeqAIJKokkosSyncDevice(A));
777   PetscCall(MatSeqAIJKokkosSyncDevice(B));
778   akok    = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
779   bkok    = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
780   csrmatA = &akok->csrmat;
781   csrmatB = &bkok->csrmat;
782 
783   ptype   = product->type;
784   switch (ptype) {
785     case MATPRODUCT_AB:  transA = false; transB = false; break;
786     case MATPRODUCT_AtB: transA = true;  transB = false; break;
787     case MATPRODUCT_ABt: transA = false; transB = true;  break;
788     default:
789       SETERRQ(comm,PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
790   }
791 
792   product->data = pdata = new MatProductData_SeqAIJKokkos();
793   pdata->kh.set_team_work_size(16);
794   pdata->kh.set_dynamic_scheduling(true);
795   pdata->reusesym = product->api_user;
796 
797   /* TODO: add command line options to select spgemm algorithms */
798   auto spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_KK;
799   /* CUDA-10.2's spgemm has bugs. As as of 2022-01-19, KK does not support CUDA-11.x's newer spgemm API.
800      We can default to SPGEMMAlgorithm::SPGEMM_CUSPARSE with CUDA-11+ when KK adds the support.
801    */
802   pdata->kh.create_spgemm_handle(spgemm_alg);
803 
804   PetscCall(PetscLogGpuTimeBegin());
805   /* TODO: Get rid of the explicit transpose once KK-spgemm implements the transpose option */
806   if (transA) {
807     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmatA));
808     transA = false;
809   }
810 
811   if (transB) {
812     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B,&csrmatB));
813     transB = false;
814   }
815 
816   PetscCallCXX(KokkosSparse::spgemm_symbolic(pdata->kh,*csrmatA,transA,*csrmatB,transB,csrmatC));
817   /* spgemm_symbolic() only populates C's rowmap, but not C's column indices.
818     So we have to do a fake spgemm_numeric() here to get csrmatC.j_d setup, before
819     calling new Mat_SeqAIJKokkos().
820     TODO: Remove the fake spgemm_numeric() after KK fixed this problem.
821   */
822   PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh,*csrmatA,transA,*csrmatB,transB,csrmatC));
823   PetscCallCXX(sort_crs_matrix(csrmatC));
824   PetscCall(PetscLogGpuTimeEnd());
825 
826   PetscCallCXX(ckok = new Mat_SeqAIJKokkos(csrmatC));
827   PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(C,ckok));
828   C->product->destroy = MatProductDataDestroy_SeqAIJKokkos;
829   PetscFunctionReturn(0);
830 }
831 
832 /* handles sparse matrix matrix ops */
833 static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat)
834 {
835   Mat_Product    *product = mat->product;
836   PetscBool      Biskok = PETSC_FALSE,Ciskok = PETSC_TRUE;
837 
838   PetscFunctionBegin;
839   MatCheckProduct(mat,1);
840   PetscCall(PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJKOKKOS,&Biskok));
841   if (product->type == MATPRODUCT_ABC) {
842     PetscCall(PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJKOKKOS,&Ciskok));
843   }
844   if (Biskok && Ciskok) {
845     switch (product->type) {
846     case MATPRODUCT_AB:
847     case MATPRODUCT_AtB:
848     case MATPRODUCT_ABt:
849       mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos;
850       break;
851     case MATPRODUCT_PtAP:
852     case MATPRODUCT_RARt:
853     case MATPRODUCT_ABC:
854       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
855       break;
856     default:
857       break;
858     }
859   } else { /* fallback for AIJ */
860     PetscCall(MatProductSetFromOptions_SeqAIJ(mat));
861   }
862   PetscFunctionReturn(0);
863 }
864 
865 static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a)
866 {
867   Mat_SeqAIJKokkos *aijkok;
868 
869   PetscFunctionBegin;
870   PetscCall(PetscLogGpuTimeBegin());
871   PetscCall(MatSeqAIJKokkosSyncDevice(A));
872   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
873   KokkosBlas::scal(aijkok->a_dual.view_device(),a,aijkok->a_dual.view_device());
874   PetscCall(MatSeqAIJKokkosModifyDevice(A));
875   PetscCall(PetscLogGpuFlops(aijkok->a_dual.extent(0)));
876   PetscCall(PetscLogGpuTimeEnd());
877   PetscFunctionReturn(0);
878 }
879 
880 static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A)
881 {
882   Mat_SeqAIJKokkos *aijkok;
883 
884   PetscFunctionBegin;
885   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
886   if (aijkok) { /* Only zero the device if data is already there */
887     KokkosBlas::fill(aijkok->a_dual.view_device(),0.0);
888     PetscCall(MatSeqAIJKokkosModifyDevice(A));
889   } else { /* Might be preallocated but not assembled */
890     PetscCall(MatZeroEntries_SeqAIJ(A));
891   }
892   PetscFunctionReturn(0);
893 }
894 
895 static PetscErrorCode MatGetDiagonal_SeqAIJKokkos(Mat A,Vec x)
896 {
897   Mat_SeqAIJ                   *aijseq;
898   Mat_SeqAIJKokkos             *aijkok;
899   PetscInt                     n;
900   PetscScalarKokkosView        xv;
901 
902   PetscFunctionBegin;
903   PetscCall(VecGetLocalSize(x,&n));
904   PetscCheck(n == A->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
905   PetscCheck(A->factortype == MAT_FACTOR_NONE,PETSC_COMM_SELF,PETSC_ERR_SUP,"MatGetDiagonal_SeqAIJKokkos not supported on factored matrices");
906 
907   PetscCall(MatSeqAIJKokkosSyncDevice(A));
908   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
909 
910   if (A->rmap->n && aijkok->diag_dual.extent(0) == 0) { /* Set the diagonal pointer if not already */
911     PetscCall(MatMarkDiagonal_SeqAIJ(A));
912     aijseq = static_cast<Mat_SeqAIJ*>(A->data);
913     aijkok->SetDiagonal(aijseq->diag);
914   }
915 
916   const auto& Aa = aijkok->a_dual.view_device();
917   const auto& Ai = aijkok->i_dual.view_device();
918   const auto& Adiag = aijkok->diag_dual.view_device();
919 
920   PetscCall(VecGetKokkosViewWrite(x,&xv));
921   Kokkos::parallel_for(n,KOKKOS_LAMBDA(const PetscInt i) {
922     if (Adiag(i) < Ai(i+1)) xv(i) = Aa(Adiag(i));
923     else xv(i) = 0;
924   });
925   PetscCall(VecRestoreKokkosViewWrite(x,&xv));
926   PetscFunctionReturn(0);
927 }
928 
929 /* Get a Kokkos View from a mat of type MatSeqAIJKokkos */
930 PetscErrorCode MatSeqAIJGetKokkosView(Mat A,ConstMatScalarKokkosView* kv)
931 {
932   Mat_SeqAIJKokkos   *aijkok;
933 
934   PetscFunctionBegin;
935   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
936   PetscValidPointer(kv,2);
937   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
938   PetscCall(MatSeqAIJKokkosSyncDevice(A));
939   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
940   *kv    = aijkok->a_dual.view_device();
941   PetscFunctionReturn(0);
942 }
943 
944 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A,ConstMatScalarKokkosView* kv)
945 {
946   PetscFunctionBegin;
947   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
948   PetscValidPointer(kv,2);
949   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
950   PetscFunctionReturn(0);
951 }
952 
953 PetscErrorCode MatSeqAIJGetKokkosView(Mat A,MatScalarKokkosView* kv)
954 {
955   Mat_SeqAIJKokkos   *aijkok;
956 
957   PetscFunctionBegin;
958   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
959   PetscValidPointer(kv,2);
960   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
961   PetscCall(MatSeqAIJKokkosSyncDevice(A));
962   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
963   *kv    = aijkok->a_dual.view_device();
964   PetscFunctionReturn(0);
965 }
966 
967 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A,MatScalarKokkosView* kv)
968 {
969   PetscFunctionBegin;
970   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
971   PetscValidPointer(kv,2);
972   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
973   PetscCall(MatSeqAIJKokkosModifyDevice(A));
974   PetscFunctionReturn(0);
975 }
976 
977 PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A,MatScalarKokkosView* kv)
978 {
979   Mat_SeqAIJKokkos   *aijkok;
980 
981   PetscFunctionBegin;
982   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
983   PetscValidPointer(kv,2);
984   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
985   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
986   *kv    = aijkok->a_dual.view_device();
987   PetscFunctionReturn(0);
988 }
989 
990 PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A,MatScalarKokkosView* kv)
991 {
992   PetscFunctionBegin;
993   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
994   PetscValidPointer(kv,2);
995   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
996   PetscCall(MatSeqAIJKokkosModifyDevice(A));
997   PetscFunctionReturn(0);
998 }
999 
1000 /* Computes Y += alpha X */
1001 static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y,PetscScalar alpha,Mat X,MatStructure pattern)
1002 {
1003   Mat_SeqAIJ                 *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data;
1004   Mat_SeqAIJKokkos           *xkok,*ykok,*zkok;
1005   ConstMatScalarKokkosView   Xa;
1006   MatScalarKokkosView        Ya;
1007 
1008   PetscFunctionBegin;
1009   PetscCheckTypeName(Y,MATSEQAIJKOKKOS);
1010   PetscCheckTypeName(X,MATSEQAIJKOKKOS);
1011   PetscCall(MatSeqAIJKokkosSyncDevice(Y));
1012   PetscCall(MatSeqAIJKokkosSyncDevice(X));
1013   PetscCall(PetscLogGpuTimeBegin());
1014 
1015   if (pattern != SAME_NONZERO_PATTERN && x->nz == y->nz) {
1016     /* We could compare on device, but have to get the comparison result on host. So compare on host instead. */
1017     PetscBool e;
1018     PetscCall(PetscArraycmp(x->i,y->i,Y->rmap->n+1,&e));
1019     if (e) {
1020       PetscCall(PetscArraycmp(x->j,y->j,y->nz,&e));
1021       if (e) pattern = SAME_NONZERO_PATTERN;
1022     }
1023   }
1024 
1025   /* cusparseDcsrgeam2() computes C = alpha A + beta B. If one knew sparsity pattern of C, one can skip
1026     cusparseScsrgeam2_bufferSizeExt() / cusparseXcsrgeam2Nnz(), and directly call cusparseScsrgeam2().
1027     If X is SUBSET_NONZERO_PATTERN of Y, we could take advantage of this cusparse feature. However,
1028     KokkosSparse::spadd(alpha,A,beta,B,C) has symbolic and numeric phases, MatAXPY does not.
1029   */
1030   ykok = static_cast<Mat_SeqAIJKokkos*>(Y->spptr);
1031   xkok = static_cast<Mat_SeqAIJKokkos*>(X->spptr);
1032   Xa   = xkok->a_dual.view_device();
1033   Ya   = ykok->a_dual.view_device();
1034 
1035   if (pattern == SAME_NONZERO_PATTERN) {
1036     KokkosBlas::axpy(alpha,Xa,Ya);
1037     PetscCall(MatSeqAIJKokkosModifyDevice(Y));
1038   } else if (pattern == SUBSET_NONZERO_PATTERN) {
1039     MatRowMapKokkosView  Xi = xkok->i_dual.view_device(),Yi = ykok->i_dual.view_device();
1040     MatColIdxKokkosView  Xj = xkok->j_dual.view_device(),Yj = ykok->j_dual.view_device();
1041 
1042     Kokkos::parallel_for(Kokkos::TeamPolicy<>(Y->rmap->n, 1),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) {
1043       PetscInt i = t.league_rank(); /* row i */
1044       Kokkos::single(Kokkos::PerTeam(t), [=] () { /* Only one thread works in a team */
1045         PetscInt p,q = Yi(i);
1046         for (p=Xi(i); p<Xi(i+1); p++) { /* For each nonzero on row i of X */
1047           while (Xj(p) != Yj(q) && q < Yi(i+1)) q++; /* find the matching nonzero on row i of Y */
1048           if (Xj(p) == Yj(q)) { /* Found it */
1049             Ya(q) += alpha * Xa(p);
1050             q++;
1051           } else {
1052             /* If not found, it indicates the input is wrong (X is not a SUBSET_NONZERO_PATTERN of Y).
1053                Just insert a NaN at the beginning of row i if it is not empty, to make the result wrong.
1054             */
1055             if (Yi(i) != Yi(i+1)) Ya(Yi(i)) =
1056            #if PETSC_PKG_KOKKOS_VERSION_GE(3,6,99)
1057               Kokkos::nan("1"); /* auto promote the double NaN if needed */
1058            #else
1059               Kokkos::Experimental::nan("1");
1060            #endif
1061           }
1062         }
1063       });
1064     });
1065     PetscCall(MatSeqAIJKokkosModifyDevice(Y));
1066   } else { /* different nonzero patterns */
1067     Mat             Z;
1068     KokkosCsrMatrix zcsr;
1069     KernelHandle    kh;
1070     kh.create_spadd_handle(false);
1071     KokkosSparse::spadd_symbolic(&kh,xkok->csrmat,ykok->csrmat,zcsr);
1072     KokkosSparse::spadd_numeric(&kh,alpha,xkok->csrmat,(PetscScalar)1.0,ykok->csrmat,zcsr);
1073     zkok = new Mat_SeqAIJKokkos(zcsr);
1074     PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,zkok,&Z));
1075     PetscCall(MatHeaderReplace(Y,&Z));
1076     kh.destroy_spadd_handle();
1077   }
1078   PetscCall(PetscLogGpuTimeEnd());
1079   PetscCall(PetscLogGpuFlops(xkok->a_dual.extent(0)*2)); /* Because we scaled X and then added it to Y */
1080   PetscFunctionReturn(0);
1081 }
1082 
1083 static PetscErrorCode MatSetPreallocationCOO_SeqAIJKokkos(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
1084 {
1085   Mat_SeqAIJKokkos *akok;
1086   Mat_SeqAIJ       *aseq;
1087 
1088   PetscFunctionBegin;
1089   PetscCall(MatSetPreallocationCOO_SeqAIJ(mat,coo_n,coo_i,coo_j));
1090   aseq = static_cast<Mat_SeqAIJ*>(mat->data);
1091   akok = static_cast<Mat_SeqAIJKokkos*>(mat->spptr);
1092   delete akok;
1093   mat->spptr = akok = new Mat_SeqAIJKokkos(mat->rmap->n,mat->cmap->n,aseq->nz,aseq->i,aseq->j,aseq->a,mat->nonzerostate+1,PETSC_FALSE);
1094   PetscCall(MatZeroEntries_SeqAIJKokkos(mat));
1095   akok->SetUpCOO(aseq);
1096   PetscFunctionReturn(0);
1097 }
1098 
1099 static PetscErrorCode MatSetValuesCOO_SeqAIJKokkos(Mat A,const PetscScalar v[],InsertMode imode)
1100 {
1101   Mat_SeqAIJ                  *aseq = static_cast<Mat_SeqAIJ*>(A->data);
1102   Mat_SeqAIJKokkos            *akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
1103   PetscCount                  Annz = aseq->nz;
1104   const PetscCountKokkosView& jmap = akok->jmap_d;
1105   const PetscCountKokkosView& perm = akok->perm_d;
1106   MatScalarKokkosView         Aa;
1107   ConstMatScalarKokkosView    kv;
1108   PetscMemType                memtype;
1109 
1110   PetscFunctionBegin;
1111   PetscCall(PetscGetMemType(v,&memtype));
1112   if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */
1113     kv = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(),ConstMatScalarKokkosViewHost(v,aseq->coo_n));
1114   } else {
1115     kv = ConstMatScalarKokkosView(v,aseq->coo_n); /* Directly use v[]'s memory */
1116   }
1117 
1118   if (imode == INSERT_VALUES) PetscCall(MatSeqAIJGetKokkosViewWrite(A,&Aa)); /* write matrix values */
1119   else PetscCall(MatSeqAIJGetKokkosView(A,&Aa)); /* read & write matrix values */
1120 
1121   Kokkos::parallel_for(Annz,KOKKOS_LAMBDA(const PetscCount i) {
1122     PetscScalar sum = 0.0;
1123     for (PetscCount k=jmap(i); k<jmap(i+1); k++) sum += kv(perm(k));
1124     Aa(i) = (imode == INSERT_VALUES? 0.0 : Aa(i)) + sum;
1125   });
1126 
1127   if (imode == INSERT_VALUES) PetscCall(MatSeqAIJRestoreKokkosViewWrite(A,&Aa));
1128   else PetscCall(MatSeqAIJRestoreKokkosView(A,&Aa));
1129   PetscFunctionReturn(0);
1130 }
1131 
1132 PETSC_INTERN PetscErrorCode MatSeqAIJMoveDiagonalValuesFront_SeqAIJKokkos(Mat A,const PetscInt *diag)
1133 {
1134   Mat_SeqAIJKokkos            *akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
1135   MatScalarKokkosView         Aa;
1136   const MatRowMapKokkosView&  Ai = akok->i_dual.view_device();
1137   PetscInt                    m = A->rmap->n;
1138   ConstMatRowMapKokkosView    Adiag(diag,m); /* diag is a device pointer */
1139 
1140   PetscFunctionBegin;
1141   PetscCall(MatSeqAIJGetKokkosViewWrite(A,&Aa));
1142   Kokkos::parallel_for(m,KOKKOS_LAMBDA(const PetscInt i) {
1143     PetscScalar tmp;
1144     if (Adiag(i) >= Ai(i) && Adiag(i) < Ai(i+1)) { /* The diagonal element exists */
1145       tmp          = Aa(Ai(i));
1146       Aa(Ai(i))    = Aa(Adiag(i));
1147       Aa(Adiag(i)) = tmp;
1148     }
1149   });
1150   PetscCall(MatSeqAIJRestoreKokkosViewWrite(A,&Aa));
1151   PetscFunctionReturn(0);
1152 }
1153 
1154 static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B,Mat A,const MatFactorInfo *info)
1155 {
1156   PetscFunctionBegin;
1157   PetscCall(MatSeqAIJKokkosSyncHost(A));
1158   PetscCall(MatLUFactorNumeric_SeqAIJ(B,A,info));
1159   B->offloadmask = PETSC_OFFLOAD_CPU;
1160   PetscFunctionReturn(0);
1161 }
1162 
1163 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A)
1164 {
1165   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
1166 
1167   PetscFunctionBegin;
1168   A->offloadmask = PETSC_OFFLOAD_KOKKOS; /* We do not really use this flag */
1169   A->boundtocpu  = PETSC_FALSE;
1170 
1171   A->ops->assemblyend               = MatAssemblyEnd_SeqAIJKokkos;
1172   A->ops->destroy                   = MatDestroy_SeqAIJKokkos;
1173   A->ops->duplicate                 = MatDuplicate_SeqAIJKokkos;
1174   A->ops->axpy                      = MatAXPY_SeqAIJKokkos;
1175   A->ops->scale                     = MatScale_SeqAIJKokkos;
1176   A->ops->zeroentries               = MatZeroEntries_SeqAIJKokkos;
1177   A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJKokkos;
1178   A->ops->mult                      = MatMult_SeqAIJKokkos;
1179   A->ops->multadd                   = MatMultAdd_SeqAIJKokkos;
1180   A->ops->multtranspose             = MatMultTranspose_SeqAIJKokkos;
1181   A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJKokkos;
1182   A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJKokkos;
1183   A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos;
1184   A->ops->productnumeric            = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos;
1185   A->ops->transpose                 = MatTranspose_SeqAIJKokkos;
1186   A->ops->setoption                 = MatSetOption_SeqAIJKokkos;
1187   A->ops->getdiagonal               = MatGetDiagonal_SeqAIJKokkos;
1188   a->ops->getarray                  = MatSeqAIJGetArray_SeqAIJKokkos;
1189   a->ops->restorearray              = MatSeqAIJRestoreArray_SeqAIJKokkos;
1190   a->ops->getarrayread              = MatSeqAIJGetArrayRead_SeqAIJKokkos;
1191   a->ops->restorearrayread          = MatSeqAIJRestoreArrayRead_SeqAIJKokkos;
1192   a->ops->getarraywrite             = MatSeqAIJGetArrayWrite_SeqAIJKokkos;
1193   a->ops->restorearraywrite         = MatSeqAIJRestoreArrayWrite_SeqAIJKokkos;
1194   a->ops->getcsrandmemtype          = MatSeqAIJGetCSRAndMemType_SeqAIJKokkos;
1195 
1196   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJKokkos));
1197   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_SeqAIJKokkos));
1198   PetscFunctionReturn(0);
1199 }
1200 
1201 PETSC_INTERN PetscErrorCode  MatSetSeqAIJKokkosWithCSRMatrix(Mat A,Mat_SeqAIJKokkos *akok)
1202 {
1203   Mat_SeqAIJ *aseq;
1204   PetscInt    i,m,n;
1205 
1206   PetscFunctionBegin;
1207   PetscCheck(!A->spptr,PETSC_COMM_SELF,PETSC_ERR_PLIB,"A->spptr is supposed to be empty");
1208 
1209   m = akok->nrows();
1210   n = akok->ncols();
1211   PetscCall(MatSetSizes(A,m,n,m,n));
1212   PetscCall(MatSetType(A,MATSEQAIJKOKKOS));
1213 
1214   /* Set up data structures of A as a MATSEQAIJ */
1215   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(A,MAT_SKIP_ALLOCATION,NULL));
1216   aseq = (Mat_SeqAIJ*)(A)->data;
1217 
1218   akok->i_dual.sync_host(); /* We always need sync'ed i, j on host */
1219   akok->j_dual.sync_host();
1220 
1221   aseq->i            = akok->i_host_data();
1222   aseq->j            = akok->j_host_data();
1223   aseq->a            = akok->a_host_data();
1224   aseq->nonew        = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/
1225   aseq->singlemalloc = PETSC_FALSE;
1226   aseq->free_a       = PETSC_FALSE;
1227   aseq->free_ij      = PETSC_FALSE;
1228   aseq->nz           = akok->nnz();
1229   aseq->maxnz        = aseq->nz;
1230 
1231   PetscCall(PetscMalloc1(m,&aseq->imax));
1232   PetscCall(PetscMalloc1(m,&aseq->ilen));
1233   for (i=0; i<m; i++) {
1234     aseq->ilen[i] = aseq->imax[i] = aseq->i[i+1] - aseq->i[i];
1235   }
1236 
1237   /* It is critical to set the nonzerostate, as we use it to check if sparsity pattern (hence data) has changed on host in MatAssemblyEnd */
1238   akok->nonzerostate = A->nonzerostate;
1239   A->spptr = akok; /* Set A->spptr before MatAssembly so that A->spptr won't be allocated again there */
1240   PetscCall(MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY));
1241   PetscCall(MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY));
1242   PetscFunctionReturn(0);
1243 }
1244 
1245 /* Crete a SEQAIJKOKKOS matrix with a Mat_SeqAIJKokkos data structure
1246 
1247    Note we have names like MatSeqAIJSetPreallocationCSR, so I use capitalized CSR
1248  */
1249 PETSC_INTERN PetscErrorCode  MatCreateSeqAIJKokkosWithCSRMatrix(MPI_Comm comm,Mat_SeqAIJKokkos *akok,Mat *A)
1250 {
1251   PetscFunctionBegin;
1252   PetscCall(MatCreate(comm,A));
1253   PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A,akok));
1254   PetscFunctionReturn(0);
1255 }
1256 
1257 /* --------------------------------------------------------------------------------*/
1258 /*@C
1259    MatCreateSeqAIJKokkos - Creates a sparse matrix in AIJ (compressed row) format
1260    (the default parallel PETSc format). This matrix will ultimately be handled by
1261    Kokkos for calculations. For good matrix
1262    assembly performance the user should preallocate the matrix storage by setting
1263    the parameter nz (or the array nnz).  By setting these parameters accurately,
1264    performance during matrix assembly can be increased by more than a factor of 50.
1265 
1266    Collective
1267 
1268    Input Parameters:
1269 +  comm - MPI communicator, set to PETSC_COMM_SELF
1270 .  m - number of rows
1271 .  n - number of columns
1272 .  nz - number of nonzeros per row (same for all rows)
1273 -  nnz - array containing the number of nonzeros in the various rows
1274          (possibly different for each row) or NULL
1275 
1276    Output Parameter:
1277 .  A - the matrix
1278 
1279    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
1280    MatXXXXSetPreallocation() paradgm instead of this routine directly.
1281    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
1282 
1283    Notes:
1284    If nnz is given then nz is ignored
1285 
1286    The AIJ format (also called the Yale sparse matrix format or
1287    compressed row storage), is fully compatible with standard Fortran 77
1288    storage.  That is, the stored row and column indices can begin at
1289    either one (as in Fortran) or zero.  See the users' manual for details.
1290 
1291    Specify the preallocated storage with either nz or nnz (not both).
1292    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
1293    allocation.  For large problems you MUST preallocate memory or you
1294    will get TERRIBLE performance, see the users' manual chapter on matrices.
1295 
1296    By default, this format uses inodes (identical nodes) when possible, to
1297    improve numerical efficiency of matrix-vector products and solves. We
1298    search for consecutive rows with the same nonzero structure, thereby
1299    reusing matrix information to achieve increased efficiency.
1300 
1301    Level: intermediate
1302 
1303 .seealso: `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MatCreateAIJ()`
1304 @*/
1305 PetscErrorCode  MatCreateSeqAIJKokkos(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
1306 {
1307   PetscFunctionBegin;
1308   PetscCall(PetscKokkosInitializeCheck());
1309   PetscCall(MatCreate(comm,A));
1310   PetscCall(MatSetSizes(*A,m,n,m,n));
1311   PetscCall(MatSetType(*A,MATSEQAIJKOKKOS));
1312   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz));
1313   PetscFunctionReturn(0);
1314 }
1315 
1316 typedef Kokkos::TeamPolicy<>::member_type team_member;
1317 //
1318 // This factorization exploits block diagonal matrices with "Nf" (not used).
1319 // Use -pc_factor_mat_ordering_type rcm to order decouple blocks of size N/Nf for this optimization
1320 //
1321 static PetscErrorCode MatLUFactorNumeric_SeqAIJKOKKOSDEVICE(Mat B,Mat A,const MatFactorInfo *info)
1322 {
1323   Mat_SeqAIJ         *b=(Mat_SeqAIJ*)B->data;
1324   Mat_SeqAIJKokkos   *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr), *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
1325   IS                 isrow = b->row,isicol = b->icol;
1326   const PetscInt     *r_h,*ic_h;
1327   const PetscInt     n=A->rmap->n, *ai_d=aijkok->i_dual.view_device().data(), *aj_d=aijkok->j_dual.view_device().data(), *bi_d=baijkok->i_dual.view_device().data(), *bj_d=baijkok->j_dual.view_device().data(), *bdiag_d = baijkok->diag_d.data();
1328   const PetscScalar  *aa_d = aijkok->a_dual.view_device().data();
1329   PetscScalar        *ba_d = baijkok->a_dual.view_device().data();
1330   PetscBool          row_identity,col_identity;
1331   PetscInt           nc, Nf=1, nVec=32; // should be a parameter, Nf is batch size - not used
1332 
1333   PetscFunctionBegin;
1334   PetscCheck(A->rmap->n == n,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"square matrices only supported %" PetscInt_FMT " %" PetscInt_FMT,A->rmap->n,n);
1335   PetscCall(MatIsStructurallySymmetric(A,&row_identity));
1336   PetscCheck(row_identity,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"structurally symmetric matrices only supported");
1337   PetscCall(ISGetIndices(isrow,&r_h));
1338   PetscCall(ISGetIndices(isicol,&ic_h));
1339   PetscCall(ISGetSize(isicol,&nc));
1340   PetscCall(PetscLogGpuTimeBegin());
1341   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1342   {
1343 #define KOKKOS_SHARED_LEVEL 1
1344     using scr_mem_t = Kokkos::DefaultExecutionSpace::scratch_memory_space;
1345     using sizet_scr_t = Kokkos::View<size_t, scr_mem_t>;
1346     using scalar_scr_t = Kokkos::View<PetscScalar, scr_mem_t>;
1347     const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_r_k (r_h, n);
1348     Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_r_k ("r", n);
1349     const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_ic_k (ic_h, nc);
1350     Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_ic_k ("ic", nc);
1351     size_t flops_h = 0.0;
1352     Kokkos::View<size_t, Kokkos::HostSpace> h_flops_k (&flops_h);
1353     Kokkos::View<size_t> d_flops_k ("flops");
1354     const int conc = Kokkos::DefaultExecutionSpace().concurrency(), team_size = conc > 1 ? 16 : 1; // 8*32 = 256
1355     const int nloc = n/Nf, Ni = (conc > 8) ? 1 /* some intelegent number of SMs -- but need league_barrier */ : 1;
1356     Kokkos::deep_copy (d_flops_k, h_flops_k);
1357     Kokkos::deep_copy (d_r_k, h_r_k);
1358     Kokkos::deep_copy (d_ic_k, h_ic_k);
1359     // Fill A --> fact
1360     Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec), KOKKOS_LAMBDA (const team_member team) {
1361         const PetscInt  field = team.league_rank()/Ni, field_block = team.league_rank()%Ni; // use grid.x/y in CUDA
1362         const PetscInt  nloc_i =  (nloc/Ni + !!(nloc%Ni)), start_i = field*nloc + field_block*nloc_i, end_i = (start_i + nloc_i) > (field+1)*nloc ? (field+1)*nloc : (start_i + nloc_i);
1363         const PetscInt  *ic = d_ic_k.data(), *r = d_r_k.data();
1364         // zero rows of B
1365         Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) {
1366             PetscInt    nzbL = bi_d[rowb+1] - bi_d[rowb], nzbU = bdiag_d[rowb] - bdiag_d[rowb+1]; // with diag
1367             PetscScalar *baL = ba_d + bi_d[rowb];
1368             PetscScalar *baU = ba_d + bdiag_d[rowb+1]+1;
1369             /* zero (unfactored row) */
1370             for (int j=0;j<nzbL;j++) baL[j] = 0;
1371             for (int j=0;j<nzbU;j++) baU[j] = 0;
1372           });
1373         // copy A into B
1374         Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) {
1375             PetscInt          rowa = r[rowb], nza = ai_d[rowa+1] - ai_d[rowa];
1376             const PetscScalar *av    = aa_d + ai_d[rowa];
1377             const PetscInt    *ajtmp = aj_d + ai_d[rowa];
1378             /* load in initial (unfactored row) */
1379             for (int j=0;j<nza;j++) {
1380               PetscInt    colb = ic[ajtmp[j]];
1381               PetscScalar vala = av[j];
1382               if (colb == rowb) {
1383                 *(ba_d + bdiag_d[rowb]) = vala;
1384               } else {
1385                 const PetscInt    *pbj = bj_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]);
1386                 PetscScalar       *pba = ba_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]);
1387                 PetscInt          nz   = (colb > rowb) ? bdiag_d[rowb] - (bdiag_d[rowb+1]+1) : bi_d[rowb+1] - bi_d[rowb], set=0;
1388                 for (int j=0; j<nz ; j++) {
1389                   if (pbj[j] == colb) {
1390                     pba[j] = vala;
1391                     set++;
1392                     break;
1393                   }
1394                 }
1395                #if !defined(PETSC_HAVE_SYCL)
1396                 if (set!=1) printf("\t\t\t ERROR DID NOT SET ?????\n");
1397                #endif
1398               }
1399             }
1400           });
1401       });
1402     Kokkos::fence();
1403 
1404     Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec).set_scratch_size(KOKKOS_SHARED_LEVEL, Kokkos::PerThread(sizet_scr_t::shmem_size()+scalar_scr_t::shmem_size()), Kokkos::PerTeam(sizet_scr_t::shmem_size())), KOKKOS_LAMBDA (const team_member team) {
1405         sizet_scr_t     colkIdx(team.thread_scratch(KOKKOS_SHARED_LEVEL));
1406         scalar_scr_t    L_ki(team.thread_scratch(KOKKOS_SHARED_LEVEL));
1407         sizet_scr_t     flops(team.team_scratch(KOKKOS_SHARED_LEVEL));
1408         const PetscInt  field = team.league_rank()/Ni, field_block_idx = team.league_rank()%Ni; // use grid.x/y in CUDA
1409         const PetscInt  start = field*nloc, end = start + nloc;
1410         Kokkos::single(Kokkos::PerTeam(team), [=]() { flops() = 0; });
1411         // A22 panel update for each row A(1,:) and col A(:,1)
1412         for (int ii=start; ii<end-1; ii++) {
1413           const PetscInt    *bjUi = bj_d + bdiag_d[ii+1]+1, nzUi = bdiag_d[ii] - (bdiag_d[ii+1]+1); // vector, and vector size, of column indices of U(i,(i+1):end)
1414           const PetscScalar *baUi = ba_d + bdiag_d[ii+1]+1; // vector of data  U(i,i+1:end)
1415           const PetscInt    nUi_its = nzUi/Ni + !!(nzUi%Ni);
1416           const PetscScalar Bii = *(ba_d + bdiag_d[ii]); // diagonal in its special place
1417           Kokkos::parallel_for(Kokkos::TeamThreadRange(team, nUi_its), [=] (const int j) {
1418               PetscInt kIdx = j*Ni + field_block_idx;
1419               if (kIdx >= nzUi) /* void */ ;
1420               else {
1421                 const PetscInt myk  = bjUi[kIdx]; // assume symmetric structure, need a transposed meta-data here in general
1422                 const PetscInt *pjL = bj_d + bi_d[myk]; // look for L(myk,ii) in start of row
1423                 const PetscInt nzL  = bi_d[myk+1] - bi_d[myk]; // size of L_k(:)
1424                 size_t         st_idx;
1425                 // find and do L(k,i) = A(:k,i) / A(i,i)
1426                 Kokkos::single(Kokkos::PerThread(team), [&]() { colkIdx() = PETSC_MAX_INT; });
1427                 // get column, there has got to be a better way
1428                 Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,nzL), [&] (const int &j, size_t &idx) {
1429                     if (pjL[j] == ii) {
1430                       PetscScalar *pLki = ba_d + bi_d[myk] + j;
1431                       idx = j; // output
1432                       *pLki = *pLki/Bii; // column scaling:  L(k,i) = A(:k,i) / A(i,i)
1433                     }
1434                 }, st_idx);
1435                 Kokkos::single(Kokkos::PerThread(team), [=]() { colkIdx() = st_idx; L_ki() = *(ba_d + bi_d[myk] + st_idx); });
1436 #if defined(PETSC_USE_DEBUG) && !defined(PETSC_HAVE_SYCL)
1437                 if (colkIdx() == PETSC_MAX_INT) printf("\t\t\t\t\t\t\tERROR: failed to find L_ki(%d,%d)\n",(int)myk,ii); // uses a register
1438 #endif
1439                 // active row k, do  A_kj -= Lki * U_ij; j \in U(i,:) j != i
1440                 // U(i+1,:end)
1441                 Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,nzUi), [=] (const int &uiIdx) { // index into i (U)
1442                       PetscScalar Uij = baUi[uiIdx];
1443                       PetscInt    col = bjUi[uiIdx];
1444                       if (col==myk) {
1445                         // A_kk = A_kk - L_ki * U_ij(k)
1446                         PetscScalar *Akkv = (ba_d + bdiag_d[myk]); // diagonal in its special place
1447                         *Akkv = *Akkv - L_ki() * Uij; // UiK
1448                       } else {
1449                         PetscScalar    *start, *end, *pAkjv=NULL;
1450                         PetscInt       high, low;
1451                         const PetscInt *startj;
1452                         if (col<myk) { // L
1453                           PetscScalar *pLki = ba_d + bi_d[myk] + colkIdx();
1454                           PetscInt idx = (pLki+1) - (ba_d + bi_d[myk]); // index into row
1455                           start = pLki+1; // start at pLki+1, A22(myk,1)
1456                           startj= bj_d + bi_d[myk] + idx;
1457                           end   = ba_d + bi_d[myk+1];
1458                         } else {
1459                           PetscInt idx = bdiag_d[myk+1]+1;
1460                           start = ba_d + idx;
1461                           startj= bj_d + idx;
1462                           end   = ba_d + bdiag_d[myk];
1463                         }
1464                         // search for 'col', use bisection search - TODO
1465                         low  = 0;
1466                         high = (PetscInt)(end-start);
1467                         while (high-low > 5) {
1468                           int t = (low+high)/2;
1469                           if (startj[t] > col) high = t;
1470                           else                 low  = t;
1471                         }
1472                         for (pAkjv=start+low; pAkjv<start+high; pAkjv++) {
1473                           if (startj[pAkjv-start] == col) break;
1474                         }
1475 #if defined(PETSC_USE_DEBUG) && !defined(PETSC_HAVE_SYCL)
1476                         if (pAkjv==start+high) printf("\t\t\t\t\t\t\t\t\t\t\tERROR: *** failed to find Akj(%d,%d)\n",(int)myk,(int)col); // uses a register
1477 #endif
1478                         *pAkjv = *pAkjv - L_ki() * Uij; // A_kj = A_kj - L_ki * U_ij
1479                       }
1480                     });
1481               }
1482             });
1483           team.team_barrier(); // this needs to be a league barrier to use more that one SM per block
1484           if (field_block_idx==0) Kokkos::single(Kokkos::PerTeam(team), [&]() { Kokkos::atomic_add( flops.data(), (size_t)(2*(nzUi*nzUi)+2)); });
1485         } /* endof for (i=0; i<n; i++) { */
1486         Kokkos::single(Kokkos::PerTeam(team), [=]() { Kokkos::atomic_add( &d_flops_k(), flops()); flops() = 0; });
1487       });
1488     Kokkos::fence();
1489     Kokkos::deep_copy (h_flops_k, d_flops_k);
1490     PetscCall(PetscLogGpuFlops((PetscLogDouble)h_flops_k()));
1491     Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, 1, 256), KOKKOS_LAMBDA (const team_member team) {
1492         const PetscInt  lg_rank = team.league_rank(), field = lg_rank/Ni; //, field_offset = lg_rank%Ni;
1493         const PetscInt  start = field*nloc, end = start + nloc, n_its = (nloc/Ni + !!(nloc%Ni)); // 1/Ni iters
1494         /* Invert diagonal for simpler triangular solves */
1495         Kokkos::parallel_for(Kokkos::TeamVectorRange(team, n_its), [=] (int outer_index) {
1496             int i = start + outer_index*Ni + lg_rank%Ni;
1497             if (i < end) {
1498               PetscScalar *pv = ba_d + bdiag_d[i];
1499               *pv = 1.0/(*pv);
1500             }
1501           });
1502       });
1503   }
1504   PetscCall(PetscLogGpuTimeEnd());
1505   PetscCall(ISRestoreIndices(isicol,&ic_h));
1506   PetscCall(ISRestoreIndices(isrow,&r_h));
1507 
1508   PetscCall(ISIdentity(isrow,&row_identity));
1509   PetscCall(ISIdentity(isicol,&col_identity));
1510   if (b->inode.size) {
1511     B->ops->solve = MatSolve_SeqAIJ_Inode;
1512   } else if (row_identity && col_identity) {
1513     B->ops->solve = MatSolve_SeqAIJ_NaturalOrdering;
1514   } else {
1515     B->ops->solve = MatSolve_SeqAIJ; // at least this needs to be in Kokkos
1516   }
1517   B->offloadmask = PETSC_OFFLOAD_GPU;
1518   PetscCall(MatSeqAIJKokkosSyncHost(B)); // solve on CPU
1519   B->ops->solveadd          = MatSolveAdd_SeqAIJ; // and this
1520   B->ops->solvetranspose    = MatSolveTranspose_SeqAIJ;
1521   B->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ;
1522   B->ops->matsolve          = MatMatSolve_SeqAIJ;
1523   B->assembled              = PETSC_TRUE;
1524   B->preallocated           = PETSC_TRUE;
1525 
1526   PetscFunctionReturn(0);
1527 }
1528 
1529 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
1530 {
1531   PetscFunctionBegin;
1532   PetscCall(MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info));
1533   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos;
1534   PetscFunctionReturn(0);
1535 }
1536 
1537 static PetscErrorCode MatSeqAIJKokkosSymbolicSolveCheck(Mat A)
1538 {
1539   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr;
1540 
1541   PetscFunctionBegin;
1542   if (!factors->sptrsv_symbolic_completed) {
1543     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khU,factors->iU_d,factors->jU_d,factors->aU_d);
1544     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khL,factors->iL_d,factors->jL_d,factors->aL_d);
1545     factors->sptrsv_symbolic_completed = PETSC_TRUE;
1546   }
1547   PetscFunctionReturn(0);
1548 }
1549 
1550 /* Check if we need to update factors etc for transpose solve */
1551 static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A)
1552 {
1553   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr;
1554   MatColIdxType             n = A->rmap->n;
1555 
1556   PetscFunctionBegin;
1557   if (!factors->transpose_updated) { /* TODO: KK needs to provide functions to do numeric transpose only */
1558     /* Update L^T and do sptrsv symbolic */
1559     factors->iLt_d = MatRowMapKokkosView("factors->iLt_d",n+1);
1560     Kokkos::deep_copy(factors->iLt_d,0); /* KK requires 0 */
1561     factors->jLt_d = MatColIdxKokkosView("factors->jLt_d",factors->jL_d.extent(0));
1562     factors->aLt_d = MatScalarKokkosView("factors->aLt_d",factors->aL_d.extent(0));
1563 
1564     transpose_matrix<
1565       ConstMatRowMapKokkosView,ConstMatColIdxKokkosView,ConstMatScalarKokkosView,
1566       MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView,
1567       MatRowMapKokkosView,DefaultExecutionSpace>(
1568         n,n,factors->iL_d,factors->jL_d,factors->aL_d,
1569         factors->iLt_d,factors->jLt_d,factors->aLt_d);
1570 
1571     /* TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices.
1572       We have to sort the indices, until KK provides finer control options.
1573     */
1574     sort_crs_matrix<DefaultExecutionSpace,
1575       MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView>(
1576         factors->iLt_d,factors->jLt_d,factors->aLt_d);
1577 
1578     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khLt,factors->iLt_d,factors->jLt_d,factors->aLt_d);
1579 
1580     /* Update U^T and do sptrsv symbolic */
1581     factors->iUt_d = MatRowMapKokkosView("factors->iUt_d",n+1);
1582     Kokkos::deep_copy(factors->iUt_d,0); /* KK requires 0 */
1583     factors->jUt_d = MatColIdxKokkosView("factors->jUt_d",factors->jU_d.extent(0));
1584     factors->aUt_d = MatScalarKokkosView("factors->aUt_d",factors->aU_d.extent(0));
1585 
1586     transpose_matrix<
1587       ConstMatRowMapKokkosView,ConstMatColIdxKokkosView,ConstMatScalarKokkosView,
1588       MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView,
1589       MatRowMapKokkosView,DefaultExecutionSpace>(
1590         n,n,factors->iU_d, factors->jU_d, factors->aU_d,
1591         factors->iUt_d,factors->jUt_d,factors->aUt_d);
1592 
1593     /* Sort indices. See comments above */
1594     sort_crs_matrix<DefaultExecutionSpace,
1595       MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView>(
1596         factors->iUt_d,factors->jUt_d,factors->aUt_d);
1597 
1598     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khUt,factors->iUt_d,factors->jUt_d,factors->aUt_d);
1599     factors->transpose_updated = PETSC_TRUE;
1600   }
1601   PetscFunctionReturn(0);
1602 }
1603 
1604 /* Solve Ax = b, with A = LU */
1605 static PetscErrorCode MatSolve_SeqAIJKokkos(Mat A,Vec b,Vec x)
1606 {
1607   ConstPetscScalarKokkosView     bv;
1608   PetscScalarKokkosView          xv;
1609   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr;
1610 
1611   PetscFunctionBegin;
1612   PetscCall(PetscLogGpuTimeBegin());
1613   PetscCall(MatSeqAIJKokkosSymbolicSolveCheck(A));
1614   PetscCall(VecGetKokkosView(b,&bv));
1615   PetscCall(VecGetKokkosViewWrite(x,&xv));
1616   /* Solve L tmpv = b */
1617   PetscCallCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khL,factors->iL_d,factors->jL_d,factors->aL_d,bv,factors->workVector));
1618   /* Solve Ux = tmpv */
1619   PetscCallCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khU,factors->iU_d,factors->jU_d,factors->aU_d,factors->workVector,xv));
1620   PetscCall(VecRestoreKokkosView(b,&bv));
1621   PetscCall(VecRestoreKokkosViewWrite(x,&xv));
1622   PetscCall(PetscLogGpuTimeEnd());
1623   PetscFunctionReturn(0);
1624 }
1625 
1626 /* Solve A^T x = b, where A^T = U^T L^T */
1627 static PetscErrorCode MatSolveTranspose_SeqAIJKokkos(Mat A,Vec b,Vec x)
1628 {
1629   ConstPetscScalarKokkosView     bv;
1630   PetscScalarKokkosView          xv;
1631   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr;
1632 
1633   PetscFunctionBegin;
1634   PetscCall(PetscLogGpuTimeBegin());
1635   PetscCall(MatSeqAIJKokkosTransposeSolveCheck(A));
1636   PetscCall(VecGetKokkosView(b,&bv));
1637   PetscCall(VecGetKokkosViewWrite(x,&xv));
1638   /* Solve U^T tmpv = b */
1639   KokkosSparse::Experimental::sptrsv_solve(&factors->khUt,factors->iUt_d,factors->jUt_d,factors->aUt_d,bv,factors->workVector);
1640 
1641   /* Solve L^T x = tmpv */
1642   KokkosSparse::Experimental::sptrsv_solve(&factors->khLt,factors->iLt_d,factors->jLt_d,factors->aLt_d,factors->workVector,xv);
1643   PetscCall(VecRestoreKokkosView(b,&bv));
1644   PetscCall(VecRestoreKokkosViewWrite(x,&xv));
1645   PetscCall(PetscLogGpuTimeEnd());
1646   PetscFunctionReturn(0);
1647 }
1648 
1649 static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos(Mat B,Mat A,const MatFactorInfo *info)
1650 {
1651   Mat_SeqAIJKokkos               *aijkok = (Mat_SeqAIJKokkos*)A->spptr;
1652   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)B->spptr;
1653   PetscInt                       fill_lev = info->levels;
1654 
1655   PetscFunctionBegin;
1656   PetscCall(PetscLogGpuTimeBegin());
1657   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1658 
1659   auto a_d = aijkok->a_dual.view_device();
1660   auto i_d = aijkok->i_dual.view_device();
1661   auto j_d = aijkok->j_dual.view_device();
1662 
1663   KokkosSparse::Experimental::spiluk_numeric(&factors->kh,fill_lev,i_d,j_d,a_d,factors->iL_d,factors->jL_d,factors->aL_d,factors->iU_d,factors->jU_d,factors->aU_d);
1664 
1665   B->assembled                       = PETSC_TRUE;
1666   B->preallocated                    = PETSC_TRUE;
1667   B->ops->solve                      = MatSolve_SeqAIJKokkos;
1668   B->ops->solvetranspose             = MatSolveTranspose_SeqAIJKokkos;
1669   B->ops->matsolve                   = NULL;
1670   B->ops->matsolvetranspose          = NULL;
1671   B->offloadmask                     = PETSC_OFFLOAD_GPU;
1672 
1673   /* Once the factors' value changed, we need to update their transpose and sptrsv handle */
1674   factors->transpose_updated         = PETSC_FALSE;
1675   factors->sptrsv_symbolic_completed = PETSC_FALSE;
1676   /* TODO: log flops, but how to know that? */
1677   PetscCall(PetscLogGpuTimeEnd());
1678   PetscFunctionReturn(0);
1679 }
1680 
1681 static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
1682 {
1683   Mat_SeqAIJKokkos               *aijkok;
1684   Mat_SeqAIJ                     *b;
1685   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)B->spptr;
1686   PetscInt                       fill_lev = info->levels;
1687   PetscInt                       nnzA = ((Mat_SeqAIJ*)A->data)->nz,nnzL,nnzU;
1688   PetscInt                       n = A->rmap->n;
1689 
1690   PetscFunctionBegin;
1691   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1692   /* Rebuild factors */
1693   if (factors) {factors->Destroy();} /* Destroy the old if it exists */
1694   else {B->spptr = factors = new Mat_SeqAIJKokkosTriFactors(n);}
1695 
1696   /* Create a spiluk handle and then do symbolic factorization */
1697   nnzL = nnzU = PetscRealIntMultTruncate(info->fill,nnzA);
1698   factors->kh.create_spiluk_handle(KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1,n,nnzL,nnzU);
1699 
1700   auto spiluk_handle = factors->kh.get_spiluk_handle();
1701 
1702   Kokkos::realloc(factors->iL_d,n+1); /* Free old arrays and realloc */
1703   Kokkos::realloc(factors->jL_d,spiluk_handle->get_nnzL());
1704   Kokkos::realloc(factors->iU_d,n+1);
1705   Kokkos::realloc(factors->jU_d,spiluk_handle->get_nnzU());
1706 
1707   aijkok = (Mat_SeqAIJKokkos*)A->spptr;
1708   auto i_d = aijkok->i_dual.view_device();
1709   auto j_d = aijkok->j_dual.view_device();
1710   KokkosSparse::Experimental::spiluk_symbolic(&factors->kh,fill_lev,i_d,j_d,factors->iL_d,factors->jL_d,factors->iU_d,factors->jU_d);
1711   /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */
1712 
1713   Kokkos::resize (factors->jL_d,spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */
1714   Kokkos::resize (factors->jU_d,spiluk_handle->get_nnzU());
1715   Kokkos::realloc(factors->aL_d,spiluk_handle->get_nnzL()); /* No need to retain old value */
1716   Kokkos::realloc(factors->aU_d,spiluk_handle->get_nnzU());
1717 
1718   /* TODO: add options to select sptrsv algorithms */
1719   /* Create sptrsv handles for L, U and their transpose */
1720  #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
1721   auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SPTRSV_CUSPARSE;
1722  #else
1723   auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1;
1724  #endif
1725 
1726   factors->khL.create_sptrsv_handle(sptrsv_alg,n,true/* L is lower tri */);
1727   factors->khU.create_sptrsv_handle(sptrsv_alg,n,false/* U is not lower tri */);
1728   factors->khLt.create_sptrsv_handle(sptrsv_alg,n,false/* L^T is not lower tri */);
1729   factors->khUt.create_sptrsv_handle(sptrsv_alg,n,true/* U^T is lower tri */);
1730 
1731   /* Fill fields of the factor matrix B */
1732   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(B,MAT_SKIP_ALLOCATION,NULL));
1733   b     = (Mat_SeqAIJ*)B->data;
1734   b->nz = b->maxnz = spiluk_handle->get_nnzL()+spiluk_handle->get_nnzU();
1735   B->info.fill_ratio_given  = info->fill;
1736   B->info.fill_ratio_needed = ((PetscReal)b->nz)/((PetscReal)nnzA);
1737 
1738   B->offloadmask            = PETSC_OFFLOAD_GPU;
1739   B->ops->lufactornumeric   = MatILUFactorNumeric_SeqAIJKokkos;
1740   PetscFunctionReturn(0);
1741 }
1742 
1743 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
1744 {
1745   Mat_SeqAIJ       *b=(Mat_SeqAIJ*)B->data;
1746   const PetscInt   nrows   = A->rmap->n;
1747 
1748   PetscFunctionBegin;
1749   PetscCall(MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info));
1750   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKOKKOSDEVICE;
1751   // move B data into Kokkos
1752   PetscCall(MatSeqAIJKokkosSyncDevice(B)); // create aijkok
1753   PetscCall(MatSeqAIJKokkosSyncDevice(A)); // create aijkok
1754   {
1755     Mat_SeqAIJKokkos *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
1756     if (!baijkok->diag_d.extent(0)) {
1757       const Kokkos::View<PetscInt*, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_diag (b->diag,nrows+1);
1758       baijkok->diag_d = Kokkos::View<PetscInt*>(Kokkos::create_mirror(DefaultMemorySpace(),h_diag));
1759       Kokkos::deep_copy (baijkok->diag_d, h_diag);
1760     }
1761   }
1762   PetscFunctionReturn(0);
1763 }
1764 
1765 static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos(Mat A,MatSolverType *type)
1766 {
1767   PetscFunctionBegin;
1768   *type = MATSOLVERKOKKOS;
1769   PetscFunctionReturn(0);
1770 }
1771 
1772 static PetscErrorCode MatFactorGetSolverType_seqaij_kokkos_device(Mat A,MatSolverType *type)
1773 {
1774   PetscFunctionBegin;
1775   *type = MATSOLVERKOKKOSDEVICE;
1776   PetscFunctionReturn(0);
1777 }
1778 
1779 /*MC
1780   MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices
1781   on a single GPU of type, SeqAIJKokkos, AIJKokkos.
1782 
1783   Level: beginner
1784 
1785 .seealso: `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJKokkos()`, `MATAIJKOKKOS`, `MatKokkosSetFormat()`, `MatKokkosStorageFormat`, `MatKokkosFormatOperation`
1786 M*/
1787 PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A,MatFactorType ftype,Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */
1788 {
1789   PetscInt       n = A->rmap->n;
1790 
1791   PetscFunctionBegin;
1792   PetscCall(MatCreate(PetscObjectComm((PetscObject)A),B));
1793   PetscCall(MatSetSizes(*B,n,n,n,n));
1794   (*B)->factortype = ftype;
1795   PetscCall(PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]));
1796   PetscCall(MatSetType(*B,MATSEQAIJKOKKOS));
1797 
1798   if (ftype == MAT_FACTOR_LU) {
1799     PetscCall(MatSetBlockSizesFromMats(*B,A,A));
1800     (*B)->canuseordering         = PETSC_TRUE;
1801     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJKokkos;
1802   } else if (ftype == MAT_FACTOR_ILU) {
1803     PetscCall(MatSetBlockSizesFromMats(*B,A,A));
1804     (*B)->canuseordering         = PETSC_FALSE;
1805     (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos;
1806   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]);
1807 
1808   PetscCall(MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL));
1809   PetscCall(PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_SeqAIJKokkos));
1810   PetscFunctionReturn(0);
1811 }
1812 
1813 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijkokkos_kokkos_device(Mat A,MatFactorType ftype,Mat *B)
1814 {
1815   PetscInt       n = A->rmap->n;
1816 
1817   PetscFunctionBegin;
1818   PetscCall(MatCreate(PetscObjectComm((PetscObject)A),B));
1819   PetscCall(MatSetSizes(*B,n,n,n,n));
1820   (*B)->factortype = ftype;
1821   (*B)->canuseordering = PETSC_TRUE;
1822   PetscCall(PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]));
1823   PetscCall(MatSetType(*B,MATSEQAIJKOKKOS));
1824 
1825   if (ftype == MAT_FACTOR_LU) {
1826     PetscCall(MatSetBlockSizesFromMats(*B,A,A));
1827     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE;
1828   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for KOKKOS Matrix Types");
1829 
1830   PetscCall(MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL));
1831   PetscCall(PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_kokkos_device));
1832   PetscFunctionReturn(0);
1833 }
1834 
1835 PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_KOKKOS(void)
1836 {
1837   PetscFunctionBegin;
1838   PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_SeqAIJKokkos_Kokkos));
1839   PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_ILU,MatGetFactor_SeqAIJKokkos_Kokkos));
1840   PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOSDEVICE,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_seqaijkokkos_kokkos_device));
1841   PetscFunctionReturn(0);
1842 }
1843 
1844 /* Utility to print out a KokkosCsrMatrix for debugging */
1845 PETSC_INTERN PetscErrorCode PrintCsrMatrix(const KokkosCsrMatrix& csrmat)
1846 {
1847   const auto&       iv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.graph.row_map);
1848   const auto&       jv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.graph.entries);
1849   const auto&       av = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.values);
1850   const PetscInt    *i = iv.data();
1851   const PetscInt    *j = jv.data();
1852   const PetscScalar *a = av.data();
1853   PetscInt          m = csrmat.numRows(),n = csrmat.numCols(),nnz = csrmat.nnz();
1854 
1855   PetscFunctionBegin;
1856   PetscCall(PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT " x %" PetscInt_FMT " SeqAIJKokkos, with %" PetscInt_FMT " nonzeros\n",m,n,nnz));
1857   for (PetscInt k=0; k<m; k++) {
1858     PetscCall(PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT ": ",k));
1859     for (PetscInt p=i[k]; p<i[k+1]; p++) {
1860       PetscCall(PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT "(%.1f), ",j[p],(double)PetscRealPart(a[p])));
1861     }
1862     PetscCall(PetscPrintf(PETSC_COMM_SELF,"\n"));
1863   }
1864   PetscFunctionReturn(0);
1865 }
1866