xref: /petsc/src/mat/impls/aij/seq/kokkos/aijkok.kokkos.cxx (revision 2b1bc0da69ff167ceb1690a39a6deaf9d55866d7)
1 #include <petscvec_kokkos.hpp>
2 #include <petscpkg_version.h>
3 #include <petsc/private/petscimpl.h>
4 #include <petscsystypes.h>
5 #include <petscerror.h>
6 
7 #include <Kokkos_Core.hpp>
8 #include <KokkosBlas.hpp>
9 #include <KokkosKernels_Sorting.hpp>
10 #include <KokkosSparse_CrsMatrix.hpp>
11 #include <KokkosSparse_spmv.hpp>
12 #include <KokkosSparse_spiluk.hpp>
13 #include <KokkosSparse_sptrsv.hpp>
14 #include <KokkosSparse_spgemm.hpp>
15 #include <KokkosSparse_spadd.hpp>
16 
17 #include <../src/mat/impls/aij/seq/aij.h>
18 #include <../src/mat/impls/aij/seq/kokkos/aijkokkosimpl.hpp>
19 
20 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */
21 
22 /* MatAssemblyEnd_SeqAIJKokkos() happens when we finalized nonzeros of the matrix, either after
23    we assembled the matrix on host, or after we directly produced the matrix data on device (ex., through MatMatMult).
24    In the latter case, it is important to set a_dual's sync state correctly.
25  */
26 static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A,MatAssemblyType mode)
27 {
28   PetscErrorCode    ierr;
29   Mat_SeqAIJ        *aijseq;
30   Mat_SeqAIJKokkos  *aijkok;
31 
32   PetscFunctionBegin;
33   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0);
34   ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
35 
36   aijseq = static_cast<Mat_SeqAIJ*>(A->data);
37   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
38 
39   /* If aijkok does not exist, we just copy i, j to device.
40      If aijkok already exists, but the device's nonzero pattern does not match with the host's, we assume the latest data is on host.
41      In both cases, we build a new aijkok structure.
42   */
43   if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { /* aijkok might not exist yet or nonzero pattern has changed */
44     delete aijkok;
45     aijkok   = new Mat_SeqAIJKokkos(A->rmap->n,A->cmap->n,aijseq->nz,aijseq->i,aijseq->j,aijseq->a,A->nonzerostate,PETSC_FALSE/*don't copy mat values to device*/);
46     A->spptr = aijkok;
47   }
48 
49   if (aijkok && aijkok->device_mat_d.data()) {
50     A->offloadmask = PETSC_OFFLOAD_GPU; // in GPU mode, no going back. MatSetValues checks this
51   }
52   PetscFunctionReturn(0);
53 }
54 
55 /* Sync CSR data to device if not yet */
56 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A)
57 {
58   Mat_SeqAIJKokkos          *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
59 
60   PetscFunctionBegin;
61   if (A->factortype != MAT_FACTOR_NONE) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync factorized matrix from host to device");
62   if (!A->assembled) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync unassembled matrix from host to device");
63   if (!aijkok) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
64   if (aijkok->a_dual.need_sync_device()) {
65     aijkok->a_dual.sync_device();
66     aijkok->transpose_updated = PETSC_FALSE; /* values of the tranpose is out-of-date */
67     aijkok->hermitian_updated = PETSC_FALSE;
68   }
69   PetscFunctionReturn(0);
70 }
71 
72 /* Mark the CSR data on device as modified */
73 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosModifyDevice(Mat A)
74 {
75   PetscErrorCode   ierr;
76   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
77 
78   PetscFunctionBegin;
79   if (A->factortype != MAT_FACTOR_NONE) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not supported for factorized matries");
80   aijkok->a_dual.clear_sync_state();
81   aijkok->a_dual.modify_device();
82   aijkok->transpose_updated = PETSC_FALSE;
83   aijkok->hermitian_updated = PETSC_FALSE;
84   ierr = MatSeqAIJInvalidateDiagonal(A);CHKERRQ(ierr);
85   ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
86   PetscFunctionReturn(0);
87 }
88 
89 static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A)
90 {
91   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
92 
93   PetscFunctionBegin;
94   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
95    /* We do not expect one needs factors on host  */
96   if (A->factortype != MAT_FACTOR_NONE) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync factorized matrix from device to host");
97   if (!aijkok) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing AIJKOK");
98   aijkok->a_dual.sync_host();
99   PetscFunctionReturn(0);
100 }
101 
102 static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A,PetscScalar *array[])
103 {
104   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
105 
106   PetscFunctionBegin;
107   if (aijkok) {
108     aijkok->a_dual.sync_host();
109     *array = aijkok->a_dual.view_host().data();
110   } else { /* Happens when calling MatSetValues on a newly created matrix */
111     *array = static_cast<Mat_SeqAIJ*>(A->data)->a;
112   }
113   PetscFunctionReturn(0);
114 }
115 
116 static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJKokkos(Mat A,PetscScalar *array[])
117 {
118   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
119 
120   PetscFunctionBegin;
121   if (aijkok) aijkok->a_dual.modify_host();
122   PetscFunctionReturn(0);
123 }
124 
125 static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJKokkos(Mat A,const PetscScalar *array[])
126 {
127   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
128 
129   PetscFunctionBegin;
130   if (aijkok) {
131     aijkok->a_dual.sync_host();
132     *array = aijkok->a_dual.view_host().data();
133   } else {
134     *array = static_cast<Mat_SeqAIJ*>(A->data)->a;
135   }
136   PetscFunctionReturn(0);
137 }
138 
139 static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJKokkos(Mat A,const PetscScalar *array[])
140 {
141   PetscFunctionBegin;
142   *array = NULL;
143   PetscFunctionReturn(0);
144 }
145 
146 static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJKokkos(Mat A,PetscScalar *array[])
147 {
148   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
149 
150   PetscFunctionBegin;
151   if (aijkok) {
152     *array = aijkok->a_dual.view_host().data();
153   } else { /* Ex. happens with MatZeroEntries on a preallocated but not assembled matrix */
154     *array = static_cast<Mat_SeqAIJ*>(A->data)->a;
155   }
156   PetscFunctionReturn(0);
157 }
158 
159 static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJKokkos(Mat A,PetscScalar *array[])
160 {
161   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
162 
163   PetscFunctionBegin;
164   if (aijkok) {
165     aijkok->a_dual.clear_sync_state();
166     aijkok->a_dual.modify_host();
167   }
168   PetscFunctionReturn(0);
169 }
170 
171 // MatSeqAIJKokkosSetDeviceMat takes a PetscSplitCSRDataStructure with device data and copies it to the device. Note, "deep_copy" here is really a shallow copy
172 PetscErrorCode MatSeqAIJKokkosSetDeviceMat(Mat A, PetscSplitCSRDataStructure h_mat)
173 {
174   Mat_SeqAIJKokkos                             *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
175   Kokkos::View<SplitCSRMat, Kokkos::HostSpace> h_mat_k(h_mat);
176 
177   PetscFunctionBegin;
178   if (!aijkok) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
179   aijkok->device_mat_d = create_mirror(DefaultMemorySpace(),h_mat_k);
180   Kokkos::deep_copy (aijkok->device_mat_d, h_mat_k);
181   PetscFunctionReturn(0);
182 }
183 
184 // MatSeqAIJKokkosGetDeviceMat gets the device if it is here, otherwise it creates a place for it and returns NULL
185 PetscErrorCode MatSeqAIJKokkosGetDeviceMat(Mat A, PetscSplitCSRDataStructure *d_mat)
186 {
187   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
188 
189   PetscFunctionBegin;
190   if (aijkok && aijkok->device_mat_d.data()) {
191     *d_mat = aijkok->device_mat_d.data();
192   } else {
193     PetscErrorCode   ierr;
194     ierr    = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); // create aijkok (we are making d_mat now so make a place for it)
195     *d_mat  = NULL;
196   }
197   PetscFunctionReturn(0);
198 }
199 
200 /* Generate the transpose on device and cache it internally */
201 static PetscErrorCode MatSeqAIJKokkosGenerateTranspose_Private(Mat A, KokkosCsrMatrix **csrmatT)
202 {
203   Mat_SeqAIJKokkos                 *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
204 
205   PetscFunctionBegin;
206   if (!aijkok) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
207   if (!aijkok->csrmatT.nnz() || !aijkok->transpose_updated) { /* Generate At for the first time OR just update its values */
208     /* FIXME: KK does not separate symbolic/numeric transpose. We could have a permutation array to help value-only update */
209     CHKERRCXX(aijkok->a_dual.sync_device());
210     CHKERRCXX(aijkok->csrmatT = KokkosKernels::Impl::transpose_matrix(aijkok->csrmat));
211     CHKERRCXX(KokkosKernels::sort_crs_matrix(aijkok->csrmatT));
212     aijkok->transpose_updated = PETSC_TRUE;
213   }
214   *csrmatT = &aijkok->csrmatT;
215   PetscFunctionReturn(0);
216 }
217 
218 /* Generate the Hermitian on device and cache it internally */
219 static PetscErrorCode MatSeqAIJKokkosGenerateHermitian_Private(Mat A, KokkosCsrMatrix **csrmatH)
220 {
221   Mat_SeqAIJKokkos                 *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
222 
223   PetscFunctionBegin;
224   if (!aijkok) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
225   if (!aijkok->csrmatH.nnz() || !aijkok->hermitian_updated) { /* Generate Ah for the first time OR just update its values */
226     CHKERRCXX(aijkok->a_dual.sync_device());
227     CHKERRCXX(aijkok->csrmatH = KokkosKernels::Impl::transpose_matrix(aijkok->csrmat));
228     CHKERRCXX(KokkosKernels::sort_crs_matrix(aijkok->csrmatH));
229    #if defined(PETSC_USE_COMPLEX)
230     const auto& a = aijkok->csrmatH.values;
231     Kokkos::parallel_for(a.extent(0),KOKKOS_LAMBDA(MatRowMapType i) {a(i) = PetscConj(a(i));});
232    #endif
233     aijkok->hermitian_updated = PETSC_TRUE;
234   }
235   *csrmatH = &aijkok->csrmatH;
236   PetscFunctionReturn(0);
237 }
238 
239 /* y = A x */
240 static PetscErrorCode MatMult_SeqAIJKokkos(Mat A,Vec xx,Vec yy)
241 {
242   PetscErrorCode                   ierr;
243   Mat_SeqAIJKokkos                 *aijkok;
244   ConstPetscScalarKokkosView       xv;
245   PetscScalarKokkosView            yv;
246 
247   PetscFunctionBegin;
248   ierr   = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
249   ierr   = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
250   ierr   = VecGetKokkosView(yy,&yv);CHKERRQ(ierr);
251   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
252   KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csrmat,xv,0.0/*beta*/,yv); /* y = alpha A x + beta y */
253   ierr   = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
254   ierr   = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr);
255   ierr   = WaitForKokkos();CHKERRQ(ierr);
256   /* 2.0*nnz - numRows seems more accurate here but assumes there are no zero-rows. So a little sloppy here. */
257   ierr   = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr);
258   PetscFunctionReturn(0);
259 }
260 
261 /* y = A^T x */
262 static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy)
263 {
264   PetscErrorCode                   ierr;
265   Mat_SeqAIJKokkos                 *aijkok;
266   const char                       *mode;
267   ConstPetscScalarKokkosView       xv;
268   PetscScalarKokkosView            yv;
269   KokkosCsrMatrix                  *csrmat;
270 
271   PetscFunctionBegin;
272   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
273   ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
274   ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr);
275   if (A->form_explicit_transpose) {
276     ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmat);CHKERRQ(ierr);
277     mode = "N";
278   } else {
279     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
280     csrmat = &aijkok->csrmat;
281     mode = "T";
282   }
283   KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,0.0/*beta*/,yv); /* y = alpha A^T x + beta y */
284   ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
285   ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr);
286   ierr = WaitForKokkos();CHKERRQ(ierr);
287   ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr);
288   PetscFunctionReturn(0);
289 }
290 
291 /* y = A^H x */
292 static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy)
293 {
294   PetscErrorCode                   ierr;
295   Mat_SeqAIJKokkos                 *aijkok;
296   const char                       *mode;
297   ConstPetscScalarKokkosView       xv;
298   PetscScalarKokkosView            yv;
299   KokkosCsrMatrix                  *csrmat;
300 
301   PetscFunctionBegin;
302   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
303   ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
304   ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr);
305   if (A->form_explicit_transpose) {
306     ierr = MatSeqAIJKokkosGenerateHermitian_Private(A,&csrmat);CHKERRQ(ierr);
307     mode = "N";
308   } else {
309     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
310     csrmat = &aijkok->csrmat;
311     mode = "C";
312   }
313   KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,0.0/*beta*/,yv); /* y = alpha A^H x + beta y */
314   ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
315   ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr);
316   ierr = WaitForKokkos();CHKERRQ(ierr);
317   ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr);
318   PetscFunctionReturn(0);
319 }
320 
321 /* z = A x + y */
322 static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy, Vec zz)
323 {
324   PetscErrorCode                   ierr;
325   Mat_SeqAIJKokkos                 *aijkok;
326   ConstPetscScalarKokkosView       xv,yv;
327   PetscScalarKokkosView            zv;
328 
329   PetscFunctionBegin;
330   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
331   ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
332   ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr);
333   ierr = VecGetKokkosView(zz,&zv);CHKERRQ(ierr);
334   if (zz != yy) Kokkos::deep_copy(zv,yv);
335   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
336   KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csrmat,xv,1.0/*beta*/,zv); /* z = alpha A x + beta z */
337   ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
338   ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr);
339   ierr = VecRestoreKokkosView(zz,&zv);CHKERRQ(ierr);
340   ierr = WaitForKokkos();CHKERRQ(ierr);
341   ierr = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr);
342   PetscFunctionReturn(0);
343 }
344 
345 /* z = A^T x + y */
346 static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz)
347 {
348   PetscErrorCode                   ierr;
349   Mat_SeqAIJKokkos                 *aijkok;
350   const char                       *mode;
351   ConstPetscScalarKokkosView       xv,yv;
352   PetscScalarKokkosView            zv;
353   KokkosCsrMatrix                  *csrmat;
354 
355   PetscFunctionBegin;
356   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
357   ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
358   ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr);
359   ierr = VecGetKokkosView(zz,&zv);CHKERRQ(ierr);
360   if (zz != yy) Kokkos::deep_copy(zv,yv);
361   if (A->form_explicit_transpose) {
362     ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmat);CHKERRQ(ierr);
363     mode = "N";
364   } else {
365     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
366     csrmat = &aijkok->csrmat;
367     mode = "T";
368   }
369   KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,1.0/*beta*/,zv); /* z = alpha A^T x + beta z */
370   ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
371   ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr);
372   ierr = VecRestoreKokkosView(zz,&zv);CHKERRQ(ierr);
373   ierr = WaitForKokkos();CHKERRQ(ierr);
374   ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr);
375   PetscFunctionReturn(0);
376 }
377 
378 /* z = A^H x + y */
379 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz)
380 {
381   PetscErrorCode                   ierr;
382   Mat_SeqAIJKokkos                 *aijkok;
383   const char                       *mode;
384   ConstPetscScalarKokkosView       xv,yv;
385   PetscScalarKokkosView            zv;
386   KokkosCsrMatrix                  *csrmat;
387 
388   PetscFunctionBegin;
389   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
390   ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
391   ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr);
392   ierr = VecGetKokkosView(zz,&zv);CHKERRQ(ierr);
393   if (zz != yy) Kokkos::deep_copy(zv,yv);
394   if (A->form_explicit_transpose) {
395     ierr = MatSeqAIJKokkosGenerateHermitian_Private(A,&csrmat);CHKERRQ(ierr);
396     mode = "N";
397   } else {
398     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
399     csrmat = &aijkok->csrmat;
400     mode = "C";
401   }
402   KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,1.0/*beta*/,zv); /* z = alpha A^H x + beta z */
403   ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
404   ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr);
405   ierr = VecRestoreKokkosView(zz,&zv);CHKERRQ(ierr);
406   ierr = WaitForKokkos();CHKERRQ(ierr);
407   ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr);
408   PetscFunctionReturn(0);
409 }
410 
411 PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A,MatOption op,PetscBool flg)
412 {
413   PetscErrorCode            ierr;
414   Mat_SeqAIJKokkos          *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
415 
416   PetscFunctionBegin;
417   switch (op) {
418     case MAT_FORM_EXPLICIT_TRANSPOSE:
419       /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
420       if (A->form_explicit_transpose && !flg && aijkok) {ierr = aijkok->DestroyMatTranspose();CHKERRQ(ierr);}
421       A->form_explicit_transpose = flg;
422       break;
423     default:
424       ierr = MatSetOption_SeqAIJ(A,op,flg);CHKERRQ(ierr);
425       break;
426   }
427   PetscFunctionReturn(0);
428 }
429 
430 /* Depending on reuse, either build a new mat, or use the existing mat */
431 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
432 {
433   PetscErrorCode   ierr;
434   Mat_SeqAIJ       *aseq;
435 
436   PetscFunctionBegin;
437   ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr);
438   if (reuse == MAT_INITIAL_MATRIX) { /* Build a brand new mat */
439     ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr); /* the returned newmat is a SeqAIJKokkos */
440   } else if (reuse == MAT_REUSE_MATRIX) { /* Reuse the mat created before */
441     ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr); /* newmat is already a SeqAIJKokkos */
442   } else if (reuse == MAT_INPLACE_MATRIX) { /* newmat is A */
443     if (A != *newmat) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"A != *newmat with MAT_INPLACE_MATRIX");
444     ierr = PetscFree(A->defaultvectype);CHKERRQ(ierr);
445     ierr = PetscStrallocpy(VECKOKKOS,&A->defaultvectype);CHKERRQ(ierr); /* Allocate and copy the string */
446     ierr = PetscObjectChangeTypeName((PetscObject)A,MATSEQAIJKOKKOS);CHKERRQ(ierr);
447     ierr = MatSetOps_SeqAIJKokkos(A);CHKERRQ(ierr);
448     aseq = static_cast<Mat_SeqAIJ*>(A->data);
449     if (A->assembled) { /* Copy i, j to device for an assembled matrix if not yet */
450       if (A->spptr) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Expect NULL (Mat_SeqAIJKokkos*)A->spptr");
451       A->spptr = new Mat_SeqAIJKokkos(A->rmap->n,A->cmap->n,aseq->nz,aseq->i,aseq->j,aseq->a,A->nonzerostate,PETSC_FALSE);
452     }
453   }
454   PetscFunctionReturn(0);
455 }
456 
457 /* MatDuplicate always creates a new matrix. MatDuplicate can be called either on an assembled matrix or
458    an unassembled matrix, even though MAT_COPY_VALUES is not allowed for unassembled matrix.
459  */
460 static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A,MatDuplicateOption dupOption,Mat *B)
461 {
462   PetscErrorCode        ierr;
463   Mat_SeqAIJ            *bseq;
464   Mat_SeqAIJKokkos      *akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr),*bkok;
465   Mat                   mat;
466 
467   PetscFunctionBegin;
468   /* Do not copy values on host as A's latest values might be on device. We don't want to do sync blindly */
469   ierr = MatDuplicate_SeqAIJ(A,MAT_DO_NOT_COPY_VALUES,B);CHKERRQ(ierr);
470   mat  = *B;
471   if (A->assembled) {
472     bseq = static_cast<Mat_SeqAIJ*>(mat->data);
473     bkok = new Mat_SeqAIJKokkos(mat->rmap->n,mat->cmap->n,bseq->nz,bseq->i,bseq->j,bseq->a,mat->nonzerostate,PETSC_FALSE);
474     bkok->a_dual.clear_sync_state(); /* Clear B's sync state as it will be decided below */
475     /* Now copy values to B if needed */
476     if (dupOption == MAT_COPY_VALUES) {
477       if (akok->a_dual.need_sync_device()) {
478         Kokkos::deep_copy(bkok->a_dual.view_host(),akok->a_dual.view_host());
479         bkok->a_dual.modify_host();
480       } else { /* If device has the latest data, we only copy data on device */
481         Kokkos::deep_copy(bkok->a_dual.view_device(),akok->a_dual.view_device());
482         bkok->a_dual.modify_device();
483       }
484     } else { /* MAT_DO_NOT_COPY_VALUES or MAT_SHARE_NONZERO_PATTERN. B's values should be zeroed */
485       /* B's values on host should be already zeroed by MatDuplicate_SeqAIJ() */
486       bkok->a_dual.modify_host();
487     }
488     mat->spptr = bkok;
489   }
490 
491   ierr = PetscFree(mat->defaultvectype);CHKERRQ(ierr);
492   ierr = PetscStrallocpy(VECKOKKOS,&mat->defaultvectype);CHKERRQ(ierr); /* Allocate and copy the string */
493   ierr = PetscObjectChangeTypeName((PetscObject)mat,MATSEQAIJKOKKOS);CHKERRQ(ierr);
494   ierr = MatSetOps_SeqAIJKokkos(mat);CHKERRQ(ierr);
495   PetscFunctionReturn(0);
496 }
497 
498 static PetscErrorCode MatTranspose_SeqAIJKokkos(Mat A,MatReuse reuse,Mat *B)
499 {
500   PetscErrorCode    ierr;
501   Mat               At;
502   KokkosCsrMatrix   *internT,*csrmatT;
503   Mat_SeqAIJKokkos  *atkok,*bkok;
504 
505   PetscFunctionBegin;
506   ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&internT);CHKERRQ(ierr); /* Generate a transpose internally */
507   if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_INPLACE_MATRIX) {
508     CHKERRCXX(csrmatT = new KokkosCsrMatrix("csrmat",*internT)); /* Deep copy internT to csrmatT, as we want to isolate the internal transpose */
509     CHKERRCXX(atkok   = new Mat_SeqAIJKokkos(*csrmatT));
510     ierr = MatCreateSeqAIJKokkosWithCSRMatrix(PetscObjectComm((PetscObject)A),atkok,&At);CHKERRQ(ierr);
511     if (reuse == MAT_INITIAL_MATRIX) *B = At;
512     else {ierr = MatHeaderReplace(A,&At);CHKERRQ(ierr);} /* Replace A with At inplace */
513   } else { /* MAT_REUSE_MATRIX, just need to copy values to B on device */
514     if ((*B)->assembled) {
515       bkok = static_cast<Mat_SeqAIJKokkos*>((*B)->spptr);
516       CHKERRCXX(Kokkos::deep_copy(bkok->a_dual.view_device(),internT->values));
517       ierr = MatSeqAIJKokkosModifyDevice(*B);CHKERRQ(ierr);
518     } else if ((*B)->preallocated) { /* It is ok for B to be only preallocated, as needed in MatTranspose_MPIAIJ */
519       Mat_SeqAIJ              *bseq = static_cast<Mat_SeqAIJ*>((*B)->data);
520       MatScalarKokkosViewHost a_h(bseq->a,internT->nnz()); /* bseq->nz = 0 if unassembled */
521       MatColIdxKokkosViewHost j_h(bseq->j,internT->nnz());
522       CHKERRCXX(Kokkos::deep_copy(a_h,internT->values));
523       CHKERRCXX(Kokkos::deep_copy(j_h,internT->graph.entries));
524     } else SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"B must be assembled or preallocated");
525   }
526   PetscFunctionReturn(0);
527 }
528 
529 static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A)
530 {
531   PetscErrorCode             ierr;
532   Mat_SeqAIJKokkos           *aijkok;
533 
534   PetscFunctionBegin;
535   if (A->factortype == MAT_FACTOR_NONE) {
536     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
537     if (aijkok) {
538       if (aijkok->device_mat_d.data()) {
539         delete aijkok->colmap_d;
540         delete aijkok->i_uncompressed_d;
541       }
542       if (aijkok->diag_d) delete aijkok->diag_d;
543     }
544     delete aijkok;
545   } else {
546     delete static_cast<Mat_SeqAIJKokkosTriFactors*>(A->spptr);
547   }
548   ierr     = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr);
549   A->spptr = NULL;
550   ierr     = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
551   PetscFunctionReturn(0);
552 }
553 
554 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A)
555 {
556   PetscErrorCode ierr;
557 
558   PetscFunctionBegin;
559   ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr);
560   ierr = MatCreate_SeqAIJ(A);CHKERRQ(ierr);
561   ierr = MatConvert_SeqAIJ_SeqAIJKokkos(A,MATSEQAIJKOKKOS,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr);
562   PetscFunctionReturn(0);
563 }
564 
565 /* Merge A, B into a matrix C. A is put before B. C's size would be A->rmap->n by (A->cmap->n + B->cmap->n) */
566 PetscErrorCode MatSeqAIJKokkosMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C)
567 {
568   PetscErrorCode               ierr;
569   Mat_SeqAIJ                   *a,*b;
570   Mat_SeqAIJKokkos             *akok,*bkok,*ckok;
571   MatScalarKokkosView          aa,ba,ca;
572   MatRowMapKokkosView          ai,bi,ci;
573   MatColIdxKokkosView          aj,bj,cj;
574   PetscInt                     m,n,nnz,aN;
575 
576   PetscFunctionBegin;
577   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
578   PetscValidHeaderSpecific(B,MAT_CLASSID,2);
579   PetscValidPointer(C,4);
580   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
581   PetscCheckTypeName(B,MATSEQAIJKOKKOS);
582   if (A->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT,A->rmap->n,B->rmap->n);
583   if (reuse == MAT_INPLACE_MATRIX) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported");
584 
585   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
586   ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr);
587   a    = static_cast<Mat_SeqAIJ*>(A->data);
588   b    = static_cast<Mat_SeqAIJ*>(B->data);
589   akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
590   bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
591   aa   = akok->a_dual.view_device();
592   ai   = akok->i_dual.view_device();
593   ba   = bkok->a_dual.view_device();
594   bi   = bkok->i_dual.view_device();
595   m    = A->rmap->n; /* M, N and nnz of C */
596   n    = A->cmap->n + B->cmap->n;
597   nnz  = a->nz + b->nz;
598   aN   = A->cmap->n; /* N of A */
599   if (reuse == MAT_INITIAL_MATRIX) {
600     aj = akok->j_dual.view_device();
601     bj = bkok->j_dual.view_device();
602     auto ca_dual = MatScalarKokkosDualView("a",aa.extent(0)+ba.extent(0));
603     auto ci_dual = MatRowMapKokkosDualView("i",ai.extent(0));
604     auto cj_dual = MatColIdxKokkosDualView("j",aj.extent(0)+bj.extent(0));
605     ca = ca_dual.view_device();
606     ci = ci_dual.view_device();
607     cj = cj_dual.view_device();
608 
609     /* Concatenate A and B in parallel using Kokkos hierarchical parallelism */
610     Kokkos::parallel_for(Kokkos::TeamPolicy<>(m, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) {
611       PetscInt i = t.league_rank(); /* row i */
612       PetscInt coffset = ai(i) + bi(i), alen = ai(i+1)-ai(i), blen = bi(i+1)-bi(i);
613 
614       Kokkos::single(Kokkos::PerTeam(t), [=]() { /* this side effect only happens once per whole team */
615         ci(i) = coffset;
616         if (i == m-1) ci(m) = ai(m) + bi(m);
617       });
618 
619       Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen+blen), [&](PetscInt k) {
620         if (k < alen) {
621           ca(coffset+k) = aa(ai(i)+k);
622           cj(coffset+k) = aj(ai(i)+k);
623         } else {
624           ca(coffset+k) = ba(bi(i)+k-alen);
625           cj(coffset+k) = bj(bi(i)+k-alen) + aN; /* Entries in B get new column indices in C */
626         }
627       });
628     });
629     ca_dual.modify_device();
630     ci_dual.modify_device();
631     cj_dual.modify_device();
632     CHKERRCXX(ckok = new Mat_SeqAIJKokkos(m,n,nnz,ci_dual,cj_dual,ca_dual));
633     ierr = MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,ckok,C);CHKERRQ(ierr);
634   } else if (reuse == MAT_REUSE_MATRIX) {
635     PetscValidHeaderSpecific(*C,MAT_CLASSID,4);
636     PetscCheckTypeName(*C,MATSEQAIJKOKKOS);
637     ckok = static_cast<Mat_SeqAIJKokkos*>((*C)->spptr);
638     ca   = ckok->a_dual.view_device();
639     ci   = ckok->i_dual.view_device();
640 
641     Kokkos::parallel_for(Kokkos::TeamPolicy<>(m, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) {
642       PetscInt i = t.league_rank(); /* row i */
643       PetscInt alen = ai(i+1)-ai(i), blen = bi(i+1)-bi(i);
644       Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen+blen), [&](PetscInt k) {
645         if (k < alen) ca(ci(i)+k) = aa(ai(i)+k);
646         else          ca(ci(i)+k) = ba(bi(i)+k-alen);
647       });
648     });
649     ierr = MatSeqAIJKokkosModifyDevice(*C);CHKERRQ(ierr);
650   }
651   PetscFunctionReturn(0);
652 }
653 
654 static PetscErrorCode MatProductDataDestroy_SeqAIJKokkos(void* pdata)
655 {
656   PetscFunctionBegin;
657   delete static_cast<MatProductData_SeqAIJKokkos*>(pdata);
658   PetscFunctionReturn(0);
659 }
660 
661 static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C)
662 {
663   PetscErrorCode                 ierr;
664   Mat_Product                    *product = C->product;
665   Mat                            A,B;
666   bool                           transA,transB; /* use bool, since KK needs this type */
667   Mat_SeqAIJKokkos               *akok,*bkok,*ckok;
668   Mat_SeqAIJ                     *c;
669   MatProductData_SeqAIJKokkos    *pdata;
670   KokkosCsrMatrix                *csrmatA,*csrmatB;
671 
672   PetscFunctionBegin;
673   MatCheckProduct(C,1);
674   if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty");
675   pdata = static_cast<MatProductData_SeqAIJKokkos*>(C->product->data);
676 
677   if (pdata->reusesym) { /* We reached here through e.g., MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C), where symbolic/numeric are combined */
678     pdata->reusesym = PETSC_FALSE; /* So that next time when user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C), we still do numeric  */
679     PetscFunctionReturn(0);
680   }
681 
682   switch (product->type) {
683     case MATPRODUCT_AB:  transA = false; transB = false; break;
684     case MATPRODUCT_AtB: transA = true;  transB = false; break;
685     case MATPRODUCT_ABt: transA = false; transB = true;  break;
686     default:
687       SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
688   }
689 
690   A     = product->A;
691   B     = product->B;
692   ierr  = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
693   ierr  = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr);
694   akok  = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
695   bkok  = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
696   ckok  = static_cast<Mat_SeqAIJKokkos*>(C->spptr);
697 
698   if (!ckok) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Device data structure spptr is empty");
699 
700   csrmatA = &akok->csrmat;
701   csrmatB = &bkok->csrmat;
702 
703   /* TODO: Once KK spgemm implements transpose, we can get rid of the explicit transpose here */
704   if (transA) {
705     ierr   = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmatA);CHKERRQ(ierr);
706     transA = false;
707   }
708 
709   if (transB) {
710     ierr   = MatSeqAIJKokkosGenerateTranspose_Private(B,&csrmatB);CHKERRQ(ierr);
711     transB = false;
712   }
713 
714   CHKERRCXX(KokkosSparse::spgemm_numeric(pdata->kh,*csrmatA,transA,*csrmatB,transB,ckok->csrmat));
715   CHKERRCXX(KokkosKernels::sort_crs_matrix(ckok->csrmat)); /* without the sort, mat_tests-ex62_14_seqaijkokkos failed */
716   ierr = MatSeqAIJKokkosModifyDevice(C);CHKERRQ(ierr);
717   /* shorter version of MatAssemblyEnd_SeqAIJ */
718   c = (Mat_SeqAIJ*)C->data;
719   ierr = PetscInfo3(C,"Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n",C->rmap->n,C->cmap->n,c->nz);CHKERRQ(ierr);
720   ierr = PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr);
721   ierr = PetscInfo1(C,"Maximum nonzeros in any row is %" PetscInt_FMT "\n",c->rmax);CHKERRQ(ierr);
722   c->reallocs         = 0;
723   C->info.mallocs     = 0;
724   C->info.nz_unneeded = 0;
725   C->assembled        = C->was_assembled = PETSC_TRUE;
726   C->num_ass++;
727   PetscFunctionReturn(0);
728 }
729 
730 static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C)
731 {
732   PetscErrorCode                 ierr;
733   Mat_Product                    *product = C->product;
734   MatProductType                 ptype;
735   Mat                            A,B;
736   bool                           transA,transB;
737   Mat_SeqAIJKokkos               *akok,*bkok,*ckok;
738   MatProductData_SeqAIJKokkos    *pdata;
739   MPI_Comm                       comm;
740   KokkosCsrMatrix                *csrmatA,*csrmatB,csrmatC;
741 
742   PetscFunctionBegin;
743   MatCheckProduct(C,1);
744   ierr = PetscObjectGetComm((PetscObject)C,&comm);
745   if (product->data) SETERRQ(comm,PETSC_ERR_PLIB,"Product data not empty");
746   A       = product->A;
747   B       = product->B;
748   ierr    = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
749   ierr    = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr);
750   akok    = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
751   bkok    = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
752   csrmatA = &akok->csrmat;
753   csrmatB = &bkok->csrmat;
754 
755   ptype   = product->type;
756   switch (ptype) {
757     case MATPRODUCT_AB:  transA = false; transB = false; break;
758     case MATPRODUCT_AtB: transA = true;  transB = false; break;
759     case MATPRODUCT_ABt: transA = false; transB = true;  break;
760     default:
761       SETERRQ1(comm,PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
762   }
763 
764   product->data = pdata = new MatProductData_SeqAIJKokkos();
765   pdata->kh.set_team_work_size(16);
766   pdata->kh.set_dynamic_scheduling(true);
767   pdata->reusesym = product->api_user;
768 
769   /* TODO: add command line options to select spgemm algorithms */
770   auto spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_KK;
771   /* CUDA-10.2's spgemm has bugs. As as of 2022-01-19, KK does not support CUDA-11.x's newer spgemm API.
772      We can default to SPGEMMAlgorithm::SPGEMM_CUSPARSE with CUDA-11+ when KK adds the support.
773    */
774   pdata->kh.create_spgemm_handle(spgemm_alg);
775 
776   /* TODO: Get rid of the explicit transpose once KK-spgemm implements the transpose option */
777   if (transA) {
778     ierr   = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmatA);CHKERRQ(ierr);
779     transA = false;
780   }
781 
782   if (transB) {
783     ierr   = MatSeqAIJKokkosGenerateTranspose_Private(B,&csrmatB);CHKERRQ(ierr);
784     transB = false;
785   }
786 
787   CHKERRCXX(KokkosSparse::spgemm_symbolic(pdata->kh,*csrmatA,transA,*csrmatB,transB,csrmatC));
788   /* spgemm_symbolic() only populates C's rowmap, but not C's column indices.
789     So we have to do a fake spgemm_numeric() here to get csrmatC.j_d setup, before
790     calling new Mat_SeqAIJKokkos().
791     TODO: Remove the fake spgemm_numeric() after KK fixed this problem.
792   */
793   CHKERRCXX(KokkosSparse::spgemm_numeric(pdata->kh,*csrmatA,transA,*csrmatB,transB,csrmatC));
794   CHKERRCXX(KokkosKernels::sort_crs_matrix(csrmatC));
795 
796   CHKERRCXX(ckok = new Mat_SeqAIJKokkos(csrmatC));
797   ierr = MatSetSeqAIJKokkosWithCSRMatrix(C,ckok);CHKERRQ(ierr);
798   C->product->destroy = MatProductDataDestroy_SeqAIJKokkos;
799   PetscFunctionReturn(0);
800 }
801 
802 /* handles sparse matrix matrix ops */
803 static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat)
804 {
805   PetscErrorCode ierr;
806   Mat_Product    *product = mat->product;
807   PetscBool      Biskok = PETSC_FALSE,Ciskok = PETSC_TRUE;
808 
809   PetscFunctionBegin;
810   MatCheckProduct(mat,1);
811   ierr = PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJKOKKOS,&Biskok);CHKERRQ(ierr);
812   if (product->type == MATPRODUCT_ABC) {
813     ierr = PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJKOKKOS,&Ciskok);CHKERRQ(ierr);
814   }
815   if (Biskok && Ciskok) {
816     switch (product->type) {
817     case MATPRODUCT_AB:
818     case MATPRODUCT_AtB:
819     case MATPRODUCT_ABt:
820       mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos;
821       break;
822     case MATPRODUCT_PtAP:
823     case MATPRODUCT_RARt:
824     case MATPRODUCT_ABC:
825       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
826       break;
827     default:
828       break;
829     }
830   } else { /* fallback for AIJ */
831     ierr = MatProductSetFromOptions_SeqAIJ(mat);CHKERRQ(ierr);
832   }
833   PetscFunctionReturn(0);
834 }
835 
836 static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a)
837 {
838   PetscErrorCode   ierr;
839   Mat_SeqAIJKokkos *aijkok;
840 
841   PetscFunctionBegin;
842   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
843   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
844   KokkosBlas::scal(aijkok->a_dual.view_device(),a,aijkok->a_dual.view_device());
845   ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr);
846   ierr = WaitForKokkos();CHKERRQ(ierr);
847   ierr = PetscLogGpuFlops(aijkok->a_dual.extent(0));CHKERRQ(ierr);
848   PetscFunctionReturn(0);
849 }
850 
851 static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A)
852 {
853   PetscErrorCode   ierr;
854   Mat_SeqAIJKokkos *aijkok;
855 
856   PetscFunctionBegin;
857   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
858   if (aijkok) { /* Only zero the device if data is already there */
859     KokkosBlas::fill(aijkok->a_dual.view_device(),0.0);
860     ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr);
861   } else { /* Might be preallocated but not assembled */
862     ierr = MatZeroEntries_SeqAIJ(A);CHKERRQ(ierr);
863   }
864   PetscFunctionReturn(0);
865 }
866 
867 /* Get a Kokkos View from a mat of type MatSeqAIJKokkos */
868 PetscErrorCode MatSeqAIJGetKokkosView(Mat A,ConstPetscScalarKokkosView* kv)
869 {
870   PetscErrorCode     ierr;
871   Mat_SeqAIJKokkos   *aijkok;
872 
873   PetscFunctionBegin;
874   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
875   PetscValidPointer(kv,2);
876   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
877   ierr   = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
878   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
879   *kv    = aijkok->a_dual.view_device();
880   PetscFunctionReturn(0);
881 }
882 
883 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A,ConstPetscScalarKokkosView* kv)
884 {
885   PetscFunctionBegin;
886   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
887   PetscValidPointer(kv,2);
888   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
889   PetscFunctionReturn(0);
890 }
891 
892 PetscErrorCode MatSeqAIJGetKokkosView(Mat A,PetscScalarKokkosView* kv)
893 {
894   PetscErrorCode     ierr;
895   Mat_SeqAIJKokkos   *aijkok;
896 
897   PetscFunctionBegin;
898   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
899   PetscValidPointer(kv,2);
900   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
901   ierr   = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
902   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
903   *kv    = aijkok->a_dual.view_device();
904   PetscFunctionReturn(0);
905 }
906 
907 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A,PetscScalarKokkosView* kv)
908 {
909   PetscErrorCode     ierr;
910 
911   PetscFunctionBegin;
912   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
913   PetscValidPointer(kv,2);
914   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
915   ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr);
916   PetscFunctionReturn(0);
917 }
918 
919 PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A,PetscScalarKokkosView* kv)
920 {
921   Mat_SeqAIJKokkos   *aijkok;
922 
923   PetscFunctionBegin;
924   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
925   PetscValidPointer(kv,2);
926   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
927   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
928   *kv    = aijkok->a_dual.view_device();
929   PetscFunctionReturn(0);
930 }
931 
932 PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A,PetscScalarKokkosView* kv)
933 {
934   PetscErrorCode     ierr;
935 
936   PetscFunctionBegin;
937   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
938   PetscValidPointer(kv,2);
939   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
940   ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr);
941   PetscFunctionReturn(0);
942 }
943 
944 /* Computes Y += alpha X */
945 static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y,PetscScalar alpha,Mat X,MatStructure pattern)
946 {
947   PetscErrorCode             ierr;
948   Mat_SeqAIJ                 *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data;
949   Mat_SeqAIJKokkos           *xkok,*ykok,*zkok;
950   ConstMatScalarKokkosView   Xa;
951   MatScalarKokkosView        Ya;
952 
953   PetscFunctionBegin;
954   PetscCheckTypeName(Y,MATSEQAIJKOKKOS);
955   PetscCheckTypeName(X,MATSEQAIJKOKKOS);
956   ierr = MatSeqAIJKokkosSyncDevice(Y);CHKERRQ(ierr);
957   ierr = MatSeqAIJKokkosSyncDevice(X);CHKERRQ(ierr);
958 
959   if (pattern != SAME_NONZERO_PATTERN && x->nz == y->nz) {
960     /* We could compare on device, but have to get the comparison result on host. So compare on host instead. */
961     PetscBool e;
962     ierr = PetscArraycmp(x->i,y->i,Y->rmap->n+1,&e);CHKERRQ(ierr);
963     if (e) {
964       ierr = PetscArraycmp(x->j,y->j,y->nz,&e);CHKERRQ(ierr);
965       if (e) pattern = SAME_NONZERO_PATTERN;
966     }
967   }
968 
969   /* cusparseDcsrgeam2() computes C = alpha A + beta B. If one knew sparsity pattern of C, one can skip
970     cusparseScsrgeam2_bufferSizeExt() / cusparseXcsrgeam2Nnz(), and directly call cusparseScsrgeam2().
971     If X is SUBSET_NONZERO_PATTERN of Y, we could take advantage of this cusparse feature. However,
972     KokkosSparse::spadd(alpha,A,beta,B,C) has symbolic and numeric phases, MatAXPY does not.
973   */
974   ykok = static_cast<Mat_SeqAIJKokkos*>(Y->spptr);
975   xkok = static_cast<Mat_SeqAIJKokkos*>(X->spptr);
976   Xa   = xkok->a_dual.view_device();
977   Ya   = ykok->a_dual.view_device();
978 
979   if (pattern == SAME_NONZERO_PATTERN) {
980     KokkosBlas::axpy(alpha,Xa,Ya);
981     ierr = MatSeqAIJKokkosModifyDevice(Y);
982   } else if (pattern == SUBSET_NONZERO_PATTERN) {
983     MatRowMapKokkosView  Xi = xkok->i_dual.view_device(),Yi = ykok->i_dual.view_device();
984     MatColIdxKokkosView  Xj = xkok->j_dual.view_device(),Yj = ykok->j_dual.view_device();
985 
986     Kokkos::parallel_for(Kokkos::TeamPolicy<>(Y->rmap->n, 1),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) {
987       PetscInt i = t.league_rank(); /* row i */
988       Kokkos::single(Kokkos::PerTeam(t), [=] () { /* Only one thread works in a team */
989         PetscInt p,q = Yi(i);
990         for (p=Xi(i); p<Xi(i+1); p++) { /* For each nonzero on row i of X */
991           while (Xj(p) != Yj(q) && q < Yi(i+1)) q++; /* find the matching nonzero on row i of Y */
992           if (Xj(p) == Yj(q)) { /* Found it */
993             Ya(q) += alpha * Xa(p);
994             q++;
995           } else {
996             /* If not found, it indicates the input is wrong (X is not a SUBSET_NONZERO_PATTERN of Y).
997                Just insert a NaN at the beginning of row i if it is not empty, to make the result wrong.
998             */
999             if (Yi(i) != Yi(i+1)) Ya(Yi(i)) = Kokkos::Experimental::nan("1"); /* auto promote the double NaN if needed */
1000           }
1001         }
1002       });
1003     });
1004     ierr = MatSeqAIJKokkosModifyDevice(Y);
1005   } else { /* different nonzero patterns */
1006     Mat             Z;
1007     KokkosCsrMatrix zcsr;
1008     KernelHandle    kh;
1009     kh.create_spadd_handle(false);
1010     KokkosSparse::spadd_symbolic(&kh,xkok->csrmat,ykok->csrmat,zcsr);
1011     KokkosSparse::spadd_numeric(&kh,alpha,xkok->csrmat,(PetscScalar)1.0,ykok->csrmat,zcsr);
1012     zkok = new Mat_SeqAIJKokkos(zcsr);
1013     ierr = MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,zkok,&Z);CHKERRQ(ierr);
1014     ierr = MatHeaderReplace(Y,&Z);CHKERRQ(ierr);
1015     kh.destroy_spadd_handle();
1016   }
1017 
1018   PetscFunctionReturn(0);
1019 }
1020 
1021 static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B,Mat A,const MatFactorInfo *info)
1022 {
1023   PetscErrorCode ierr;
1024 
1025   PetscFunctionBegin;
1026   ierr = MatSeqAIJKokkosSyncHost(A);CHKERRQ(ierr);
1027   ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
1028   B->offloadmask = PETSC_OFFLOAD_CPU;
1029   PetscFunctionReturn(0);
1030 }
1031 
1032 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A)
1033 {
1034   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
1035 
1036   PetscFunctionBegin;
1037   A->offloadmask = PETSC_OFFLOAD_KOKKOS; /* We do not really use this flag */
1038   A->boundtocpu  = PETSC_FALSE;
1039 
1040   A->ops->assemblyend               = MatAssemblyEnd_SeqAIJKokkos;
1041   A->ops->destroy                   = MatDestroy_SeqAIJKokkos;
1042   A->ops->duplicate                 = MatDuplicate_SeqAIJKokkos;
1043   A->ops->axpy                      = MatAXPY_SeqAIJKokkos;
1044   A->ops->scale                     = MatScale_SeqAIJKokkos;
1045   A->ops->zeroentries               = MatZeroEntries_SeqAIJKokkos;
1046   A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJKokkos;
1047   A->ops->mult                      = MatMult_SeqAIJKokkos;
1048   A->ops->multadd                   = MatMultAdd_SeqAIJKokkos;
1049   A->ops->multtranspose             = MatMultTranspose_SeqAIJKokkos;
1050   A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJKokkos;
1051   A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJKokkos;
1052   A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos;
1053   A->ops->productnumeric            = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos;
1054   A->ops->transpose                 = MatTranspose_SeqAIJKokkos;
1055   A->ops->setoption                 = MatSetOption_SeqAIJKokkos;
1056   a->ops->getarray                  = MatSeqAIJGetArray_SeqAIJKokkos;
1057   a->ops->restorearray              = MatSeqAIJRestoreArray_SeqAIJKokkos;
1058   a->ops->getarrayread              = MatSeqAIJGetArrayRead_SeqAIJKokkos;
1059   a->ops->restorearrayread          = MatSeqAIJRestoreArrayRead_SeqAIJKokkos;
1060   a->ops->getarraywrite             = MatSeqAIJGetArrayWrite_SeqAIJKokkos;
1061   a->ops->restorearraywrite         = MatSeqAIJRestoreArrayWrite_SeqAIJKokkos;
1062   PetscFunctionReturn(0);
1063 }
1064 
1065 PETSC_INTERN PetscErrorCode  MatSetSeqAIJKokkosWithCSRMatrix(Mat A,Mat_SeqAIJKokkos *akok)
1066 {
1067   PetscErrorCode ierr;
1068   Mat_SeqAIJ     *aseq;
1069   PetscInt       i,m,n;
1070 
1071   PetscFunctionBegin;
1072   if (A->spptr) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"A->spptr is supposed to be empty");
1073 
1074   m    = akok->nrows();
1075   n    = akok->ncols();
1076   ierr = MatSetSizes(A,m,n,m,n);CHKERRQ(ierr);
1077   ierr = MatSetType(A,MATSEQAIJKOKKOS);CHKERRQ(ierr);
1078 
1079   /* Set up data structures of A as a MATSEQAIJ */
1080   ierr = MatSeqAIJSetPreallocation_SeqAIJ(A,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
1081   aseq = (Mat_SeqAIJ*)(A)->data;
1082 
1083   akok->i_dual.sync_host(); /* We always need sync'ed i, j on host */
1084   akok->j_dual.sync_host();
1085 
1086   aseq->i            = akok->i_host_data();
1087   aseq->j            = akok->j_host_data();
1088   aseq->a            = akok->a_host_data();
1089   aseq->nonew        = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/
1090   aseq->singlemalloc = PETSC_FALSE;
1091   aseq->free_a       = PETSC_FALSE;
1092   aseq->free_ij      = PETSC_FALSE;
1093   aseq->nz           = akok->nnz();
1094   aseq->maxnz        = aseq->nz;
1095 
1096   ierr = PetscMalloc1(m,&aseq->imax);CHKERRQ(ierr);
1097   ierr = PetscMalloc1(m,&aseq->ilen);CHKERRQ(ierr);
1098   for (i=0; i<m; i++) {
1099     aseq->ilen[i] = aseq->imax[i] = aseq->i[i+1] - aseq->i[i];
1100   }
1101 
1102   /* It is critical to set the nonzerostate, as we use it to check if sparsity pattern (hence data) has changed on host in MatAssemblyEnd */
1103   akok->nonzerostate = A->nonzerostate;
1104   ierr     = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);
1105   ierr     = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);
1106   A->spptr = akok;
1107   PetscFunctionReturn(0);
1108 }
1109 
1110 /* Crete a SEQAIJKOKKOS matrix with a Mat_SeqAIJKokkos data structure
1111 
1112    Note we have names like MatSeqAIJSetPreallocationCSR, so I use capitalized CSR
1113  */
1114 PETSC_INTERN PetscErrorCode  MatCreateSeqAIJKokkosWithCSRMatrix(MPI_Comm comm,Mat_SeqAIJKokkos *akok,Mat *A)
1115 {
1116   PetscErrorCode ierr;
1117 
1118   PetscFunctionBegin;
1119   ierr = MatCreate(comm,A);CHKERRQ(ierr);
1120   ierr = MatSetSeqAIJKokkosWithCSRMatrix(*A,akok);CHKERRQ(ierr);
1121   PetscFunctionReturn(0);
1122 }
1123 
1124 /* --------------------------------------------------------------------------------*/
1125 /*@C
1126    MatCreateSeqAIJKokkos - Creates a sparse matrix in AIJ (compressed row) format
1127    (the default parallel PETSc format). This matrix will ultimately be handled by
1128    Kokkos for calculations. For good matrix
1129    assembly performance the user should preallocate the matrix storage by setting
1130    the parameter nz (or the array nnz).  By setting these parameters accurately,
1131    performance during matrix assembly can be increased by more than a factor of 50.
1132 
1133    Collective
1134 
1135    Input Parameters:
1136 +  comm - MPI communicator, set to PETSC_COMM_SELF
1137 .  m - number of rows
1138 .  n - number of columns
1139 .  nz - number of nonzeros per row (same for all rows)
1140 -  nnz - array containing the number of nonzeros in the various rows
1141          (possibly different for each row) or NULL
1142 
1143    Output Parameter:
1144 .  A - the matrix
1145 
1146    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
1147    MatXXXXSetPreallocation() paradgm instead of this routine directly.
1148    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
1149 
1150    Notes:
1151    If nnz is given then nz is ignored
1152 
1153    The AIJ format (also called the Yale sparse matrix format or
1154    compressed row storage), is fully compatible with standard Fortran 77
1155    storage.  That is, the stored row and column indices can begin at
1156    either one (as in Fortran) or zero.  See the users' manual for details.
1157 
1158    Specify the preallocated storage with either nz or nnz (not both).
1159    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
1160    allocation.  For large problems you MUST preallocate memory or you
1161    will get TERRIBLE performance, see the users' manual chapter on matrices.
1162 
1163    By default, this format uses inodes (identical nodes) when possible, to
1164    improve numerical efficiency of matrix-vector products and solves. We
1165    search for consecutive rows with the same nonzero structure, thereby
1166    reusing matrix information to achieve increased efficiency.
1167 
1168    Level: intermediate
1169 
1170 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ()
1171 @*/
1172 PetscErrorCode  MatCreateSeqAIJKokkos(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
1173 {
1174   PetscErrorCode ierr;
1175 
1176   PetscFunctionBegin;
1177   ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr);
1178   ierr = MatCreate(comm,A);CHKERRQ(ierr);
1179   ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
1180   ierr = MatSetType(*A,MATSEQAIJKOKKOS);CHKERRQ(ierr);
1181   ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
1182   PetscFunctionReturn(0);
1183 }
1184 
1185 typedef Kokkos::TeamPolicy<>::member_type team_member;
1186 //
1187 // This factorization exploits block diagonal matrices with "Nf" attached to the matrix in a container.
1188 // Use -pc_factor_mat_ordering_type rcm to order decouple blocks of size N/Nf for this optimization
1189 //
1190 static PetscErrorCode MatLUFactorNumeric_SeqAIJKOKKOSDEVICE(Mat B,Mat A,const MatFactorInfo *info)
1191 {
1192   Mat_SeqAIJ         *b=(Mat_SeqAIJ*)B->data;
1193   Mat_SeqAIJKokkos   *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr), *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
1194   IS                 isrow = b->row,isicol = b->icol;
1195   PetscErrorCode     ierr;
1196   const PetscInt     *r_h,*ic_h;
1197   const PetscInt     n=A->rmap->n, *ai_d=aijkok->i_dual.view_device().data(), *aj_d=aijkok->j_dual.view_device().data(), *bi_d=baijkok->i_dual.view_device().data(), *bj_d=baijkok->j_dual.view_device().data(), *bdiag_d = baijkok->diag_d->data();
1198   const PetscScalar  *aa_d = aijkok->a_dual.view_device().data();
1199   PetscScalar        *ba_d = baijkok->a_dual.view_device().data();
1200   PetscBool          row_identity,col_identity;
1201   PetscInt           nc, Nf, nVec=32; // should be a parameter
1202   PetscContainer     container;
1203 
1204   PetscFunctionBegin;
1205   if (A->rmap->n != n) SETERRQ2(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"square matrices only supported %" PetscInt_FMT " %" PetscInt_FMT,A->rmap->n,n);
1206   ierr = MatGetOption(A,MAT_STRUCTURALLY_SYMMETRIC,&row_identity);CHKERRQ(ierr);
1207   if (!row_identity) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"structurally symmetric matrices only supported");
1208   ierr = PetscObjectQuery((PetscObject) A, "Nf", (PetscObject *) &container);CHKERRQ(ierr);
1209   if (container) {
1210     PetscInt *pNf=NULL, nv;
1211     ierr = PetscContainerGetPointer(container, (void **) &pNf);CHKERRQ(ierr);
1212     Nf = (*pNf)%1000;
1213     nv = (*pNf)/1000;
1214     if (nv>0) nVec = nv;
1215   } else Nf = 1;
1216   if (n%Nf) SETERRQ2(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"n % Nf != 0 %" PetscInt_FMT " %" PetscInt_FMT,n,Nf);
1217   ierr = ISGetIndices(isrow,&r_h);CHKERRQ(ierr);
1218   ierr = ISGetIndices(isicol,&ic_h);CHKERRQ(ierr);
1219   ierr = ISGetSize(isicol,&nc);CHKERRQ(ierr);
1220 #if defined(PETSC_HAVE_DEVICE) && defined(PETSC_USE_LOG)
1221   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
1222 #endif
1223   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
1224   {
1225 #define KOKKOS_SHARED_LEVEL 1
1226     using scr_mem_t = Kokkos::DefaultExecutionSpace::scratch_memory_space;
1227     using sizet_scr_t = Kokkos::View<size_t, scr_mem_t>;
1228     using scalar_scr_t = Kokkos::View<PetscScalar, scr_mem_t>;
1229     const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_r_k (r_h, n);
1230     Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_r_k ("r", n);
1231     const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_ic_k (ic_h, nc);
1232     Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_ic_k ("ic", nc);
1233     size_t flops_h = 0.0;
1234     Kokkos::View<size_t, Kokkos::HostSpace> h_flops_k (&flops_h);
1235     Kokkos::View<size_t> d_flops_k ("flops");
1236     const int conc = Kokkos::DefaultExecutionSpace().concurrency(), team_size = conc > 1 ? 16 : 1; // 8*32 = 256
1237     const int nloc = n/Nf, Ni = (conc > 8) ? 1 /* some intelegent number of SMs -- but need league_barrier */ : 1;
1238     Kokkos::deep_copy (d_flops_k, h_flops_k);
1239     Kokkos::deep_copy (d_r_k, h_r_k);
1240     Kokkos::deep_copy (d_ic_k, h_ic_k);
1241     // Fill A --> fact
1242     Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec), KOKKOS_LAMBDA (const team_member team) {
1243         const PetscInt  field = team.league_rank()/Ni, field_block = team.league_rank()%Ni; // use grid.x/y in CUDA
1244         const PetscInt  nloc_i =  (nloc/Ni + !!(nloc%Ni)), start_i = field*nloc + field_block*nloc_i, end_i = (start_i + nloc_i) > (field+1)*nloc ? (field+1)*nloc : (start_i + nloc_i);
1245         const PetscInt  *ic = d_ic_k.data(), *r = d_r_k.data();
1246         // zero rows of B
1247         Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) {
1248             PetscInt    nzbL = bi_d[rowb+1] - bi_d[rowb], nzbU = bdiag_d[rowb] - bdiag_d[rowb+1]; // with diag
1249             PetscScalar *baL = ba_d + bi_d[rowb];
1250             PetscScalar *baU = ba_d + bdiag_d[rowb+1]+1;
1251             /* zero (unfactored row) */
1252             for (int j=0;j<nzbL;j++) baL[j] = 0;
1253             for (int j=0;j<nzbU;j++) baU[j] = 0;
1254           });
1255         // copy A into B
1256         Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) {
1257             PetscInt          rowa = r[rowb], nza = ai_d[rowa+1] - ai_d[rowa];
1258             const PetscScalar *av    = aa_d + ai_d[rowa];
1259             const PetscInt    *ajtmp = aj_d + ai_d[rowa];
1260             /* load in initial (unfactored row) */
1261             for (int j=0;j<nza;j++) {
1262               PetscInt    colb = ic[ajtmp[j]];
1263               PetscScalar vala = av[j];
1264               if (colb == rowb) {
1265                 *(ba_d + bdiag_d[rowb]) = vala;
1266               } else {
1267                 const PetscInt    *pbj = bj_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]);
1268                 PetscScalar       *pba = ba_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]);
1269                 PetscInt          nz   = (colb > rowb) ? bdiag_d[rowb] - (bdiag_d[rowb+1]+1) : bi_d[rowb+1] - bi_d[rowb], set=0;
1270                 for (int j=0; j<nz ; j++) {
1271                   if (pbj[j] == colb) {
1272                     pba[j] = vala;
1273                     set++;
1274                     break;
1275                   }
1276                 }
1277                 if (set!=1) printf("\t\t\t ERROR DID NOT SET ?????\n");
1278               }
1279             }
1280           });
1281       });
1282     Kokkos::fence();
1283 
1284     Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec).set_scratch_size(KOKKOS_SHARED_LEVEL, Kokkos::PerThread(sizet_scr_t::shmem_size()+scalar_scr_t::shmem_size()), Kokkos::PerTeam(sizet_scr_t::shmem_size())), KOKKOS_LAMBDA (const team_member team) {
1285         sizet_scr_t     colkIdx(team.thread_scratch(KOKKOS_SHARED_LEVEL));
1286         scalar_scr_t    L_ki(team.thread_scratch(KOKKOS_SHARED_LEVEL));
1287         sizet_scr_t     flops(team.team_scratch(KOKKOS_SHARED_LEVEL));
1288         const PetscInt  field = team.league_rank()/Ni, field_block_idx = team.league_rank()%Ni; // use grid.x/y in CUDA
1289         const PetscInt  start = field*nloc, end = start + nloc;
1290         Kokkos::single(Kokkos::PerTeam(team), [=]() { flops() = 0; });
1291         // A22 panel update for each row A(1,:) and col A(:,1)
1292         for (int ii=start; ii<end-1; ii++) {
1293           const PetscInt    *bjUi = bj_d + bdiag_d[ii+1]+1, nzUi = bdiag_d[ii] - (bdiag_d[ii+1]+1); // vector, and vector size, of column indices of U(i,(i+1):end)
1294           const PetscScalar *baUi = ba_d + bdiag_d[ii+1]+1; // vector of data  U(i,i+1:end)
1295           const PetscInt    nUi_its = nzUi/Ni + !!(nzUi%Ni);
1296           const PetscScalar Bii = *(ba_d + bdiag_d[ii]); // diagonal in its special place
1297           Kokkos::parallel_for(Kokkos::TeamThreadRange(team, nUi_its), [=] (const int j) {
1298               PetscInt kIdx = j*Ni + field_block_idx;
1299               if (kIdx >= nzUi) /* void */ ;
1300               else {
1301                 const PetscInt myk  = bjUi[kIdx]; // assume symmetric structure, need a transposed meta-data here in general
1302                 const PetscInt *pjL = bj_d + bi_d[myk]; // look for L(myk,ii) in start of row
1303                 const PetscInt nzL  = bi_d[myk+1] - bi_d[myk]; // size of L_k(:)
1304                 size_t         st_idx;
1305                 // find and do L(k,i) = A(:k,i) / A(i,i)
1306                 Kokkos::single(Kokkos::PerThread(team), [&]() { colkIdx() = PETSC_MAX_INT; });
1307                 // get column, there has got to be a better way
1308                 Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,nzL), [&] (const int &j, size_t &idx) {
1309                     if (pjL[j] == ii) {
1310                       PetscScalar *pLki = ba_d + bi_d[myk] + j;
1311                       idx = j; // output
1312                       *pLki = *pLki/Bii; // column scaling:  L(k,i) = A(:k,i) / A(i,i)
1313                     }
1314                 }, st_idx);
1315                 Kokkos::single(Kokkos::PerThread(team), [=]() { colkIdx() = st_idx; L_ki() = *(ba_d + bi_d[myk] + st_idx); });
1316 #if defined(PETSC_USE_DEBUG)
1317                 if (colkIdx() == PETSC_MAX_INT) printf("\t\t\t\t\t\t\tERROR: failed to find L_ki(%d,%d)\n",(int)myk,ii); // uses a register
1318 #endif
1319                 // active row k, do  A_kj -= Lki * U_ij; j \in U(i,:) j != i
1320                 // U(i+1,:end)
1321                 Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,nzUi), [=] (const int &uiIdx) { // index into i (U)
1322                       PetscScalar Uij = baUi[uiIdx];
1323                       PetscInt    col = bjUi[uiIdx];
1324                       if (col==myk) {
1325                         // A_kk = A_kk - L_ki * U_ij(k)
1326                         PetscScalar *Akkv = (ba_d + bdiag_d[myk]); // diagonal in its special place
1327                         *Akkv = *Akkv - L_ki() * Uij; // UiK
1328                       } else {
1329                         PetscScalar    *start, *end, *pAkjv=NULL;
1330                         PetscInt       high, low;
1331                         const PetscInt *startj;
1332                         if (col<myk) { // L
1333                           PetscScalar *pLki = ba_d + bi_d[myk] + colkIdx();
1334                           PetscInt idx = (pLki+1) - (ba_d + bi_d[myk]); // index into row
1335                           start = pLki+1; // start at pLki+1, A22(myk,1)
1336                           startj= bj_d + bi_d[myk] + idx;
1337                           end   = ba_d + bi_d[myk+1];
1338                         } else {
1339                           PetscInt idx = bdiag_d[myk+1]+1;
1340                           start = ba_d + idx;
1341                           startj= bj_d + idx;
1342                           end   = ba_d + bdiag_d[myk];
1343                         }
1344                         // search for 'col', use bisection search - TODO
1345                         low  = 0;
1346                         high = (PetscInt)(end-start);
1347                         while (high-low > 5) {
1348                           int t = (low+high)/2;
1349                           if (startj[t] > col) high = t;
1350                           else                 low  = t;
1351                         }
1352                         for (pAkjv=start+low; pAkjv<start+high; pAkjv++) {
1353                           if (startj[pAkjv-start] == col) break;
1354                         }
1355 #if defined(PETSC_USE_DEBUG)
1356                         if (pAkjv==start+high) printf("\t\t\t\t\t\t\t\t\t\t\tERROR: *** failed to find Akj(%d,%d)\n",(int)myk,(int)col); // uses a register
1357 #endif
1358                         *pAkjv = *pAkjv - L_ki() * Uij; // A_kj = A_kj - L_ki * U_ij
1359                       }
1360                     });
1361               }
1362             });
1363           team.team_barrier(); // this needs to be a league barrier to use more that one SM per block
1364           if (field_block_idx==0) Kokkos::single(Kokkos::PerTeam(team), [&]() { Kokkos::atomic_add( flops.data(), (size_t)(2*(nzUi*nzUi)+2)); });
1365         } /* endof for (i=0; i<n; i++) { */
1366         Kokkos::single(Kokkos::PerTeam(team), [=]() { Kokkos::atomic_add( &d_flops_k(), flops()); flops() = 0; });
1367       });
1368     Kokkos::fence();
1369     Kokkos::deep_copy (h_flops_k, d_flops_k);
1370 #if defined(PETSC_HAVE_DEVICE) && defined(PETSC_USE_LOG)
1371     ierr = PetscLogGpuFlops((PetscLogDouble)h_flops_k());CHKERRQ(ierr);
1372 #elif  defined(PETSC_USE_LOG)
1373     ierr = PetscLogFlops((PetscLogDouble)h_flops_k());CHKERRQ(ierr);
1374 #endif
1375     Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, 1, 256), KOKKOS_LAMBDA (const team_member team) {
1376         const PetscInt  lg_rank = team.league_rank(), field = lg_rank/Ni; //, field_offset = lg_rank%Ni;
1377         const PetscInt  start = field*nloc, end = start + nloc, n_its = (nloc/Ni + !!(nloc%Ni)); // 1/Ni iters
1378         /* Invert diagonal for simpler triangular solves */
1379         Kokkos::parallel_for(Kokkos::TeamVectorRange(team, n_its), [=] (int outer_index) {
1380             int i = start + outer_index*Ni + lg_rank%Ni;
1381             if (i < end) {
1382               PetscScalar *pv = ba_d + bdiag_d[i];
1383               *pv = 1.0/(*pv);
1384             }
1385           });
1386       });
1387   }
1388 #if defined(PETSC_HAVE_DEVICE) && defined(PETSC_USE_LOG)
1389   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
1390 #endif
1391   ierr = ISRestoreIndices(isicol,&ic_h);CHKERRQ(ierr);
1392   ierr = ISRestoreIndices(isrow,&r_h);CHKERRQ(ierr);
1393 
1394   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
1395   ierr = ISIdentity(isicol,&col_identity);CHKERRQ(ierr);
1396   if (b->inode.size) {
1397     B->ops->solve = MatSolve_SeqAIJ_Inode;
1398   } else if (row_identity && col_identity) {
1399     B->ops->solve = MatSolve_SeqAIJ_NaturalOrdering;
1400   } else {
1401     B->ops->solve = MatSolve_SeqAIJ; // at least this needs to be in Kokkos
1402   }
1403   B->offloadmask = PETSC_OFFLOAD_GPU;
1404   ierr = MatSeqAIJKokkosSyncHost(B);CHKERRQ(ierr); // solve on CPU
1405   B->ops->solveadd          = MatSolveAdd_SeqAIJ; // and this
1406   B->ops->solvetranspose    = MatSolveTranspose_SeqAIJ;
1407   B->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ;
1408   B->ops->matsolve          = MatMatSolve_SeqAIJ;
1409   B->assembled              = PETSC_TRUE;
1410   B->preallocated           = PETSC_TRUE;
1411 
1412   PetscFunctionReturn(0);
1413 }
1414 
1415 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
1416 {
1417   PetscErrorCode   ierr;
1418 
1419   PetscFunctionBegin;
1420   ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
1421   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos;
1422   PetscFunctionReturn(0);
1423 }
1424 
1425 static PetscErrorCode MatSeqAIJKokkosSymbolicSolveCheck(Mat A)
1426 {
1427   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr;
1428 
1429   PetscFunctionBegin;
1430   if (!factors->sptrsv_symbolic_completed) {
1431     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khU,factors->iU_d,factors->jU_d,factors->aU_d);
1432     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khL,factors->iL_d,factors->jL_d,factors->aL_d);
1433     factors->sptrsv_symbolic_completed = PETSC_TRUE;
1434   }
1435   PetscFunctionReturn(0);
1436 }
1437 
1438 /* Check if we need to update factors etc for transpose solve */
1439 static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A)
1440 {
1441   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr;
1442   MatColIdxType             n = A->rmap->n;
1443 
1444   PetscFunctionBegin;
1445   if (!factors->transpose_updated) { /* TODO: KK needs to provide functions to do numeric transpose only */
1446     /* Update L^T and do sptrsv symbolic */
1447     factors->iLt_d = MatRowMapKokkosView("factors->iLt_d",n+1);
1448     Kokkos::deep_copy(factors->iLt_d,0); /* KK requires 0 */
1449     factors->jLt_d = MatColIdxKokkosView("factors->jLt_d",factors->jL_d.extent(0));
1450     factors->aLt_d = MatScalarKokkosView("factors->aLt_d",factors->aL_d.extent(0));
1451 
1452     KokkosKernels::Impl::transpose_matrix<
1453       ConstMatRowMapKokkosView,ConstMatColIdxKokkosView,ConstMatScalarKokkosView,
1454       MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView,
1455       MatRowMapKokkosView,DefaultExecutionSpace>(
1456         n,n,factors->iL_d,factors->jL_d,factors->aL_d,
1457         factors->iLt_d,factors->jLt_d,factors->aLt_d);
1458 
1459     /* TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices.
1460       We have to sort the indices, until KK provides finer control options.
1461     */
1462     KokkosKernels::sort_crs_matrix<DefaultExecutionSpace,
1463       MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView>(
1464         factors->iLt_d,factors->jLt_d,factors->aLt_d);
1465 
1466     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khLt,factors->iLt_d,factors->jLt_d,factors->aLt_d);
1467 
1468     /* Update U^T and do sptrsv symbolic */
1469     factors->iUt_d = MatRowMapKokkosView("factors->iUt_d",n+1);
1470     Kokkos::deep_copy(factors->iUt_d,0); /* KK requires 0 */
1471     factors->jUt_d = MatColIdxKokkosView("factors->jUt_d",factors->jU_d.extent(0));
1472     factors->aUt_d = MatScalarKokkosView("factors->aUt_d",factors->aU_d.extent(0));
1473 
1474     KokkosKernels::Impl::transpose_matrix<
1475       ConstMatRowMapKokkosView,ConstMatColIdxKokkosView,ConstMatScalarKokkosView,
1476       MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView,
1477       MatRowMapKokkosView,DefaultExecutionSpace>(
1478         n,n,factors->iU_d, factors->jU_d, factors->aU_d,
1479         factors->iUt_d,factors->jUt_d,factors->aUt_d);
1480 
1481     /* Sort indices. See comments above */
1482     KokkosKernels::sort_crs_matrix<DefaultExecutionSpace,
1483       MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView>(
1484         factors->iUt_d,factors->jUt_d,factors->aUt_d);
1485 
1486     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khUt,factors->iUt_d,factors->jUt_d,factors->aUt_d);
1487     factors->transpose_updated = PETSC_TRUE;
1488   }
1489   PetscFunctionReturn(0);
1490 }
1491 
1492 /* Solve Ax = b, with A = LU */
1493 static PetscErrorCode MatSolve_SeqAIJKokkos(Mat A,Vec b,Vec x)
1494 {
1495   PetscErrorCode                 ierr;
1496   ConstPetscScalarKokkosView     bv;
1497   PetscScalarKokkosView          xv;
1498   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr;
1499 
1500   PetscFunctionBegin;
1501   ierr = MatSeqAIJKokkosSymbolicSolveCheck(A);CHKERRQ(ierr);
1502   ierr = VecGetKokkosView(x,&xv);CHKERRQ(ierr);
1503   ierr = VecGetKokkosView(b,&bv);CHKERRQ(ierr);
1504   /* Solve L tmpv = b */
1505   CHKERRCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khL,factors->iL_d,factors->jL_d,factors->aL_d,bv,factors->workVector));
1506   /* Solve Ux = tmpv */
1507   CHKERRCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khU,factors->iU_d,factors->jU_d,factors->aU_d,factors->workVector,xv));
1508   ierr = VecRestoreKokkosView(x,&xv);CHKERRQ(ierr);
1509   ierr = VecRestoreKokkosView(b,&bv);CHKERRQ(ierr);
1510   PetscFunctionReturn(0);
1511 }
1512 
1513 /* Solve A^T x = b, where A^T = U^T L^T */
1514 static PetscErrorCode MatSolveTranspose_SeqAIJKokkos(Mat A,Vec b,Vec x)
1515 {
1516   PetscErrorCode                 ierr;
1517   ConstPetscScalarKokkosView     bv;
1518   PetscScalarKokkosView          xv;
1519   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr;
1520 
1521   PetscFunctionBegin;
1522   ierr = MatSeqAIJKokkosTransposeSolveCheck(A);CHKERRQ(ierr);
1523   ierr = VecGetKokkosView(x,&xv);CHKERRQ(ierr);
1524   ierr = VecGetKokkosView(b,&bv);CHKERRQ(ierr);
1525   /* Solve U^T tmpv = b */
1526   KokkosSparse::Experimental::sptrsv_solve(&factors->khUt,factors->iUt_d,factors->jUt_d,factors->aUt_d,bv,factors->workVector);
1527 
1528   /* Solve L^T x = tmpv */
1529   KokkosSparse::Experimental::sptrsv_solve(&factors->khLt,factors->iLt_d,factors->jLt_d,factors->aLt_d,factors->workVector,xv);
1530   ierr = VecRestoreKokkosView(x,&xv);CHKERRQ(ierr);
1531   ierr = VecRestoreKokkosView(b,&bv);CHKERRQ(ierr);
1532   PetscFunctionReturn(0);
1533 }
1534 
1535 static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos(Mat B,Mat A,const MatFactorInfo *info)
1536 {
1537   PetscErrorCode                 ierr;
1538   Mat_SeqAIJKokkos               *aijkok = (Mat_SeqAIJKokkos*)A->spptr;
1539   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)B->spptr;
1540   PetscInt                       fill_lev = info->levels;
1541 
1542   PetscFunctionBegin;
1543   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
1544 
1545   auto a_d = aijkok->a_dual.view_device();
1546   auto i_d = aijkok->i_dual.view_device();
1547   auto j_d = aijkok->j_dual.view_device();
1548 
1549   KokkosSparse::Experimental::spiluk_numeric(&factors->kh,fill_lev,i_d,j_d,a_d,factors->iL_d,factors->jL_d,factors->aL_d,factors->iU_d,factors->jU_d,factors->aU_d);
1550 
1551   B->assembled                       = PETSC_TRUE;
1552   B->preallocated                    = PETSC_TRUE;
1553   B->ops->solve                      = MatSolve_SeqAIJKokkos;
1554   B->ops->solvetranspose             = MatSolveTranspose_SeqAIJKokkos;
1555   B->ops->matsolve                   = NULL;
1556   B->ops->matsolvetranspose          = NULL;
1557   B->offloadmask                     = PETSC_OFFLOAD_GPU;
1558 
1559   /* Once the factors' value changed, we need to update their transpose and sptrsv handle */
1560   factors->transpose_updated         = PETSC_FALSE;
1561   factors->sptrsv_symbolic_completed = PETSC_FALSE;
1562   PetscFunctionReturn(0);
1563 }
1564 
1565 static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
1566 {
1567   PetscErrorCode                 ierr;
1568   Mat_SeqAIJKokkos               *aijkok;
1569   Mat_SeqAIJ                     *b;
1570   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)B->spptr;
1571   PetscInt                       fill_lev = info->levels;
1572   PetscInt                       nnzA = ((Mat_SeqAIJ*)A->data)->nz,nnzL,nnzU;
1573   PetscInt                       n = A->rmap->n;
1574 
1575   PetscFunctionBegin;
1576   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
1577   /* Rebuild factors */
1578   if (factors) {factors->Destroy();} /* Destroy the old if it exists */
1579   else {B->spptr = factors = new Mat_SeqAIJKokkosTriFactors(n);}
1580 
1581   /* Create a spiluk handle and then do symbolic factorization */
1582   nnzL = nnzU = PetscRealIntMultTruncate(info->fill,nnzA);
1583   factors->kh.create_spiluk_handle(KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1,n,nnzL,nnzU);
1584 
1585   auto spiluk_handle = factors->kh.get_spiluk_handle();
1586 
1587   Kokkos::realloc(factors->iL_d,n+1); /* Free old arrays and realloc */
1588   Kokkos::realloc(factors->jL_d,spiluk_handle->get_nnzL());
1589   Kokkos::realloc(factors->iU_d,n+1);
1590   Kokkos::realloc(factors->jU_d,spiluk_handle->get_nnzU());
1591 
1592   aijkok = (Mat_SeqAIJKokkos*)A->spptr;
1593   auto i_d = aijkok->i_dual.view_device();
1594   auto j_d = aijkok->j_dual.view_device();
1595   KokkosSparse::Experimental::spiluk_symbolic(&factors->kh,fill_lev,i_d,j_d,factors->iL_d,factors->jL_d,factors->iU_d,factors->jU_d);
1596   /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */
1597 
1598   Kokkos::resize (factors->jL_d,spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */
1599   Kokkos::resize (factors->jU_d,spiluk_handle->get_nnzU());
1600   Kokkos::realloc(factors->aL_d,spiluk_handle->get_nnzL()); /* No need to retain old value */
1601   Kokkos::realloc(factors->aU_d,spiluk_handle->get_nnzU());
1602 
1603   /* TODO: add options to select sptrsv algorithms */
1604   /* Create sptrsv handles for L, U and their transpose */
1605  #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
1606   auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SPTRSV_CUSPARSE;
1607  #else
1608   auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1;
1609  #endif
1610 
1611   factors->khL.create_sptrsv_handle(sptrsv_alg,n,true/* L is lower tri */);
1612   factors->khU.create_sptrsv_handle(sptrsv_alg,n,false/* U is not lower tri */);
1613   factors->khLt.create_sptrsv_handle(sptrsv_alg,n,false/* L^T is not lower tri */);
1614   factors->khUt.create_sptrsv_handle(sptrsv_alg,n,true/* U^T is lower tri */);
1615 
1616   /* Fill fields of the factor matrix B */
1617   ierr  = MatSeqAIJSetPreallocation_SeqAIJ(B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
1618   b     = (Mat_SeqAIJ*)B->data;
1619   b->nz = b->maxnz = spiluk_handle->get_nnzL()+spiluk_handle->get_nnzU();
1620   B->info.fill_ratio_given  = info->fill;
1621   B->info.fill_ratio_needed = ((PetscReal)b->nz)/((PetscReal)nnzA);
1622 
1623   B->offloadmask            = PETSC_OFFLOAD_GPU;
1624   B->ops->lufactornumeric   = MatILUFactorNumeric_SeqAIJKokkos;
1625   PetscFunctionReturn(0);
1626 }
1627 
1628 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
1629 {
1630   PetscErrorCode   ierr;
1631   Mat_SeqAIJ       *b=(Mat_SeqAIJ*)B->data;
1632   const PetscInt   nrows   = A->rmap->n;
1633 
1634   PetscFunctionBegin;
1635   ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
1636   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKOKKOSDEVICE;
1637   // move B data into Kokkos
1638   ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr); // create aijkok
1639   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); // create aijkok
1640   {
1641     Mat_SeqAIJKokkos *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
1642     if (!baijkok->diag_d) {
1643       const Kokkos::View<PetscInt*, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_diag (b->diag,nrows+1);
1644       baijkok->diag_d = new Kokkos::View<PetscInt*>(Kokkos::create_mirror(DefaultMemorySpace(),h_diag));
1645       Kokkos::deep_copy (*baijkok->diag_d, h_diag);
1646     }
1647   }
1648   PetscFunctionReturn(0);
1649 }
1650 
1651 static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos(Mat A,MatSolverType *type)
1652 {
1653   PetscFunctionBegin;
1654   *type = MATSOLVERKOKKOS;
1655   PetscFunctionReturn(0);
1656 }
1657 
1658 static PetscErrorCode MatFactorGetSolverType_seqaij_kokkos_device(Mat A,MatSolverType *type)
1659 {
1660   PetscFunctionBegin;
1661   *type = MATSOLVERKOKKOSDEVICE;
1662   PetscFunctionReturn(0);
1663 }
1664 
1665 /*MC
1666   MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices
1667   on a single GPU of type, SeqAIJKokkos, AIJKokkos.
1668 
1669   Level: beginner
1670 
1671 .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJKokkos(), MATAIJKOKKOS, MatCreateAIJKokkos(), MatKokkosSetFormat(), MatKokkosStorageFormat, MatKokkosFormatOperation
1672 M*/
1673 PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A,MatFactorType ftype,Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */
1674 {
1675   PetscErrorCode ierr;
1676   PetscInt       n = A->rmap->n;
1677 
1678   PetscFunctionBegin;
1679   ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
1680   ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
1681   (*B)->factortype = ftype;
1682   ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);CHKERRQ(ierr);
1683   ierr = MatSetType(*B,MATSEQAIJKOKKOS);CHKERRQ(ierr);
1684 
1685   if (ftype == MAT_FACTOR_LU) {
1686     ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
1687     (*B)->canuseordering         = PETSC_TRUE;
1688     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJKokkos;
1689   } else if (ftype == MAT_FACTOR_ILU) {
1690     ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
1691     (*B)->canuseordering         = PETSC_FALSE;
1692     (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos;
1693   } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]);
1694 
1695   ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
1696   ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_SeqAIJKokkos);CHKERRQ(ierr);
1697   PetscFunctionReturn(0);
1698 }
1699 
1700 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijkokkos_kokkos_device(Mat A,MatFactorType ftype,Mat *B)
1701 {
1702   PetscErrorCode ierr;
1703   PetscInt       n = A->rmap->n;
1704 
1705   PetscFunctionBegin;
1706   ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
1707   ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
1708   (*B)->factortype = ftype;
1709   (*B)->canuseordering = PETSC_TRUE;
1710   ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);CHKERRQ(ierr);
1711   ierr = MatSetType(*B,MATSEQAIJKOKKOS);CHKERRQ(ierr);
1712 
1713   if (ftype == MAT_FACTOR_LU) {
1714     ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
1715     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE;
1716   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for KOKKOS Matrix Types");
1717 
1718   ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
1719   ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_kokkos_device);CHKERRQ(ierr);
1720   PetscFunctionReturn(0);
1721 }
1722 
1723 PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_KOKKOS(void)
1724 {
1725   PetscErrorCode ierr;
1726 
1727   PetscFunctionBegin;
1728   ierr = MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_SeqAIJKokkos_Kokkos);CHKERRQ(ierr);
1729   ierr = MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_ILU,MatGetFactor_SeqAIJKokkos_Kokkos);CHKERRQ(ierr);
1730   ierr = MatSolverTypeRegister(MATSOLVERKOKKOSDEVICE,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_seqaijkokkos_kokkos_device);CHKERRQ(ierr);
1731   PetscFunctionReturn(0);
1732 }
1733 
1734 /* Utility to print out a KokkosCsrMatrix for debugging */
1735 PETSC_INTERN PetscErrorCode PrintCsrMatrix(const KokkosCsrMatrix& csrmat)
1736 {
1737   PetscErrorCode    ierr;
1738   const auto&       iv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.graph.row_map);
1739   const auto&       jv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.graph.entries);
1740   const auto&       av = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.values);
1741   const PetscInt    *i = iv.data();
1742   const PetscInt    *j = jv.data();
1743   const PetscScalar *a = av.data();
1744   PetscInt          m = csrmat.numRows(),n = csrmat.numCols(),nnz = csrmat.nnz();
1745 
1746   PetscFunctionBegin;
1747   ierr = PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT " x %" PetscInt_FMT " SeqAIJKokkos, with %" PetscInt_FMT " nonzeros\n",m,n,nnz);CHKERRQ(ierr);
1748   for (PetscInt k=0; k<m; k++) {
1749     ierr = PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT ": ",k);CHKERRQ(ierr);
1750     for (PetscInt p=i[k]; p<i[k+1]; p++) {
1751       ierr = PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT "(%.1f), ",j[p],a[p]);CHKERRQ(ierr);
1752     }
1753     ierr = PetscPrintf(PETSC_COMM_SELF,"\n");CHKERRQ(ierr);
1754   }
1755   PetscFunctionReturn(0);
1756 }
1757